From 029373ab2e9d2e6802326871f248ba2f21ffb204 Mon Sep 17 00:00:00 2001
From: Amit Khandekar <amitdkhan.pg@gmail.com>
Date: Tue, 21 Jul 2020 16:42:51 +0800
Subject: [PATCH] Auto-vectorize loop to speed up large-precision numeric
 product

A 'for' loop in mul_var() runs backwards by decrementing two
variables. This prevents the gcc compiler from auto-vectorizing the
for loop. So make it a forward loop with a single variable. This gives
performance benefits for products of numeric values with large
precision, with speedups becoming noticeable at precisions of
around 20-40 and above. The typical pattern of benefit is:
precision 50: 5%; precision 60: 11%; 120: 50%; 240: 2.2x; and so on.
On some CPU architectures, the speedup starts from precision 20
onwards.
With the precisions used in the numeric_big regression test, the
multiplication speeds up by 2.5 to 2.7 times.

Auto-vectorization happens with the gcc -O3 flag or -ftree-loop-vectorize.
So arrange for the -ftree-loop-vectorize flag specifically when compiling
numeric.c. CFLAGS_VECTOR was already present for similar
functionality in checksum.c, but CFLAGS_VECTOR also includes
-funroll-loops, which unnecessarily makes numeric.o larger. So split
CFLAGS_VECTOR into CFLAGS_UNROLL_LOOPS and CFLAGS_VECTOR.
---
 configure                         | 17 +++++++++++------
 configure.in                      |  9 +++++++--
 src/Makefile.global.in            |  1 +
 src/backend/storage/page/Makefile |  2 +-
 src/backend/utils/adt/Makefile    |  3 +++
 src/backend/utils/adt/numeric.c   | 11 ++++++++---
 6 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/configure b/configure
index cb8fbe1051..36ca734ad5 100755
--- a/configure
+++ b/configure
@@ -734,6 +734,7 @@ CPP
 CFLAGS_SL
 BITCODE_CXXFLAGS
 BITCODE_CFLAGS
+CFLAGS_UNROLL_LOOPS
 CFLAGS_VECTOR
 PERMIT_DECLARATION_AFTER_STATEMENT
 LLVM_BINPATH
@@ -5266,10 +5267,13 @@ BITCODE_CFLAGS=""
 user_BITCODE_CXXFLAGS=$BITCODE_CXXFLAGS
 BITCODE_CXXFLAGS=""
 
-# set CFLAGS_VECTOR from the environment, if available
+# set CFLAGS_VECTOR and CFLAGS_UNROLL_LOOPS from the environment, if available
 if test "$ac_env_CFLAGS_VECTOR_set" = set; then
   CFLAGS_VECTOR=$ac_env_CFLAGS_VECTOR_value
 fi
+if test "$ac_env_CFLAGS_UNROLL_LOOPS_set" = set; then
+  CFLAGS_UNROLL_LOOPS=$ac_env_CFLAGS_UNROLL_LOOPS_value
+fi
 
 # Some versions of GCC support some additional useful warning flags.
 # Check whether they are supported, and add them to CFLAGS if so.
@@ -6102,16 +6106,16 @@ if test x"$pgac_cv_prog_CXX_cxxflags__fexcess_precision_standard" = x"yes"; then
 fi
 
 
-  # Optimization flags for specific files that benefit from vectorization
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -funroll-loops, for CFLAGS_VECTOR" >&5
-$as_echo_n "checking whether ${CC} supports -funroll-loops, for CFLAGS_VECTOR... " >&6; }
+  # Optimization flags for specific files that benefit from loop unrolling
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -funroll-loops, for CFLAGS_UNROLL_LOOPS" >&5
+$as_echo_n "checking whether ${CC} supports -funroll-loops, for CFLAGS_UNROLL_LOOPS... " >&6; }
 if ${pgac_cv_prog_CC_cflags__funroll_loops+:} false; then :
   $as_echo_n "(cached) " >&6
 else
   pgac_save_CFLAGS=$CFLAGS
 pgac_save_CC=$CC
 CC=${CC}
-CFLAGS="${CFLAGS_VECTOR} -funroll-loops"
+CFLAGS="${CFLAGS_UNROLL_LOOPS} -funroll-loops"
 ac_save_c_werror_flag=$ac_c_werror_flag
 ac_c_werror_flag=yes
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
@@ -6138,10 +6142,11 @@ fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CC_cflags__funroll_loops" >&5
 $as_echo "$pgac_cv_prog_CC_cflags__funroll_loops" >&6; }
 if test x"$pgac_cv_prog_CC_cflags__funroll_loops" = x"yes"; then
-  CFLAGS_VECTOR="${CFLAGS_VECTOR} -funroll-loops"
+  CFLAGS_UNROLL_LOOPS="${CFLAGS_UNROLL_LOOPS} -funroll-loops"
 fi
 
 
+  # Optimization flags for specific files that benefit from vectorization
   { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -ftree-vectorize, for CFLAGS_VECTOR" >&5
 $as_echo_n "checking whether ${CC} supports -ftree-vectorize, for CFLAGS_VECTOR... " >&6; }
 if ${pgac_cv_prog_CC_cflags__ftree_vectorize+:} false; then :
diff --git a/configure.in b/configure.in
index e91e49a579..17a2b8ca43 100644
--- a/configure.in
+++ b/configure.in
@@ -466,10 +466,13 @@ BITCODE_CFLAGS=""
 user_BITCODE_CXXFLAGS=$BITCODE_CXXFLAGS
 BITCODE_CXXFLAGS=""
 
-# set CFLAGS_VECTOR from the environment, if available
+# set CFLAGS_VECTOR and CFLAGS_UNROLL_LOOPS from the environment, if available
 if test "$ac_env_CFLAGS_VECTOR_set" = set; then
   CFLAGS_VECTOR=$ac_env_CFLAGS_VECTOR_value
 fi
+if test "$ac_env_CFLAGS_UNROLL_LOOPS_set" = set; then
+  CFLAGS_UNROLL_LOOPS=$ac_env_CFLAGS_UNROLL_LOOPS_value
+fi
 
 # Some versions of GCC support some additional useful warning flags.
 # Check whether they are supported, and add them to CFLAGS if so.
@@ -512,8 +515,9 @@ if test "$GCC" = yes -a "$ICC" = no; then
   # Disable FP optimizations that cause various errors on gcc 4.5+ or maybe 4.6+
   PGAC_PROG_CC_CFLAGS_OPT([-fexcess-precision=standard])
   PGAC_PROG_CXX_CFLAGS_OPT([-fexcess-precision=standard])
+  # Optimization flags for specific files that benefit from loop unrolling
+  PGAC_PROG_CC_VAR_OPT(CFLAGS_UNROLL_LOOPS, [-funroll-loops])
   # Optimization flags for specific files that benefit from vectorization
-  PGAC_PROG_CC_VAR_OPT(CFLAGS_VECTOR, [-funroll-loops])
   PGAC_PROG_CC_VAR_OPT(CFLAGS_VECTOR, [-ftree-vectorize])
   # We want to suppress clang's unhelpful unused-command-line-argument warnings
   # but gcc won't complain about unrecognized -Wno-foo switches, so we have to
@@ -556,6 +560,7 @@ elif test "$PORTNAME" = "hpux"; then
 fi
 
 AC_SUBST(CFLAGS_VECTOR)
+AC_SUBST(CFLAGS_UNROLL_LOOPS)
 
 # Determine flags used to emit bitcode for JIT inlining. Need to test
 # for behaviour changing compiler flags, to keep compatibility with
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 9a6265b3a0..05d8af9a44 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -260,6 +260,7 @@ CXX = @CXX@
 CFLAGS = @CFLAGS@
 CFLAGS_SL = @CFLAGS_SL@
 CFLAGS_VECTOR = @CFLAGS_VECTOR@
+CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
 CFLAGS_SSE42 = @CFLAGS_SSE42@
 CFLAGS_ARMV8_CRC32C = @CFLAGS_ARMV8_CRC32C@
 PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
diff --git a/src/backend/storage/page/Makefile b/src/backend/storage/page/Makefile
index 10021e2bb3..d8ec983fc4 100644
--- a/src/backend/storage/page/Makefile
+++ b/src/backend/storage/page/Makefile
@@ -20,4 +20,4 @@ OBJS =  \
 include $(top_srcdir)/src/backend/common.mk
 
 # important optimizations flags for checksum.c
-checksum.o: CFLAGS += ${CFLAGS_VECTOR}
+checksum.o: CFLAGS += ${CFLAGS_VECTOR} ${CFLAGS_UNROLL_LOOPS}
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index 5d2aca8cfe..df0be0f2fd 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -124,6 +124,9 @@ clean distclean maintainer-clean:
 
 like.o: like.c like_match.c
 
+# Some code in numeric.c benefits from auto-vectorization
+numeric.o: CFLAGS += ${CFLAGS_VECTOR}
+
 varlena.o: varlena.c levenshtein.c
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index 1773fa292e..dc0e96cf0d 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -7327,6 +7327,7 @@ mul_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 	int			res_weight;
 	int			maxdigits;
 	int		   *dig;
+	int		   *digptr;
 	int			carry;
 	int			maxdig;
 	int			newdig;
@@ -7463,10 +7464,14 @@ mul_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 		 *
 		 * As above, digits of var2 can be ignored if they don't contribute,
 		 * so we only include digits for which i1+i2+2 <= res_ndigits - 1.
+		 *
+		 * For large precisions, this can become a bottleneck; so keep this for
+		 * loop simple so that it can be auto-vectorized.
 		 */
-		for (i2 = Min(var2ndigits - 1, res_ndigits - i1 - 3), i = i1 + i2 + 2;
-			 i2 >= 0; i2--)
-			dig[i--] += var1digit * var2digits[i2];
+		i2 = Min(var2ndigits - 1, res_ndigits - i1 - 3);
+		digptr = &dig[i1 + 2];
+		for (i = 0; i <= i2; i++)
+			digptr[i] += var1digit * var2digits[i];
 	}
 
 	/*
-- 
2.17.1