From 3d186bc8be218c609ff9005d88ea6222b3af4445 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Sat, 14 Feb 2026 19:01:34 +0700
Subject: [PATCH v4 4/6] Enable autovectorizing page checksums with AVX2 where
 available

We already rely on autovectorization for computing page checksums,
but on x86 we can get about twice the performance by annotating
pg_checksum_block() with function target attributes for AVX2,
which uses 256-bit registers.

Co-authored-by: Matthew Sterrett <matthewsterrett2@gmail.com>
Co-authored-by: Andrew Kim <andrew.kim@intel.com>
Reviewed-by: Oleg Tselebrovskiy <o.tselebrovskiy@postgrespro.ru>
Discussion: https://postgr.es/m/CA%2BvA85_5GTu%2BHHniSbvvP%2B8k3%3DxZO%3DWE84NPwiKyxztqvpfZ3Q%40mail.gmail.com
Discussion: https://postgr.es/m/20250911054220.3784-1-root%40ip-172-31-36-228.ec2.internal
---
 config/c-compiler.m4                          | 26 ++++++++++
 configure                                     | 46 ++++++++++++++++++
 configure.ac                                  |  9 ++++
 meson.build                                   | 30 ++++++++++++
 src/backend/storage/page/checksum.c           | 44 ++++++++++++++++-
 src/include/pg_config.h.in                    |  3 ++
 src/include/port/pg_cpu.h                     |  3 ++
 src/include/storage/checksum_block_internal.h | 42 ++++++++++++++++
 src/include/storage/checksum_impl.h           | 48 ++++++-------------
 src/port/pg_cpu_x86.c                         |  6 ++-
 src/tools/pginclude/headerscheck              |  2 +
 11 files changed, 224 insertions(+), 35 deletions(-)
 create mode 100644 src/include/storage/checksum_block_internal.h

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 1509dbfa2ab..1f3e31fc2d3 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -613,6 +613,32 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_SSE42_CRC32_INTRINSICS
 
+# PGAC_AVX2_SUPPORT
+# ---------------------------
+# Check if the compiler supports the AVX2 target attribute.
+# This is used for optimized checksum calculations with runtime detection.
+#
+# If the AVX2 target attribute is supported, sets pgac_avx2_support.
+AC_DEFUN([PGAC_AVX2_SUPPORT],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl
+AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar],
+[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <stdint.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("avx2")))
+    static int avx2_test(void)
+    {
+      return 0;
+    }
+    #endif],
+  [return avx2_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_avx2_support=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX2_SUPPORT
+
 # PGAC_AVX512_PCLMUL_INTRINSICS
 # ---------------------------
 # Check if the compiler supports AVX-512 carryless multiplication
diff --git a/configure b/configure
index 185703289b4..2d2c6308005 100755
--- a/configure
+++ b/configure
@@ -17718,6 +17718,52 @@ $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h
   fi
 fi
 
+# Check for AVX2 target attribute support
+#
+if test x"$host_cpu" = x"x86_64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 target attribute support" >&5
+$as_echo_n "checking for AVX2 target attribute support... " >&6; }
+if ${pgac_cv_avx2_support+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdint.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("avx2")))
+    static int avx2_test(void)
+    {
+      return 0;
+    }
+    #endif
+int
+main ()
+{
+return avx2_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  pgac_cv_avx2_support=yes
+else
+  pgac_cv_avx2_support=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5
+$as_echo "$pgac_cv_avx2_support" >&6; }
+if test x"$pgac_cv_avx2_support" = x"yes"; then
+  pgac_avx2_support=yes
+fi
+
+  if test x"$pgac_avx2_support" = x"yes"; then
+
+$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  fi
+fi
+
 # Check for XSAVE intrinsics
 #
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5
diff --git a/configure.ac b/configure.ac
index 0955b7e4371..0b4c3970b68 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2122,6 +2122,15 @@ else
   fi
 fi
 
+# Check for AVX2 target attribute support
+#
+if test x"$host_cpu" = x"x86_64"; then
+  PGAC_AVX2_SUPPORT()
+  if test x"$pgac_avx2_support" = x"yes"; then
+    AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.])
+  fi
+fi
+
 # Check for XSAVE intrinsics
 #
 PGAC_XSAVE_INTRINSICS()
diff --git a/meson.build b/meson.build
index f6d5842d852..feea3658ff3 100644
--- a/meson.build
+++ b/meson.build
@@ -2377,6 +2377,36 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for AVX2 target attribute support (the probe must fail without it)
+###############################################################
+
+if host_cpu == 'x86_64'
+
+  prog = '''
+#include <immintrin.h>
+#include <stdint.h>
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("avx2")))
+static int avx2_test(void)
+{
+    return 0;
+}
+#endif
+
+int main(void)
+{
+    return avx2_test();
+}
+'''
+
+  if cc.links(prog, name: 'AVX2 support', args: test_c_args)
+    cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
+
 ###############################################################
 # Check for the availability of AVX-512 popcount intrinsics.
 ###############################################################
diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c
index 8716651c8b5..030c44f7308 100644
--- a/src/backend/storage/page/checksum.c
+++ b/src/backend/storage/page/checksum.c
@@ -13,10 +13,52 @@
  */
 #include "postgres.h"
 
+#include "port/pg_cpu.h"
 #include "storage/checksum.h"
 /*
  * The actual code is in storage/checksum_impl.h.  This is done so that
  * external programs can incorporate the checksum code by #include'ing
- * that file from the exported Postgres headers.  (Compare our CRC code.)
+ * that file from the exported Postgres headers.  (Compare our legacy
+ * CRC code in pg_crc.h.)
+ * The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific
+ * coding without affecting external programs.
  */
+#define PG_CHECKSUM_INTERNAL
 #include "storage/checksum_impl.h"	/* IWYU pragma: keep */
+
+
+static uint32
+pg_checksum_block_fallback(const PGChecksummablePage *page)
+{
+#include "storage/checksum_block_internal.h"
+}
+
+/*
+ * Same algorithm as the fallback, compiled with the AVX2 target attribute.
+ */
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+pg_attribute_target("avx2")
+static uint32
+pg_checksum_block_avx2(const PGChecksummablePage *page)
+{
+#include "storage/checksum_block_internal.h"
+}
+#endif							/* USE_AVX2_WITH_RUNTIME_CHECK */
+
+/*
+ * Resolve pg_checksum_block on first call, then checksum via the chosen one.
+ */
+static uint32
+pg_checksum_choose(const PGChecksummablePage *page)
+{
+	pg_checksum_block = pg_checksum_block_fallback;
+
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+	if (x86_feature_available(PG_AVX2))
+		pg_checksum_block = pg_checksum_block_avx2;
+#endif
+
+	return pg_checksum_block(page);
+}
+
+static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose;
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 339268dc8ef..1e43e9b2bc4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -665,6 +665,9 @@
 /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
 #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use AVX2 instructions with a runtime check. */
+#undef USE_AVX2_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
 #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
index c9e03d016e9..3c70fd43a23 100644
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -24,6 +24,9 @@ typedef enum X86FeatureId
 	PG_SSE4_2,
 	PG_POPCNT,
 
+	/* 256-bit registers */
+	PG_AVX2,
+
 	/* 512-bit registers */
 	PG_AVX512_BW,
 	PG_AVX512_VL,
diff --git a/src/include/storage/checksum_block_internal.h b/src/include/storage/checksum_block_internal.h
new file mode 100644
index 00000000000..b4e6987d6b5
--- /dev/null
+++ b/src/include/storage/checksum_block_internal.h
@@ -0,0 +1,42 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksum_block_internal.h
+ *	  Core algorithm for page checksums, semi-private to checksum_impl.h
+ *	  and checksum.c.
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/checksum_block_internal.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* there is deliberately not an #ifndef CHECKSUM_BLOCK_INTERNAL_H here */
+
+uint32		sums[N_SUMS];
+uint32		result = 0;
+uint32		i,
+			j;
+
+/* ensure that the size is compatible with the algorithm */
+Assert(sizeof(PGChecksummablePage) == BLCKSZ);
+
+/* initialize partial checksums to their corresponding offsets */
+memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
+
+/* main checksum calculation */
+for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
+	for (j = 0; j < N_SUMS; j++)
+		CHECKSUM_COMP(sums[j], page->data[i][j]);
+
+/* finally add in two rounds of zeroes for additional mixing */
+for (i = 0; i < 2; i++)
+	for (j = 0; j < N_SUMS; j++)
+		CHECKSUM_COMP(sums[j], 0);
+
+/* xor fold partial checksums together */
+for (i = 0; i < N_SUMS; i++)
+	result ^= sums[i];
+
+return result;
diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h
index 5c2dcbc63e7..8a308e423c3 100644
--- a/src/include/storage/checksum_impl.h
+++ b/src/include/storage/checksum_impl.h
@@ -73,11 +73,10 @@
  * 2e-16 false positive rate within margin of error.
  *
  * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
- * multiplication instruction. As of 2013 the corresponding instruction is
- * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
- * Vectorization requires a compiler to do the vectorization for us. For recent
- * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
- * to achieve vectorization.
+ * multiplication instruction. Examples include x86 AVX2 extensions (vpmulld)
+ * and ARM NEON (vmul.i32). For simplicity we rely on the compiler to do the
+ * vectorization for us. For GCC and clang the flags -funroll-loops
+ * -ftree-vectorize are enough to achieve vectorization.
  *
  * The optimal amount of parallelism to use depends on CPU specific instruction
  * latency, SIMD instruction width, throughput and the amount of registers
@@ -89,8 +88,9 @@
  *
  * The parallelism number 32 was chosen based on the fact that it is the
  * largest state that fits into architecturally visible x86 SSE registers while
- * leaving some free registers for intermediate values. For future processors
- * with 256bit vector registers this will leave some performance on the table.
+ * leaving some free registers for intermediate values. For processors
+ * with 256bit vector registers this leaves some performance on the table.
+ *
  * When vectorization is not available it might be beneficial to restructure
  * the computation to calculate a subset of the columns at a time and perform
  * multiple passes to avoid register spilling. This optimization opportunity
@@ -138,6 +138,9 @@ do { \
 	(checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \
 } while (0)
 
+/* Provide a static definition for external programs */
+#ifndef PG_CHECKSUM_INTERNAL
+
 /*
  * Block checksum algorithm.  The page must be adequately aligned
  * (at least on 4-byte boundary).
@@ -145,34 +148,13 @@ do { \
 static uint32
 pg_checksum_block(const PGChecksummablePage *page)
 {
-	uint32		sums[N_SUMS];
-	uint32		result = 0;
-	uint32		i,
-				j;
-
-	/* ensure that the size is compatible with the algorithm */
-	Assert(sizeof(PGChecksummablePage) == BLCKSZ);
-
-	/* initialize partial checksums to their corresponding offsets */
-	memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
-
-	/* main checksum calculation */
-	for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
-		for (j = 0; j < N_SUMS; j++)
-			CHECKSUM_COMP(sums[j], page->data[i][j]);
-
-	/* finally add in two rounds of zeroes for additional mixing */
-	for (i = 0; i < 2; i++)
-		for (j = 0; j < N_SUMS; j++)
-			CHECKSUM_COMP(sums[j], 0);
-
-	/* xor fold partial checksums together */
-	for (i = 0; i < N_SUMS; i++)
-		result ^= sums[i];
-
-	return result;
+#include "storage/checksum_block_internal.h"
 }
 
+#else
+static uint32 (*pg_checksum_block) (const PGChecksummablePage *page);
+#endif
+
 /*
  * Compute the checksum for a Postgres page.
  *
diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c
index 9b462b186b8..a48c39ade98 100644
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -74,7 +74,7 @@ set_x86_features(void)
 	{
 		uint32		xcr0_val = 0;
 
-		/* second cpuid call on leaf 7 to check extended AVX-512 support */
+		/* second cpuid call on leaf 7 to check extended support */
 		memset(exx, 0, 4 * sizeof(exx[0]));
 #if defined(HAVE__GET_CPUID_COUNT)
 		__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
@@ -87,6 +87,10 @@ set_x86_features(void)
 		xcr0_val = _xgetbv(0);
 #endif
 
+		/* Are YMM registers enabled? */
+		if (mask_available(xcr0_val, XMM | YMM))
+			X86Features[PG_AVX2] = exx[1] >> 5 & 1;
+
 		/* Are ZMM registeres enabled? */
 		if (mask_available(xcr0_val, XMM | YMM |
 						   OPMASK | ZMM0_15 | ZMM16_31))
diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck
index 7a6755991bb..569e749b25a 100755
--- a/src/tools/pginclude/headerscheck
+++ b/src/tools/pginclude/headerscheck
@@ -154,6 +154,8 @@ do
 	test "$f" = src/include/catalog/syscache_ids.h && continue
 	test "$f" = src/include/catalog/syscache_info.h && continue
 
+	test "$f" = src/include/storage/checksum_block_internal.h && continue
+
 	# We can't make these Bison output files compilable standalone
 	# without using "%code require", which old Bison versions lack.
 	# parser/gram.h will be included by parser/gramparse.h anyway.
-- 
2.53.0

