From afc847cc5ad2dc6a7054d452bf6c85d80bfcc527 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 1 Nov 2024 11:46:29 +0000
Subject: [PATCH v8] Optimize pg_memory_is_all_zeros()

pg_memory_is_all_zeros() is currently doing byte per byte comparison and so
could lead to performance regression or penalties when multi bytes comparison
could be done instead.

Let's provide an optimized version that divides the checks into four phases for
efficiency:

- Initial alignment (byte per byte comparison)
- Compare 8 size_t chunks at once using bitwise OR (candidate for SIMD optimization)
- Compare remaining size_t aligned chunks
- Compare remaining bytes (byte per byte comparison)

Code mainly suggested by David Rowley.

In passing, let's make use of this new version in PageIsVerifiedExtended() now
that multi bytes comparison is handled.
---
 src/backend/storage/page/bufpage.c | 13 +-------
 src/include/utils/memutils.h       | 51 ++++++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 15 deletions(-)
  13.4% src/backend/storage/page/
  86.5% src/include/utils/

diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index be6f1f62d2..aa264f61b9 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -89,10 +89,8 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
 {
 	PageHeader	p = (PageHeader) page;
 	size_t	   *pagebytes;
-	int			i;
 	bool		checksum_failure = false;
 	bool		header_sane = false;
-	bool		all_zeroes = false;
 	uint16		checksum = 0;
 
 	/*
@@ -126,18 +124,9 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
 	}
 
 	/* Check all-zeroes case */
-	all_zeroes = true;
 	pagebytes = (size_t *) page;
-	for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
-	{
-		if (pagebytes[i] != 0)
-		{
-			all_zeroes = false;
-			break;
-		}
-	}
 
-	if (all_zeroes)
+	if (pg_memory_is_all_zeros(pagebytes, BLCKSZ))
 		return true;
 
 	/*
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 3590c8bad9..b109a47c81 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -192,17 +192,62 @@ extern MemoryContext BumpContextCreate(MemoryContext parent,
 /*
  * Test if a memory region starting at "ptr" and of size "len" is full of
  * zeroes.
+ *
+ * The test is divided into four phases for efficiency:
+ *  - Initial alignment (byte per byte comparison)
+ *  - Compare 8 size_t chunks at once using bitwise OR
+ *  - Compare remaining size_t aligned chunks
+ *  - Compare remaining bytes (byte per byte comparison)
+ *
+ * Caller must ensure that "ptr" is non-null.
  */
 static inline bool
 pg_memory_is_all_zeros(const void *ptr, size_t len)
 {
-	const char *p = (const char *) ptr;
+	const unsigned char *p = (const unsigned char *) ptr;
+	const unsigned char *end = &p[len];
+	const unsigned char *aligned_end = (const unsigned char *) ((uintptr_t) end & (~(sizeof(size_t) - 1)));
+
+	/* Compare bytes, byte by byte, until the pointer "p" is aligned */
+	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
+	{
+		if (p == end)
+			return true;
+
+		if (*p++ != 0)
+			return false;
+	}
+
+	/*
+	 * Compare 8 size_t chunks at once.
+	 *
+	 * The logic here is that all comparisons can be done in parallel and
+	 * combined with a single OR operation, making it a good candidate for
+	 * SIMD optimization.
+	 */
+	for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
+	{
+		if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
+			(((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
+			(((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
+			(((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
+			return false;
+	}
+
+	/* Compare remaining size_t aligned chunks */
+	for (; p < aligned_end; p += sizeof(size_t))
+	{
+		if (*(size_t *)p != 0)
+			return false;
+	}
 
-	for (size_t i = 0; i < len; i++)
+	/* Compare remaining bytes, byte by byte */
+	while (p < end)
 	{
-		if (p[i] != 0)
+		if (*p++ != 0)
 			return false;
 	}
+
 	return true;
 }
 
-- 
2.34.1

