From edd188759e4b937089c5bc2259a401cea1f9331f Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Fri, 15 Mar 2024 14:27:52 -0500
Subject: [PATCH v3 3/3] optimize pg_lfind32() by processing "tail" with fewer
 vectors

---
 src/include/port/pg_lfind.h | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index 9d21284724..9c6cce0b69 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -100,7 +100,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
-	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+	uint32 nelem_per_iteration = 4 * nelem_per_vector;
 
 	/* round down to multiple of elements per iteration */
 	uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
@@ -120,7 +120,6 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	i = 0;
 #endif
 
-retry:
 	for (; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -160,6 +159,32 @@ retry:
 		}
 	}
 
+retry:
+	nelem_per_iteration = 2 * nelem_per_vector;
+	tail_idx = nelem & ~(nelem_per_iteration - 1);
+	for (; i < tail_idx; i += nelem_per_iteration)
+	{
+		Vector32	vals1,
+					vals2,
+					result1,
+					result2,
+					result;
+
+		vector32_load(&vals1, &base[i]);
+		vector32_load(&vals2, &base[i + nelem_per_vector]);
+
+		result1 = vector32_eq(keys, vals1);
+		result2 = vector32_eq(keys, vals2);
+
+		result = vector32_or(result1, result2);
+
+		if (vector32_is_highbit_set(result))
+		{
+			Assert(assert_result == true);
+			return true;
+		}
+	}
+
 	if (i == nelem)
 		return false;
 	else if (tail_idx > 0)
-- 
2.25.1

