diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..d154b61555 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -80,6 +80,34 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
 	return false;
 }
 
+static inline bool
+lfind32_2reg_helper(const Vector32 keys, uint32 *base)
+{
+	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
+	Vector32	vals1,
+				vals2,
+				result1,
+				result2,
+				result;
+
+	/* load the next block into 2 registers */
+	vector32_load(&vals1, base);
+	vector32_load(&vals2, &base[nelem_per_vector]);
+
+	/* compare each value to the key */
+	result1 = vector32_eq(keys, vals1);
+	result2 = vector32_eq(keys, vals2);
+
+	/* combine the results into a single variable */
+	result = vector32_or(result1, result2);
+
+	/* see if there was a match */
+	if (vector32_is_highbit_set(result))
+		return true;
+
+	return false;
+}
+
 /*
  * pg_lfind32
  *
@@ -100,7 +128,8 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
-	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+	uint32 nelem_per_iteration = 4 * nelem_per_vector;
+	uint32 remaining;
 
 	/* round down to multiple of elements per iteration */
 	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
@@ -119,6 +148,9 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
+	if (nelem < nelem_per_vector * 2)
+		goto slow_path;
+
 	for (i = 0; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -157,8 +189,21 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	nelem_per_iteration = 2 * nelem_per_vector;
+	remaining = nelem - i;
+
+	if (remaining == 0)
+		return false;
+
+	if (remaining > nelem_per_iteration &&
+		lfind32_2reg_helper(keys, &base[i]))
+		return true;
+
+	return lfind32_2reg_helper(keys, &base[nelem - nelem_per_iteration]);
 #endif							/* ! USE_NO_SIMD */
 
+slow_path:
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
