From bc23675ced26e11359f19e67cd57e92fd64744f8 Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rasheed@gmail.com>
Date: Sat, 21 Jun 2025 13:00:50 +0100
Subject: [PATCH v4 3/5] Optimise non-native 128-bit addition in int128.h.

On platforms without native 128-bit integer support, the unsigned
addition code in int128.h can be made significantly simpler and faster
by noting that the low-part addition is unsigned integer arithmetic,
which is just modular arithmetic, and so the test for carry can be
written as a single "new < old" test. This can then be made branchless
to produce the same machine instructions as native 128-bit addition.

The signed addition case can be coded in almost the same way, with
just a single extra term to compensate for the sign of the input.
Again, this is intended to be branchless, and to match the native
128-bit integer addition code.
---
 src/include/common/int128.h | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/src/include/common/int128.h b/src/include/common/int128.h
index 8c300e56d9a..0f7e90ee887 100644
--- a/src/include/common/int128.h
+++ b/src/include/common/int128.h
@@ -68,17 +68,17 @@ int128_add_uint64(INT128 *i128, uint64 v)
 #else
 	/*
 	 * First add the value to the .lo part, then check to see if a carry needs
-	 * to be propagated into the .hi part.  A carry is needed if both inputs
-	 * have high bits set, or if just one input has high bit set while the new
-	 * .lo part doesn't.  Remember that .lo part is unsigned; we cast to
-	 * signed here just as a cheap way to check the high bit.
+	 * to be propagated into the .hi part.  Since this is unsigned integer
+	 * arithmetic, which is just modular arithmetic, a carry is needed if the
+	 * new .lo part is less than the old .lo part (i.e., if modular
+	 * wrap-around occurred).  Writing this in the form below, rather than
+	 * using an "if" statement, causes modern compilers to produce branchless
+	 * machine code identical to the native code.
 	 */
 	uint64		oldlo = i128->lo;
 
 	i128->lo += v;
-	if (((int64) v < 0 && (int64) oldlo < 0) ||
-		(((int64) v < 0 || (int64) oldlo < 0) && (int64) i128->lo >= 0))
-		i128->hi++;
+	i128->hi += (i128->lo < oldlo);
 #endif
 }
 
@@ -93,23 +93,18 @@ int128_add_int64(INT128 *i128, int64 v)
 #else
 	/*
 	 * This is much like the above except that the carry logic differs for
-	 * negative v.  Ordinarily we'd need to subtract 1 from the .hi part
-	 * (corresponding to adding the sign-extended bits of v to it); but if
-	 * there is a carry out of the .lo part, that cancels and we do nothing.
+	 * negative v -- we need to subtract 1 from the .hi part if the new .lo
+	 * value is greater than the old .lo value.  That can be achieved without
+	 * any branching by adding the sign of v (v >> 63 = 0 or -1, assuming an
+	 * arithmetic right shift) to the previous result (for negative v, if the
+	 * new .lo value is less than the old .lo value, the two terms cancel and
+	 * .hi is left unchanged; otherwise we subtract 1 from .hi).  Again, this
+	 * produces identical output to the native code with modern compilers.
 	 */
 	uint64		oldlo = i128->lo;
 
 	i128->lo += v;
-	if (v >= 0)
-	{
-		if ((int64) oldlo < 0 && (int64) i128->lo >= 0)
-			i128->hi++;
-	}
-	else
-	{
-		if (!((int64) oldlo < 0 || (int64) i128->lo >= 0))
-			i128->hi--;
-	}
+	i128->hi += (i128->lo < oldlo) + (v >> 63);
 #endif
 }
 
-- 
2.43.0