From 9d22b244e257d2e4cccc321b7d5ed6d90f5ea3a4 Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rasheed@gmail.com>
Date: Thu, 18 Jul 2024 18:32:56 +0100
Subject: [PATCH v3 2/2] Optimise numeric multiplication using base-NBASE^2
 arithmetic.

Currently mul_var() uses the schoolbook multiplication algorithm,
which is O(n^2) in the number of NBASE digits. To improve performance
for large inputs, convert the inputs to base NBASE^2 before
multiplying, which effectively halves the number of digits in each
input, theoretically speeding up the computation by a factor of 4. In
practice, the actual speedup for large inputs varies between around 3
and 6 times, depending on the system and compiler used. In turn, this
significantly reduces the runtime of the numeric_big regression test.

For this to work, 64-bit integers are required for the products of
base-NBASE^2 digits, so this works best on 64-bit machines, for which
it is faster whenever the shorter input has more than 4 or 5 NBASE
digits. On 32-bit machines, the additional overheads, especially
during carry propagation and the final conversion back to base-NBASE,
are significantly higher, and it is only faster when the shorter input
has more than around 30 NBASE digits. Therefore, only use this
approach above a platform-dependent threshold.

For inputs below the threshold, the original base-NBASE algorithm is
used, except that it can be simplified because the threshold is low
enough that intermediate carry-propagation passes are not required.
Above the threshold, the available headroom in 64-bit integers is much
larger than for 32-bit integers, so the frequency of carry-propagation
passes is greatly reduced. In addition, unsigned integers are used
throughout, further increasing the headroom.
---
 src/backend/utils/adt/numeric.c | 512 +++++++++++++++++++++++++-------
 1 file changed, 401 insertions(+), 111 deletions(-)

diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index 9b9b88662a..c463901428 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -101,6 +101,29 @@ typedef signed char NumericDigit;
 typedef int16 NumericDigit;
 #endif
 
+/*
+ * Above a certain size threshold, it is faster to multiply numbers by
+ * converting them to base NBASE^2, and use 64-bit integer arithmetic. This
+ * threshold is determined empirically, and is necessarily higher on 32-bit
+ * machines, which are less efficient at performing 64-bit arithmetic.
+ *
+ * To simplify the computation below this threshold, it intentially kept below
+ * the point at which intermediate carry-propagation passes may be necessary.
+ * Therefore, as explained in mul_var(), it must be no larger than
+ * (PG_UINT32_MAX - PG_UINT32_MAX / NBASE) / (NBASE - 1)^2, which is 42 when
+ * NBASE is 10000.
+ */
+#define MUL_64BIT_THRESHOLD_MAX \
+	((PG_UINT32_MAX - PG_UINT32_MAX / NBASE) / (NBASE - 1) / (NBASE - 1))
+
+#if SIZEOF_DATUM < 8
+#define MUL_64BIT_THRESHOLD 30
+#else
+#define MUL_64BIT_THRESHOLD 4
+#endif
+
+#define NBASE_SQR	(NBASE * NBASE)
+
 /*
  * The Numeric type as stored on disk.
  *
@@ -8663,6 +8686,85 @@ sub_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result)
 }
 
 
+/*
+ * div_mod_NBASE_SQR() -
+ *
+ *	Divide a 64-bit integer "num" by NBASE_SQR, returning the quotient and
+ *	remainder.  Technically, the remainder could be a 32-bit integer, but the
+ *	caller actually wants a 64-bit integer, so this is more efficient.
+ */
+static inline uint64
+div_mod_NBASE_SQR(uint64 num, uint64 *rem)
+{
+	uint64		quot;
+
+	/* ----------
+	 * On a 32-bit machine, the compiler does 64-bit division using a builtin
+	 * function such as __udivdi3(), which is very slow.  Replace that with a
+	 * multiply-and-shift algorithm, based on the way compilers do it on
+	 * 64-bit machines.  Assuming that DEC_DIGITS is 4, and NBASE_SQR = 10^8,
+	 * the multiply-and-shift formula is
+	 *
+	 *	quot = num / 10^8 = (num * multiplier) >> 90
+	 *
+	 * where multiplier = ceil(2^90 / 10^8) = 12379400392853802749.
+	 *
+	 * The 2^90 scaling factor here guarantees correct results for all inputs.
+	 * See "Division by Invariant Integers using Multiplication", Torbjorn
+	 * Granlund and Peter L. Montgomery, PLDI '94: Proceedings of the ACM
+	 * SIGPLAN 1994 conference on Programming language design and
+	 * implementation (https://dl.acm.org/doi/pdf/10.1145/178243.178249).
+	 *
+	 * Since num and multiplier are 64-bit unsigned integers, their product is
+	 * a 128-bit unsigned integer, but we only require the high part.  This is
+	 * done by decomposing num and multiplier into high and low 32-bit parts,
+	 * and then computing the upper 64 bits of the full product:
+	 *
+	 *	num * multiplier =
+	 *		(num_hi * multiplier_hi) << 64 +
+	 *		(num_hi * multiplier_lo + num_lo * multiplier_hi) << 32 +
+	 *		num_lo * multiplier_lo
+	 *
+	 * We don't bother with this optimization for other NBASE values.
+	 * ----------
+	 */
+#if SIZEOF_DATUM < 8 && DEC_DIGITS == 4
+	const uint64 multiplier = UINT64CONST(12379400392853802749);
+
+	/* high and low 32-bit parts of num and multiplier */
+#define UINT64_HI32(x) ((uint32) ((x) >> 32))
+#define UINT64_LO32(x) ((uint32) (x))
+	const uint32 multiplier_hi = UINT64_HI32(multiplier);
+	const uint32 multiplier_lo = UINT64_LO32(multiplier);
+	uint32		num_hi = UINT64_HI32(num);
+	uint32		num_lo = UINT64_LO32(num);
+	uint64		tmp1,
+				tmp2,
+				prod_hi;
+
+	/* high 64-bit part of 128-bit product */
+	tmp1 = (uint64) num_hi * multiplier_lo;
+	tmp2 = (uint64) num_lo * multiplier_lo;
+	tmp1 += UINT64_HI32(tmp2);
+	prod_hi = (uint64) num_hi * multiplier_hi;
+	prod_hi += UINT64_HI32(tmp1);
+	tmp2 = (uint64) num_lo * multiplier_hi;
+	tmp2 += UINT64_LO32(tmp1);
+	prod_hi += UINT64_HI32(tmp2);
+
+	/* quotient is in the top 38 bits */
+	quot = prod_hi >> 26;
+#else
+	/* just divide normally */
+	quot = num / NBASE_SQR;
+#endif
+	/* remainder */
+	*rem = num - quot * NBASE_SQR;
+
+	return quot;
+}
+
+
 /*
  * mul_var() -
  *
@@ -8677,10 +8779,6 @@ mul_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 	int			res_sign;
 	int			res_weight;
 	int			maxdigits;
-	int		   *dig;
-	int			carry;
-	int			maxdig;
-	int			newdig;
 	int			var1ndigits;
 	int			var2ndigits;
 	NumericDigit *var1digits;
@@ -8688,7 +8786,8 @@ mul_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 	NumericDigit *res_digits;
 	int			i,
 				i1,
-				i2;
+				i2,
+				i2limit;
 
 	/*
 	 * Arrange for var1 to be the shorter of the two numbers.  This improves
@@ -8729,141 +8828,332 @@ mul_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 		return;
 	}
 
-	/* Determine result sign and (maximum possible) weight */
+	/* Determine result sign */
 	if (var1->sign == var2->sign)
 		res_sign = NUMERIC_POS;
 	else
 		res_sign = NUMERIC_NEG;
-	res_weight = var1->weight + var2->weight + 2;
 
 	/*
-	 * Determine the number of result digits to compute.  If the exact result
-	 * would have more than rscale fractional digits, truncate the computation
-	 * with MUL_GUARD_DIGITS guard digits, i.e., ignore input digits that
-	 * would only contribute to the right of that.  (This will give the exact
-	 * rounded-to-rscale answer unless carries out of the ignored positions
-	 * would have propagated through more than MUL_GUARD_DIGITS digits.)
+	 * We do the arithmetic in an array "dig[]" of unsigned 32-bit or 64-bit
+	 * integers, depending on the size of var1.
 	 *
-	 * Note: an exact computation could not produce more than var1ndigits +
-	 * var2ndigits digits, but we allocate one extra output digit in case
-	 * rscale-driven rounding produces a carry out of the highest exact digit.
-	 */
-	res_ndigits = var1ndigits + var2ndigits + 1;
-	maxdigits = res_weight + 1 + (rscale + DEC_DIGITS - 1) / DEC_DIGITS +
-		MUL_GUARD_DIGITS;
-	res_ndigits = Min(res_ndigits, maxdigits);
+	 * If var1 has more than MUL_64BIT_THRESHOLD digits, we convert the inputs
+	 * to base NBASE^2 and multiply using 64-bit integer arithmetic, which is
+	 * much faster, since schoolbook multiplication is O(N^2) in the number of
+	 * input digits, and working in base NBASE^2 effectively halves "N".
+	 *
+	 * Below this threshold, we work with the original base-NBASE numbers, and
+	 * use 32-bit integer arithmetic.  To simplify the algorithm, we ensure
+	 * that the threshold is low enough so that the number of products being
+	 * added to any element of dig[] is small enough to avoid integer
+	 * overflow.  Furthermore, we need to ensure that overflow doesn't occur
+	 * during the final carry-propagation pass.  The carry values can be as
+	 * large as PG_UINT32_MAX / NBASE, and so the values in dig[] must not
+	 * exceed PG_UINT32_MAX - PG_UINT32_MAX / NBASE.  Since each product of
+	 * digits is at most (NBASE - 1)^2, the number of products must not exceed
+	 * (PG_UINT32_MAX - PG_UINT32_MAX / NBASE) / (NBASE - 1)^2.
+	 */
+	StaticAssertStmt(MUL_64BIT_THRESHOLD <= MUL_64BIT_THRESHOLD_MAX,
+					 "MUL_64BIT_THRESHOLD must not exceed MUL_64BIT_THRESHOLD_MAX");
+
+	if (var1ndigits <= MUL_64BIT_THRESHOLD)
+	{
+		uint32	   *dig,
+				   *dig_i1_2;
+		NumericDigit var1digit;
+		uint32		carry;
+		uint32		newdig;
 
-	if (res_ndigits < 3)
-	{
-		/* All input digits will be ignored; so result is zero */
-		zero_var(result);
-		result->dscale = rscale;
-		return;
-	}
+		/*
+		 * Determine the number of result digits to compute and the (maximum
+		 * possible) result weight.  If the exact result would have more than
+		 * rscale fractional digits, truncate the computation with
+		 * MUL_GUARD_DIGITS guard digits, i.e., ignore input digits that would
+		 * only contribute to the right of that.  (This will give the exact
+		 * rounded-to-rscale answer unless carries out of the ignored
+		 * positions would have propagated through more than MUL_GUARD_DIGITS
+		 * digits.)
+		 *
+		 * Note: an exact computation could not produce more than var1ndigits
+		 * + var2ndigits digits, but we allocate at least one extra output
+		 * digit in case rscale-driven rounding produces a carry out of the
+		 * highest exact digit.
+		 */
+		res_ndigits = var1ndigits + var2ndigits + 1;
+		res_weight = var1->weight + var2->weight + 2;
 
-	/*
-	 * We do the arithmetic in an array "dig[]" of signed int's.  Since
-	 * INT_MAX is noticeably larger than NBASE*NBASE, this gives us headroom
-	 * to avoid normalizing carries immediately.
-	 *
-	 * maxdig tracks the maximum possible value of any dig[] entry; when this
-	 * threatens to exceed INT_MAX, we take the time to propagate carries.
-	 * Furthermore, we need to ensure that overflow doesn't occur during the
-	 * carry propagation passes either.  The carry values could be as much as
-	 * INT_MAX/NBASE, so really we must normalize when digits threaten to
-	 * exceed INT_MAX - INT_MAX/NBASE.
-	 *
-	 * To avoid overflow in maxdig itself, it actually represents the max
-	 * possible value divided by NBASE-1, ie, at the top of the loop it is
-	 * known that no dig[] entry exceeds maxdig * (NBASE-1).
-	 */
-	dig = (int *) palloc0(res_ndigits * sizeof(int));
-	maxdig = 0;
+		maxdigits = res_weight + 1 + (rscale + DEC_DIGITS - 1) / DEC_DIGITS +
+			MUL_GUARD_DIGITS;
+		res_ndigits = Min(res_ndigits, maxdigits);
 
-	/*
-	 * The least significant digits of var1 should be ignored if they don't
-	 * contribute directly to the first res_ndigits digits of the result that
-	 * we are computing.
-	 *
-	 * Digit i1 of var1 and digit i2 of var2 are multiplied and added to digit
-	 * i1+i2+2 of the accumulator array, so we need only consider digits of
-	 * var1 for which i1 <= res_ndigits - 3.
-	 */
-	for (i1 = Min(var1ndigits - 1, res_ndigits - 3); i1 >= 0; i1--)
-	{
-		NumericDigit var1digit = var1digits[i1];
+		if (res_ndigits < 3)
+		{
+			/* All input digits will be ignored; so result is zero */
+			zero_var(result);
+			result->dscale = rscale;
+			return;
+		}
 
-		if (var1digit == 0)
-			continue;
+		/* Allocate dig[] to accumulate the digit products */
+		dig = (uint32 *) palloc(res_ndigits * sizeof(uint32));
+
+		/*
+		 * Start by multiplying var2 by the least significant contributing
+		 * digit of var1, storing the results at the end of dig[], and filling
+		 * the leading slots with zeros.
+		 *
+		 * The least significant digits of var1 should be ignored if they
+		 * don't contribute directly to the first res_ndigits digits of the
+		 * result that we are computing.
+		 *
+		 * Digit i1 of var1 and digit i2 of var2 are multiplied and added to
+		 * digit i1+i2+2 of the accumulator array, so we need only consider
+		 * digits of var1 for which i1 <= res_ndigits - 3.
+		 *
+		 * The loop here is the same as the inner loop below, except that we
+		 * set the results in dig[], rather than adding to them.  This is the
+		 * performance bottleneck for multiplication, so we want to keep it
+		 * simple enough so that it can be auto-vectorized.  Accordingly,
+		 * process the digits left-to-right even though schoolbook
+		 * multiplication would suggest right-to-left.  Since we aren't
+		 * propagating carries in this loop, the order does not matter.
+		 */
+		i1 = Min(var1ndigits - 1, res_ndigits - 3);
+		var1digit = var1digits[i1];
+
+		i2limit = Min(var2ndigits, res_ndigits - i1 - 2);
+		dig_i1_2 = &dig[i1 + 2];
+
+		memset(dig, 0, (i1 + 2) * sizeof(uint32));
+		for (i2 = 0; i2 < i2limit; i2++)
+			dig_i1_2[i2] = var1digit * var2digits[i2];
 
-		/* Time to normalize? */
-		maxdig += var1digit;
-		if (maxdig > (INT_MAX - INT_MAX / NBASE) / (NBASE - 1))
+		/*
+		 * Next, multiply var2 by the remaining digits of var1, adding the
+		 * results to dig[] at the appropriate offsets.
+		 */
+		for (i1 = i1 - 1; i1 >= 0; i1--)
 		{
-			/* Yes, do it */
-			carry = 0;
-			for (i = res_ndigits - 1; i >= 0; i--)
+			var1digit = var1digits[i1];
+			if (var1digit != 0)
 			{
-				newdig = dig[i] + carry;
-				if (newdig >= NBASE)
-				{
-					carry = newdig / NBASE;
-					newdig -= carry * NBASE;
-				}
-				else
-					carry = 0;
-				dig[i] = newdig;
+				i2limit = Min(var2ndigits, res_ndigits - i1 - 2);
+				dig_i1_2 = &dig[i1 + 2];
+
+				for (i2 = 0; i2 < i2limit; i2++)
+					dig_i1_2[i2] += var1digit * var2digits[i2];
 			}
-			Assert(carry == 0);
-			/* Reset maxdig to indicate new worst-case */
-			maxdig = 1 + var1digit;
 		}
 
 		/*
-		 * Add the appropriate multiple of var2 into the accumulator.
+		 * Finally, construct the result digits by propagating carries up,
+		 * normalizing back to base-NBASE.  Note that this is still done at
+		 * full precision w/guard digits.
+		 */
+		alloc_var(result, res_ndigits);
+		res_digits = result->digits;
+		carry = 0;
+		for (i = res_ndigits - 1; i >= 0; i--)
+		{
+			newdig = dig[i] + carry;
+			if (newdig >= NBASE)
+			{
+				carry = newdig / NBASE;
+				newdig -= carry * NBASE;
+			}
+			else
+				carry = 0;
+			res_digits[i] = newdig;
+		}
+		Assert(carry == 0);
+
+		pfree(dig);
+	}
+	else
+	{
+		int			var1ndigitpairs;
+		int			var2ndigitpairs;
+		int			res_ndigitpairs;
+		int			pair_offset;
+		int			maxdigitpairs;
+		uint64	   *dig,
+				   *dig_i1_off;
+		uint32	   *var2digitpairs;
+		uint32		var1digitpair;
+		uint64		maxdig;
+		uint64		carry;
+		uint64		newdig;
+
+		/*
+		 * As above, determine the number of result digits to compute and the
+		 * (maximum possible) result weight, except that here we will be
+		 * working in base NBASE^2 and so we process the digits of each input
+		 * in pairs.
+		 */
+		/* digit pairs in each input */
+		var1ndigitpairs = (var1ndigits + 1) / 2;
+		var2ndigitpairs = (var2ndigits + 1) / 2;
+
+		/* digits in exact result */
+		res_ndigits = var1ndigits + var2ndigits;
+
+		/* digit pairs in exact result with at least one extra output digit */
+		res_ndigitpairs = res_ndigits / 2 + 1;
+
+		/* pair offset to align result to end of dig[] */
+		pair_offset = res_ndigitpairs - var1ndigitpairs - var2ndigitpairs + 1;
+
+		/* maximum possible result weight */
+		res_weight = var1->weight + var2->weight + 1 + 2 * res_ndigitpairs -
+			res_ndigits;
+
+		/* truncate computation based on requested rscale */
+		maxdigits = res_weight + 1 + (rscale + DEC_DIGITS - 1) / DEC_DIGITS +
+			MUL_GUARD_DIGITS;
+		maxdigitpairs = (maxdigits + 1) / 2;
+
+		res_ndigitpairs = Min(res_ndigitpairs, maxdigitpairs);
+		res_ndigits = 2 * res_ndigitpairs;
+
+		if (res_ndigitpairs <= pair_offset)
+		{
+			/* All input digits will be ignored; so result is zero */
+			zero_var(result);
+			result->dscale = rscale;
+			return;
+		}
+		var1ndigitpairs = Min(var1ndigitpairs, res_ndigitpairs - pair_offset);
+		var2ndigitpairs = Min(var2ndigitpairs, res_ndigitpairs - pair_offset);
+
+		/*
+		 * Since we will be working in base NBASE^2, we make dig[] an array of
+		 * unsigned 64-bit integers, and since PG_UINT64_MAX is much larger
+		 * than NBASE^4, this gives us a lot of headroom to avoid normalizing
+		 * carries immediately.
+		 *
+		 * maxdig tracks the maximum possible value of any dig[] entry; when
+		 * this threatens to exceed PG_UINT64_MAX, we take the time to
+		 * propagate carries.  Furthermore, we need to ensure that overflow
+		 * doesn't occur during the carry propagation passes either.  The
+		 * carry values could be as much as PG_UINT64_MAX / NBASE^2, so really
+		 * we must normalize when digits threaten to exceed PG_UINT64_MAX -
+		 * PG_UINT64_MAX / NBASE^2.
 		 *
-		 * As above, digits of var2 can be ignored if they don't contribute,
-		 * so we only include digits for which i1+i2+2 < res_ndigits.
+		 * To avoid overflow in maxdig itself, it actually represents the
+		 * maximum possible value divided by NBASE^2-1, i.e., at the top of
+		 * the loop it is known that no dig[] entry exceeds maxdig *
+		 * (NBASE^2-1).
 		 *
-		 * This inner loop is the performance bottleneck for multiplication,
-		 * so we want to keep it simple enough so that it can be
-		 * auto-vectorized.  Accordingly, process the digits left-to-right
-		 * even though schoolbook multiplication would suggest right-to-left.
-		 * Since we aren't propagating carries in this loop, the order does
-		 * not matter.
+		 * The conversion of var1 to base NBASE^2 is done on the fly, as each
+		 * new digit is required.  The digits of var2 are converted upfront,
+		 * and stored at the end of dig[].  To avoid loss of precision, the
+		 * input digits are aligned with the start of digit pair array,
+		 * effectively shifting them up (multiplying by NBASE) if the input
+		 * has an odd number of NBASE digits.
+		 */
+		dig = (uint64 *) palloc(res_ndigitpairs * sizeof(uint64) +
+								var2ndigitpairs * sizeof(uint32));
+
+		/* convert var2 to base NBASE^2, shifting up if length is odd */
+		var2digitpairs = (uint32 *) (dig + res_ndigitpairs);
+
+		for (i2 = 0; i2 < var2ndigitpairs - 1; i2++)
+			var2digitpairs[i2] = var2digits[2 * i2] * NBASE + var2digits[2 * i2 + 1];
+
+		if (2 * i2 + 1 < var2ndigits)
+			var2digitpairs[i2] = var2digits[2 * i2] * NBASE + var2digits[2 * i2 + 1];
+		else
+			var2digitpairs[i2] = var2digits[2 * i2] * NBASE;
+
+		/*
+		 * As above, we start by multiplying var2 by the least significant
+		 * contributing digit pair from var1, storing the results at the end
+		 * of dig[], and filling the leading slots with zeros.
+		 *
+		 * Here, however, digit pair i1 of var1 and digit pair i2 of var2 are
+		 * multiplied and added to digit i1+i2+pair_offset of the accumulator
+		 * array.
+		 */
+		i1 = var1ndigitpairs - 1;
+		if (2 * i1 + 1 < var1ndigits)
+			var1digitpair = var1digits[2 * i1] * NBASE + var1digits[2 * i1 + 1];
+		else
+			var1digitpair = var1digits[2 * i1] * NBASE;
+		maxdig = var1digitpair;
+
+		i2limit = Min(var2ndigitpairs, res_ndigitpairs - i1 - pair_offset);
+		dig_i1_off = &dig[i1 + pair_offset];
+
+		memset(dig, 0, (i1 + pair_offset) * sizeof(uint64));
+		for (i2 = 0; i2 < i2limit; i2++)
+			dig_i1_off[i2] = (uint64) var1digitpair * var2digitpairs[i2];
+
+		/*
+		 * Next, multiply var2 by the remaining digit pairs of var1, adding
+		 * the results to dig[] at the appropriate offsets.
 		 */
+		for (i1 = i1 - 1; i1 >= 0; i1--)
 		{
-			int			i2limit = Min(var2ndigits, res_ndigits - i1 - 2);
-			int		   *dig_i1_2 = &dig[i1 + 2];
+			var1digitpair = var1digits[2 * i1] * NBASE + var1digits[2 * i1 + 1];
+			if (var1digitpair == 0)
+				continue;
+
+			/* Time to normalize? */
+			maxdig += var1digitpair;
+			if (maxdig > (PG_UINT64_MAX - PG_UINT64_MAX / NBASE_SQR) / (NBASE_SQR - 1))
+			{
+				/* Yes, do it (to base NBASE^2) */
+				carry = 0;
+				for (i = res_ndigitpairs - 1; i >= 0; i--)
+				{
+					newdig = dig[i] + carry;
+					if (newdig >= NBASE_SQR)
+						carry = div_mod_NBASE_SQR(newdig, &newdig);
+					else
+						carry = 0;
+					dig[i] = newdig;
+				}
+				Assert(carry == 0);
+				/* Reset maxdig to indicate new worst-case */
+				maxdig = 1 + var1digitpair;
+			}
+
+			/* Multiply and add */
+			i2limit = Min(var2ndigitpairs, res_ndigitpairs - i1 - pair_offset);
+			dig_i1_off = &dig[i1 + pair_offset];
 
 			for (i2 = 0; i2 < i2limit; i2++)
-				dig_i1_2[i2] += var1digit * var2digits[i2];
+				dig_i1_off[i2] += (uint64) var1digitpair * var2digitpairs[i2];
 		}
-	}
 
-	/*
-	 * Now we do a final carry propagation pass to normalize the result, which
-	 * we combine with storing the result digits into the output. Note that
-	 * this is still done at full precision w/guard digits.
-	 */
-	alloc_var(result, res_ndigits);
-	res_digits = result->digits;
-	carry = 0;
-	for (i = res_ndigits - 1; i >= 0; i--)
-	{
-		newdig = dig[i] + carry;
-		if (newdig >= NBASE)
+		/*
+		 * Now we do a final carry propagation pass to normalize back to base
+		 * NBASE^2, and construct the base-NBASE result digits.
+		 */
+		alloc_var(result, res_ndigits);
+		res_digits = result->digits;
+		carry = 0;
+		for (i = res_ndigitpairs - 1; i >= 0; i--)
 		{
-			carry = newdig / NBASE;
-			newdig -= carry * NBASE;
+			newdig = dig[i] + carry;
+			if (newdig >= NBASE_SQR)
+				carry = div_mod_NBASE_SQR(newdig, &newdig);
+			else
+				carry = 0;
+			res_digits[2 * i + 1] = (NumericDigit) ((uint32) newdig % NBASE);
+			res_digits[2 * i] = (NumericDigit) ((uint32) newdig / NBASE);
 		}
-		else
-			carry = 0;
-		res_digits[i] = newdig;
-	}
-	Assert(carry == 0);
+		Assert(carry == 0);
 
-	pfree(dig);
+		pfree(dig);
+
+		/*
+		 * Adjust the result weight, if the inputs were shifted up during base
+		 * conversion (if they had an odd number of NBASE digits).
+		 */
+		res_weight -= (var1ndigits & 1) + (var2ndigits & 1);
+	}
 
 	/*
 	 * Finally, round the result to the requested precision.
-- 
2.35.3