From 551f54a2a1bc271d8686849269868ae9829add05 Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rasheed@gmail.com>
Date: Thu, 10 Jul 2025 14:45:10 +0100
Subject: [PATCH v4 5/5] Extend int128.h to support more numeric code.

This adds a few more functions to int128.h, allowing more of numeric.c
to use 128-bit integers on all platforms.

Specifically, int64_div_fast_to_numeric() and the following aggregate
functions can now use 128-bit integers for improved performance on all
platforms, rather than just platforms with native support for int128:

- SUM(int8)
- AVG(int8)
- STDDEV_POP(int2 or int4)
- STDDEV_SAMP(int2 or int4)
- VAR_POP(int2 or int4)
- VAR_SAMP(int2 or int4)

In addition to improved performance on platforms lacking native
128-bit integer support, this significantly simplifies this numeric
code by allowing a lot of conditionally compiled code to be deleted.

A couple of numeric functions (div_var_int64() and sqrt_var()) still
contain conditionally compiled 128-bit integer code that only works on
platforms with native 128-bit integer support. Making those work more
portably would require rolling our own higher precision 128-bit
division, which isn't supported for now.
---
 src/backend/utils/adt/numeric.c            | 502 +++++----------------
 src/include/common/int128.h                | 239 ++++++++++
 src/test/modules/test_int128/test_int128.c | 103 ++++-
 3 files changed, 460 insertions(+), 384 deletions(-)

diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index c9233565d57..1f1eb57d832 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -28,6 +28,7 @@
 
 #include "common/hashfn.h"
 #include "common/int.h"
+#include "common/int128.h"
 #include "funcapi.h"
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
@@ -534,10 +535,7 @@ static bool numericvar_to_int32(const NumericVar *var, int32 *result);
 static bool numericvar_to_int64(const NumericVar *var, int64 *result);
 static void int64_to_numericvar(int64 val, NumericVar *var);
 static bool numericvar_to_uint64(const NumericVar *var, uint64 *result);
-#ifdef HAVE_INT128
-static bool numericvar_to_int128(const NumericVar *var, int128 *result);
-static void int128_to_numericvar(int128 val, NumericVar *var);
-#endif
+static void int128_to_numericvar(INT128 val, NumericVar *var);
 static double numericvar_to_double_no_overflow(const NumericVar *var);
 
 static Datum numeric_abbrev_convert(Datum original_datum, SortSupport ssup);
@@ -4463,25 +4461,13 @@ int64_div_fast_to_numeric(int64 val1, int log10val2)
 
 		if (unlikely(pg_mul_s64_overflow(val1, factor, &new_val1)))
 		{
-#ifdef HAVE_INT128
 			/* do the multiplication using 128-bit integers */
-			int128		tmp;
+			INT128		tmp;
 
-			tmp = (int128) val1 * (int128) factor;
+			tmp = int64_to_int128(0);
+			int128_add_int64_mul_int64(&tmp, val1, factor);
 
 			int128_to_numericvar(tmp, &result);
-#else
-			/* do the multiplication using numerics */
-			NumericVar	tmp;
-
-			init_var(&tmp);
-
-			int64_to_numericvar(val1, &result);
-			int64_to_numericvar(factor, &tmp);
-			mul_var(&result, &tmp, &result, 0);
-
-			free_var(&tmp);
-#endif
 		}
 		else
 			int64_to_numericvar(new_val1, &result);
@@ -4901,8 +4887,8 @@ numeric_pg_lsn(PG_FUNCTION_ARGS)
  * Actually, it's a pointer to a NumericAggState allocated in the aggregate
  * context.  The digit buffers for the NumericVars will be there too.
  *
- * On platforms which support 128-bit integers some aggregates instead use a
- * 128-bit integer based transition datatype to speed up calculations.
+ * For integer inputs, some aggregates use special-purpose 64-bit or 128-bit
+ * integer based transition datatypes to speed up calculations.
  *
  * ----------------------------------------------------------------------
  */
@@ -5566,26 +5552,27 @@ numeric_accum_inv(PG_FUNCTION_ARGS)
 
 
 /*
- * Integer data types in general use Numeric accumulators to share code
- * and avoid risk of overflow.
+ * Integer data types in general use Numeric accumulators to share code and
+ * avoid risk of overflow.  However for performance reasons optimized
+ * special-purpose accumulator routines are used when possible:
  *
- * However for performance reasons optimized special-purpose accumulator
- * routines are used when possible.
+ * For 16-bit and 32-bit inputs, N and sum(X) fit into 64-bit, so 64-bit
+ * accumulators are used for SUM and AVG of these data types.
  *
- * On platforms with 128-bit integer support, the 128-bit routines will be
- * used when sum(X) or sum(X*X) fit into 128-bit.
+ * For 16-bit and 32-bit inputs, sum(X^2) fits into 128-bit, so 128-bit
+ * accumulators are used for STDDEV_POP, STDDEV_SAMP, VAR_POP, and VAR_SAMP of
+ * these data types.
  *
- * For 16 and 32 bit inputs, the N and sum(X) fit into 64-bit so the 64-bit
- * accumulators will be used for SUM and AVG of these data types.
+ * For 64-bit inputs, sum(X) fits into 128-bit, so a 128-bit accumulator is
+ * used for SUM(int8) and AVG(int8).
  */
 
-#ifdef HAVE_INT128
 typedef struct Int128AggState
 {
 	bool		calcSumX2;		/* if true, calculate sumX2 */
 	int64		N;				/* count of processed numbers */
-	int128		sumX;			/* sum of processed numbers */
-	int128		sumX2;			/* sum of squares of processed numbers */
+	INT128		sumX;			/* sum of processed numbers */
+	INT128		sumX2;			/* sum of squares of processed numbers */
 } Int128AggState;
 
 /*
@@ -5631,12 +5618,12 @@ makeInt128AggStateCurrentContext(bool calcSumX2)
  * Accumulate a new input value for 128-bit aggregate functions.
  */
 static void
-do_int128_accum(Int128AggState *state, int128 newval)
+do_int128_accum(Int128AggState *state, int64 newval)
 {
 	if (state->calcSumX2)
-		state->sumX2 += newval * newval;
+		int128_add_int64_mul_int64(&state->sumX2, newval, newval);
 
-	state->sumX += newval;
+	int128_add_int64(&state->sumX, newval);
 	state->N++;
 }
 
@@ -5644,43 +5631,28 @@ do_int128_accum(Int128AggState *state, int128 newval)
  * Remove an input value from the aggregated state.
  */
 static void
-do_int128_discard(Int128AggState *state, int128 newval)
+do_int128_discard(Int128AggState *state, int64 newval)
 {
 	if (state->calcSumX2)
-		state->sumX2 -= newval * newval;
+		int128_sub_int64_mul_int64(&state->sumX2, newval, newval);
 
-	state->sumX -= newval;
+	int128_sub_int64(&state->sumX, newval);
 	state->N--;
 }
 
-typedef Int128AggState PolyNumAggState;
-#define makePolyNumAggState makeInt128AggState
-#define makePolyNumAggStateCurrentContext makeInt128AggStateCurrentContext
-#else
-typedef NumericAggState PolyNumAggState;
-#define makePolyNumAggState makeNumericAggState
-#define makePolyNumAggStateCurrentContext makeNumericAggStateCurrentContext
-#endif
-
 Datum
 int2_accum(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Create the state data on the first call */
 	if (state == NULL)
-		state = makePolyNumAggState(fcinfo, true);
+		state = makeInt128AggState(fcinfo, true);
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_accum(state, (int128) PG_GETARG_INT16(1));
-#else
-		do_numeric_accum(state, int64_to_numeric(PG_GETARG_INT16(1)));
-#endif
-	}
+		do_int128_accum(state, PG_GETARG_INT16(1));
 
 	PG_RETURN_POINTER(state);
 }
@@ -5688,22 +5660,16 @@ int2_accum(PG_FUNCTION_ARGS)
 Datum
 int4_accum(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Create the state data on the first call */
 	if (state == NULL)
-		state = makePolyNumAggState(fcinfo, true);
+		state = makeInt128AggState(fcinfo, true);
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_accum(state, (int128) PG_GETARG_INT32(1));
-#else
-		do_numeric_accum(state, int64_to_numeric(PG_GETARG_INT32(1)));
-#endif
-	}
+		do_int128_accum(state, PG_GETARG_INT32(1));
 
 	PG_RETURN_POINTER(state);
 }
@@ -5726,21 +5692,21 @@ int8_accum(PG_FUNCTION_ARGS)
 }
 
 /*
- * Combine function for numeric aggregates which require sumX2
+ * Combine function for Int128AggState for aggregates which require sumX2
  */
 Datum
 numeric_poly_combine(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state1;
-	PolyNumAggState *state2;
+	Int128AggState *state1;
+	Int128AggState *state2;
 	MemoryContext agg_context;
 	MemoryContext old_context;
 
 	if (!AggCheckCallContext(fcinfo, &agg_context))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
-	state1 = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
-	state2 = PG_ARGISNULL(1) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(1);
+	state1 = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
+	state2 = PG_ARGISNULL(1) ? NULL : (Int128AggState *) PG_GETARG_POINTER(1);
 
 	if (state2 == NULL)
 		PG_RETURN_POINTER(state1);
@@ -5750,16 +5716,10 @@ numeric_poly_combine(PG_FUNCTION_ARGS)
 	{
 		old_context = MemoryContextSwitchTo(agg_context);
 
-		state1 = makePolyNumAggState(fcinfo, true);
+		state1 = makeInt128AggState(fcinfo, true);
 		state1->N = state2->N;
-
-#ifdef HAVE_INT128
 		state1->sumX = state2->sumX;
 		state1->sumX2 = state2->sumX2;
-#else
-		accum_sum_copy(&state1->sumX, &state2->sumX);
-		accum_sum_copy(&state1->sumX2, &state2->sumX2);
-#endif
 
 		MemoryContextSwitchTo(old_context);
 
@@ -5769,54 +5729,51 @@ numeric_poly_combine(PG_FUNCTION_ARGS)
 	if (state2->N > 0)
 	{
 		state1->N += state2->N;
+		int128_add_int128(&state1->sumX, state2->sumX);
+		int128_add_int128(&state1->sumX2, state2->sumX2);
+	}
+	PG_RETURN_POINTER(state1);
+}
 
-#ifdef HAVE_INT128
-		state1->sumX += state2->sumX;
-		state1->sumX2 += state2->sumX2;
-#else
-		/* The rest of this needs to work in the aggregate context */
-		old_context = MemoryContextSwitchTo(agg_context);
-
-		/* Accumulate sums */
-		accum_sum_combine(&state1->sumX, &state2->sumX);
-		accum_sum_combine(&state1->sumX2, &state2->sumX2);
+/*
+ * int128_serialize - serialize a 128-bit integer to binary format
+ */
+static inline void
+int128_serialize(StringInfo buf, INT128 val)
+{
+	pq_sendint64(buf, PG_INT128_HI_INT64(val));
+	pq_sendint64(buf, PG_INT128_LO_UINT64(val));
+}
 
-		MemoryContextSwitchTo(old_context);
-#endif
+/*
+ * int128_deserialize - deserialize binary format to a 128-bit integer.
+ */
+static inline INT128
+int128_deserialize(StringInfo buf)
+{
+	int64		hi = pq_getmsgint64(buf);
+	uint64		lo = pq_getmsgint64(buf);
 
-	}
-	PG_RETURN_POINTER(state1);
+	return make_int128(hi, lo);
 }
 
 /*
  * numeric_poly_serialize
- *		Serialize PolyNumAggState into bytea for aggregate functions which
+ *		Serialize Int128AggState into bytea for aggregate functions which
  *		require sumX2.
  */
 Datum
 numeric_poly_serialize(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 	StringInfoData buf;
 	bytea	   *result;
-	NumericVar	tmp_var;
 
 	/* Ensure we disallow calling when not in aggregate context */
 	if (!AggCheckCallContext(fcinfo, NULL))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
-	state = (PolyNumAggState *) PG_GETARG_POINTER(0);
-
-	/*
-	 * If the platform supports int128 then sumX and sumX2 will be a 128 bit
-	 * integer type. Here we'll convert that into a numeric type so that the
-	 * combine state is in the same format for both int128 enabled machines
-	 * and machines which don't support that type. The logic here is that one
-	 * day we might like to send these over to another server for further
-	 * processing and we want a standard format to work with.
-	 */
-
-	init_var(&tmp_var);
+	state = (Int128AggState *) PG_GETARG_POINTER(0);
 
 	pq_begintypsend(&buf);
 
@@ -5824,48 +5781,33 @@ numeric_poly_serialize(PG_FUNCTION_ARGS)
 	pq_sendint64(&buf, state->N);
 
 	/* sumX */
-#ifdef HAVE_INT128
-	int128_to_numericvar(state->sumX, &tmp_var);
-#else
-	accum_sum_final(&state->sumX, &tmp_var);
-#endif
-	numericvar_serialize(&buf, &tmp_var);
+	int128_serialize(&buf, state->sumX);
 
 	/* sumX2 */
-#ifdef HAVE_INT128
-	int128_to_numericvar(state->sumX2, &tmp_var);
-#else
-	accum_sum_final(&state->sumX2, &tmp_var);
-#endif
-	numericvar_serialize(&buf, &tmp_var);
+	int128_serialize(&buf, state->sumX2);
 
 	result = pq_endtypsend(&buf);
 
-	free_var(&tmp_var);
-
 	PG_RETURN_BYTEA_P(result);
 }
 
 /*
  * numeric_poly_deserialize
- *		Deserialize PolyNumAggState from bytea for aggregate functions which
+ *		Deserialize Int128AggState from bytea for aggregate functions which
  *		require sumX2.
  */
 Datum
 numeric_poly_deserialize(PG_FUNCTION_ARGS)
 {
 	bytea	   *sstate;
-	PolyNumAggState *result;
+	Int128AggState *result;
 	StringInfoData buf;
-	NumericVar	tmp_var;
 
 	if (!AggCheckCallContext(fcinfo, NULL))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
 	sstate = PG_GETARG_BYTEA_PP(0);
 
-	init_var(&tmp_var);
-
 	/*
 	 * Initialize a StringInfo so that we can "receive" it using the standard
 	 * recv-function infrastructure.
@@ -5873,31 +5815,19 @@ numeric_poly_deserialize(PG_FUNCTION_ARGS)
 	initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
 						   VARSIZE_ANY_EXHDR(sstate));
 
-	result = makePolyNumAggStateCurrentContext(false);
+	result = makeInt128AggStateCurrentContext(false);
 
 	/* N */
 	result->N = pq_getmsgint64(&buf);
 
 	/* sumX */
-	numericvar_deserialize(&buf, &tmp_var);
-#ifdef HAVE_INT128
-	numericvar_to_int128(&tmp_var, &result->sumX);
-#else
-	accum_sum_add(&result->sumX, &tmp_var);
-#endif
+	result->sumX = int128_deserialize(&buf);
 
 	/* sumX2 */
-	numericvar_deserialize(&buf, &tmp_var);
-#ifdef HAVE_INT128
-	numericvar_to_int128(&tmp_var, &result->sumX2);
-#else
-	accum_sum_add(&result->sumX2, &tmp_var);
-#endif
+	result->sumX2 = int128_deserialize(&buf);
 
 	pq_getmsgend(&buf);
 
-	free_var(&tmp_var);
-
 	PG_RETURN_POINTER(result);
 }
 
@@ -5907,43 +5837,37 @@ numeric_poly_deserialize(PG_FUNCTION_ARGS)
 Datum
 int8_avg_accum(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Create the state data on the first call */
 	if (state == NULL)
-		state = makePolyNumAggState(fcinfo, false);
+		state = makeInt128AggState(fcinfo, false);
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_accum(state, (int128) PG_GETARG_INT64(1));
-#else
-		do_numeric_accum(state, int64_to_numeric(PG_GETARG_INT64(1)));
-#endif
-	}
+		do_int128_accum(state, PG_GETARG_INT64(1));
 
 	PG_RETURN_POINTER(state);
 }
 
 /*
- * Combine function for PolyNumAggState for aggregates which don't require
+ * Combine function for Int128AggState for aggregates which don't require
  * sumX2
  */
 Datum
 int8_avg_combine(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state1;
-	PolyNumAggState *state2;
+	Int128AggState *state1;
+	Int128AggState *state2;
 	MemoryContext agg_context;
 	MemoryContext old_context;
 
 	if (!AggCheckCallContext(fcinfo, &agg_context))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
-	state1 = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
-	state2 = PG_ARGISNULL(1) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(1);
+	state1 = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
+	state2 = PG_ARGISNULL(1) ? NULL : (Int128AggState *) PG_GETARG_POINTER(1);
 
 	if (state2 == NULL)
 		PG_RETURN_POINTER(state1);
@@ -5953,14 +5877,10 @@ int8_avg_combine(PG_FUNCTION_ARGS)
 	{
 		old_context = MemoryContextSwitchTo(agg_context);
 
-		state1 = makePolyNumAggState(fcinfo, false);
+		state1 = makeInt128AggState(fcinfo, false);
 		state1->N = state2->N;
-
-#ifdef HAVE_INT128
 		state1->sumX = state2->sumX;
-#else
-		accum_sum_copy(&state1->sumX, &state2->sumX);
-#endif
+
 		MemoryContextSwitchTo(old_context);
 
 		PG_RETURN_POINTER(state1);
@@ -5969,52 +5889,28 @@ int8_avg_combine(PG_FUNCTION_ARGS)
 	if (state2->N > 0)
 	{
 		state1->N += state2->N;
-
-#ifdef HAVE_INT128
-		state1->sumX += state2->sumX;
-#else
-		/* The rest of this needs to work in the aggregate context */
-		old_context = MemoryContextSwitchTo(agg_context);
-
-		/* Accumulate sums */
-		accum_sum_combine(&state1->sumX, &state2->sumX);
-
-		MemoryContextSwitchTo(old_context);
-#endif
-
+		int128_add_int128(&state1->sumX, state2->sumX);
 	}
 	PG_RETURN_POINTER(state1);
 }
 
 /*
  * int8_avg_serialize
- *		Serialize PolyNumAggState into bytea using the standard
- *		recv-function infrastructure.
+ *		Serialize Int128AggState into bytea for aggregate functions which
+ *		don't require sumX2.
  */
 Datum
 int8_avg_serialize(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 	StringInfoData buf;
 	bytea	   *result;
-	NumericVar	tmp_var;
 
 	/* Ensure we disallow calling when not in aggregate context */
 	if (!AggCheckCallContext(fcinfo, NULL))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
-	state = (PolyNumAggState *) PG_GETARG_POINTER(0);
-
-	/*
-	 * If the platform supports int128 then sumX will be a 128 integer type.
-	 * Here we'll convert that into a numeric type so that the combine state
-	 * is in the same format for both int128 enabled machines and machines
-	 * which don't support that type. The logic here is that one day we might
-	 * like to send these over to another server for further processing and we
-	 * want a standard format to work with.
-	 */
-
-	init_var(&tmp_var);
+	state = (Int128AggState *) PG_GETARG_POINTER(0);
 
 	pq_begintypsend(&buf);
 
@@ -6022,39 +5918,30 @@ int8_avg_serialize(PG_FUNCTION_ARGS)
 	pq_sendint64(&buf, state->N);
 
 	/* sumX */
-#ifdef HAVE_INT128
-	int128_to_numericvar(state->sumX, &tmp_var);
-#else
-	accum_sum_final(&state->sumX, &tmp_var);
-#endif
-	numericvar_serialize(&buf, &tmp_var);
+	int128_serialize(&buf, state->sumX);
 
 	result = pq_endtypsend(&buf);
 
-	free_var(&tmp_var);
-
 	PG_RETURN_BYTEA_P(result);
 }
 
 /*
  * int8_avg_deserialize
- *		Deserialize bytea back into PolyNumAggState.
+ *		Deserialize Int128AggState from bytea for aggregate functions which
+ *		don't require sumX2.
  */
 Datum
 int8_avg_deserialize(PG_FUNCTION_ARGS)
 {
 	bytea	   *sstate;
-	PolyNumAggState *result;
+	Int128AggState *result;
 	StringInfoData buf;
-	NumericVar	tmp_var;
 
 	if (!AggCheckCallContext(fcinfo, NULL))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
 	sstate = PG_GETARG_BYTEA_PP(0);
 
-	init_var(&tmp_var);
-
 	/*
 	 * Initialize a StringInfo so that we can "receive" it using the standard
 	 * recv-function infrastructure.
@@ -6062,23 +5949,16 @@ int8_avg_deserialize(PG_FUNCTION_ARGS)
 	initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
 						   VARSIZE_ANY_EXHDR(sstate));
 
-	result = makePolyNumAggStateCurrentContext(false);
+	result = makeInt128AggStateCurrentContext(false);
 
 	/* N */
 	result->N = pq_getmsgint64(&buf);
 
 	/* sumX */
-	numericvar_deserialize(&buf, &tmp_var);
-#ifdef HAVE_INT128
-	numericvar_to_int128(&tmp_var, &result->sumX);
-#else
-	accum_sum_add(&result->sumX, &tmp_var);
-#endif
+	result->sumX = int128_deserialize(&buf);
 
 	pq_getmsgend(&buf);
 
-	free_var(&tmp_var);
-
 	PG_RETURN_POINTER(result);
 }
 
@@ -6089,24 +5969,16 @@ int8_avg_deserialize(PG_FUNCTION_ARGS)
 Datum
 int2_accum_inv(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Should not get here with no state */
 	if (state == NULL)
 		elog(ERROR, "int2_accum_inv called with NULL state");
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_discard(state, (int128) PG_GETARG_INT16(1));
-#else
-		/* Should never fail, all inputs have dscale 0 */
-		if (!do_numeric_discard(state, int64_to_numeric(PG_GETARG_INT16(1))))
-			elog(ERROR, "do_numeric_discard failed unexpectedly");
-#endif
-	}
+		do_int128_discard(state, PG_GETARG_INT16(1));
 
 	PG_RETURN_POINTER(state);
 }
@@ -6114,24 +5986,16 @@ int2_accum_inv(PG_FUNCTION_ARGS)
 Datum
 int4_accum_inv(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Should not get here with no state */
 	if (state == NULL)
 		elog(ERROR, "int4_accum_inv called with NULL state");
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_discard(state, (int128) PG_GETARG_INT32(1));
-#else
-		/* Should never fail, all inputs have dscale 0 */
-		if (!do_numeric_discard(state, int64_to_numeric(PG_GETARG_INT32(1))))
-			elog(ERROR, "do_numeric_discard failed unexpectedly");
-#endif
-	}
+		do_int128_discard(state, PG_GETARG_INT32(1));
 
 	PG_RETURN_POINTER(state);
 }
@@ -6160,24 +6024,16 @@ int8_accum_inv(PG_FUNCTION_ARGS)
 Datum
 int8_avg_accum_inv(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Should not get here with no state */
 	if (state == NULL)
 		elog(ERROR, "int8_avg_accum_inv called with NULL state");
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_discard(state, (int128) PG_GETARG_INT64(1));
-#else
-		/* Should never fail, all inputs have dscale 0 */
-		if (!do_numeric_discard(state, int64_to_numeric(PG_GETARG_INT64(1))))
-			elog(ERROR, "do_numeric_discard failed unexpectedly");
-#endif
-	}
+		do_int128_discard(state, PG_GETARG_INT64(1));
 
 	PG_RETURN_POINTER(state);
 }
@@ -6185,12 +6041,11 @@ int8_avg_accum_inv(PG_FUNCTION_ARGS)
 Datum
 numeric_poly_sum(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	Numeric		res;
 	NumericVar	result;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* If there were no non-null inputs, return NULL */
 	if (state == NULL || state->N == 0)
@@ -6205,21 +6060,17 @@ numeric_poly_sum(PG_FUNCTION_ARGS)
 	free_var(&result);
 
 	PG_RETURN_NUMERIC(res);
-#else
-	return numeric_sum(fcinfo);
-#endif
 }
 
 Datum
 numeric_poly_avg(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	NumericVar	result;
 	Datum		countd,
 				sumd;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* If there were no non-null inputs, return NULL */
 	if (state == NULL || state->N == 0)
@@ -6235,9 +6086,6 @@ numeric_poly_avg(PG_FUNCTION_ARGS)
 	free_var(&result);
 
 	PG_RETURN_DATUM(DirectFunctionCall2(numeric_div, sumd, countd));
-#else
-	return numeric_avg(fcinfo);
-#endif
 }
 
 Datum
@@ -6470,7 +6318,6 @@ numeric_stddev_pop(PG_FUNCTION_ARGS)
 		PG_RETURN_NUMERIC(res);
 }
 
-#ifdef HAVE_INT128
 static Numeric
 numeric_poly_stddev_internal(Int128AggState *state,
 							 bool variance, bool sample,
@@ -6514,17 +6361,15 @@ numeric_poly_stddev_internal(Int128AggState *state,
 
 	return res;
 }
-#endif
 
 Datum
 numeric_poly_var_samp(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	Numeric		res;
 	bool		is_null;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	res = numeric_poly_stddev_internal(state, true, true, &is_null);
 
@@ -6532,20 +6377,16 @@ numeric_poly_var_samp(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();
 	else
 		PG_RETURN_NUMERIC(res);
-#else
-	return numeric_var_samp(fcinfo);
-#endif
 }
 
 Datum
 numeric_poly_stddev_samp(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	Numeric		res;
 	bool		is_null;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	res = numeric_poly_stddev_internal(state, false, true, &is_null);
 
@@ -6553,20 +6394,16 @@ numeric_poly_stddev_samp(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();
 	else
 		PG_RETURN_NUMERIC(res);
-#else
-	return numeric_stddev_samp(fcinfo);
-#endif
 }
 
 Datum
 numeric_poly_var_pop(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	Numeric		res;
 	bool		is_null;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	res = numeric_poly_stddev_internal(state, true, false, &is_null);
 
@@ -6574,20 +6411,16 @@ numeric_poly_var_pop(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();
 	else
 		PG_RETURN_NUMERIC(res);
-#else
-	return numeric_var_pop(fcinfo);
-#endif
 }
 
 Datum
 numeric_poly_stddev_pop(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	Numeric		res;
 	bool		is_null;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	res = numeric_poly_stddev_internal(state, false, false, &is_null);
 
@@ -6595,9 +6428,6 @@ numeric_poly_stddev_pop(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();
 	else
 		PG_RETURN_NUMERIC(res);
-#else
-	return numeric_stddev_pop(fcinfo);
-#endif
 }
 
 /*
@@ -8330,105 +8160,23 @@ numericvar_to_uint64(const NumericVar *var, uint64 *result)
 	return true;
 }
 
-#ifdef HAVE_INT128
-/*
- * Convert numeric to int128, rounding if needed.
- *
- * If overflow, return false (no error is raised).  Return true if okay.
- */
-static bool
-numericvar_to_int128(const NumericVar *var, int128 *result)
-{
-	NumericDigit *digits;
-	int			ndigits;
-	int			weight;
-	int			i;
-	int128		val,
-				oldval;
-	bool		neg;
-	NumericVar	rounded;
-
-	/* Round to nearest integer */
-	init_var(&rounded);
-	set_var_from_var(var, &rounded);
-	round_var(&rounded, 0);
-
-	/* Check for zero input */
-	strip_var(&rounded);
-	ndigits = rounded.ndigits;
-	if (ndigits == 0)
-	{
-		*result = 0;
-		free_var(&rounded);
-		return true;
-	}
-
-	/*
-	 * For input like 10000000000, we must treat stripped digits as real. So
-	 * the loop assumes there are weight+1 digits before the decimal point.
-	 */
-	weight = rounded.weight;
-	Assert(weight >= 0 && ndigits <= weight + 1);
-
-	/* Construct the result */
-	digits = rounded.digits;
-	neg = (rounded.sign == NUMERIC_NEG);
-	val = digits[0];
-	for (i = 1; i <= weight; i++)
-	{
-		oldval = val;
-		val *= NBASE;
-		if (i < ndigits)
-			val += digits[i];
-
-		/*
-		 * The overflow check is a bit tricky because we want to accept
-		 * INT128_MIN, which will overflow the positive accumulator.  We can
-		 * detect this case easily though because INT128_MIN is the only
-		 * nonzero value for which -val == val (on a two's complement machine,
-		 * anyway).
-		 */
-		if ((val / NBASE) != oldval)	/* possible overflow? */
-		{
-			if (!neg || (-val) != val || val == 0 || oldval < 0)
-			{
-				free_var(&rounded);
-				return false;
-			}
-		}
-	}
-
-	free_var(&rounded);
-
-	*result = neg ? -val : val;
-	return true;
-}
-
 /*
  * Convert 128 bit integer to numeric.
  */
 static void
-int128_to_numericvar(int128 val, NumericVar *var)
+int128_to_numericvar(INT128 val, NumericVar *var)
 {
-	uint128		uval,
-				newuval;
+	int			sign;
 	NumericDigit *ptr;
 	int			ndigits;
+	int32		dig;
 
 	/* int128 can require at most 39 decimal digits; add one for safety */
 	alloc_var(var, 40 / DEC_DIGITS);
-	if (val < 0)
-	{
-		var->sign = NUMERIC_NEG;
-		uval = -val;
-	}
-	else
-	{
-		var->sign = NUMERIC_POS;
-		uval = val;
-	}
+	sign = int128_sign(val);
+	var->sign = sign < 0 ? NUMERIC_NEG : NUMERIC_POS;
 	var->dscale = 0;
-	if (val == 0)
+	if (sign == 0)
 	{
 		var->ndigits = 0;
 		var->weight = 0;
@@ -8440,15 +8188,13 @@ int128_to_numericvar(int128 val, NumericVar *var)
 	{
 		ptr--;
 		ndigits++;
-		newuval = uval / NBASE;
-		*ptr = uval - newuval * NBASE;
-		uval = newuval;
-	} while (uval);
+		int128_div_mod_int32(&val, NBASE, &dig);
+		*ptr = (NumericDigit) (dig < 0 ? -dig : dig);	/* remainder has val's sign; store magnitude */
+	} while (!int128_is_zero(val));
 	var->digits = ptr;
 	var->ndigits = ndigits;
 	var->weight = ndigits - 1;
 }
-#endif
 
 /*
  * Convert a NumericVar to float8; if out of range, return +/- HUGE_VAL
diff --git a/src/include/common/int128.h b/src/include/common/int128.h
index d45296e1ad1..3512f6b30d3 100644
--- a/src/include/common/int128.h
+++ b/src/include/common/int128.h
@@ -37,11 +37,18 @@
  * that a native int128 type would (probably) have.  This makes no difference
  * for ordinary use of INT128, but allows union'ing INT128 with int128 for
  * testing purposes.
+ *
+ * PG_INT128_HI_INT64 and PG_INT128_LO_UINT64 allow the (signed) high and
+ * (unsigned) low 64-bit integer parts to be extracted portably on all
+ * platforms.
  */
 #if USE_NATIVE_INT128
 
 typedef int128 INT128;
 
+#define PG_INT128_HI_INT64(i128)	((int64) ((i128) >> 64))
+#define PG_INT128_LO_UINT64(i128)	((uint64) (i128))
+
 #else
 
 typedef struct
@@ -55,7 +62,28 @@ typedef struct
 #endif
 } INT128;
 
+#define PG_INT128_HI_INT64(i128)	((i128).hi)
+#define PG_INT128_LO_UINT64(i128)	((i128).lo)
+
+#endif
+
+/*
+ * Construct an INT128 from (signed) high and (unsigned) low 64-bit integer
+ * parts.
+ */
+static inline INT128
+make_int128(int64 hi, uint64 lo)
+{
+#if USE_NATIVE_INT128
+	return (((int128) hi) << 64) + lo;
+#else
+	INT128		val;
+
+	val.hi = hi;
+	val.lo = lo;
+	return val;
 #endif
+}
 
 /*
  * Add an unsigned int64 value into an INT128 variable.
@@ -108,6 +136,58 @@ int128_add_int64(INT128 *i128, int64 v)
 #endif
 }
 
+/*
+ * Add an INT128 value into an INT128 variable.
+ */
+static inline void
+int128_add_int128(INT128 *i128, INT128 v)
+{
+#if USE_NATIVE_INT128
+	*i128 += v;
+#else
+	int128_add_uint64(i128, v.lo);
+	i128->hi += v.hi;
+#endif
+}
+
+/*
+ * Subtract an unsigned int64 value from an INT128 variable.
+ */
+static inline void
+int128_sub_uint64(INT128 *i128, uint64 v)
+{
+#if USE_NATIVE_INT128
+	*i128 -= v;
+#else
+	/*
+	 * This is like int128_add_uint64(), except we must propagate a borrow to
+	 * (subtract 1 from) the .hi part if the new .lo part is greater than the
+	 * old .lo part.
+	 */
+	uint64		oldlo = i128->lo;
+
+	i128->lo -= v;
+	i128->hi -= (i128->lo > oldlo);
+#endif
+}
+
+/*
+ * Subtract a signed int64 value from an INT128 variable.
+ */
+static inline void
+int128_sub_int64(INT128 *i128, int64 v)
+{
+#if USE_NATIVE_INT128
+	*i128 -= v;
+#else
+	/* Like int128_add_int64() with the sign of v inverted */
+	uint64		oldlo = i128->lo;
+
+	i128->lo -= v;
+	i128->hi -= (i128->lo > oldlo) + (v >> 63);
+#endif
+}
+
 /*
  * INT64_HI_INT32 extracts the most significant 32 bits of int64 as int32.
  * INT64_LO_UINT32 extracts the least significant 32 bits as uint32.
@@ -178,6 +258,165 @@ int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y)
 #endif
 }
 
+/*
+ * Subtract the 128-bit product x * y of two int64 values from *i128.
+ */
+static inline void
+int128_sub_int64_mul_int64(INT128 *i128, int64 x, int64 y)
+{
+#if USE_NATIVE_INT128
+	*i128 -= (int128) x * (int128) y;
+#else
+	/* Subtract the 128-bit product as four 32x32-bit partial products */
+	if (x != 0 && y != 0)		/* a zero product leaves *i128 unchanged */
+	{
+		int32		x_hi = INT64_HI_INT32(x);
+		uint32		x_lo = INT64_LO_UINT32(x);
+		int32		y_hi = INT64_HI_INT32(y);
+		uint32		y_lo = INT64_LO_UINT32(y);
+		int64		tmp;
+
+		/* the first term: weight 2^64, so it only touches .hi */
+		i128->hi -= (int64) x_hi * (int64) y_hi;
+
+		/* the second term (weight 2^32): sign-extended with the sign of x */
+		tmp = (int64) x_hi * (int64) y_lo;
+		i128->hi -= INT64_HI_INT32(tmp);	/* high half of tmp goes to .hi */
+		int128_sub_uint64(i128, ((uint64) INT64_LO_UINT32(tmp)) << 32);
+
+		/* the third term (weight 2^32): sign-extended with the sign of y */
+		tmp = (int64) x_lo * (int64) y_hi;
+		i128->hi -= INT64_HI_INT32(tmp);	/* high half of tmp goes to .hi */
+		int128_sub_uint64(i128, ((uint64) INT64_LO_UINT32(tmp)) << 32);
+
+		/* the fourth term (weight 1): always unsigned */
+		int128_sub_uint64(i128, (uint64) x_lo * (uint64) y_lo);
+	}
+#endif
+}
+
+/*
+ * Divide an INT128 variable by a signed int32 value, storing the quotient in
+ * *i128 and the remainder in *remainder (same sign as the original *i128).
+ *
+ * Note: This provides no protection against dividing by 0, or dividing
+ * INT128_MIN by -1, which overflows.  It is the caller's responsibility to
+ * guard against those.
+ */
+static inline void
+int128_div_mod_int32(INT128 *i128, int32 v, int32 *remainder)
+{
+#if USE_NATIVE_INT128
+	int128		old_i128 = *i128;
+
+	*i128 /= v;
+	*remainder = (int32) (old_i128 - *i128 * v);
+#else
+	/*
+	 * To avoid any intermediate values overflowing (as happens if INT64_MIN
+	 * is divided by -1), we first compute the quotient abs(*i128) / abs(v)
+	 * using unsigned 64-bit arithmetic, and then fix the signs up at the end.
+	 *
+	 * The quotient is computed using the short division algorithm described
+	 * in Knuth volume 2, section 4.3.1 exercise 16 (cf. div_var_int() in
+	 * numeric.c).  Since the absolute value of the divisor is known to be at
+	 * most 2^31, the remainder carried from one digit to the next is at most
+	 * 2^31 - 1, and so there is no danger of overflow when this is combined
+	 * with the next digit (a 32-bit unsigned integer).
+	 */
+	uint64		n_hi;
+	uint64		n_lo;
+	uint32		d;
+	uint64		q;
+	uint64		r;
+	uint64		tmp;
+
+	/* numerator: absolute value of *i128 */
+	if (i128->hi < 0)
+	{
+		n_hi = 0 - ((uint64) i128->hi);
+		n_lo = 0 - i128->lo;
+		if (n_lo != 0)
+			n_hi--;
+	}
+	else
+	{
+		n_hi = i128->hi;
+		n_lo = i128->lo;
+	}
+
+	/* denominator: |v|, computed without abs() so INT32_MIN is safe */
+	d = (v < 0) ? 0 - (uint32) v : (uint32) v;
+
+	/* quotient and remainder of high 64 bits */
+	q = n_hi / d;
+	r = n_hi % d;
+	n_hi = q;
+
+	/* quotient and remainder of next 32 bits (upper half of n_lo) */
+	tmp = (r << 32) + (n_lo >> 32);
+	q = tmp / d;
+	r = tmp % d;
+
+	/* quotient and remainder of last 32 bits (lower half of n_lo) */
+	tmp = (r << 32) + (uint32) n_lo;
+	n_lo = q << 32;
+	q = tmp / d;
+	r = tmp % d;
+	n_lo += q;
+
+	/* final remainder should have the same sign as *i128 */
+	*remainder = i128->hi < 0 ? (int32) (0 - r) : (int32) r;
+
+	/* store the quotient in *i128, negating it if necessary */
+	if ((i128->hi < 0) != (v < 0))
+	{
+		n_hi = 0 - n_hi;
+		n_lo = 0 - n_lo;
+		if (n_lo != 0)
+			n_hi--;
+	}
+	i128->hi = (int64) n_hi;
+	i128->lo = n_lo;
+#endif
+}
+
+/*
+ * Return true if the INT128 value x is zero, false otherwise.
+ */
+static inline bool
+int128_is_zero(INT128 x)
+{
+#if USE_NATIVE_INT128
+	return !x;
+#else
+	return ((uint64) x.hi | x.lo) == 0;
+#endif
+}
+
+/*
+ * Return -1, 0, or +1 according to whether x is negative, zero, or positive.
+ */
+static inline int
+int128_sign(INT128 x)
+{
+#if USE_NATIVE_INT128
+	/*
+	 * Each comparison yields 0 or 1, so the difference is -1 for negative
+	 * values, +1 for positive values, and 0 for zero.
+	 */
+	return (x > 0) - (x < 0);
+#else
+	/*
+	 * A nonzero .hi part determines the sign by itself; otherwise the value
+	 * is just the unsigned .lo part, which is either zero or positive.
+	 */
+	if (x.hi != 0)
+		return (x.hi < 0) ? -1 : 1;
+	return (x.lo != 0) ? 1 : 0;
+#endif
+}
+
 /*
  * Compare two INT128 values, return -1, 0, or +1.
  */
diff --git a/src/test/modules/test_int128/test_int128.c b/src/test/modules/test_int128/test_int128.c
index 239f2fcc765..da27a8316fd 100644
--- a/src/test/modules/test_int128/test_int128.c
+++ b/src/test/modules/test_int128/test_int128.c
@@ -92,8 +92,13 @@ main(int argc, char **argv)
 		int64		x = pg_prng_uint64(&pg_global_prng_state);
 		int64		y = pg_prng_uint64(&pg_global_prng_state);
 		int64		z = pg_prng_uint64(&pg_global_prng_state);
+		int64		w = pg_prng_uint64(&pg_global_prng_state);
+		int32		z32 = (int32) z;
 		test128		t1;
 		test128		t2;
+		test128		t3;
+		int32		r1;
+		int32		r2;
 
 		/* check unsigned addition */
 		t1.hl.hi = x;
@@ -125,25 +130,111 @@ main(int argc, char **argv)
 			return 1;
 		}
 
-		/* check multiplication */
-		t1.i128 = (int128) x * (int128) y;
+		/* check 128-bit signed addition */
+		t1.hl.hi = x;
+		t1.hl.lo = y;
+		t2 = t1;
+		t3.hl.hi = z;
+		t3.hl.lo = w;
+		t1.i128 += t3.i128;
+		int128_add_int128(&t2.I128, t3.I128);
 
-		t2.hl.hi = t2.hl.lo = 0;
-		int128_add_int64_mul_int64(&t2.I128, x, y);
+		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
+		{
+			printf("%016lX%016lX + %016lX%016lX\n", x, y, z, w);
+			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
+			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
+			return 1;
+		}
+
+		/* check unsigned subtraction */
+		t1.hl.hi = x;
+		t1.hl.lo = y;
+		t2 = t1;
+		t1.i128 -= (int128) (uint64) z;
+		int128_sub_uint64(&t2.I128, (uint64) z);
 
 		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
 		{
-			printf("%lX * %lX\n", x, y);
+			printf("%016lX%016lX - unsigned %lX\n", x, y, z);
 			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
 			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
 			return 1;
 		}
 
+		/* check signed subtraction */
+		t1.hl.hi = x;
+		t1.hl.lo = y;
+		t2 = t1;
+		t1.i128 -= (int128) z;
+		int128_sub_int64(&t2.I128, z);
+
+		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
+		{
+			printf("%016lX%016lX - signed %lX\n", x, y, z);
+			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
+			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
+			return 1;
+		}
+
+		/* check 64x64-bit multiply-add */
+		t1.hl.hi = x;
+		t1.hl.lo = y;
+		t2 = t1;
+		t1.i128 += (int128) z * (int128) w;
+		int128_add_int64_mul_int64(&t2.I128, z, w);
+
+		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
+		{
+			printf("%016lX%016lX + %lX * %lX\n", x, y, z, w);
+			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
+			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
+			return 1;
+		}
+
+		/* check 64x64-bit multiply-subtract */
+		t1.hl.hi = x;
+		t1.hl.lo = y;
+		t2 = t1;
+		t1.i128 -= (int128) z * (int128) w;
+		int128_sub_int64_mul_int64(&t2.I128, z, w);
+
+		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
+		{
+			printf("%016lX%016lX - %lX * %lX\n", x, y, z, w);
+			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
+			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
+			return 1;
+		}
+
+		/* check 128/32-bit division */
+		t3.hl.hi = x;
+		t3.hl.lo = y;
+		t1.i128 = t3.i128 / z32;
+		r1 = (int32) (t3.i128 % z32);
+		t2 = t3;
+		int128_div_mod_int32(&t2.I128, z32, &r2);
+
+		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
+		{
+			printf("%016lX%016lX / signed %08X\n", t3.hl.hi, t3.hl.lo, z32);
+			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
+			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
+			return 1;
+		}
+		if (r1 != r2)
+		{
+			printf("%016lX%016lX %% signed %08X\n", t3.hl.hi, t3.hl.lo, z32);
+			printf("native = %08X\n", r1);
+			printf("result = %08X\n", r2);
+			return 1;
+		}
+
 		/* check comparison */
 		t1.hl.hi = x;
 		t1.hl.lo = y;
 		t2.hl.hi = z;
-		t2.hl.lo = pg_prng_uint64(&pg_global_prng_state);
+		t2.hl.lo = w;
 
 		if (my_int128_compare(t1.i128, t2.i128) !=
 			int128_compare(t1.I128, t2.I128))
-- 
2.43.0