micro-optimize nbtcompare.c routines
Here's a patch that adjusts several routines in nbtcompare.c and related
files to use the branchless integer comparison functions added in commit
6b80394. It's probably unlikely this produces a measurable benefit (at
least I've been unable to find any in my admittedly-limited testing), but
in theory it should save a cycle here and there. I was hoping that this
would trim many lines of code, but maintaining the STRESS_SORT_INT_MIN
stuff eats up most of what we save.
Anyway, I don't feel too strongly about this patch, but I went to the
trouble of writing it, and so I figured I'd post it.
--
nathan
Attachments:
v1-0001-micro-optimize-nbtcompare.c-routines.patchtext/plain; charset=us-asciiDownload
From bd2e1eb92b28cf855b529b09813f1b09084c7cfc Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Thu, 26 Sep 2024 15:03:32 -0500
Subject: [PATCH v1 1/1] micro-optimize nbtcompare.c routines
---
src/backend/access/nbtree/nbtcompare.c | 162 ++++++++++++-------------
src/backend/storage/page/itemptr.c | 18 +--
src/backend/utils/sort/tuplesort.c | 21 ++--
3 files changed, 96 insertions(+), 105 deletions(-)
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
index 1c72867c84..2df0c5dbe0 100644
--- a/src/backend/access/nbtree/nbtcompare.c
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -57,17 +57,16 @@
#include <limits.h>
+#include "common/int.h"
#include "utils/fmgrprotos.h"
#include "utils/sortsupport.h"
-#ifdef STRESS_SORT_INT_MIN
-#define A_LESS_THAN_B INT_MIN
-#define A_GREATER_THAN_B INT_MAX
-#else
-#define A_LESS_THAN_B (-1)
-#define A_GREATER_THAN_B 1
-#endif
-
+#define STRESS_SORT_INT_MIN_CMP(a, b) \
+( \
+ ((a) > (b)) ? INT_MAX : \
+ ((a) < (b)) ? INT_MIN : \
+ 0 \
+)
Datum
btboolcmp(PG_FUNCTION_ARGS)
@@ -84,7 +83,11 @@ btint2cmp(PG_FUNCTION_ARGS)
int16 a = PG_GETARG_INT16(0);
int16 b = PG_GETARG_INT16(1);
- PG_RETURN_INT32((int32) a - (int32) b);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(a, b));
+#else
+ PG_RETURN_INT32(pg_cmp_s16(a, b));
+#endif
}
static int
@@ -93,7 +96,11 @@ btint2fastcmp(Datum x, Datum y, SortSupport ssup)
int16 a = DatumGetInt16(x);
int16 b = DatumGetInt16(y);
- return (int) a - (int) b;
+#ifdef STRESS_SORT_INT_MIN
+ return STRESS_SORT_INT_MIN_CMP(a, b);
+#else
+ return pg_cmp_s16(a, b);
+#endif
}
Datum
@@ -111,12 +118,11 @@ btint4cmp(PG_FUNCTION_ARGS)
int32 a = PG_GETARG_INT32(0);
int32 b = PG_GETARG_INT32(1);
- if (a > b)
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else if (a == b)
- PG_RETURN_INT32(0);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(a, b));
+#else
+ PG_RETURN_INT32(pg_cmp_s32(a, b));
+#endif
}
Datum
@@ -134,12 +140,11 @@ btint8cmp(PG_FUNCTION_ARGS)
int64 a = PG_GETARG_INT64(0);
int64 b = PG_GETARG_INT64(1);
- if (a > b)
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else if (a == b)
- PG_RETURN_INT32(0);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(a, b));
+#else
+ PG_RETURN_INT32(pg_cmp_s64(a, b));
+#endif
}
#if SIZEOF_DATUM < 8
@@ -149,12 +154,11 @@ btint8fastcmp(Datum x, Datum y, SortSupport ssup)
int64 a = DatumGetInt64(x);
int64 b = DatumGetInt64(y);
- if (a > b)
- return A_GREATER_THAN_B;
- else if (a == b)
- return 0;
- else
- return A_LESS_THAN_B;
+#ifdef STRESS_SORT_INT_MIN
+ return STRESS_SORT_INT_MIN_CMP(a, b);
+#else
+ return pg_cmp_s64(a, b);
+#endif
}
#endif
@@ -177,12 +181,11 @@ btint48cmp(PG_FUNCTION_ARGS)
int32 a = PG_GETARG_INT32(0);
int64 b = PG_GETARG_INT64(1);
- if (a > b)
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else if (a == b)
- PG_RETURN_INT32(0);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(a, b));
+#else
+ PG_RETURN_INT32(pg_cmp_s64(a, b));
+#endif
}
Datum
@@ -191,12 +194,11 @@ btint84cmp(PG_FUNCTION_ARGS)
int64 a = PG_GETARG_INT64(0);
int32 b = PG_GETARG_INT32(1);
- if (a > b)
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else if (a == b)
- PG_RETURN_INT32(0);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(a, b));
+#else
+ PG_RETURN_INT32(pg_cmp_s64(a, b));
+#endif
}
Datum
@@ -205,12 +207,11 @@ btint24cmp(PG_FUNCTION_ARGS)
int16 a = PG_GETARG_INT16(0);
int32 b = PG_GETARG_INT32(1);
- if (a > b)
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else if (a == b)
- PG_RETURN_INT32(0);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(a, b));
+#else
+ PG_RETURN_INT32(pg_cmp_s32(a, b));
+#endif
}
Datum
@@ -219,12 +220,11 @@ btint42cmp(PG_FUNCTION_ARGS)
int32 a = PG_GETARG_INT32(0);
int16 b = PG_GETARG_INT16(1);
- if (a > b)
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else if (a == b)
- PG_RETURN_INT32(0);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(a, b));
+#else
+ PG_RETURN_INT32(pg_cmp_s32(a, b));
+#endif
}
Datum
@@ -233,12 +233,11 @@ btint28cmp(PG_FUNCTION_ARGS)
int16 a = PG_GETARG_INT16(0);
int64 b = PG_GETARG_INT64(1);
- if (a > b)
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else if (a == b)
- PG_RETURN_INT32(0);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(a, b));
+#else
+ PG_RETURN_INT32(pg_cmp_s64(a, b));
+#endif
}
Datum
@@ -247,12 +246,11 @@ btint82cmp(PG_FUNCTION_ARGS)
int64 a = PG_GETARG_INT64(0);
int16 b = PG_GETARG_INT16(1);
- if (a > b)
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else if (a == b)
- PG_RETURN_INT32(0);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(a, b));
+#else
+ PG_RETURN_INT32(pg_cmp_s64(a, b));
+#endif
}
Datum
@@ -261,12 +259,11 @@ btoidcmp(PG_FUNCTION_ARGS)
Oid a = PG_GETARG_OID(0);
Oid b = PG_GETARG_OID(1);
- if (a > b)
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else if (a == b)
- PG_RETURN_INT32(0);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(a, b));
+#else
+ PG_RETURN_INT32(pg_cmp_u32(a, b));
+#endif
}
static int
@@ -275,12 +272,11 @@ btoidfastcmp(Datum x, Datum y, SortSupport ssup)
Oid a = DatumGetObjectId(x);
Oid b = DatumGetObjectId(y);
- if (a > b)
- return A_GREATER_THAN_B;
- else if (a == b)
- return 0;
- else
- return A_LESS_THAN_B;
+#ifdef STRESS_SORT_INT_MIN
+ return STRESS_SORT_INT_MIN_CMP(a, b);
+#else
+ return pg_cmp_u32(a, b);
+#endif
}
Datum
@@ -305,12 +301,16 @@ btoidvectorcmp(PG_FUNCTION_ARGS)
for (i = 0; i < a->dim1; i++)
{
- if (a->values[i] != b->values[i])
+ Oid aval = a->values[i];
+ Oid bval = b->values[i];
+
+ if (aval != bval)
{
- if (a->values[i] > b->values[i])
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+#ifdef STRESS_SORT_INT_MIN
+ PG_RETURN_INT32(STRESS_SORT_INT_MIN_CMP(aval, bval));
+#else
+ PG_RETURN_INT32(pg_cmp_u32(aval, bval));
+#endif
}
}
PG_RETURN_INT32(0);
diff --git a/src/backend/storage/page/itemptr.c b/src/backend/storage/page/itemptr.c
index af21170307..d85592951e 100644
--- a/src/backend/storage/page/itemptr.c
+++ b/src/backend/storage/page/itemptr.c
@@ -14,6 +14,7 @@
*/
#include "postgres.h"
+#include "common/int.h"
#include "storage/itemptr.h"
@@ -57,18 +58,11 @@ ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
BlockNumber b1 = ItemPointerGetBlockNumberNoCheck(arg1);
BlockNumber b2 = ItemPointerGetBlockNumberNoCheck(arg2);
- if (b1 < b2)
- return -1;
- else if (b1 > b2)
- return 1;
- else if (ItemPointerGetOffsetNumberNoCheck(arg1) <
- ItemPointerGetOffsetNumberNoCheck(arg2))
- return -1;
- else if (ItemPointerGetOffsetNumberNoCheck(arg1) >
- ItemPointerGetOffsetNumberNoCheck(arg2))
- return 1;
- else
- return 0;
+ if (b1 != b2)
+ return pg_cmp_u32(b1, b2);
+
+ return pg_cmp_u16(ItemPointerGetOffsetNumberNoCheck(arg1),
+ ItemPointerGetOffsetNumberNoCheck(arg2));
}
/*
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index c960cfa823..ec1ff3e656 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -102,6 +102,7 @@
#include <limits.h>
#include "commands/tablespace.h"
+#include "common/int.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "storage/shmem.h"
@@ -3138,12 +3139,18 @@ free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
int
ssup_datum_unsigned_cmp(Datum x, Datum y, SortSupport ssup)
{
+#if SIZEOF_DATUM == 4
+ return pg_cmp_u32(x, y);
+#elif SIZEOF_DATUM == 8
+ return pg_cmp_u64(x, y);
+#else
if (x < y)
return -1;
else if (x > y)
return 1;
else
return 0;
+#endif
}
#if SIZEOF_DATUM >= 8
@@ -3153,12 +3160,7 @@ ssup_datum_signed_cmp(Datum x, Datum y, SortSupport ssup)
int64 xx = DatumGetInt64(x);
int64 yy = DatumGetInt64(y);
- if (xx < yy)
- return -1;
- else if (xx > yy)
- return 1;
- else
- return 0;
+ return pg_cmp_s64(xx, yy);
}
#endif
@@ -3168,10 +3170,5 @@ ssup_datum_int32_cmp(Datum x, Datum y, SortSupport ssup)
int32 xx = DatumGetInt32(x);
int32 yy = DatumGetInt32(y);
- if (xx < yy)
- return -1;
- else if (xx > yy)
- return 1;
- else
- return 0;
+ return pg_cmp_s32(xx, yy);
}
--
2.39.5 (Apple Git-154)
On Fri, 27 Sept 2024 at 08:17, Nathan Bossart <nathandbossart@gmail.com> wrote:
Here's a patch that adjusts several routines in nbtcompare.c and related
files to use the branchless integer comparison functions added in commit
6b80394. It's probably unlikely this produces a measurable benefit (at
least I've been unable to find any in my admittedly-limited testing), but
in theory it should save a cycle here and there. I was hoping that this
would trim many lines of code, but maintaining the STRESS_SORT_INT_MIN
stuff eats up most of what we save.
I had been looking at [1]https://godbolt.org/z/33T8h151M (which I've added your version to now). I
had been surprised to see gcc emitting different code for the first 3
versions. Clang does a better job at figuring out they all do the same
thing and emitting the same code for each.
I played around with the attached (hacked up) qsort.c to see if there
was any difference. Likely function call overhead kills the
performance anyway. There does not seem to be much difference between
them. I've not tested with an inlined comparison function.
Looking at your version, it doesn't look like there's any sort of
improvement in terms of the instructions. Certainly, for clang, it's
worse as it adds a shift left instruction and an additional compare.
No jumps, at least.
What's your reasoning for returning INT_MIN and INT_MAX?
David
Attachments:
On Fri, Sep 27, 2024 at 02:50:13PM +1200, David Rowley wrote:
I had been looking at [1] (which I've added your version to now). I
had been surprised to see gcc emitting different code for the first 3
versions. Clang does a better job at figuring out they all do the same
thing and emitting the same code for each.
Interesting.
I played around with the attached (hacked up) qsort.c to see if there
was any difference. Likely function call overhead kills the
performance anyway. There does not seem to be much difference between
them. I've not tested with an inlined comparison function.
I'd expect worse performance with the branchless routines for the inlined
case. However, I recall that clang was able to optimize med3() as well as
it can with the branching routines, so that may not always be true.
Looking at your version, it doesn't look like there's any sort of
improvement in terms of the instructions. Certainly, for clang, it's
worse as it adds a shift left instruction and an additional compare.
No jumps, at least.
I think I may have forgotten to add -O2 when I was inspecting this code
with godbolt.org earlier. *facepalm* The different versions look pretty
comparable with that added.
What's your reasoning for returning INT_MIN and INT_MAX?
That's just for the compile option added by commit c87cb5f, which IIUC is
intended to test that we correctly handle comparisons that return INT_MIN.
--
nathan