add AVX2 support to simd.h
On Wed, Nov 22, 2023 at 12:49:35PM -0600, Nathan Bossart wrote:
On Wed, Nov 22, 2023 at 02:54:13PM +0200, Ants Aasma wrote:
For reference, executing the page checksum 10M times on an AMD 3900X CPU:
clang-14 -O2 4.292s (17.8 GiB/s)
clang-14 -O2 -msse4.1 2.859s (26.7 GiB/s)
clang-14 -O2 -msse4.1 -mavx2 1.378s (55.4 GiB/s)

Nice. I've noticed similar improvements with AVX2 intrinsics in simd.h.
I've alluded to this a few times now, so I figured I'd park the patch and
preliminary benchmarks in a new thread while we iron out how to support
newer instructions (see discussion here [0]).
Using the same benchmark as we did for the SSE2 linear searches in
XidInMVCCSnapshot() (commit 37a6e5d) [1] [2], I see the following:
 writers  sse2  avx2    %
     256  1195  1188   -1
     512   928  1054  +14
    1024   633   716  +13
    2048   332   420  +27
    4096   162   203  +25
    8192   162   182  +12
It's been a while since I ran these benchmarks, but I vaguely recall also
seeing something like a 50% improvement for a dedicated pg_lfind32()
benchmark on long arrays.
As is, the patch likely won't do anything unless you add -mavx2 or
-march=native to your CFLAGS. I don't intend for this patch to be
seriously considered until we have better support for detecting/compiling
AVX2 instructions and a buildfarm machine that uses them.
I plan to start another thread for AVX2 support for the page checksums.
[0]: /messages/by-id/20231107024734.GB729644@nathanxps13
[1]: /messages/by-id/057a9a95-19d2-05f0-17e2-f46ff20e9b3e@2ndquadrant.com
[2]: /messages/by-id/20220713170950.GA3116318@nathanxps13
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
Attachments:
v1-0001-add-avx2-support-in-simd.h.patch (text/x-diff)
From 5a90f1597fdc64aa6df6b9d0ffd959af7df41abd Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Wed, 29 Nov 2023 10:01:32 -0600
Subject: [PATCH v1 1/1] add avx2 support in simd.h
---
src/include/port/simd.h | 50 ++++++++++++++++++++++++++++++++---------
1 file changed, 39 insertions(+), 11 deletions(-)
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 1fa6c3bc6c..0e698dcfab 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,15 @@
#ifndef SIMD_H
#define SIMD_H
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
/*
* SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
* that compilers targeting this architecture understand SSE2 intrinsics.
@@ -105,7 +113,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
static inline void
vector8_load(Vector8 *v, const uint8 *s)
{
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+ *v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
*v = _mm_loadu_si128((const __m128i *) s);
#elif defined(USE_NEON)
*v = vld1q_u8(s);
@@ -118,7 +128,9 @@ vector8_load(Vector8 *v, const uint8 *s)
static inline void
vector32_load(Vector32 *v, const uint32 *s)
{
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+ *v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
*v = _mm_loadu_si128((const __m128i *) s);
#elif defined(USE_NEON)
*v = vld1q_u32(s);
@@ -132,7 +144,9 @@ vector32_load(Vector32 *v, const uint32 *s)
static inline Vector8
vector8_broadcast(const uint8 c)
{
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+ return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
return _mm_set1_epi8(c);
#elif defined(USE_NEON)
return vdupq_n_u8(c);
@@ -145,7 +159,9 @@ vector8_broadcast(const uint8 c)
static inline Vector32
vector32_broadcast(const uint32 c)
{
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+ return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
return _mm_set1_epi32(c);
#elif defined(USE_NEON)
return vdupq_n_u32(c);
@@ -268,7 +284,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
static inline bool
vector8_is_highbit_set(const Vector8 v)
{
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+ return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
return _mm_movemask_epi8(v) != 0;
#elif defined(USE_NEON)
return vmaxvq_u8(v) > 0x7F;
@@ -305,7 +323,9 @@ vector32_is_highbit_set(const Vector32 v)
static inline Vector8
vector8_or(const Vector8 v1, const Vector8 v2)
{
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+ return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
return _mm_or_si128(v1, v2);
#elif defined(USE_NEON)
return vorrq_u8(v1, v2);
@@ -318,7 +338,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
static inline Vector32
vector32_or(const Vector32 v1, const Vector32 v2)
{
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+ return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
return _mm_or_si128(v1, v2);
#elif defined(USE_NEON)
return vorrq_u32(v1, v2);
@@ -336,7 +358,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
static inline Vector8
vector8_ssub(const Vector8 v1, const Vector8 v2)
{
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+ return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
return _mm_subs_epu8(v1, v2);
#elif defined(USE_NEON)
return vqsubq_u8(v1, v2);
@@ -352,7 +376,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
static inline Vector8
vector8_eq(const Vector8 v1, const Vector8 v2)
{
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+ return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
return _mm_cmpeq_epi8(v1, v2);
#elif defined(USE_NEON)
return vceqq_u8(v1, v2);
@@ -364,7 +390,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
static inline Vector32
vector32_eq(const Vector32 v1, const Vector32 v2)
{
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+ return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
return _mm_cmpeq_epi32(v1, v2);
#elif defined(USE_NEON)
return vceqq_u32(v1, v2);
--
2.25.1
On Thu, Nov 30, 2023 at 12:15 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:
I don't intend for this patch to be
seriously considered until we have better support for detecting/compiling
AVX2 instructions and a buildfarm machine that uses them.
That's completely understandable, yet I'm confused why there is a
commitfest entry for it marked "needs review".
On Mon, Jan 01, 2024 at 07:12:26PM +0700, John Naylor wrote:
On Thu, Nov 30, 2023 at 12:15 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:
I don't intend for this patch to be
seriously considered until we have better support for detecting/compiling
AVX2 instructions and a buildfarm machine that uses them.

That's completely understandable, yet I'm confused why there is a
commitfest entry for it marked "needs review".
Perhaps I was too optimistic about adding support for newer instructions...
I'm tempted to propose that we move forward with this patch as-is after
adding a buildfarm machine that compiles with -mavx2 or -march=x86-64-v3.
There is likely still follow-up work to make these improvements more
accessible, but I'm not sure that is a strict prerequisite here.
(In case it isn't clear, I'm volunteering to set up such a buildfarm
machine.)
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
Nathan Bossart <nathandbossart@gmail.com> writes:
I'm tempted to propose that we move forward with this patch as-is after
adding a buildfarm machine that compiles with -mavx2 or -march=x86-64-v3.
There is likely still follow-up work to make these improvements more
accessible, but I'm not sure that is a strict prerequisite here.
The patch needs better comments (as in, more than "none whatsoever").
It doesn't need to be much though, perhaps like
+#if defined(__AVX2__)
+
+/*
+ * When compiled with -mavx2 or allied options, we prefer AVX2 instructions.
+ */
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
Also, do you really want to structure the header so that USE_SSE2
doesn't get defined? In that case you are committing to provide
an AVX2 replacement every single place that there's USE_SSE2, which
doesn't seem like a great thing to require. OTOH, maybe there's
no choice given that we need a different definition for Vector8 and
Vector32?
regards, tom lane
On Tue, Jan 02, 2024 at 12:50:04PM -0500, Tom Lane wrote:
The patch needs better comments (as in, more than "none whatsoever").
Yes, will do.
Also, do you really want to structure the header so that USE_SSE2
doesn't get defined? In that case you are committing to provide
an AVX2 replacement every single place that there's USE_SSE2, which
doesn't seem like a great thing to require. OTOH, maybe there's
no choice given that we need a different definition for Vector8 and
Vector32?
Yeah, the precedent is to use these abstracted types elsewhere so that any
SIMD-related improvements aren't limited to one architecture. There are a
couple of places that do explicitly check for USE_NO_SIMD, though. Maybe
there's an eventual use-case for using SSE2 intrinsics even when you have
AVX2 support, but for now, ensuring we have an AVX2 replacement for
everything doesn't seem particularly burdensome.
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
On Tue, Jan 2, 2024 at 11:11 PM Nathan Bossart <nathandbossart@gmail.com> wrote:
Perhaps I was too optimistic about adding support for newer instructions...
I'm tempted to propose that we move forward with this patch as-is after
adding a buildfarm machine that compiles with -mavx2 or -march=x86-64-v3.
That means that we would be on the hook to fix it if it breaks, even
though nothing uses it yet in a normal build. I have pending patches
that will break, or get broken by, this, so minus-many from me until
there is an availability story.
On Wed, Jan 03, 2024 at 09:13:52PM +0700, John Naylor wrote:
On Tue, Jan 2, 2024 at 11:11 PM Nathan Bossart <nathandbossart@gmail.com> wrote:
I'm tempted to propose that we move forward with this patch as-is after
adding a buildfarm machine that compiles with -mavx2 or -march=x86-64-v3.

That means that we would be on the hook to fix it if it breaks, even
though nothing uses it yet in a normal build. I have pending patches
that will break, or get broken by, this, so minus-many from me until
there is an availability story.
How will this break your patches? Is it just a matter of adding more AVX2
support, or something else?
If the requirement is that normal builds use AVX2, then I fear we will be
waiting a long time. IIUC the current proposals (building multiple
binaries or adding a configuration option that maps to compiler flags)
would still be opt-in, and I'm not sure we can mandate AVX2 support for all
x86_64 builds anytime soon.
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
On Tue, Jan 02, 2024 at 10:11:23AM -0600, Nathan Bossart wrote:
(In case it isn't clear, I'm volunteering to set up such a buildfarm
machine.)
I set up "akepa" to run with -march=x86-64-v3.
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
On Wed, Jan 3, 2024 at 10:29 PM Nathan Bossart <nathandbossart@gmail.com> wrote:
If the requirement is that normal builds use AVX2, then I fear we will be
waiting a long time. IIUC the current proposals (building multiple
binaries or adding a configuration option that maps to compiler flags)
would still be opt-in,
If and when we get one of those, I would consider that a "normal"
build. Since there are no concrete proposals yet, I'm still waiting
for you to justify imposing an immediate maintenance cost for zero
benefit.
On Fri, Jan 05, 2024 at 09:03:39AM +0700, John Naylor wrote:
On Wed, Jan 3, 2024 at 10:29 PM Nathan Bossart <nathandbossart@gmail.com> wrote:
If the requirement is that normal builds use AVX2, then I fear we will be
waiting a long time. IIUC the current proposals (building multiple
binaries or adding a configuration option that maps to compiler flags)
would still be opt-in,

If and when we get one of those, I would consider that a "normal"
build. Since there are no concrete proposals yet, I'm still waiting
for you to justify imposing an immediate maintenance cost for zero
benefit.
I've been thinking about the configuration option approach. ISTM that
would be the most feasible strategy, at least for v17. A couple things
come to mind:
* This option would simply map to existing compiler flags. We already have
ways to provide those (-Dc_args in meson, CFLAGS in autoconf). Perhaps
we'd want to provide our own shorthand for certain platforms (e.g., ARM),
but that will still just be shorthand for compiler flags.
* Such an option would itself generate some maintenance cost. That could
be worth it because it formalizes the Postgres support for those options,
but it's still one more thing to track.
Another related option could be to simply document that we have support for
some newer instructions that can be enabled by setting the aforementioned
compiler flags. That's perhaps a little less user-friendly, but it'd avoid
the duplication and possibly reduce the maintenance cost. I also wonder if
it'd help prevent confusion when CFLAGS and this extra option conflict.
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
On Thu, Nov 30, 2023 at 12:15 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:
Using the same benchmark as we did for the SSE2 linear searches in
XidInMVCCSnapshot() (commit 37a6e5d) [1] [2], I see the following:
I've been antagonistic towards the patch itself, but it'd be more
productive if I paid some nuanced attention to the problem it's trying
to solve. First, I'd like to understand the benchmark a bit better.
 writers  sse2  avx2    %
     256  1195  1188   -1
     512   928  1054  +14
    1024   633   716  +13
    2048   332   420  +27
    4096   162   203  +25
    8192   162   182  +12
There doesn't seem to be any benefit at 256 at all. Is that expected
and/or fine?
It's been a while since I ran these benchmarks, but I vaguely recall also
seeing something like a 50% improvement for a dedicated pg_lfind32()
benchmark on long arrays.
The latest I see in
/messages/by-id/20220808223254.GA1393216@nathanxps13
 writers  head  patch
       8   672    680
      16   639    664
      32   701    689
      64   705    703
     128   628    653
     256   576    627
     512   530    584
     768   450    536
    1024   350    494
Here, the peak throughput seems to be around 64 writers with or
without the patch from a couple years ago, but the slope is shallower
after that. It would be good to make sure that it can't regress near
the peak, even with a "long tail" case (see next paragraph). The first
benchmark above starts at 256, so we can't tell where the peak is. It
might be worth it to also have a microbenchmark because the systemic
one has enough noise to obscure what's going on unless there are a
very large number of writers. We know what a systemic benchmark can
tell us on extreme workloads past the peak, and the microbenchmark
would tell us "we need to see X improvement here in order to see Y
improvement in the system benchmark".
I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.
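For readers who haven't looked at the current code, here is a simplified
sketch of the loop shape being described, written against the Vector32
helpers in simd.h. The real implementation lives in
src/include/port/pg_lfind.h and differs in detail, so treat this purely as
an illustration of where the one-by-one tail comes from:

/*
 * Illustrative only -- not the actual pg_lfind32().  The vectorized path
 * consumes four vector registers' worth of elements per iteration, so up
 * to (4 * lanes - 1) trailing elements fall through to the scalar loop:
 * 15 with 128-bit SSE2/NEON registers, 31 with 256-bit AVX2 registers.
 */
static inline bool
lfind32_sketch(uint32 key, const uint32 *base, uint32 nelem)
{
	uint32		i = 0;

#ifndef USE_NO_SIMD
	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
	const Vector32 keys = vector32_broadcast(key);

	for (; i + nelem_per_iteration <= nelem; i += nelem_per_iteration)
	{
		Vector32	vals1, vals2, vals3, vals4;
		Vector32	result;

		vector32_load(&vals1, &base[i]);
		vector32_load(&vals2, &base[i + nelem_per_vector]);
		vector32_load(&vals3, &base[i + 2 * nelem_per_vector]);
		vector32_load(&vals4, &base[i + 3 * nelem_per_vector]);

		result = vector32_or(vector32_or(vector32_eq(keys, vals1),
										 vector32_eq(keys, vals2)),
							 vector32_or(vector32_eq(keys, vals3),
										 vector32_eq(keys, vals4)));

		if (vector32_is_highbit_set(result))
			return true;
	}
#endif

	/* the "tail" under discussion: checked element by element */
	for (; i < nelem; i++)
	{
		if (key == base[i])
			return true;
	}
	return false;
}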
Also, by reading 4 registers per loop iteration, that's 128 bytes on
AVX2. I'm not sure that matters, but we shouldn't assume it doesn't.
Code I've seen elsewhere reads a fixed 64-byte block, and then uses 1,
2, or 4 registers to handle it, depending on architecture. Whether or
not that's worth it in this case, this patch does mean future patches
will have to wonder if they have to do anything differently depending
on vector length, whereas now they don't. That's not a deal-breaker,
but it is a trade-off to keep in mind.
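To make the fixed-block idea concrete, here is a hedged sketch against the
same Vector32 helpers. block_contains_u32 is a hypothetical name, a caller
would still have to handle arrays that aren't a multiple of 64 bytes, and
none of this is taken from an actual patch:

/*
 * Hypothetical: scan one 64-byte block for a 32-bit key.  The loop runs
 * four times with 16-byte SSE2/NEON registers, twice with 32-byte AVX2
 * registers, and would run once if a 64-byte vector type were added, so
 * callers never need to care about the vector width.
 */
static inline bool
block_contains_u32(uint32 key, const uint32 *block64)
{
	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
	const uint32 nelem_per_block = 64 / sizeof(uint32);
	const Vector32 keys = vector32_broadcast(key);
	uint32		offset;

	for (offset = 0; offset < nelem_per_block; offset += nelem_per_vector)
	{
		Vector32	vals;

		vector32_load(&vals, &block64[offset]);
		if (vector32_is_highbit_set(vector32_eq(keys, vals)))
			return true;
	}
	return false;
}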
On Sat, Jan 6, 2024 at 12:04 AM Nathan Bossart <nathandbossart@gmail.com> wrote:
I've been thinking about the configuration option approach. ISTM that
would be the most feasible strategy, at least for v17. A couple things
come to mind:

* This option would simply map to existing compiler flags. We already have
ways to provide those (-Dc_args in meson, CFLAGS in autoconf). Perhaps
we'd want to provide our own shorthand for certain platforms (e.g., ARM),
but that will still just be shorthand for compiler flags.

* Such an option would itself generate some maintenance cost. That could
be worth it because it formalizes the Postgres support for those options,
but it's still one more thing to track.

Another related option could be to simply document that we have support for
some newer instructions that can be enabled by setting the aforementioned
compiler flags. That's perhaps a little less user-friendly, but it'd avoid
the duplication and possibly reduce the maintenance cost. I also wonder if
it'd help prevent confusion when CFLAGS and this extra option conflict.
The last one might offer more graceful forward compatibility if the
multiple-binaries idea gets any traction some day, because at that
point the additional config options are not needed, I think.
Another consideration is which way would touch the fewest places to
work with Windows, which uses the spelling /arch:AVX2 etc.
One small thing I would hope for from the final version of this is
the ability to inline things where we currently indirect depending on
a run-time check. That seems like "just work" on top of everything
else, and I don't think it makes a case for either of the above.
On Mon, Jan 08, 2024 at 02:01:39PM +0700, John Naylor wrote:
On Thu, Nov 30, 2023 at 12:15 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:

 writers  sse2  avx2    %
     256  1195  1188   -1
     512   928  1054  +14
    1024   633   716  +13
    2048   332   420  +27
    4096   162   203  +25
    8192   162   182  +12

There doesn't seem to be any benefit at 256 at all. Is that expected
and/or fine?
My unverified assumption is that the linear searches make up much less of
the benchmark at these lower client counts, so any improvements we make to
them are unlikely to show up here.  IIRC even the hash table approach that
we originally explored for XidInMVCCSnapshot() didn't do much, if anything,
for the benchmark at lower client counts.
Here, the peak throughput seems to be around 64 writers with or
without the patch from a couple years ago, but the slope is shallower
after that. It would be good to make sure that it can't regress near
the peak, even with a "long tail" case (see next paragraph). The first
benchmark above starts at 256, so we can't tell where the peak is. It
might be worth it to also have a microbenchmark because the systemic
one has enough noise to obscure what's going on unless there are a
very large number of writers. We know what a systemic benchmark can
tell us on extreme workloads past the peak, and the microbenchmark
would tell us "we need to see X improvement here in order to see Y
improvement in the system benchmark".
Yes, will do.
I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.
Good idea. If it is indeed noticeable, we might be able to "fix" it by
processing some of the tail with shorter vectors. But that probably means
finding a way to support multiple vector sizes on the same build, which
would require some work.
Also, by reading 4 registers per loop iteration, that's 128 bytes on
AVX2. I'm not sure that matters, but we shouldn't assume it doesn't.
Code I've seen elsewhere reads a fixed 64-byte block, and then uses 1,
2, or 4 registers to handle it, depending on architecture. Whether or
not that's worth it in this case, this patch does mean future patches
will have to wonder if they have to do anything differently depending
on vector length, whereas now they don't. That's not a deal-breaker,
but it is a trade-off to keep in mind.
Yeah. Presently, this AVX2 patch just kicks the optimization down the road
a bit for the existing use-cases, so you don't start using the vector
registers until there's more data to work with, which might not even be
noticeable. But it's conceivable that vector length could matter at some
point, even if it doesn't matter much now.
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
On Tue, Jan 9, 2024 at 12:37 AM Nathan Bossart <nathandbossart@gmail.com> wrote:
I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.

Good idea. If it is indeed noticeable, we might be able to "fix" it by
processing some of the tail with shorter vectors. But that probably means
finding a way to support multiple vector sizes on the same build, which
would require some work.
What I had in mind was an overlapping pattern I've seen in various
places: do one iteration at the beginning, then subtract the
aligned-down length from the end and do all those iterations. And
one-by-one is only used if the total length is small.
On 29.11.23 18:15, Nathan Bossart wrote:
Using the same benchmark as we did for the SSE2 linear searches in
XidInMVCCSnapshot() (commit 37a6e5d) [1] [2], I see the following:

 writers  sse2  avx2    %
     256  1195  1188   -1
     512   928  1054  +14
    1024   633   716  +13
    2048   332   420  +27
    4096   162   203  +25
    8192   162   182  +12
AFAICT, your patch merely provides an alternative AVX2 implementation
for places where SSE2 is currently supported, but it doesn't provide any new
API calls or new functionality. One might naively expect that these are
just two different ways to call the underlying primitives in the CPU, so
these performance improvements are surprising to me. Or do the CPUs
actually have completely separate machinery for SSE2 and AVX2, and just
using the latter to do the same thing is faster?
On Tue, 9 Jan 2024 at 16:03, Peter Eisentraut <peter@eisentraut.org> wrote:
On 29.11.23 18:15, Nathan Bossart wrote:
Using the same benchmark as we did for the SSE2 linear searches in
XidInMVCCSnapshot() (commit 37a6e5d) [1] [2], I see the following:

 writers  sse2  avx2    %
     256  1195  1188   -1
     512   928  1054  +14
    1024   633   716  +13
    2048   332   420  +27
    4096   162   203  +25
    8192   162   182  +12

AFAICT, your patch merely provides an alternative AVX2 implementation
for places where SSE2 is currently supported, but it doesn't provide any new
API calls or new functionality. One might naively expect that these are
just two different ways to call the underlying primitives in the CPU, so
these performance improvements are surprising to me. Or do the CPUs
actually have completely separate machinery for SSE2 and AVX2, and just
using the latter to do the same thing is faster?
The AVX2 implementation uses a wider vector register. On most current
processors the throughput of the instructions in question is the same
on 256-bit vectors as on 128-bit vectors. Basically, the chip has a full
AVX2's worth of machinery, and using SSE2 leaves half of it unused. Notable
exceptions are efficiency cores on recent Intel desktop CPUs and AMD
CPUs before Zen 2, where AVX2 instructions are internally split up into
two 128-bit wide instructions.
For AVX-512 the picture is much more complicated. Some instructions run
at half rate, some at full rate, but not on all ALU ports, some
instructions cause aggressive clock rate reduction on some
microarchitectures. AVX-512 adds mask registers and masked vector
instructions that enable quite a bit simpler code in many cases.
Interestingly I have seen Clang make quite effective use of these
masked instructions even when using AVX2 intrinsics, but targeting an
AVX-512 capable platform.
The vector-width-independent approach used in the patch is nice for
simple cases, since it avoids needing a separate implementation for each
vector width. However, for more complicated cases where "horizontal"
operations are needed, it's going to be much less useful. But these
cases can easily just drop down to using intrinsics directly.
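To illustrate what such a drop to intrinsics might look like, here is a
hedged example of a "horizontal" operation the width-independent helpers
don't cover: summing all eight 32-bit lanes of an AVX2 register. The name
vector32_sum_lanes is made up and is not part of simd.h or any patch in
this thread; it assumes the patched simd.h is included with AVX2 enabled.

#ifdef USE_AVX2
/* Hypothetical horizontal reduction: add up all eight uint32 lanes. */
static inline uint32
vector32_sum_lanes(const Vector32 v)
{
	/* fold the upper 128-bit half onto the lower half */
	__m128i		lo = _mm256_castsi256_si128(v);
	__m128i		hi = _mm256_extracti128_si256(v, 1);
	__m128i		sum = _mm_add_epi32(lo, hi);

	/* reduce the remaining four lanes pairwise */
	sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));
	sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)));

	return (uint32) _mm_cvtsi128_si32(sum);
}
#endif							/* USE_AVX2 */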
On Tue, Jan 09, 2024 at 09:20:09AM +0700, John Naylor wrote:
On Tue, Jan 9, 2024 at 12:37 AM Nathan Bossart <nathandbossart@gmail.com> wrote:
I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.

Good idea. If it is indeed noticeable, we might be able to "fix" it by
processing some of the tail with shorter vectors. But that probably means
finding a way to support multiple vector sizes on the same build, which
would require some work.

What I had in mind was an overlapping pattern I've seen in various
places: do one iteration at the beginning, then subtract the
aligned-down length from the end and do all those iterations. And
one-by-one is only used if the total length is small.
Sorry, I'm not sure I understood this. Do you mean processing the first
several elements individually or with SSE2 until the number of remaining
elements can be processed with just the AVX2 instructions (a bit like how
pg_comp_crc32c_armv8() is structured for memory alignment)?
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
On Tue, 9 Jan 2024 at 18:20, Nathan Bossart <nathandbossart@gmail.com> wrote:
On Tue, Jan 09, 2024 at 09:20:09AM +0700, John Naylor wrote:
On Tue, Jan 9, 2024 at 12:37 AM Nathan Bossart <nathandbossart@gmail.com> wrote:
I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.

Good idea. If it is indeed noticeable, we might be able to "fix" it by
processing some of the tail with shorter vectors. But that probably means
finding a way to support multiple vector sizes on the same build, which
would require some work.

What I had in mind was an overlapping pattern I've seen in various
places: do one iteration at the beginning, then subtract the
aligned-down length from the end and do all those iterations. And
one-by-one is only used if the total length is small.

Sorry, I'm not sure I understood this. Do you mean processing the first
several elements individually or with SSE2 until the number of remaining
elements can be processed with just the AVX2 instructions (a bit like how
pg_comp_crc32c_armv8() is structured for memory alignment)?
For some operations (min, max, = any) processing the same elements
multiple times doesn't change the result. So the vectors for first
and/or last iterations can overlap with the main loop. In other cases
it's possible to mask out the invalid elements and replace them with
zeroes. Something along the lines of:
static inline Vector8
vector8_mask_right(int num_valid)
{
__m256i seq = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
23, 22, 21, 20, 19, 18, 17, 16,
15, 14, 13, 12, 11, 10, 9, 8,
7, 6, 5, 4, 3, 2, 1, 0);
return _mm256_cmpgt_epi8(_mm256_set1_epi8(num_valid), seq);
}
/* final incomplete iteration */
Vector8 mask = vector8_mask_right(end - cur);
final_vec = vector8_and(*(Vector8 *) (end - sizeof(Vector8)), mask);
accum = vector8_add(accum, final_vec);
It helps that on any halfway recent x86 unaligned loads only have a
minor performance penalty and only when straddling cache line
boundaries. Not sure what the state on ARM is. If we don't care about
unaligned loads then we only need to care about the load not crossing
page boundaries which could cause segfaults. Though I'm sure memory
sanitizer tools will have plenty to complain about around such hacks.
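As a hedged aside on that last point, not taken from any patch here: the
usual argument for why such an overlapping or over-reading load can't
segfault is that it stays within a page containing data we already own. A
minimal check, assuming 4 kB pages and the standard <stdint.h>/<stdbool.h>
headers, might look like:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper: does [p, p + load_size) stay within one 4 kB page? */
static inline bool
load_stays_within_page(const void *p, size_t load_size)
{
	const uintptr_t page_mask = ~(uintptr_t) 4095;	/* assumes 4 kB pages */
	uintptr_t	start = (uintptr_t) p;
	uintptr_t	last = start + load_size - 1;

	return (start & page_mask) == (last & page_mask);
}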
On Tue, Jan 9, 2024 at 11:20 PM Nathan Bossart <nathandbossart@gmail.com> wrote:
On Tue, Jan 09, 2024 at 09:20:09AM +0700, John Naylor wrote:
On Tue, Jan 9, 2024 at 12:37 AM Nathan Bossart <nathandbossart@gmail.com> wrote:
I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.

Good idea. If it is indeed noticeable, we might be able to "fix" it by
processing some of the tail with shorter vectors. But that probably means
finding a way to support multiple vector sizes on the same build, which
would require some work.

What I had in mind was an overlapping pattern I've seen in various
places: do one iteration at the beginning, then subtract the
aligned-down length from the end and do all those iterations. And
one-by-one is only used if the total length is small.

Sorry, I'm not sure I understood this. Do you mean processing the first
several elements individually or with SSE2 until the number of remaining
elements can be processed with just the AVX2 instructions (a bit like how
pg_comp_crc32c_armv8() is structured for memory alignment)?
If we have say 25 elements, I mean (for SSE2) check the first 16, then
the last 16. Some will be checked twice, but that's okay.
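A hedged sketch of that overlapping shape, again using the Vector32
helpers: lfind32_overlap is an illustrative name, and for brevity it uses
one vector per step rather than the four-register blocks in the real code.
For 25 uint32 elements with SSE2, the main loop covers elements 0..23 and
the final overlapping load covers 21..24, so a few elements are tested
twice, which is harmless for an equality search.

static inline bool
lfind32_overlap(uint32 key, const uint32 *base, uint32 nelem)
{
	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
	const Vector32 keys = vector32_broadcast(key);
	Vector32	vals;
	uint32		i;

	/* arrays smaller than one vector are still checked one by one */
	if (nelem < nelem_per_vector)
	{
		for (i = 0; i < nelem; i++)
		{
			if (key == base[i])
				return true;
		}
		return false;
	}

	/* full vectors from the front */
	for (i = 0; i + nelem_per_vector <= nelem; i += nelem_per_vector)
	{
		vector32_load(&vals, &base[i]);
		if (vector32_is_highbit_set(vector32_eq(keys, vals)))
			return true;
	}

	/* one overlapping vector that covers whatever is left at the end */
	vector32_load(&vals, &base[nelem - nelem_per_vector]);
	return vector32_is_highbit_set(vector32_eq(keys, vals));
}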
On Wed, Jan 10, 2024 at 09:06:08AM +0700, John Naylor wrote:
If we have say 25 elements, I mean (for SSE2) check the first 16, then
the last 16. Some will be checked twice, but that's okay.
I finally got around to trying this. 0001 adds this overlapping logic.
0002 is a rebased version of the AVX2 patch (it needed some updates after
commit 9f225e9). And 0003 is a benchmark for test_lfind32(). It runs
pg_lfind32() on an array of the given size 100M times.
I've also attached the results of running this benchmark on my machine at
HEAD, after applying 0001, and after applying both 0001 and 0002. 0001
appears to work pretty well. When there is a small "tail," it regresses a
small amount, but overall, it seems to improve more cases than it harms.
0002 does regress searches on smaller arrays quite a bit, since it
postpones the SIMD optimizations until the arrays are longer. It might be
possible to mitigate by using 2 registers when the "tail" is long enough,
but I have yet to try that.
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
Attachments:
v2-0003-test_lfind32-benchmark.patch (text/x-diff)
From 9b2b61927a8b52637f70659d513ddfeba7c03024 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Fri, 15 Mar 2024 12:28:00 -0500
Subject: [PATCH v2 3/3] test_lfind32() benchmark
---
.../modules/test_lfind/sql/test_lfind.sql | 67 +++++++++++++++++++
.../modules/test_lfind/test_lfind--1.0.sql | 4 ++
src/test/modules/test_lfind/test_lfind.c | 16 +++++
3 files changed, 87 insertions(+)
diff --git a/src/test/modules/test_lfind/sql/test_lfind.sql b/src/test/modules/test_lfind/sql/test_lfind.sql
index 766c640831..d8fa461bfa 100644
--- a/src/test/modules/test_lfind/sql/test_lfind.sql
+++ b/src/test/modules/test_lfind/sql/test_lfind.sql
@@ -8,3 +8,70 @@ CREATE EXTENSION test_lfind;
SELECT test_lfind8();
SELECT test_lfind8_le();
SELECT test_lfind32();
+
+\timing on
+SELECT drive_lfind32(0);
+SELECT drive_lfind32(1);
+SELECT drive_lfind32(2);
+SELECT drive_lfind32(3);
+SELECT drive_lfind32(4);
+SELECT drive_lfind32(5);
+SELECT drive_lfind32(6);
+SELECT drive_lfind32(7);
+SELECT drive_lfind32(8);
+SELECT drive_lfind32(9);
+SELECT drive_lfind32(10);
+SELECT drive_lfind32(11);
+SELECT drive_lfind32(12);
+SELECT drive_lfind32(13);
+SELECT drive_lfind32(14);
+SELECT drive_lfind32(15);
+SELECT drive_lfind32(16);
+SELECT drive_lfind32(17);
+SELECT drive_lfind32(18);
+SELECT drive_lfind32(19);
+SELECT drive_lfind32(20);
+SELECT drive_lfind32(21);
+SELECT drive_lfind32(22);
+SELECT drive_lfind32(23);
+SELECT drive_lfind32(24);
+SELECT drive_lfind32(25);
+SELECT drive_lfind32(26);
+SELECT drive_lfind32(27);
+SELECT drive_lfind32(28);
+SELECT drive_lfind32(29);
+SELECT drive_lfind32(30);
+SELECT drive_lfind32(31);
+SELECT drive_lfind32(32);
+SELECT drive_lfind32(33);
+SELECT drive_lfind32(34);
+SELECT drive_lfind32(35);
+SELECT drive_lfind32(36);
+SELECT drive_lfind32(37);
+SELECT drive_lfind32(38);
+SELECT drive_lfind32(39);
+SELECT drive_lfind32(40);
+SELECT drive_lfind32(41);
+SELECT drive_lfind32(42);
+SELECT drive_lfind32(43);
+SELECT drive_lfind32(44);
+SELECT drive_lfind32(45);
+SELECT drive_lfind32(46);
+SELECT drive_lfind32(47);
+SELECT drive_lfind32(48);
+SELECT drive_lfind32(49);
+SELECT drive_lfind32(50);
+SELECT drive_lfind32(51);
+SELECT drive_lfind32(52);
+SELECT drive_lfind32(53);
+SELECT drive_lfind32(54);
+SELECT drive_lfind32(55);
+SELECT drive_lfind32(56);
+SELECT drive_lfind32(57);
+SELECT drive_lfind32(58);
+SELECT drive_lfind32(59);
+SELECT drive_lfind32(60);
+SELECT drive_lfind32(61);
+SELECT drive_lfind32(62);
+SELECT drive_lfind32(63);
+SELECT drive_lfind32(64);
diff --git a/src/test/modules/test_lfind/test_lfind--1.0.sql b/src/test/modules/test_lfind/test_lfind--1.0.sql
index 81801926ae..6b396dbd58 100644
--- a/src/test/modules/test_lfind/test_lfind--1.0.sql
+++ b/src/test/modules/test_lfind/test_lfind--1.0.sql
@@ -14,3 +14,7 @@ CREATE FUNCTION test_lfind8()
CREATE FUNCTION test_lfind8_le()
RETURNS pg_catalog.void
AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION drive_lfind32(n int)
+ RETURNS pg_catalog.void
+ AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_lfind/test_lfind.c b/src/test/modules/test_lfind/test_lfind.c
index c04bc2f6b4..2234f148b6 100644
--- a/src/test/modules/test_lfind/test_lfind.c
+++ b/src/test/modules/test_lfind/test_lfind.c
@@ -146,3 +146,19 @@ test_lfind32(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+
+PG_FUNCTION_INFO_V1(drive_lfind32);
+Datum
+drive_lfind32(PG_FUNCTION_ARGS)
+{
+ int array_size = PG_GETARG_INT32(0);
+ uint32 *test_array = palloc0(array_size * sizeof(uint32));
+
+ for (int i = 0; i < 100000000; i++)
+ {
+ if (pg_lfind32(1, test_array, array_size))
+ elog(ERROR, "pg_lfind32() found nonexistent element");
+ }
+
+ PG_RETURN_VOID();
+}
--
2.25.1
avx2_bench_graph.jpg (image/jpeg)