add AVX2 support to simd.h

Started by Nathan Bossart about 2 years ago, 48 messages
#1Nathan Bossart
nathandbossart@gmail.com
1 attachment(s)

On Wed, Nov 22, 2023 at 12:49:35PM -0600, Nathan Bossart wrote:

On Wed, Nov 22, 2023 at 02:54:13PM +0200, Ants Aasma wrote:

For reference, executing the page checksum 10M times on an AMD 3900X CPU:

clang-14 -O2 4.292s (17.8 GiB/s)
clang-14 -O2 -msse4.1 2.859s (26.7 GiB/s)
clang-14 -O2 -msse4.1 -mavx2 1.378s (55.4 GiB/s)

Nice. I've noticed similar improvements with AVX2 intrinsics in simd.h.

I've alluded to this a few times now, so I figured I'd park the patch and
preliminary benchmarks in a new thread while we iron out how to support
newer instructions (see discussion here [0]).

Using the same benchmark as we did for the SSE2 linear searches in
XidInMVCCSnapshot() (commit 37a6e5d) [1] [2], I see the following:

writers sse2 avx2 %
256 1195 1188 -1
512 928 1054 +14
1024 633 716 +13
2048 332 420 +27
4096 162 203 +25
8192 162 182 +12

It's been a while since I ran these benchmarks, but I vaguely recall also
seeing something like a 50% improvement for a dedicated pg_lfind32()
benchmark on long arrays.

As is, the patch likely won't do anything unless you add -mavx2 or
-march=native to your CFLAGS. I don't intend for this patch to be
seriously considered until we have better support for detecting/compiling
AVX2 instructions and a buildfarm machine that uses them.

I plan to start another thread for AVX2 support for the page checksums.

[0]: /messages/by-id/20231107024734.GB729644@nathanxps13
[1]: /messages/by-id/057a9a95-19d2-05f0-17e2-f46ff20e9b3e@2ndquadrant.com
[2]: /messages/by-id/20220713170950.GA3116318@nathanxps13

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v1-0001-add-avx2-support-in-simd.h.patch (text/x-diff; charset=us-ascii)
From 5a90f1597fdc64aa6df6b9d0ffd959af7df41abd Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Wed, 29 Nov 2023 10:01:32 -0600
Subject: [PATCH v1 1/1] add avx2 support in simd.h

---
 src/include/port/simd.h | 50 ++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 1fa6c3bc6c..0e698dcfab 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,15 @@
 #ifndef SIMD_H
 #define SIMD_H
 
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
 /*
  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
  * that compilers targeting this architecture understand SSE2 intrinsics.
@@ -105,7 +113,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 static inline void
 vector8_load(Vector8 *v, const uint8 *s)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u8(s);
@@ -118,7 +128,9 @@ vector8_load(Vector8 *v, const uint8 *s)
 static inline void
 vector32_load(Vector32 *v, const uint32 *s)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u32(s);
@@ -132,7 +144,9 @@ vector32_load(Vector32 *v, const uint32 *s)
 static inline Vector8
 vector8_broadcast(const uint8 c)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi8(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u8(c);
@@ -145,7 +159,9 @@ vector8_broadcast(const uint8 c)
 static inline Vector32
 vector32_broadcast(const uint32 c)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi32(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u32(c);
@@ -268,7 +284,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
 static inline bool
 vector8_is_highbit_set(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
 	return _mm_movemask_epi8(v) != 0;
 #elif defined(USE_NEON)
 	return vmaxvq_u8(v) > 0x7F;
@@ -305,7 +323,9 @@ vector32_is_highbit_set(const Vector32 v)
 static inline Vector8
 vector8_or(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u8(v1, v2);
@@ -318,7 +338,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_or(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u32(v1, v2);
@@ -336,7 +358,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_ssub(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_subs_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vqsubq_u8(v1, v2);
@@ -352,7 +376,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi8(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u8(v1, v2);
@@ -364,7 +390,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_eq(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi32(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u32(v1, v2);
-- 
2.25.1

#2John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#1)
Re: add AVX2 support to simd.h

On Thu, Nov 30, 2023 at 12:15 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:

I don't intend for this patch to be
seriously considered until we have better support for detecting/compiling
AVX2 instructions and a buildfarm machine that uses them.

That's completely understandable, yet I'm confused why there is a
commitfest entry for it marked "needs review".

#3Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#2)
Re: add AVX2 support to simd.h

On Mon, Jan 01, 2024 at 07:12:26PM +0700, John Naylor wrote:

On Thu, Nov 30, 2023 at 12:15 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:

I don't intend for this patch to be
seriously considered until we have better support for detecting/compiling
AVX2 instructions and a buildfarm machine that uses them.

That's completely understandable, yet I'm confused why there is a
commitfest entry for it marked "needs review".

Perhaps I was too optimistic about adding support for newer instructions...

I'm tempted to propose that we move forward with this patch as-is after
adding a buildfarm machine that compiles with -mavx2 or -march=x86-64-v3.
There is likely still follow-up work to make these improvements more
accessible, but I'm not sure that is a strict prerequisite here.

(In case it isn't clear, I'm volunteering to set up such a buildfarm
machine.)

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#4Tom Lane
tgl@sss.pgh.pa.us
In reply to: Nathan Bossart (#3)
Re: add AVX2 support to simd.h

Nathan Bossart <nathandbossart@gmail.com> writes:

I'm tempted to propose that we move forward with this patch as-is after
adding a buildfarm machine that compiles with -mavx2 or -march=x86-64-v3.
There is likely still follow-up work to make these improvements more
accessible, but I'm not sure that is a strict prerequisite here.

The patch needs better comments (as in, more than "none whatsoever").
It doesn't need to be much though, perhaps like

+#if defined(__AVX2__)
+
+/*
+ * When compiled with -mavx2 or allied options, we prefer AVX2 instructions.
+ */
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;

Also, do you really want to structure the header so that USE_SSE2
doesn't get defined? In that case you are committing to provide
an AVX2 replacement every single place that there's USE_SSE2, which
doesn't seem like a great thing to require. OTOH, maybe there's
no choice given that we need a different definition for Vector8 and
Vector32?

regards, tom lane

#5Nathan Bossart
nathandbossart@gmail.com
In reply to: Tom Lane (#4)
Re: add AVX2 support to simd.h

On Tue, Jan 02, 2024 at 12:50:04PM -0500, Tom Lane wrote:

The patch needs better comments (as in, more than "none whatsoever").

Yes, will do.

Also, do you really want to structure the header so that USE_SSE2
doesn't get defined? In that case you are committing to provide
an AVX2 replacement every single place that there's USE_SSE2, which
doesn't seem like a great thing to require. OTOH, maybe there's
no choice given that we need a different definition for Vector8 and
Vector32?

Yeah, the precedent is to use these abstracted types elsewhere so that any
SIMD-related improvements aren't limited to one architecture. There are a
couple of places that do explicitly check for USE_NO_SIMD, though. Maybe
there's an eventual use-case for using SSE2 intrinsics even when you have
AVX2 support, but for now, ensuring we have an AVX2 replacement for
everything doesn't seem particularly burdensome.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#6John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#3)
Re: add AVX2 support to simd.h

On Tue, Jan 2, 2024 at 11:11 PM Nathan Bossart <nathandbossart@gmail.com> wrote:

Perhaps I was too optimistic about adding support for newer instructions...

I'm tempted to propose that we move forward with this patch as-is after
adding a buildfarm machine that compiles with -mavx2 or -march=x86-64-v3.

That means that we would be on the hook to fix it if it breaks, even
though nothing uses it yet in a normal build. I have pending patches
that will break, or get broken by, this, so minus-many from me until
there is an availability story.

#7Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#6)
Re: add AVX2 support to simd.h

On Wed, Jan 03, 2024 at 09:13:52PM +0700, John Naylor wrote:

On Tue, Jan 2, 2024 at 11:11 PM Nathan Bossart <nathandbossart@gmail.com> wrote:

I'm tempted to propose that we move forward with this patch as-is after
adding a buildfarm machine that compiles with -mavx2 or -march=x86-64-v3.

That means that we would be on the hook to fix it if it breaks, even
though nothing uses it yet in a normal build. I have pending patches
that will break, or get broken by, this, so minus-many from me until
there is an availability story.

How will this break your patches? Is it just a matter of adding more AVX2
support, or something else?

If the requirement is that normal builds use AVX2, then I fear we will be
waiting a long time. IIUC the current proposals (building multiple
binaries or adding a configuration option that maps to compiler flags)
would still be opt-in, and I'm not sure we can mandate AVX2 support for all
x86_64 builds anytime soon.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#8Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#3)
Re: add AVX2 support to simd.h

On Tue, Jan 02, 2024 at 10:11:23AM -0600, Nathan Bossart wrote:

(In case it isn't clear, I'm volunteering to set up such a buildfarm
machine.)

I set up "akepa" to run with -march=x86-64-v3.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#9John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#7)
Re: add AVX2 support to simd.h

On Wed, Jan 3, 2024 at 10:29 PM Nathan Bossart <nathandbossart@gmail.com> wrote:

If the requirement is that normal builds use AVX2, then I fear we will be
waiting a long time. IIUC the current proposals (building multiple
binaries or adding a configuration option that maps to compiler flags)
would still be opt-in,

If and when we get one of those, I would consider that a "normal"
build. Since there are no concrete proposals yet, I'm still waiting
for you to justify imposing an immediate maintenance cost for zero
benefit.

#10Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#9)
Re: add AVX2 support to simd.h

On Fri, Jan 05, 2024 at 09:03:39AM +0700, John Naylor wrote:

On Wed, Jan 3, 2024 at 10:29 PM Nathan Bossart <nathandbossart@gmail.com> wrote:

If the requirement is that normal builds use AVX2, then I fear we will be
waiting a long time. IIUC the current proposals (building multiple
binaries or adding a configuration option that maps to compiler flags)
would still be opt-in,

If and when we get one of those, I would consider that a "normal"
build. Since there are no concrete proposals yet, I'm still waiting
for you to justify imposing an immediate maintenance cost for zero
benefit.

I've been thinking about the configuration option approach. ISTM that
would be the most feasible strategy, at least for v17. A couple things
come to mind:

* This option would simply map to existing compiler flags. We already have
ways to provide those (-Dc_args in meson, CFLAGS in autoconf). Perhaps
we'd want to provide our own shorthand for certain platforms (e.g., ARM),
but that will still just be shorthand for compiler flags.

* Such an option would itself generate some maintenance cost. That could
be worth it because it formalizes the Postgres support for those options,
but it's still one more thing to track.

Another related option could be to simply document that we have support for
some newer instructions that can be enabled by setting the aforementioned
compiler flags. That's perhaps a little less user-friendly, but it'd avoid
the duplication and possibly reduce the maintenance cost. I also wonder if
it'd help prevent confusion when CFLAGS and this extra option conflict.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#11John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#1)
Re: add AVX2 support to simd.h

On Thu, Nov 30, 2023 at 12:15 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:

Using the same benchmark as we did for the SSE2 linear searches in
XidInMVCCSnapshot() (commit 37a6e5d) [1] [2], I see the following:

I've been antagonistic towards the patch itself, but it'd be more
productive if I paid some nuanced attention to the problem it's trying
to solve. First, I'd like to understand the benchmark a bit better.

writers sse2 avx2 %
256 1195 1188 -1
512 928 1054 +14
1024 633 716 +13
2048 332 420 +27
4096 162 203 +25
8192 162 182 +12

There doesn't seem to be any benefit at 256 at all. Is that expected
and/or fine?

It's been a while since I ran these benchmarks, but I vaguely recall also
seeing something like a 50% improvement for a dedicated pg_lfind32()
benchmark on long arrays.

The latest I see in
/messages/by-id/20220808223254.GA1393216@nathanxps13

writers head patch
8 672 680
16 639 664
32 701 689
64 705 703
128 628 653
256 576 627
512 530 584
768 450 536
1024 350 494

Here, the peak throughput seems to be around 64 writers with or
without the patch from a couple years ago, but the slope is shallower
after that. It would be good to make sure that it can't regress near
the peak, even with a "long tail" case (see next paragraph). The first
benchmark above starts at 256, so we can't tell where the peak is. It
might be worth it to also have a microbenchmark because the systemic
one has enough noise to obscure what's going on unless there are a
very large number of writers. We know what a systemic benchmark can
tell us on extreme workloads past the peak, and the microbenchmark
would tell us "we need to see X improvement here in order to see Y
improvement in the system benchmark".

I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.

Also, by reading 4 registers per loop iteration, that's 128 bytes on
AVX2. I'm not sure that matters, but we shouldn't assume it doesn't.
Code I've seen elsewhere reads a fixed 64-byte block, and then uses 1,
2, or 4 registers to handle it, depending on architecture. Whether or
not that's worth it in this case, this patch does mean future patches
will have to wonder if they have to do anything differently depending
on vector length, whereas now they don't. That's not a deal-breaker,
but it is a trade-off to keep in mind.

#12John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#10)
Re: add AVX2 support to simd.h

On Sat, Jan 6, 2024 at 12:04 AM Nathan Bossart <nathandbossart@gmail.com> wrote:

I've been thinking about the configuration option approach. ISTM that
would be the most feasible strategy, at least for v17. A couple things
come to mind:

* This option would simply map to existing compiler flags. We already have
ways to provide those (-Dc_args in meson, CFLAGS in autoconf). Perhaps
we'd want to provide our own shorthand for certain platforms (e.g., ARM),
but that will still just be shorthand for compiler flags.

* Such an option would itself generate some maintenance cost. That could
be worth it because it formalizes the Postgres support for those options,
but it's still one more thing to track.

Another related option could be to simply document that we have support for
some newer instructions that can be enabled by setting the aforementioned
compiler flags. That's perhaps a little less user-friendly, but it'd avoid
the duplication and possibly reduce the maintenance cost. I also wonder if
it'd help prevent confusion when CFLAGS and this extra option conflict.

The last one might offer more graceful forward compatibility if the
multiple-binaries idea gets any traction some day, because at that
point the additional config options are not needed, I think.

Another consideration is which way would touch the fewest places to
work with Windows, which uses the spelling /arch:AVX2 etc.

One small thing I would hope for from the final version of this is
the ability to inline things where we currently indirect depending on
a run-time check. That seems like "just work" on top of everything
else, and I don't think it makes a case for either of the above.

#13Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#11)
Re: add AVX2 support to simd.h

On Mon, Jan 08, 2024 at 02:01:39PM +0700, John Naylor wrote:

On Thu, Nov 30, 2023 at 12:15 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:

writers sse2 avx2 %
256 1195 1188 -1
512 928 1054 +14
1024 633 716 +13
2048 332 420 +27
4096 162 203 +25
8192 162 182 +12

There doesn't seem to be any benefit at 256 at all. Is that expected
and/or fine?

My unverified assumption is that the linear searches make up much less of
the benchmark at these lower client counts, so any improvements we make
here are unlikely to show up here. IIRC even the hash table approach that
we originally explored for XidInMVCCSnapshot() didn't do much, if anything,
for the benchmark at lower client counts.

Here, the peak throughput seems to be around 64 writers with or
without the patch from a couple years ago, but the slope is shallower
after that. It would be good to make sure that it can't regress near
the peak, even with a "long tail" case (see next paragraph). The first
benchmark above starts at 256, so we can't tell where the peak is. It
might be worth it to also have a microbenchmark because the systemic
one has enough noise to obscure what's going on unless there are a
very large number of writers. We know what a systemic benchmark can
tell us on extreme workloads past the peak, and the microbenchmark
would tell us "we need to see X improvement here in order to see Y
improvement in the system benchmark".

Yes, will do.

I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.

Good idea. If it is indeed noticeable, we might be able to "fix" it by
processing some of the tail with shorter vectors. But that probably means
finding a way to support multiple vector sizes on the same build, which
would require some work.

Also, by reading 4 registers per loop iteration, that's 128 bytes on
AVX2. I'm not sure that matters, but we shouldn't assume it doesn't.
Code I've seen elsewhere reads a fixed 64-byte block, and then uses 1,
2, or 4 registers to handle it, depending on architecture. Whether or
not that's worth it in this case, this patch does mean future patches
will have to wonder if they have to do anything differently depending
on vector length, whereas now they don't. That's not a deal-breaker,
but it is a trade-off to keep in mind.

Yeah. Presently, this AVX2 patch just kicks the optimization down the road
a bit for the existing use-cases, so you don't start using the vector
registers until there's more data to work with, which might not even be
noticeable. But it's conceivable that vector length could matter at some
point, even if it doesn't matter much now.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#14John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#13)
Re: add AVX2 support to simd.h

On Tue, Jan 9, 2024 at 12:37 AM Nathan Bossart <nathandbossart@gmail.com> wrote:

I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.

Good idea. If it is indeed noticeable, we might be able to "fix" it by
processing some of the tail with shorter vectors. But that probably means
finding a way to support multiple vector sizes on the same build, which
would require some work.

What I had in mind was an overlapping pattern I've seen in various
places: do one iteration at the beginning, then subtract the
aligned-down length from the end and do all those iterations. And
one-by-one is only used if the total length is small.

#15Peter Eisentraut
peter@eisentraut.org
In reply to: Nathan Bossart (#1)
Re: add AVX2 support to simd.h

On 29.11.23 18:15, Nathan Bossart wrote:

Using the same benchmark as we did for the SSE2 linear searches in
XidInMVCCSnapshot() (commit 37a6e5d) [1] [2], I see the following:

writers sse2 avx2 %
256 1195 1188 -1
512 928 1054 +14
1024 633 716 +13
2048 332 420 +27
4096 162 203 +25
8192 162 182 +12

AFAICT, your patch merely provides an alternative AVX2 implementation
for where currently SSE2 is supported, but it doesn't provide any new
API calls or new functionality. One might naively expect that these are
just two different ways to call the underlying primitives in the CPU, so
these performance improvements are surprising to me. Or do the CPUs
actually have completely separate machinery for SSE2 and AVX2, and just
using the latter to do the same thing is faster?

#16Ants Aasma
ants.aasma@cybertec.at
In reply to: Peter Eisentraut (#15)
Re: add AVX2 support to simd.h

On Tue, 9 Jan 2024 at 16:03, Peter Eisentraut <peter@eisentraut.org> wrote:

On 29.11.23 18:15, Nathan Bossart wrote:

Using the same benchmark as we did for the SSE2 linear searches in
XidInMVCCSnapshot() (commit 37a6e5d) [1] [2], I see the following:

writers sse2 avx2 %
256 1195 1188 -1
512 928 1054 +14
1024 633 716 +13
2048 332 420 +27
4096 162 203 +25
8192 162 182 +12

AFAICT, your patch merely provides an alternative AVX2 implementation
for where currently SSE2 is supported, but it doesn't provide any new
API calls or new functionality. One might naively expect that these are
just two different ways to call the underlying primitives in the CPU, so
these performance improvements are surprising to me. Or do the CPUs
actually have completely separate machinery for SSE2 and AVX2, and just
using the latter to do the same thing is faster?

The AVX2 implementation uses a wider vector register. On most current
processors the throughput of the instructions in question is the same
on 256bit vectors as on 128bit vectors. Basically, the chip has AVX2
worth of machinery and using SSE2 leaves half of it unused. Notable
exceptions are efficiency cores on recent Intel desktop CPUs and AMD
CPUs pre Zen 2 where AVX2 instructions are internally split up into
two 128bit wide instructions.

For AVX512 the picture is much more complicated. Some instructions run
at half rate, some at full rate, but not on all ALU ports, some
instructions cause aggressive clock rate reduction on some
microarchitectures. AVX-512 adds mask registers and masked vector
instructions that enable quite a bit simpler code in many cases.
Interestingly I have seen Clang make quite effective use of these
masked instructions even when using AVX2 intrinsics, but targeting an
AVX-512 capable platform.

The vector width independent approach used in the patch is nice for
simple cases by not needing a separate implementation for each vector
width. However for more complicated cases where "horizontal"
operations are needed it's going to be much less useful. But these
cases can easily just drop down to using intrinsics directly.

#17Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#14)
Re: add AVX2 support to simd.h

On Tue, Jan 09, 2024 at 09:20:09AM +0700, John Naylor wrote:

On Tue, Jan 9, 2024 at 12:37 AM Nathan Bossart <nathandbossart@gmail.com> wrote:

I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.

Good idea. If it is indeed noticeable, we might be able to "fix" it by
processing some of the tail with shorter vectors. But that probably means
finding a way to support multiple vector sizes on the same build, which
would require some work.

What I had in mind was an overlapping pattern I've seen in various
places: do one iteration at the beginning, then subtract the
aligned-down length from the end and do all those iterations. And
one-by-one is only used if the total length is small.

Sorry, I'm not sure I understood this. Do you mean processing the first
several elements individually or with SSE2 until the number of remaining
elements can be processed with just the AVX2 instructions (a bit like how
pg_comp_crc32c_armv8() is structured for memory alignment)?

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#18Ants Aasma
ants.aasma@cybertec.at
In reply to: Nathan Bossart (#17)
Re: add AVX2 support to simd.h

On Tue, 9 Jan 2024 at 18:20, Nathan Bossart <nathandbossart@gmail.com> wrote:

On Tue, Jan 09, 2024 at 09:20:09AM +0700, John Naylor wrote:

On Tue, Jan 9, 2024 at 12:37 AM Nathan Bossart <nathandbossart@gmail.com> wrote:

I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.

Good idea. If it is indeed noticeable, we might be able to "fix" it by
processing some of the tail with shorter vectors. But that probably means
finding a way to support multiple vector sizes on the same build, which
would require some work.

What I had in mind was an overlapping pattern I've seen in various
places: do one iteration at the beginning, then subtract the
aligned-down length from the end and do all those iterations. And
one-by-one is only used if the total length is small.

Sorry, I'm not sure I understood this. Do you mean processing the first
several elements individually or with SSE2 until the number of remaining
elements can be processed with just the AVX2 instructions (a bit like how
pg_comp_crc32c_armv8() is structured for memory alignment)?

For some operations (min, max, = any) processing the same elements
multiple times doesn't change the result. So the vectors for first
and/or last iterations can overlap with the main loop. In other cases
it's possible to mask out the invalid elements and replace them with
zeroes. Something along the lines of:

/*
 * vector8_mask_right
 *		Build a byte mask whose lanes with index < num_valid are all ones
 *		(0xFF) and whose remaining lanes are zero.
 *
 * The comparison below is a signed 8-bit compare and num_valid is narrowed
 * to 8 bits by the broadcast, so this presumably expects num_valid to be in
 * the range 0..32 (the number of bytes in a 256-bit vector) -- TODO confirm
 * against callers.
 */
static inline Vector8
vector8_mask_right(int num_valid)
{
/*
 * _mm256_set_epi8() takes its arguments from the most significant byte
 * down to the least significant one, so lane i ends up holding the value i.
 */
__m256i seq = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
23, 22, 21, 20, 19, 18, 17, 16,
15, 14, 13, 12, 11, 10, 9, 8,
7, 6, 5, 4, 3, 2, 1, 0);
/* A lane is set to all ones exactly when num_valid > its lane index. */
return _mm256_cmpgt_epi8(_mm256_set1_epi8(num_valid), seq);
}

/*
 * Final incomplete iteration: build a mask covering the (end - cur)
 * remaining bytes, combine it with the last vector-sized chunk that ends at
 * "end", and fold the result into the accumulator.
 *
 * NOTE(review): vector8_and() and vector8_add() are not shown here, so their
 * exact semantics are assumed from their names.  The vector8_and() call has
 * unbalanced parentheses and is handed a pointer cast rather than a loaded
 * vector, so this looks like an illustrative sketch rather than compilable
 * code -- confirm before reusing.
 */
Vector8 mask = vector8_mask_right(end - cur);
final_vec = vector8_and((Vector8*) (end - sizeof(Vector8), mask);
accum = vector8_add(accum, final_vec);

It helps that on any halfway recent x86 unaligned loads only have a
minor performance penalty and only when straddling cache line
boundaries. Not sure what the state on ARM is. If we don't care about
unaligned loads then we only need to care about the load not crossing
page boundaries which could cause segfaults. Though I'm sure memory
sanitizer tools will have plenty to complain about around such hacks.

#19John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#17)
Re: add AVX2 support to simd.h

On Tue, Jan 9, 2024 at 11:20 PM Nathan Bossart <nathandbossart@gmail.com> wrote:

On Tue, Jan 09, 2024 at 09:20:09AM +0700, John Naylor wrote:

On Tue, Jan 9, 2024 at 12:37 AM Nathan Bossart <nathandbossart@gmail.com> wrote:

I suspect that there could be a regression lurking for some inputs
that the benchmark doesn't look at: pg_lfind32() currently needs to be
able to read 4 vector registers worth of elements before taking the
fast path. There is then a tail of up to 15 elements that are now
checked one-by-one, but AVX2 would increase that to 31. That's getting
big enough to be noticeable, I suspect. It would be good to understand
that case (n*32 + 31), because it may also be relevant now. It's also
easy to improve for SSE2/NEON for v17.

Good idea. If it is indeed noticeable, we might be able to "fix" it by
processing some of the tail with shorter vectors. But that probably means
finding a way to support multiple vector sizes on the same build, which
would require some work.

What I had in mind was an overlapping pattern I've seen in various
places: do one iteration at the beginning, then subtract the
aligned-down length from the end and do all those iterations. And
one-by-one is only used if the total length is small.

Sorry, I'm not sure I understood this. Do you mean processing the first
several elements individually or with SSE2 until the number of remaining
elements can be processed with just the AVX2 instructions (a bit like how
pg_comp_crc32c_armv8() is structured for memory alignment)?

If we have say 25 elements, I mean (for SSE2) check the first 16, then
the last 16. Some will be checked twice, but that's okay.

#20Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#19)
4 attachment(s)
Re: add AVX2 support to simd.h

On Wed, Jan 10, 2024 at 09:06:08AM +0700, John Naylor wrote:

> If we have say 25 elements, I mean (for SSE2) check the first 16, then
> the last 16. Some will be checked twice, but that's okay.

I finally got around to trying this. 0001 adds this overlapping logic.
0002 is a rebased version of the AVX2 patch (it needed some updates after
commit 9f225e9). And 0003 is a benchmark for test_lfind32(). It runs
pg_lfind32() on an array of the given size 100M times.

I've also attached the results of running this benchmark on my machine at
HEAD, after applying 0001, and after applying both 0001 and 0002. 0001
appears to work pretty well. When there is a small "tail," it regresses a
small amount, but overall, it seems to improve more cases than it harms.
0002 does regress searches on smaller arrays quite a bit, since it
postpones the SIMD optimizations until the arrays are longer. It might be
possible to mitigate by using 2 registers when the "tail" is long enough,
but I have yet to try that.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v2-0003-test_lfind32-benchmark.patchtext/x-diff; charset=us-asciiDownload
From 9b2b61927a8b52637f70659d513ddfeba7c03024 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Fri, 15 Mar 2024 12:28:00 -0500
Subject: [PATCH v2 3/3] test_lfind32() benchmark

---
 .../modules/test_lfind/sql/test_lfind.sql     | 67 +++++++++++++++++++
 .../modules/test_lfind/test_lfind--1.0.sql    |  4 ++
 src/test/modules/test_lfind/test_lfind.c      | 16 +++++
 3 files changed, 87 insertions(+)

diff --git a/src/test/modules/test_lfind/sql/test_lfind.sql b/src/test/modules/test_lfind/sql/test_lfind.sql
index 766c640831..d8fa461bfa 100644
--- a/src/test/modules/test_lfind/sql/test_lfind.sql
+++ b/src/test/modules/test_lfind/sql/test_lfind.sql
@@ -8,3 +8,70 @@ CREATE EXTENSION test_lfind;
 SELECT test_lfind8();
 SELECT test_lfind8_le();
 SELECT test_lfind32();
+
+\timing on
+SELECT drive_lfind32(0);
+SELECT drive_lfind32(1);
+SELECT drive_lfind32(2);
+SELECT drive_lfind32(3);
+SELECT drive_lfind32(4);
+SELECT drive_lfind32(5);
+SELECT drive_lfind32(6);
+SELECT drive_lfind32(7);
+SELECT drive_lfind32(8);
+SELECT drive_lfind32(9);
+SELECT drive_lfind32(10);
+SELECT drive_lfind32(11);
+SELECT drive_lfind32(12);
+SELECT drive_lfind32(13);
+SELECT drive_lfind32(14);
+SELECT drive_lfind32(15);
+SELECT drive_lfind32(16);
+SELECT drive_lfind32(17);
+SELECT drive_lfind32(18);
+SELECT drive_lfind32(19);
+SELECT drive_lfind32(20);
+SELECT drive_lfind32(21);
+SELECT drive_lfind32(22);
+SELECT drive_lfind32(23);
+SELECT drive_lfind32(24);
+SELECT drive_lfind32(25);
+SELECT drive_lfind32(26);
+SELECT drive_lfind32(27);
+SELECT drive_lfind32(28);
+SELECT drive_lfind32(29);
+SELECT drive_lfind32(30);
+SELECT drive_lfind32(31);
+SELECT drive_lfind32(32);
+SELECT drive_lfind32(33);
+SELECT drive_lfind32(34);
+SELECT drive_lfind32(35);
+SELECT drive_lfind32(36);
+SELECT drive_lfind32(37);
+SELECT drive_lfind32(38);
+SELECT drive_lfind32(39);
+SELECT drive_lfind32(40);
+SELECT drive_lfind32(41);
+SELECT drive_lfind32(42);
+SELECT drive_lfind32(43);
+SELECT drive_lfind32(44);
+SELECT drive_lfind32(45);
+SELECT drive_lfind32(46);
+SELECT drive_lfind32(47);
+SELECT drive_lfind32(48);
+SELECT drive_lfind32(49);
+SELECT drive_lfind32(50);
+SELECT drive_lfind32(51);
+SELECT drive_lfind32(52);
+SELECT drive_lfind32(53);
+SELECT drive_lfind32(54);
+SELECT drive_lfind32(55);
+SELECT drive_lfind32(56);
+SELECT drive_lfind32(57);
+SELECT drive_lfind32(58);
+SELECT drive_lfind32(59);
+SELECT drive_lfind32(60);
+SELECT drive_lfind32(61);
+SELECT drive_lfind32(62);
+SELECT drive_lfind32(63);
+SELECT drive_lfind32(64);
diff --git a/src/test/modules/test_lfind/test_lfind--1.0.sql b/src/test/modules/test_lfind/test_lfind--1.0.sql
index 81801926ae..6b396dbd58 100644
--- a/src/test/modules/test_lfind/test_lfind--1.0.sql
+++ b/src/test/modules/test_lfind/test_lfind--1.0.sql
@@ -14,3 +14,7 @@ CREATE FUNCTION test_lfind8()
 CREATE FUNCTION test_lfind8_le()
 	RETURNS pg_catalog.void
 	AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION drive_lfind32(n int)
+	RETURNS pg_catalog.void
+	AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_lfind/test_lfind.c b/src/test/modules/test_lfind/test_lfind.c
index c04bc2f6b4..2234f148b6 100644
--- a/src/test/modules/test_lfind/test_lfind.c
+++ b/src/test/modules/test_lfind/test_lfind.c
@@ -146,3 +146,19 @@ test_lfind32(PG_FUNCTION_ARGS)
 
 	PG_RETURN_VOID();
 }
+
+PG_FUNCTION_INFO_V1(drive_lfind32);
+Datum
+drive_lfind32(PG_FUNCTION_ARGS)
+{
+	int			array_size = PG_GETARG_INT32(0);
+	uint32	   *test_array = palloc0(array_size * sizeof(uint32));
+
+	for (int i = 0; i < 100000000; i++)
+	{
+		if (pg_lfind32(1, test_array, array_size))
+			elog(ERROR, "pg_lfind32() found nonexistent element");
+	}
+
+	PG_RETURN_VOID();
+}
-- 
2.25.1

avx2_bench_graph.jpgimage/jpegDownload
����JFIFhi��C		

 $.' ",#(7),01444'9=82<.342��C			

2!!22222222222222222222222222222222222222222222222222����"��������k��09.�r��Q�9
	�&���j	�&���j	�&���j	�&���j	�&���j	�&���j	�&���j	�#i�	��@^t�@�������t�G���s^�1Xd�(&���j	�&���j	�&���j	��T�����%C�������j	�&���j	�&���j	�&���6���i6�W���vg_����N��ju�/����3
	�&���j	�&���j	�&c�l�L�R�ug52�y��GM&���j	�&���j	�&���j	�&���6���x�6f>G=����t�5�:h��Mf9�j��,�@�LQ1D�LQ1D�LS�����6��iQ1D����5�t���&(��b��&(��b��&(��b��&��������|��@�>N/5��Mt�~5�����:������b��&(��b��&(��FW-����LQ1D�1m��|��4�b��&(��b��&(��b��&(��b��#`�f�.��|���L�Y
�1r�y��:h����������)�&(��b��&(��b�����jn�Q1D�LS����M�J&(��b��&(��b��&(��b��&(�R6%���m^i��^kg����5�
m�lr�\�
O�������2�������:�b�b�_(t�'I�'�����M7�������}T���������F�l�I���394}:�	1r�y��:h��{xD���0������W7�B6
_�f�z6,L���`9}(����PMA5�W_�YZh��j%5�PMA5��qy�e4��j	�&���j	�&���j	�&���mn���m:R�#vs��b�W�w�F�s��q����Q��4�7�M�x1r�N�(y������:��X���4s�^���$"@�Mw<��m)���<��RU�P��a�7�.3
	�&���*�=���/�vg-f7�n��GO�PMA5�PO&����MA5�S��]��x���O'�s]L��g�������XZ\�w�D�E/=C��~lt[�+`�2�N����m����[u����x�S�x��8H����Q��9��3M��a{q��=��A���9��&y���u<�Gz[��[��oY����?��z�}��4
�E��OO#�����0����j�?yM�2fS�<cS/�I������������o�����s��m����I��sPML$di5�&���[�L��K������k�%��������5���<H�}�S�;�����|E5YQ�koy�I�%�k������_G����>��E:ZBj�&P\�G.�m$��Y�����������F�u�1����p��k~�=�_���xCsx�5������-�[��s�n1����v�����X��-9+F+)Y������x�����������^%��L\AqHcd��@^�������t��%,,�/_<��������of��M�b�>r�\�oC��2��~v�gfzEa9��|d�l
�?6�~������&(��b��&(��b��&(��b��&���&<�EFVCz�H�Sj�*�:�`�����rQiG����"��",��"��",��"��1�DYdE�DYdE�DYf?���",��"��T��hy�g�\��|�S�C�}i�z�2XY���1�^g0��-~����6��Mf���8F���k6��/�1��ag�Z����Nm������^���C�Q�<���XF[`y`�3}�e6H��
N����t��C:�~�����������q���pOX;�&�w��s��(�7�c�N����v�F?3����:3}�s_���c���rFO���G�����u2��fljz�;L#g��������.@023 !1#"P4A`$D�����"�������X$��`�Z�KZ63��(G-�-h�FZ2����e�-h�FZ2����e�-h�FZ2����e�-h�FZ2����e�-h�FZ2����e�-h�FZ2����e�-h�FZ2����e�-h�FC�?0=��������_��xNA�"�f��A���-D$�)XS~[�o���B.�Wm{��d�&�k&�k&�k&�k&�k&�k&�k&�k&�k&�k&�k&�k&�i�9����5�Y5�Y5�Y5�Y5�Y5�Y5�Y5�Y5�Y5�Y5�Y5�Y5�Y5�Y5�C�Oi�!	�A�!���w�B��/��9�_0=�I����R�a��������Z���0����q�~s|�<2M�ArBMbx��i�7!��� .@\��r��� .@\��r��� .@\��1	I�� .@F`�� .@\��r��� .@\��r��� .@\��r��� .@\��r�.�'��AE�e"�{|C-�"f�Yci�"����|���������/��o��o�O�����/d?��/���W�[��Gt$$�g�l�^�}�s|�<sf�V��I-h��'����e�-h�FZ2����e�-h�FZ2����e�)��e�-h�FZ2����8h�FZ2����e�-h�FZ2����e�-h�FZ2����e�-h�FZ2����d-�<������A�������;����������`�3�R��4E"o�3�l7���o���B>�1����1��>�7�Md�Md�Md�Md�Md�Md�Md�Md�Md��Rd�Md�Md�Md�Md�ME�����������������������������������������������x�N����0�21�=���J���/��W�`�J/w��/��o��q��s|�Sh��r������+��q�9.��r�ri�����#�;~�R�6�����'����r@a�@\��r��� .@\��r��� .@NB�r��� .@\��r��ZN@\��r��� .@\��r��� .@\��r��� .@\��r��]�O���e�-YO28w�
[PD�b);�����V���d]�t�~��������o����o��_)��������������+����O���,�����a���Gv��o��_���
�7��9�_��d�6���!�#R��B�!��������o�������oU��y���7���n*YBp�=�E=��tw�k������*�;���+�	���y����q�	%���������e�-h�FZ2����e��fa�~1Z2����e�-h�FZ2����e�-k��2����e�-h�FZ2����e�-h�FZ2����e�-h�FZ2����e�-bE�n \@��q��y�9;f�[I����k�O���	$���
��N�z�nYN�E]���O�p������-u*��B������D���R�������:�V�u��a����To��o�D���!xE�#���F%�m���+]M����1����_���'���6�&��`!��|d�Md�Md�Md�M��G�5�Y47���Y5�Y5�Y5�Y5�Y5�Y5�Y5�E�.Md�Md�Md�Md�Md�MX��������Q�=3`��Y5�Y5�Y5$�15��T^K�MU�5&�k&���P�Np�R�*A�R�A����XE����r�%�k����1?��"k=R8�>I��} ������NRG��Gz��~�_m���J-�������XE���c��^(�c�?��m�j�0��^�m_��&�U���h*CN+�
PV������L%����2��V*���D������0���'�!�4�E�L�1��#���J����=���M���;����?�v>�����5/I��G��� �=�����f��!�����Hq��q_\W���Z%����e:��+�����KuK��5�#�������9�h��Pc� .@\��Qf~@O4l��gj��r��V9]�Id��QE-y���� .@\��r�����[^������	�2�X�� .@Vl�5�Hu�r�;���� .@]i�����H���}�@\��r��� .@\��DcR�����?�@�e"�{~�Z���A�DN��F�l�%�:L �,,��]'�?���j�-uW��5����^�P�{�����������=��e��l���?Q��m��.�������TX�[/Q�E|��Ar�V�����n+q[�bgM�������_����c����#r�����8G����=9�W���U�����^�o�g�YX����\@�p
��+�W. VJ����h
2�V���q���. \@��J2�����g�1�jB�v
���mE�Y���;Q�o�2Z�#�����+��lu=6�U�������wOv�.������]�{]�_k!V��Sz�a�>��J�SoY�_ /^��;�! ��$��v9/^j���,��*~�����mbC��c=��u�h�=���v�.��`t� e�@�h���g]=a�Q��s�o��	;�$��lg�PcFRK�w����WF���FZ2����e�-h�����K�g��P���e�-h�FZ2����e�)C1���e�-h�FZ2����e�-h�FZ2����e�-h�FZ2����e�-h�[y���f&�u9��������?��S���c|�a�#���EB���K��[��-���|�I��F>�7��b7f#v��r6g#fr6NF���9'#d�l�<�y����c���#t�n����b7LF������r��������9'#d�l���������c����,F���1�#t�n����1�����93��r6NF���9'#e��c����'� �D������f�j�;e����N�N�N�N�N�X�����?8�����������������f�j�;e����N�N�N�N�N�X�����?8��a3a3a3a3a3a3a������gl��S�S�S�S�S�V?8���������H&i�]��QJF���bLcfn
����V&����+3�e�&54������x�����dLM�6�8&��)�h k2�"�7p\�Ubb�Cj)IM9���
�SY����H�|<�JF��d$&*����J��>����Y�S�Q�6�S�#KV�v��%��,����v��$��d�1�/�O�<�I��mG������
�`9d��B�5Zp�v"9�GeZ�U0�����������OZ}�i.	�����H��;�^�W�%Z��;�
��q�����Rq��	D=@6��}Z&�k����m�w�����Pw[��Y��Y���rv�+
h!���O rF1�����Xg�GR8��GR8�D,CZ�V�KJD���@0SV�u�du��_�p�'��q1��1D;0��"�!,M0�`����dT�)�?��4!1AQ 2@P"0BRa��$3�S`pq���?����h���c���H���3$$��{;Hm�*`a���^o���C����6�O�=��
xG)��2O3wyOg������H9)?T��>Q.&`zp���/>aD�Fs��?�3�d�T��ja�z��/�p$`�v��q�]p��f|e�b���	p�3�������.����ybs�C���&��^������S��A����)�t MN����2:��w�l��Cj���b�.�PE1?���AV[��]�-���Z�b�\?����xCfh��c8��c��b�;~axY���sdzi�(P��:����D1w�y��A�~�E����)�� �����jl'�M�:`.iM���*J4S����Ge������l��+���R/t��RT��R*D'8����K�r�87���������I�����Gph^/��
/��l���i^/����"�J�{\	h�f�6�����h�Un�Z;Uh�UN�9�'qU�U2�v��j�v��j�r>�i8+f�iR.���0��e�7K��"�9+G���~����W��_�dh������Z*2QW�r�T��:$������?��91Q !2AR"@Pq�#03Ba�4D���`pr����?��3�IvSec�h>M6�c0��yF���n^��6�N/��f���~c�����GXh�V$���:1���A4Z+}����?�^�DP�w�o����Oc=�����m��VO��_��X\�m��5e�����J��9�i�#?����B��������y�a��O��>��S��<�5��w=�G�}��6
m#y���J�fh8;��X�IV$��AA��O��=���W�����ux�Blmf��p$R����8p�:N
����"(i���`�-���K��784T��9����!��{�+py�`�.N}v)1�M�mr������C68'
�����������<4*9��
V{#M`	�Y
�Za�~����U���[��=�)��WY#u�I�U�r�o�W����b������������~������"�U��>��.iB����
�5���pO����E���#'iN�����G���5~�+�������5��6���\�^��W��^35x����@(;������i�����B$A�1Q8�U�3A�<B3�64��f���fkHx0��h�;g�,��h�?��*:4'�]RU�a�Wr��4zm`CG�|���+��B��r��������h����Ch�_$��U��K+��)L$3`E���m��UVco
�-�YnJ�rV�����n��l����������M����`A�|a������C2!B��1AQa"@q 03R���#bP$4CS`r����c������?�0wq��1���C�����>��s�<
}�y;�/j3#4����t���ke�{����l��$9��e���C��[p���=��{]��-����k{��Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-E~����O����w�?�O��������#
�X^0�^�6�ax��oFb0�oh7�}O���V]�s�}���e��Kbu��Z�6���y���]�����N�.��}wx��f�
�����[����[����[����[����[����[����[����[����[����[����[����[����[����U-���=f��f�nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���>C�7g�8�b6� ��{a�<�p���`y��/������.��P����y��������P���y�������������������
��(�2&�`Y���}�$KM�+L��|���m�M��� *���V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�T���nj���nj����f��3U�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sN����:3��6�������mA�f�Y�u
��j*g�T&�5�P���9L�}O��M�\����>���O������!�����S�����O��y��hw�Fdo��Cg���<}"�&x���.wg��|���>��Fsa������;�l�i�|V���Ok���-�Hs[��=��[l��wU�5�y��j+��j+��j+��j+��j+��j+��j+��j+��j+��j)��5�QX��QX��QX��QX��Q\sX��QX��QX��QX��QX��QX��QX��QX��QX��QX��QX��QX��QX��QX��QX��QN�w�>S��}|��w�>Nh���>|�0Z��Or0���{C�Fb0�oh7�
�^7��>����G/t|��chk�f��(gghd(3,?1���wR�f=wx��u�w������5�o<U-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��t��U-�R��-�R��-�R��-�R��-�R����[����[����[����[����[����[����[����[����[����[����[����[����[����[����[����O����A�`A���5�q���w��{a�<�p�}O��������S��Ev�>}�����k}������w4M|8�e��}����_'�Ap$HN�2�$:�t2?)�~������Pd�c{�u��ec2(&m�U��#��Ym�������k��������i�ei�O�����-6,�2B��[4w��#6HHO�����[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[s@�H*���������������������������w�IV��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�:Fw��np[�������Qm|9��hCf�
��8?��H��3�w���%����e�f��xoF4X��B,�R{�Etxn�\;���Cc���
����
��j*g�T&�5�P���9L�}O���Z;��w�>������'y/����=������T�_��5|f������kN ��O"#�1Z�;��%	��[Dn/�&��>�!��h
�"�,����i|��|����b�6�D��wMm0���c��Fr�w�������C����A��{C�x��
���+����|��5���f��	��y�w�d5����
BDK���cZ�M���"\���3�����.rg�����"�v}O���j�����A�PUPUi�Cw�kC�D�Dzt�K����V��-Y��|��w�1�hm�f`n�����%�q���B�Z]�s�|���i������Vp���h�how���>aC���}�.���Z�d�ZGQ�]�l���#����.�\2[.��=��+d��!�l{?t��	m�[u�W�����Z����Z����Z����Z����Z�����QS%���g�q�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�T��b�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-Eb�V-E;�v�[���@���0�;<����wr�m�c'�����#_�Dk_d�w��P� ����q-Y*+��wn�	t�e�L�#��C�7�������i��f�&c�0����C�������f_�������������\]�Cor���a�}�x���	�������W�6����qYa�	#�=��Fc������n�W�cei����|q>b�H�����A�~����U�n�m�� 4�M�� �ww�l����r���[w��� 8�1>W�h�;6��Cs���29��[�~����/��ka������.��j�3[���R��-�R��-�R��-�R��-�^��E��-�R����BZY�[����[����[����[����[����[����[����[�m�~����nj���nj���nj���nj���nj�����:Oz��g��C��Y�V�W5KsT�5KsT�5KsT�58�GW)��?����PoR���>k�����oT�5KsT�4�@��w�8�v�3�5���>��Y��[k/�}��v}O��L�N����]or�����=���L�SquV ~�C�[��N��������q�����p]���y�
��l8O�m���w�~�����;�?�n�i���dwY���1��;�Jv�|�?j�
��X�c�G� ���<;-�~�8����sB�8r<��'�o���8�
�w[!�b/���f�<O�sy�(����S���?.��g��V��/���H"��8��CB�Hc���7|�z��0�eh��|�F��=	e�������?ls�q]�1e�s����s�����������~��N$�M�M�b��� �oc[�|f������[NK�����/�NWG|f����g-��t�<�?��]/�.+g�bD�������l����D�!!>J��o<Um�V��m�V���i�nj���b75b��{�l�3^8��VvX3��A�n�%��w!�H��9�R8���������������������C�Wt��{vx~��������nj�������0�z����������&�����U�5[sV������������������nk�!�68x�*��h����H
�����������������������������9����p�����qP���2~
��j*g�T&�5�P���9L�}O������O2
���7&4��et�������]�M��]��L�����W����_��>|}����M�J������d����A���+�?�_
.K��T=J�:���Q'�|����9��� �%���d���x�
�
z�����*��*��*��+�>���O��
���l��.V��s�	��XG4v��-�O`����C�wP�q�K���y�<=���4�pm�z�_��O�����o+y[����o+���(�eMi!B���5L�����r��-�r��-�r�7�>^w�>�y��_���%e�~���F�-�
���w�TC��{1���x�����*���}����V!~�>c�[1��E������'�|O�����/��_[a|=P��:�%N)�`|����]x�����##i��w��,�F��L{��3�����.rg�����"�v}O���B+�p@4Z���^'�z+O�W�w���O� TX�&��Od6�d.�a��?<����e����L�|&d�2_	��5|&d�2_	��6NN	�������>������7�l0�&�'��v^��t9[%�	kc�����Km��������8�-Ed8�+��U�"$Gp��:�����X��QX��QX��QX��QX���������������.���Cm���Z����Z����Z����Z����Z����Z���k�k��j+��j+��j+��j+��j+��j+��j+��j+��j+��j+��j+��j+��j+��j+���<���Q�1��]�&6vow5.��������,���7|G�����y�s'���:�KF��Cm��������8�:m�����<�#|���#�m�]1|��f�
���f��/���/������l���l���Fj���f��_���U+#5+#5+#5+#5+#5+#4|"���E�Q����Fj���f��/���/������l���l���Fj���f��_���U+#5+#5+#5+#5+#4|"���E�Q����Fj���f��/���/������l���l���Fj���f��_���U+#5+#5+#4��n�Y�������RDsDs�D:!�MMMMMM��G��$%c��������tC������=Q��_`HJ�	I�����E555?9�0�t�6*��V� �-<d(��w�wA��C�!����->�m����:����x*$YN�����c�s�%g�W�LN$5���+Z���M���������pV��S�)�v0Z������m�x�=���[}���r�k\���78pSk��D^��rNdf��m�7�����]��i��a���5�t���@������#�m��)o�cX|GYl�.�Di�+��n�����FBI�Y��'���{`Ck���}�/�Mx��>�8�Ai��ns�|7Zi�dW
��-��i�q��>S�yp=������}����qHDE#�c�].�1���L�#+;���Na���`��%�^3 k�jSsA���}
'wq2�p���Ns�8�������4��x���g����3�6ih�R*�+�:�
��;9� 9���w�}n����Z�������{�N��|������(D�-5pO��d��Gg��gd�;C�3i��2P��1Cg�S�I(�b���K��+���r������2@�b�I��<,�;�����pcY�v])kf-xKZ�����<��<rhg����?���X.&dq��B�f�{d���2��C�a��"�$����u��rm�����E��l�bX�����w�<A���>3��gk��u����f�����No1$�5����}������l�`����:gr��F�+-�] ���'���G�!��'6Oi�OU�p��8N�Q��3��2���O�8[�}Q�e�i�E������K��* �;�1<�:���].�)�!�v���"��?��"	���E��I��QY����l!-���q���g�"�`5����c��w_����F�����Bm�hF��C��
k��L��1�5�p�;<@��;n��;<k�^\���k�^\���ai�nV[2x��=�	p�+��0Hv��/�� �	4n����
��}�.���"ff}{$�!�5�p�,x�]q5�7�����d��I�=���O�2*��e������&�����7n�����������+!1a��AQ�@q�0�� ���P`��?!���`y��F`(��
H����#X05"�j�3�����bX:k��'C@4�yz5	�<����������-	�t)��2� �wU�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{@�X�H����/��m�[:�����=C/���'
`��%
�
J=']VxsI<���p�~�����-�������)j�X��������H��Nwg���MS��xYO)�e<,������S��xYO)�e<,������S��xYO)�e<,������S��xFF�?H��������k�e<,������S��xYO)�e<,������S��xYO)�e<,������S��xYO)�e<,������S��xRH��A+��D����yR���T�� ��I�E����&^����6���t]Q�$L����_��nz��E�I��
���Jg�G��nz��<������CS�*``"���&���n�t�	���(#~xV0���,acX��0���,acX��0���,acP���� 	cX����4��0���,acX��0���,acX��0���,acX��0���,acEA���
%!2EAt������!o1���D��DtFc�n��u�t�s]��L��@�a������g]J������5�s���.��D��/�}�����nz�;�#0��Q$�wQ���}5{�E��I�E�0������K���m�PA�L��),h����<����������-	�t)��A��nU{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�T)�^�U{U�UW�U^�U{U�UW�T�,�y>�{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�U@������.�F��3)�z���g]F�_��}�����
��8R0 ���Q!�IB�C������D�a�����6��~�Qw"a�3l�yOU��������H��Nwg���MS��xYO)�e<,������S��xYO)�e<,������S��xLa��xYO)�e<,������S��xYO�1Y�YO)�e<,������S��xYO)�e<,������S��xYO)�e<,������S��xYO)�e<,��������/�P4�G�7xp">��� ��!*�.��u�tT�&��}�����=l�����	�Oo�u�m�[:��.�>������@�If���X~K�F��N��4<(��
 �O��#�h-�#`X����s��@$*)�X@ Z�q��L;�dO00p��t�S�!7Ll{pE��M��aA�����,acX��0���,acX��0��R5M����,acX��0���,aH
E�0���,acX��0���,acX��0���,acX��0���,acEA����Xe�Xd���vx}�n`��8Q��f���`�P=|��vI(6�����w${�[�x#&"�����FL"E��":#1�h��a�:���"G�^6��v0�i�GE:��@9����W��n}1��Z����V������x��9�R�@N� �I�C���&n�c�F	��N��t�)���G3����9�M�.1�A:���@9,,D@a��N�����g!�(�Q"?E
�����,H�B���@�jM
%+p��c�;���
25�R/�� �/c=H����:���8K��{�u�m��,�gqgQ�K`ID��M��m����2�71��H$�s@:
nP�7����"/8(�� �T�c�#f�������vZ���:|�@u�����f�7bte��.�c� ��IY�d�C+�{�	�H^��:����7B��D{��W�U^�U{U�UW�U^�U{U�UW�U^�U{LN����Q�u�@�E9{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�U2cs���*����b�����*����b�����*����b�����*����b�����*����b�����*����b�����*����b�����*��������@dU��p��D��@��8'gb��s'!�O���V���J�.N�@�!t.G��AE'"������s������2� ���@l� ?7��)�wP�H�����hI�������P7� B�#�O�:����*�9� :/n������H�%�E��Vm��p�F�q���/m�s�71=���%3���#��h&]���R���g]�J�C��� I`�����[�j�O�Y:���z	?��#$�PD�I>�1����xO���Cw����[��Nwg���MS��xYO)�e<,���������haO��)�"�������S��xYO)�e<,������S��xYO)�e<"t` 0YO)�e<,������S��xYO)�e<()4������?����������,�4YO)�e<,���������!���p?I���%��B���_	�qn������S��;V���# ��FX�1.��Cqs�8�< ���)vqx��t^����������j�H|��N��}���� L"�UL������#��HJ����,L������2Q�AG�������o�?�^���t_���-7�=HC�=�����`>����x)=���4������������j��u���1��,�+�A5C<��)v���Q�	��/N�?i��a���5�7�+�%�}l����z�+�P�
.��0H0��,x�
���yE�
E�Y���07���UV>.���$�t�����@I��1�&�Eb]���QGK����������(�@D�uB#$V��������?��<�s�j!�<�U?������0��>����s��`<�h��l�*``"���&���n�t�	���(#~xV0���,a=Jl
��@��k�E��B�����|	As������x�wE4��X��0���,ac�9�DM!����@_��o���XB���4��	����,ac.<Q�FcG����+X��!����0���,acZi�(�G��/���%�V!����,acX��0���,a1T@.�G��
��(P���k�!o1���D��DtFc�n��u�~[Qg;CpA������Z��W��we�(\U =/i���E����d�7=z���v%������Pcq, �5Y��+���	��[��$f�&1p<r�����%�D(-`P}(�`"��3$Y�BB<�5�D�"�p m+8��8��8��8��������g]�N�
��x�:D�
`����I�L���S�/2-�P��������y�v�?:[���y���#L�A	-�Bd4_�$w?����Co����B��D���������W
�U��p��\( Hb�����t_�p��S�u����u(1	�ni�]���K��W��q�P��EW|�>���<q�L�d��")�"�w�W|���KF�"��j���o��O�M
���H���|����������-�y�����I8V��$f��`�����0��R/��zN�/P��~�
�L��Q�����f��I�<�M|@���k0pO8-������`�X*�x)7�+X*�x �$~8@�$-<�wO�]f���t��N��h<���j�'N�@�$/M��>��U�Uj��N�6��2�`9#�1�/Z�b�����*����b�����*����b��p�m$�M��(���S��,P����b�����*����b�����*����b�����}H��N�aW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�T�/�[:��@�e�A�,��J)�Y��q�\;���J��nz�q1w@������t�	$	?���������_�l�����0�sz20����'[��!.&���	������=�_c�������=�	s4?K��y�B\��� MvDi���M5�@��M��4�n�P1�v����Lh���c@�D@&����]��k��i>�H�����G��&�vBk �e}��f�vFk �d%��<�!.f���	s4?�M5�	��"4�eA7j��	�UD��
��&4n�Q1�v� MvDi���M5�	q4�K��x�B\M#��Y�!5�{����3Y�#5�{��h~�3@�������@&����]��k��� ��Ti���cH&�U �t����71K����������
��B d!�[�; d#�B;���,w�X���@"��[�1��"��/�1��"���/�1���S��/��s�nt-��,7'�XnO�����!�B e� #�B; d!c�?B�p~���� nt-��E���/�1��"���/�1���S��/��s�nt-��,7'�XnO�����!�B e� #�B; d!c�?B�p~���� nt-��E���/�1��"��/�1���z��3����4��s8��&MV��Q,	gC���6��@�P�������`��i�E��F� �>�bHL��-%�cwLv�>FX(]���h(V��,GWt�:������'Pt.X��6K@�;N����������d*,�'(OO�}&6��\�6�s�;�1�O8p���!�E}����8)�}��aF�8�"��m@�$��Ag�M���8(�@��r7���Q�X�5�JSh�'@��j�v �P���'�o��P�#,�4P�0RZ5��F���=H��h~B!H�����>���B��6�K�}�� ���$L������Y�������f�k�N�E�F�T�N����6����:(�� �0t?�H1..���q��
��`�O)�3pu�����b�"B�s��q��
��x�i;��YD^�!8�����3��DP�@�KI�c�"�
 �*"�1G�*�

F�q>�2=�4ES��KA��h���G<�ic4Dt�mH6!��Ptt&&�"�����,����������}�GFK���g6w@K�;���<X|F%��N@��r��F&�0�OX�hD�;�C��"-��1*��0��%��L���.;P�;���D��@��EH�ShNK�c���4p���L`x���F\������hq���x#�"�Zz! 
�E��b��}P�o�A��e62�f��#H�A�a����X��!�6��9�	����OH��F��z�������@Wap����I'T��}��?��Y���w'S��<w��	����P��b�HG�Y����D�;Z���A��
T�=�j0�@A ����P�s�T�A�u^Bd�nI��O�� /�E`i���8	cb�(����A��BD�?�/���O<��<��<��<��<��<��<��<��<��<��<��< �0�0�0�0�0�0�0�4��<��<��<��<��<��<��;��<��<��<��<S�<��<��<��0�0�0���0�0�0�1O<��<��<��(��<��<��>��|��<��<��<��<��<��<��(`�0�0�P�!0�0�0�0���<��<��<��<��<��=�<g�<��<��<��<S�<��<��<3�0�0�0�0��0�0�0�0�O<��<��<��(��<��<��<��|��<��<��<��<��<��<��4�0�0���!�0�0�0���<��<����<��<�����<��<��<��<��<S�<��<��<��0�0�!�0�0�0�0�0�A���4��<��(��<��:[�<��|��<��<��<��8�~,��<��<��<��>��<��)��<��<��<�����N4����0�p�0�$�0�0�0�S�,��<��<��<��9s�<��<��<��<��<����O<��<��<��0�^��0�0��0�'�L ��0�|��<��<��<��93����<������5��������<��<��,��
pFd�-��p��3�c��<��<��<��<����2���~����q+|{�<��<��<��<�O$�C��<1���0�0�0�<��<��<��<�#���<[�<��<��<��<��<����<��<��<�\0�0�0�0�0�0�0�0S�<��<��<��<��<�4��<s�0��8�L�<��<��<��<����<��pJs�4�,s� 0�,s�<��<��<��<��<��<��<��<��<��<��<��<��<��<����,!1 AQaq���@P���0���`p��?����3��������
���!�T[��I6���u���� -�uYh��@�z��m�u�t�=��s�����C����|�t��"�{O�B��)���it>DEx������)F�?�����qoK'��7QFg��A)��n�����q���4T�&h� 
�)[=}�~L�P�h�qY�:��e^LDPs8L��"����w?���+��	[l������E�L��HCe��tV]�EY��2�X��mf$�+[�
����)�D��m~�y��+��
S����(<4/�������)o� s^���K�n��f\��3M�[zB�a�/ 
����D���F�_��u�U������b{�u����M���;�3�N��Sp�-��3%�(?��X�~��u�	���GY��:�x��f�n���H�-�y
i���f8��F��;JU����TPKw�9���0F
�v��b�"=�
�)�g�`���S1QX|��������[�8�(k�i��kM�ox�7q�a_�)E�[�}C!��&��%�I�����tqo�0�Ovo��O���`9>lu�����i|������	��b�
;��{z+����?��,1!Aa�Qq @P��������0`p��?����c�:��_��*�MBS���PG����J�I�Ng�t��_�wt��K�G5������t������@8����S;�wB����Q�����-���]\�gtb�9,�~��s��������k���<��:�:�����}-&���j��Z���Z�����<���yJ9�%ZU���zZ�O��p�<kK�W���.D����S>$��:�K����H����*j)p#�:H
��93|r��o>��j;+�P�Sg���:L����<����V��	/r������d�i08V3��!����F��S ����kS&���@Q��n-}�L��p|�o�#���z�@��x�=������	�P''ZH��b�J�������S�1��ZZY�>j{-��nW����1��(Y�~���1���{,9Y{�T�����K���������
���H,tY�y)��DA�e6gr���#�9�B��r����b�.�~�@�;����.�iw(�:�0;�0���S��g���C�����n� �V�r����Vo����RYw(l��V��Z!�<0���h�F��Z*
�R��(Mc���b���.�_�W���.��!94z@���'��T-��7/
�s~)�S�aK������a�����i6�2m@XmJ\mSpHS3��w������;�iR�x�R�{�+���8��_����,!1A��Qaq����0@� P�`�p��?��X%��(!�t�D���BD�j$�XB$��R����!"K����"	�-\d�rNL�y�B[�c��I���6oH���$��@�7���S�����6�"��4J|D������J�*T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�R�J�%tHD�<�C��T#P�����Nb�����J�]XG2*Mua�d���	��1��~����c���"L�e�:��;,|�b	v���}�������z���k��4���F�������*T�R�J�*T�R�J�*T�R�J�*��!+�����j�I�C�!W���G���*T�R�J�*T�R�J�*T�R�J�*T�V�8D�C�;�y�CRk�fA-J���B_:�ayH���S�i���~���>i����j W�9��0K������B53��N�=�Vb�>�|��?��"��D�=����,�����0\��Y�B�R�*��X�B�R�)Y�>�l6XF5��p���o\+z�[�
��V�����p���o\+z�[�
��V�����p���o\+z�[�
����
q�OL��
��V�����#�X
�lc �����p���o\+z�[�
��V�����p���o\+z�[�
��V�����p���o\+z�[�
��V�����p���o\+z�[�����=?�>|��p�*:Z�ddl�i@��\[��C"6�Y�uEXuDKh�,���p��:�X�����w�E>>���_�5:c���-O�~���x������}�#�)z����r�d���q�IC���Y�6��$�[0AaH��-Kf� ��."��Ro�7�������/�r2P_&��lO�
,�)>x���f��
p�1t&�DRbX��ON6����T�R�J�*T�R�J�*T�R^��2{�k�jT�R�J�
�fR�FZ�h'���R�J�*T�R�J�*T�R�J�*T�R�J�C���_�j]2m��2�^����#S��}������X���������Nb��vI��J�]XG2*Mua�d�Z�B�;~�+
�>����y���FOUn�������rU�E���s�S95�Z���x���iY�
�35Y{��*T�R�J�*T�R�J�*U��Y������*T�R�J��������*T�R�J�*T�R�J�*T�R�J�*T�R�[��q��b�$��J��#Q$5*���	|�U��"��%N�
L9w��n��w��#R�x_;��O����j(.���}�K�����W�Bu�����f�3���KY�u]Es���2v$�H���
d��
/!5���1�����E�c|aR>���R�qs0��Y�B�R�)���f�A������.�A���@nO�
���k�o\+z�[�
��V�����p���o\+z�[�
��V�����p���oD�`��u�k�o\+z�[�
��V�����p���o\+z�[�6�B
�k�o\+z�[�
��V�����p���o\+z�[�
��V�����p���o\+z�[�
��V�����p���o\+z�[�
��V�����I&z~�" �F�@��+�
���X�D��1^?��36��{R.F��n�s��"�������t{�qZ�t`�u%���t��$��`������~�(dF�01W�6P���`b�QV@��01�u3�Xuh������?'�DoO�$�������9p�D&��`��Z\c�"�4}k�W�W�T��yTc'Rx$��7�D��tJ���J���`(J�I��[��xJe$f�bUaB����V���@��H�64C T�T�a����Q���j��w���m������x��IYLr1�|*��k"-V�#H��@� b$��
�L�@��,���T)\���	qS��8�!�!qH-� ��$Iq��q
BD�o����ui`��_����gwi�	dJ�W�U�~�_�T	�U8G@[T
��
�2b
\�o�$��X����@��f%�h�7�Nm�M�,��`�G`�FI��;����������/Y�(�
�q���		Z�����4RQ
��/&�"j��E�K���� q��,$a��:S����]C�
���<L]D	�x�z8\��k�)1,sD��a�{�T�R�J�*T�Q@q7��#-G�������OH��T�R�J�*T�R�Bv����}�>�J�*T�R�J�*T�R�J�*T�R�J�*Tb���j�)N������W����S��1����f�$�Z�����/RtJ��0�X$&��`�\�X!��q]+�R=W�N`����f!����f��)ir6])�(��d�H���?�F�����X���szY�5"'Z��D���eRDT��|���pZ���:��333��"�$�"�VfQ��?����I@���k����I"�����H��Y�R�RM�*K*#����6����%��+�R\R2xI�]\vt������^�o�j"H
5h0H���������T���_�Gz����[��n��0z��[�������G���;��tA~��K�(�e��N��OJq��&#k ��4=���ph�a�""��x� +Ax:H���Ti�����2�J�*T�X����I��G�� �*�0�PT��rbm�o�*T�R�J�*T�R�J���$s�O�T�R�J�*T�U��Mn�?�ZV��h���}�%������2[t�NN���R�J�D!#��]����K�l	,����U���|4Z�F�]����J���7Gb0���iM���L|>+-%-�P7z��9U;,��S4�TdD��/���/�-@�m���T���1��/ibqz��q4��� �FD����%�I�|�q)�c��Lk�gB�=J2��@�K�wS��T+W&2����;�����[����-u<H{T�������P�����x�'��IQ_�Z�~�?��Y����&-��;���Ti\�W�x����,������#�|e���W�UoV��FX��������n�DT��?W�*d&�fF��&�B���Q�:C��~���`��Z �:}�0#!���s�����}�K
y��c\����?���G�VV�O�����d�#W��@@��<%x�
)����@y�y�l$��X�+W���� ����y����P�/����<��������I�q�w�A�S-��r�.��(�����Jz��M���W��X�Vv����Da���b���X�����O���CH�����4��O9��Es��+c'��������+����A��F%4�2�����O�{�Mo��"H����tze@���R���ck3\��W�]��^��W� 7'�����������p���oO4�m��
�t[�k�oH����V�b�����Z�e�~��������H��f,�0x�Jv�����-����g-B- @m\+z�[�
��V�����p��!*��u�x��4�eq	�3��K/ i��/z�Z�
��/����@��=/��p���oO���[�O*
���/+z�[�����Gj�[�
��V�����p��IX#PS6�)j!��>��Y^�v��"dDE��%���o\+z�[�
��V�����p���o\+z�[�� )=hMfH���	+��HJM��(�6y��~�(dF�01N���H�m��Ubac�G/��G��<��`��Q��$/������{K��������Lj�u/p:T�4�K�3�D�@���v/I������dS���A������Y	��"������k~�XP0�����W�%�����(lQ���J�������i��3���p�F��iMu�&A���Q7ZrR$c�:~������0�ai���Cd�/�G�rN{<��(SK��U������X:�5�~�_�W�U�~�_�W�U�����D�$~
]����a�w��z�xKt��c�
Q �qn�^�L��(d�Wyq)>�9����1n��p���3�!O9K�S��9� 
?��@@��I�NLC��/R��A��	=��w,u���I��L���v�u:I�V4���,��O��v��������9+���KV��U\��RQ���.�"

�(9@MX����M~�_�Vt�
W����]����Uw����U�����Q)/�sP�VF����j2)�b�f����|�T!���i�m���c���,��yl��������z�`?� J�C�������q0W���@�Q����#���=�+�e�/z)����-:�-q����Xb�=� �+�v�;�`�r����-�
6J 
cT�!�.P��if��P��z�����X�
N��.�K^����{�E`����\Ef,!	\E�l�@B��%�[��&C��=
�P_�����e_m1x�5|����"(�����1�� _
y�F�� ��E~��!Xd�G�`����8?F)��|;`C�}
��J��W��&��|X���?*��W�����(X|�]!k���W=(�$T���>'��A�M�j�%S�rd��
��<L]D	�x�z8\��k�)1,sD��a�z��G�(��9|S�
�[u����O��y;���1`����R�J�*T�Ri2u1��������B�BMh(������<�Q�e]�����*T�R�J�*T�A8�W�B��`S����T�R�J�*T�R�J�*T�R�J�*T�R��/��W��1���u��\!"'����w�(� �Q�.��5��"���~�L,k��)����D�M���Qn��z���?M��A�,t��~gY��^���g������?���F��L�||����1!b��x� +Ax:H���Ti���������#��c0�Sc0�_&�"|��h@�j�h@�j%1�y@{�Jc`����������4�!����x��iM<y������q�I[hg �T���q�C_hg�{�5��q
��C_hg�{�&�D����O"^`~e	��/0?2��)m).R>�R\<�}�l`�jl`�k����O����M]��MD�6(}	LlP���<�=���<��ZSOa~-)��0����� A��+m�j����!P��k��z����!P��M<�y���&�D����O"^`~e).R>�R\<�}��0�yH�h���$����$���9��=;9���9���LlP���<�=�%1�y@{�M<y�����<��ZSOa~-%m��@��RV��>�%m��@��Sx�"u��s4�TdD��/�J(�uJ���Q�����a@0N��.`"��No��"f)�D$�$P�@�����(�a�	�BQ�����0#�&�)�`GXM�S,�������i
/��r�
!E�R�X�(�
� 4�S�Ad�*|�#,��O�	�dOX
�,���&Y�=`6)J(�uJ���Q�����(�q�+�N��.`"��\�EkZ��"=i�D$�$S��I�H�(�a�	�BQ������T'�)�`GXM�S,������Y�a7ig,�_�,��B�����H>T�@i���2�
 T�P�fD����2����Be��b����T��)E��_�R�7R�4�(�)�P�V����#���BL�E;�����P�%,:�>(J XuB|R�fu���2��	�Je��v�r�
!E�R�X�(�
Y�4��Ad�*|�2�*y�.�ff������E1j+���p��f�����1 X2�9\��rQ�1��H���)�
�{����Jl���D�RM~�r��qr
V���L}P�D���&]k���(�Ge�M�O�J�)
Ub�!WtcI�3�A��I��Q�>��t,e=W�r�
!8a9���z��"
�]V�E8�C�x%�a��ZD-Y���]��������PtZ7�DV�U[���	?Bch;�,E����*,�"m$��M��EV:���=�
K%��Q>��V��J,���TO��T�F� �Y��e $�a�������K*��G������Z��r��&0��!	�
\��K��������^W��
�f������G$$�VX���x�fBl�"�a�&���
����^t�1�;��5���%K�9�P��<�	quo��kn�.0�JlqJ�%1��=Y�n��ig"���J�������#�(�����B�!�a�����1fAi��%0���nC�"�����,"B���/���-�R�`�.�"�� ����"����T�2ivE��ie��&H���2��O��X��a8�&��H���A\�+�P	�1��� ��4T��d�!j��&��r�i�=��HH����A�Y�d��L?C�eZ1$�Z"#�jL�D!���m@4}0RQJ��4h�"��3dKh"�GK�i�>��R�@1�&�����1� �p(uI$�E����b�u�F�5�
�a�
����Ibn^�M�P��>t���:1Y�&�^�X�����,eV�
�`�&�:��BP"Ix�1�lB��Ale�U`�b�h{2Z�W�KBURC�@�i�)����2�a�49C�\�l�&�R$*�[�Hn�HP.@P��s6Y*����.!�f���mAe�$��C��)jR��5��Y�E���:p9�p���0��],V�"Hd�����&�f(�%��IH�$�$�9�K�������W�+ �I9�����h}0���iZB&#Pap�>�S�H9�p���}�N�BW.0}[�N�BW.0}	))5	d+�`>�o�5FHQ.T�"� ��Q���H ���"������`�F&�(���fh�4>�9����,���J#�*��X�!(�:Q�
���Y[��1J��d��u�,��DGk!"Aq�R���*V�f�>������H�*R�U���8��+$*(�j
&i�~#;���� @�6A���e-[���
v2-0001-pg_lfind32-process-tail-with-SIMD-intructions.patchtext/x-diff; charset=us-asciiDownload
From 3a4d74eeab18d9e8f510e11185109ed910e40268 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Fri, 15 Mar 2024 12:26:26 -0500
Subject: [PATCH v2 1/3] pg_lfind32: process "tail" with SIMD instructions

---
 src/include/port/pg_lfind.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..9d21284724 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -103,7 +103,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
 
 	/* round down to multiple of elements per iteration */
-	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
+	uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
 	bool		assert_result = false;
@@ -117,9 +117,11 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			break;
 		}
 	}
+	i = 0;
 #endif
 
-	for (i = 0; i < tail_idx; i += nelem_per_iteration)
+retry:
+	for (; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
 					vals2,
@@ -157,6 +159,16 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	if (i == nelem)
+		return false;
+	else if (tail_idx > 0)
+	{
+		tail_idx = nelem;
+		i = nelem - nelem_per_iteration;
+		goto retry;
+	}
+
 #endif							/* ! USE_NO_SIMD */
 
 	/* Process the remaining elements one at a time. */
-- 
2.25.1

v2-0002-add-avx2-support-in-simd.h.patchtext/x-diff; charset=us-asciiDownload
From 0ac61e17b6ed07116086ded2a6a5142da9afa28f Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Fri, 15 Mar 2024 12:26:52 -0500
Subject: [PATCH v2 2/3] add avx2 support in simd.h

---
 src/include/port/simd.h | 58 ++++++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 597496f2fb..767127b85c 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,15 @@
 #ifndef SIMD_H
 #define SIMD_H
 
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
 /*
  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
  * that compilers targeting this architecture understand SSE2 intrinsics.
@@ -107,7 +115,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 static inline void
 vector8_load(Vector8 *v, const uint8 *s)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u8(s);
@@ -120,7 +130,9 @@ vector8_load(Vector8 *v, const uint8 *s)
 static inline void
 vector32_load(Vector32 *v, const uint32 *s)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u32(s);
@@ -134,7 +146,9 @@ vector32_load(Vector32 *v, const uint32 *s)
 static inline Vector8
 vector8_broadcast(const uint8 c)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi8(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u8(c);
@@ -147,7 +161,9 @@ vector8_broadcast(const uint8 c)
 static inline Vector32
 vector32_broadcast(const uint32 c)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi32(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u32(c);
@@ -270,7 +286,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
 static inline bool
 vector8_is_highbit_set(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
 	return _mm_movemask_epi8(v) != 0;
 #elif defined(USE_NEON)
 	return vmaxvq_u8(v) > 0x7F;
@@ -308,7 +326,9 @@ vector32_is_highbit_set(const Vector32 v)
 static inline uint32
 vector8_highbit_mask(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return (uint32) _mm256_movemask_epi8(v);
+#elif defined(USE_SSE2)
 	return (uint32) _mm_movemask_epi8(v);
 #elif defined(USE_NEON)
 	/*
@@ -337,7 +357,9 @@ vector8_highbit_mask(const Vector8 v)
 static inline Vector8
 vector8_or(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u8(v1, v2);
@@ -350,7 +372,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_or(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u32(v1, v2);
@@ -368,7 +392,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_ssub(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_subs_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vqsubq_u8(v1, v2);
@@ -384,7 +410,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi8(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u8(v1, v2);
@@ -396,7 +424,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_eq(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi32(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u32(v1, v2);
@@ -411,7 +441,9 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_min(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_min_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_min_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vminq_u8(v1, v2);
-- 
2.25.1

#21 Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#20)
4 attachment(s)
Re: add AVX2 support to simd.h

On Fri, Mar 15, 2024 at 12:41:49PM -0500, Nathan Bossart wrote:

I've also attached the results of running this benchmark on my machine at
HEAD, after applying 0001, and after applying both 0001 and 0002. 0001
appears to work pretty well. When there is a small "tail," it regresses a
small amount, but overall, it seems to improve more cases than it harms.
0002 does regress searches on smaller arrays quite a bit, since it
postpones the SIMD optimizations until the arrays are longer. It might be
possible to mitigate by using 2 registers when the "tail" is long enough,
but I have yet to try that.

The attached 0003 is a sketch of what such mitigation might look like. It
appears to help with the regressions nicely. I omitted the benchmarking
patch in v3 to appease cfbot.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v3-0001-pg_lfind32-process-tail-with-SIMD-intructions.patch (text/x-diff; charset=us-ascii) Download
From 3817435d200af5da954d505ae66245662dea064c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Fri, 15 Mar 2024 12:26:26 -0500
Subject: [PATCH v3 1/3] pg_lfind32: process "tail" with SIMD instructions

---
 src/include/port/pg_lfind.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..9d21284724 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -103,7 +103,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
 
 	/* round down to multiple of elements per iteration */
-	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
+	uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
 	bool		assert_result = false;
@@ -117,9 +117,11 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			break;
 		}
 	}
+	i = 0;
 #endif
 
-	for (i = 0; i < tail_idx; i += nelem_per_iteration)
+retry:
+	for (; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
 					vals2,
@@ -157,6 +159,16 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	if (i == nelem)
+		return false;
+	else if (tail_idx > 0)
+	{
+		tail_idx = nelem;
+		i = nelem - nelem_per_iteration;
+		goto retry;
+	}
+
 #endif							/* ! USE_NO_SIMD */
 
 	/* Process the remaining elements one at a time. */
-- 
2.25.1

v3-0002-add-avx2-support-in-simd.h.patch (text/x-diff; charset=us-ascii) Download
From a867e342db08aae501374c75c0d8f17473a6cbc9 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Fri, 15 Mar 2024 12:26:52 -0500
Subject: [PATCH v3 2/3] add avx2 support in simd.h

---
 src/include/port/simd.h | 58 ++++++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 597496f2fb..767127b85c 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,15 @@
 #ifndef SIMD_H
 #define SIMD_H
 
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
 /*
  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
  * that compilers targeting this architecture understand SSE2 intrinsics.
@@ -107,7 +115,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 static inline void
 vector8_load(Vector8 *v, const uint8 *s)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u8(s);
@@ -120,7 +130,9 @@ vector8_load(Vector8 *v, const uint8 *s)
 static inline void
 vector32_load(Vector32 *v, const uint32 *s)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u32(s);
@@ -134,7 +146,9 @@ vector32_load(Vector32 *v, const uint32 *s)
 static inline Vector8
 vector8_broadcast(const uint8 c)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi8(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u8(c);
@@ -147,7 +161,9 @@ vector8_broadcast(const uint8 c)
 static inline Vector32
 vector32_broadcast(const uint32 c)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi32(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u32(c);
@@ -270,7 +286,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
 static inline bool
 vector8_is_highbit_set(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
 	return _mm_movemask_epi8(v) != 0;
 #elif defined(USE_NEON)
 	return vmaxvq_u8(v) > 0x7F;
@@ -308,7 +326,9 @@ vector32_is_highbit_set(const Vector32 v)
 static inline uint32
 vector8_highbit_mask(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return (uint32) _mm256_movemask_epi8(v);
+#elif defined(USE_SSE2)
 	return (uint32) _mm_movemask_epi8(v);
 #elif defined(USE_NEON)
 	/*
@@ -337,7 +357,9 @@ vector8_highbit_mask(const Vector8 v)
 static inline Vector8
 vector8_or(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u8(v1, v2);
@@ -350,7 +372,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_or(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u32(v1, v2);
@@ -368,7 +392,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_ssub(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_subs_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vqsubq_u8(v1, v2);
@@ -384,7 +410,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi8(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u8(v1, v2);
@@ -396,7 +424,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_eq(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi32(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u32(v1, v2);
@@ -411,7 +441,9 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_min(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_min_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_min_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vminq_u8(v1, v2);
-- 
2.25.1

v3-0003-optimize-pg_lfind32-by-processing-tail-with-fewer.patch (text/x-diff; charset=us-ascii) Download
From edd188759e4b937089c5bc2259a401cea1f9331f Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Fri, 15 Mar 2024 14:27:52 -0500
Subject: [PATCH v3 3/3] optimize pg_lfind32() by processing "tail" with fewer
 vectors

---
 src/include/port/pg_lfind.h | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index 9d21284724..9c6cce0b69 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -100,7 +100,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
-	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+	uint32 nelem_per_iteration = 4 * nelem_per_vector;
 
 	/* round down to multiple of elements per iteration */
 	uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
@@ -120,7 +120,6 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	i = 0;
 #endif
 
-retry:
 	for (; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -160,6 +159,32 @@ retry:
 		}
 	}
 
+retry:
+	nelem_per_iteration = 2 * nelem_per_vector;
+	tail_idx = nelem & ~(nelem_per_iteration - 1);
+	for (; i < tail_idx; i += nelem_per_iteration)
+	{
+		Vector32	vals1,
+					vals2,
+					result1,
+					result2,
+					result;
+
+		vector32_load(&vals1, &base[i]);
+		vector32_load(&vals2, &base[i + nelem_per_vector]);
+
+		result1 = vector32_eq(keys, vals1);
+		result2 = vector32_eq(keys, vals2);
+
+		result = vector32_or(result1, result2);
+
+		if (vector32_is_highbit_set(result))
+		{
+			Assert(assert_result == true);
+			return true;
+		}
+	}
+
 	if (i == nelem)
 		return false;
 	else if (tail_idx > 0)
-- 
2.25.1

avx2_bench_graph_2.jpg (image/jpeg) Download
#22 John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#21)
Re: add AVX2 support to simd.h

On Sat, Mar 16, 2024 at 2:40 AM Nathan Bossart <nathandbossart@gmail.com> wrote:

On Fri, Mar 15, 2024 at 12:41:49PM -0500, Nathan Bossart wrote:

I've also attached the results of running this benchmark on my machine at
HEAD, after applying 0001, and after applying both 0001 and 0002. 0001
appears to work pretty well. When there is a small "tail," it regresses a
small amount, but overall, it seems to improve more cases than it harms.
0002 does regress searches on smaller arrays quite a bit, since it
postpones the SIMD optimizations until the arrays are longer. It might be
possible to mitigate by using 2 registers when the "tail" is long enough,
but I have yet to try that.

The attached 0003 is a sketch of what such mitigation might look like. It
appears to help with the regressions nicely. I omitted the benchmarking
patch in v3 to appease cfbot.

I haven't looked at the patches, but the graphs look good.

#23 Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#22)
5 attachment(s)
Re: add AVX2 support to simd.h

On Sun, Mar 17, 2024 at 09:47:33AM +0700, John Naylor wrote:

I haven't looked at the patches, but the graphs look good.

I spent some more time on these patches. Specifically, I reordered them to
demonstrate the effects on systems without AVX2 support. I've also added a
shortcut to jump to the one-by-one approach when there aren't many
elements, as the overhead becomes quite noticeable otherwise. Finally, I
ran the same benchmarks again on x86 and Arm out to 128 elements.

Overall, I think 0001 and 0002 are in decent shape, although I'm wondering
if it's possible to improve the style a bit. 0003 at least needs a big
comment in simd.h, and it might need a note in the documentation, too. If
the approach in this patch set seems reasonable, I'll spend some time on
that.

BTW I did try to add some other optimizations, such as processing remaining
elements with only one vector and trying to use the overlapping strategy
with more registers if we know there are relatively many remaining
elements. These other approaches all added a lot of complexity and began
hurting performance, and I've probably already spent way too much time
optimizing a linear search, so this is where I've decided to stop.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v4-0001-pg_lfind32-Optimize-processing-remaining-elements.patch (text/x-diff; charset=us-ascii) Download
From 2f4a7747025cd3288453fdabd520638e37e3633c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Mon, 18 Mar 2024 10:44:08 -0500
Subject: [PATCH v4 1/3] pg_lfind32(): Optimize processing remaining elements.

Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/pg_lfind.h | 42 +++++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..bef0e2d5be 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -95,15 +95,16 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 
 	/*
 	 * For better instruction-level parallelism, each loop iteration operates
-	 * on a block of four registers.  Testing for SSE2 has showed this is ~40%
-	 * faster than using a block of two registers.
+	 * on a block of registers.  We first do as much processing as possible
+	 * with a block of 4 registers, then we try to process what remains with a
+	 * block of 2 registers.
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
-	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+	uint32		nelem_per_iteration = 4 * nelem_per_vector;
 
 	/* round down to multiple of elements per iteration */
-	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
+	uint32		tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
 	bool		assert_result = false;
@@ -157,6 +158,39 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	/*
+	 * Try processing the remaining elements using 2 registers instead of 4.
+	 */
+	nelem_per_iteration = 2 * nelem_per_vector;
+	tail_idx = nelem & ~(nelem_per_iteration - 1);
+
+	for (; i < tail_idx; i += nelem_per_iteration)
+	{
+		Vector32	vals1,
+					vals2,
+					result1,
+					result2,
+					result;
+
+		/* load the next block into 2 registers */
+		vector32_load(&vals1, &base[i]);
+		vector32_load(&vals2, &base[i + nelem_per_vector]);
+
+		/* compare each value to the key */
+		result1 = vector32_eq(keys, vals1);
+		result2 = vector32_eq(keys, vals2);
+
+		/* combine the results into a single variable */
+		result = vector32_or(result1, result2);
+
+		/* see if there was a match */
+		if (vector32_is_highbit_set(result))
+		{
+			Assert(assert_result);
+			return true;
+		}
+	}
 #endif							/* ! USE_NO_SIMD */
 
 	/* Process the remaining elements one at a time. */
-- 
2.25.1

v4-0002-pg_lfind32-Further-optimize-processing-remaining-.patch (text/x-diff; charset=us-ascii) Download
From 68ee8bf34c80a0a3df02c2aae8357f664895b4de Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Mon, 18 Mar 2024 10:55:50 -0500
Subject: [PATCH v4 2/3] pg_lfind32(): Further optimize processing remaining
 elements.

Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/pg_lfind.h | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index bef0e2d5be..83fb8f50d2 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -96,8 +96,8 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	/*
 	 * For better instruction-level parallelism, each loop iteration operates
 	 * on a block of registers.  We first do as much processing as possible
-	 * with a block of 4 registers, then we try to process what remains with a
-	 * block of 2 registers.
+	 * with a block of 4 registers, then we process what remains with a block
+	 * of 2 registers.
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
@@ -120,6 +120,15 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
+	/*
+	 * If there aren't enough elements for the SIMD optimizations, jump
+	 * straight to the standard one-by-one linear search code.  Testing has
+	 * shown that the gains of skipping to the standard linear search code are
+	 * worth the extra check.
+	 */
+	if (nelem < nelem_per_vector * 2)
+		goto slow_path;
+
 	for (i = 0; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -165,6 +174,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	nelem_per_iteration = 2 * nelem_per_vector;
 	tail_idx = nelem & ~(nelem_per_iteration - 1);
 
+retry:
 	for (; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -191,8 +201,25 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	/*
+	 * Process the remaining elements via the 2-register loop above.  This
+	 * will cause us to process some elements more than once, but that won't
+	 * affect correctness, and testing shows that this approach helps more
+	 * than it harms.
+	 */
+	if (i != nelem)
+	{
+		tail_idx = nelem;
+		i = tail_idx - nelem_per_iteration;
+		goto retry;
+	}
+
+	Assert(!assert_result);
+	return false;
 #endif							/* ! USE_NO_SIMD */
 
+slow_path:
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
-- 
2.25.1

v4-0003-Add-support-for-AVX2-in-simd.h.patch (text/x-diff; charset=us-ascii) Download
From 41882bbf78f2d8a1fe817a0cbac70f221a0debf4 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Mon, 18 Mar 2024 11:02:05 -0500
Subject: [PATCH v4 3/3] Add support for AVX2 in simd.h.

Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/simd.h | 61 ++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 597496f2fb..f06b21876b 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,18 @@
 #ifndef SIMD_H
 #define SIMD_H
 
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+/*
+ * XXX: Need to add a big comment here.
+ */
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
 /*
  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
  * that compilers targeting this architecture understand SSE2 intrinsics.
@@ -107,7 +118,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 static inline void
 vector8_load(Vector8 *v, const uint8 *s)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u8(s);
@@ -120,7 +133,9 @@ vector8_load(Vector8 *v, const uint8 *s)
 static inline void
 vector32_load(Vector32 *v, const uint32 *s)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u32(s);
@@ -134,7 +149,9 @@ vector32_load(Vector32 *v, const uint32 *s)
 static inline Vector8
 vector8_broadcast(const uint8 c)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi8(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u8(c);
@@ -147,7 +164,9 @@ vector8_broadcast(const uint8 c)
 static inline Vector32
 vector32_broadcast(const uint32 c)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi32(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u32(c);
@@ -270,7 +289,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
 static inline bool
 vector8_is_highbit_set(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
 	return _mm_movemask_epi8(v) != 0;
 #elif defined(USE_NEON)
 	return vmaxvq_u8(v) > 0x7F;
@@ -308,7 +329,9 @@ vector32_is_highbit_set(const Vector32 v)
 static inline uint32
 vector8_highbit_mask(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return (uint32) _mm256_movemask_epi8(v);
+#elif defined(USE_SSE2)
 	return (uint32) _mm_movemask_epi8(v);
 #elif defined(USE_NEON)
 	/*
@@ -337,7 +360,9 @@ vector8_highbit_mask(const Vector8 v)
 static inline Vector8
 vector8_or(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u8(v1, v2);
@@ -350,7 +375,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_or(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u32(v1, v2);
@@ -368,7 +395,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_ssub(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_subs_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vqsubq_u8(v1, v2);
@@ -384,7 +413,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi8(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u8(v1, v2);
@@ -396,7 +427,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_eq(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi32(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u32(v1, v2);
@@ -411,7 +444,9 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_min(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_min_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_min_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vminq_u8(v1, v2);
-- 
2.25.1

v4_x86.jpgimage/jpegDownload
����JFIFhh��C		

 $.' ",#(7),01444'9=82<.342��C			

2!!22222222222222222222222222222222222222222222222222��t�"��������0��[:����DbDbDbDbDbDbDbDbDbDbDbDbDbDbDbDbDbDbDbDbDbDbDbDR����E;������0 ��0�yk�+��z�T2��n�~�z6o���$�)"1"1"1"1"1"1"1"1"1"1"1"1"1"1"1"1"1"1"1"1"1">����������
�3�9#�O����\�]1[����9�lCh�����n��G$r>�9N�L������_�?���Q����?A�z����(9��0�j�5��+J�� � � � � � � � � � � � � � � � ���/O=o3�LF����=
)�^�G=����su������O��5�7��H��1�m�W���L���lCM��W��1�B�D�k����.o�v�&�����#0i�_#�9�����[�����vr�;*�e�"�G�-�nd��y�^��F��G�`�x��%�����lq�z����V�!AAAAAAAAAAAAA9�p�kj���,�hl����|��9���i����9�zw=z��kc��#��4����"��^��-^�.oB���O6��|��tVq6��H�J�zn��o��.��rz����5?���t~������6�����s��go8L3�e�]Ox�Dk{��6$����^���kJ��J�&�X�&7�6
e�*���P��>IZRDbDbDbDbDbDbDbH}d�MYr�#f';���6�s����fC�N����gl-�Xm��h���2{���KF/���������Z���u5���"1�����~�-�?I[����n�jb��Z�^����<{t����
�6~�}c�E��w0���43&S��R�]���g���u��������=r�������w���|�1��A��J�poxf"d��@��2�U�sW������Z�=sW�k7Xb�t~N{������`��3"4����M$�����;Op��<%�:)�V/k�����ks���|�����F�#Z�<�\���~{�����&�;��0��[��������9#���J�0�����V�r�7�Yqe:8��&�,�U���>�L�_�i��&R��V����cf�.��J��#��?|�n{�7F�zx��v�����]�>k<�o�l���N�s����7GA�i������r������^2}t�y\Q�([��KH�S��X��"1"1"1"0�����A��J+������S���ML.�����5!�(��
�2�p1�����r{Y=FX���uME���v��[��@�H��H�<p�.F{�|�7gI6/?n}k�ry{6��J�o?uX�6<����~t�3��vd4��e`- �����k���^���
>^�ft5��z'���9�]s9���c��jh�����5��i^4�4�4�4�4�4�4�4�4�4�4�4�4�4�4�4��������ro=�3}8�t���T�H���K��z�����9#�o]b������
�.�)�����+x��������?JK����
�.�)���������������?J^��).
�.�)���������������?J�}�IP�k�����C��)��_;��������:oQ�szx=A/+�r��1���}!6&�(k����J^��>X����9�u5Mng����y�	�~��*CD�c&�	,~�z�}�x�25��&�|�0�C����03P!"#@ $041%`25p���S6�����Rf�]wt���u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:���f���DT����@	
�A$>�o'K����xR?bK�jv�������]�Q�j7<�����
��6�
����D�C�v��wP��~c�1�|'(����5iJ���:���,`�X2���e�,`�X2���e�,`�X2���e�,`�X2���e�,`�X2���e�,`�\�V������_����}.�b��x6�
`@jt�MF�J��oW���>�h�&�
p`k*�v��\�fQ���+��X�C�v���u���6�����h9�(g$i��=�}.�����h�v<�������V�<��q��
�o6�o�1I^��Okr&���m��$�<�d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�T�1�G�F��N�u|��cv�Xsw��k����Z.+v�
!���y�������(�^4��HhUb9����:�����*����{��e��,-B�����s��Z��x��6��h�8C����u\�n]���/+d����(���*�������y4k{�<�(��*�c�bK��J���Q�#������j�U>2����@�����w�.Oa'���V��o������
O	4�EP��T��j���lr�k+6ajxH���������� 65������$�!��&�4+Q!�_1�����'6���_�-@��zl_��k#�j��+$ �%xqG�tH`�{e$P����	���ZoVu���j���ou7X2���e�,`�X2���e�,`�X2���e�,`�X2���Bl�,}�ChP�����CFq�V�*uZ�[d`����`�Pw�&j�2���e87�L�rB$mB�����#��}/�b�����z���F?uS����UY��tD�e�7�[�@S,"���'K��y���+��+��+��+Qg�d9�R�aF�X�lsM��cQ���|kg��������I�3�7$���Q-�+����t=D���S��V�{����(���Fc0���w��6D�v!��6�W�}�4�}�Lv�];L���K�kO��iq5�5���R�c���������ir�j/��4[=S�����
������s \�2`��J��sf$y�2�N<ec����|��!�?F�+V�K���
�������XM���yjc&f�B/����*p�M���F�������(���W\�u�W\�V>���L��Ac������Rf-A��4�h�kn��'���'Y:��N�u���d�'Y:��N�u���j��)?)��G������:��d����Z���O��|��\A���a=���5��d�'Y:�R��o������Cx�'Y:��N���&m��J-8F��i
�G4>���i[�C���B'%v�JO�U��P�[j�g���^�mR������&�����)N��/{R� �_��/�n,�K�%���_�i
�"�k�-f������(���:������a�h���}/�S@giE�f
���8������t���f
h<�*4$Kv�j�[jz���{���o����<qh�}�0�MZ��`5q~t�	*�%@d����b3l�l�XGy���T�)B-~a����lW�c>������\
<4d83n�g�j���P���nD ��i4.���'�G��Bmf��[��}c}��%�H����/�{���9/K�����u	��-�9���������
3���R�������b�������-���u)<A�z��^���O���U�8�)��.>�#^O-3�fsj��"�e�+����_�>�k'��
I���1�/Ckm����/�P���#.H��2x����#Ue����:�^2w�v�	E�@�^2(@8��V��1�����a�B�G:+^���}/I��Gxy�j��b�T���'��w�ja��o�ME���_�4�ge	�q�G���F����0�Y�Mk�=n1�:_4�Iu,;��5iJ���:���,`���Mt1AE�X2����;��pr������,�R�8��W~���@���2��+q�������M?!]r�i����e�,`�X2���e�,`�X2����>�>����4����������q��
����4������
�f�I�U�dxnNB��+�Qh�j�d�����9:]�}/
D�T��j��N���I�����;5������]�"���0���"�\]K�?�|rt��JRhD���Ulj�'f�?:�+@��������'&3�#����9[j�������v��p&���gk��.�>��O���iH��p�0���^R!%�!����C}HN��Ou��-��yS�j,�����i:_6l�N`kr&���m���6�����`
�q�_q#F<�R
��=���l���:�w��e�,�f�6Y���l�e�,�f�6Y���l�e�,�f�6Y���l�e�,�f�6Y���l�e�,�Nm����c���i
�G4>�����y����VG�W��J�����.�h�*q��p`���[(����v�t���5 a�1Q��N�a$� q�������GZ���WD.�N���)�JVm�:�X��oV[�VL���[z�g���������d�-����&yl�-�&wym���[;�oVL���wg������<�w�������3�gym���[;�oVL����<�w���������e�q��ng�^=tv+��iL��"sU�������,�B1���$���Wl��2��1���b9
Xv��sE���.f�K�!g8������Vn�rU���Y�)D���I�r�K��������5m�M8Y�7�@�Yl`Y������G`@��(���)��G	
3$6-�!Fd��RP���@�a`g�hP)� �-dx�fA`c
�W!��#1��&����:�Qi���t�����H	J�k
QiF�r5z�=��q��Q��f�V���4E��m�H�+�+6������+�G�#8��Qx���|K��/w�����:!1A"Q#2@Paq�R�� $3���04B`pr���?����!�Q�TwC�u9��Q33��4L$�U/���:J)a1>�Ps;����^�����.����<���Q�{���j�mc.ip�Yc��+��qJ�w9���o���3�����6����k��M����L�������K��T�zSL��!�'��������WG��B.B��V���}^�J�TSP�����E����9�q��2V0�]��^"n���p�w��M��R3�MV&�(iOu�����h�$��W��Nc�����Gf�k[g������Z+kzR��N��F�_��~)����F#a&�
�W�$
v�����i�h����m�����t�%8��T�����n#+NB��g�c�OgN&9
}s�(�������+���c��=�SE��E:G��k��$�l{Kp��P�d���L:r<W��yv��d�.�������&����S9�4�O'F��7^	����@�+���.�M6���k�������MO�]�/�lQ��J��)��U��?�MPm��LCj�t���V��?J������	U<���e����b���@}��>���X1���Z���Tc*	�jkU,������������d��4r����N�~J��Iy�+0�-r�b������n��P(�|�u*rP��������pM�G !�T�C	��)g��8��#����E6�N�{�C�mm��%$��nI��4fS%�\gc2��&7�eF��s��^5�s�\K���'���3!1Q"#2@APa3R� 0Cq��S`p��?���G��;����D-J����� ��p�3��|<��Z�2�Rf5�[�r�!p%#J���_��}��[0������6f
�'��3�� `��M����36��z:�|����`q�,�H�5�l��U�y�N�Y�j�cc��q���n��U�#*V�������q�){�x�s�*�������@0 ^����M��<�Nvk�18�4������5�s`b*x����fi^����A��63�VY���h�+X�6c�}���)�^�&3�����U�&��LG��HR�����9���A���e��}y���W�G>3���~#J@�?�m69�1�N<����}������1S���C�UO17I�����u
B^���Y_���:����:���Z�-m�+i[]�'�� -k�GMC��8�\����������V��f��
�+6���{��-���}���J�('�pFY]���iY{[J�.�����pf���7_fU�G>��bG!5���I�=��z�F��m-���3���HKh�+Xp`��E��m6����������TMVt�'�E#$�������W��7�7����3j�- !�-z@"l��RM�t�����{�T s�>��&�Bp<e��n��4�a��k%uF���8�<_�X q�
/\��f;��z���?(�0���5��##���~�������H���^������B!12�"ABQq�Pa #3@R��04br����C`��Scps���?�i�i�F�D�N�ZLAq������� tQAt'�[l�h��=��[|�T�vK�X����d�;%��,�`vK�X����d�;%��,�`vK�X����d�;%��,�`vK�X����d�;%��,�`vK�X����d�;%��,�`vK�X����d�;%��,�L���A���\����4^[�[I��,�Si��Lm����;��������6e�����^�B��x&�q��2O��yx~�����_��������!�jS^��!j����/X����V/0���M������v�o������<8--��{_[}S��
-B'���
�����L7N���.P!�`�
U�V��o�6��%��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou�.��ZmM�mxHqD[��>KF���m��.��-DC�Ca����&��:���hom�a��]�K�Z)���
��&��Q��K���9ql��!2rT��i��;���
��$��!;�����
(��� u�0�&��!�&��"�o/���wD���R�y��(h��$]������)��
���X��w�{��T���8�uP�:�;@�+�T7N�N�/P"Dx�un��VSu
��X����d�;%��,�`vK�X����d�;%��,�`vK�X����d�;%��,�`vK�X����d�;%��,�`vK�X�kS_�i0�$�?���,�`vHhP���J�R��(O6���I�m��4�R'b�a�l��I��Qa.}�B�5Jg/|"r�
%Q��lLmt�l�����
��z�>.��pA�$H����n���u��}����%q3��-GPZ���!�}a�T!�=�J��}��!nE@���A�+d��o/o/$cu�Li��]2�3�#>����0kD�Pg�����)�����S�D��P���'�m@]�m��1$�i*����Bg�j�����
��S]kR�b}����|�M`����T������m���f�d��{6��3Mn����g��;���[[D���|�eu�k�����%
�!��0�*����&���=�Mh�^����^+���|-Z�b��8����X������pE�� ejs��Qt��9�_)9�L<M���pNy�+��V���a>���f�4��q;�K
,c�����k�{�Q�L�E�3#[�w}?���=��t�l�"��F`�[ei���So��[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�Q[�Eou��V�QM��TbD����;�F6�,KE��U[a���i:��
�����|��^�&0���n���b�}��ou��f�}�������+{�����}����*�#�d2u�96��q]�&���FA4<���O��v���q��^�������2{��%��0��8#ZH�L;!�Hd/��c(l�A�=r���I��������?�~=��k����Z8-=�mE�;�X�@Y�
w�K���e��Q{�C|&���|�7���'�k���N���[
��/�L������6������.�l�Hq��Q����e�(AL��(�`-���J�<�x�.�_�<�s���:n�)������l������w/o/w,<,�yD��y�����5V�i����j(��T�@d���'��c�&�;]��65R�E��������4���{��������s@E|�i�&n^��U�Y�^��U�Y�C��]���G|&��7�^�)��X�;�[{���E�3���������K[���o�jE���m����V����5�o�\1������S�Wdz�"mA��2��pD�r�<���$�Q!�������^P���z�����@��lX3X3X3X3���E�G��^�$���Y�����d�j,(��]���K��wL�DeO36K���p������;����i1*���$8�uP�:�;@�@`�� !��wmXIM�7,�`vK�X����d�;%��,�`vK�X����d�;%��*H/�3T���(�����`vHI����:�j*&�6��
��V��EF`�QI����rQ
��=�+[�o}Q����6�X<�d�;%��,�C�
���VV)��Lc��&��X<�d�;%��,�H��V�R���	�������y�/X����Vn<���R���������f�S~h�8���"��&�-�T���0����d������!���$���{ ��zO~��[g/I��`+X
��
�bKN|[_
����E�J�z���
�J�z���
�@����oASc*<'%�eN��{�+#49���A���ZH�!6���rR�Gx	��������1������T'DuQ3���������J|X�hi���f�hCG��"~���5�N���B$�J�C���y��b�X7D�`�(��#Td�<�A�`�������������{i��r�������������m�������������.�G	f��S{m�`��6a�L��bl��R)2���:�N��l� �9&~�_v�L��/���)�lq���ly����i.�I� ��P���&Sf�He����ZoMg��g�O�m��	Q����R%g��wI{&�d�UMK�5F���?/������g�&���5���cS��������&=�f74X��K� Yt�&�19.�C|G���&�L|Y��ApM����mz�����$Z�^����mp�N�2)���{�kS(c��OPl�2�=�t����K!�{��@�J��C"m�t����o/E�eN;'���2�M>J��X�������P�(��V�cB�C2�fv(��eM���E�B�$�?�E����ZC�F���!�����r��2�������"E��N_�r�S�����.t��l���|����O{����T��4~��s��`���Z(�����qd�S�>���7X]e�$C
�lF6T6v��jq!<�`mR36f��p��#3fj���f���C.�	&�s)$B��!��&��
z�PXi���O��bW`l�fD��/�>��]��-���������{�6N����'2B�MB�Oo��>��9��c�j�+�a��\&�������ts�t�9vC�b?`)�l~:�i���COQ�.�z��2���A���'��������F��H4\,W���|��g�{V}T���{=(o��������^E��Q`�������'��n�m��\�<C���l�;��m�|K{�������425�����S��[�E0[wE��n�
l���-����j7��i/��SfORcA��W�~��+�������j��n^��R���B<7��d�{�7�K�����������+{���������+{���������+{���������+{���������)���xyv��`�|C��M|�?�;�S��=�A��/�h��C�M�[%:+�����3�r����q wR�/d�}S~�/��xw/o.�'k�!2��P�v}>k�:���^A(��]�yS�+�d�r
m��������xw/o.��z~�l����E��CE�F�bp2��hE��.�mMmS��,y"^���t�y)���e���cYO�;����`���C\�hP,coS74X��<��w�c$�sN/y�t��wmc������cX��zh6�����S�����/f���v��������5��Xx(����{a�un{�*�$8�uP�:�;@�@`�� !��wm[~�������Du��y���mn��-�7`N��.	�l�@���m�J��Ut�4p��=)�F��n{������J��+{������J��+{������J��+{������J��+{������J��+{������J��+{������J��+{������J��+{������Ju�|>��T[P��z��h��M4:�K��s���+7bD�Z)mf���HJ�L���w��&X��8box��n��I��y��h��P�����a��l�4xx�k�
��: ���c��|O���r�;p��?���I��O����9����|A���q�Z&J0�C.mb�A��N�|A��46�sL��q!�Um��b�B����nvJU�R��Y�5*�)l�������nvJU�R��
sssG\����8���:��d�Y�-��774u���J��[8!�nnh�����vpG\����8����������)VqKg5���c;x!�nnh�����g�pC\����7;%*�)l�������mq�774u���J��[8!�nnh�����g�pC\����8����������)VqKg5���ss�R���������=s��1�m�VpM�D��y�J�����]T�/�j {)|7Rdf�����5��SuF!2�vH ��M��1b��m�2T=&Z$�c�L�.�������;l	����<=��v�/)'0���'8���#��U'����@�jt9k|@�8��x��%��.��O��=���������t���-.����D�6�H��Qs����;�`Dm���z0"J���.$^�&��Rs^�,����d�PZ�������gA&nt����rL�Y|�&fZ��lkb�V�R%P��PZ�z���f��m�vC�����"�������Q�I�N���df����Bq��[�O������)�"uS�t���9�B�F���&��4�#��F9����D�#~�+;Th�a;�`�lG�=�� ^�aqE4I����a� ��K������E�Dh��&]��w.h5�M��{.O�!9��%�7�{�{'%�Bv�{��:`���e���eL
k$�����j!���fJ�d�C[&������Cgm�����/i���T8T9�@p1"RdF�Q�)�l��:�	(���Q��!�n�A��9��%�E/����������M�Iw:G���?���-���6����D�u�(%���^��36V�r��M����x��:j&VO�k�%6�r]��
-�&���4}#�G�=[���o�����g��09� �����4t�����lx�3I�
�]�<�`sA�q��j�-��<��s���qy���f\F�����D`��9��NmFr.:�d�\����Y8Gk�����!l��.�Lh��d-��yv�:cF�!l�����?��,!1AaQq�������P� @�0`p��?!��\C�p���`�BT[��z��t�x���J�
�z�X_�@�6+�j^�R����������/}�{�K�j^�R����������/}�{�K�j^�R����������/}�{�K�j^�R����������/}�{�K�j^�R����������/}�{�K�j^�R����������/}���0��F�`b��T&D��)9��������}m�sd�b�``,8v�!��|k��*_��|x:�wNY�s�wk���i��(�'�t#�B0�29�"��V*�	�F*1��fd1=&��,���@��k����y,����$���uH�hD�xmf�l�%��y���H@����9<��yu+�W�.��]_����ub��������W�.��]_����ub��������W�.��]_����ub��������W�.��]_����ub��������W�.��]_����ub��������W�.��]_�� }0�@w	��\�{�?3���OB$k�}�����Y @��n"�:�ah4|�������Op}(n�~
��|M�^�{(v�p��d7�pU1[�K�D��5�|,% -�}��������[��.2������@�9
��_K[]JV��?���~>�`�~.e�/a�u�%����:DBz�l�����oL�G�&	��rI;v���"��D�E,�������F��YB("��6�H�1q�����������/}�{�K�j^�R����������/}�{�K�j^�R����������/}�{�K�j^�R����������/}�{�K�j^�R�������4A�f9 ,�1^�R��������h6���*����C;�i$W�u%$*
e��i�S���S�8����|~�����mF���A;3�"��`:�������5�@�0���~>
���Rq�D�H+��1 ��P�m���O��"tB���6�Tp`S���R�*��
�BI�.����eK�(��a�6�"lC�Q{��[��8������?t#��G��4�OH�zY�$�h ���~>���(��pdW@x��
�y����\1'}�y���"�{��`s�'ERr@����x��s���
'��v�.\L��LV��M�8�x^>�1�:
�D�5 ���_������H�(j��s����0�%lC�{A��w�������,8��F�&��AfC�j
��m��Dxv��r�I��l�p��(
�'z�
F
��
�)�_q]��b���<
���1pT��@���t�"@�Pq���x"�,�$8�9Q*X�����D��:����aW�R��ub��������W�.��]_����ub��������W�.��]_����ub��������W�.��]_����ub��������bv(D����nyR�h!�8�|2,���C�W2��!�7<�@'���@1���|������t�(���u#1���W�.��]_���>����eD�9Jm4���R��2\����Zp��	�������f�d������2�oy[�O�6?��>�:Or��>~�����y���s,��p��l�:@���~lL�'�� �z`@���"�}�;�E��A�$� �g$@��4c@@�������(�a=���E
t�L�.j��&]�y*`HB�����}�].�Y�����u�ptl������L�D�����������009?����_��1"-#O%L(�t�>�� \���[3�tu�-�$s�� �A_�dr�Tr�b�������'���l���3D���<�f���j��t��fr�?�@9&��W��E^��t%�A�X*xP� 8������L��-y�`���b��Vp�80�(��<�q=���~��$R
 ��C��D>���H���8��_K��;�����8�2��Q"~�
}=�q���������Y���������W�j��_u�����] D���=�B�L��� � 	�2P	�k���
��Z"��@&1�.AB(N$OR�HTO	��e�����"�����j^�R����������/}�{�K�j^�R����������/}�{�H��H������}�OEa�����p�'�{`X�T@���6Bc�6"3-�`Q��$J�-$%�F3V���0�����]��Tv�s��J�=K�j^�R������8H�;�`�s�Jhj�	� �P�=K�j^�R��������&i�������H������y��u1A;3�"��`:�A`P9L&<;_���X��/^l���%z�� ;Q4��V��d#�g�vN'�P�O� d9WwO�D�Y���b���T��ny
9L'��	$�FE�=��<#?.���'H�_�33��03�����"P8"IUmNl��%U�9��$��������c�F��"
�\�Hp���"��P!��*M�x"�����Z:�z�C�������;&Q�p8*�E�`��4h�dg(����4�B����yJj��L����8t3O{�U�\�\��U:�z2����
�Q@�p
�s�� �O<8Lp
�0	�D������#�
�ndT�`��`-����`6L��`������4K���8����cd0�=`B�`74z���} `�!X)��������F���PV��3�����	{�7(o���$�_<Q�t��$D�'���d�����t�:�f
l&�4�� 0�9+]
�BB.@u[?���W�Jb
S���Q�}����;	�B�^b�x�0�0�����c��H!�x�0�8V��
p�0u�c�&z�n"��lY�^�1D����k���L�bY8��e��B����&b@�D����f�@�Q��N� ��"T����`�`�����a����A����r@��(�*��'���hGTC.��0Ol��'��IGh#\d�t�����l�DH�
p�p&�f<�_�)�>�p�_�5247�@� �	����~tru[����V�����G�����0��6(I�7
�6(I�7
�������HB�l.l#�D��d��F�������I6H�0�:3)&� }�$�7P@*�9� k]�o>����D�,#B�g���8�8"��&�[T�P
'bp	�����J^�1��fd1=&��,���@��k����7�sD����xF)�: T��zV ���%t��#I��20���S�sr�L[0�7#���d���0��T�6%=uz��������p�a��;���da�b�
!\��^y��T��9#<�u�G'��0�.�b�����������
�U�N�!<�������(>�<L���y���������&S�E��I�
�^q��A��Tp2@
j%�oE����5��Y'����s�
���l��U�Z��CD��z��ub��������W�.��]_����ub��������W�.��]_����t!^}_��q�b��;�O���8<7��(��>�8pNc��Ab~�@���'�r�=�U�Qd��%��\����r��T���IGf`�f����~>���8x@c�t��?_H�$o��@Q�h����*�P!b���c�A� ���I��ht0%�9���O���~>���?��'���;�l���5����wT������dO)&�"Jp2�
�L�(��d��]�U.�{[ DG���DQ�+\���\vo��~>���FT��p��eLoe���A��
���@�d@,2A��mQ�5&���Xj����`#�y�Jf.�	�
��)=h ���~�{���lH2��
#
���A''�)e$*'��c���QF#��-.��� ����$;9����(����8�3�$�������A���%hw�	�14Y/�[��	�(#Uob������+{V�,��Y[����eob������+{V�,��Y[����eob������+{V�,��Y[����eob������+{V�,��Y[����eob������#������L(�C ��
	��f�j��`01�Ls&�S�B]��<��Z2H�Bubw@�����d�`8 	9n8 ���N@�c�;���|��%M��B�k6.�Q�>'�Fs���w?_��|#
���lj4�fp���	��@_��|��ox8|�{��

��m��DP��1B�i�!��|����D�$ y�PE��DB��A�A$*�Yx0Fy	J']Yx0E�a$*�I��%e�\�I���$$B����s$B����sY&B��d�I
�V^�d�I
�A�A$*�<�
E�A�A$*�Yx0E�a$*�I��%e�\�I���$�|4�I���$$B����sY&B��d�I
�V^�d�I
�A��P��,�	!VH2H$�Y+/��L$�Y � �d���"�0�d���"�0�d�$�HU���.`�$�HU��	!VJ����&�aW����&���`�`a�a�L0	�	�0�0�&���`a�a�L0	�	�0�0�&���`a�a�L0	�	�0�&&���`a�L0L0��g8$�oh�' ,{�F!$���<3j�\�DH���@��)f�Q#b��BQ� �����&�s	�I����@�-�(��O�A�@vg�5��u7D�L�Y`]���

S��7�N����cu��������,��L<�8R������A
+i���\
��G���� ���b~�@+��P��h�����1<���3Z�l���&a�$A���$|�(n&Pw3���������$1������jY4��d@��O���"�������0i`���x�y�O�G4���cR��MT�C��s��V�(M`?
$��V(�	��%�0q@+�������	�'jWQ�@�z����@
Xl�\���<3���	:���
�	u�����e`b0� `��4�K��&�!D0Ed�Fg$ba�D"��� �����(e�4
rCd�9b.LPF+q�B
4	�2P�F 9#�1N����t�X^A2AB�,0��I�0��&$A���C��8�Y!�H�0Mp�P���j��N=J�`#P��0`r$�.JS�$�C�D!A�; �CIVHrM9�f�F+>x����1�OtA�g�a�'�5x.�`s$<=�u��B5I��h�g��W������3�F�9��D)^�����v�BDp������f�?.��]�4G�Dq �R#��f�8���]���?����8�0�0�0�0�0�4��<��<��<���O<��<��<��<��<��<�O<��<��<��<����<��<��<��<��<�����<��<��<��<a�0�0�0�0�0�5��<��<��<��<����<��<��<��<��*q���<��<��<��<�O<��<��<��<��4��N�<��<��<��<p�$0�0�0�0�*R��p���<��<��<��<�O<��<��<��<�A�:o<��<��<��<�M��<��<��<�����W�G��<��<��<���C0�0�0�2�����3O:�<��<��<����<��<��T\n����N��O=��<��<��0��0�0�w��T���Rp��
9�s�<��<����<�����![��>����_�
�O<��<�O:���Nu����3�v��<�O5��,�Or�<�����&*CW�#G�0�<0���<��<��<��<�J7{.����1o<��<��<�O<��<��<��<����Y��u��>��<��<����<��<��<��������0�0�0�0�3O<��<��<��<����|��<��<��<��<����<��<��<��8��<��<��<��<��<��<�O<��<��<��<��,BBB0B0B0B<��<��<��<��<�J8�Ir�,��0b�<�O<��<��<��<��<����<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��,!1AQaq���P�@��� �0`p���?�E=�P4}��:�a��{4�7���U8��*TK�&H���k���w�(����iXYW�`��f��=�$�����\@x?�������������fT��z�A<���Ie�i
�3�����������l��qZ*�uK�Lr��P���S�f_�$1H@�@���H�YD�����Nk��yR�V�K<���q�5�_!'�Bq����������D��G6I�k��O����z�����'���
��ZA�,C&�:,39������o��p���T�`�
t�������Wh}��+$>�"�OV����7�X�s�lN(wh0�N�9�G�i	�j[���D\Z�MD(�9�f��������N�jrV��Q&�Z�v�A;����547g���C1N���6I7��������/u��������bEv W���E�6��)u����$M���*"j�hH3�u��J�!@_G��N�G���}�
����j��_�#�y|R�3buZ!�����b��=O���B-��oF� ����js�?��	��rG�8����U���f��|��L]����eW�S`���	���k�54��E���]��p<��S����3
tV
��$U;
���R���sj,��z`3C�7���.F>+�B��z�B��V5�an������n��x(��D�(4�=b(�F�+��_�0�����3�C�!-w�>�k)���zP_��RIs�"����4!9A�����h������)/������,!1AQa��q����@P�� 0`p���?����I�D����X������^����%�"l����pM
���7h����*��ceh������bK(�@zGN�|6wb
���~��u���
m����%e�~d4#�����I�8_�v�����i����_n(��R���~9�"�+��A����*�^��c���@���`vr����0�k+��e,�\�����8J]���4!��������D��t���J�B/x|���<�;����r<��6�;J�%xT{��Yg6���[��oX�,����*h�s
*����U��\U?MQ��u������n���k'�y�D&�/;��w���V^��C���`"��
����n���Eg���f���g'��t>����:���9Uz����F1� ����7Gx_ �4�9��PR5���s�5yE��g=���@S�>���r�]�WO�P���xw�U���;/iQ�����S���48����'o���[�&�>��]����x�������\Y����K�^�����'1���>��?c��e��(�h�O�C���K�U���^k>r�R����`F��B�[�m�#��/���F�/����^�����o�Lq����D�xBf��e;��F�F���
�]>�����~��L�o����0��5���4�y��Lt[��[F����z�������6�s��)p�y)�f���f��/j���(�9�<�.i�qDS�KV<|���H`,�&v�3��:������,!1AQaq��P���� @�0��`p��?����$�N��\{���eE����-hK�������&�r��u���6��B��{�����
%��n0 @� @� @� @� @� @�	�[~��-��!���);�!���!�	IVXD�5��(�� ��-K.���~�x}.�0�]�G��<�m,�d=m�K����<n�`:P�� ������g��`���Gg�d�6�JntD���<*(��DJ��]�ca�@�*��N�������]�_l\q>"�����u�E��M����G �$ aj�'��*fEHBp(S"l/z�y9�7�V�+!v.���V���mJ�*T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�R�J�*	$��fp=ZAFQ(	r�A���,����{�jFm#�����g�������U���(�(����H*�0}��[%�"L��Q�.�O<����p���t�;�vyJ] �P@aCb&<���X���)[YV�%:$�����^C�?��D�3!���(g��y��K��x�`�=:/�*D�(���*H!��k8K>�[\jT�N��8�R����+V'���9�����K��Sp�-�����TX�����Y��~��L�!�x�L?��th��&]�f�gr�v<Y�� �2G�}��c��iQ�F5���_D@	*3��163".-j���`M����Z��JB2���Z�����h�0 ��{�� @� @� @� @���uGZ� 7���,L��svE�3r&2_�R!	PZ^5��JYL�V������'���)�M�K�f5���g�'�J`�&q�������51}z���~���T�D�7�_@w��X,��KT�(fSfiKT�(fSfi�f(���$�>K���]������V��"x	qO@ �L�����wAdt)��!�E��*�r.���Jy�JA@��]9��c���_C.%�$�(b"��p���#���H~�E��<��\������43p�(/�bI�(?�8�����4�B�)jM-�Yw\�r(I�s������%M�#8%��J����[���@���>����D^}J����=?�/C �����,	WW�[@�������m��!V��[$��.���FO�0�����$�%��a��`�;��u���F���Ac=���1fP��u�8I�%�h��>k�%d��&���
�H|��vyJK�a
g48�@�n9�Y�����:U�q[�������}�x�-
���ze0 P�v�bA��z*�F���Y��6��
Z��:X/T�"�rP�|U��O8O���w�{�O���9�%X8h��+d ���t
6�Ub�9#�l	r����?t��*�4S�r���?`�������9�2L�(pL�|Z�K���(g��5���q���x$p�o�B=j�`�\��,��� �})�,�_���� $�z�[�������XD���e�@ anqV?t�������oj0� H��y!	��XL�������YX^�����S�uX���*T�R�J�*T�R�J�*T�R�J�*T�-�c��[����C4h�Pfv	R�z_Ue(�����6F���3�@���JA?��|/�-S�D�J3�Y��\����X�W|5.��V�(� @���|*��(�)��DL���#��R�C�����T.�q�v��1#pAf��k1d�z	��Lq00n��.-hn	F@X�B$�O��]��2E[���c}���pr������5���L-�_sW8�:�!
�e�����H���<2a�:8�A
���&^�c��?���M��Dag�QE	(9@"�X*��.'���
��W�f�v�����hU59�y��)�KF�I!n����?\y
���vy
G�������*�m
`<;-G�=2���iNO�b���D�9=��Z�'�����!���g�5?����9������Z��H�1���^O+����.����E�"w(��C�4�G��>�w��.����I�^y�	�
����e�&�=�FJ�� �kR%c".����V�|�l=z��!Y}�.����;'��s.��i"Ps8`Za���o*�<��	\��]00C�)�����4���sZ-@U�c
��!�,L��e���fA��2k�=�!2��Oz�'�}����N�����K�-2C<�@!K�J^v��RL���h�@��$�
q5�$:�2��@l�A�sd�mpDE�S�v~�\!r*�wJ���:��=���HO��F�[]J��~��������!��l��#�K��X�Ak_�;�������,"��T4��k��f���������+���Bi�	�?�7�?�VN$w�_�ui�d�?��f%��}������z����~��F�0 �g%*��P,�E���D�rP��mP�e��l����
t?@8I	<&��U#���K�Q��L���qkT��HFC���kR����
�/� @� @�&��s!6N�r���7��Am��(��1co�$&�X���H}]�]R�����mCJ��4IM�"�R����ST��#yByP�9������Lh������ad������18�=�]F�f����r64r����/���&�eA�L.A�c�id��yP�b�Z�� @����;/QR�,���pNQ�v�$@$`f��L�`@6f��b�.rK3���?ADT��FSg@����
�l�o����y��@!
[@n�B�@5v'��2��Q��[��>����T2�}��V���=��V5w�A��=&J���rC���@+��`;��+�6^O���x��R�p���w��ak����_i����W�~(�v�K�K��������f�3�
# ��M
# ��MN��{�0I�im�B���uG�0�B�L��a�}`NJ"���`��v��	6
!:��l�������q{�!��TS2�B[�n�����9�X��X��$jE$hr*L������(W%R�n�������D7 }�%��5k��I����E���"P�,�T�H\C'b����Q�Cu~�\Wr?zC%J����R�.]���IB�q��!PL����������up�5��G��[=j6�@11�mQ(�0�(������Ha� 
DY�;�p5f�����2d	$���V�!"�u��j�)`A++�Za���I��a�����)8T77��'
B/P�0�	t���.�Uu��v������;�}�_��T�5�����Zl�����E�iJ����]���f�08�*C�R�3�u��i��)w���>�{�
,Y'�V�
"���=H��Q2���X]r0�� I6��3��H��D$�
�������+
����=j4�-4�%���pw�7�4��?���-�.j�M��B����
0�0�� �`%�\pY��(q����1'xC-����*��|��VQ7�qBXE@	�kQ7�qBXE@	�kT�<��*$&��;bsF��h��:��n�&D���T�3��&`A�DL��3��&`A�DL��o1H���k
L0_"��H[�}.���p,�w�o7�XPN�c�H��u
a���G�+����5ga\f������"{�����\�U����
���t�T�wj@.��@�uP}3�t|P"���\u@�
�df�2�7:z�4�Ab9�>-�o��`��~&�A������r?��w~���D��i��!!��1B�oc3�fA�@��@����"M��"+,�4����"K�&�h�,�E�8�R���
#$+�����dAd �fR��n�B,Z%��!����X�KCzt��"��DJ�+������]�\���I [2e%LZ�$�P���'Z@��UB�r�h9Q���*���R�eH��������d����,�"l7@�hEQv)���8��� �K��;�+@�����r	x�W�=< ������'������>�d���\��PyN��g�{T�T�!54��W=%=�����T�a=�������2�='��+�|�/,��1���������M!�e`�%\:Z���}���Et����"P�������K���B$��N4�s�9��)q-jB�@�2�a{�+�������Y�v����&
��R�J�p��X�w�*>��A����l��A�������v���	L,��jp��l�H�
�����P�������8,�4�2)|7R���0�p1�*w{V,+p��rrY�
^K�I����P�+RlDX�[���'<T�1���^nk���
1
�!oy�S�1�����������o�}7���S���T�R�J�*T�R�J�*'�0��G��<}�
�(��K8��Bx4�y���K����������
8L��v��19��*##��J���p�QT�&8B�*�%t�n��y��t0R'�H��M�=�0�����z�U�c#��(t\�SIR�f��!=a�����vx���\#�R������{���v��I�Br)�EF��b�"��i��0R|,�)H���6M�W7��q�|�������@��h%�8�;����~����?����Z����F�BC�2yebO����'
	�N����jN�1=�Z�f`p8��Q�BU�jv Bd�]�;RX4a���f�#�E��,���3b���?u
�~n���jGp� OB%�R�(�������u�����>�w��.��8K7���[
��������
m�@S�V��X���Bo�R��*I:��`*I"	az+t�%P��iQn�W��p�
���I=R�5�����;b����*
���f�$h��1NA`L���������`2A���	��d�"q������K<��d;I�+a6.�����Q�!1�h�N�F$�Yqf���!��WU�<)�O@���������tF���
�L&�b, ���XL4n���:W�]�_��x0fQ��>IR�J�*T�R�J�*T�R�J�*T�R�J�*T�R�C���&S���*�JF��IIiV6N4��`3��4���Ip[�Y�@���%6��6��k9��A�����@1����t6��9�+�����E �d��U0 �G�#���!7(	p`��y<��wy�A	�b�ig��BW�]�OR#�y�����\U�O\�A�Tf��<�}.�#`U��!����8�#%�sr��Q����T[w���<��wy>������l/�q�������%{��qK`K�]�
$q!�:#7C��>�w�BZv�Aax��P�' �����a6#��
QC
������o�|�}hH��A��[�����Z=t�rt7�~�
��O�	�i9:�?
N������]4�
��O�	�i9:�?
N������]4��
��'|�N>,N��
N������]4��
��'|�S�BG��NN����|\�v4��
��'|�S�BG��NN�����������#�M''C}�N���Y�����������o�|�}hH��I���A�Prp7�>U>�$z����o��*}hH��I���A�Prp7�>U>�$z����o��(98�*�Z=t�zI�*$p[��#cM�S������	���n,Di�\�L�������lT�ni�j��ff5�FF��b�#sM�W&31�*r74�5ra33��#cM�S������	���m����7�(��m����7�\�L�������lT�ni�j��ff5�FF��b�d}7�FF��b�#sM�W&31�*264�9�o��0���qQ�����Y���dli�*r74�5ra33��#cM�S������	���m��0���qQ��������|�����k����6�NF����L&fc\U�����D��O��'h����8+#F:�C�x�b�Atj����d�6����"P���,R�+�"��e���.0MD��H�4�
��lS��E���%'Y����$�7�"��B�Q`^��E�(�:`��Y�(��K�.&	��GR��,	`��N�w\���fBb��!`���:����,���[(A�[Y�[��$�q*�)X��Z�""�h�Z�%�C��Q�H���~mF%^Y'=
�)3qW7�`��I0F�a ��U�P@��2HF`���.aZR���U��S�t��HS %���#
grs��ETH�JP*�]])l���JADo*F��lI1!$JP�Q��;6��[����[CU��k3�;���)!�;a���^E�0�lO�^���p$����)�
2�0�
*Q%�N���nP��P����)�bZaa�������� ��jC�?)"GZ#&I!��
JfY&��L����
�I!S�C�U)&T*�UV��!eRm%�Z����R�T\�7@~������B��(��e�`&������&���Dz�D3�]z�k�uhQ	Q�vH��Bqj�%m�.���u���A.X	]�mR���Z��b��D�F�G"��+r��	#YD�&��
��mK(W�q�@c/1�	r2���c�`4-E���iQ�7�^�e�Sp�H�I?N$6�lkvZ�bQ���
�`-����x"�T��!HF%�hg�>��� .���c�4B��yA@���0��T8*�&���`�/�G EW@U���+q������(.��tH��d��p,P�2�KQ&0K�LYp,
`6�zP�{%E���"X2�1�])y8�!�-q���c�����,6��"X2�1�])y8�!�-q��6d�����&TPe�u��r^��w��[���,-��pn&�JHXz�=�HA����q6rU�U����7���pn&�J�j��V�������
v4_arm.jpgimage/jpegDownload
#24John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#23)
Re: add AVX2 support to simd.h

On Tue, Mar 19, 2024 at 9:03 AM Nathan Bossart <nathandbossart@gmail.com> wrote:

On Sun, Mar 17, 2024 at 09:47:33AM +0700, John Naylor wrote:

I haven't looked at the patches, but the graphs look good.

I spent some more time on these patches. Specifically, I reordered them to
demonstrate the effects on systems without AVX2 support. I've also added a
shortcut to jump to the one-by-one approach when there aren't many
elements, as the overhead becomes quite noticeable otherwise. Finally, I
ran the same benchmarks again on x86 and Arm out to 128 elements.

Overall, I think 0001 and 0002 are in decent shape, although I'm wondering
if it's possible to improve the style a bit.

I took a brief look, and 0001 isn't quite what I had in mind. I can't
quite tell what it's doing with the additional branches and "goto
retry", but I meant something pretty simple:

- if short, do one element at a time and return
- if long, do one block unconditionally, then round the start pointer
  up so that "end - start" is an exact multiple of blocks, and loop over
  them

#25Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#24)
Re: add AVX2 support to simd.h

On Tue, Mar 19, 2024 at 10:03:36AM +0700, John Naylor wrote:

I took a brief look, and 0001 isn't quite what I had in mind. I can't
quite tell what it's doing with the additional branches and "goto
retry", but I meant something pretty simple:

Do you mean 0002? 0001 just adds a 2-register loop for remaining elements
once we've exhausted what can be processed with the 4-register loop.

- if short, do one element at a time and return

0002 does this.

- if long, do one block unconditionally, then round the start pointer
up so that "end - start" is an exact multiple of blocks, and loop over
them

0002 does the opposite of this. That is, after we've completed as many
blocks as possible, we move the iterator variable back to "end -
block_size" and do one final iteration to cover all the remaining elements.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#26John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#25)
Re: add AVX2 support to simd.h

On Tue, Mar 19, 2024 at 10:16 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:

On Tue, Mar 19, 2024 at 10:03:36AM +0700, John Naylor wrote:

I took a brief look, and 0001 isn't quite what I had in mind. I can't
quite tell what it's doing with the additional branches and "goto
retry", but I meant something pretty simple:

Do you mean 0002? 0001 just adds a 2-register loop for remaining elements
once we've exhausted what can be processed with the 4-register loop.

Sorry, I was looking at v2 at the time.

- if short, do one element at a time and return

0002 does this.

That part looks fine.

- if long, do one block unconditionally, then round the start pointer
up so that "end - start" is an exact multiple of blocks, and loop over
them

0002 does the opposite of this. That is, after we've completed as many
blocks as possible, we move the iterator variable back to "end -
block_size" and do one final iteration to cover all the remaining elements.

Sounds similar in principle, but it looks really complicated. I don't
think the additional loops and branches are a good way to go, either
for readability or for branch prediction. My sketch has one branch for
which loop to do, and then performs only one loop. Let's do the
simplest thing that could work. (I think we might need a helper
function to do the block, but the rest should be easy)

#27Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#26)
2 attachment(s)
Re: add AVX2 support to simd.h

On Tue, Mar 19, 2024 at 04:53:04PM +0700, John Naylor wrote:

On Tue, Mar 19, 2024 at 10:16 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:

0002 does the opposite of this. That is, after we've completed as many
blocks as possible, we move the iterator variable back to "end -
block_size" and do one final iteration to cover all the remaining elements.

Sounds similar in principle, but it looks really complicated. I don't
think the additional loops and branches are a good way to go, either
for readability or for branch prediction. My sketch has one branch for
which loop to do, and then performs only one loop. Let's do the
simplest thing that could work. (I think we might need a helper
function to do the block, but the rest should be easy)

I tried to trim some of the branches, and came up with the attached patch.
I don't think this is exactly what you were suggesting, but I think it's
relatively close. My testing showed decent benefits from using 2 vectors
when there aren't enough elements for 4, so I've tried to keep that part
intact. This changes pg_lfind32() to something like:

if not many elements
    process one by one

while enough elements for 4 registers remain
    process with 4 registers

if no elements remain
    return false

if more than 2-registers-worth of elements remain
    do one iteration with 2 registers

do another iteration on last 2-registers-worth of elements

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

johns_suggestion.patchtext/x-diff; charset=us-asciiDownload
diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..d154b61555 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -80,6 +80,34 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
 	return false;
 }
 
+static inline bool
+lfind32_2reg_helper(const Vector32 keys, uint32 *base)
+{
+	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
+	Vector32	vals1,
+				vals2,
+				result1,
+				result2,
+				result;
+
+	/* load the next block into 2 registers */
+	vector32_load(&vals1, base);
+	vector32_load(&vals2, &base[nelem_per_vector]);
+
+	/* compare each value to the key */
+	result1 = vector32_eq(keys, vals1);
+	result2 = vector32_eq(keys, vals2);
+
+	/* combine the results into a single variable */
+	result = vector32_or(result1, result2);
+
+	/* see if there was a match */
+	if (vector32_is_highbit_set(result))
+		return true;
+
+	return false;
+}
+
 /*
  * pg_lfind32
  *
@@ -100,7 +128,8 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
-	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+	uint32 nelem_per_iteration = 4 * nelem_per_vector;
+	uint32 remaining;
 
 	/* round down to multiple of elements per iteration */
 	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
@@ -119,6 +148,9 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
+	if (nelem < nelem_per_vector * 2)
+		goto slow_path;
+
 	for (i = 0; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -157,8 +189,21 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	nelem_per_iteration = 2 * nelem_per_vector;
+	remaining = nelem - i;
+
+	if (remaining == 0)
+		return false;
+
+	if (remaining > nelem_per_iteration &&
+		lfind32_2reg_helper(keys, &base[i]))
+		return true;
+
+	return lfind32_2reg_helper(keys, &base[nelem - nelem_per_iteration]);
 #endif							/* ! USE_NO_SIMD */
 
+slow_path:
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
reduce_branches.jpg (image/jpeg) Download
����JFIFhh��C		

 $.' ",#(7),01444'9=82<.342��C			

2!!22222222222222222222222222222222222222222222222222��t�"��������4f��}:��t{G�D�LQ1D�LQ1D�LQ1D�LQ1D�LQ1D�LQ1D��]�?sF`N���&P�:	�e|��V�n��:N�t�@y�:Q�:1��{�P��j	�&���j	�&���j	�&���j	�&���j	�&���jsy��5h���������5����:9��G�V�Y�0�;-6���Q�����B��:9�k�9���j�����<k�]���G����!������VGK��ngs�PMA5�PMA5�PMA5�PMA5�PMA���b�������{�|����������f��W������K��:��9x�����Q��8�%l�	�eN��5�����{b�\�_MF/O)���(�	�e��i)������Lj�MA5�PMA5�PMA5�PMA5�H�{���cu��g4����|/�}����8��;�Bf���~/2�*�Ms�>�����h�
�U�d5����m������6mV��'@�g�������=�qYzg�r�{�s-<{"�'I�(�#v�t�����V�;����&�q�;��|4_7�K�	�t������U�-rv�����qz��cs�7��G:V�[	�e��D���];�SMgG�^~{�A���m��OM#�����>U��|����n����D�LQ1D�LQ1D�+}��g#.���:d��n��c�]��nng/vESJ&(��b��&(��b��&)7��'I�4?=oy���>������w�;7��c���!��t�@��'!�ec���aZ�v{^xmy/[su\��������2�nk���C�1�Q
]4���4����w���-�v������-���V�9�*j���D�LQ1D�LQ1D�LQ1D�LRo��F���kvA:O-�[�?D:|�m����C����|�:����l�	�eKCm�=W�{�c���Q<�LV�['I�����s�&6Nz��	�e8iv��r~�^�~�^�~�^�~�^�~�^�~�^�~�^�~�^�~�^�~��3~�3~�3~�3~�3~�3~�3~�3~�3~�3~�3~�3~�3~����'���'�9��������������#;g��j���a����
�=���]���]�������n��7w����u��|M�Q�<s��p����9����39���=t:L��=|�m�z�;�&����3v�����e�]�5q*���������-@"23P0 !#$%1`p������D,��+�������wL{�N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u����d-B���j:�D���������������"�w��_d^��?��������wx��������|�?�������-���������5�t|`��q����.0\`��q����.0\`��q����.0\`��q����.0\`��q����.0\`��q����.0\`��]��]���%������1���+���w��������p������[,&>]s�����}O���
/�,w��_YO���������GO���w�J�4R��)|#���l��x�SQ_�Y9mO6��)���M��u�sv�+b�(���wu���@5q�>��A���5���nks[�����5���nks[�����5���nks[�����5���nks[�����5���nh��4�.?Z��G�1p�j����[b�V�6�W�G#�u��ni�����p�(�C^�_Ka��A��������X���iY����QMN�����V_�w��_Kq��M���0����(g������}.�����W��V�
'h?Y{���/��d��Z�hA���c��I4q���w��_E���v�#��"W+42�O3E_F�_������0��B[��	�	���q���� .@\��r��� .@\��r��� .@\��r��� .@\��Z(��%4�T��V'��R��5	���M������r��� .@\������&��5�����C9�����=z�����6�Q������j����2J[Uu�W[U�$+�$U^���9Zk�O�zb��=��+@"w�;������|��3���c����G�g+��T���k��u�EU7�/w����SMyM5�4�Z}h%��Y��)��*�S�%r����T��{%�Y��J���w�//|��!��%9��ORG��#�����od���g���V\���j\���/w��������>��Gj����xq������u�.�e�L��$��AgP�-q��\wnM8��j�9�8���������E
������jA���vn�9#��j��z���^Q!�����/w���S���Du �IN	X�6R���@��H7�]�8�!�D\���U�f%���,jC��^}J-�=^(��K)�;[i0U{R�wS<�	�6�I��K�����5���lkc[����5���lkc[����5�ujy+��jw��*i��{�Nx��1GJ�Z���5���lkc[����5���lkc[����5���lkc[��������5���1P(�d{����V�Iy!}�+i�u=�{���/�[��x9�;~cpW�YRu:���{���/��Z�n;���*0�@��m-��������~	�+�9��$RE��Bxb�������Z�k��"��Q��T������~�vf{�����������:�(�h|����j
�V�/wtya%�����a)�������6E0Z�a'Sne����r�@!�]MN3<�f�U��Z�����6Y���l�e�,�f�6Y���l�e�,�f�6Y���l�e�,�f�6Y���l�e�,�f�6Y���l����������E�Q�6�G�-F|�����&��Vfn���^. �m�y]��Cc������[���3I������F.QFl��������z��Q��b�S� ^y���>�������Z�KMxm�����w��Rkd^���c��;3��p��{��������gC�>��,=YnXz����d�[z�w-�Y;����[>L������[>L���9m�L���3�����oV[���,=YnXz����e�a�����'r����g����|��'vr��wg-�}��c��z���W�yk�#�D�7'��)�k�C!T�P
�l+�Q�� �,GZ8A�*���2`	�kJ��<��Y�����.q���Qej�V#�Y�����e9#"`6�Yj�<d�^����G;0�!Jt�Fm,NL�f#`i�"����<�XS�
RYbn�Z�N�)b	�����;����
�!V�@�5<�1�#6�p\�C�;�1���%�b�!ie�Z��u)�x��L'���,��(����0����8��&��)T���Z�4�GVlt����m,�8E���Q1��d8��5��O8LsH_�R8[���\��\�h��[���z�^��R�����6!1APQ�#02@Rpq�"3BDSab������?��&WZv��������#�sX\��i{�D%xu��c$k�,��Y�i��H��=v��ju<\*�9p���e��MO$n�w~d,+��We�u����P����)���/���c�!�47nK3����$0D�6��1����������&����+����Ym�Q�p1�9��!,#�x�\�2Q��\(�y��X���/��a��T�����S���V;-��W;�h���i������,dh������Z��.R��Y�u$������q(7Z��$��F�*E�s��,���:�����*h�M42x+fQ�E��K�&#1��U%��(�#����p	�2!�|N���&���\��Uczr64���q���p�����6�G�YmA�m������<�*?�h��'�K�������,!P1Q"AR03@ap#2C����?��Q���{F���
�������`����
�E ��;�Q{6�k}�yT�E�Lb�p��	T]�,�@�<�o2C�����_��6�1n�+�"�*��K"���C�#�����'��m���j�������N�L�(R|.�WrfG�f��m���z�T��JN��+�S�c���=&������E����;����K��/�����zV`�i4�b���n��Q��3&��t���Gk�^	�(�����dqW@-c���f��2l.f�5���)�w�K�i����^�����W��;Fl��zk�����@
2�!1A"BPQq��@a�#034R� b�$`r���C�Sp��?�)�r�`�:M���J����GM(��;g��pJ0�����&�����2�/���F���doi���F���doi���F���doi���F���doi���F���doi���F���doi���F���doi���F���doi���F���doi���F���doi���F���doi���F���doi���F���T��Q�}�������l���j�F���7�;M�xLH�mm������i�����4��w�w�y�;�|��<����]��O���*�4��^�/?������}�'��]�\Y���Ju��e���~e��4��|Q _�9e&���sp�����,��,(�i���%��E�m7���R�.�R�7&mQ�^���2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"�2.�"���<��|��9F����c'(�0���wZZ�6.|aLV�:�	?N�_.�dm�gQxI^��B���lXF/W�q���|����s�B:�9�r��Sg�p/�v����&Qo�yP�"�P�c����S��m�Z��u��	�=D{o�o��]�h�5�1R|t�LT���Qh�US-e��]�nUo���8���R�7�z��7,��������AwGo��'F-m�?���>O�i�:6��L��v
�3*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*�2��*��]��5KX�3*�)U���f���u�K*�	���mXl�:*���U�,�R?�e,8X"�a�2��*��o��|a4�"_~�=4K��-xz:h���[�|����L_���#lE;���<L67�@�)'��k��	��R�+�m�������j����5���6�m����t��w�w3~���G;�=��f�~�<����]��B�����=Z4��*����Qo�O?�p/�q�oc6�^gx�r�=G�U�����r�O�z������mO�}���!6��:��lq&���
�3:�3��:�3��:�3��:�3��:�3��:�3��:�3��:�3��:�3��:�3��:�3��:�3��:�3��:�3��:�3��:�R`|g����UZv�1J)V�S�g]fu�S�KTc������H?����~Z_���������[|��������������������K0;~����[@���L����e�I@<ON�~����$��_.��?H�<����d�G���C�1����:�z�O��G
>s��S��S��S
VFc�R�"�@[�/x�)�G����>�^�3D8U3�N�Bi��Q�
��U�a��a�7�G��J��6fQ�r�f��^T=FZb��n����L]�
��N�*gec�Q���l��tZl��xz����jp�?gS�����oo�_/�(C<&�lL����������q�i��3���2��G�}�y�?�Qq{��>�?���3�0T��a��)�S��.�������� R�t#�	����R�vQZ{<A19=�3(��eLVUkZ��!aeZJxU��r��i��{�-9]��Uoc|W�m���t��U�|W�k�;t<��;�T��u���j������pa���O���nMt�K��~��{ZT�u�V�?lW������?nb�*R��5C�Z~]?��-� ��r�~U����	��������UA�����Le���G��~�
fz����������s}�~U��Z~U�V�����R��T��A��l�) JK����-�GH����v�NIU�3
����	Qs�+���?^^���U��l�N�vO��T���v�������!��N����V��:������@��N��}��|5���OG��biPA�w�|M��-*�J�=�'�����U����r�|y�>���w\"�2�#	��
�n�F��g�����������7����0X&[��L	���~�0e�������LO�����1 �|����-���3
6�_h"��|��{"#(;8��I�Z�������l���6r��������2y�>-� 
Kuo+�N#����N���r�����t����X��|��4��n���I�t��I�t��I�t��I�t��I�t��I�t��I�t��I�t��I�t�l$��
f���&� ��Q0�S4���CV�c�x�Kaz�`�T��y��f]&e�f]&e�f]&e�f]&e�f]&e�f]&e�f]&e�f]&e�f]&e�f]&e�f]&e�f]&e�%�������A\��l�����:�p5��	�o9�a�nS�N�Y�����������T�6�O?�p/��W
��T����u�~����)+�v�H6��jy�;�|����PS?gd��f<���Sqk/�������.�5�S�[���):�G�%TV'���������������|f����l8�H��x��m��������3?���#\�"�bN�R�U���l�3�3����4�������g�[e�O���R,�P0�	w���F;@a)S�W�A���t��w���I�k�����/N���r���l����4�~i?z"�]|F;^�;^�:5�*�7Jt��
�S;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�;^�~�����ab65�CB�JmI�
;������ �PYm���Z��t���ne����*o�Oa9�E{�lZ���V������C�l��-uc�l����x�kr�W�{�<����P���%fl���������
�.)������%���^�O?�p/����#/�������O���?���5F����wRL� ^<y��P,Rnp����wVm��w�y�>5���E��
:���1�X�0���O?������M�� �,km���Ds�e�����	�r^[�n�g%�������r^[�n������s[��=s��������\�ohz�c�����9��\�oi�����w7��xxL���s�}����G:�r���;X�u�����s[��s�}������	�r^[�n�g%�������r^[�ng%�������r^[�n������v0!����\�`8C�;��g<<!����3�9��\�_y�����w/�RX������}��,F<v|�xE�E��>|�j@J5�h�k(��� ��,��m2�RK�bSk��������R�����t
m����I�4`:��#��#il7�����L��E=-���1�����/���*�����ke�l��P��|�1��p*2�D�[|�q��P7SpA��e�g�����_+�
���Q��nF����@b�H��D�������h,���oF�3t�V������l�M��_n��2���J��=�Xyy�-�Sl
n0�(�g^�cER�GT�����B�X.5�x�2�>`	����)�4O�Y����k�
oU�ewRjSm.'e�"���_d�s�V��*+}��e� 7e������y2����5����������H�}�2�M��S�����R��}�Rz3��8����Xn ���,�p�)P]L!o�y&��F���}�
�\����6(�m�o�����j���1���&�����]�q������[rv���m��t8W)�����TA���(���Z%�����t��[��I�1������yOI���e7���3�t�lv�nb�Z�&���"�����Y�2����}������>3�j���ag��Ae�WG)Q2�@���)&�{����0 ��M�xJ�����V�e���unN��_�����&+�!-���)���M�O���l���]eCV��yGXT��cT��T;�G�w"����;�<Q�}��*�v���#���w�����j]o��	����*5L S�P�]���*r7�����w�\��lom���B�m��a��Cf�C3(9���M�u*nj����W���	wR����c������< ����k�G�����,!1Qa�A�Pq��@����0� `p��?!����!�8Q���O��D�$i^�cs���A���� ��*���"��6���q���������{r�nY��=�g�,�������{r�nY��=�g�,�������{r�nY��=�g�,�������{r�nY��=�g�,�������{r�nY��=�g�,�������{r�nY��=�g�,������	m�P�LX��C6,A��#0����_[h9��x1Y(`,8v��g���;_�3�n�����7r������k��~M�����vA����TM��L�������@1D�n;L1�������Q�&���0�a��!)����d%?@�4�����3A���7$���'$SX��0���,acX��0���,acX��0���,acX��0���,acX��0���,acX��0���,acX�����A���iN0]��t!5�H�3�'45�����:��%��@:�~A���h�q���M������G@��i�@ ���������oN7���LM����R�����rr@K��G��HA�ND.�s�(&i�~7��&
@��	M%�Y�������rr4�=P�`&Z�I�d�r�W���3�F_���"��>�	
�;&��C �z��~X��FfB�y�$a�V.��fG��D7��'����m������S��vYN�)�e;,�e�����S��vYN�)�e;,�e�����S��vYN�)�e;,�e�����S��vYN�)�e;,�e�����S��vYN�l@��m�	c�?�;"����T$�,�YN����W)�3�L���
t��F�Lu�������AAK�[��0�F���20E4�un���<|�4Z����
l�#�'��`0Jz�*���G��oA��H��>M����rg���`u�D8k���80]�������k���
�;��q��N�P����&�A���#��"WG(�&&#%hq`���:��ra�y7r�����CQ2��/ l�	a�^�2����S�M:��&�W�i�P�7d���%��O��q�K�["l	�A�F@�0���,acX��0���,acX��0���,acX��0���,acX���]@��N��(���������&�z��S0��0�D�.�C�qX��0���,acAc��Y�b�"�2I9�iJ(DN��9�n���a"e	8�4����-7~��!4��]G�����v�?��������+X��,�IA�<��T��	�E�5�����T�	�jD`����i�Dl��#�����0A�����,X�F��P���i��CB�h]�6d��x��#M���N�H8�������D�C\�y���r��=��������[3�@���r,@rB�MM�Hg�~g�,	Z�4?��w���D|+�?W�~��G�������4A�~E��1p
d�@n� Aj!��f�Q	)5G�BM�tm�R���5�h��d�/����0`��v�0DBk%dB�;8F�
h�}�;���M���@������9��A�m�{_��~ |���@�S��QMJ:�ZH�X!	 A��+�A���;_��6�	&`����Z�?�u&S�$�H������_��7�9&#�;/��s�wb9�!�hZC�</&�b���4}�E;�&!�����8����c�3�M	��'�k�I�� �ui0@�DH��X*h�E��#	p�^����N$���'����T�E���#�<�$^|����������DDV�cd0�?0!d0�?0!d�&�����/�;&�K�M��Ej���@]j���@]BW����>Bs�~H��#~��"��[A�E���Qt������8e����&a 5L"���^u@m�O
 h<�`[O�M����2�L4��0�	����X;�Yi#�@�_K |��P���1YN�)�e;��u�����S��wYN�)�e;��u�����S��wYN�i��G�|�Es�~o��P4�Px-�f�\�D{�H[��^hw���=VS��wYN�)�e;��u�����S��wYN�)�e;��u�����S��wYN�)�e;��u�����EAK5_�v��@Qz,��;d�.�f�r)������x�'��AgL����{��F�-����k��O��q,%0.������'����^����v������t��me����3�Q��X����*�������v����%n������wC2�������,	`H&�c�	������F����&�A������2Q`��J���N!����/��?����b��������\����7z��S}>P�!X��
^���6�Q=���x6��(�{L���+~'TP��C�|�D�����F�MYI~c�!}(��o�eo�d)�hp��{�c�BgEo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�eo�d�CW�u��U�6���P��l�`D�]`�p�d�������p�me�����&����@��wL1�U66�_�������3�<����v�����@�"���#"���D�C���m��������A��(����^��d@d,>��]�}�=�o�w <LaO�uR:�����a�3������Xa'������k���A`6�MDQ�BkP���7z�E����H@��PE��D�����(���N6���\��Q8����r��!)D�k�.�� � ��+.�� � ��+.�� � ��"�n��A�A$5tO C�Qj � ��'�!���d�I
]���Z����-DY&CWD��C���L$���;�����d�I
]�B�d�I
]�B��@���V]�ByJ'YYt.Q�%(�-ue��Dg���p���B��BR���V]�A�A$5tE��55��$�Hj��=�jk]I���<��E��$�Hj��@�N��M ��Aj"�0��&�c���d�I
]��1n@a@�Q0�L(�P&L(
&	�
���aD��0�a@�Q0�L(�Q0�L(�P&L(
&	�
���aD��0�a@�Q0�L(�P&L(
&	�
���
���aO^��g8�L6QbN���8����n�����%����)��a��Tk,�f����9rH�bk�L����W^� � q='D�v�0!xs>���4wwH�l���)c�'StMd1��K
psV
gZ��bD�$�T��dFG��iI����������x�\d�	,���4t \�wH�@D�w 2`�&�WM��
8%
��NkV��n����!��M(����Zmy��@S�"	�pD��`|��f�E}8H���x���tb��r	8@��&hsH���J1wP��������'D����][x��8�$I,~�-�$��������^{�	q
8� ���P����D��&(?��F�������/`�@��ay�D~�N`��R��
�R	��-y��Dx <9,����d%u ��B
�
�C,��u���r�\��B@M�D� ��tj� �!pd8�4����N(����NWK%GB�LH?"�4NP$���b�0�fk��!lf�jN��g�]c����:K�����s���?�>���BD��M�51#�
�������S.!)2�~C��� ��uzW��v�����3���@`�$�-I:�[�u�Doq.����������IV�������8��1e� h>nHt����>O"=�
+^�c,�(#��hp���I�����tF�M�(80FKM�QT��V��u��?���L8�0�0�0�0�0�0��<��<��<���O<��<��<��<��<��<��<��<��<��<����<��<��<��<��<��<��<��<��<��0�O<��<��<��<��<��<��<��<��<��<��$0�0�0�0�0�0��O�<��<��<��<�O<��<��<��<��<��(�����<��<��<�O��<��<��<��<��9�_
��<��<��<��,�C0�0�0�0����Xb<��<��<��<����<��<��<��<�����<��<��<��<��<�O<��<��<��<��{�9���<��<��<��<�$0�0�0�0�
p����#���0��0��<�O<��<��<��*�oos�<��<�������<����<��<��3�H���<��<��N9B<��A�0�0���B��0�0�<��<��<��<����<������<��<��<��<��<��<��<�O<�����|��<��<��<��<��<��<��<�����j��0�0�0�0��<��<��<��<�
<�����<��<��<��<��<��<��<��<��=��|��<��<��<��<��<��<��<��<��8�<��<��<��<��<��<��<��<��<��<��( � � AAA@#�<��<��<��<�E0sK�G��,8��,�<��<��<��<��<����<��<��<��<q�<s�<��<��<��<��<��<��<��<��<��<��<��<��<��<��)!1APQaq���0@p��� ���?���i��*���s�ji��B-��9���������A.��E��bi�c������Z�&O��w6��<�N}*P��c5�������+�f
��`Ye���$�o�V��{�An�u/pNo�>�3NpUvP�{w������P-�@�PR���]kX�����t�f�ft�QG�Z�ue��XEa��V��^i��S�gB%�='G�������}@Ku{�E�A��:
h�s�@�X^�R����,D,�:gX��x~~���
�[�
zs�����C���������PX�	W��9f��}b�E�M���L��0kl8���RK:��BM��=�S�����#�����n��bYi����������g=����c����-<������(�B,�%K��{����2��Dj�x��a5����n�����)�0�
�@�>����*!1AaP�Qq�@����� 0p���?�A�x�w�R������trZ���c�U��FliU�z�J�	��M�"O:�v*��>4��d�7�3/!~�z��=�x�)s �����8��\}���V_E{N'��>��q����.�(�3iw��v�@���
����b|X���S=��Us�ULs9Y��
6Nwx�J�����`�����A�u�6�����T
3���EO���t��!�?j������3C��_D�W}�����w"����TE�@�h`�J(���}g��e}Ey�����1u�lE*��v�X7�\���
���Nwx�����9�s��/Z��hv��R�#����8O����Zu9}����,!1AQa����Pq�@� 0���`p��?����$�N��O�����e���4k	�PWVIz��}�
�����L�k	�}�J�nG�i/�p��� @� @� @� @� @� @���,,Yh?������Dx�����2P��'0���:��"���(�� �
��LVi�^��<.�s�^��]�������=��{�v{�R����>��Nj$X�7�*�$�k$}Z-�pA0(�#{��C'L
 H������\1,���g��/�[�B\]4�������E�p��+d,�d�E�p��+d,�d�"��e�ILKR���i���U��xWj����]���W�v�
�^��+�xWj����]���W�v�
�^��+�xWj����]���W�v�
�^��+�xWj����]���W�v�
�^��+�xWj����]���W�v�
�^��+�xWj����]���W�v�
�^��+�xWj����SctS����d
��09��d�;���:��tl�����e�	��������S�
-hP��z��d�>P}{������%����(����!�M��3}�/�T��"w
�dqsPx�O�9{����>��{>�1eW�����U���n#�������.�B �������@��Xa���3V��:_����bZ�/�P*�]Z�,����n�G�1���C#����v{8�iz�~Y`8-^�����>(�6cLL����E�B������WJ��-�X�	��	�7����@�?��X'�'�Y�ICR�>���8�!?,U�����XD$�10���&���\i�&*�$jq��Y�hL�*��Q�v*� ���J�*T�R�J�*T�R�J�*T�R�J�*T�R�J�*W��lS���c�U��6�� (����U�*�3��a��S�0�B����*zn�~%��h(�$NqV1 �!,
�4�*U���������6GZ�$����pA-E���|�{�O?O���.>�^��Z��f9-3�j���������)���	�2EfB <A�Z�V	r�������2l�E�N�W��{�v{7�����e{�)�B,�,'ji��&�(�
 �����>g�xf�ny'�=b�yC�:��ef��G�u/`|.�dU	;-`�
l��+�$3+���
�+	Ve(�@�_���^��]��.EzJ�����	�J��A:JX�����E 7X�LbB�`��ry-�S��=K��y�q���g��	�p�Z�px���#��0+&p^���F�@�k����w�
�^��+�xWz����]�����w�
�^��+�xWz����]�����w�
�^��+�xWz����]�����w�
�^��+�xWz����(���\^�.����:�L?%����G|"���xWz���|m�@	����h&��	��F�N�����,j�s^��+�xWz����]�����w�
�G�%
c��o;9���4I1���!(7kj�u�X�,'-���G0'+�����
�!���P�9f�
�SF�E>���2p��!!��H�y_����hJX,K��xz�>��}�GAoXai���*�/f�s�T�HP��*|�ap`%),L�k�Y[���3r&�L�����B��]�I���b�L���H��$ �����7���K��J$�� Z;$�+$�BIJ�y�4�E �i�#��+��fKAx����g��9���hhts�B�,�d?�������	&.���5�B^K�U�$o��7�1����L���J����#�BpO������1"�&D�
�*_�4�e�-9�q@�����36���<����zs��- X^h�r(F5��\5D�$�(thPx�b$��5���d�Hb*�b^�  ��-��Z��,�D 	,�ED�B�(��,�SjM�A�+{���y�P�0&E\���@�Q��r�=����"���b'X���]���M�
������HJ8�TXH�!�zn�V�6m����������K���Tl��B���2B�k����dk7H1�Hc$��G�/�'\5t&��������Y�!6���a�\!A:���,5o�	��r*	6
!:��l����`��v����FKwvdSyv����8F�o�r&R��Q�D���f9Q�D���f9U���q
�1`3%����=C�ko����%H@����uM]�i��D_�*��p`����C��G#S�!��`���E�=���C���V�	��}K�V0���<D�*����r��@�q�y�nRp�X:�3A�bH6)qa��\�M���D��T8�]��I�oW��63 Se��o5y��c26Y���Q#5D��UD�+W	J���H^���^&B���@�
Y%��{�;$[�&B���g�.���~=!���j;��L����lK(�����:(����V&t��qB�hAC�eK�(�ZC|���u/�B`WcJ��i���R�bv������7��Q�$-�������7H���N���P�_��*T�R�J�*T�R�J���*7�-*�h���MS����s�Q{,���c�
.�MF2.
�+n���f�
�����e���<BIi�Bc�%J�*T�R�J�*T�R�J�*T�R����u^��v~d���m
S+b!�S�i#��6D�sH��%�~G�6�e�"�K�!,�������}d��m�@����@����?������?�`��HnS�4�A��q_(�}��M�ZU�!��������?�bM�T�Jr���W����*��T���!(��Z��6��O�u/`|.��YpKhA�*WVH�F�x��"q�|YM�B�����J�e]���xz�B���VY��&!���������c�2��d������{�v ���L��+�H^�w�d�(��b�:���g���������+����R�n��������-���J1��LLU�g�,�Zy��i'�_q$�^��`a���$�l&��O��aL��D�9$��B�c#@b���^���K�*)mP;�lRX��J��� 3����h(P����Q	B��F_J��/?�/�T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�R�C1D�>T?/^��\&3D nc
�K*��!�`�$�z��,�bH.���g��n&�x@|��r�P��I-�N�YD�.i�A�@0���� 7����P�E��,e�P�L�H����{�vP�c�,�oN�u�|�_��ff~�������3�gE�?(Sd��������]K�)@],��p�/����(���	D.�.U��}03$1al�FATpP��R�o������v���c��J`d��wb��+������������Yp1��g��������%~�p�8�%�%�.���l�F�)YA-���~k	i������B���`�+�����T�j�o�[�OZ:���k-�I�eOZ:���k-�I�eOZ:���k-�A�uOZ:����o�~�=hH�������T��#��N��N���]�]A��������������o�~�����k�Prp7�?jq�bu5�)���������o����n������
���������������T��#��NN���S����i6Z�rpYS����i6Z�rpYS����h7Z�pp]S����h7Z�pp]S����h7Z�pp]S����h98��;[�w�u'|��k`v.�����o�~�����k�Prp7�?jq�bu5�)���������o������hc��rt7�J6�p�6���4�ra33������6��	���n,DDi�\�L����qb"#M���ff5�O��|�����k�������	���?3M�Y,�-����o�Q�-��?3M�J2���b��i�iFP��lP�o����m��e�-�G��lV,�mj>F�b�L&fc\T|�6�\�L����qb"#M���ff5�[�mW&31�*�X���j�0���qV��DF�U����k��""4��L&fc\T��7�d������i�k%�e��?3M�J2���b��i�iFP��lP�o����m��
��5#M�X,�%�����~v�;DL�����Y1��H�q�E�&��X�����T�A8���U�I�bR�@�.XD�(Y�ak�b��������
�p�L�O����3d"ap��IL�\�Bn����J96c�"�b���AE'$�%���D���*�
q����R�n���Y1��H�3KX5mL��P����"�RI�Q���7�����eM��u|1aV��k0���O@�#`1�C��0wQ�����0`H��R8�%������4�@@��H�34�P�0! ���&25sy�
`]`^�H/�$4Ht����P���h��%�� ���S�GDV�i����{��KM L3b�����|@��T;xN,���T=%(��#���|���M�'���x� ��
lM`�� Vv�0cDR���bf����5W���*�X(LRf�������hU`
A~���a#
�RJF��lI1!$������M�(!@���b$PHD@B"�"�R���Z��b� d��$�4�$C����!�%����f%�a26�Kp�
>� ��[,��`U���I�6{Cz����
�K���� �DF�B/�2�d�(!@��B�H:�E�� �,���6�[n�R�b�1C]��!)��Ct���)cC��h�HU2KH��������%�A��&�� '�v�ZD���{�����`0]���K4�@��9���J�I#*K�U����� � �-'E�w���b�V
�4�;s[2��A��J�j$���37�3#Xte����e8����J���
���
{RK�a�6e�� �����(6����n��f"]z�k�ukX�����D`�Q�K�� H��e�x~E�T�A���hB��:���jR7�U�h ����D�	��� 	3�QI&(��	�������@(�K%J�,��d����
X��:����}%�3w\����fa�0f
�n�H���H� �HB�I|�q�fQ����
#28 John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#27)
Re: add AVX2 support to simd.h

On Tue, Mar 19, 2024 at 11:30 PM Nathan Bossart
<nathandbossart@gmail.com> wrote:

Sounds similar in principle, but it looks really complicated. I don't
think the additional loops and branches are a good way to go, either
for readability or for branch prediction. My sketch has one branch for
which loop to do, and then performs only one loop. Let's do the
simplest thing that could work. (I think we might need a helper
function to do the block, but the rest should be easy)

I tried to trim some of the branches, and came up with the attached patch.
I don't think this is exactly what you were suggesting, but I think it's
relatively close. My testing showed decent benefits from using 2 vectors
when there aren't enough elements for 4, so I've tried to keep that part
intact.

I would caution against that if the benchmark is repeatedly running
against a static number of elements, because the branch predictor will
be right all the time (except maybe when it exits a loop, not sure).
We probably don't need to go to the trouble to construct a benchmark
with some added randomness, but we have to be careful not to overfit what
the test is actually measuring.

#29 Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#28)
Re: add AVX2 support to simd.h

On Wed, Mar 20, 2024 at 01:57:54PM +0700, John Naylor wrote:

On Tue, Mar 19, 2024 at 11:30 PM Nathan Bossart
<nathandbossart@gmail.com> wrote:

I tried to trim some of the branches, and came up with the attached patch.
I don't think this is exactly what you were suggesting, but I think it's
relatively close. My testing showed decent benefits from using 2 vectors
when there aren't enough elements for 4, so I've tried to keep that part
intact.

I would caution against that if the benchmark is repeatedly running
against a static number of elements, because the branch predictor will
be right all the time (except maybe when it exits a loop, not sure).
We probably don't need to go to the trouble to construct a benchmark
with some added randomness, but we have to be careful not to overfit what
the test is actually measuring.

I don't mind removing the 2-register stuff if that's what you think we
should do. I'm cautiously optimistic that it'd help more than the extra
branch prediction might hurt, and it'd at least help avoid regressing the
lower end for the larger AVX2 registers, but I probably won't be able to
prove that without constructing another benchmark. And TBH I'm not sure
it'll significantly impact any real-world workload, anyway.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#30 Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#29)
3 attachment(s)
Re: add AVX2 support to simd.h

On Wed, Mar 20, 2024 at 09:31:16AM -0500, Nathan Bossart wrote:

On Wed, Mar 20, 2024 at 01:57:54PM +0700, John Naylor wrote:

On Tue, Mar 19, 2024 at 11:30 PM Nathan Bossart
<nathandbossart@gmail.com> wrote:

I tried to trim some of the branches, and came up with the attached patch.
I don't think this is exactly what you were suggesting, but I think it's
relatively close. My testing showed decent benefits from using 2 vectors
when there aren't enough elements for 4, so I've tried to keep that part
intact.

I would caution against that if the benchmark is repeatedly running
against a static number of elements, because the branch predictor will
be right all the time (except maybe when it exits a loop, not sure).
We probably don't need to go to the trouble to construct a benchmark
with some added randomness, but we have to be careful not to overfit what
the test is actually measuring.

I don't mind removing the 2-register stuff if that's what you think we
should do. I'm cautiously optimistic that it'd help more than the extra
branch prediction might hurt, and it'd at least help avoid regressing the
lower end for the larger AVX2 registers, but I probably won't be able to
prove that without constructing another benchmark. And TBH I'm not sure
it'll significantly impact any real-world workload, anyway.

Here's a new version of the patch set with the 2-register stuff removed,
plus a fresh run of the benchmark. The weird spike for AVX2 is what led me
down the 2-register path earlier.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v5-0001-pg_lfind32-add-overlap-code-for-remaining-element.patch (text/x-diff; charset=us-ascii) Download
From d47b3219fd1b803a5dedff9babaa5134c07e6947 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Wed, 20 Mar 2024 14:20:24 -0500
Subject: [PATCH v5 1/2] pg_lfind32(): add "overlap" code for remaining
 elements

---
 src/include/port/pg_lfind.h | 102 +++++++++++++++++++++++++-----------
 1 file changed, 71 insertions(+), 31 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..21af399dc4 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -80,6 +80,49 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
 	return false;
 }
 
+/*
+ * pg_lfind32_helper
+ *
+ * Searches one 4-register-block of integers.  The caller is responsible for
+ * ensuring that there are at least 4-registers-worth of integers remaining.
+ */
+static inline bool
+pg_lfind32_helper(const Vector32 keys, uint32 *base)
+{
+	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
+	Vector32	vals1,
+				vals2,
+				vals3,
+				vals4,
+				result1,
+				result2,
+				result3,
+				result4,
+				tmp1,
+				tmp2,
+				result;
+
+	/* load the next block into 4 registers */
+	vector32_load(&vals1, base);
+	vector32_load(&vals2, &base[nelem_per_vector]);
+	vector32_load(&vals3, &base[nelem_per_vector * 2]);
+	vector32_load(&vals4, &base[nelem_per_vector * 3]);
+
+	/* compare each value to the key */
+	result1 = vector32_eq(keys, vals1);
+	result2 = vector32_eq(keys, vals2);
+	result3 = vector32_eq(keys, vals3);
+	result4 = vector32_eq(keys, vals4);
+
+	/* combine the results into a single variable */
+	tmp1 = vector32_or(result1, result2);
+	tmp2 = vector32_or(result3, result4);
+	result = vector32_or(tmp1, tmp2);
+
+	/* return whether there was a match */
+	return vector32_is_highbit_set(result);
+}
+
 /*
  * pg_lfind32
  *
@@ -119,46 +162,43 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
+	/*
+	 * If there aren't enough elements for the SIMD code, jump to the standard
+	 * one-by-one linear search code.
+	 */
+	if (nelem < nelem_per_iteration)
+		goto one_by_one;
+
+	/*
+	 * Process as many elements as possible with a block of 4 registers.
+	 */
 	for (i = 0; i < tail_idx; i += nelem_per_iteration)
 	{
-		Vector32	vals1,
-					vals2,
-					vals3,
-					vals4,
-					result1,
-					result2,
-					result3,
-					result4,
-					tmp1,
-					tmp2,
-					result;
-
-		/* load the next block into 4 registers */
-		vector32_load(&vals1, &base[i]);
-		vector32_load(&vals2, &base[i + nelem_per_vector]);
-		vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
-		vector32_load(&vals4, &base[i + nelem_per_vector * 3]);
-
-		/* compare each value to the key */
-		result1 = vector32_eq(keys, vals1);
-		result2 = vector32_eq(keys, vals2);
-		result3 = vector32_eq(keys, vals3);
-		result4 = vector32_eq(keys, vals4);
-
-		/* combine the results into a single variable */
-		tmp1 = vector32_or(result1, result2);
-		tmp2 = vector32_or(result3, result4);
-		result = vector32_or(tmp1, tmp2);
-
-		/* see if there was a match */
-		if (vector32_is_highbit_set(result))
+		if (pg_lfind32_helper(keys, &base[i]))
 		{
 			Assert(assert_result == true);
 			return true;
 		}
 	}
+
+	/*
+	 * If any elements remain, process the last 'nelem_per_iteration' elements
+	 * in the array with a 4-register block.  This will cause us to check some
+	 * elements more than once, but that won't affect correctness, and testing
+	 * has demonstrated that this helps more cases than it harms.
+	 */
+	if (i != nelem &&
+		pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]))
+	{
+		Assert(assert_result);
+		return true;
+	}
+
+	Assert(!assert_result);
+	return false;
 #endif							/* ! USE_NO_SIMD */
 
+one_by_one:
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
-- 
2.25.1

v5-0002-Add-support-for-AVX2-in-simd.h.patch (text/x-diff; charset=us-ascii) Download
From e8337b123d828671d5c547d2a96485ef15f4ddfe Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Mon, 18 Mar 2024 11:02:05 -0500
Subject: [PATCH v5 2/2] Add support for AVX2 in simd.h.

Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/simd.h | 61 ++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 597496f2fb..f06b21876b 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,18 @@
 #ifndef SIMD_H
 #define SIMD_H
 
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+/*
+ * XXX: Need to add a big comment here.
+ */
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
 /*
  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
  * that compilers targeting this architecture understand SSE2 intrinsics.
@@ -107,7 +118,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 static inline void
 vector8_load(Vector8 *v, const uint8 *s)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u8(s);
@@ -120,7 +133,9 @@ vector8_load(Vector8 *v, const uint8 *s)
 static inline void
 vector32_load(Vector32 *v, const uint32 *s)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u32(s);
@@ -134,7 +149,9 @@ vector32_load(Vector32 *v, const uint32 *s)
 static inline Vector8
 vector8_broadcast(const uint8 c)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi8(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u8(c);
@@ -147,7 +164,9 @@ vector8_broadcast(const uint8 c)
 static inline Vector32
 vector32_broadcast(const uint32 c)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi32(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u32(c);
@@ -270,7 +289,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
 static inline bool
 vector8_is_highbit_set(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
 	return _mm_movemask_epi8(v) != 0;
 #elif defined(USE_NEON)
 	return vmaxvq_u8(v) > 0x7F;
@@ -308,7 +329,9 @@ vector32_is_highbit_set(const Vector32 v)
 static inline uint32
 vector8_highbit_mask(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return (uint32) _mm256_movemask_epi8(v);
+#elif defined(USE_SSE2)
 	return (uint32) _mm_movemask_epi8(v);
 #elif defined(USE_NEON)
 	/*
@@ -337,7 +360,9 @@ vector8_highbit_mask(const Vector8 v)
 static inline Vector8
 vector8_or(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u8(v1, v2);
@@ -350,7 +375,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_or(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u32(v1, v2);
@@ -368,7 +395,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_ssub(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_subs_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vqsubq_u8(v1, v2);
@@ -384,7 +413,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi8(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u8(v1, v2);
@@ -396,7 +427,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_eq(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi32(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u32(v1, v2);
@@ -411,7 +444,9 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_min(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_min_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_min_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vminq_u8(v1, v2);
-- 
2.25.1

v5_x86.jpgimage/jpegDownload
����JFIFhh��C		

 $.' ",#(7),01444'9=82<.342��C			

2!!22222222222222222222222222222222222222222222222222��t�"����������1�vnGbo^<LQ1D�LQ1D�LQ1D�LQ1D�LQ1D�LQ1D�LQ1D��>�����0'I�N���{@^t�@�:�����u�')���G&aA5�PMA?������g�f�y��j	�&���j	�&���j	�&���j	�&���j	�!}v��t:L�>j�:��������f������g�����@�������{?�2SLQ1D�#��-e�����7�(��b��&(��b��&(��b��&(��b��'��w�����W[��Er�o'���	�e������\�\�=0Nk�t���<e����nyK��e��-���M�g�����n{)�e������\�\�=0��1���N����N����S�Mlo���?;�/�������O-���g�>�5���f�x�L��j	�&���&9^k���Q5���j	�&���j	�&���j	�%�lg�z~���w�=����y������g!s������x{ZMC��Q:T��t�@b�b�y�+(L1��/��v�|�����g�d}��1��cbz�:N��R}���6Q��PMA5��_<bd�#���WQ���&���j	�&���jnaM��9>����<�3��w�����:}����j	�&���A�u*�Y��ym��kQ����a�w�Gu��	�e'��y��Y���������N������;���$�w3�;���������c���j6UvC�,�^��4��o�#���6�G��4h�y�M����[�yty�M���bs�Z�������LQ1D����������y�+#�uM��&(�7S��������<�_m�^�����wG��WS��g�9n�������9T��D�LQ��'�V�����h�_=
c���������zs|��7��fk_�o���x�v�k)���>a�����{3#G���������K���������o�~�Zg��2��������t���PM�n���<n�O��B,����VO�kU��S����8���b}��2�d���{W<=����Y��SE�~%�������t�@���x�����
4y:�3�v�=��O\�E��s����C��yh����|6�=��������@���6�������������yA���:N�7����vL���(',�,/���k{��F�O3��u~��������y�2v��rTg��������>[��j3�j
F>����^���������#O4��_?�>�O���>O�W&�y��j	�&����������w������Z���Z��L�=5�9�U��_D��E���9���^; ����G��^c���'@C�2t�gbo'�&O??�:h�x���i��]�%^����L��;�C����;�7���s�/[��?�4�+;��#�6�D����&(��b��&(��b��&(��aH\<�e�.�\�NU���i�ze������A�y��E�y���>oG?����N��x�3�.����x>}>�:N��x���c2����~�����'���7���^�F3(a{��b���c2����+(a{��b���c2�~�~�����'���7���^�~�������/���3(b���ce���A�A��#�&�w��L}6�Hm�}�r���<s�':n��T�v����:�T��<���4}!m.��6�ll�����2�[����D����0m6�9�?w7�,�A���k�cm�<���<���m�x��������&�>}-���0
 !"023@#P1$%4`&p����?������WE�3Ek����l�d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�u���d�'Y:��N�����t����OH"QU�Q�_�/w�����{���������c�M�&�^��/����n����!}C^|�g����nks[�����5���nks[�����5���nks[�����5���nks[�����5���nks[�����5���nks[�����5���nks[������vb`�^"�rb�1E�����;�~�;QX����zO�3�[��F�a����s���zG��?��ydc��Q���kZ;0���W����C1(�/J_�t_?�u-E���Ann��1�^�~�;0��W)/Ct��4�������u��]b�X���.�u��Y�qtL�����.�u��]b�X���.�u��]b�X���.�u��]b�X���.�u��]b�X���.�u��xw~��"��.���C����)���
��R$�������E���?�{Oh#��C����)}�O���0�J2�j.�l��ip����O3�A�]Y��~��*����c�����@0��!�:��Bf�
���/xj�U�f����$��%��Q�^�|�O��;��/Ij������=��V�^?�^��jP�0�6��F��{���-���������5�t�x2���e�,`�X2��?�R��,`�X2���e�,`�X2���e�,`�X2���e�,jS��8:���-Nm��@�^����`�W��T�\$X2��4���e�-N"��q�^����}�R~T4����Cj^�UQ��kN�%4�n�{���2���&� �Z]V����u�1�|Pw5��H����{����M��i��XZ��_��{b,"��j��6����JN����[��	�	���q���� .@\��r�7 )H]r��� .@\��r��� .@\��r��
KG�O�r������<"���j�=fu�@\��r��� .@\��rr.����R��K+�,���e�fG8��K�	X{4�*��+s����T�����u����b��Le����C����o�=7����������bQ�h����TB_^?������S�_KR�FUk
X%�� 2�4Q���T+��E|�D��o��3JE�D����n���4v��#}x�/@�)rm�O��u;^�	^j*��Z>Y$�Q�Fn	 �xk����
����,(�C���K#Ej�<e����K�{����P����������������6xi\�MM�Sr8������G�z?��L��GU�������x��dl5��K�!������k�Q��5r�O�8T���i&��a�{���^�4�#�Ri�����3 6���l�e�,�f�6O#3�f�6Y����6Y���l�e�(��T��l�v�[���-����������ul�f�6Y�����'�1�6l�e�,�f�6Y���0.PY��3
j�E_6Y���l�d��]��Ar��!���1Enkj+s[EvW����Z�c�Y9����[�S	��1��,}�01�����j?���/��?�����l�����T��h��u��oO��cZ���)�����^�����n���
���8�!=�A��(lJ����e��)V�-�X��,�0��`=E0����TT�����Xw(��G4Rs�9�0�4FU,���G���w��I�f����]6l�Xt������M%���XZ�-o�X�=P��������kuz���qW� �`X���-��}��.�������uv��U�����0Uu-.��KS��E�h��}�o�}X�]����i��N%�q.�r]zd���C��������$fl���]�4���v����T�^�$Z�F�������C�O�kM��%j�v��U�@-[f[2��e���=B�}i�ha)�;�������w@O����d��v��,M�>N�u���T�K�ruk��������g���d�Z������@��N�u������O8{F��%�=�]u��V�i�7\���3.fF\����d�'Y:��N�u���d�'Y:ww/���u �EV	G�t�
���:���m�Z�3+m�G��X&���~�i�����������<��
�. \@��p����W. B>�{��N|�V����e�����5z-2s����Z�j���K�
�:�u����]�0j1��e�<>��^�����\z����e������6�MC��������b��a���X���.�bQFh��1�>�{��O�E*	���?�Y�9��!�c���7W����@�����
������v�.�d���r�*�U��iV������{���qX����?��>H3��TZl/^������u[VK���U��kc[���a���`vX���$�5���lkc[����5���lkc[����5���lkc[����5���lkc[����5���>Y}b1
�� �(�C]�Ui���P����t���^�v�2�8�h�������#m����z��xk��L�7M��4�������N�5(YDqn�������/w�3�+C$��!�~���b�E=6"�m��d����nX>L�l�'}�2�-�2�-�����d��a�^e�`�3����|���6N��e�[��?�'r��g�d�;�Xy��nX>L�l��g����d��a�^e�`�3��l���oO�����e���y�������I�v9�����^f��"�M�V"�M0B/)Y���*�`�`�Y�U��&|o1	��lIFY�%�!������!JD�'���,�Fc��<$V?�����_�Z���������^+��r�c�+v@��T�gb5m�1��7��4F��.1�x���kE;�`��csV�u\d+h��)
���2�NFy��b�NA��x�A�Z���X+F�-[�@6`�	i�rEp4����SCZ8�24��
h�F X�n iHX�Hp��<X�x$#����QCF����9��
����m�b8:�Ct��j�puf�F�Y���P�
��
����	�:���
7iz!��/DT�/����5!12"@A0BQRa�� 3q��#SpCP���?��S�t��<%����y�=< 7Z�����I)��U>|#����$���&�Gw��9���'d:�������v�D�K���,ng�$�#�t�t&`
f;�5����S��Q`g���v��m;�[��k��EME;H���"2O������r*���fx4�)���|7����T��	�GX���F�K�d�T]���`�o
�f
g%M�v�B���Ob�1����ctXt�K������9���JkCrU4���0U6_xj�P��9+��D�_�y�2�a�w�s�+�)���
T�S�J�n_z����_����G���*������"eJ����L�h����D2��v�J��q}��w�����O=�-�?u����[Z_�������p���$�����c�d��]����T�S1���6b���~S��@��M�I��F+s�n-��2=3�WD�������1M5�,-�����n�OG�*�G���U^�L8�.7Z�0]�|����-��n��h�=�����m��Y���_�E��8�Ln��vP������7��O�m(�=�����T�!��6�g�?��9
1"2@AR!03BQaq�� $C��p��#4Pb��?�|����l�/l�Sd���)���E���d�v��[dtbY�pPk?�c{!���M�$]s�:���h-Q���M��N�?v�������wL�����T����������Q�d��2��d���V��I�By��Nqq�Qk)((��au@&K���E�I��tx�zs��-6��Mvrh*�QHZ������
qQ�T����4��W���A�lD�cKI���]`
>���
��=�M
h��S
ICP�SJ|q����Z���Q����}�)�'���P�8�+�8Y^�KCI���C@Wz~^,�b��GP|7������6���~H�k.QM���=?���������XP��	��Vp^(���J������R~kf�.:�3���<�HdB��!�#E����rs�W&�qQ�#�Vm��4M3��a���4PX�Ax�k��?*<j�Cuw����e�����
��+UvN%q�I���M�(ydD�#Z��7c\S�x��|�%`�;"�
Nmg���l�����_5��r�
+����_0{�����7\�N��7A�$)��c�E���>���T�t�_�M%��I{�A9EN���F��������.�L�m�f�r��T.M!��rGq�����G�\�qu���Q����*S�r���>�I�Ss�h%���J���a�������9��A����C2!1�"ABQ��@aq #0��34PRb����$r�C`dps����?��$��]���9%�`tH.��9�^>�X.���m��@C��R���%N��*�P���tT;�����wEC�*�P���tT;�����wEC�*�P���tT;�����wEC�*�P���tT;�����wEC�*�P���tT;�����wEC�*�P���tT;�����wEC�*�P���tT;�����-"~��4>g�q��f�7���,�Y���
�|�y&���~.�_��o�����x��~.�_��6\###�,�-�4{��|��f�w��6L����=~^���{�s�U��hZL��~�78�W���4���~Y-��+�����6��KsT�5KsT�5KsT�5KsT�5KsT�5KsT�5�y�-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�R��-�2`c���D��+P�Bc���+R��=�62x��=�62x��o��t��Va�v~G�7��8&z~��\��8������e�O�G�4&��DxMG�d@Z���_���������-^����_)���o��)���PH����M3E�f�f� ���������Vv�i�����NsB����b�N��� }P.�l'�[.�h���Kg����s���n��wE[�*��V���tU����n��%V����V���tU����n��wE[�*��V���tU����n��wE[�*��V���tU����n��wE[�*��V���tU����n��wE[�*��V���tU�����;��������g�~H8�wE�����o��oc?����(��;dL��+���S�?�kuS� uPG��d?6���o��gc?tG�4#������-�(���-f�"���h�2
�r�T6�h�1�?_�O���x�Oph�T�r���h�7�(1�M/�7����uPG���8����?/�n��|4��,3�7��d���]��^$��������N�!������Z}���m�g���Ei>�Sh���������Hg]�=l�u��x�������h�i��{t'~�����?%�Bt�����m�5'������9�����������P�D���@�d������[��os��V�1[��os��V�1[��os+�[��7��1[��os��V�1[��os��V�1[��os��V�1[��os��V�1[��os��V�1[��os��V�1[��
�D�q���{�E��P�`�b���&����TH�>L�[��os���5�}��GZk{���b�YkX�z��+{����t������c<S���BdB�},x&����H�CnCJ~�W�>��b'�P4�bV�G��_��o��c<I|GI�6!m��{{4s��T��������
�K��&C��^�=~^1��
����W���th0'l���%��&��Z$f�|��h�xk������xw���@�o�B����{�EOH ��X���C�����������u�Um�V��m�V��m�V��m�V��m�V����U�4�Um�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m�V��m���C�?�|�[sx*��k_#
��H9�"[`�Um�V�����s��4BD�$m�V��m�V��m�V��m�V��m�V��m�V��$�o�v���~����_p_������-1�i��^�;���L�/������%�q�?�(:9�`[��}�5��dE}�K����2,�k�jWN$+Vm�S@E�������jZI`����C���`���v�7����A��S��
m����������`����������{f�#�2��lI<�I��n�@��N�^�����8����D���w�{��R�<}>C�x1�@�#��Bs�A�o�9��H�?�k@!��6ph�����&���b&:R���d�K���$�n��7���,�n�rl�;B��	������S����k�q���jR��>-2q2
�f������Wy�������������>�b��3�����IF?�J���s�����/7-R��=,��L�k[Jrs���{~����/�9}���~��c�(�����N�='���m�^���zO���6�M�V���k"[c���+HEm�O�O��NW���Fn�iK��S+�M��,,}{,b&���\<����6P;t'~��p����|I�!�
��
e��/+qk		�wym��i1-|���+�j�m��T�"m��j��n8~U��V�)[��or���V�)[��m�Zt�[��or���V���)[��or���V�)[��wm>�����+{���/�D���xfw;���G��e6��&����%���)����r��+{���R��J�]}��������JN�+{���R��J��+{���R��J��+�nK{�����Ll�=��or���V�)[��or��q�����b�M,m��d��5�{a�>��vt@@iu����1�09����E3�����8L�L��6mt����@�f��q.�BZ�M�K�����-[&�%�T��ms\[��%����m�d��K\&�$B�s�6y�/x����-�1"D-[l�;��?H�hh�o�����s%�>��L�������ip�
x��T�|WT{4'~��{�+f���{0����1�nV�J������-s\O����W��
nc�0���T��)�M.;{i
��
����,m
py�z�`8��������X�X�w_���v���9��|�kg��{��7:xardx6m4������F��
���!����l���A��p{���E�������c�7���)R�N��H�w��b?�S��������`�`k(Cfh�26�n���(�w(#���o��g[���eS�4�;L��?ex�X|&z��c}=�-8�R�D(Z�����f���C��4��8pV����#�S��g�;4'~��E�`���=��0������f�=�5��T=P�C����/��w�!�1���������Kg!�S%s�=���HGK�~��
	���d6�C���[��n��QO%q�G��~�������+�.h%���X,;J��/��6-& �8���v�~�X.���m��@C��R���%7P��wEC�*�RB�a��md-�e�������wDbDiz/�c���5P��G6H��$5�N�����I��C�*�P���F���5����Z���*�P���tT;�0���� kM���od�?�W���_e%�1rL`k�c4oP�='0�r���
��P���tT;�����wEC�*�P���tT;���5H����u�l�Dl�f�;�+Lm�������!��<J�|������ ������M�a}�_�6���'��!�Co���
���f��`�=��
���_��o��@���d3��_��}U�
x�}����*�����D��&{.�!k;Y�f�_E_E_Et��/x���
�������������A�3v'��� |���?w�����T�
"���l�����6J����fn�}�2_d���3%�
��Z�i�U��x�z��A�������w��6L���1����c};;�0�g��v��xhi��q=����8Y�x�C`Z����bD'Z!��,rd�vKd��c4�
��
��
���������(����;���HL��_`�?,������S�U�d�x�����v\5q+���K�
�9�SrU7$fF<-�H�@`a�6�nU7%SrU7%SrFdd�%��������o*�����[��nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���21����\U�p!1�Z����������������gxwY4cF.��4`=����tg0��<��+���������8��W&5����Y�����Hw� �L�)`�bz#�~ �_���GHuO��)fEH	����.{���2Pa�p�m�Q����=~^&46���(�0C�����TB���Q�78
�k��F�
�D�gg+f�l��;x"m�&����lC\��6 m�&����lC\��6 m�&�;8)[5KgY��l�4u��b���l�5Y��J��[8 m�&�;8)[5KgY��l�4u��b���l�5Y��J��[8#�pwEY��J��[8*���f���np�7���f����
V�R��s��*��
V�R��Vv�D�4Mss������l@�4MVvpR�j��
�����Bn&N�x�x��%�=�cv��w��H�G�`�"Q!�f[�Jc���9�7�D�����gd�SbJS��M����)�\2bA��)^�s^���N�7N��d���;����G���������St��&J{��zg(��[�\S�@Hn��`���Q#�wt���VN��Bk��L0��pj�H�[���LH��j0���d�x������t���-.�����A�m�3n\� /%L�/pui.r��i
���<8���4bJ`�"C�2�l�])����qx�i�8���0�-���MPM����-��.i��K�3�x7cq�>}��@�a8�a��zq�����82-���a�����)��]�a��_�~�v=��b�Y>��:�v�Aq�
�;�h`���� ���a�9��C��:��|�A���^�� 8�FtWB�6$��odXb���&�g�>6���2�n6���0 ����4`;C$��^�����%'5�	��������Y����I{�(�������Y��0����,��� ���'$�Av�Q��K�?��V$�H�hx'�et�����8�$�7�X�'.�[Q���L�BF��m1�
�
���%�"6����O��	�8I<E�������D�e�y�$����L�� �"	a�������Qc��q
xm.���?)F-�r$J-8%�G1��2PK���B�@��\Qi���3���g���&Wz!%������`�pv�jr%��W(n{]o9�~�Wr�CKo����
H�Q��v�1�[�L�;��6K�i���8�2�7G��z;������f��-v��	��qe�n�=Q��w��	����!��u�v>d&1���8�P5�h�l��������G�Z�78�u�t8d�;^���g���x30v�-������0t�g��������,!1a��AQ�q� @���0��P`p��?!��\C�p�SR\���oq�T{� �#z���}j�,+�]@�8+=r�\��,��=r�\��,��=r�\��,��=r�\��,��=r�\��,��=r�\��,��=r�\��,��=r�\��,��=r�\��,��=r�\��,��=r�\��,��=r�\��,��=r�\��,��=r�\�F1�-L7,~V��� �(����s@�u�a�����9����_��j�����~M^��������}�(�NvG�����IAgV��_��j�����S�����4
�oe~���%�
����b��#�m��e6YM�Se��e6YM�Se��e6YM�Se��K���l��,��)��l��,��)��l��,��)��l��,��)��l��,��)��l��,��)��l��,��)��l��,��)��l��,��)��l��,��)��l�

��OL2�1�F�a�R�������}�F��������!3tq���T^�A�z���������!o�yk���k��%Qjz�k��f�4Y����3h�)���G���Q�y���9,:""Hy��h���������[��"���(��EG�k��f�"`���_��eI�yFC,&6��Lo@%#U<���b�5(�4Bn��tE��S�V�:$b�Q\S�2Yc�X�:���c�X�:���c�X���H���b���,u�b�X��,u�b�X��,u�b�X��,u�b�X��,u�b�X��,u�b�X��,u�b�X��,u�b�X��,u�b�X�'�L�p�����m4,X��b1��{_���+�$u2��N��s�|4(�@��RTc�?A6��p����;_���U�
3@�*:��([�r��B�rU��:b��c(������
�����������u���V�0�FG7��X����>�t+���#�����v{�g��!C>QQ7b13�b�n�bf%�M��0`���������}XF���r���$$���I�q�� ;@_p����t*Y�����xp���,� �f���v#0����}�����o�7�����sE�i�a^^�_���U~
��U�*��UW����U_���S&1�?���T���EW����U_���U~
��U�*��UW����U_���U~
��U�*��UW����U_���U~
��U�*��UW����TYFDX6��o���C�#�W��t�`a�QB- �Q�Dj���U�*�sL�i��1�~�~
��BD�U~
��F9%��+z�������w��'7x�gD�I����uY0E�=Z(� �����H�&�D�,�����v���<;�R����C��;�M$��6T0����@�@ ����gk��q�xw>��6��1H`N'��h����`�R=����{~W&�N|'���F�<38L����,��~��� �s�����v(0H��0���,acX��0�H'R��%!��acX��0���,acX��0���,acL)�Y�11�A�0��<�	�!�$c�-�X���&��R:�	��X��0���,acX��0��6��� ��!��.�k��a8�L�����������<�N�E��:88�����1��b"��pv��%��p�1`�H�BN!
+����1����^�FK2E ��8�BaB��o
�	���p�������������s4�� n�eGs9�1I�H>�n�u����M'\B
X����tP7���g�o.�!�6��d�;��g��
�C�|�'�me#@���d`@B��������jQ���:@ �d�*��(0�
Q`�-""��<��>�A������tOd���?��NF�@AX�����v�=�s�Ohx�bQ�@1C�rE����Lxuxw��~PUo�?�Ab�z
�����H�!�e:���>����q���q�����!>�zw���	���0H�� �����-�C�;}�A���!2E@�=��}�K�u�d�h D��b
���@�@������T��!�o�3z3Y�������_�����#��\S��2�a���������F=��"�"��D�E.�B�xH�<��"�1�(@��J�>
*|T�(��QS����D�oy�p���O���>
!0��A�pQS����EO���>
-`��'���EO�����.A�F������������B
dL��S����c�v��>
*|T�(��E��� �b��?h�>
*|T�(��QS����EO���%�|�fwu�|M��rP��>��*|T�(��QS����D�CW� \�r��2�H�Jq��q�TQF
��Hc�(���$1���88�	�(� L�%
�X���v��H�;����`8Y�+��F�������	~�FY���2.{yG�������y��|�����v`03PXI��D����y�I�@80T��G��
������4��O��	�0��7@>�����=�U���k����a�����_N�}��3���0h-��3B@�T8G�(6��t�9�Ljxc�����|q����`�;�Q\����"��0	,`wAs���0;��y�f��)�b;�A�XD����=�j���-�"�L����!��"�L����!���M�f@A\�`P|������(	+�a������ud4X�7�������l�z
��"KS�yF�����w��V�}��?�+�	'�M`o��4h�5�#���!��0H����v��0
T����L$(�'�2���b$)�(@!K|LJc�C�#M�B����H�$������|����zt��9(yY����L����.���+�.9�]T,������;_��X$��z���bWw���94�7�a�������\�O��[��-����K�������>�[���!�H��S�T�
��P|&ixT�t����;�������,��z@=���%ADAn_Z��
��BEa�����=r�\�l`	k�8��	��"���z���WQQwL�N�w��g�DJ@1d�or$B%�G������,��=r8L[cT��a"R���P���g�Y��z���; po�sh�M���W!�P	F�9�7v�x��E���Us������]����z���g�Y��z���g�Y��z���g�Y������_P*�c{��]m�sD�b�l`(<;_���&%u�����}�)����`���=�jQ��[O�G�����<M�b����M�0�k>�|�s$�c�\�r�W!U>$O���\�@�p�����;_�����g��x/��tx���m/����Zi�*"X:pC�����)�c
���������m�>���O�S�4~���j�����!0DL
�>�=O�I3�U�
# ��0�'��@z_�Em�8��Ba��yA��&� X*�QW%�d#W9���`���j��J>�)���#�RPY��;_���yRQD��o������BfH�'�!���M��@r�A�Fz�Y���1�,ABP�������ff`(Y����	��, �e���g$h@-����2=D�!���>U��0-���l��E�Xi��,���)��n���g�;�D��5P�:������)��n�9�7��#m���xuZ+��\U�u�]K���n�����)��n�����)��n�����)��n�����)��n�����)��n�����)��n�����)��n�����)��n�����k5W��0Zv�d`�i>�#SIwV�:I�S� �t�Hf���/�"	��:�c�e*k��N��"M�0��#���dI��������d�H����y�&�i����fH������`��vDv@��3	��&�R�`���,���@�1
�4�_�W��YTQ��D��=eQqL��hV�E��55*���hzQ�%(�)T�=��l����&�c��[d'��q�g�
MJ��n�Fy	J'
U{p�������Dg���p�S���-���*l�A��Am��@���E��55*���hzQ�%(�)T�=��l����#<���*�@��Qm��Sd�}�l���N4�,����T��C���!)D�J��!��[eE�T�I���<�n��*/b����t�	�(�iDY��SR�;�����BR���O C���������	!��y�E�T^�M�H1��-��Q8�������Pw�
J#<���*�@��Qm��Sd�}�l�$�Hj���2���6	����L�a�L6L6L6L6	������0�0�0�(>��l�l�l
�
�
�a�a�a�L6L6L6L6	������0�	0.��� �&&&�d�d�`�l�l�l
�c��&�d�d�`�l�l�l�l
�[R���~�0�E�8mc�x@Ft��B�v@(�5DH�H��8�9c��f��{�������'|0@�j��^�Ao�E�`6��B
�!��W��!�%�	�d���&�����Z,�b��&��z.L ��M�	�H�)��Q�w��f�f�0=fD�a�$��2	����P ��4�`P.�@�.88�b�$$o��2N!H� �X�N F��J]um��qHI`5A� ��H��D��h�&�b�c!��1E��i�z*�p�u�'�N�����
'�}������0��m��rhIG����C��H �������O�G4Gq��}��9����jxW�B�f��F�F�l�G�=����"��`i�@pf
cwm��G�A���u������-;�F��A�-�D�a�KRw)��%�p�N�0���k	�'ru)����d0
�R��c7$�N�!E�
�iB[�N<���� ����&A����`��"��b�D  �@�qC���1����;��j�� 2��Q �P�X-���@�$�����L|"$@$���*!��&�HF%���g@
�
q�� ����	�
�a��R�I(�$`R"y�k�2�>��pP���js�x�	���<��!�H�u���p��q� ��@P<>��:�	��O&mx.�lz�A�G�a�'�6�<
Z\�d���@��`�8#<�mI��5>9x��Ht�[����`�� �G]N����f��QH.��� �tG��R��Dx`�Tl���������4�0�0�0�0�0�0�4��<��<��r�<��<��<��<��<��<��<�O<��<��<��(��<��<��<��<��<��<����<��<��8q�0���0�0�0�0�1<��<��<��(��<��<��<��<��<��<��^��<��<��(L0�0G0�0�0�0�M,�_<��<��<��(��<�U|��<��<��<��������<��<��8��<��W�<��<��<��0$�b��<��<��<��0�0QL0�0�0�6%����5��<��<��<��<��W�<��<��?��?����O<��<��<��0�dAL0�0������0�����O<��<��<��}W�<�����������z����4��<�
$�5�p�������0�
<�����
<��<��<�7<G���':��~��Q��mw[�4�,1�<��(����U�}�<�W<���7�<����<��<��8��l�����C����L0�1O<��<��<��(��8��^���~�<��<��<����<��<��$�IK��uOkS�0�0�0�O<��<��<��(��=�s<��<��<��<��<����<��<��<��|��<��<��<��<��<��<�O<��<��<��� @����<��<��<�<L(��,��Q��4��1�<��<��<��<��,��<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��+!1A@Qa��0q���� ��pP���?�\�����|Olb�r�wL=f���(��=����������9��s�{��k:t}�$�a�A�L�����=���)`Z�J<�S	�!�����f_��������S*M�\l��Un�XJ��]��[���;���lJQ��bU��:�z�E
V�(�vj+�w�1jW���=�lK�oH'�	���aq[y����>�(�D
,G�y~�R�& �;BZ��BA�Y��b�y+�\���^��������+}'!�r�z��
�*��{;�����������xL	�r�Fs��cN~<�����s��d�������������l(8�y\�\4�9��R��1l�;V�m���7��s~�����K�l�����r#�`����s|`��(y[�WW^P�h����L&	���`���Mv��Z�F�^���yJ��P��~���aY��c���*������e��7,w�������I��@�O���F��"��&,�h��7{H�Y���<���f<gxt&�����c�D��N3�T��h8��U sx��M��q��n�:�H)��Q��xo����e�����F��)~��#g�/�4�ylT������]r�/��~�k�U9���0c#�H���(������+!1AQa@q����0��� ��pP��?�
������!�:�����2�?
���7K\�R�f�K���������zP/-
���t��N�n�_����)Yh��u��Z��������*��%��_i�5`�� g{�����Z~*�;�����P�9W��f9�j������
������PR����e��Yv0�)e�v��Q
;�_kOH�����w�v	��1B����7@%N���J��?��&(���BC�k��S����
�'�qz�nU�8?%��Ao��:��ky������H
�j�.t��t-����98y�^W�������a�����\�{U��_�K!��^d����EE,������]����uv��S��PS������T�����as��L5���cp�i��{U�K	r�����l�
�N(�*[Yx�*���;,��Uej����*�oY���T���6��+X��	,	�r��U��j����#3s�i2��P'O�i9~�7���2�]V;���{�D�`�d���
n���t��a�E&[��W(�a��"'��/��]���X���M�s�9y�*���w��7����8�k�l��<�ia�a�������7#����,�OV\%�����)� =�>-����y�V]��xL�{�d~�+����_4	5_�W�TBM[Ue}������,����H���1,���*�
��*vT#�e��L�����w��,!1AQaq����� 0@�P���`p��?��k�$�N��^2�fQf��&�O�b�D�^`M�H
�v�|4�qa!k��+	��8��I��. @� @� @� @� @� @� @�,d����h�f�H`"=FL��B�@�\5��Xz�+&`��]A��V,��f���@��������~c��?��/�~����������l6Sw�P��-@�}1��x��7��G�1����?��/�CPLR^7V�HBu��,LR@��h';�:]6�?s78�s���&mW���:*� ����*T�R�J�*4iY����*T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�R�J�m�B�xQ����!�M�����9b���Y%�J��KQs>8��Z��	���^p��G/�����L/��[��O��^o�����G �������[4"w���������z�c��4�����@��D��Z��4�:Y,L��x��#0%V�0m����Hu�4�4u@JIx��@��
RK/���Eu��Y�g���}�C�&V&�������8����z��
pI8��FTj�ceN����a�3���*��	l�"g@�L N�\&��H��(+E��39Y@n���!\�HZ�U��pD�N=T�U�� @����:<:C8S$[���) @� @� @� @�&J
�Y��;p��k�0��kfhA*���HI-8o��^n����~��R��tpz�bQ�����N�~�j�TC�2~��"�r��<���I<���g�����|/��y��O�� M7~�W@�v�����I�:>3%��5$�2W������T�"�l�=��LQw��K����>����>�����g��A������&g��%C����6���t�3��c9�2zNTQ?��4��0d7�����T~3Z�V6q���'�${�Z8��h(�#wXZ8��h(�#wX��6��0B����������f���~Y�i
k���"Iq2����%L���G U�C|v�8�c��!z6��������2����{(������4�����M�������B]-��������T���V����-S�ZB�J�X�p
[��d�VB�^C��V��~�J�*T�R�J���O�}{*?�)�����u*T�R�J�*T�R�J�*T�R�J�*_���N��zct�3�HF-]�"���E���za��T�pa��
��A����$����VL<q(�P�n�I�:z��K���R���&���^_���>��������Q9Q|�O1��
�B']s�K��'����&@6�����7��?C�X@��r��������K������'�����������`7^2�x���;����q��c��z�l���_9�V���u2m���^������}������o���[����"
ZD�i�8�-[�ho���
����xPm����/�WS�:����H5[���X�ix^��	0#��I��60�`��
���{�����^���|�W�|+�>��
���{��B}s�^�����>��
���{�����^���|�W�|+�>��
���{�����^���|�W�|+�>��
���{�n�!^�6�FB�'"�;���|+��5�=�����4���r�|s�^���|�M�-�\���|�W�|+�>��
���{�����^���|�W�|+�>��
�������t�TQE�����fQ�$�S:�Ee�aP�&�{VIl����D�#����9��&�T�l�	f�-���)�����u�Z@'V����`N��������L�|��Y�pdJ�����;��v��rr���M�h�,Q�MQ�������@@"D�m�&�I��q��^�%�#���2p	R��Ma1)�����4��?��J�����>�����K��j���I6�������w;Io�/(���n������;�sY����K��!�J���2+�`�����
:��_%HxMV0�0����2J3���@���V�c��s)�T{B�^b�A������}Pu��.�8 
��e��[���"�f�$����>�&H&9�H!{���=�����H�VRmO�5��Id���P-����r"$����)+����.$���2���I���[��]�u|���X�)���d�!W�/@���Xr�x�c�2
 m;��3����\&'����H�7�3IQ�����N���_Xa��C)X��$�,(�	��B������9w�e����I��+Q�26O�� ��������^e�>u�q��=4>�Gh^Q}����0��j�Z@�_�<8-�B��#����5��&�y#0D#U���X�[$DA
��u�o=�4a����������;9�9D3��.����D�BOI���H��a1����n&���b�%�JB2$�X�����1�8,)��}��*T�R��C/������#��R����4wC!8��R�J�Hrp�I���mJ��0i�&��6���*V���Zt��z9yU���=��Ik�%�vT�P�&���jT�R:3�|7/�l�B���h�*�*T�R�ER����-�?��S�t�����@<t>����T�R��k;�;z�6u�l���������L�Z����cQ�IX���
m{mX���
m{mY����U&(�h�L��r�_�)�7���������!�t��zD���C�%��&��&f00ZIEF�5�<���KX�)O��9D#�8�NH
�H����
`@4��b�,�)�g�����	W.:(�����=����V�W�������5���0:�+�+�h&kn�N�g���/������'��T����a��� �y����i�O�6�(�`=^� �:���;����I�@�[���=\����?�����::�H��q�z���-��kBt��sR���\���$���
�:]���?�g�,��@d+�)
�(L``�5����`�5����;�&������H�C �$���l
5��P�y���uL���RI3�z���!T�18�k����N�Je�#@�tW��^\���|8` >�>�
A'MO5�zW�]D�����
M}��@tu���A���q5��!��Rz����'����B)N��l�+!�����*�R���$Td��O����K��������=S�P�]�h�:g���[��"pN�*D�g��1�^S��.p������EB������t��gv�y����a�KaJ[eF����Uv�����k$���� ������������lQ���X
`<[��T���-`(�A��@��h���J�����kVY<���@E��*��#;�>��w��1�^L&
V�;N���'��8~�l} 
*^�^��,�u�5M������='�z?���:p��~VzURT8m����y���b�\�[�4z'���|�N'B��_V	a���Vq�BB��O@|p8���X��G�;��w�}K����9X1�b��� �up{m�@W;����k�	]��XNH�a�������@�H�E���@����*
����T�+"�����"�
K����!��z���B
��E���0���x�Do���q+O�}s��Z�.0	��[Ai[�}WZ���UT�tQ%N�
!E$��y��u>=��>�=;���'��|Cs��L�V4��p9��F$��x��B���_������"�� @� @�1��_�!�	IVZ"V���QK�7r�������}phq���>�6�'z��,���4��I;��
���&ydO��A��vM����q�R�������/
��:���|$�#��>B����.�Q����StZ��Ee�8h�����g_�����Qg�'������{��/���7�_�(��Y_��_��/�&���{�&��>��27v�x����.�|'��5����D�pv��wjn�K��O|���������|w�>;����T3k�)�L�bfj���R�������R���xv���eLS���vN��VN�1;�d��_����pc�RPUq����R]~��_������H���Czcw������8Xw�Fn%*�.O�'�|�>	����O�j�{��jL�m��$��~����+k*���c��AQ��x�(d}�=����V	�	F�y����
.�
�W�"N���������d���q�P$2i��ON�����
���=���@�	iM�U��>i��4���{��bP\��<C�����3�����>w����3Ir�������fQ��,�D�T>�hO1�Xk$�}'K��bc'�bF��w����*��:'DZ��RSJ���rl����*�L*=]���$������Ymk��jd@ZL#M%��gfSKja6�G�T�R�z���7����-�(v^F��S?����?�����*T�R�J�*T�R�J�*T�R�J�*T�R�J�*T����W��WKQL$  �u$8 ���|q��0���w�m�*XN��C!�Bh�4����EEz��L�����Sz�1���K���"LG�sB����uZ(��}c(��_�}K�En<H<Lx�%BF�x���L���>A�[�|/��v���f���g�~ZF�j�N��z������^^~���	��i08����~Jl�m+�c�!�q-Y���D�G\"`�E����,%�k/V�r���YP��a8"y��V��:)���oz���<�������c��'s^C'���]8���������W!'E1��x-�[����06���A�luh{��v����L
�ept[|Q;��>�$z��F�<�&���	:)���oz���<�������c��'s^C'���]8Z�� ��8��w5�2}hH���>,��yM-ertSZ������x)�SkY\�N���O�	�p98��|Q;��>�$z��F�<�&���	:)���oz���<�������c��'s^C'���]8�p��>(��y�Z=t���#cGKY\���V����lwl
{����WE�����!��BG��q�dlc��rt9����t�.���(H���������������WDm�(���L�a33��2���+�"6�5)l��hR�+|�(���������Jd����"#n1	�V?��&31�� ��r��"#n3R��[��)���������B��i�L&fc}2��"#n1FP��e�	���L�
���\P���Ke��B��[�q@DF�b��kn4���TV���`E%���(���L�a33��2���+�"6�5)l��hR�+|�(���Q�-m��d�ff7�'#�nu�B��i���D���������	q���Z�4)L��WDm�(���L�a33��2���##�n4�&31�����O��'����dx����I�v����(;6�*! tGvW��C��7���J�
�Q�i��� Q����<���4�
��
��!H����Xe~J�B�/QRb(M"���z�`����@UMJ0hN�XC
$�EI��]Q`^��6n*!�����J��-��ld�4
f��n�0��)�!�F������IEI110��h�,	`����2t��AD�e�3����S"��
��@Z�eD
�A.d=�uQ`I�5���`U��B�Q)#d&f�7��(�D�$n�Z�j�_��(� AMBHD����!�A���,	Ss�1y�������j��?]&z�b�T��dQ��4+P$��Q�l�	�	%�X�U�q=���2�@���+�!1��<D�
	&�+��@A��.�A%���Cn����r�7���?�'��e�mX{�,4
[2OLO"�YaFW1���IH'�O�Z@lE��2��
L����\\�Fp�hlc�# ����������Mo��0))��N&0!@���e���8p��eN�.��II��K"��m�!(Z��@�e����I�8��0�P	�L����Z�F����r�zrd)/T�����I2�TuU\�|��7��
�e��@Zu�D!R�eB����(���������	P�]$2� W@�0Q	q�U��
s:��b����+Um	��(�1�(����+r��F*�I ��*TKX���`N��X$�C�FX�����4
����u2
*7��I�,.B���bb&Z �t��{������
�#�
Q�E�	�
��$)���04P��@"Y�S��Uf]��)a!�~���@�4I�+5A�b��h`Y�����d���z�p�@L���8(
�G�1:��4A��nZ$d������.
*D�j�1�.�1d}��(q����!Ba�"J.��
*D�j�1�.�1d}��(q�����b�
������"X5D���l^N,i����<xMJ!bS��
hPe��@*����
�#��y-���b����k�86&�LHXz�>,��uA�86&�LHXz�>,��uA�86&�L��ykW�������
#31John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#30)
Re: add AVX2 support to simd.h

On Thu, Mar 21, 2024 at 2:55 AM Nathan Bossart <nathandbossart@gmail.com> wrote:

On Wed, Mar 20, 2024 at 09:31:16AM -0500, Nathan Bossart wrote:

I don't mind removing the 2-register stuff if that's what you think we
should do. I'm cautiously optimistic that it'd help more than the extra
branch prediction might hurt, and it'd at least help avoid regressing the
lower end for the larger AVX2 registers, but I probably won't be able to
prove that without constructing another benchmark. And TBH I'm not sure
it'll significantly impact any real-world workload, anyway.

Here's a new version of the patch set with the 2-register stuff removed,

I'm much happier about v5-0001. With a small tweak it would match what
I had in mind:

+ if (nelem < nelem_per_iteration)
+ goto one_by_one;

If this were "<=" then for long arrays we could assume there is
always more than one block, and wouldn't need to check if any elements
remain -- first block, then a single loop and it's done.

The loop could also then be a "do while" since it doesn't have to
check the exit condition up front.

plus a fresh run of the benchmark. The weird spike for AVX2 is what led me
down the 2-register path earlier.

Yes, that spike is weird, because it seems super-linear. However, the
more interesting question for me is: AVX2 isn't really buying much for
the numbers covered in this test. Between 32 and 48 elements, and
between 64 and 80, it's indistinguishable from SSE2. The jumps to the
next shelf are postponed, but the jumps are just as high. From earlier
system benchmarks, I recall it eventually wins out with hundreds of
elements, right? Is that still true?

Further, now that the algorithm is more SIMD-appropriate, I wonder
what doing 4 registers at a time is actually buying us for either SSE2
or AVX2. It might just be a matter of scale, but that would be good to
understand.

#32Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#31)
4 attachment(s)
Re: add AVX2 support to simd.h

On Thu, Mar 21, 2024 at 11:30:30AM +0700, John Naylor wrote:

I'm much happier about v5-0001. With a small tweak it would match what
I had in mind:

+ if (nelem < nelem_per_iteration)
+ goto one_by_one;

If this were "<=" then for long arrays we could assume there is
always more than one block, and wouldn't need to check if any elements
remain -- first block, then a single loop and it's done.

The loop could also then be a "do while" since it doesn't have to
check the exit condition up front.

Good idea. That causes us to re-check all of the tail elements when the
number of elements is evenly divisible by nelem_per_iteration, but that
might be worth the trade-off.

Yes, that spike is weird, because it seems super-linear. However, the
more interesting question for me is: AVX2 isn't really buying much for
the numbers covered in this test. Between 32 and 48 elements, and
between 64 and 80, it's indistinguishable from SSE2. The jumps to the
next shelf are postponed, but the jumps are just as high. From earlier
system benchmarks, I recall it eventually wins out with hundreds of
elements, right? Is that still true?

It does still eventually win, although not nearly to the same extent as
before. I extended the benchmark a bit to show this. I wouldn't be
devastated if we only got 0001 committed for v17, given these results.

Further, now that the algorithm is more SIMD-appropriate, I wonder
what doing 4 registers at a time is actually buying us for either SSE2
or AVX2. It might just be a matter of scale, but that would be good to
understand.

I'll follow up with these numbers shortly.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v6-0001-pg_lfind32-add-overlap-code-for-remaining-element.patchtext/x-diff; charset=us-asciiDownload
From 5d4d91d169b973838c99e8d4fdadcb09df36a6ea Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Wed, 20 Mar 2024 14:20:24 -0500
Subject: [PATCH v6 1/2] pg_lfind32(): add "overlap" code for remaining
 elements

---
 src/include/port/pg_lfind.h | 103 ++++++++++++++++++++++++------------
 1 file changed, 70 insertions(+), 33 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..22a3711ab5 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -80,6 +80,49 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
 	return false;
 }
 
+/*
+ * pg_lfind32_helper
+ *
+ * Searches one 4-register-block of integers.  The caller is responsible for
+ * ensuring that there are at least 4-registers-worth of integers remaining.
+ */
+static inline bool
+pg_lfind32_helper(const Vector32 keys, uint32 *base)
+{
+	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
+	Vector32	vals1,
+				vals2,
+				vals3,
+				vals4,
+				result1,
+				result2,
+				result3,
+				result4,
+				tmp1,
+				tmp2,
+				result;
+
+	/* load the next block into 4 registers */
+	vector32_load(&vals1, base);
+	vector32_load(&vals2, &base[nelem_per_vector]);
+	vector32_load(&vals3, &base[nelem_per_vector * 2]);
+	vector32_load(&vals4, &base[nelem_per_vector * 3]);
+
+	/* compare each value to the key */
+	result1 = vector32_eq(keys, vals1);
+	result2 = vector32_eq(keys, vals2);
+	result3 = vector32_eq(keys, vals3);
+	result4 = vector32_eq(keys, vals4);
+
+	/* combine the results into a single variable */
+	tmp1 = vector32_or(result1, result2);
+	tmp2 = vector32_or(result3, result4);
+	result = vector32_or(tmp1, tmp2);
+
+	/* return whether there was a match */
+	return vector32_is_highbit_set(result);
+}
+
 /*
  * pg_lfind32
  *
@@ -119,46 +162,40 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
-	for (i = 0; i < tail_idx; i += nelem_per_iteration)
+	/*
+	 * If there aren't enough elements for the SIMD code, jump to the standard
+	 * one-by-one linear search code.
+	 */
+	if (nelem <= nelem_per_iteration)
+		goto one_by_one;
+
+	/*
+	 * Process as many elements as possible with a block of 4 registers.
+	 */
+	do
 	{
-		Vector32	vals1,
-					vals2,
-					vals3,
-					vals4,
-					result1,
-					result2,
-					result3,
-					result4,
-					tmp1,
-					tmp2,
-					result;
-
-		/* load the next block into 4 registers */
-		vector32_load(&vals1, &base[i]);
-		vector32_load(&vals2, &base[i + nelem_per_vector]);
-		vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
-		vector32_load(&vals4, &base[i + nelem_per_vector * 3]);
-
-		/* compare each value to the key */
-		result1 = vector32_eq(keys, vals1);
-		result2 = vector32_eq(keys, vals2);
-		result3 = vector32_eq(keys, vals3);
-		result4 = vector32_eq(keys, vals4);
-
-		/* combine the results into a single variable */
-		tmp1 = vector32_or(result1, result2);
-		tmp2 = vector32_or(result3, result4);
-		result = vector32_or(tmp1, tmp2);
-
-		/* see if there was a match */
-		if (vector32_is_highbit_set(result))
+		if (pg_lfind32_helper(keys, &base[i]))
 		{
 			Assert(assert_result == true);
 			return true;
 		}
-	}
+
+		i += nelem_per_iteration;
+
+	} while (i < tail_idx);
+
+	/*
+	 * Process the last 'nelem_per_iteration' elements in the array with a
+	 * 4-register block.  This will cause us to check some of the elements
+	 * more than once, but that won't affect correctness, and testing has
+	 * demonstrated that this helps more cases than it harms.
+	 */
+	Assert(assert_result == pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]));
+	return pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]);
+
 #endif							/* ! USE_NO_SIMD */
 
+one_by_one:
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
-- 
2.25.1

v6-0002-Add-support-for-AVX2-in-simd.h.patchtext/x-diff; charset=us-asciiDownload
From 7e7781454646992218a990cf75f0654c67ce2dab Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Mon, 18 Mar 2024 11:02:05 -0500
Subject: [PATCH v6 2/2] Add support for AVX2 in simd.h.

Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/simd.h | 61 ++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 597496f2fb..f06b21876b 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,18 @@
 #ifndef SIMD_H
 #define SIMD_H
 
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+/*
+ * XXX: Need to add a big comment here.
+ */
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
 /*
  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
  * that compilers targeting this architecture understand SSE2 intrinsics.
@@ -107,7 +118,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 static inline void
 vector8_load(Vector8 *v, const uint8 *s)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u8(s);
@@ -120,7 +133,9 @@ vector8_load(Vector8 *v, const uint8 *s)
 static inline void
 vector32_load(Vector32 *v, const uint32 *s)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u32(s);
@@ -134,7 +149,9 @@ vector32_load(Vector32 *v, const uint32 *s)
 static inline Vector8
 vector8_broadcast(const uint8 c)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi8(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u8(c);
@@ -147,7 +164,9 @@ vector8_broadcast(const uint8 c)
 static inline Vector32
 vector32_broadcast(const uint32 c)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi32(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u32(c);
@@ -270,7 +289,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
 static inline bool
 vector8_is_highbit_set(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
 	return _mm_movemask_epi8(v) != 0;
 #elif defined(USE_NEON)
 	return vmaxvq_u8(v) > 0x7F;
@@ -308,7 +329,9 @@ vector32_is_highbit_set(const Vector32 v)
 static inline uint32
 vector8_highbit_mask(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return (uint32) _mm256_movemask_epi8(v);
+#elif defined(USE_SSE2)
 	return (uint32) _mm_movemask_epi8(v);
 #elif defined(USE_NEON)
 	/*
@@ -337,7 +360,9 @@ vector8_highbit_mask(const Vector8 v)
 static inline Vector8
 vector8_or(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u8(v1, v2);
@@ -350,7 +375,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_or(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u32(v1, v2);
@@ -368,7 +395,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_ssub(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_subs_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vqsubq_u8(v1, v2);
@@ -384,7 +413,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi8(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u8(v1, v2);
@@ -396,7 +427,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_eq(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi32(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u32(v1, v2);
@@ -411,7 +444,9 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_min(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_min_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_min_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vminq_u8(v1, v2);
-- 
2.25.1

v6_x86.jpg (image/jpeg) Download
����JFIFhh��C		

 $.' ",#(7),01444'9=82<.342��C			

2!!22222222222222222222222222222222222222222222222222��t�"��������5����n��y��!�������U	�&���j	�&���j	�&���j	�&���j	�&���j	�&���j	�&��g@'I�	�t��K�su�>���6a�U	�&���j	�9c�a�<�MG�9�&���j	�&���j	�&���j	�&���j	�&���U��'@�����^.P�'I�>k�C�������d|�5J&(��b��x�i��XY%zb��&(��b��&(��b��&(��b��&(��b��&(�RUt�\�l�5��b���t�?z#n���5�{��1�Q1D�O��������VN�D���b��&(��b��&(��b��&(��b��&(��_���MQ0��0�F%.!p�'@N������u�����z`A�Q�cC��Wq��.�a����	�s��j6��cT�b��&)P��H{�G+'�oLQ1D�LQ1D�LQ1D�'eq��.�>r�8[��f�����l=^]�T��^_��g�
J��O�v�kvA:N������O�x������z`�a�\k�mveNq<��ck��r��t��;����X���n[���:���N����U<��T��
�<����������8OL�k0�'@bR>sG�,\�\�5��pi��fCq������u���z9n���<��?���>�N��������<i���>�po.��=�����h�2����{�P��j	��"9~k��Tad�+�sQ���&���j	�&��>gq�����y�iu��e��_�����S�����I����"���vx��,�����%Y�B;t�������No�����k�M�Ccp��	�t������G��\�\�5���|�O�����Hjf�w�[��-��x������^4�7L���@���]���]���z���o`=�0'I���^���gy��������`��oF�}��6L�r�_O�L
��U	�&�9��S���5�f����W�����G+S������l^\������O���#������F;s�������Ty�PML(��?�V[]�����-����
w�������z�j	Vtt���Tw�=�xy����{��)��54��V[�|��[}�T�u�����^=t��`wW����vMl��n�|�m478o[���,����8��}������|
d	�t����?K�>S�����Sk|��r=V[�>co,�ll����8���|�F'S����������������&9�y�������V��;OZ��;������o���<��d�����b��&(��b��%P'I�s[^w/N^��=>.w��{l=r����<y-x���/��wEXs�[�����
�����t���������)�����9����Ep:'I�����u
.��Lm_;��8}%r����n���;y���M��s�#��^����>�'��v��.���e;��~h����sD��LQ1D�LQ1D�LQ1D�
J��F���7/���`~Y��;���x�z�m��8n�[�/ t����4��,,���t�p����j�MA5��&���j	� MA5��@��j	�'<�5�Pc��j	�&����PMA*�
N�&�o�����L
�`�w��0����7�	�����A��
.���o��y������{C4x����)�����/ 203@!"P#$%14`D����w����^��%����w�Dl����=��k.�8��q����.0\`��q����.0\`��q����.0\`��q����.0\`��q����.0\`��q����.0\`��q����}�2?k����#��j�2?k��������5ir��jL�GZ���>=�nks[�����5���nksBd�nks[�����5���nks[�����5���nks[�����5���nks[�����5���nks[�����5���nj>�����J�:pD���%X`%���{������L���Hv����I�)��~�;~3���I"���%�� �p�G�z.�4-����Hv�g��NH�������C9��.���.0\`��q�����,c�������.0\`��q����.0\`��q����.0\`��q����.0\`��q����.0\`��q����G�z3�/����������~�;~8�	���h�A4R�G�z.Y��?R��v�1
�-}C��>H3�����5���lkc[,���>\�����5���lkc[����5���lkc[����5���lkc[����5���lkc[��������mMj=�lkc[���;13S�(*����AQxG�z�x��9_�z������2?k�����E���Z�m4���N���R����:��G�z�a�����L��i����������Y������f����k'��SCe�Y� .@\��r����.a� "!���� .@\��r��� .@\��r��� .@\��k�$	A�����7������-K� #�uK� (�G^��� (�~3����=�=�GX�U��M������Z�g
*��W�����Q�m�C��g��:�V�����z����W�,��Y_T��JK���q�$�&��i�Um���^���{�6����7��O�f1��u?
�$L���g�������������bQ�h����TB_Z?k���Ow��v���_����};N}��c-GP�0�<4���Rr%^�_���;}&W ;pF����'����0�[�G����C1(����`'/	=��SFxj|�R��J�oCK}��s�z�Lu|iE{L�?��Fe�f^Fe�f\L���J�"joH�����-�������7I��K�����5���lkc[���9lkc[�,�5���lkc[����5���lkc[������,�����I���]>�Y����1�M''�y�������5={Ev^v5���li��m�lkc[���G�u��~2����Z��TZ3S��|fm���h4�&���������(.P\��2af(��mEnkh������+Q�sK'8[�SsJa=��;6���������#�/w�D�4�	-j:����W�kh�K�WG�
^$��V����D���{����Wt��tw`D��toF����H�1S�����Q��k�U����Np�4F�����8�h����}�6��%���JWl�����GjV��E�����=V_�a������?�u	��	/!]
:�����������Gb��XJhl�3 6���l�e�,�Hy&!f��l�e�)I��e�,�f�������e�,�_������Xg-���j;��B�G&l�e��/1{6Y���l�e�,�f�6Y���2c��,�f�6Y��\�f�6Y�������y�yTu��Q�]J����R�L�:��f} x�+v��5�	��F"�a]�j�J���DF^:�o�h��?IZ�]�@��@+�/?]K�E��	�'&��;�(zC��G�tY3n��7E�^V�i��?���mG�:?mV��Z�Jgq������������1�|U�u�O����D��4�G,^B�j5���	���5��	�gv��.���m��2?k���S������&cc&�zk�oZ����o�8���[�����0����+��5.�qk�����Q��jP�����	��8W���<�<�z5��E��_�]�,���_��f95;C�_K�����limb�3!��3�.�������tj�H�C���C`��,�M�8����f��:�����1�n��_�X��#��HW*
m'M����]]�$����a�����Q76���]b�O�]������.�u����$f��6�$V���/;�0NY2��L�e�"=�����.�u��]b�X���.�u��]b�X����G�xO#X���jM�����)
��F�������.�mG������>�C����wzU�5:����@��"�b��+X
���!��#���x�X������y���?��E-��B�A�4w4�{V�5H����^ze��^ze��@�=���2?i[�%�t�����oWJ���4N�U������4u�c�)SL.��z�H"�aw�S�,�����������C8�I���������P��A�Wh�6Y�����;WZ\k6Y���l���Ub�����m���D�K��B����N����gwa�Vl�e�,�f�6Y���l�e�,�f�6Y���l�e�,�f�6Y���l�e�,�f�>�� rG6Q���f���E�MG��m%��3������������;~f�9U�5`��I�D�M�r��u����"Q�1�C������=���o�3�+C$�,CV���o��OM���qg@
��,`�X2����,`�X2�����e�,`�X2���X2���e�,�2���e�,��2���e�,`��X2���e�,G��]q���4vzM�	����������_���4vzlFvu
��4�#��C�qo#����%Y�(�\}<���q������8���1�+��t�1Q����b	�(B��X��1���u=hH"�!�kE;�8
�XG����� ����V��lu ���C���j01yH9z@��U����PG�O��61!2R"0@ABQ Pa#$3p���Sqr����?������������S�G��I���P%Qk�^0��2����;����aykO�N!���{�0f���lh�>��S�6��sm�(���{G4<B9����|$��[�F��An�
�����PI�g�6���7��:h����_������-~��X`�#�f����"�����9�t��$
��C��&F���|����Y�%;�|,Q�0��,-��c�
a�~�kD�?y�s� j!R�
���z&�WM�l��n�5��I]���{G4_8�~������6�cS�uj��-'�E��%����(�����N������kno����{G4�`Ak`����w9�A��I�R�,��z���*�*�����f��v���h��(�2�s�8Vt�T�f�U�1?	`P��O�+���8����L��yLaR�q�	�u���������5��%�m�D�k�a�UX����*1-i�*��R���Y���6�9����.�S3s�@�w���<�2�])����:!�\�uU��/0�gH)���v'b1O{�F4)���k����d���J�6��(���/��6!"1Q023@ABq� PR�#$apCD�����?�f�.f������S�m\�����?);�G�����~SO�i���~P���AsP3���^���7v�������-(u������6��H�H���S�	C���#%�8*�1�2�w�bp=�J�Z�]�z4�c�|te��1��o�S��T��L��1����.uq��h���D��b�'��n��'T�����#�������>����<)v����o�T7�r��RT�y`��kZ�TT�O�R��Z��S�i���a�.�����U�eK_d
Lb:�p����G�5(�2T�������
���*�NP(JM����>?liv����c*dZd���c��w#p`���gTi�N7��3��xL��5�@��2��T^�S|��z�"�i���S�
d��n����!s{��S��r��Ns]O�b�+5i�5$m�MRr���{��F�|�����l�E���,���~>�*�1��p������ml?�YBe��'9�NsYL�9����������R�e��J�S��������������*1��s|*W@�f�&�%<�����2/)�NST���9@��@�����fv<�77�I��i+��R�������l�����a����(#(9��������z�[tl���f���k�m�F77�%��B
2�!1BQ� "@Aq���03a#4PRb�$Cr��`���Ss���?���i|"�d������ha��1c�d�0����I����4l�M�i���E��A|'F$�����qE�����Z��X�jR|>k33���P�D���@��2VmM��[�
���T6���P�*eCl�m�
���T6���P�*eCl�m�
���T6���P�*eCl�m�
���T6���P�*eCl�m�
���T6���P�*eCl�m�
���T6���P�*eCl�m�
���T6���P�/3��[���������[�����~-�}�<��%�Nlq#�"Z�D��9�������F�O�����aak�K�����
[�-���T����R��[uKn�m�-���T������R��[uKn�m�-���T����R��[uKn�m�-���T����R��[uKn�m�-���T����R��[uKn�m�-���T����R��[uKn�m�-���T����y�~�A�E���8�3]Vwa��-��f��3<�,Lnr�d�������+�����|0�����3��[���?�>��,�a0o���F~��6or��Aap��c�C���@e�j#���
j{��>�����5�6;a�q���c)n�(�����2���Z0o=�H����,����,���*eCl�m�
���T6���P�*e�E���)HIP�*eCl�m�
���T6���P�*eCl�m�
���T6���P�*eCl�m�
���T6���P�*eCl�m�
���T6���P�*eCl�m�
���T6���P�/3��[�����^g�����7�~�}�A��dddu"!�c����7���&h�q�k�g����A�W����KMa�!c�j�X�����\�`k!3�i|���d�Ng�:���"���[�M���U6���T�*�eSl�m��6�IT�!�����U6���T�*�eSl�m�M���U6���T�*�eSl�m�M���U6���T�*�eSl�m�M���U6���T�*�eSl�m�M���U6���:+���	kO/sf�UM���U6���y�~�AXYh�Gub���n=xD��7
���"\���%r7�A|�Z��Jf�~�3��[��J���d��(E�g?~�3���I��V����z>g������9�K�d���%�.e�<O.9L�3J�����JI�|��g�Z�_�#��,�������pS|NwsI�B�/8"���Y�]7�5oU����V���u[n�m�m���Y8:����M8��m���U����V���u[n�m�m���U����V���u[n�m�m���U����V���u[n��32��q�[�m���U���
�D��:2�0��H��U����0�����u[n�f$[�*�u[n�m�m��>�����(���dj��.�o_����&�_��f/80	6Z��SbF��	7�.F�|!���^j�T��>dG���@�����#/���}�����9�|��K���k��}[�:>g�������������/�?��F(����CX�-s1�N0�$gb����D�Wa����Pts����~`��N2��?���@cb=��0Z�b�S��o�����!�-_U��@���^�I�����O,v�r.:��Q4���]^����G���
�����}GcO,�c3�0������7'�����YdW:��+� _�����[�����8�2&�k[�To���`~�H1�
h�����t�q"k�L�6GBr���>�O��4���$�r��7��0�r��l�<C�	�~J(��)��l���v�����d��,�u�[��8_�b������S�1������gbk��C�5�q_H_�is���7��^~��;���s��.}
8�P$���lN�&�/3������������&5�l2��(������O%='���7/�����a������h����&6;�r��V�t�8|(�f3��w�c�(����/\��.��-�8���w}DY71�r��U6���T�*�eSl�m�ZH���U6���6dYT�*�eSl�m�M���U6���T�*�eSl�m�M���U6���:#�������������R�<�$#i/-Y��m�M�����&��g�3Ci�����T�*�eSl��C�e�e�l6��h��m�M���U6�B/��U6���T�*�e�}�k|>A���F���<"A���!�fp��z�*$C����������A��>�kZ����7�,��5�{a�?�=���!����{�C��<�z(�������3%2d�����^SL�'�\' �60������Bb�N`K��1�����#�`K����w��:^��-�����A���@kNo���o@hZ>ouHCo����s�H����,�g�!3Dm8������7.�3�JQa9��<��m�s�Q���]3���*B�*G#�@�q�5��Z��P0K]�,���`8��Y��P�B��Z��:S �r�F�}�f�������$��p�h-!�L!�����@��!�����@�� ���=��F"��5j@=�a�3�)���g����L��&����xl���aB��5,N8�:����P�5�!2���i��������N�A�Y�e����<�������"�xJs�9��e�`������s����N
8�y��$B��n�_�mp���V�	[\%mp��N�Rw	[\%mp���V�	M�e��V�	[\%mp�^��?J����r�[\%mp���T�>u�7%7�s���R�;:wx�����'�7.I�b�$L��	[\%u��391��/5��V�	[\%mp���V�	[\%mp���V�	[\%F���%��V�	[\%mp���S�v[\%mp���V�	^g��:�p�R�#s�,l���b��`�e���6$h��`�z��o�D��(�y��7���eip�)��&7k��:\�=��GN�2��!�N��M
&����<��H�e���n��������<��eQ�8��z���fUC�4���>�������-!�^�~���G�3>���vM����fV�<�S����+K~/����F�5�@rhN�_�,?��P��ZB7�����/v��DTz�=W�e�������2Y��,�B�:c��}�k|:����"9��ZO�@��M�H���7���?����!�.��k����(o&�7��Z<���:3�9h�8�	��$��� �y*�X��a�R�X��Ozh�>g���d��7��������1�aq!�	�������
;���S���txFPS��i�MH���M����s��CP@ 9�o�P��>�3���������?S$�P����L�����u��Z�(v3��!85�dOr�5�N��?�'�7Oe���q!��e��U��:�n�U���������J{NR'����c�*���w��Fs�������?S�V�Enz���!�~���n�U��V�D�������\�m!V�E[�n�U��+�q ���/�_x>a}������K�?���61��y�/��/��/��/��/��*��Un�U��V�E[�n�U��V�E[�n�U��V�E[�n�U��V�E[�n�^g�����|(Bn*,wT�(�	�����MQ#������������Q��nS�~n�0����Cg|�|z0ao+�����k�6��2�bl�T�J�-AN]������r��!�38�u���:��G���xGh���G\��n�������i�Ft`Hb��/��}����_ur���^�e������mo�&Y���s�N8��w$a�
����]h��rs�|��z��m���@��<��J������g���a2��Q��������n{�1wI+������
jH�����]�M����������~�����J��)�����"��iQA1"��k�����J��*$XM�/����K�,%��|����!g�=���O�NN�+0��:�NN�*D��+k�����J��+k�����J��+k�����J��+k�����J��+k�����J��+k�����J��+k�����J��+k��3���E����k�1�>8�R����\�y�!h��gP�i?�A������Jnpk���5�N�)��~�}�l-o����3��b���R�U�f���&�[�OA��?
�>��.������g��9�2kD�A�He�m�5�����a����g��F����%�F`���f{�}�u���k���"�����+k���"��X�+k���"�����+k���"���n�����+k���"�����+k���"����b�����+k���"�����+k����W�+k���"�����+k���"��X�+k���"�����+k���"���n�����+k���"�����+k���"�����F�����a�"��L�Kd7e�sfD���iy�l���m����aak�K���9�A�������B:C�O�#)�&2���
�t�
P�Y"	�e}�9��[���Z����k��~��`��~������0"��}r�J��"�Y����3C��m��y�i#���8�N~�q����;9��3�L|a���>������������%��=�������!�)v8���x2�t(@6���j��^����]��}'1���Ee�T���"��Lnr�d��e��b����K�������`��%�aL���
h��K�R�e?
K����-yO|�@��e��&�k��39x����#	�[�A�P�������w���8&qEv��sZx��=������2G��'�����d������d���O��+!1a�AQ�� q���0@��P�`��?!��qa,�#�A��S�U@N�TM��L���@	����<��L�7a!��@,v��av8��)����H��n�4	�rNe)9"������l+a[
�V�����l+a[
�V�����l+a[
�V�����l+a[
�V�����l+a[
�V�����l+a[
�V�����l �0g��~?�;t_7������|������E�����'���.��@[��@x��x@<���%AAf_�|X�?���������tWN���]:+�Et������tE������tWN���]:+�Et������tWN���]:+�Et������tWN���]:+�Et������tWN���]:+�Et������tWN���]:+�Et�����t���n&,X���G04A,Q!�:���*�nC�
'�%� sA���{o�-��v��ok����2@����E�{_�n(����N��"��4_h��<�gDB�h!�[��;����Y>2	�9/i���{O9�9���*v��;ll��Q�y���&�����D�Ip3���X6&P��C
�nI���'$R�V�����l+a[
�N���$@r�)	,�A[
�V�����l+a[
�V�����l+a[
�V�����l+a[
�V�����l+a[
�V�����l+a[h��{_�jB�*v��ok��FG3���*v��ES�;"��}#�S	g6G����zv��,	�
�~����n���h(���cS�H�0�Y��_���������b��C�w��uWN���]:��Ut���U��y��hIx�WN�b�''�t���U���uWN���]:��Ut���U���uWN���]:��Ut���U���uWN���]:��Ut���U���uWN���]:��Ut���U����1`&�~*>����uWN���:y���2�1��`a8�K����L'����]��;_�g�Q_�^����!�1�p@@S�v��ok��^������l1��eh	&0!�0Ra����K�'�^��������N����NL�Xssj�@��Nn�QQp�_O7�p��v��<L0A���5������ p��4/F@��h��n�<	�`NE	������l+a[
�D��@��\-�p	�V�����l+a[
�V�����l+a[
�V�����l+a[
.����F�J[
�V����� "d�J"y+a7�B98���V���;�$������@����@1�H��d���P6`0"Q.�6`0"Q.��M�o2b\�����e�]����(f �/M�����
���v�5U2�#=��hx`�m�.Y����8gn��A!p~&��
����d�����uSv�4���C��A'G;������,>J`�c��NR'���z�"t�a3��N;J&� 'K�bpp���A��3@�� d
���B�g3F)1���N�!�8)���A��k���W�����6����hP{�����nx�c��A ^@(�`y:3g`3����C�J1�0� �H���!h�A�PR���)a����������'$�G�pf@���qT�fU�"�����jY�,0^��Nq���CLj������`�W����264<�����c@������jv��v�U�:���m��`���i�9�
�A?������0H�pAC+��XD�vv�}������8w�B��(�@��r�
�G�d_
 |��P������U���uWN���]:��T�g��uWN���H/�J��]:��Ut���U���uWN���]:��Ut���U���uWN��`��4!����Si��;t ��]:��T3-���6�]�4W����(U�`��uWN���7�9�7����]:��Ut�������U���uWN����|�����d������~ 	%����7�b�@��>�`e+6�2��:as��}�c�o�����p-��6�S	�B�9��0j�(��1	b3E`���1���A ;�	�d�[�����5`	'4W!���1Eq�p�"��##�/��3�@8FE���k��r�������7#�N���I������e�����l��F�:�A�F��@��{F��4Y���4/!�P�#F	N����q�b!~��xgn���������E��h".y3��4<��Ic�W�n(��#��(?+�}wG�MY1��2DS	�B0X$30DS	�B0X$30@)��-L��5��{_� ���{�n�p�*W�6��O8"��fL��c���x���<�f��i��0]n�x������b����9�����&�Q��p����1s`����h���@R�d������3����F#��-/��EOb���=�*{OG5�
�T�(��QS����D�$=�*{T�(��Q2%�$�C�"�j��=�*{YF! �X�E$����B��D��RK�Le������N��QS��l�$��8�>&���EOb���=�*{T�(��QS����EOb���	�0�6��EOb���=�*{Br:�{T�(��QS��'~�|�*���"��\�1&%�rP6`0"Q.��M�o2b\��������Q���M��G&_*�@K�=c��G`��o
}�B�}9��������f�����n�Qe�.$t���'��"�@�WZe���"(�T�7))p_��W&
:��v��ok��CH�C{t 0`!O�4���H&�<��>I������vuX�38k�K�M�d�2@�0��c0t�XC�����6�z%��RR���$*��{G���S��Rc� 	!���)���NXB������%?��G�%�`�f{�
���Pc4��=@x��_������@��GQ�
<�us�������0��7�Du��	W�&�@H<*�����(�Q�.�����g��v��L�JP~�c� ��b~���,���@�����&O0���x��8m��}��E'��=3��:�`@A����=7�
<78�8C��TX�'i�NW��3[sb�TI��0l�@&H�B���/���
�B�3�^|�Q*X��H�O"B��2�(��
�2h���Be�4Q�#����MpN&E<!��d��
����6a/��6�_J��L�t��B �U�B�hV�
< $I�,#�x�s���hV�
��[�!��AO��JfJ`@�LB^A���-�Cv�������4VF��O��
|r�AC-
��[�+v�n����B�hV�
��[�+v�n����B�hV�
	�������[Z'�#T��S��]e�='�K�*�)r}y��f}��D��)���m���i�>5DA��*z�!`o����s��V�D�����|��N���~= ��Q5������N��/>�z�'���� g8����?���y�8lp����[s�������E�: |�>t��|����O�����F���}�b�*��u�r�i�r��j�@#�8G���4�9��%OO��$���Lj'�$d	�@��N��sbA�vhi hfb"�P�H���HTO	��2�QF#��-/z�8!�+'�l*��=�&���%:`�3,�~�'�T�(��QS����EF)��"�z������<��0�"PxTg��ut`�	�CqE��s�Ob���=�*{T�(��QS����EOb���=�*{T�(��QS����EOb���=�*{T�(��QS����EOb���=�*{T�(����"c!�%�d����v`0sPX)��D��D��W89�U/R2��@@�M�fP��*��4}�����/�u\pr�,��8OD�ab�
N���U� �~��-��������$�I�����n��8X2(*�bs1@E��E?���i���R�*�1���1����9=&�yx�}���U�*��UW����T0� �sU�*��UW����U_b���U}��2�#�UW����U_b���U}���U�*�F`0rU}���U�*��UW����U_b�' M�_b���U}���U�*��U$H:��}���U�*��UW����U_b���H�U�*��UW����U_b���U}��f�w�x�R�; ����Y���H	�E���u3�� ��,��^"����sD&����� ����Y��x6�p�"0���'m��~H���t����>(F�,�ID1��n"|ABA���Hp��`"��P!��(�a0���!�GI�# �����\���*H�!�V�p�I@KG�:��v���PH��� �v������,A�$N�K��,�1<A	��
3	����@�L��s�x�������p�E�&'�����N����?p!D0$:d���h�����_�g�qF�\E
��d.r9������
 `	� 8[�������J$���u�z�tq��P��EB05� `��L�D������<��<��<��<��<��<��<��<��<��<��<2C0�0�0�0�0�0�S�<��<��<����<��<��<��<��<��<��<��<��<��2C0�M0�0�0�0�0�S�<��<��<����<�_<��<��<��<��<��<��<��<����0�Q�0�0�0�0�0S�<��<��<����<�_<��<��<��<��<��<��<��<����0�~��0�0�0�0��0S�<��<��<3���9�_<��<��<��<�����<��<��<��<��0�^�0�0��K�G\0S�<��<��<����<U_<��<��<�/-��}��<����O<��8�O<�/��<��<��I��>��<S���4��<��$0��0S0�0Q�2�Lp�0�<����
<��<�O<����<�lG�_:s��]�<S��4�,1�<�C$0��0S
�e�4�L0�A�bC0�<��<��<��<�O;[�^�o�����\E�����<S�<��<��<����0yJp��cD��0�0�<��<��<��<�I���
����K:���<��<S�<��<��<�������0�0�0�<��<��<��r�,q���<��<��<��<��<S�<��<��<����<��<��<��<��<��<��<��<��<��<��0�0�0�0�0�0�s�<��<��<����������<����<��<��<��<��<��<��<��<��<��<��<��<��<��<��<��+!1A�aq0@Q����� P��p���?��"���OLf������?S����	��'Gr@�{�#�;\�=�^�e���.��&5=�hFi��Q�h�����3<��s����Pf0�x���de����f;@L�����Bk�nb�\=��cc��O���h���&��^�t�������>��D����xnw�*Vg(�7���e���*+7m8�/YQ�����9��+�kM��o
���5���+F���U�%��+�1�^4,�^�����E~ZK����,���z~"2g:)��M�K�/��~��@hy�6r��7������g�^��+h^�C�1
���c`�(��;��x��W��������Q�S�Zp62�������|Q�{���Zb`,B�I��iyw���)������>s���{�p+P(��������1y������������y�bf:3�fJ)�t/t��a<��"�k�V/��b������x����+B�[9G���Nr��+N��a�Vs~�B�����������f�5b	P����c�4�T��p"\�3���Gi�':m(�ad��������0��@����k�|�\Ll���k(�����]}u������A_�_��+1!AaQq���0@���� P�p���?�%�L~�"n����	��e7���w\?S���~j�/C�;�\�O`�q�y!�C�8M�^n�
��<uZcb�����w��WJ:+�����agR1a#c������8�P�W�t<1M��T���('��o�]
6��Up��sB��CX���a�F9�=����S�^�{_/������qk��N!��8�nU��SV}O@���^�����I��E���}�@�R��b5	�V]�d?������S�/|iC��f}mF�=��agD��2t`�{������H������PZ^�`��Tb��O�k���*��(X����X�v�Zl�2��0,�Ra�i�o���Gw��v�P�g�6�+�iHM%3�9q���Z�
��
)<������/
����]��]����H�|��@�����i���K��YvI �oH��ZZ���Z^T*��di���Yx��H��b����`�*����=�B#��MF��sOF�J0����\6�`~+c��:�X)�����>�H�9���P�]����t��^�]��)�l�x��Z�'���O�i01(Ea���O"�|���VDy���n4	u�{�eJ���W
v[  ���|��5$�G�.IG���%2���(�'�X�<\)R25������m�c�r����V������Hn-�K��+!1AQ��aq ���0@`��P����?�O�V2
ifA6!�������#"�4Z8��`(�#{��2&QpbY��lL,	p�i��"@@nH��&(�-���Qj�8���0Y-H����	)��jR����*�X:W
��V�����p���m\+j�[W
��V�����p���m\+j�[W
��V�����p���m\+j�[W
��V�����p���m\+j�[W
��V�����p���m\+j�[W
��V�����p���m\+j�[W
��V�����p����@zp�?����N������I0���W�PL�"��\jo�1V(d@���{m� +��^�
3m,$.���	�EU�*4��X�}IR�J�*T�R�OQ
�����QR�J�*T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�Y��S��"����Q��W(CB_HQ�U���"���L�E��vPX�,�b�M��`4��t}8=� ����9{��=NG�c�;oE����8]OW���F.p��ZT����7�
S&L��4�t20�8c�00��^����G��F���P���>���hj'8{4�6���d�!���������PDc�r���[����`N+�����c
�V�����p���m\+j�[W
��V������%9���RV�i\+j�[W
��V�����p���m\+j�[W
��V�����p���m\+j�[W
��V�����p���m\+j�[W
��V�����p���m\+j�[W
��V�����p���m\+jf ��.���u����W'��=NG��o�����S��
�K:�2���Lz�w�*>����q�_G�'p�Y@�=�?G����&,�����"p��}�����XmRk���������1#3��s�����*��:'b-B���R�J�*U��1mR���m$�*[� c���T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�VA+���KM�A/����R�br� ]�N�$���2���f�	�D��$���Z�O�;����}|8]E�Oc�����y"Z�C~���@��#�%NG�q���rc������������,�P���`�l)���r�8���S����tZ������*:��(Go�����	X���31j |G���J����I(�bAe��h����Y�
���T�%�9LeQ$%0����gI��p;I}��AzM$���K�$��=b�y�AzRw���(�:�
��V�����p���o\+z�[��	f�y6�z�[�>���
��V�����p���o\+z�[�
��V�����p���o\+z�[�
��V�����p���o\+z	XA�$����;8�����V�����A(�`�
uQX�L����t&�1>������]_�T�
ha����
��V���r����
��V�����3���g�,�')'R�8����o�HA8�0$���RN6L	 /k��m�d��	 �k�p�>����4 ���������oC�I��,k�R�i_����Cs��w�B<�\�a�n�WS�qR���wb$+(��������������Tp����!�k����C�d�+��0�j�a!c
�)6���*Kee&6���B2dC�4���tbR�>����\���hx|�������8�n�����7��{ m�~>[Q� D�V} ���oD��[�+�J���2�v ����s�U>4��D����M,�#k��R^�%�#����2p	R��XLK<���C����h���P�1��.����|��cv�,�^�"ANg�zzWH=�m]�������[�,4�B� 0�����z��yu,t��@u��C]z��<B~�H��iT���-�c��k)��=�y�X�~(:���u����z���7!������$;D_`�bI�(���	U�
1���JIzzB� qB��Ir�������\����� C	n��cHi?�>�L����'@�BFw`3}h�p|��k��/lN�2�=��id\��I�4��3��R�	����W(���v���:��k)�bN�&�aD�M��.���6"!�j����6"!�j����(���I:C�ZA
��|��f�B�x��})�)��0�(cm/fffr�4���Z��!ml$.���)�DU�
tN�Z��	R�J�*U��I!����!R�A+. 1%?bT�R�J�*T�R�J�+%7��k[Q|HQC 4u]����l�h`�(Lg1>���*UB�����(�*��
h�@8��2{���
�T�Q8�sL��	5pJ��r�WZ���*T�Xe��%�-���*T�NCr����.�8:@%h���O����2��0��]m%�{�B��bE�1o|��������f*r�Tq=�B�t$�z��������/���u����������U���fLla�HZ�k1���Z�����vV���xk(3������(�LX�66�0��Z�k��ey�*�n�~4$L
C�%��&��&f*1.�5A31A��H(
7�h�xa,b�������z����:�LO_��iM�P]i*^��N2�7���+����'+_���?�fW�Q����D�Qu��J]�]��GS�qA��H�(�����0X��
 	�����s�4�� ����M$t�z@T�gV,SyV;r_�J��U�_�PU
�
Cd�d�r������fW���:k83��������m81xs���;�\�F6�~��.�~4 \S-A�f���ERL���l=HDU$�M8���(`Y��l�^�X�E������KD��9����>�1d�z|�eX5��WUR�

"�\�
R��1U���^1
��Q�N.~��������@�����>Q�Y��r���X�'���(�5�8=&�(gs�)$��	��OR!#�%M@V�����8�dC�(t���'����N8���I ��,�����$�l&��O��aL��R�J�4Vq�	�� A��*T���a%��*T���1���q�,��N�J�*"�r�Vb����Zj�JGD�g�zQ�`2M�(����=�6(LA��Sd@&.��#�*	��#:��(�
�J'��*T�R�J�*T������-�*T�R��D�S����*T�N&$�����(#�G�}'��@b� �l�@^������Av���tyo8w��I��I��|��Y~�K��=�`����xe��6>��R���������Y���`�2k'/F^R�0�@uX=k����:HN�$�7�[���^�{�����dt����"��k/����gB�;%�J����r����3��d�)�_�P���Q��~���T�tyUC�f~���I@>VM69/8�%��D�*X��l#��H"�H����9���u[�{W+�R9R����i2���e0%x�����^%{c����xzn{&� �`�G��U���?3�Q����c���-��_-���"�1H&��d���;�3����6��~�����g�L�t�S�a�/A��K�=T��r�5�%���2�
���L�!�����@�K�lW6g���.���
d�I!�v�O�-��cm��p:���$�q�G�������)9xtZz��4������A��<������A����P�P�U���@�����|�#�T|���hh&	���@�K�S	_���UBfM�`3+`�lD�n�F� ���)���8��� �G�u��Cy���h�QM���o��f�=�p�J#�C�[�g����M�Ue�Sj���|5�:o�^+C"@�{����:��!�'��>1�v���68��,R�_��(r��cz�r�K��5����GL0
��C�'���j �H��]ix�9�Z��K:���P,S"l/z�y9�5�V�+	����eA	b�� ��8���V&����&
���?7"�O�U:#h��\|Hd�����h��v��	� �����x �&�W��<{��GeK�+�#(��yD!,_5�"
-;a\��f������_r5�U?���=���l�v
5�z�&n�[�V��b"M�=����Ac�D @� @�QHb�[�x]!��a��hd.�)�=��G��T�����Y �P�/��.����L���6�������+f�)z��t�$�6��'��ZD�a����Xe]�4x���@g���f���WEAB�%b^����W�T$��]���*p�<$�g�/�?�VT,�HU�t���3s;�t3�'j�f���|�,"��
\	�q��e^��&�8�E�M�n�'U���P�0�`�0�{�"	���n���	p�	2m,J�cj@��!8�u�|xE�QT.�t8��G���=�S�,�UuzO�#C��I�,�SH
"^�f��y�+�{�i,�U���*��F�g��R��(��"�C/ec
�b�A#�����L���Et0���b���n`��b�N��L���qkT��HFC���kR�B������aF$�N���_�(K\�e���h��R��:�H���Y������B��
|�*T������^i�����EI3X*I}����$�s��<0D�5'�[��O��@R���b�!N���:)��E���RB%O�T�R�J�*T�R�J�*T�R�J�*T�R�Bq0�$=��
�*�JF�IIiV4N���`1�T��;�,�\��c��\��J?����x�, �$�=�(� �)�H>i��#��%@��fA^{]��~Q�@�9N/C���(S�@��|��V�7d��Ph9�4Y|u'���hVfA4n�"R������%\��53H=�z��T6n6��)���8��� �
����D%�R�0
d	�����HKN�,/V��
�r�k,�/zVB�^���V��|�*T�R��/H$}���R�J�*�X����R�J�*T|�o��
�*T�R��<b(y�*T�R��/H$}���R�J�*�X����R�J�*T'���/$�����3�o�4�,@�-|��Z�"r\��4k	�(+�$�>f�XH]{�5�D�/h0�Fg����������:i2X�bZ��F(W���I14�%+ c���FR^P!�@�s�1���0\���ddM�|���G���{�����a��V����.EC�&�D'89���FR^P!�@�s�\���0l��W������!FD���H��p����<�\�^����R�Pb�,T�����AT2,Q��H0��a���1"�@Q��45����up<�tgp�"[F�4�d"��aK.���~��<��*)�9�2������.��z�BU�X�I�Y�n):P=�02��i6gq+�)�\A]����|�F���
�b�)z-c)�Y$�n6k2-�'J�&@@yf�4@�@ $,"��C"���1�qvpW��u��b)X����M/�/!UT%���T�bmaN4�2['�	`�%����
v6_x86_extended.jpg (image/jpeg) Download
����JFIFhh��C		

 $.' ",#(7),01444'9=82<.342��C			

2!!22222222222222222222222222222222222222222222222222��t�"�������������H��V��y����;3
	�&���j	�&���j	�&���j	�&���j	�&���j	�&���j	�!y��2���|�r�����'@c������nv�����)�&(��b��&(��b��&(��b��&(��b��&(���O/�o��mw���t�b��&(�R:L�G�����\v�:�G#����N�(�����Pl'I��C�S��9��GfaA5�PMA5�PMA5�PMA5��r��{t��_K�yOE����:��j	��1N���j	�&���@��r;�q-��0�@���|x�'~Ok6��:N���s�����;p���M��LQ1D�LQ1D�LQ1D<�����}f��J&(��z����'~^�>��X�jB�K�M�Q1D�
B�'I�e9\��'����3�c����:�����[�5���|6#�}���_ �:v{)�e"�_����RU��:�L�s��D������;2���jd-�'I���KY��R�|�C���r4��u������������9��GfaA5�PMA5�PMO�<�~���:%Y�&��;~f�J^�__m��O���o-!{+���f���j	�&���j^t<S���^�n������}���k_`��ywk*l�:���/G����W�a�
Vv�5����t:N���c�C�������8?N���������a��0�g+�>�����^bl�(��b��&(��b�=�e'����s�k��z>�nMo-�y�����s����jr��&(��b��&(��b��&��f�nw�����i���%
��������:L�J�?X����������=>���@�`t�`f��:ne
��ec�5�������+n����������:��KR�-���'@J�������#�0���j	�'�.�S��9��y�s�n�Z��vQ�����[3PMA5�PMA5�PMA5�PMA5/:�$GoG{�@���r���e~�>���
��a���y�r�:��M)�2:��o�e!��o�kNn�}_'o�xOI�Z�����\���lOboO����&(��b��&(��b��&(��b��&(��b��&(��aH\����;�~n�}���k��q�|���g��:N�f����}is7}��������N����ha�5�PMA�J	�&���^�PMA5��&���j	�5�A5�PN{j	�&�������.��z)��b�������YT�����;@�z=C�c�>��c���O1.�g���>��s�q^������?��,@ 0231P!`"#$4���T�A_�o��bQ�7�u����E����xO^{���k%�)�gkC���/�;����,��Y���c���=��0�o���a�={�wk&���o��bQ�2���?����x��\�<��wq�������'�=���5��S8��X`,���X`,���X`,���X`,���X`)�2;=P�N��X`,���X���?�����c��ui�
�����$��F����kC������
�ll�����66z��v�=�F��-7�H�1(���G�	�kL�^v1�3�w���=i*Ki��������[4�KAL`��X`,���X`,���X`,���X`,����!/T����,����G^8����X`,���X`,Z���.����D��:���E�ha�������?��w{��[�H������H�-���rL�1�o��w{�������A�I�Mz8����,�zN�v�I�m���1#x����e��^�&�O�w�5��_�n(kG��wo+K�������xO^{���k%��0q�,���X`,���X`,���X���:w����0�X`,��|P��h�F(��X`/�k`,���X�w��]�������a�:���q��*���t�a�M`o��w{������_���t��>��#�'�Ybo������n�w��]����P����8�<�{ck�Mh�M��L���A�����O�Y_���)e~R��+�=����\��k�4�V���r�$o�v_����9ed1����<����<YNI�������2�79����z���fB�]q�Qc�uZ�}�e��>g:z�����kf��h)�x`,���X`,���X?��U��X��,���S����-(�E�X<����.6����X`,���X`,Z�81:��i���>�F�m�����I
{����lQ�f�Vn(*E���"da����O��}���f_c2�0a�r��3I��s���4E/a�����Jx!ex��sX���wCE�>GJ��h�w{��_��V����'���j�a�cco��
^/�5�����NV.V.V.V)����3��5����]��$��|����sJ�[�W�{36;6����e���q�bF��Ge��?���7x��}�_W81�������FB����{��j�V����L�1S�����S�N���tqV�'����4Oe9�{�Y��b?���}I,:z�����kf��h)�x`,���X`,�����O������Z����1�`,��6"�����X`,���X`,���X`,��@w{���#b��n�v��{c`�5���~�yyda������+��m����G����}�
G<1x:T�w�����/Zu�+WeX~�H���������M��w��{�/3�^����6I�Hh�����N:��X���u3�~��q��47�w����q���6�'P�'P��l��<���X`,3��0����V�`'a�2�i���Z���\m`,���X`,���X`,���X`,���X�w��^����d5f��l�6���ci��"%����c�����_EN�K��2��[0���2��^(?��w{������E���\��d������v���G$b�h^�Ei�H����>�.G_u��_�Th�)o�'��1�����N�v�����v�'P�'P��l���(����E^=����+ U\sN������X`,���X`,���X`,���X`,���X`,���E�;������������]�Z�����xi�&�7���w��q��T"`��&D�K6�1��w���rZ��� � ��,��]�������6I&����=8:it�T�&����wur'OL9�m��������X`,���`,���Oo�0�X`,���`,���X9�`,���X	��`,���E�;���������a0�|Z��Gg�rp�O����[t>?-��[p�b�m����c3^�����E�������?��]h�.�R(�]�=�eX��c��,�Ab&6��K&dP����������QB�[�5!1�����3�,�
8f�8�dL��Z)\�p<>�&�5�B4��A��J:�D���=�F~>��T�5� k"�8�?��3!1"2P @AQa�03q��BS`bp����?���[N��l���!���n��h�n�����S��JN��t"m��-���6�l�a�K���J������������S���G��0���	��r�P;�C,o���aid>w'Ie�����r�&�xe�����#�9�@����H'C0���O���oU,l'eN����J�����������t�[��;_��2�"���!�6���2�S!��Y��o��/��&����c�"ah�[?����f,n�t�+�������L����_!2��������f�
��W����
�bZ&m����KN����Cf�V����:������p��,OA�����e����s>�Q�b7l�,OC��*T�(%Z��"��Dq�*m����D���+�<3f�
^7�J��gG!5��I�������^#�4�+����#��5!12P "@A�03Qq�#Ra�B`bp���?�������
��k��*�'jf�(U�3_�oZ�5�F���v����'��f�Ww����UKQ`0��m�kf������u]� �����4������|�I*G�kT�p�G�$������3�����
U
,*I�>#W�M�����>~�H���*b���?�Y d��~
X�����M{W�m���8U3������o�iX����>�'���~o��'U:FMZL����"��1��#�����ss��~���.������K�����^�>T
��U�3_���:���Bd�C�rd�5]T�8E35c'���|Uye��=j8�1e�j�gjf�<����&��3w75������D����3t'�=j�
MRK������zU��;����#�����E���8���-�6�R�g<g���F���k�3��C1�
����DCJ�Fgb���B�M���D�D-K��w�Q���>B�!12AQq� "@�#0Ra�$P�4b����`r3����?��>!����h0�@X�Y��Z1��I���~f�`/t�nL���M�)���6�����[��e��x��JO����������z,,p��n>�H��b;��+6������oE���a��z-��[
���l7��oE���a��z-��[
���l7��oE���a��z-��[
���l7��oE���a��z-��[
���l7��oE���a��z-��[
���l7��oE���a��z-��[
���l7��oE���a��z&I�g�����Vs�����c`b���[�9�y���+WK�a�4P>n+�&���s����2N��5�������~f�`/t�n_�����=[�!���/o�~���"eh{t�8�Zn��#�]�CW�g?O3���H���)>�33���������39��(Q"B��@�d��Su���WU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU������0�7	�R��>������*��������*��������*��f�|~�y��F1"4-,Y��7e�PH��~�y��E/y�B����0�8�����3���o/��d��O6vV�5�M��g?O0���h�Q��5��p�#�_�����=[��ah��'OV��q����������C.q�F��&����0I�w�g?O1���&��p�s'1��
O����������z,,p��n>�H��b;��+T�z�eU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\T�D���L�D?BW��a���_��Nw� �>���������*��������*��������*��������)����y����������aKh��z/w
���'
��{�9�y�������7��Ol���e`��nX�K���ah��,������Hx��M_���(�SN����7�
q���(Rce�����D�)�ir|X���1�����_�����=[��ah��'OV��q��������������������~�37p?u	xF|O�g?O0�����N�T���f74����'��ac�fsq�R|=8fg7E
$HZ1�����n�_2������*��������*��������*��������*��������*��������*����@������w�2����u�j��������*��������*oq�T�1�1r��n�����*��������7\UWU�UqU\UWU�UqU\S5�������$N'��c�Gs�
�G��2G�[?�X��H���ex��������o/,\��o+E��e�!X��"�?[O�(�75a`������2�,��g?O<�^T��r��O�	lXm
o��by�X;8���S���D����d>+9�w���/�����������Q���L���]�hcB�a2H��Q�C��%�q�?�P{9�`c��?0�vS�������s��1��E�-{�K)�����&b��~f�`/t�n_�����=[�!���/o�~����#���.]��-��5|��jt��I�C�����VC�����0�9	���v1$���������'D/s�@l��C��^"xe�ZW=�_)��	�,.�Fi���<2R|=8fg7E'��ac�fsq�P�D�����[�����*�*��������*��������*��������*��������*��/s�h�q?{+h5�n�U\UWU�UqU\UW \��*q\Y�^G�����*������'��}UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\S5����"{��Iq�BM'.)�|�C'.9&���1 ������=���h��Yn����yy)� ������A�h�x�|���Y	���f�����l����)���/����rc_��-|B��P��m���S�>�>p��r����v	�qs9}�fdW5�1��\s�$"�����7���7g>�m�p����q�X;+�c�t���d3*o��Y���7���g�v����=�0�o�,}��|�ah��f���]��;����kM,n)�tZ65�{a�?�w}�C��<�}�C��<�}��cC^��L����2	�l��mk�i� ���\K����������h@`ccX��	�&6\��q�	}s��{��/�>�t��5
����1<.��~f�`/t�n_�����=[�!���/o�~���,�?W�M��v��8�C<x������M��0�~����B�d{���2�����X�X�yg����X�X�yg���@��pak���L�>�5��a=�h��=Z�L�&�����X��,`����X��,`���
7��(�[?��H��#s�q�9c�X�'���|lp~c�e'��ac�fsq�R|=8fg7E
$HZ1�����n�_2������*��������*��������*��������Nt�G5���3�?EU�UqU\Qs�@�T�M�>�M��������*^"�W��KUU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UWU�UqU\UW�z��<�y|b��4/����
�8�3��>�5�@w$��������7����2���<PcD�5{K�d���qR@{sSvMYy�s���������r���>�O2�0F����L���3������P+�2$/	x�5�����c���|/1�k0���M���cy|2I��~��*������}�[���i;Q��!��I�k�VC�����s��HNM�T\B'���
���f��	��
���fF�8Ofn�6�25(�0C6�3�o���W�����*��������*��������� ����Q���Cj����?�V��7{�+yw��&���G��������*��������*��������*��������*��������*��������*��������*��������*�������k����������sx���� $�[O���7sT����V_�����o${����M_��gS���M�����2����&7������������;�O��e7��2g�X�K����6�)��e�;9�y���&6�b�5��^c��c�xq�cXA�He�m�5����!�D�vS�Zs��wn����(�Y;�CH��������5g��L���	��v�M�:)%���� !�{0�t��Y���������J�8�������[��s���U�9�y�j��>c2����D�'N7iv�u/z������������*��������*��������*��������*��������*��������*��������*��������*��������*��������*��������)����y���i��	�xe9hk��Ck�M��M�'����3����:D���h�-#�s�f��v8�����*(���c\5~��~�{*jcN�sQa��r�(ns�����s��n{���2Pa�p�q�i�>Q5�cq<����3F%J�����9�y������(�0C6�3�o���W�����*��������*�����f�_1U\UWU�UqU\UWU�8���b����*��������*�����f�_1U\UWU�UqU\UWU�UqZ��TUWU�UqU\UWU�UqU\V�kU�UqU\UWU�UqU\UW�z��?�����'�q���	�Cil������g.?E
��h��pu/�K��Q���L�i�����C�]���Y�i���]��!8�:���+��(10���?��VR��kyw�@��-s�bcv��g����"x).��x��Zuk�j+)dW5���
���a	�!�o$qZ�,�����)v����8����q���=�/���`��G�B����9o�Z����9o�Z����b���t��'��1��=��Yb)�����t6a���?E��0B��7)x��

�ab-�?7��aa`���}����L��G�E�8����V��x��>3XX%���}�����8�?t�`�f�����c�;����"Y�A��H#�Mf	l����0I����������F	aFF\2M�6F��. ���wH;�-�k�G��������ks3�����)fI��Zg#��:�v��w>)���$C�]����a�X�����I�����+a!1�AQ@��� 0q���P��`��?!��#A���t���/@����F��TUM��M���@	�]�
��|{K ;nCp�MP����n�%@�5����f�i{�nI���'$S�l.�]��av�����l.�]��av�����l.�]��av�����l.�]��av�����l.�]��av�����l.�]��av�����l.�]���6�q����re�r�����@��92��_O�4(�5�XE�W��z��<��qiX��nI�ZP����n�jn%@�T����J��$�u���1��9_O�!��<MF�=7��L���\d�VL��Gw�-���X������u#{y�L�m/tQ���F��S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	^�� U��p�Jk_���*|��*|��*|��*|��*|��(��GW��~�]�)������y�2��_O��x�IMS7@�������~�~ErH���@+O?&\AMb���x��~UM��M�������P1D�n�\5��+��1�����/����������^���h=P����n�%@�5����f�i{��O��0�oR���J���J���J���J���J���J���J���J���J���J���J���J���J���J���J��f�,�9��6�������'���I5>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��F�#���}8���@�l��$u(��a�y�~�*�����T/�^NL��W��y.Z�t�}u@O( \�U����Cl%�b�y92��_N1�i~�<K�������AGh)��O��.!�a���:�����F��TUM��M���@	�]�
��������
��6�5��1�bM�9���}���. ��T�
=Rz�+����HJ��`j7R7����F��E�i�a^��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��wXD�4h`���)��S�O�T�%O�T�%O�T�%O�T�%A�*k�C
�]qe>	S��>	S��������%O�T�%O�T�%O�T�%O�QT��;���!	���.�>��@��s�������1d�46�����Y���@V7���.;�����"�
��;�sl��r%(d��B���%��,*���������}8V����U3�3t����R���\�����$��Ct�S�6!�<��y" ��!��.�5�h4��tfgD�����wmw@y���8�E"tpp5���I/
�I��eSv#Sq**��F��TQ �����g���p�;.Jbd���P�
<d�rX
Po$�[��M�%�� L��n��!���/$��nn��2���8������Lj��uS��g
i:�j���8+��w,(P����n�%@�5����f�i{��O��0�oR���J���J���J���J���J���J���J���J���J+\�9�X��� �49��J���J���J��N�UQ[q'�dn��>	S��>	T v�U�n.���5>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��F�#��F�5(�T��!}$Pi(!}$Pi()U�mj����z@�g�j��W��9$�����������w�L����"s�S��&^s�����?k���y�B}���8&���T,_}O���b����W� �.�������`�P���U�����N�w1o�Y�?����&\w+����A���[�X�X��FT@�A�������;��2�� Y��2�X�Jq��q�*��#P��#uQF
��Hc���pq$�Q@�&����N<���V0�!�wEq�p�TW!���5B/��68��FY����.{x(�h-�&84j�n�jn%@�T����J��$�u���1��9_N�A��@�^�x���M���5���`�WJ��Y2����l]���w�0����DE�&hX���"��4	,`wQ
���@W@ �wPG�i�h��� F7�F��a2HF��f��a2HF��f�e6!��d4�>�K�J��/��
\�	_�,
F�BW�Q������j6��(��L#
��)�J���J���J���J���J���J���J���v��(���%�u>	S��>	Az����bE��,BX�O�T�%O�QR��U&�t-�j|��*|��*|��*|��*|��*|��*|��*|��*|��*|��*|��*|��(��GW��y��.J4hv.���]2*���@`�~I��L���F����q����K6N�[���`
2������d��@������y�/��m������������Q�
?�r�-H�/��L����~X���Uc� ��b~.��AnC�uEA`6��
���2�9$� 2ujS�	��~����z���v���@$��>]=�c���q���P�[���:��U�����*�l���1���1����9>����J|��*|��*|��*|��*|�����T)���A`���|�������GJ2�B
��5E6��6:a�
1�j|��*|��*|��*|��*|��*|��*|��*|��*|��*|��*|��*|��*|��*|��*|��*|��(��GW��y.��wB�t�S�}`�������i��F�����������"*
5V������`������U,z"5gtC�; ,?K�.;��]O�Bh����{�����Q���l�U�u�E'%��0�>L���r���- 9�
�by�O��c� ��b~.��,�������]��Z!P:�E���>U -W�����������;cD1	��IV�NL����z�&7)arT"�U�����*�l���1���1����9>�	�6�� �,��(U�y���@g�?������r%���nYa$�����pS��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��>	S��F�#���1��C�����L���Qb���
������~�&\ud���D�����j�\lh�D']��3� ��O_�d��hs�����21�����9��1��,`�����`���,��������������� 8^���]�a�,�qv�)b'�)c�[Qr}��{z��%O�T�%O�T�%O�T�%O�PN8@�����J���J���J���p�>	S��>	S��>	S��>	S���#���*|��*|��*|��*|�,�����J���J���J���JH�{��|��*|��*|��*|��(��GW���R���Fw�jm��n"���C_DW�4�z�������\�#;�c5���#q���6�B
AB
�Xs �ui0@���BK�]����� h�!H��5}&#��$$��k�I��N^�����^i�D�4��*��IQ����
T!��TS�����xA��� �$g	#]��W��r7�^�z@=��PJ������������ ���[eD���]��?�e�����i�8���*��D���I"��S�1�H;�X�*pE�$��U��W������	$[�j|n�:@��;�P]��!)���EB�;$����;�P]��!�|D����{5�TB
���0,�n&,X����45A,h��n��E
u�a�'�%�`sA��|�EP��5^�uQ��oqZ!�'����]3mS_�����
<��<��<��<��<��<��<��<��<��<��<��0�0�0�0�0�0�S�<��<��<��(��<��<��<��<��<�O<��<��<��<��$bL0�0�0�0�0�T��0S�<��<��<��(��<��<��<��<��g��<��<��<��<��8��<��<��<��<��9�<��<S�<��<��<��0�0�0�0��2�0c0�<��<��<��<��<��<��<��<��<��l��<S�<��<��<�F$�0�0�0�,0�x@��0�<��<��<��<��<��<��<�������P��<S�<��<��<�
(��<��<������{���<��<����O<����0�0�>��Q4��0�0�S���4��<��(��<��<e�93����<��<��<����
<��bL0�0��3�*�0�0�0S��4�,1�<��(��<����GbS�<��<��<��<��<��<��<��<��:����<��<��<��<S�<��<��<���:����0�0�0�0�<��<��<��<��<�^,�<��<��<��<��<S�<��<��<��7G[�b<0�0�0�0�0�<��<��<��<��oy�<��<��<��<��<��<S�<��<��<�� ��<��<��<��<��<��<��<��<��<��<��0�0�0�0�0�0�s�<��<��<������,C�<SO<�<��<����<��<��<��<��<��<��<��<��<��<��<��<��<��+	1!AQPaq�� @������0`p��?�X�U�0���dUPU�	U���F�Yz��vE���^8��2�Vu���&>_��]��a������4���%�\��0t��n��H���@j��E���0����>�!����z�^b�u�;[�n�3�}����f�fUw[�Kx��
���S���qf������w���R�/he2����Bg���s���q�e��:�p1�'��������>f�����V��p�n�y��M
����a�����9��v8�X�}Y�["�x�E<�o"$EB���z���>j�v�K����5��f�)���h]����� *�h�W����VW,����="W?��z�2������T����;]���yq�L]ej�{��i��qt����
m�M��e������l'S�`8�f3r��a��j]����:L|�B�m�)M���c�[X�o�q����}j4U�5��J�l��UWcj�����WpL��J������,!1AQPaq���@���� 0`��p��?�X
��]}}�
L�1�=��O����1�����c��Vh^����8~
�wa;�I33~?���#�L���\�
��z��_�	PV[WFt�8}�����<�N^t��J��i�	����%�G/&.�����a�:�������X)c��if��r�1������+����~�t�I�����sB��hl+����� ��
_b�����I(����>8U�u��y����+}�SX<�(x����V_	���qjviz�1������:��A�"���W��9�g��d��;P�KW4���`��t?�_�Gw��������Xxr�Se�cSm`��	h-���"�F�M��=�d�{���p��d��,b�H�9�~(��C>E6Cx�s�a��v�w
[VV�h���_������N��]��� �Q�������0�`��������n��Z���V-�G	c*z���U�9����l#b[D��v5i�
r�At��M����J�0n������L -v��:s�����+!1AQaq�� 0@���P����`��?����A��0YFHo��d�-�pA0(�#{��C'L
 H������\1,�����t"t�/�����qn�]��NXel��,���NXel��,��X�p�	)��jR��-;U
�p���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ
c]�v~���s�v~��v�n-���<����e������d��[.��&~
�3^����>O�z����8���K�N`��uQhd��	�D	��Z8��`QF��6dM"��!f��v]��,-(�7Z�mS�6n�5�2\�lG55�@��^�`!��6]f��8qa��`�Z��8qa��`�Z�c�2�$�%�Y�|S�uX��z�*T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�PR�h-ZJ��:w��5y`"�
��eJ�*T�R�J��Ql����.���e�
Wj0��G/f��A��~��=�g�zH��
����]�����}��{.��L�	`�"��G�wn���/P�0
��$`_b$�hd��	�D	��Z8��`QF��6dM"��!f��v]��w�fT������;���������B�.��=�YTX:��NXel��,���NXel��,��X�p�	)��jVB�_��V�����J�*T�R�J�*T�R�J�*T�R�J�*T���e�M�����?XX�o$>��396
��y��jT�R�J�*T�R�J��Ql����.�]���CW�Nu�p�$6�jw^����h�����Y���Cm���hN�O�z�����z,��
������MBM83�l�`@�-(�8��K'�_�q�Gf����G�a�C���/\�]��y18��nH,tF$�[��J*0Y�5*Q ��Nl����9|���Mk&�1�u�*����rE�N8 �@���E���&$o{SfD�.��oo�e���H�0N$3���S)^��:�`�������v����W���=�����o�z�3��D��������pA��E�p��+d,�d�E�p��+d,�d�"��e�ILKR�b�����&
���R�J�*T�R�J�*T�R�J�*�����V�#������&�C<����R�J�*T�x������g_y�%�F���?�|5*T��('}$��R�J�*TsE������=B	A/�KG?a���H2��@�awu��������+�
��q~*cTR������� ��*�j��%�����O�AA�������{.�L9`���,nN?m�S�*wA������m6-?�(�w�-9�������?@Z���t0M$0�]O���^���=.#�9�iY��!�c|��V+"����|�X�g�����po��KO��Xx4���U�T�a�����D�s������q�QEC��x
������W)��E�����*��J��-�������ba�"#Na�	�FB*Y��	�W0������������7���2q�����jl��E�0B�����=!��@j�}y�����T`�T�9�)xE���N���6R����{�8�r�M�,[��#���A����/%P��&F��i`$�]�:��a/0���?��J��h��bY��D����M��M%	c>R��)�&�����d$_�������������������%0Y-J�]��b�����4GR�J�*T�R�J�*T�R��0
7M���[���r�a`4�J�*TC	�A�����@�prLs~*d�<�|*TOJak�#��_k<t��5*T�R�J�*T�R�J�b��rU�c���R����K����P)�������=�y�x�~Pq��/[��4\�U���h���"�|���� �-2�E�X�{�ti�j�d#qx���&����Ka���>�(]���������N��-B���z?��P�5��R�'<�P��D&�jf�x���7w���6"!�n��Sj6Q�#p�v�j��%���A�vz	WJX��?{
W�5
U��MW��<h����6�o���d��./�x�kF��i�(��zO���e��f������7*vQvqk�hxPM�^N)���Y�D��������-��2��;���'u��}����U�����W��'��2��>3BB�CX���%������ijxxg���=�Z�:y����3ci3
��f�@Y���R����	�����PD3��H�� ����i$�fh�xbY�2� nH�����?�-���$�hd��	�D	��Z8��`QF��6dM"��!f��v]��L-�P�f�`��%�2�uE�����t�(��g�2�-���t����9eZz��1�5��8�tA�O�^�
����
���J�y!�E��`�A�Lt��g�K�`�5����GN�B	�fm8b���<��s
$����3�T�v��B�e�=L�z���I������������^�%8`�)W�F��,|���Q2��^'j��C�7!$]��NXel��,���NXel��,��X�p�	)��jVB�_��V����*T�R�J�*T�R�e��D��i�������hs���J�C%;� M�����������T�QYB����BND����[������R�J�*T�R�J�*T�R�J�*9����W�v]�p?�_�u�R����8��[`��#�S�n�C�z��t���?.Zl�Qo�C��_OW�^���<�DJ���n/\I��9�(�����U���@��W������?3�+-G0���H���>��=�g�3�������r�����Wv#��NV��B�lw��($�@�M}���^������*��[qQDER"V��S
�(�T]�ca�@�*��N������v]�[��8
V���s}xV�6>��*g/��^�S�rlf0p=�7�l�A�b���Z�8��-�����/P��1�	���W�=\�P+0 *B�@�2�a{��'�`e2&���W��yea{��b�����&
��R�J�*T��T�����L�����O�������7
����	#
�m�5�'%���%$���l ���c���!>a����R�J�*T�R�J�*T�R�J�*T�R�J�*9����W�v]�$��o"�-�{����������^Q��8
��re\x�����#�U�Pu8���x[�t-�G�������z������	�0[_���������c�P�<NA���!p���O�(��x4�i[7�t5�F���M�����{.�b�#js�[���v��2�K^^:�t���n�
�[6\@������fG=_��0p��{ m�CJ����3�d�3H��0@2����n�F� ���S
�(�T]�p�8�%�%�.�����\���@ED/�M(��!G^#�T�
&�a�� �*(�as��[2�lE��g�,2�p�{%�"��W+������4?S�^�`Qb�`*�.�)�
'�HQ�N��dM��R�
������^N@0
����J�]��b�>L�A�H��X�������p�P���Y�Z��>��9���m���I4b��A�V�3�k����yi�2^e: 
�O�T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�Q�U�J��3�1�R!_��m�)��k�
�!�u����0�F��%LmS����H�-|PT+Uw�J�������/\�8BB�.�5A�3�Q�2*��
�(��'����3p��3d��	�|���/\�nh�'��`Ad���BP� ���4��0I.x�E�FH���/V�7��)���8��� ��~L��JD����0� �AW~t!]w��e@@~���[	i�������^N@0
����J�]��b�����4&�J�*T�Q�2(y
�*T�R��<`A�U*T�R�J�X���T�R�J�*"��G��5*T�R�J��� ��>MJ�*T�R���-�����1�� ��-��M&�,K�����RX�)a�8A��lcz�'���X�����t�h����43X���	�,����CRZI3��b'�p�1�JR ]��i�����B�A�[�s>k���jR�x���lCO_�4�����c����*`�P�I�Q	��f�#)/(��gY���I��Gzd	����2\����@W���`M��y�����m����������G����z��r9!��1��LG:.�	bh�l"d��1G�>� �D�fz��$Vn#M
t%�JX��<a.#au���>[�StP	B��Y����K���upx����9�37����F�d
l�{������+-�UV���;g8�Ya�7���a	��f��^{(���M�oy���w�ee����_td�fD�fK�b>,��<D�
���E�I��������n$�#�d�=��	} ���`��\�rPX�L�E����jYt�f����X�����b�g4h��+8I����j^�l����4���t
/m"�fKd���X&	���
#33Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#32)
Re: add AVX2 support to simd.h

On Thu, Mar 21, 2024 at 12:09:44PM -0500, Nathan Bossart wrote:

It does still eventually win, although not nearly to the same extent as
before. I extended the benchmark a bit to show this. I wouldn't be
devastated if we only got 0001 committed for v17, given these results.

(In case it isn't clear from the graph, after 128 elements, I only tested
at 200, 300, 400, etc. elements.)

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#34Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#32)
2 attachment(s)
Re: add AVX2 support to simd.h

On Thu, Mar 21, 2024 at 12:09:44PM -0500, Nathan Bossart wrote:

On Thu, Mar 21, 2024 at 11:30:30AM +0700, John Naylor wrote:

Further, now that the algorithm is more SIMD-appropriate, I wonder
what doing 4 registers at a time is actually buying us for either SSE2
or AVX2. It might just be a matter of scale, but that would be good to
understand.

I'll follow up with these numbers shortly.

It looks like the 4-register code still outperforms the 2-register code,
except for a handful of cases where there aren't many elements.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

2reg.jpgimage/jpegDownload
����JFIFhh��C		

 $.' ",#(7),01444'9=82<.342��C			

2!!22222222222222222222222222222222222222222222222222��t�"��������5����n��y��!�������U	�&���j	�&���j	�&���j	�&���j	�&���j	�&���j	�t'I��'@��K�su�>���6a�U	�&���j	��k���Q���PMA5�PMA5�PMA5�PMA5�PN��:N�<�}�w����c:��K:N�5�
!���FF�U�>S�LQ1D�B^<��Sz8�1D�LQ1D�LQ1D�LQ1D�����n5�3�l��M��
N��:U�V�+���C� ���������'�)����{�i`'I�?z#n���5�{��1�Q1D�����k���>��M�J&(��b��&(��b��&(��b����[l>znu�-�\��fR\���&���Rt���1)q��:]����++��:h���cc���f��-�fo�m��.���.sw�{����.o�Z���Z���m�[j��>m�]��q���Bo���������(��b��b�<����1����LQ1D�LQ1D�`b���o?AZm=�}�����������zo��?oM�O��=6'w���M���'@����54�GS��P�����=��7�Q�t���n�d��cY�^��X�P��AJ��|��s�1��z~J���v��/�s�l�����������V����C��y9����z:N��^�>�1f3ZOo��l��f5������i�<��?L�a:N�/�[.-~��b�\��@1���/.[�Tw�[��s�m+���x�����-o�w�����F�W���%�����8c�]����2�	�t���.����K�z�h�n��|�&�z4R���=���FWZ4[�c�PMA9�bao���H�6n.6���MA5�S_X�h����������h�C��w�K����3�o�zgg��N��s�<\�8>�r���<e��;��s���5�:x�o���V4~�2����������1-�3U��������'I����]>����X���9�r�gIa�:=�"|���u�h�>�����+��<�L�a���0�u��������������=m�c=21g=2����r��V�i�U�s���@N��f/�92�f79�������FAOZ�C���F�7���6L�r�_O�L
��U	�'���Z�2}�Rj5�#�����g�����e�V%��{���9�v�S��n��X{Z4�{�p��a����~�����
f�����j_�4���x�l���=Z�q�p��B��MA:N��y�K1���[9�D�������fbk��{&����
�K�'I�\~��'���<^�7i����-������t���_-m������[��_����z}�7��r5�u���������u�w�@�������J-�q
�7F��N�ld��r��<��<�`7����H���cg"tM�:�����oF�+b4�����X�t�79�]����)\�z���c�^�_����0gs��_����s���_������;O�z�����:���V��6��f��CC�;�x6�M�/W��e����U����tR��&(��b��&�	�t<p��%:;>C������3y|&�M^����'��m3�=^��:>:�to8�����a�@���t�^Z��g6��e���DD�"}��t�����u
.���=�v<��-����Mp���~�k�6����g��k���||����r�M��z/I_��}���������LQ1D�LQ1D�LQ1D�
N�	Tk1����z��S���x�����C��}����c*��L��,�����N�����i��XY���O�SNO/�.���8iv��5h&���s�PMA5=(&���s�PMA5=(&���s�PMA5=(&���j	��m]�������^���|���g���u�6�j��9�]L������o+�zM��15�j����]��W{�B���.3 0@!"#2P$14`%����w����.��{������w�Dl����=��k.�1��0X�c�0X�c�0X�c�0X�c�0X�c�0X�c�0X�c�0X�c�0X�c�0X�c�0X�c�0X�ck���ck���ck���ck�?������N+����������=�nks[�����5���ni��]�nks[�����5���nks[�����5���nks[�����5���nks[�����5���nks[������{�nC(b(���R�Ua��}�D�a���h�_������C6�~����G��}�D������Nz��>�����H� ���5DN��^�3�{�:W#V$�p�c�k�>�c�9#�y�%9���s��]<`���,`���"a���sc�0X�c�0X�c�0X�c�0X�c�0X�c�0X�c�0X�c(��S�������s��0X�G��}�BN�}����o�M�5��^�!�~C�����^���������P���u����	�k�����)\&�TE/�}�BSgf�����O�Y�b�	j-�]8v�������j�����8��}s�A���5���lkc[��~{.Yv5���lkc[����5���lkc[����5���lkc[����LU�2�p�]��uv654E4QFQG����KgP����5���^�����QAV�D�b��>��!������s����
Bs�������l;�Zm�$}�I�/�Y_��)e|R��!F/bC�Q�Z�����F�,\�����z�w�����	sCvlP��?���kN�+��/	dc��+6/=k�k���"sGE��h����K�G�� �T��j����`a�GbRm�Lo���Z	a.��l���*�b����N�-G�y#�zJ4�����G%�"2���#y}G�f�4U�X��X�=�.b�1��@Yd�@O m�@FB��@Yd�@Yd�@Yd�@Yd�@S��8���+v�"�+U��}k ,��`S�����~5U�Q�Y��s���zyjd=f@Yd�}�nA����C2�fRC��8	�Wj���$
�G����=�=�GX�Uk�_�=�'s�U����k@�\&z�1��&�<��������W�]��:u�7�j���O
c��H�_����?�.����`�����[T�
�{7F���I
S�1S����!��rn�Q��� U�����v��Z
q���>����b���*����fw]\�W4�Vd�D ����9f+1[��������40�ff�#�y����R�~���^�p�$�Q"��-�5U�����X�_1�f�1��bA�5�����Z�o��������K�f���>�����b� �����sF=���X����\+��L6b�5U9���q���D�!n	;pJ�����y�����_���Kj�8���.3l���7"ai5y�D��h]�2�z�Dkx^m���:pt�|W�]���k�r�9?�o~�Q�7I��K���<{����5���6��lkcRr�c[����5���lkc[����r;W��]<	����4����h���P@�����1AGD���G<����X	
�
��(lk�[��L}kc[��}�~!�����[2�fZ�&b)�R�W�[��X7������z�u������-�S�=X�^i>����o���;�w�[
?����f5��0���Z��
Z���O�����<�+zP<��$�\+�yA��.?n��_$}�@�	�#i����"EJQW�p�������#D�z�����4M��o!�����2���NN�}��dU�&*��h��	���������?PV,\�R��*����:�"h ����yFv�BUSMND����H�^���G����\p��B���-���D��l��G������f���i��J�<���Z�|4�~pj�j#�w-��ruf�R����$���+'��@�
�9U�Z��������/�!� �R��u'&�Z_vu�$T���+��0?$}�E��Z_
��i|*����8H��${b\KN/�E+�^A�Zx�/CO%_�~��pV�%�;'��SCe���>l�����,��7��e��6O0
��S����i���I�5�V��Z���`|�5b�jv
;J�s�_y(|�Q���:����se,����6\�se�����&�����X ��-��6\�se����se��6\�G��Z1i<%���z2F2�������:����^MB��7����2����J�����]�����Q-��5��P�%�yQ�E�u���5}?J��-J������+QX�j���yb��`s�.��
�����p�S;?����G��w����������Lq�W3I�����VW�����Y���Ks@R[��dd�i^�Y
���	:�v���Y���^2E��q��j��1p����q���������j�!��@0��Km�[%H��4�X	k��[��yiV�+������G��|��}���R�/A]5��4��
x�c�,@�X����>��,`���
�%#�s<��#C`Z�r�Y���������99�"��=X��~���k4��E$,RE�G���O<��
X�W�]3�%���Q��.��m?�}R���]h4�*�(u��iJZ |�6������
���a)\�:����h�<D���W���+0��Gh}x�^���+^�Ehd�G���F� ���9:�?��?��7����*�^���Z{mX�����f8��wzT~��@q�������j`2�7b�(h��yef�S�`]4+�&�l�������i
�a��I�3Y�����u���UZ��Ky���������Xz��C]���|��j�[.�!��S���,�Z������qc�����T���^g]b�WZ��s���6\�se���.������.������.������.����>��#�#6�.�5��[S�j�?�o�����A��A��.������������x�i1e������(8�*���vn��@��c�W\qck����7{��NA=F���E�U�U�����������#h�u��J����t�*��&�"k�&�"h�;s���_���f����y4��j���xl�f�X�X���Hq@��S�����S��A���,���5(�����m�\�O���������C8�I���������sh��������q������d\o<o?�i���Q���Ma��a�����iNyZ(��gn1��5�vf~l�����.l�����.l�����.l�����.l�����.l�����.l����>���2G6Q���f��u����j��Ztg,l�������Z��UR����j�4���;�e�x������
BR���k����t�b��7�w����D��D���?Mk����Hf�����yO�ZT���g�������Z$!c��~�>���OM���qg@
��.�2���.�wn�2���.�2a��e��\pe����.�2���.�~|pe��\pdC�pe��\pe������\pe��\G���������-�"�� �a'(<���C�r}<���5N����4v|����g�X�`�|��l�Q�B����������h`q�������6���_�Fh���b	�(B��(JC���1c	*C"a$�����(B�IJ
R��,�b��Nc��E�|g��z0:z��GD2�3��/Z�G����6!1"2@Q 0A#Bq��PRap���3CS���?�cr'���k�&��x���y�
�q=�/��s�)����$7�U�h�a��*=|���v�Ljh��3!����t+����#�����:O)���~�%^B�L?53�(lc��K�����R1��u�>����M"id�(*�]Cu�B|Llgy��M3R��iQU��a����U�M�)��5�)E#Db��F5N�
���5��7uD�� �ef�|^ih�J',!GMj����[�Z�=�!�X��c�-;�B��C�����4���oB�jCA4��:�i1��A����Ma�8#)"�[�2���s��^�������(��~��R�3�'47����Bq����-J�	���i�o���!QG�����*��t��%�z�M��u[T;,�A��[..>��g_	p�m��O?f�XG�����4��	�B�m�*;'|���5:�+M{2��[%�%6�������qm3=����C����R��7uE��ia
��Y��tBS.��=,)�(����R5@���j=�������s#�����K������)���h��xuX^S��4�e�����&d�|B��=������$6w���-��Y?�/���!H]mh9���?]���N��g���w�����C���>���wH���!=[�N�2I�>�Ab�wg����T}��s�yz,�js�CSc�������y/�FF�2�X�W��)5��#3���?��6!"1Q 23@Aaq#0BR���Pp�$4����?��i5H��9�J��S?P<�$x�l/-j(y�~y�9�G�9r���7��-I?>���.2nb<F�P�Xy�'bq�n/iX�����xL>&P)>#H@�[�i]���d�\����?x�n1���t
��U q%���x���4���~�Vc,f:�	6�x,d��t���wFR�k��G3�)'��'�@��_w�?}v|l#�"-S��2��e5��2�� ��V%�����vJ]��`8��Np0<<
5�k-akM/�?��^����}��%1�j���#5��A�*���������U�����OK�i�vE����2���f������c�����F�����1��/�To�t�[�F�Fbf�����x���AO��+z/��K�Lw���S�L�I���2M#�Q�p�G�y����=�}�P�s���kL���CU	���_��OV��iU.������UV:������<�X�r���a��n�t���8�l:E���]B�2������f�v*�#���#����~Be��~�z8h�OAr�)j�[�Uo�x������+s_v�K�i���m�h�ro9��((�m�q����GD7[���Q�]�N�'��N�S�t������y��S�JN�n�&`X��������%��jR�����>��D��0��T�u}^����J�<�w
�=�qz=!�\"."
��o5�5���T���yMZr���5k��m� `�A�J@�de%�1|��K���������m�1N�|��R�P���r|��Q�,�O��������@m���=���k��`mkEcs�/��C2!1�ABQq� "a�03@R�#4Pb�r���$5C`�S������?�i�i|"�I����jq������p_�aj���'O
���A�����?�y O�im_	��,ysn�����z?���Z�����I��VX�y����$-X��fJ��$�����*���*���*���*���*���*���*���*���*���*���*���*���*���*���*���*���*���*���*���*���*���*���*���*���&�����2o/�[��&�����2o/����j�68��-k�At���s������9m���{]j[|��f�p�����[����[����[����[����[�����nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj���nj������;QsC�q�G�6Y����f"��wpV���+�7v7��%������\�G��V-\;���y{&�~D]�}�����Q�`�a����*3�
K![g)��^6(p����X���������k���=�KGw��o/�6q�5�6;a�q����b�p�Q'�
��KI,0�X�v�q#���X�N�{FI��CrT7%CrT7%CrT7%CrT7$|
�L��%CrT7%CrT7%CrT7%CrT7%CrT7%CrT7%CrT7%CrT7%CrT7%CrT7%CrT7%CrT7%CrT7%CrT7%CrO6sN���z��#�{��&�MP��
�P��
�7��-��J	�������W������;��_����(rM��^��-�E�~��{f;����������+,l�h��1����:��|��g��^��������l����W"�������8-5���n��������[[��$&}�^��|�Z&���:��\FI��8*����*����*����*����#�T��o��nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ���nJ�����u��'J��7]pR�����ZM�E���$.�*���0���u����nJ���nI��,�&!]�A(
�8��-`����������c'��.���'�c>&�/k1Pa��6�H3���9w�C��n�(z������q�
�@����?��_�����_�~����Yh����V��TXM��+�)�|=5����Z����e����-
��!�k���l�Z�c�
���Fe��y{v3��v��SG�u�<K�[2��oc`1��%bs�tG�5��t]�1h���u���-O��bJS9����1�\v�`<?�y�.y�%4�,F��et������v/y�������|3��l)������j!D>I��5�������b	Z��R��ko!�6���h��t��#��^�o2V^����V^��7��V^��7��M�^5��-Xw�da����b���p��.��n�7���c��	8)�&��oI�B�.�E`!^��o�a�V��m�V��m�V��m�V��m�^�m�4�[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�5[sU�4����i���=�Yi���-?y�lPXi��nj���e��4!�}�,o�44��0^Z�.i	��[�����&4����49�u��[sZ�;��[sU�5[sU�4�]�41_�r�S���������B}O>���-*z��?���?�x�����BdH���|s8x
�4���22;v����5@C��q_�b���&�������`�I��o4���k���cy{����$$0E��(���H
���:N��kKJ��<
a�O��B���\��u�\��,s��2�~	���P���������������c=��F���aBv
�(�v���c�Ms8�C�������G��a{�������V��g[�����G�8�B��2*�P�������.�ywj
��
�������G|&��C�q���,X��[����E�3Wc�u��-�[�������@jm"s7O��t��8�O���!���}�=C�n��z&��!Ce�����
�mxi��?��[��;�^�%�^�H������|���}���J���\�B��m��8IZ��#oM�-`���7l�{#rP��:O�P�or�0��ywpX,����{�v-�J,(���]�n��E��Wk�m����B��i�f�pR{A�1Rh����:4�C\e��T���[��>��%���u�}T(@C����-��8!l��3��2�+�x�[w�/�Z�we��S�1��^����{Y|�h����g�P���8	&��E�28���7(5�@`���MF;f�|��(�N���T(m�$���n�}��.}����d	�;�&p}�i2��|<U�1��u�����-���e�-�6���l�W�m�
���nJ���nJ���nH��J�2U7%SrM�*����*����*����*����*����*����(0�pP�1���
Mp�>��#W���U7%�6O��;�a4�C�����M�v�)b=	�L��A4�@H\�nJ����`��b���-�
�h)��SrU7$|��?�T��SrU7%SrM��s���W/��~���Z=����&KE��c.|����P����s�7�
�}��6�,aw�f��@s����V���+E����)�d��C:8?v����yy���{���������n$��f��=�-2�KGGps��������)��T6�&�{��
����4t���JSN�q��Zv1�`h���r���i��Q����!�!��ZUA��h������[hN��9�D#����7Z��u�Q��R�yw��^qGV�O9�������cX,�5���M�c��o�Y���������Nk� s�$�6z{f��M�����i����v����v&O�.{ ��yP�0�]�8�M������2
f���*�A�����t}�����,@��z�L?������Du�-&1|W�2��i���M����mBmr�E[$7����^�	.2k[�*���cW/��j��^)�����2,��L����bj/���q#k�'5����D��cC
v�)����&qB�sm1��;���x0)w��Bu�BLy��|�I��%�B�b��-�[��6o��y�
�������7&�j���@?x�|���MG�a��[%�������g���8�#Jv/wk��n��{$*+��
o�] ����m�{���Q��Z�	��o.��d�J|��O_��D�����O_��D����4�C�e)/���`D>)Y���_m�>��<6|R�)����!����h�g�	��k��sHZI�����M��D���������h\�8���a�T����1^�Ak[6���K��!������e,�a��>#�N�����/�����Lq�'
H���H��$�p�V�I[�%ot���V��q��-����J��+�H��$�|�\�q��8&�P�}��nVN;B��<�:rm7-��S���Q�-��g%+�2x&�f�x~H��h�>D�ot���U�K�Qb�s{�,IX���J��(\S�|��?��!��t��%�m���[�%ot���V�IM�%��V�I[�%ot��^���D�m��9Ja�]�V��Y�{'Cu.*ZDmcC�,W����`�I��o4���k���cywD6/=�l��n��C�i���J��")77j�����Mj��
Nw'3����Z>��v�- ����R�;	L�TCr���y���4?�������G��'��	��}���`��|I�	���Z�|Pl���\{.���rk��b����!k�}�L���=�Js_g�5���vR��hvXcC8�%�B��M��V�I3��4M���y�aDtCf�m���Q��[p32O�[
�l�0��*$�di�X�Q[�	���*ZE�
Q�ub�&JR����G��Ds�?�b�Ed�F&��p�����p������%5�'j{� -lS;
F{"E�.N�����XUDw���f
���
��D���[���<�vC?�P]y��P�D���j��O�Gh��K�4�<L*=U��3%@WCnJRU,�*������!R!C|9k!�b�b�D���j[�����6k�����)(�%��o�K�d(�����NhE��H����8��0�!cb6oFi�b6 �`6�$f�Fc�h�6�V]G����r�Z�[�-����/�F��B����I2����(���Y�����-���P�s���!���o.�4f��m#�E�����OJ"l ,��S������8�3F�\E����w���o�P�v�u�4^&J'��O�
�K��8��9�H)�9)����!��L�����~����!��e���O�o/�����m�;b��>���10���M��=������&������b���C����F���Xx��Z������x����G����=��\[�IG��D�!��O�IQ"��4�C�)�WM��m?�_h����\v���-�����M^��,��E�?�_�	��"�v(�CYd��A�@C��Y���l�#�F`���^v*�M��n�U��6y�.������5:�����*m2����"��4G^�m��Jo���<D-*-�u���+�V�E�i<`��!�I�����M6P��@x6�2�5�{��-��z'��u��l��J��������z(���z���vK��%�]�>7z*�U��V�E[�n�U��V�E[�n�U��V�E[�n�U��V�E[�y~B�]��71�Q�B��S|���Z;|��?e�{4���n�����k7�D<&{#;��6x3��kx�6p�����a{����r^4}*��
���_���`��"�H-o���gcl�j�#x��3X�Ox�Nv�f�N����y7�0�tf=��rS������b��uv����k��5�s��9�C�2�>1��!o$��?2����?TZ�}�� �H�`D�6;��34�l���{"����j��^]{�4C�B��!�B���fg��Jn��*��U���L�/�o/�!��eq��?G���	���LZ�$8�u�n��N�1P"Dx�u�O
�nI����K18�cm�6�S;��Z��%��\�:��87�����J�^����r�C��������O5���Y/v�A`��t���I�%`����)��'dV�I[�%ot���V�I[�%ot���V�I[�%ot���V�I[�%ot���V�I[�%ot���V�I[�%ot���V�I[�%ot��_�Z-�%6��f4G��f���_@������-�1"D-[l�;��jv���i��a��lD�!��|��G��v�^�����y�QT��TG��,������:,L�&�fr*0�Gf��4��qxU�K��j���������	�I;.�"a=���08�@/�������b�R����9�2kD�A�He�m�1E���Pl1 o����:46���(�0C�����m�7>e��V�Q[�Eou��V�QX��V�Q[�Eou��V�Q[�E:�c���V�Q[�Eou��V�Q@M�|�ou��V�Q[�Eou��Sov?1[�Eou��V�Q[�Eou���ou��V�Q[�Eou��S����V�Q[�Eou��V�Q[�Eou�_�=���x���a2�m-��wy�km�/�~~KN|[�B����4`����{� �0�~���Y`����\fe��bR�Fx`St71��������I�����C���	���<��>c`���Y�ejD�B�a�V�F�v��"�W����3C��m���R�F�2l��o�P�������L�,'
��|�����@���6Y{�3x���QY���kyw�N^Eh�9�h�7b����B:C�Om�SZL&]
��.��/
�����:'�9�����T���>#>=��^�	�M�J�r
h����av.&�&s�5e�[N��K&��<X�n�Z%����&g�KF\Bm���)�3�4���dY2���5�@`;�b�#�q�6z�Z��4C����f>�1�M�/�/��?���o��,,���;U�	m;g����+!1a�AQ���q�� ��0@�P`��?!��qa,�#�A��S�U@N�TM��L���@	����<��L�7a!��@,v��av8��)����H��n�4	�rNe)9"������l+a[
�V�����l+a[
�V�����l+a[
�V�����l+a[
�V�����l+a[
�V�����l+a[
�V�������KK�����-/��:_��������<���w��&�����A�FuA*8 2����Y����t�������vWN���];+�et����02`xWN���];+�et�������vWN���];+�et�������vWN���];+�et�������vWN���];+�et�������vWN���];+�et���������`b�����sD���`(B���9��x1Y20�_��Btl_�}/��:_���s*��	���>�/��:_��@�V��,�0����~�d����FlA��: ��A������T���N9�xcOM/����ix����.~~�3A��~G�K��m��QGA�@2[�D���c�H:�%��p�1`�H�BN!*��'2���Ka[
�V�����l#��2!���CG�&�����l+a[
�V�����l+a[
�V�����l+a[
�V�����l+aO�$��DG]������D6D+a[
�V�������0�T)�o�����z�w�L�j&'�@�=�_��t��@���1��	R0"f����3�[���`��������vE(�G����IAgV���~�a!���Vx!pBcbC5�+�������~���b�
�@�L�51����Z+���]���>U�h`[o1WN���];��ut�������wN.M.���d
���];��ut�������wWN���];��ut�������wWN���];��ut�������wWN�)�a���E��w
qX��~���A�*�>F�3c��n���X$�oQ�.�*��];��ut����2�1��`a8�K���2�Oq��)���zi~?	vo�@�$�����dk���_MfC�dNe��jR0>�/������^�G6�I��L"�����'��(��3}��~��]!��"o(�Nl_�OK��q���4��N�`�0<�f/�d������d�����6���oVh(JLA�%����=�on���� f�]� H�u!����t����D���p`� :~������5�lM
�~r 1�4�T�B���E�H3`k�&�~�3�z|��H�ts���7����!�����012x�P|�w"�����n�T����p
��|�&O�E=�����ba����Ln�P��l:�)z2�uD��v���Nh�r(0H�����l+a[
�N9���sH2V�����l+a[
�V�����l+a[
�V��E3N&Fv���I�|�a!�/����l �s�"�s!c��J;20=rX,X�
�g1(g�������-�l+a[K��;�d\s�������t�1��i�Z����c�h�l�1.f�!��A�s�$H\��D�&b!�=����rd�l�`D�] l�`D�]`�p�d�������h��zk@�QH���.���M���#!k�U��������LL�����&�1"�!���,"������g�����K��5o]������=
�	���s�ncZ��VT���y�47���u�����"Jt�`Z�^����������=��#/�C:3g����>�/���U�_�_j���	�X,<(e���-y�`���3E�0,Z`�Fi��g��)*��`�X�h��v���H�.`b;�F
��Hc�(���$1��5�Sp�A�\8�����~?W����b
����>vGe�T��|�$xC���0�F��U&-���	��w�Y&H���D.��F^��`
�BS�3�N�{Hs���_�a���B�
�+��3�t�2� � 	�2`$A�@Zd�I�k����D��tx�K��;�Q\����"��0	,`sAs���09��Z�HqDFLh;����@#��83�uV�8��}3
*�qQ�����"Py������#�Q�5:X��@���E��L0NI�&Xp����6��b��g|"��p�lf��Xu9�d������P�%���~y�9�����G��_<�G�BX���NTAZH��,��D�w���U���wWN���];��t�,�3A�0c���ut��,F��t�������wWN���];��ut�������wB\#'�DG�fd��.M�hH�"(��m�M��wQ���:vt���Vj���i���'�3�/�Fa��(*C��WN���	�70��>��w�_�
Zm���o�����!�����];��ut�����Fe�;.I�rN��vDNJ`q�bratGs�$����4���B(^s:0��E�����h�u@��H��blDu���L ��30r������2������}�
��A�6���g��Hz�{P�hBg�����t=�3�^?hXA��.�-Ctm$a�dM9�W�4	�|���]\U���$�'li)�1S.�T�[�`PV"�{t��
�,��9�O4���Hb��'�D\`�"4nb�	�G�� ���A�Q�C�C����d���A���3aa�S�f4@L�������{�r�F)�=�u�vF@�`$!�ADd2�FP�$���C�D�"�J)(!�!u�%b�`�P6�P~,K8��>��0�p:\W��ua/�(�pn����S��=�`@�!��q`	�G�8�vRFY�9���X�:�t���Y����(��m(��`��P�6�]���x�����<E3����L��C��a`\':�bB��!����������M!��z�i�X�'q�F7\|�u���hXp@�A1H8��8S�1�1�L�t	����D�n0,�,����2�C�/e��59�n�c��=B��O��<���!2�B��f�$w+����`�H>u
�{Z_�aQ�D��A��nt�b�����l�X�rrM����#�r=�kTEG��$��7���@���p�L�� Z�@M�\h��H?��1s`����h���@R�d������3����F#��-/��EO���>
"3�y�1�&2��QS����D$$�P�-H!]�g�x�k�!��P�����AB&0��J������"ba?1B
0y�S��@f ����5(f)�@����;��>
&�O��@s�����J�(���S����Dd.DV�`S� �2`�a���)�i�]����QS����EO��l*|T�(��QS����~fl@f�C�*���8L���$C�	W/�Q4"���1.[�����t�m�y���_�k�H� @�{$.|����'�Za
�`�����A��K��� ��������@XnD����Z; P	�`t����A8 :^;vA&v���9!P�����.w���"dt�b�)R>� ��4
�q� ��!���RP��N��~=�!HEr��(\�su`"s�E�V��!�w�!��[��	�@�t[���D�E#�����
���O��?��!	ddP^qH� �"�)8,.	2�S�O�&�c
Q���o�T!�`@���a$#� b,D�w,,0rn��� �����.U�|t�7-)���2�9�A�q��>'
J;�x*T~��2A(��s���-1�������y�|�����,�c���@3B����
	��pFi
�L���� �	�PJ���5]�Oh��V*'/�`�r�(�T�|(@|�M�$Xh���Pw��~=��oV�oV�i�����Qp��y�q��IvE�6���g�%�4�01��b�>L��Yb��T�'�@��	����H�	����H��NK�'��m?�".�!�2����3�����3�ba�,X�,T���� �1�����@h�Y��/������� I�Q9�	�&�E���
�0�IHa�?i�R&�'&�	���E>�bbtt�!~~�h�e�q������3��:����NA����12-���Db��7J
��E L��
/�������F1<���'�j
��m��}4���yD������'/&L#�P���1�*)�q>��0�,�"��|f��������OT�
[��(��c:�Xb��dhNP����Gs�r�3X�����52L�#f"�A������Fhhp�%�������R�>�F"y<��E�0m��X`�en���Z�!�����1QI�z#�0B(�1�1G��2���H4�!�K����K��0���F(H���n��	����|P1%��b$�>���f6�i�0vV����e�D�0xP0#V��#�l��EA��j�[+#edl���8�D`���+v�n����b�lV����[�+v�n����b�lV����i~?���z1NJ�1����b z���>�:}����w�E�|zu����w�|�s�rU����X�(}�C=�k�hy���.b��:H�}����$�D��+b&g�O��������#"dL0~W��a����@�((dpG����`,Y����`2?M�c��IP'9]<pp��Z�`j��82n���]+�etl��sK����J)�I�J��;Q�!%	'�Tu$�|���6���kA�(x~���P��	��=�!�<������OD(�64p���D	��0�u(��M�6��/��9� ��44�431v�(N$OR�$*'��c�B(��e��=�� B9j����o�/p���iH�!#�O��)�c����B�J��@������,����;=.��N@i�:�OTt���\���5�{�q���y$�;�O���>
*|T�(��QS����EO���>
*|T�(��QS����EO���>
*|T�(��QS����EO���>
-/��Ld:`����`�<.��j��0����$�Sip8��]D|���J��������\���O����#�������f0���� ��	&�i~?�+����
Pg��D2J g��	"A&O�l�.@�`��.@�_���8�����#Q�]O���"L��`�\���������@A`6�Nb(8�3
��'�-/��J,�(�O"B��2�(���F��U�*��UW����U_���P�L���W����U_���U~
��C#0:��U~
��U�*��UW��#,?U~
��U�*��UW����T01��j��UW����U_���U~
���s���U_���U~
��U�*�1�D\U�*��UW����U_���U~
�/����R�4A�9����38�e�vs%E.��H�V�,7������'�`�P����DS�����{e�c�cG���8���D��3�r3g��P�D"���.�A��/�8j�����^�|ABA���Hp��`"��P!��(�a0����[���kQ���w��b8IxK�v�oa���0�4G{�1�D&����� ����Y�����"��D���I"���'��M�
 ��L$$�������Q3.HPH1`Hl�w�1�#D��;��;1.H�$��@�p�B@#'"S��i�(��G�#�."�B��9P�nh���0���a��pu��Ry�,VF(���T��{1l<G��a�����9�q<n��vD���I"���[���<��<��<��<��<��<��<��<��<��<��<2C0�0�0�0�0�0�<��<��<��<����<��<��<��<��<��<S�<��<��<��2C0��0�0�0�0�0�<��<��<��<����<c�<��<��<��<��<S�<��<��<����0�p�0�0�0�v:�<��<��<��<����<�<��<��<��+��<S�<��<��<����0���0�0�1��t�<��<��<��<3���;��<��<��9���g<S�� O4��<��<��0��p�0�0������<�-�C�<��<������<��9���rF�w�<S�p�N0��<��8�O<�J|��<����*t��8��<��,p�s�<��$0��$�0�z�[���a�S�}�@ 0<��<�O<S�{�	�V�$}�8�����<�<<<��<�C$0������c��	�S����8��0��<�O*�{�C�4�����r�o��<������<����S,j�����Z��0�0S�<��<��<��<�@�����<�3���<��<��<��<��<��<�3�����[���0�0�0S�<��<��<��s�I��_1�<��<��<��<��<��<��<��<��=�<��<��<��<��<��<S�<��<��<��<��0�0�0�0�0�0�<��<��<��<������<��<��<2O8��$��<��<��<��<��<��<��<��<��<��<��<��<��<��<��*!1AaQq�� @����0�Pp���?����u�|������dh'�����v�?�`�2;�~�N5��Kj�k�v�?����
&%�%��u�v����j�$���b�,��)�>0�hM`�=����U`fy�GB�KK�l�.r�U�u�9��Bs���E��6M�gd�_"f�>N�1A�V��,YD�����������q���-\~!����ND��3<�����]{x��C;BcI�1t��Q�Z�c�W�C��NP���&21�������/�P��S+�oa�����5��.��(-���/H5�V�_�,�K��=H:U[����M~0L���g�.������@��������C4�`�����|
���(f�	R"m1��L�AOr,B[Ea�������2��?�w�����f<0�e�+w���	�G�����?�	@
�*����X{�r���c@T��2��!��d������z�<�z����"c��7�*�Sg��y��A5�^�Ip�g�Pq����7S	��+R����=X5/��p]>9�f���X`��j�]CC��:��o�7�A�C�P�ax�`8�PV"_�8b��?�1�P���^��>������8N�^���.qw�P��A�?�zr������$io�����r~��O�yU�F��&�r�9�w���W����h�E����2��
-^��b6�-�S	k���A5�����xP����J�VT���{J&I����=�O�H.��^�|���H ����
�]p��3��5�S�_� �|�������+!1AQaq�� @���0����Pp��?���\��������z6��F��^`�i�jz����O�<.T����iR�����	c�����-������=���[�~�}����
���i�2>���?�'�� �*��k���6�c)�h�(��u��2u�!�q���w�i��4\�_��E��sMche�9��h������f<Z�oY���z�����;>������MZ��]�+���x�Y�o�9�^���F&�0}�mli�����/Rgq��\a��w����x���/�N.�����`WH?�����(��X��N|�r����p��r��\��U[b������N;N��A]3	o���0=]��������t
��EM��Rdw����=���l�Bq	m���������C"�ck�xI��.�m�g
VQ���A���:���.���]���8����ch��YF5z��&�7��[�/zs��]�7�8w"mA=u��^�����k�6��|������������H3��&�t��	o�Bqk�(��M>p1�������@��t�"��o���KM�{�~`�/��l=Kn����w�%���K�t���g������V�R�q�����9
����2��uG��a��T���1�*�+����h�7�2s8���+�,\���-����w����s�a�p�>��%Eh�����9���e�ni�D2]Ba.�+�/�>j�w��9��l�O�6��b��&p��ln�7X�o�s�?����%�X��8����B/��MZ�&�:if8lc�Wl������c(�~^�N����F�a1/~n�^�p3�>�!0�a����s��bW�7-������U=�\
�����\�����rEo�,��>%�E���[��e(������+!1AQa��� q��0@P����`��?����A��zY�F�
�F$�X`����#"�4Z8��`QF��6dM"��!f�����

���i��"@@nH��&(�-��v��8qa��`�Z�c�2�$�%�K�@��aT*���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�v]����?%�v~����e�����]�g���I��Gz�2�fQf����O�b�D��`M��y�����m�������3H���F����O�*T�R�J�*Uu�n0�w����*T�R�J�*T�R�J�*T�R�J�*T�R�J�*T�^����"������W(CB_HQ�U���E���3R��Ab�����7������e����4d�/[?���?�v}+7 r�z|��� �1@�<��g�{.���������(��m������>�-*e�hN���S"L�0Sn�����:�px���#��0+&p_����\3<���~�+-A�Jh�~�������i��P�!����>��=�@XPP�a�i:l[D�����c����#s� 	�z���B	���/y���P�
����]+��Wet���]���+�vWJ
�Q��Jl��1��bL��vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet���]���+�vWJ����]+��Wet�h :��BaH��3�Wet��XrL��
a�H!2w�]���+�vWJ����v~���*W���.����Y�u���
,�i2��o}"h�&��� �Z���|��g�{.���7C���w��y�b	b]�{�V��*�j������3���I0 b'��0y;.������v�%��2���&9�w�*?Q�t����?����!�sC���/�/G���=(
+)��+v��B�6�J�d�P���:�~����PLY/O��6	D����B{�ua�I��'{������z9�cZSt�������5�2�J�*T�X
�v������2����J�*T�R�J�*T�R�J�*T�R����|���$$��E���e������j��U�S �\���X�������MP���HX%Lz��������<�J�+�v}�w�$�Dl��.�s4��L�H@A��4��Z�x	�N�%����~�g�2�;:��~���PSn�����z�"��
��/�\�M�z����:*�bB�y'm��&#����0Y���_j&	��<��g�H����1UQC��*Kx#�-I�PnEi�M����I ��|�;���C��m�������H��fA�&{��e��o7<?0����6\?�z�@Y�S�����.G�Q����c��07������K+�!�(\��E�}�"�.-]��paNg>������.����pH�����,��'B!*X�t�J��Z�v}$�=ho�StL%��I-�Ch0U�]�>���,F��'��~������<�}��\�p@��
�.�l|���|�*X�mZh�[����j{)�l��AQ����*�OxF��P�����V��F�����wid�
�3������#�R�rk
�~�.�D$�)�8A�(Oq�
�@x���)
�"�&&����!i������d��I�*$��7�9V5��N8���I �&�q��%�I���fy�AzRw�
Q0q���]���+�vWZ����]k����F�O@6po]�����R�;Weu���]���+�vWZ����]k���eu���]���+�vWZ����]k���eu���@�Z�V,i�	
� �I�Ep2�����8��+�1�D�3`�L���k����'����T*���,z)[D	`v
/K�&	�.�f(�e���9��B)���8�i0������]���?�/��eu���]���+�v]�E�e�/(a�h�	����M�H�mH��#�5�U�a1��a�k��l*��0��`�Y�U�FY��"I�I��%�����RN6L@^��'�	 /k��m�d�Av����>����Q��x|o��D%])��J���7'n*������M),It��(+��l���������������O�L�������o�����';����� �3R��@�3<*1�Z�(��T�����y{.���]�A�ij1�_��G{�Z-I���)������i�T��M�S~B�5�r\�-���<�	0x��Gr�Kc���)��7R��������'_	J����S�DP$`������-���(�`�!��b �{:TS��r�4n@�o�����#2����DfO'e��@U���W�*�_��AT�@��!���L����,R�������8)y�/ !I3��Mv��@���"U�N����F%L�AH$�d�D�i�����ijxxg���=�Z�"Q
��%�q�H�do3*1��m�;.����(G�����>��[+)J�O�B~��V���R|J��JQ&5�?��0fqn����u7J)"�z/Z���(��aNL4�D�H����������\�C=��	qd<I�^��P�.:T�	�>��iH�"	���y;.�!� ����uq���������B4���9(�T<@��`�J,U$�X3��H�Kj���,��h:S����KSSp�^AHl�`l�.P`�5����4�Mk{4�,3��9�0Ce�\>��G?fJ��P��DX$����(�(P��.^��a�"�y27���<�"��8�y����
���V!pN��V&=H?�R�l�'�A����R��5c�H���8	Z����o��n(�����Qp@.��YF+av�R�0BDOs.g���=)�Q����(���ks��AY��h�\+�:y{.���|���N[6��B����X/o���$�.����h����$��i{333�����j3����������W�)�;jk��*T�R�]���4c�^��*T�t���n���T�R�J�*T�R��� �Jl���4$;���]!l���LR�6&���K.M�������L�E�������*VU�e�*��9�[���s�!�db����7LI�����d\e��7��$0J"�3�@�"A���<IL�2�BP.QI�tP��"T�^����CL����M4�;�F���
J��:oSC,8d�p�3N�!�X�`����[$�P��-X�x�P��J�iQh	�	<[�"(=x��M�����4/
.���
��C����Jl���-E����g���/n�F9�}>�
�@�X��P�'��tt')�\�?f�����^����qPs�^�}��vZ�=
�V�,���5w�	z���f�sP�A�����{�!� �|����KFH�~)��Dv%
���)b�	,q���}��j���^�����]B�,o���2L"B��@���,f��\���\�GP��!4��3k�I&�4��QAE%anS��`�G�ZB�5�,�4��	DE���M-��~�e����5�q=����=<�+�K��F',��
�H�J�" I$?�*kd��^	�<m�\S��D��q�b6�I~#1�j��vnHr�����~T� �B�b�(U7z=������WOjh
(SI�*� ��3�m"�T5�d�����
BLY�����7C��N �3K�j�J�N"���,@��ja�f�,�X�	Zt2v%�T��]�D@�����U@��]T�
�J5�H� ����%s1@� l���fn��S��c����9��^b��2�m�Y��K;R�m|���AIe����vy�K���4�A���gi�&5���5z4��t�	7����F�%���*���U�������'�����jY�fZ�l��8)����]�h7$��TI8��Sb
��&��hr�C�
���A����j<Rb@"}�&�(�g���<J�t��C�A�:`��^��o_R�t�Q���3��g�*�B"`�s�$��W���E�{�
���G����E�����<��O��&s�(��t�������`:L�Iab�&Q�$��G"�s�@RX`���m��"d7$����@�U�R�JE�\�W)����[2yl@f)������QoHk!J�~�q;��4���\R��Fm
&�q��%�I���fy�AzV�l^�����yjT�Q�4g�E�p(��<�*T*�J����
��_i���t���$!����@Y�f�#Z.��NS	�1�3\�-�:�R�wcp�0��C���
���H��11��R&����
��e!��2�B�������E�!��"A�1�@�;I�.��m�jT3e��p�6_�%�( &d�����Qq�C�����J��(A��@�)���
=`cV��TH*��2�f��fH�k��Z�*T^,�$�h�Z�*T��>��h��q0����Lx�yi fk�l�v�D�'��N�~��(#����O]���q�
`���
���L�$m����"n����%`��:�[c�Rh�D>�<��
�XW��$ 	�P������9���H���N"�<:���+aI]��hL��V
s�&�bPg���h��P�/��V(�Yn�q@+�E�p�� L ^/(��J�g�J1
�����6�z.7����i���c��+�
Y����8|&���;�����5t�)k�OjWA`"�D�
B��T��2�P%`(�d�O����<��
�q���q���q���q���0���&��IR���y��I�x����L:M��+�)nn�2�@�� 
����:i2n��y���TS��d�(m��R@�Tn:46�O) H*7�ulX0CH�B�9�,P��%"7�����2\L�[h{s�����b(X�:���
�D�?�r�mKP\b'���<�|%��6���_I�q�>)��&�CeQ6A��j�V��zL�7&��E�2i�|]c��Z�)F���RCa��<.�Cz�&-t2���>�O��uQK��I,X�\����
�M�U>	����SMGp�@�9��Iaq�L��>K�-��|GCT�A�`�X�&qED�_0M��#+���w�
%�����),f��%���P�$H\�A�l���bP�����3L�Y��u��{.�"2�W�Z�_�k�-4����%n!jJJ9�������s��Y�	h(�?�=JL�,E��"�$h���*���il�������
�I�$ �E��������E%�slHf�tk�p�uJ��}�>�-��H����I���Wq<me"-E`C.	+�"����5��}?G���%�h���O2�?j, S<��o�O�����B���j�g����
]u[���)���������J?�p
��v�i���Ip�F�We%�W/-����Jad3��
�sl�(��HI,�pML�.���� �����A��� ����Zt��=�2�)�������4Ie9QPQk��")[	�r�F4��LT��"�H�dH1i��R*C��>�����fM�`3+`�l�1����AE��C��[\��xv]�7C�����U7}'<8�{�.h���m+���������z� �'�����)%FLK��U�=�*������,6���t�R�@G?�z��JV�=���XhX��T����
uZFp��`����x�O	M�@uF])�P4@9��@�"H�������{b�D����@E&�SO)*�N*-�~>������6p��/�Na-J\KZ���P,�D�^�
�r�o,�/zVb?)������� AHBG@����mL���C����(�[
����H#Z ��
�4
\U�*2�l�G��U��Y0��Q���Dm2���O�	D���i�!!��1���	Z`R�0>�
$�(�0/z�gZ(�����"����b�aY�I0>��P&%����w*�In�g�FM���^��~8� �T#)0jb����x���!{(c�G<S� @� @����=�g�N���:-�.�B��S�W���E?�0�=����C$9s�*=R_o�wP�J]�C?X�a�X����R9c0~�����o��aei/+c	���J��%XoA���hQM����g�Rhy$��r�������+����h��@@i��g�{.��@`����)F!4JW���VXZ$K�r���� !I2
n�����H�����%�"K~��������Pzz\_�+TaLAHZ�!�2�>`/�d��Q$�/�+Yf~.0��e.��i�}�g�{.��\|`�G��%
� ���"��*��9e��~���q�c��Xf�Yzl�)�������?���39���������&��)%���&s&�
hf5���@`G�4^H��Tp�`������F�(��4���R��^����5��;.��WC�Z
��Pw�M�$X���n&�������Y��#!�HX��[	�{�S�pXS�Y?��I���B[�	/�gz��
�YZq>A��J��L���J�K
!��OZ
�Q�@X�b����_������H��{Sd��)p&
@�h!{���(���	
�wP>���}���.�� ��J��X��X\�_x�R�J�*T�R�J�*T�R�J�*T�R�e�����R%#�T����'R�0
�T��w�Y�$�-�,��ay�]�/O����Kk� _��CVP0
xvf�E��{\0�����<k��A����%;��A�7��d�_4aJF$T��`W	�PDAP����.���2�[
x��!���e�oR��:���n�>��n's���������m�_��IxiC!&�S��~���h����I�? d}F����U�t�*M���T��s�oe������%}�p�8�%�%�.�_� �����
d	��~K����KN�,/�����YX^������)�:�	�A���R�J��� 0�z�*T�Q��	�K��R�J�Y,j~O��J�*T<�o�y�T�R�C�^H�=J�*T�u�~��J�*T��v��)hc��A�h[��M X��aq����Y�1r������Z/�"��>QNu%Y��51c�# q!�aR2����u�*�j�"�2,�&�����B��Z�[��M X��^
��$d&��)V����M���?!�	$c0��,K�fm�"ZXj�X�B�T>lBu����H�K�:��|�
;�(A� �o��91P��X��OG$��A�����0@�%��v4A7��p��0 (�`�%��f��*9K�o�:i4X�b[����+
7����%,]\c�f",�!zp0"�xi�-�`�K�t�n���M�2P=�04��(	$�E��	������B\��������C#��"\J4|YT��U�-���	`�%��B�`4�M�����9���h��a�����)8P=�04�t�A�$�?4^��1����JZr2�&�(����Yn���W�3�L%�K�b��Q4 �G�d�QSC]	r��.��o��
2reg_extended.jpgimage/jpegDownload
����JFIFhh��C		

 $.' ",#(7),01444'9=82<.342��C			

2!!22222222222222222222222222222222222222222222222222��t�"���������q�q�N��c��������c�l�����PMA5�PMA5�PMA5�PMA5�PMA5�PMA5�/2�N�(8�3����:L���`�j�}�3��Mh;�N�(?<����Tr�^����PMA5�PMA5�PMA5�PMA5�S/9��S5\�PMA5�/2�N�('���c�k�z9�pw0N�(�2�rg��Y�����k��+�9�?�w p������=3��N��biLQ1D�LQ1D�LQ1D�LS.l�L�������|�|�4�[n�h��Q1D�LRo����\'I�����~�%�|�~��.�6�zs��Fw4<�N��"�:L����*�,�*�-��g��s}>	�5��>{�'I�3O����tr��y;z�m
p���<�=���g��&Ph����N�y=[��X�~���X����(����Jt�@�n,;9F>��>8���]}�����_�z�3��3�9��2�|K���~�{��'�;�����ls2��i��.��)������0���j	�&���j	�&�s�
{YL~���M�V�������r�����Y#��y=~���h�&���������������j	�^e�n��]=������p��c�>�����8R�_'��t�@�O>�~����_-��7u5:d��y����[���|9�����\����mn���~�:L���w<7q<���eU�����g�)��&r��jr#��w�	�e�����4p�s�i�����o�z�����4D2�trpo�����y�;��Td`�mS-��p6�N�(��_�y2�fo9����;�
}s���9��W����{�W��g����0���j	�&��/�v1����w����V�%��_.Ud�o�e����e����W�~�f��ncY4��?{�����j	�&����@~g��fw�Y����������p}�2s^�{'39�s�����ri��S||����-���).��O���No�n{n�������Q���������Y���rJ��f��xN��E����wGWDqc�'�O�t�@
��Mm�m��}���������)��a��������B&��c�����2�H�y���<�D�O�&��O��\~��C���m�o����_���0����:G��<����~m6�D��&(��b��&(��b��&(��b�|����\'I_x��C��k��}6W�+�����2KF�zz^=�������\�y�2�g��*�����[�fK�O��;�_�)��y��������z3���>1��_��QS�[��S�w���g�k���&P}�<=!/�	�&���	�&���s�&���j	�@��j	�'=j	�&���	�&���r:?��6�a�?L}��@���������o��k�?Nw����W���b���'��{���}V.m18�;<�_U��?��,03 @P!"#$1`42����w��i��!;�q'��C�>o��}��	p�=s�c�=��Lk/��!�������������������������������������������������D�D?�����������_Lk���]��/����$E��8AG��_����Z�����-�/���J�m��{[�4��"i��z��L�n<�f�6Y���l�e�,�f�6Y���l�e�,�f�6Y���l�e�,�f�6Y���l�e�,�f����C���6Y���l�e�,�f�6Y��r��m�
-�!���!�U������ PH��?�����"�G+e���������B1i_36����4%B��?���D?�! (��|��B>�8��F���� ��a��D?�5�g9L�������kd�Z������+��*��U���|�X�Pk�
�)4"k��JZp�w��.�51,!�c���?��>J��5���[Mm5���[Mm5���[Mm5���[Mm5���[Mm5���[Mm5�������Mm5���V���"+z���ki�����ki�����ki�����y~h�!<!(BP��?��b5�Z��/e�O	����/��x��W�,��Y_����C��$��i��gc�h��7��_ ��!\,�]�15�_%:"�i� FZ��J�{{�{%L��������w��Z!�������X'��:����0�u��=��g�n��I�Q�a��{$,��L��)��9XL���c�|c��@%�z��*q#^#3E���/�[K�;6��/�m�:.���2�L��Jca&�^j��`���7�c�|R�yH��)F.�[�������8��)��*��?��q�x��{�$;3�C�|2����K��j����HA!��lU!X��1�P�����VqY�g`�J��uf$z�2�NyJ0j�ab3)�dV�m
�����`��L��"��J���=�=s�h����gP�q��X2���e�,`�X2���e�,`�V:�
Y�qf��d ��Y�gWKe�,�P��j����MOR*�(OX2���e�,b�<s|`�X2���e��������BV>�����Ac��
����*f��L�WFx�a�������ZS���DUTO|��GB@P���
&��}�I���!��J�V��/}��Qg���,�`F�gy'vf>�K��B��[��Y������%�	J�
�H�qV{O��M��Ze��u�]i���4�ijT��V���&��QF ���\}T,���m�i���?~��V��%b���kZ��\U��d5�V�������R�Qu�or��Q�E�ZM.��{3�p�O��%*@�qL.�GH](xN<1�<;��w�q�*�8���#�����Fm���Vn���
$qvf�{gbD�)<i��U/c�,����h�{����z����0���x[���A�����\�����4���gQ��KM�dc����j
��?�����'fe�(�a����Wu�9&[��_ab���/�Z_���i[i1{o=��i������H�V8��NJ���2�F/t�����,�FF
��2���6Y���l�e�,�f�6Y����2l��"�a3X�gX!�;Z���U�p�(��e�.H�{NC:���)�����6��$Y9���h��l�e�,�f�6Y���l�e�,�f�}��a���Z�4��]�%���r=*z.T:�b���VO)��x�
��*q
#�$b���G	S�����2s��O%;�
���*��oGvd�wL't�����w����������c~��N��H��U+���X�gIl��-����{����`�gn�����?f����������D��0d���n����rB)��x'�]-�U�W�Er<�t���/�����
�dG������`��RCY���d.p���)�1� [���;q�c�=.IJ� �#�v-Y�j�����G#Y��V��RhD�.�P��f�����B�VB<h���m�"F+)�0[�����������Z��n���u����BDPw6�X�-A����Wh������#��Qc����C�{�V���+��k*�l�Dc@�M����hE��U���D�W�"����_����T"�����f��2���
������)!��X�����-�E4~[�~ap��-}Cuq�>��P��y�����ky�f�����^�D/%~��V�2�e��5#�f���q\��q��E����ky�����ky�����ky�����ky�����ky�����ky�����ky���^_�(��
��(BP���~&�I�l�)��Q�
A��@o��8M��	��t�%���rL8��?������x�!��Uk�L,Y��_N�x��a'{GC���$��}�~�����wh��N i�w�+#��;�}f��{*�&Ym����������wR�2<�W��j��z{(�������d-��;n����$�>(}S�~|��V�Bl!��Y��B=m2#Q������#�iJ��B2P�n8�`�X2����X2���e�,4\`�X2���d��pe�,`�X2��?�X2���e�,c��e�,`�X2������fZ�����L0��B �}����3���B �}�'!�.T�v-I�C�w���2,*��`a��:ok�k��g��b�Q���f�Q�v~yE@A�c�0`q�@�cP��0�����P��0�]�a��u$*�����2B�OaEF4�t���� �1�(�h�>�H>�B����61!Q"2@AP 03aq��#Rb��$S`p���?�+�(&�v�����{\�s���ny�K	u���$�s����i�s��6���R|��Bn�h�[<��v��'in02{�>h������b��f���)�7i�:_��������x��h���l!�����M�����������b�ca:��&3�����-�gF9���^5�����#[q����M����N��F����.�[��-a|���G�#�v�����<`���8��wyR���6`P1��b
��Q����������fss���]F!r}uU93B~h��;�a/��Zx��r�h�hX��V��	����5�t�6�4�2������g�3Z�����M�)���JF{��o�
�q��9=�c��ql��X�ii�Kw|JaE��j�Fk�����EPj���k5M� C���$��6������*os�y3�]��������	7s5(�$����hM�;�?�5V�z��"�����in��%J���x\��r��^�YD�PP�A=!)�Im��|��j}���5���+�5�DZ1,n�_��6!1 2P"3@aq#0AQR�Bb��`p�����?��sa�3:�� �rrm��1�G��ho���u�g�!����Zv�������T(��K����������,��u��F�Kf���r��h&7�b�uIr�:EP��p�2OXM��%����>p0:xb@0)|�M�x�%	�7I f`{�7��?�������8E�+0Qs�8��P��/�uN�yAMFs���������F{d5�	7n�K����,[^�����B��Fvi���N.�^`��MoO�%
i������*���(�f-�����{��Zux�sx���F{j��}���
�g��'I��P�5�LK+
,:a��~,cT��p�Z�B-5��M�g�Z�M{#�)�J|>���_���*��W�f�\�I����\���T��PYG v�/m��P�,2YR����r�Ca{�Io���������r
���O��E�K�8�E\�T���8�[T�~��ZuO�������S���4�3boca�Z*��T��mAa�Z�x���R]s���Ijd�C�%<��zoU�T�@��h���C
!12B"AQq�� #0@PRa��$4��3br��5`S��Cc���?���F&�aS9�d�� ����;��aj��N����aj��N���!���/n�����?��]����[f��5�`���F#�d>V��UK-3���R|=U,p��n?�B��Bx3%Z��7d]�`oE���X�`oE���X�`oE���X�`oE���X�`oE���X�`oE���X�`oE���X�`oE���X�`oE���X�`oE���X�`oE���X�`oE���B@������y{�r>�o/s��qS��������G�m��Z�d,���I���G#�6��$���M�������Yb���s�
�%��0���Ct�t��8�uP�:�;@�+�T7N�N�/P"Dx�un�����������f�+7iY�J��Vn��v������f�+7iY�J��Vn��v������f�+7iY�J��Vn��v������f�+7iY�J��Vn��v������f�*�f�%��b��f�+7iY�J��Vn��v������f�+7iY�J�w{a��(�F��1����\�s���+7���4
��zG>.��pA�$H����nF��G��F���r>�o/p������+}p�}��^�.q�
�Y�C���F�1�pO��E.{�oP9q����� �h-�/(5�@x+���� G���I�k{����G�m���A�k4�L�*@Hx5l�����{��
h
��G�������h��1���4uq�_I�:��|�����^��~���/�lx��@yF���Uc����G�c�E`!7���g�j��rg�K����3�����d�MM2��~ampX��boE��&�X��boE��&�X��boE��&�X��boE��&�X��boE��&�X��boE��&�X��boE{K��K[E�q{V&�X��boETH�������V��qA��k�boE��&�X��boE��&�X��boE��&�X��	�q������N�[$up���H����c'}"SGW
����y���{��k���p���h�Y�7-~�u�N����G���@����?��/�?��/�?��/�?��(�L,n��7Ud��J��1������=�����@��7Yk�Q��K�!�k�����Z�c�
��������37��#��x	q����+�"������+�U��?H�Yv��r>��Oa��(qL�r���iq��p�����s��)��b0��p�VO��������6�#y{EN�����
dWH-��Q�B6�x}#�<7�tHT��r�iQW��-hh�=H�}T�d�{�w�l��R��/�������_a���%�X����mL�����7���k�-v�v��-��nj��;Xj@H�D��x���W��x��=X�|.h2$^�4���/�9~i��0:���	��v�
@�iP#��d|{ X������>=�,Z"E���3v�Z��m��F��mV�*w\C\O)s�o+S�0��k]���N�	{�F��-l���L[����G�]�Z&TG��
cj4�#�_����:wn_����:wnL���M�S�7��T� ���?����H����"i�,��TBhh�������]"l���1�
h���#�� ��!��]���G|&��7�^�)�X�5p�:�Y?��f��^�'�Z5-n�.�}�Sq|��D�l�$�0(49��w��  4�:�m��@@iuv�����
�M�%��P��hc�M�Y��Q�[HMi� �	�����R|=U,p��n?�I��T���9���
$HZ������m�|K7qY����Vn��w�����f�+7qY����Vn��w�����f�+7qY����{�Ekt�~MR�+7qE�Z���Q�C����N����n��w(��xTT��+�5�tW41+�iT�U�{����w�����f�+7qY�������1Y����Vn��w�����f�(_q���\�Wy#�e�%���(��j��u��_U�\]���K��j�
�&��fl�'�"�&�y�\L��@�����]����X�X��������5��o��q ��9��;��Vv�����..7��-v���(�����
`�p��pk~h���Z��
��X�%l1��=D�d������5���	�'`�H��q����<^n_���<�6��jv��dC����Z�"�!�k������n�i����'�-����G��2���_�r����\�z.��@�-[
������
�c�>y��M���^��6�X������@s��������*�75#t8S����tp}��7������{����Y�2�eq$�j�����W���9�_ }{y{"�������~�J��_�[��_�Q	���q^�j�F�p�������l�U����%�I���X}@�}Sj��9�D#i��Rn�3�t�#3j��iv���������s%23%uBm�M�Z�~��h�nnT�cgU[n3?�zs]@b<M�w���������qZ��������q���4�3��^���"�-��U�q����@Hx�����)�u#���<O������`���@�Iq�Z��X]Tj��5���F�[SOqk��dY+f������]�D;	��9�����$���k�5^��.�f�!�'6�8�{yz����pU&'�h"�o�y�F9��\GH/@50~%TOJ�.�^���d�T��_���<������w������J|��O����"}�u��������h��K�7�|�%����"
�v�L�/���S��;R�S��i:C��
� �A��&��c!�!i$Cs�63�@�l�TM��#<��a9�T,	��f�t0��`w���t�c����g��%������?����@^Tx�
p�G�������hN
o����6�$��C�X"��	��M���f�+7iY�J��Vn��v������f�+7iY�J���Y�T�p����Uq	h��D:�h���TBk�f�2��~��~�T���R��%f�+7iV����J�O�zh�|$��Q�U�=��T���������l��f?�������Vn��v������f�+7iY�J��Vn��v�������;�y��>Z��)�g�T������,x�M�=��t:$�X(,l���~&.�PX$�_��lH����M�����_���o*�[���������
h
�j4q��l�r��&���wymR�&UQ\�T~K����~j�%�L
o*�`��j�z�^����!U�z�y3_]3�)Nk���}t�,�9���a��p�
+�7�$!V�$��SD�HkG��
 c�h-��#
 c�h-��'�-����0��$�:�
L�r��v�M��8�i8j�C�d�(QK���L�)�����{Z%z�����5�T�^�S
������|�o�
���9�G6��S7/7���Yk���Z��
�U�|��F�YS�"H|!l6�>[J������9���,!C|9k!�b����f����R���M����YJRQbJ��* .I���l44�sB-
�D��i��i����S@m�H�:,B���h3Q#	��hh�7�T�6�-yq'�b�Z�WW��)~�4'2QA��$�.J�����D��1��-���;�� ������7�����7*�lCz/��Z�p�}����9�yE�2z�A���'qZ�M���d���N����ar�H�\x&4��[V�(�}��^����k����Q
����T�+k
�/y�F�!6h�=P�I�Hy-��
�9����h���+����G�m��/+�cZN�p��9����������C!�B��f��� Ah��<��T�+Y����j��++�t���c�}&h��1���4uq�_I�����,��6�N���V����y6������I� ,��C��m��R������qVX�c����0Kk��5ml6������|2L�i�>-SL�v�XT��V�X[�aoU��V�CG`:��N�%?�G��c=�����5�7-V�\^��@��*��?�[T���P[C~%S����W7��j���V1�V�X[�aoU��V�X[�aoU��V�X[�aoU��V�X[�aoU��V�X[�aoU��V�X[�aoU��V�X[�aoU��P���Z�7�����\62w�%4up���H����c'}"^F�Dp�o��Pv�����)^q9j�
l_��[�E�
�����,Y(R��H���+��I�[NW{�r>�o$�(��t�X���F,w
c�;�-\Yy������r�"�5��m��������L��=��Yrs�:��c~K]���5F���^O�+j���(�}��A=j�J�Cq���&�=�n����#��(m��J-y�����Nq��I�[v)��c����&�L�a��\&����u�U��:M����������G#�����2P!�`�
U�V����L~�U+����w�����f�+7qY������f�+7qY����Vn��w���Z��"�w�����f�+7qY����S�u�Y����Vn��w�����f�(�������f�+7qY����Vn��w�w�Vn��w�����f�+7qY���w��thN��m6��e��[!���GR��rN|6:<WCe7Y;O�j!
!D�6�m�: ���]9�K�3pZD6�.�,:��Mj!
!D�6�m�: ���]9�c����B%�`tH.��9�|y�tQAt'�[��h��=��[�~�6���dQ�$��KC�!�N�*v��[5��g�b9��{�-���t��+�6���T8��4����gvd��_�����	��a2�m-��g��t��+F��F�.6\�Cs(�4�\7/4����gvd��L�Kd8Y�
^&,�w��a�i��9�{���{��T��0�U���L��$�=f/�{h�~*����y&g�"�Y��L���,�Si��Lm����<A�f�FC�U������*��w[`�<4��+���5�n�����fu_l��j���W�>3T�Ky�?/�_��?D�I���zGY�E�8��i��]3��g�j�	o;��-��+!1a�AQ��q���� 0@P��`��?!��qa,�&�� uFH��]��v#1(*&�F&bPQ ����k�b	��&h��PY@�#(����	O�,
�BS�A������h6�&���RrE6��l-����l-����l-����l-����l-����l-����l-����l-����l-����l-����l-����l!������~?B���>���;<~6c��_�����AW���,����~?�"�R��S,��~6c��_�����PY����	�Y�p	0Nh��I��O%�''�)p�(N$OR�$P�H���HTO	��2�QF#��-/�b���*v(����b���*v(����b���*v(����b���*v(����b���*v(����b���*v(����b���*v(����b���*v(����b���(`L��4G<K�T�QS�EN�;T�QS�EN�;T�QS�EN����#�A�
�*�j8��A������4�<.���N���0�PX	��D�����#(��T��&�l����~?��wM���f?���@	�* ��` ���1���G��f�~k���� bb���a �<x's��AE,�Y����}��p��������	�2��� �Pa\� �0�1�d���tG��,�	L%�Y���-���}�\eZ��
QL�
���@����(#�l�K6c������B32�s��#8�+���������������b����m���:����:����:����:����:����:����:����:����:����:����:����:����:�������D"�������:�����#�(H�xPD��{����:����:����:����:����:����:����G��&*`�2�O1��)��"�K���k���.�Y����0��Sc�����f�{J�� %�Ash������!J:Y��p
�b�M��7��17����`!��&�������v�f{�����\=���0}�J����'A�e�<2=���sP$���Y�bj��IG���C�w� �HtU"%�X���v�`�X)�B�^����&~\��=�!��������"b��(�
0��$�|6l��'�QA����72*jX`�8�EM@'����d��S�J01�S������
��qg�A�������S��fH'
P��1�;�dV�UnuV�Unu@@`8��CL��L���c�3D�f1�p�4AG�
��� dqp#����}Y(�<�FX�����0�@i�F���_�l��t��t
t�U�1�Jkj������n�bf%�D����J�$�q�`�1�};_��f���m��W�a�*wza�(|�����4:!D�jA ������������(�	e�"��XxP� 9�Z��u�|f�^`X.��/���1���RU
D����%����}$G01����$1��Q�db�f���)�8�i���.fS��g\
rp�Dp�5%?@�4�	O�,
�F��v���Nh���aW��W�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�U{U�UW�U^�S�#�wA��L�C$��b�0dh=-^A	�2�2U�UW�Td���A���=�5X�Q,��]':�b�����*����b����#�+����{U�UW�U^�U{U�UW�T��#�����U��p�3�t�2� � 	�2`$A�@Zd�I�k����D��tx�K����(�.``sAs���09�����K�F-Z�$8�#&4�|�����Rf�p�sc�2S ���*T`=�L��No(H��OD��8T1���EHF�_�1�'D,�/`^��<?k��[�g6����$��&
TM������� �5Sc�
��y�o��N�9������H��������3+���:+���:"'%08�190��#����a�Q��c��!/9�p�"����H�Dh�u@�����blDu���L ��30r������2����G�����q �??k��OH'�<��b�������`/h\c��\G�6�0���������+�'�rq0�n����$��<C��Y��Y)�s��i�  $���O>�$�1&((Dh����c�Agc
������(&
���1�����vf��X���h���19������`�J9�(O���(���!�D PS�!rD������K�P��������p2�q"H7��y�}���V�
r��1�6+0� �2B#;M�vL!������"\0�8�1��(�b���4I�sP��@���N����#G&h*t�D�q�;_���J���C�3�D��;��pj������1}MxDq�3�Es�������	4�d�B�xx#�d�O����
J�� ����vl������"~� ����7�:`1`s���6L,99&��	ci��9���"��ObI��Ec HG�8H&JK�-L ���.4b�Z�������J��
^���6P��l8�{y�L�l'4Q��2�K�����b���*v(����b���*v(����b��� ��RBw.�v@!��Xg�`Y������� �SH'�C�p���
�!vT�QS�DDvEt����3yF���D���e��xx��;Fq�P�c����D��� XQS�EN�;T�QS�EN�;T�QS�EN�;T�Qj*�b>vl@f�C�*���8L���$C��Rb�`o0wr�Uf %�f %�+���&%�=;_��xa�K������<�B��(p���������}@�*��9e����4��}�������`=�2#TF,���0�l��)��]
�WB���D�����LC�� C:/2�
1����
b)���G!Q	b�d�'�N��!	ddP^qH� �"�)8,.	2�S�O�&�c����G�7TT!�`@���a$#� b,D�w,,0rn+��A��]���!������0�b85���"����I(�p�������z�!�,�d1�#�sZ�K�� �FdO�������d�>��1� � ��m�o[z4���@�(�F�<�8��$�"�`��3����W����zb�>L��Yb���4	��#������DY$r����DY$r�A�%�����6���T!�2����3�����3�ba�,X�,T���� �1�����@h�Y��]������'��8 d�%�����NK|r\[tD)�sfVT�W��	�D�`���������.������0��l����-btm���J� "88�P�I��Ib�����p�~�-����B�����ri�8�?^���>������ef�
�X`�h�2R\Zz�x�\k��1 ]t�����(�s=g%�C�e!�g���x�	@gDpJa,�����Y���)(,�������d��Q�a!��
�9������DT�R� 	�'�A�J,�6��fe2���Qa�k�~��1�@���E2���3�0 qvW��geY��%�
����Y�G�o�t[�E�t[�E�tU: �����BP�F}��G��.yO��`x26(��,y����6r?�25��H�(&�����,p����n���n���n���n���n���n���n���n���n���n���n���n���n���n�&��v#��0Xv�d`�a<�F��d`�a.�����'���s)��?��@Vq�G������b]$���A@���x$���f���k+�����f�~?�Nc?Qh�&,�%�������"����r�&�>���R\�:��3�z ;C (�J�~�1�}�������%�(��`�"Y�����
N	�pBV�!F]D�*t�T���l���P� ��)�#�lt�fi��;#�3epC��h���Y��B���OU0"�9�&�vA!H.������p�dPTh�� b�SH��`(;��PR��S�S$��Y��(���<�x�	�<������J�^�U{U�UW�U^�U{C)2g^�U{U�UW�U^�U{U�U��`�
��*����b�����*����b�dfUW�U^�U{U�UW�U^�U{@e��U^�U{U�UW�U^�U{U�U�	�U{U�UW�U^�U{U�UW�T��#���J@�a�<�d������>~�e� W%���\��?�e���p����$9��PH�����p���8j������l�����&�����A�FuA*8 2����`��I\�
�<������!���gF��.CCD��� c�nn.��M�
���\�#;�#1�|�������H�"�E~���p�pqt�����@e�?����,A�$N�K��,�1>�r(H^aL0f�+0��D�O���l�c��`kN@�`��H���q4��L}� Hg��d9��E
�������	d��P{�`s�H!���]�F�Cqs�8�9{ql<G��'�q=�6y����]F�=Pw��vb\$�fI��b�x�R�g@`�bp�� �����z>��WD���I"���[����<��<��<��<��<��<��<��<��<��<����0�0�0�0�0�1�<��<��<��<��(��<��<��<��<��(��(��<��<��<��<��<��<��<��<��;��<��<��<��<��<p�0�0�0�0�5�0���<��<��<��<��<��<��<��<��<��<��<��<��<��<��(��<��<��<�� �����(��<��<��<���L0�0�0�:o
��0�O<��<��<��<��(��<��<��<��3w����(��� O4��<��<��<��<��<�/�I�-7�<��<�-�C�<��<��(��<��<���9��<��(��p�N0��<����0�0�|��NP�!�1�<��,p�s�<��(��<��?>)�[�?�?���(��}�@ 0<��<��<��<�����3WO?]�<��<�<<<��<�K0�1���)q��r�0������8��0��<��<��~o��1�Y��<��<��<������<��(��*^�c��{�<��<��(��<��<��<���L0�����;C0�0�0�O<��<��<��<��,W�^����<��<��<��(��<��<��<��<���R7~��<��<��<��<��<��<��<��<��;F����<��<��<��<��(��<��<��<��<��0�0�0�0�0�1�<��<��<��<����0S�8S�s�0S�0��0��<��<��<��<��<��<��<��<��<��<��<��<��<��<��+!1AQaq���@P��� �0`����?��M�H���z:�-�=$�r��d8���P�b���2��95��/�,�P�|O�5��������P<�9|�)���Z������}�Y��n�Q����y7��R�*��*
���O�C�}�a����;$u��jtF�
[�q�Q����|K\���A�"p+j�bye�+�iPZ�/�|���Sm��'��Yn��0�o��Sy����(��V(�*Z�R�W���x�U�J/:~u���<��/!�������:��O��x`���`�J������t)�yU�A��Z&�,.�]���\�Tx����*�V��z�'��Z���|���j?�|��r�G+^��p����n�8q���}���+���C��������s��r$��u�Q>IR���t�����mAl��xP��RL#�W��u��U��,}�@Kcz�;�V:�"J��E�2���~�M��"�����S�b�Z�3�C�@`�� ������%zI�K+��T�eji���%��`��*~���������G.��++�	E��
�5�������N7��)��y|��/{�Yz��Z�9\��W��H[��xE;*��EGOn��mXcbY2�a�Z����)���J1Gd����,!1AQa��@Pq��� 0����`p��?�V�4���n����0��!����Z%D��[_��rG��<��0�>
EMl	�����M�$	�����	,^66��4������g������M�OhN(����jnacu	��� �x.���2���e��<�By��xk�S�<�g�E�������L�a5�%m�PM�E�Bg��������N�m��$�w8�'L�2F{��%��`���D�����Q�D���#!�-# ��t��~�|nm@����*���h�9lC� L���Xk�sD�,"��Z[M��GsC�����`&{�hm:
��R2�����<�"����:.�&�F�D���<R����D{2�F�Hm�\F�ha:4L�[�1,�U?r���y��Ej71+ DG�5]��!��#S2s�	�����gA}��8a�>��>q(����TRd��2�1��f�Z�39&rH�L���Hez'�onJ��&y�+O>g�d�/�>���5a�� ���P�I5��������	2�U1����4A��a��>##��

0�<3Ag�?�Nd����|��;7It@p@�����A�&�|:*�h4��"��+!1AQaq����� 0@P�`�����?�'��V4
���s@XA ��GF���$[M�N8 �@���E���&$o{SfD�.��oo��������..�4��@�R���/O�2�$4!`�)oQj�8���0Y-Qj�8���0Y-H���lS���� Zv0�`����^���ezW��+�^���J�W�{+����^���ezW��+�^���J�W�{+����^���ezW��+�^���J�W�{+����^���ezW��+�^���J�W�{+����^���ezW��+�^���J�W�{+����^���ezW��+�^���J�W�e�K��?e��L����@�d���#��?�Kb��OGG�c�x!3#�y�Ye��]z��v
Z<
����-�V�[����ae��M3����y�J��94q�Z��:~��TC2��hE{gW�@������Fcq0&�dE��Q��L���qkT��HFC���kR�b������
C�u*T�R�J�*T�R�J�*T�R�J�*T�R�J�*)�����+�O���R�J�*T�R�9"O9B~��LE*a"I���_@w��X,��KT�(fSfiKT�(fSfiKT�(fSfi�f(���$�?e��$�[}������������.���D�U�sV/xl0�����$}�g�4���c����j�
>[����26,j�*�=,_rR �?G�v}��U�`���8���i�:PO�XI+��W�����QL��VJ�I�hpy�0�����[}��v�<hkM��e0px�@��k/d���x_�k]X�lC��Q�"��"�U���Y��&)��Q����
���zo����������I�������?~B�("���$u�n�a��p��-���;���s���$EQ�	a�kU�BK�
 8��o�����"b��}���<D�O���ts�����W�)�;jk�r�J�*T�R�J�*T�R�J�*T�R�A��#����������@!��J��3j�i��)fI��������u�y~�R�J�*T�R�J�@�C��z�D�:�Y&WZ	��[�OzpA-E���|�{�O�����A�I��!�p�h����q�l��l`��U�P^F��g4@@}�#���|�(���T��F(Z������:�6�8��$��K�a�D�����n7�@���jG2�90���=�g��l#n����x�a�pw�.eWl}�v��8�u�f���?6�~w�SX��r�1v�\yI8[l�DTwnI���u�"�� ���J�4���+����I �7���&�B$���!�	4*�.������F\}������h���=~6��ev5h3����9���V����=@�����'v���=�.W�8��pv>���T�\��`#)K��!�	9� ��Oq�
�@y��R�E�LL1K�-bB�#'7�L1�~����qI}�]���1i&�[�( ��p_s�<&��� �'�����C����
a/X����j�'��m���@�Za�Z@��C0�F@L}�xj$���6���j�0��]��XF#k����X�v��]��
�����������T
�WJ�D����n��'������6����m;�a�����Gt&���|��e��R��.����LZ���������*`�+\ H���$I�����7���2q�����jl��E�0B����]�h
�7[�;����0VuY����5k^WK�Q N�{/.��:������m�Yr�����0���������~��"���
��W�*�J
�����Zd,.�y`�B���������K��x�
I��k�Ht�e���p�����D0{*d�
A$%�$�O���g�KS��<5������T�@�.s��Dc#y�aQ�D+o�\$���YeP
`�Z��8qa��`�Z��8qa��`�Z�c�2�$�%�Y�xS�uX��u*T�R�J�*T�R�J�*F��o����K��gz��p|3a���r�^����,wb�`]d<����S�����Mu�<Y/*��>!����lv�@�<B%W����R�J��2�����
�*T�R��&O9V>S
A����������/�~��F�0 �g%*��P,�E���D�rP��mP�e��m@*vR\�`�jjb.B���W�R$X&�4�Mk{4�
1�Z���/E���NFD�����vW	�{.��������U�&�E4����F����n�������(.��:|3qCw��r����1wv�%�Z�2~��	�uT������pg�*�l�63��U/��	�f�|�_�Xk����g��0�o�������u��S�WJ�K1�YX9v0�����n���O�R���Jf��gv��4��J��#�I��r��`�u��t@�����}r/D4�	�*i���H�0��Xh`��a����42��L��i� _+L�P5d�ju(�V)$D� o���TZy�[�"(=x��M�����4/
.���
��C����Jl���-E��������ov���1����_��������>~p�+;!��Z����i2OL]�fII<�y�oq�<������G����+�����a������ ���5X����"��u���[6��0�4>C\��K�s0X�p�C��������$L�LK`�[QE�
���Z��'�	�p�KHZ&�%��f�A!"���CV�)�@��p/��]�X��,��'��V�kX1+������5 �8c��|��P�����
�E���V
R�a�D�����w$�sP����2��������!%{��Z����c�Z� ����>I����WJ�r���]_�" xA`P���zQ��D	�
�J5�H� ���J�b�!"(@�y!���)7��[��-���s
���Xd5L�)D�%c�v�0��.�,/,	%�Ft~��v}X��;�u����TK=�u�@���Y�cU6�v�l�{J�	���Wx�X����AV�\�zm��H�=�~*`@�(@5jx�U(�zy���B�1�S��(�|�����z�w��N���D2����Pu�#>��P	rg?)ES����.�`�&+�`�K��0�{%��9��������m�	�!�%L�E<���6���R-B���M^
��@�b1H�L_����zCY
W��{�`Uv�9�p��T+��i'�_q$�^�I8�v���$��OX���I �+a6/�~N
`�>�J�*T�R�J�0=�p�e|� �$�j�[��-mh9v-w����GFxX��*��w_�
)�@�QR��&���1����2L8a��Q�[(
l����������"��
'�KJ^��>4a �NP�$�r���T����7��iR{�A�W����C���*T�R�J�*T�$I�(O���.&��1i�����f�&��i$O�b|,������% �x�E�jB	��)�H��!�`�$�z��,�bH.��e���N�,u'N]:��5��cc����7�����4=]�w�>mR��d`+6�	��`�Y?,S=
�i���:�����o
er0\k����������n
Y.��Y����&�-/C�c�4%G��DN�}�vOs��s��s��s��0���&��IR���y��I�x����L:M��+�)nn�2�@�� 
����:i2n��{���TS��d�P��<�� ��thm��R@�Tn:4
��8��`��(:
��spX�`1�JDn[�Ag�
d��
�����kc\2�P�u�����*������O���>�d�������e��E���X
jd������9��Ir'�v�*�V���A�3.��h�>��x��r"h2�?OB�o��4HO�S�0�w�=���u�E�e�����
^��=)�{6���
��� ����������M�
`RV������L:��KAIIG8�u�����c����q2�\(,F����]t:�-��T1V�!������}I�@j�B�>��@ 5j���Ir\��]���=]R�ei�s���K��X��``����6��������qH�H����=?K�v}��$�iM�*`g�o�;����_���X��S����0c5yv��O_��
V���Hs����]�����h�����Y�eOB��3e;e�� s;��Nu��V�l�z�{�e��5�!����iNg(A�#��}�g���c�q��%����Z��l����g�"�[���D[��������S,��������x�n��u��!t]���
-��c�?��)�N��3�����]���0������D}�g�hD�l?e�Xi���R�N,��.�R\����-�����\��v��af��p`���l 5�����eR����H��I�h�m�����x�����N+�/�P������T	X+2���Lw��T}F;�w�*>����2>�����@��������c�T��U�3���f��K{7$����"�{�4r���5��a�V$��B0VI���-B�.����)nd{cB<�����TV���6�����	����e���B&���Bx�ua�I��'x���}�N.4�?F$jq������U^�L�U�A��*T�R�X��F��k�u�J�\�@j�A���c��64�i�gZ
o%�e�7��
]��7�R��jNU�\��)�J��yV���y?I����M�{�t��PS����@�A~�Q�(g�eJ�*T�R�J�*T�R�J�*T�R�J�+�M.�}��qt�#d�BG$���[�OzpA-E���|�{�O������>Ha[�dj�d�{�u��D��
<i��y����������%�f��H�5�h���f=M)m}3�B^j��,��{��aK���WJ�A��2*F���Q(��+�
���1H���tW�Q�R�K}���|�F������=E�����u���0�L�����9F<���@;-������4�Dk���H����j�\���*T��Fe��TZ"�/���i�b���px/�4 #�A��j&|�	�g���s{���u��,M�#H�O�,��;Rg��&% .l��%"��?�8FQX5'BN����l�K<M"*�U7�z�A� ;�4?��_;&��r�Z��2ZM/�W�>���yH	e��k��p��N
<����Yf�`�8rG��b�/��=��gh:�{$PF������\��C�nb&��aKN�,/-��+�����������1PL���$6�!��T�R�J���@a�>�J�*T�Q��	���*T�R�J���M�_�R�J�*T{`����B�J�*T�r�cPc�~�J�*T�R\�'���I)hc�p����4�,@�-�ms
:C����1��T%���
����D����1�m��kAU4
���:�e("�U
v��2tH���@�`"F3
�V%Y�l���S��X��&/C�o�^w���i�ma!u���Y�2l,�H�DX��P�����G��B"(�G�2���-��`�.�5�j%���_�kHb~�*����qz�'���X�����t�h����.��#�wq�P���`r�,g��� �61�C�AV,O���o�:i4X�b[��y����C]	r��.��R�y@]s�K�����������.����R��l�PK�/�X���h��a�+0]K�H�Y��8.��	�_�0���J�5�����Z�`��\�tPX�,�b�M��`,m����:EQ�p3S4�TdD��-L��Q����,��0�A/������dXi"��J��Nh��W������K���up|�0k�]�&(J��BM��
p�n=M�@\%
��f�4��B\������o��
#35 Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#34)
1 attachment(s)
Re: add AVX2 support to simd.h

Here's a new version of 0001 with some added #ifdefs that cfbot revealed
were missing.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v7-0001-pg_lfind32-add-overlap-code-for-remaining-element.patch (text/x-diff; charset=us-ascii) Download
From cc2bc5ca5b49cd8641af8b2231a34a1aa5091bb9 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Wed, 20 Mar 2024 14:20:24 -0500
Subject: [PATCH v7 1/1] pg_lfind32(): add "overlap" code for remaining
 elements

---
 src/include/port/pg_lfind.h | 105 ++++++++++++++++++++++++------------
 1 file changed, 72 insertions(+), 33 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..5830cc7cb3 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -80,6 +80,51 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
 	return false;
 }
 
+#ifndef USE_NO_SIMD
+/*
+ * pg_lfind32_helper
+ *
+ * Searches one 4-register-block of integers.  The caller is responsible for
+ * ensuring that there are at least 4-registers-worth of integers remaining.
+ */
+static inline bool
+pg_lfind32_helper(const Vector32 keys, uint32 *base)
+{
+	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
+	Vector32	vals1,
+				vals2,
+				vals3,
+				vals4,
+				result1,
+				result2,
+				result3,
+				result4,
+				tmp1,
+				tmp2,
+				result;
+
+	/* load the next block into 4 registers */
+	vector32_load(&vals1, base);
+	vector32_load(&vals2, &base[nelem_per_vector]);
+	vector32_load(&vals3, &base[nelem_per_vector * 2]);
+	vector32_load(&vals4, &base[nelem_per_vector * 3]);
+
+	/* compare each value to the key */
+	result1 = vector32_eq(keys, vals1);
+	result2 = vector32_eq(keys, vals2);
+	result3 = vector32_eq(keys, vals3);
+	result4 = vector32_eq(keys, vals4);
+
+	/* combine the results into a single variable */
+	tmp1 = vector32_or(result1, result2);
+	tmp2 = vector32_or(result3, result4);
+	result = vector32_or(tmp1, tmp2);
+
+	/* return whether there was a match */
+	return vector32_is_highbit_set(result);
+}
+#endif							/* ! USE_NO_SIMD */
+
 /*
  * pg_lfind32
  *
@@ -119,46 +164,40 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
-	for (i = 0; i < tail_idx; i += nelem_per_iteration)
+	/*
+	 * If there aren't enough elements for the SIMD code, jump to the standard
+	 * one-by-one linear search code.
+	 */
+	if (nelem <= nelem_per_iteration)
+		goto one_by_one;
+
+	/*
+	 * Process as many elements as possible with a block of 4 registers.
+	 */
+	do
 	{
-		Vector32	vals1,
-					vals2,
-					vals3,
-					vals4,
-					result1,
-					result2,
-					result3,
-					result4,
-					tmp1,
-					tmp2,
-					result;
-
-		/* load the next block into 4 registers */
-		vector32_load(&vals1, &base[i]);
-		vector32_load(&vals2, &base[i + nelem_per_vector]);
-		vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
-		vector32_load(&vals4, &base[i + nelem_per_vector * 3]);
-
-		/* compare each value to the key */
-		result1 = vector32_eq(keys, vals1);
-		result2 = vector32_eq(keys, vals2);
-		result3 = vector32_eq(keys, vals3);
-		result4 = vector32_eq(keys, vals4);
-
-		/* combine the results into a single variable */
-		tmp1 = vector32_or(result1, result2);
-		tmp2 = vector32_or(result3, result4);
-		result = vector32_or(tmp1, tmp2);
-
-		/* see if there was a match */
-		if (vector32_is_highbit_set(result))
+		if (pg_lfind32_helper(keys, &base[i]))
 		{
 			Assert(assert_result == true);
 			return true;
 		}
-	}
+
+		i += nelem_per_iteration;
+
+	} while (i < tail_idx);
+
+	/*
+	 * Process the last 'nelem_per_iteration' elements in the array with a
+	 * 4-register block.  This will cause us to check some of the elements
+	 * more than once, but that won't affect correctness, and testing has
+	 * demonstrated that this helps more cases than it harms.
+	 */
+	Assert(assert_result == pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]));
+	return pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]);
+
 #endif							/* ! USE_NO_SIMD */
 
+one_by_one:
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
-- 
2.25.1

#36 Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#35)
1 attachment(s)
Re: add AVX2 support to simd.h

On Sun, Mar 24, 2024 at 03:53:17PM -0500, Nathan Bossart wrote:

> Here's a new version of 0001 with some added #ifdefs that cfbot revealed
> were missing.

Sorry for the noise. cfbot revealed another silly mistake (forgetting to
reset the "i" variable in the assertion path). That should be fixed in v8.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v8-0001-pg_lfind32-add-overlap-code-for-remaining-element.patch (text/x-diff; charset=us-ascii) Download
From f15d85844370aef8505559fc0f2db629b135a9e8 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Wed, 20 Mar 2024 14:20:24 -0500
Subject: [PATCH v8 1/1] pg_lfind32(): add "overlap" code for remaining
 elements

---
 src/include/port/pg_lfind.h | 109 ++++++++++++++++++++++++------------
 1 file changed, 74 insertions(+), 35 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..f746aabbf9 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -80,6 +80,51 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
 	return false;
 }
 
+#ifndef USE_NO_SIMD
+/*
+ * pg_lfind32_helper
+ *
+ * Searches one 4-register-block of integers.  The caller is responsible for
+ * ensuring that there are at least 4-registers-worth of integers remaining.
+ */
+static inline bool
+pg_lfind32_helper(const Vector32 keys, uint32 *base)
+{
+	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
+	Vector32	vals1,
+				vals2,
+				vals3,
+				vals4,
+				result1,
+				result2,
+				result3,
+				result4,
+				tmp1,
+				tmp2,
+				result;
+
+	/* load the next block into 4 registers */
+	vector32_load(&vals1, base);
+	vector32_load(&vals2, &base[nelem_per_vector]);
+	vector32_load(&vals3, &base[nelem_per_vector * 2]);
+	vector32_load(&vals4, &base[nelem_per_vector * 3]);
+
+	/* compare each value to the key */
+	result1 = vector32_eq(keys, vals1);
+	result2 = vector32_eq(keys, vals2);
+	result3 = vector32_eq(keys, vals3);
+	result4 = vector32_eq(keys, vals4);
+
+	/* combine the results into a single variable */
+	tmp1 = vector32_or(result1, result2);
+	tmp2 = vector32_or(result3, result4);
+	result = vector32_or(tmp1, tmp2);
+
+	/* return whether there was a match */
+	return vector32_is_highbit_set(result);
+}
+#endif							/* ! USE_NO_SIMD */
+
 /*
  * pg_lfind32
  *
@@ -109,9 +154,9 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	bool		assert_result = false;
 
 	/* pre-compute the result for assert checking */
-	for (i = 0; i < nelem; i++)
+	for (int j = 0; j < nelem; j++)
 	{
-		if (key == base[i])
+		if (key == base[j])
 		{
 			assert_result = true;
 			break;
@@ -119,46 +164,40 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
-	for (i = 0; i < tail_idx; i += nelem_per_iteration)
+	/*
+	 * If there aren't enough elements for the SIMD code, jump to the standard
+	 * one-by-one linear search code.
+	 */
+	if (nelem <= nelem_per_iteration)
+		goto one_by_one;
+
+	/*
+	 * Process as many elements as possible with a block of 4 registers.
+	 */
+	do
 	{
-		Vector32	vals1,
-					vals2,
-					vals3,
-					vals4,
-					result1,
-					result2,
-					result3,
-					result4,
-					tmp1,
-					tmp2,
-					result;
-
-		/* load the next block into 4 registers */
-		vector32_load(&vals1, &base[i]);
-		vector32_load(&vals2, &base[i + nelem_per_vector]);
-		vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
-		vector32_load(&vals4, &base[i + nelem_per_vector * 3]);
-
-		/* compare each value to the key */
-		result1 = vector32_eq(keys, vals1);
-		result2 = vector32_eq(keys, vals2);
-		result3 = vector32_eq(keys, vals3);
-		result4 = vector32_eq(keys, vals4);
-
-		/* combine the results into a single variable */
-		tmp1 = vector32_or(result1, result2);
-		tmp2 = vector32_or(result3, result4);
-		result = vector32_or(tmp1, tmp2);
-
-		/* see if there was a match */
-		if (vector32_is_highbit_set(result))
+		if (pg_lfind32_helper(keys, &base[i]))
 		{
 			Assert(assert_result == true);
 			return true;
 		}
-	}
+
+		i += nelem_per_iteration;
+
+	} while (i < tail_idx);
+
+	/*
+	 * Process the last 'nelem_per_iteration' elements in the array with a
+	 * 4-register block.  This will cause us to check some of the elements
+	 * more than once, but that won't affect correctness, and testing has
+	 * demonstrated that this helps more cases than it harms.
+	 */
+	Assert(assert_result == pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]));
+	return pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]);
+
 #endif							/* ! USE_NO_SIMD */
 
+one_by_one:
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
-- 
2.25.1

#37John Naylor
johncnaylorls@gmail.com
In reply to: Nathan Bossart (#32)
Re: add AVX2 support to simd.h

On Fri, Mar 22, 2024 at 12:09 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:

On Thu, Mar 21, 2024 at 11:30:30AM +0700, John Naylor wrote:

If this were "<=" then for long arrays we could assume there is
always more than one block, and wouldn't need to check if any elements
remain -- first block, then a single loop and it's done.

The loop could also then be a "do while" since it doesn't have to
check the exit condition up front.

Good idea. That causes us to re-check all of the tail elements when the
number of elements is evenly divisible by nelem_per_iteration, but that
might be worth the trade-off.

Yeah, if there's no easy way to avoid that it's probably fine. I
wonder if we can subtract one first to force even multiples to round
down, although I admit I haven't thought through the consequences of
that.

[v8]

Seems pretty good. It'd be good to see the results of 2- vs.
4-register before committing, because that might lead to some
restructuring, but maybe it won't, and v8 is already an improvement
over HEAD.

/* Process the remaining elements one at a time. */

This now does all of them if that path is taken, so "remaining" can be removed.

#38Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#37)
Re: add AVX2 support to simd.h

On Mon, Mar 25, 2024 at 10:03:27AM +0700, John Naylor wrote:

Seems pretty good. It'd be good to see the results of 2- vs.
4-register before committing, because that might lead to some
restructuring, but maybe it won't, and v8 is already an improvement
over HEAD.

I tested this the other day [0] (only for x86). The results seemed to
indicate that the 4-register approach was still quite a bit better.

/* Process the remaining elements one at a time. */

This now does all of them if that path is taken, so "remaining" can be removed.

Right, will do.

[0]: /messages/by-id/20240321183823.GA1800896@nathanxps13

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#39Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#38)
1 attachment(s)
Re: add AVX2 support to simd.h

Here is what I have staged for commit. One notable difference in this
version of the patch is that I've changed

+ if (nelem <= nelem_per_iteration)
+ goto one_by_one;

to

+ if (nelem < nelem_per_iteration)
+ goto one_by_one;

I realized that there's no reason to jump to the one-by-one linear search
code when nelem == nelem_per_iteration, as the worst thing that will happen
is that we'll process all the elements twice if the value isn't present in
the array. My benchmark that I've been using also shows a significant
speedup for this case with this change (on the order of 75%), which I
imagine might be due to a combination of branch prediction, caching, fewer
instructions, etc.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v9-0001-Micro-optimize-pg_lfind32.patchtext/x-diff; charset=us-asciiDownload
From 1dd970248efd3c5ae1736c0dd1d61fbabbb6c101 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Mon, 25 Mar 2024 16:21:45 -0500
Subject: [PATCH v9 1/1] Micro-optimize pg_lfind32().

This commit improves the performance of pg_lfind32() in many cases
by modifying it to process the remaining "tail" of elements with
SIMD instructions instead of processing them one-by-one.  Since the
SIMD code processes a large block of elements, this means that we
will process a subset of elements more than once, but that won't
affect the correctness of the result, and testing has shown that
this helps more cases than it regresses.  With this change, the
standard one-by-one linear search code is only used for small
arrays and for platforms without SIMD support.

Furthermore, this commit restructures pg_lfind32() to minimize
branching, which should also improve performance.

Suggested-by: John Naylor
Reviewed-by: John Naylor
Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/pg_lfind.h | 114 ++++++++++++++++++++++++------------
 1 file changed, 76 insertions(+), 38 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..dbc3e9fc6a 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -80,6 +80,51 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
 	return false;
 }
 
+#ifndef USE_NO_SIMD
+/*
+ * pg_lfind32_simd_helper
+ *
+ * Searches one 4-register-block of integers.  The caller is responsible for
+ * ensuring that there are at least 4-registers-worth of integers remaining.
+ */
+static inline bool
+pg_lfind32_simd_helper(const Vector32 keys, uint32 *base)
+{
+	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
+	Vector32	vals1,
+				vals2,
+				vals3,
+				vals4,
+				result1,
+				result2,
+				result3,
+				result4,
+				tmp1,
+				tmp2,
+				result;
+
+	/* load the next block into 4 registers */
+	vector32_load(&vals1, base);
+	vector32_load(&vals2, &base[nelem_per_vector]);
+	vector32_load(&vals3, &base[nelem_per_vector * 2]);
+	vector32_load(&vals4, &base[nelem_per_vector * 3]);
+
+	/* compare each value to the key */
+	result1 = vector32_eq(keys, vals1);
+	result2 = vector32_eq(keys, vals2);
+	result3 = vector32_eq(keys, vals3);
+	result4 = vector32_eq(keys, vals4);
+
+	/* combine the results into a single variable */
+	tmp1 = vector32_or(result1, result2);
+	tmp2 = vector32_or(result3, result4);
+	result = vector32_or(tmp1, tmp2);
+
+	/* return whether there was a match */
+	return vector32_is_highbit_set(result);
+}
+#endif							/* ! USE_NO_SIMD */
+
 /*
  * pg_lfind32
  *
@@ -95,8 +140,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 
 	/*
 	 * For better instruction-level parallelism, each loop iteration operates
-	 * on a block of four registers.  Testing for SSE2 has showed this is ~40%
-	 * faster than using a block of two registers.
+	 * on a block of four registers.
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
@@ -109,9 +153,9 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	bool		assert_result = false;
 
 	/* pre-compute the result for assert checking */
-	for (i = 0; i < nelem; i++)
+	for (int j = 0; j < nelem; j++)
 	{
-		if (key == base[i])
+		if (key == base[j])
 		{
 			assert_result = true;
 			break;
@@ -119,47 +163,41 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
-	for (i = 0; i < tail_idx; i += nelem_per_iteration)
+	/*
+	 * If there aren't enough elements for the SIMD code, jump to the standard
+	 * one-by-one linear search code.
+	 */
+	if (nelem < nelem_per_iteration)
+		goto one_by_one;
+
+	/*
+	 * Process as many elements as possible with a block of 4 registers.
+	 */
+	do
 	{
-		Vector32	vals1,
-					vals2,
-					vals3,
-					vals4,
-					result1,
-					result2,
-					result3,
-					result4,
-					tmp1,
-					tmp2,
-					result;
-
-		/* load the next block into 4 registers */
-		vector32_load(&vals1, &base[i]);
-		vector32_load(&vals2, &base[i + nelem_per_vector]);
-		vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
-		vector32_load(&vals4, &base[i + nelem_per_vector * 3]);
-
-		/* compare each value to the key */
-		result1 = vector32_eq(keys, vals1);
-		result2 = vector32_eq(keys, vals2);
-		result3 = vector32_eq(keys, vals3);
-		result4 = vector32_eq(keys, vals4);
-
-		/* combine the results into a single variable */
-		tmp1 = vector32_or(result1, result2);
-		tmp2 = vector32_or(result3, result4);
-		result = vector32_or(tmp1, tmp2);
-
-		/* see if there was a match */
-		if (vector32_is_highbit_set(result))
+		if (pg_lfind32_simd_helper(keys, &base[i]))
 		{
 			Assert(assert_result == true);
 			return true;
 		}
-	}
+
+		i += nelem_per_iteration;
+
+	} while (i < tail_idx);
+
+	/*
+	 * Process the last 'nelem_per_iteration' elements in the array with a
+	 * 4-register block.  This will cause us to check a subset of the elements
+	 * more than once, but that won't affect correctness, and testing has
+	 * demonstrated that this helps more cases than it harms.
+	 */
+	Assert(assert_result == pg_lfind32_simd_helper(keys, &base[nelem - nelem_per_iteration]));
+	return pg_lfind32_simd_helper(keys, &base[nelem - nelem_per_iteration]);
+
 #endif							/* ! USE_NO_SIMD */
 
-	/* Process the remaining elements one at a time. */
+one_by_one:
+	/* Process the elements one at a time. */
 	for (; i < nelem; i++)
 	{
 		if (key == base[i])
-- 
2.25.1

#40Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#39)
Re: add AVX2 support to simd.h

I've committed v9, and I've marked the commitfest entry as "Committed,"
although we may want to revisit AVX2, etc. in the future.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#41Tom Lane
tgl@sss.pgh.pa.us
In reply to: Nathan Bossart (#40)
Re: add AVX2 support to simd.h

Nathan Bossart <nathandbossart@gmail.com> writes:

I've committed v9, and I've marked the commitfest entry as "Committed,"
although we may want to revisit AVX2, etc. in the future.

A significant fraction of the buildfarm is issuing warnings about
this.

adder | 2024-03-26 21:04:33 | ../pgsql/src/include/port/pg_lfind.h:199:1: warning: label 'one_by_one' defined but not used [-Wunused-label]
buri | 2024-03-26 21:16:09 | ../../src/include/port/pg_lfind.h:199:1: warning: label 'one_by_one' defined but not used [-Wunused-label]
cavefish | 2024-03-26 22:53:23 | ../../src/include/port/pg_lfind.h:199:1: warning: label 'one_by_one' defined but not used [-Wunused-label]
cisticola | 2024-03-26 22:20:07 | ../../../../src/include/port/pg_lfind.h:199:1: warning: label 'one_by_one' defined but not used [-Wunused-label]
lancehead | 2024-03-26 21:48:17 | ../../src/include/port/pg_lfind.h:199:1: warning: unused label 'one_by_one' [-Wunused-label]
nicator | 2024-03-26 21:08:14 | ../../src/include/port/pg_lfind.h:199:1: warning: label 'one_by_one' defined but not used [-Wunused-label]
nuthatch | 2024-03-26 22:00:04 | ../../src/include/port/pg_lfind.h:199:1: warning: label 'one_by_one' defined but not used [-Wunused-label]
rinkhals | 2024-03-26 19:51:32 | ../../src/include/port/pg_lfind.h:199:1: warning: unused label 'one_by_one' [-Wunused-label]
siskin | 2024-03-26 19:59:29 | ../../src/include/port/pg_lfind.h:199:1: warning: label 'one_by_one' defined but not used [-Wunused-label]

regards, tom lane

#42Nathan Bossart
nathandbossart@gmail.com
In reply to: Tom Lane (#41)
Re: add AVX2 support to simd.h

On Tue, Mar 26, 2024 at 07:28:24PM -0400, Tom Lane wrote:

Nathan Bossart <nathandbossart@gmail.com> writes:

I've committed v9, and I've marked the commitfest entry as "Committed,"
although we may want to revisit AVX2, etc. in the future.

A significant fraction of the buildfarm is issuing warnings about
this.

Thanks for the heads-up. Will fix.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#43Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#42)
Re: add AVX2 support to simd.h

On Tue, Mar 26, 2024 at 06:55:54PM -0500, Nathan Bossart wrote:

On Tue, Mar 26, 2024 at 07:28:24PM -0400, Tom Lane wrote:

A significant fraction of the buildfarm is issuing warnings about
this.

Thanks for the heads-up. Will fix.

Done. I'll keep an eye on the farm.

I just did the minimal fix for now, i.e., I moved the new label into the
SIMD section of the function. I think it would be better stylistically to
move the one-by-one logic to an inline helper function, but I didn't do
that just in case it might negatively impact performance. I'll look into
this and will follow up with another patch if it looks good.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#44Tom Lane
tgl@sss.pgh.pa.us
In reply to: Nathan Bossart (#43)
Re: add AVX2 support to simd.h

Nathan Bossart <nathandbossart@gmail.com> writes:

On Tue, Mar 26, 2024 at 06:55:54PM -0500, Nathan Bossart wrote:

On Tue, Mar 26, 2024 at 07:28:24PM -0400, Tom Lane wrote:

A significant fraction of the buildfarm is issuing warnings about
this.

Done. I'll keep an eye on the farm.

Thanks.

I just did the minimal fix for now, i.e., I moved the new label into the
SIMD section of the function. I think it would be better stylistically to
move the one-by-one logic to an inline helper function, but I didn't do
that just in case it might negatively impact performance. I'll look into
this and will follow up with another patch if it looks good.

Sounds like a plan.

regards, tom lane

#45Nathan Bossart
nathandbossart@gmail.com
In reply to: Tom Lane (#44)
1 attachment(s)
Re: add AVX2 support to simd.h

On Tue, Mar 26, 2024 at 09:48:57PM -0400, Tom Lane wrote:

Nathan Bossart <nathandbossart@gmail.com> writes:

I just did the minimal fix for now, i.e., I moved the new label into the
SIMD section of the function. I think it would be better stylistically to
move the one-by-one logic to an inline helper function, but I didn't do
that just in case it might negatively impact performance. I'll look into
this and will follow up with another patch if it looks good.

Sounds like a plan.

Here's what I had in mind. My usual benchmark seems to indicate that this
shouldn't impact performance.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v1-0001-improve-style-of-pg_lfind32.patchtext/x-diff; charset=us-asciiDownload
From c3f163753246c5ec82dd8c5dba70232cbeebbf2a Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Wed, 27 Mar 2024 13:50:17 -0500
Subject: [PATCH v1 1/1] improve style of pg_lfind32()

---
 src/include/port/pg_lfind.h | 58 +++++++++++++++----------------------
 1 file changed, 24 insertions(+), 34 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index 33e8471b03..5b76cc8937 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -80,6 +80,24 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
 	return false;
 }
 
+/*
+ * pg_lfind32_one_by_one_helper
+ *
+ * Searches the array of integers one-by-one.  The caller is responsible for
+ * ensuring that there are at least "nelem" integers in the array.
+ */
+static inline bool
+pg_lfind32_one_by_one_helper(uint32 key, uint32 *base, uint32 nelem)
+{
+	for (int i = 0; i < nelem; i++)
+	{
+		if (key == base[i])
+			return true;
+	}
+
+	return false;
+}
+
 #ifndef USE_NO_SIMD
 /*
  * pg_lfind32_simd_helper
@@ -134,9 +152,8 @@ pg_lfind32_simd_helper(const Vector32 keys, uint32 *base)
 static inline bool
 pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 {
-	uint32		i = 0;
-
 #ifndef USE_NO_SIMD
+	uint32		i = 0;
 
 	/*
 	 * For better instruction-level parallelism, each loop iteration operates
@@ -150,25 +167,15 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
-	bool		assert_result = false;
-
-	/* pre-compute the result for assert checking */
-	for (int j = 0; j < nelem; j++)
-	{
-		if (key == base[j])
-		{
-			assert_result = true;
-			break;
-		}
-	}
+	bool		assert_result = pg_lfind32_one_by_one_helper(key, base, nelem);
 #endif
 
 	/*
-	 * If there aren't enough elements for the SIMD code, jump to the standard
+	 * If there aren't enough elements for the SIMD code, use the standard
 	 * one-by-one linear search code.
 	 */
 	if (nelem < nelem_per_iteration)
-		goto one_by_one;
+		return pg_lfind32_one_by_one_helper(key, base, nelem);
 
 	/*
 	 * Process as many elements as possible with a block of 4 registers.
@@ -193,27 +200,10 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	 */
 	Assert(assert_result == pg_lfind32_simd_helper(keys, &base[nelem - nelem_per_iteration]));
 	return pg_lfind32_simd_helper(keys, &base[nelem - nelem_per_iteration]);
-
-one_by_one:
-
-#endif							/* ! USE_NO_SIMD */
-
+#else
 	/* Process the elements one at a time. */
-	for (; i < nelem; i++)
-	{
-		if (key == base[i])
-		{
-#ifndef USE_NO_SIMD
-			Assert(assert_result == true);
+	return pg_lfind32_one_by_one_helper(key, base, nelem);
 #endif
-			return true;
-		}
-	}
-
-#ifndef USE_NO_SIMD
-	Assert(assert_result == false);
-#endif
-	return false;
 }
 
 #endif							/* PG_LFIND_H */
-- 
2.25.1

#46Tom Lane
tgl@sss.pgh.pa.us
In reply to: Nathan Bossart (#45)
Re: add AVX2 support to simd.h

Nathan Bossart <nathandbossart@gmail.com> writes:

Here's what I had in mind. My usual benchmark seems to indicate that this
shouldn't impact performance.

Shouldn't "i" be declared uint32, since nelem is?

BTW, I wonder why these functions don't declare their array
arguments like "const uint32 *base".

LGTM otherwise, and I like the fact that the #if structure
gets a lot less messy.

regards, tom lane

#47Nathan Bossart
nathandbossart@gmail.com
In reply to: Tom Lane (#46)
1 attachment(s)
Re: add AVX2 support to simd.h

On Wed, Mar 27, 2024 at 05:10:13PM -0400, Tom Lane wrote:

Shouldn't "i" be declared uint32, since nelem is?

Yes, that's a mistake.

BTW, I wonder why these functions don't declare their array
arguments like "const uint32 *base".

They probably should. I don't see any reason not to, and my compiler
doesn't complain, either.

LGTM otherwise, and I like the fact that the #if structure
gets a lot less messy.

Thanks for reviewing. I've attached a v2 that I intend to commit when I
get a chance.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

Attachments:

v2-0001-improve-style-of-pg_lfind32.patchtext/x-diff; charset=us-asciiDownload
From 6f2577779917230bf70284f0ec1186bed17d9b4a Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Wed, 27 Mar 2024 13:50:17 -0500
Subject: [PATCH v2 1/1] improve style of pg_lfind32()

---
 src/include/port/pg_lfind.h | 62 ++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 36 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index 33e8471b03..4b1431ed00 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -80,6 +80,24 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
 	return false;
 }
 
+/*
+ * pg_lfind32_one_by_one_helper
+ *
+ * Searches the array of integers one-by-one.  The caller is responsible for
+ * ensuring that there are at least "nelem" integers in the array.
+ */
+static inline bool
+pg_lfind32_one_by_one_helper(uint32 key, const uint32 *base, uint32 nelem)
+{
+	for (uint32 i = 0; i < nelem; i++)
+	{
+		if (key == base[i])
+			return true;
+	}
+
+	return false;
+}
+
 #ifndef USE_NO_SIMD
 /*
  * pg_lfind32_simd_helper
@@ -88,7 +106,7 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
  * ensuring that there are at least 4-registers-worth of integers remaining.
  */
 static inline bool
-pg_lfind32_simd_helper(const Vector32 keys, uint32 *base)
+pg_lfind32_simd_helper(const Vector32 keys, const uint32 *base)
 {
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
 	Vector32	vals1,
@@ -132,11 +150,10 @@ pg_lfind32_simd_helper(const Vector32 keys, uint32 *base)
  * return false.
  */
 static inline bool
-pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
+pg_lfind32(uint32 key, const uint32 *base, uint32 nelem)
 {
-	uint32		i = 0;
-
 #ifndef USE_NO_SIMD
+	uint32		i = 0;
 
 	/*
 	 * For better instruction-level parallelism, each loop iteration operates
@@ -150,25 +167,15 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
-	bool		assert_result = false;
-
-	/* pre-compute the result for assert checking */
-	for (int j = 0; j < nelem; j++)
-	{
-		if (key == base[j])
-		{
-			assert_result = true;
-			break;
-		}
-	}
+	bool		assert_result = pg_lfind32_one_by_one_helper(key, base, nelem);
 #endif
 
 	/*
-	 * If there aren't enough elements for the SIMD code, jump to the standard
+	 * If there aren't enough elements for the SIMD code, use the standard
 	 * one-by-one linear search code.
 	 */
 	if (nelem < nelem_per_iteration)
-		goto one_by_one;
+		return pg_lfind32_one_by_one_helper(key, base, nelem);
 
 	/*
 	 * Process as many elements as possible with a block of 4 registers.
@@ -193,27 +200,10 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	 */
 	Assert(assert_result == pg_lfind32_simd_helper(keys, &base[nelem - nelem_per_iteration]));
 	return pg_lfind32_simd_helper(keys, &base[nelem - nelem_per_iteration]);
-
-one_by_one:
-
-#endif							/* ! USE_NO_SIMD */
-
+#else
 	/* Process the elements one at a time. */
-	for (; i < nelem; i++)
-	{
-		if (key == base[i])
-		{
-#ifndef USE_NO_SIMD
-			Assert(assert_result == true);
+	return pg_lfind32_one_by_one_helper(key, base, nelem);
 #endif
-			return true;
-		}
-	}
-
-#ifndef USE_NO_SIMD
-	Assert(assert_result == false);
-#endif
-	return false;
 }
 
 #endif							/* PG_LFIND_H */
-- 
2.25.1

#48Nathan Bossart
nathandbossart@gmail.com
In reply to: Nathan Bossart (#47)
Re: add AVX2 support to simd.h

On Wed, Mar 27, 2024 at 04:37:35PM -0500, Nathan Bossart wrote:

On Wed, Mar 27, 2024 at 05:10:13PM -0400, Tom Lane wrote:

LGTM otherwise, and I like the fact that the #if structure
gets a lot less messy.

Thanks for reviewing. I've attached a v2 that I intend to commit when I
get a chance.

Committed.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com