use SSE2 for is_valid_ascii
new thread [was: WIP Patch: Add a function that returns binary JSONB as a bytea]
I wrote:
We can also shave a
few percent by having pg_utf8_verifystr use SSE2 for the ascii path. I
can look into this.
Here's a patch for that. If the input is mostly ascii, I'd expect that
part of the flame graph to shrink by 40-50% and give a small boost
overall.
Here is an updated patch using the new USE_SSE2 symbol. The style is
different from the last one in that each stanza has platform-specific
code. I wanted to try it this way because is_valid_ascii() is already
written in SIMD-ish style using general purpose registers and bit
twiddling, so it seemed natural to see the two side-by-side. Sometimes
they can share the same comment. If we think this is bad for
readability, I can go back to one block each, but that way leads to
duplication of code and it's difficult to see what's different for
each platform, IMO.
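To make the parallel concrete, here is a minimal standalone sketch of
the bit-twiddling ("SIMD within a register") technique the portable
path uses -- an illustration only, not the patch itself, assuming len
is a multiple of 8 and using the uint64/UINT64CONST spellings from
c.h:

static bool
swar_is_ascii(const unsigned char *s, int len)
{
	uint64		highbit_cum = UINT64CONST(0);
	uint64		zero_cum = UINT64CONST(0x8080808080808080);

	for (int i = 0; i < len; i += 8)
	{
		uint64		chunk;

		memcpy(&chunk, s + i, sizeof(chunk));

		/*
		 * Adding 0x7f leaves the high bit clear only for zero bytes.
		 * (Carries from bytes >= 0x81 can spill into the next byte, but
		 * those inputs fail the high-bit check below anyway.)
		 */
		zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));

		/* Accumulate all set bits, catching any high (non-ASCII) bits. */
		highbit_cum |= chunk;
	}

	if (highbit_cum & UINT64CONST(0x8080808080808080))
		return false;
	if (zero_cum != UINT64CONST(0x8080808080808080))
		return false;
	return true;
}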
--
John Naylor
EDB: http://www.enterprisedb.com
Attachments:
v2-0001-Use-SSE2-in-is_valid_ascii-where-available.patch (application/x-patch)
From 69d56a21192ed2f03bc08f078cfff7ba5cb0d80b Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Fri, 5 Aug 2022 13:30:47 +0700
Subject: [PATCH v2] Use SSE2 in is_valid_ascii where available.
This is ~40% faster than portable C on contemporary Intel hardware.
---
src/common/wchar.c | 18 +++------
src/include/mb/pg_wchar.h | 50 ++++++++++++++++++++++--
src/test/regress/expected/conversion.out | 3 +-
src/test/regress/sql/conversion.sql | 3 +-
4 files changed, 57 insertions(+), 17 deletions(-)
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 1e6e198bf2..a305e0e66b 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1918,26 +1918,20 @@ pg_utf8_verifystr(const unsigned char *s, int len)
const int orig_len = len;
uint32 state = BGN;
-/*
- * Sixteen seems to give the best balance of performance across different
- * byte distributions.
- */
-#define STRIDE_LENGTH 16
-
- if (len >= STRIDE_LENGTH)
+ if (len >= ASCII_CHECK_LEN)
{
- while (len >= STRIDE_LENGTH)
+ while (len >= ASCII_CHECK_LEN)
{
/*
* If the chunk is all ASCII, we can skip the full UTF-8 check,
* but we must first check for a non-END state, which means the
* previous chunk ended in the middle of a multibyte sequence.
*/
- if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
- utf8_advance(s, &state, STRIDE_LENGTH);
+ if (state != END || !is_valid_ascii(s, ASCII_CHECK_LEN))
+ utf8_advance(s, &state, ASCII_CHECK_LEN);
- s += STRIDE_LENGTH;
- len -= STRIDE_LENGTH;
+ s += ASCII_CHECK_LEN;
+ len -= ASCII_CHECK_LEN;
}
/* The error state persists, so we only need to check for it here. */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 011b0b3abd..cbb1dfe978 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -19,6 +19,8 @@
#ifndef PG_WCHAR_H
#define PG_WCHAR_H
+#include "port/simd.h"
+
/*
* The pg_wchar type
*/
@@ -704,25 +706,57 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
* Verify a chunk of bytes for valid ASCII.
*
* Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
+ * Since the chunk size is platform-specific, we provide the ASCII_CHECK_LEN
+ * convenience macro for callers to pass for len.
*/
static inline bool
is_valid_ascii(const unsigned char *s, int len)
{
const unsigned char *const s_end = s + len;
+
+#ifdef USE_SSE2
+ __m128i chunk,
+ error_cum = _mm_setzero_si128();
+#else
uint64 chunk,
highbit_cum = UINT64CONST(0),
zero_cum = UINT64CONST(0x8080808080808080);
+#endif
+
+ /*
+ * With two chunks, gcc can unroll the loop. Even if the compiler can
+ * unroll a longer loop, it's not worth it because callers might have to
+ * use a byte-wise algorithm if we return false.
+ */
+#ifdef USE_SSE2
+#define ASCII_CHECK_LEN (2 * sizeof(__m128i))
+#else
+#define ASCII_CHECK_LEN (2 * sizeof(uint64))
+#endif
Assert(len % sizeof(chunk) == 0);
while (s < s_end)
{
+#ifdef USE_SSE2
+ chunk = _mm_loadu_si128((const __m128i *) s);
+#else
memcpy(&chunk, s, sizeof(chunk));
+#endif
+
+ /* Capture any zero bytes in this chunk. */
+#ifdef USE_SSE2
+
+ /*
+ * Set all bits in each lane of the error accumulator where input
+ * bytes are zero.
+ */
+ error_cum = _mm_or_si128(error_cum,
+ _mm_cmpeq_epi8(chunk, _mm_setzero_si128()));
+#else
/*
- * Capture any zero bytes in this chunk.
- *
* First, add 0x7f to each byte. This sets the high bit in each byte,
* unless it was a zero. If any resulting high bits are zero, the
* corresponding high bits in the zero accumulator will be cleared.
@@ -734,13 +768,22 @@ is_valid_ascii(const unsigned char *s, int len)
* because we check for those separately.
*/
zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
+#endif
/* Capture all set bits in this chunk. */
+#ifdef USE_SSE2
+ error_cum = _mm_or_si128(error_cum, chunk);
+#else
highbit_cum |= chunk;
+#endif
s += sizeof(chunk);
}
+#ifdef USE_SSE2
+ /* Check if any lanes in the error accumulator got set. */
+ return _mm_movemask_epi8(error_cum) == 0;
+#else
/* Check if any high bits in the high bit accumulator got set. */
if (highbit_cum & UINT64CONST(0x8080808080808080))
return false;
@@ -750,6 +793,7 @@ is_valid_ascii(const unsigned char *s, int len)
return false;
return true;
+#endif
}
#endif /* PG_WCHAR_H */
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 442e7aff2b..434dc4d93c 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -140,7 +140,8 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verificatio
-- will contain all 4 bytes if they are present, so various
-- expressions below add 3 ASCII bytes to the end to ensure
-- consistent error messages.
--- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- The number 64 below needs to equal or be a multiple of the largest
+-- possible value of ASCII_CHECK_LEN in mb/pg_wchar.h.
-- Test multibyte verification in fast path
with test_bytes as (
select
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 9a65fca91f..27ef069eaf 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -121,7 +121,8 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verificatio
-- will contain all 4 bytes if they are present, so various
-- expressions below add 3 ASCII bytes to the end to ensure
-- consistent error messages.
--- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- The number 64 below needs to equal or be a multiple of the largest
+-- possible value of ASCII_CHECK_LEN in mb/pg_wchar.h.
-- Test multibyte verification in fast path
with test_bytes as (
--
2.36.1
On Wed, Aug 10, 2022 at 01:50:14PM +0700, John Naylor wrote:
Here is an updated patch using the new USE_SSE2 symbol. The style is
different from the last one in that each stanza has platform-specific
code. I wanted to try it this way because is_valid_ascii() is already
written in SIMD-ish style using general purpose registers and bit
twiddling, so it seemed natural to see the two side-by-side. Sometimes
they can share the same comment. If we think this is bad for
readability, I can go back to one block each, but that way leads to
duplication of code and it's difficult to see what's different for
each platform, IMO.
This is a neat patch. I don't know that we need an entirely separate code
block for the USE_SSE2 path, but I do think that a little bit of extra
commentary would improve the readability. IMO the existing comment for the
zero accumulator has the right amount of detail.
+ /*
+ * Set all bits in each lane of the error accumulator where input
+ * bytes are zero.
+ */
+ error_cum = _mm_or_si128(error_cum,
+ _mm_cmpeq_epi8(chunk, _mm_setzero_si128()));
I wonder if reusing a zero vector (instead of creating a new one every
time) has any noticeable effect on performance.
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
On Thu, Aug 11, 2022 at 5:31 AM Nathan Bossart <nathandbossart@gmail.com> wrote:
This is a neat patch. I don't know that we need an entirely separate code
block for the USE_SSE2 path, but I do think that a little bit of extra
commentary would improve the readability. IMO the existing comment for the
zero accumulator has the right amount of detail.
+ /*
+ * Set all bits in each lane of the error accumulator where input
+ * bytes are zero.
+ */
+ error_cum = _mm_or_si128(error_cum,
+ _mm_cmpeq_epi8(chunk, _mm_setzero_si128()));
Okay, I will think about the comments, thanks for looking.
I wonder if reusing a zero vector (instead of creating a new one every
time) has any noticeable effect on performance.
Creating a zeroed register is just FOO PXOR FOO, which should get
hoisted out of the (unrolled in this case) loop, and which a recent
CPU will just map to a hard-coded zero in the register file, in which
case the execution latency is 0 cycles. :-)
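To illustrate (a hypothetical micro-example, not code from the patch):
at -O2 both of these variants should compile to the same loop, since
the compiler recognizes _mm_setzero_si128() as loop-invariant and as
the pxor zeroing idiom. Assumes len is a multiple of 16:

#include <emmintrin.h>

/* zero vector "created" inside the loop */
static int
any_zero_byte_inline(const unsigned char *s, int len)
{
	__m128i		acc = _mm_setzero_si128();

	for (int i = 0; i < len; i += 16)
	{
		__m128i		chunk = _mm_loadu_si128((const __m128i *) (s + i));

		acc = _mm_or_si128(acc, _mm_cmpeq_epi8(chunk, _mm_setzero_si128()));
	}
	return _mm_movemask_epi8(acc) != 0;
}

/* zero vector hoisted by hand */
static int
any_zero_byte_hoisted(const unsigned char *s, int len)
{
	const __m128i zero = _mm_setzero_si128();
	__m128i		acc = zero;

	for (int i = 0; i < len; i += 16)
	{
		__m128i		chunk = _mm_loadu_si128((const __m128i *) (s + i));

		acc = _mm_or_si128(acc, _mm_cmpeq_epi8(chunk, zero));
	}
	return _mm_movemask_epi8(acc) != 0;
}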
--
John Naylor
EDB: http://www.enterprisedb.com
On Thu, Aug 11, 2022 at 11:10:34AM +0700, John Naylor wrote:
I wonder if reusing a zero vector (instead of creating a new one every
time) has any noticeable effect on performance.
Creating a zeroed register is just FOO PXOR FOO, which should get
hoisted out of the (unrolled in this case) loop, and which a recent
CPU will just map to a hard-coded zero in the register file, in which
case the execution latency is 0 cycles. :-)
Ah, indeed. At -O2, my compiler seems to zero out two registers before the
loop with either approach:
pxor %xmm0, %xmm0 ; accumulator
pxor %xmm2, %xmm2 ; always zeros
And within the loop, I see the following:
movdqu (%rdi), %xmm1
movdqu (%rdi), %xmm3
addq $16, %rdi
pcmpeqb %xmm2, %xmm1 ; check for zeros
por %xmm3, %xmm0 ; OR data into accumulator
por %xmm1, %xmm0 ; OR zero check results into accumulator
cmpq %rdi, %rsi
So the call to _mm_setzero_si128() within the loop is fine. Apologies for
the noise.
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
v3 applies on top of the v9 json_lex_string patch in [1] and adds a
bit more to that, resulting in a simpler patch that is more amenable
to additional SIMD-capable platforms.
[1]: /messages/by-id/CAFBsxsFV4v802idV0-Bo=V7wLMHRbOZ4er0hgposhyGCikmVGA@mail.gmail.com
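As a sketch of what "more amenable to additional SIMD-capable
platforms" could look like in practice (hypothetical, not part of this
patch): a port to ARM Advanced SIMD would only need to fill in the
same few helpers in simd.h, e.g. with Vector8 as uint8x16_t:

#include <arm_neon.h>

typedef uint8x16_t Vector8;

static inline Vector8
vector8_or(const Vector8 v1, const Vector8 v2)
{
	return vorrq_u8(v1, v2);
}

static inline Vector8
vector8_eq(const Vector8 v1, const Vector8 v2)
{
	return vceqq_u8(v1, v2);
}

static inline bool
vector8_is_highbit_set(const Vector8 v)
{
	/* vmaxvq_u8 is AArch64-only; take the max lane and test its high bit */
	return vmaxvq_u8(v) > 0x7F;
}

Callers such as is_valid_ascii() would then compile unchanged.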
--
John Naylor
EDB: http://www.enterprisedb.com
Attachments:
v3-use-simd-is-valid-ascii.patch (text/x-patch; charset=US-ASCII)
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 1e6e198bf2..1ca7533f00 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1918,11 +1918,12 @@ pg_utf8_verifystr(const unsigned char *s, int len)
const int orig_len = len;
uint32 state = BGN;
-/*
- * Sixteen seems to give the best balance of performance across different
- * byte distributions.
- */
-#define STRIDE_LENGTH 16
+ /*
+ * With a stride of two vector widths, gcc will unroll the loop. Even if
+ * the compiler can unroll a longer loop, it's not worth it because we
+ * must fall back to the byte-wise algorithm if we find any non-ASCII.
+ */
+#define STRIDE_LENGTH (2 * sizeof(Vector8))
if (len >= STRIDE_LENGTH)
{
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 011b0b3abd..aea045aa66 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -19,6 +19,8 @@
#ifndef PG_WCHAR_H
#define PG_WCHAR_H
+#include "port/simd.h"
+
/*
* The pg_wchar type
*/
@@ -704,25 +706,28 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
* Verify a chunk of bytes for valid ASCII.
*
* Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
*/
static inline bool
is_valid_ascii(const unsigned char *s, int len)
{
const unsigned char *const s_end = s + len;
- uint64 chunk,
- highbit_cum = UINT64CONST(0),
- zero_cum = UINT64CONST(0x8080808080808080);
+ Vector8 chunk;
+ Vector8 highbit_cum = vector8_broadcast(0);
+#ifdef USE_NO_SIMD
+ Vector8 zero_cum = vector8_broadcast(0x80);
+#endif
Assert(len % sizeof(chunk) == 0);
while (s < s_end)
{
- memcpy(&chunk, s, sizeof(chunk));
+ vector8_load(&chunk, s);
+
+ /* Capture any zero bytes in this chunk. */
+#if defined(USE_NO_SIMD)
/*
- * Capture any zero bytes in this chunk.
- *
* First, add 0x7f to each byte. This sets the high bit in each byte,
* unless it was a zero. If any resulting high bits are zero, the
* corresponding high bits in the zero accumulator will be cleared.
@@ -734,20 +739,31 @@ is_valid_ascii(const unsigned char *s, int len)
* because we check for those separately.
*/
zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
+#else
+
+ /*
+ * Set all bits in each lane of the highbit accumulator where input
+ * bytes are zero.
+ */
+ highbit_cum = vector8_or(highbit_cum,
+ vector8_eq(chunk, vector8_broadcast(0)));
+#endif
/* Capture all set bits in this chunk. */
- highbit_cum |= chunk;
+ highbit_cum = vector8_or(highbit_cum, chunk);
s += sizeof(chunk);
}
/* Check if any high bits in the high bit accumulator got set. */
- if (highbit_cum & UINT64CONST(0x8080808080808080))
+ if (vector8_is_highbit_set(highbit_cum))
return false;
+#ifdef USE_NO_SIMD
/* Check if any high bits in the zero accumulator got cleared. */
- if (zero_cum != UINT64CONST(0x8080808080808080))
+ if (zero_cum != vector8_broadcast(0x80))
return false;
+#endif
return true;
}
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 56df989094..8f85153110 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -38,6 +38,7 @@ typedef __m128i Vector8;
* If no SIMD instructions are available, we can in some cases emulate vector
* operations using bitwise operations on unsigned integers.
*/
+#define USE_NO_SIMD
typedef uint64 Vector8;
#endif
@@ -47,7 +48,11 @@ static inline Vector8 vector8_broadcast(const uint8 c);
static inline bool vector8_has_zero(const Vector8 v);
static inline bool vector8_has(const Vector8 v, const uint8 c);
static inline bool vector8_has_le(const Vector8 v, const uint8 c);
-
+static inline bool vector8_is_highbit_set(const Vector8 v);
+static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
+#ifndef USE_NO_SIMD
+static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+#endif
/*
* Functions for loading a chunk of memory into a vector.
@@ -181,4 +186,38 @@ vector8_has_le(const Vector8 v, const uint8 c)
return result;
}
+static inline bool
+vector8_is_highbit_set(const Vector8 v)
+{
+#ifdef USE_SSE2
+ return _mm_movemask_epi8(v) != 0;
+#else
+ return v & vector8_broadcast(0x80);
+#endif
+}
+
+/* comparisons between vectors */
+
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_eq(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_cmpeq_epi8(v1, v2);
+#endif
+}
+#endif
+
+/* bitwise operations */
+
+static inline Vector8
+vector8_or(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_or_si128(v1, v2);
+#else
+ return v1 | v2;
+#endif
+}
+
#endif /* SIMD_H */
On Thu, Aug 25, 2022 at 04:41:53PM +0700, John Naylor wrote:
v3 applies on top of the v9 json_lex_string patch in [1] and adds a
bit more to that, resulting in a simpler patch that is more amenable
to additional SIMD-capable platforms.
LGTM
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
On Fri, Aug 26, 2022 at 10:26 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:
On Thu, Aug 25, 2022 at 04:41:53PM +0700, John Naylor wrote:
v3 applies on top of the v9 json_lex_string patch in [1] and adds a
bit more to that, resulting in a simpler patch that is more amenable
to additional SIMD-capable platforms.
LGTM
Thanks for looking, pushed with some rearrangements.
--
John Naylor
EDB: http://www.enterprisedb.com