use SSE2 for is_valid_ascii

Started by John Naylor over 3 years ago · 7 messages
#1John Naylor
john.naylor@enterprisedb.com
1 attachment(s)

new thread [was: WIP Patch: Add a function that returns binary JSONB as a bytea]

I wrote:

We can also shave a
few percent by having pg_utf8_verifystr use SSE2 for the ascii path. I
can look into this.

Here's a patch for that. If the input is mostly ascii, I'd expect that
part of the flame graph to shrink by 40-50% and give a small boost
overall.

Here is an updated patch using the new USE_SSE2 symbol. The style is
different from the last one in that each stanza has platform-specific
code. I wanted to try it this way because is_valid_ascii() is already
written in SIMD-ish style using general purpose registers and bit
twiddling, so it seemed natural to see the two side-by-side. Sometimes
they can share the same comment. If we think this is bad for
readability, I can go back to one block each, but that way leads to
duplication of code and it's difficult to see what's different for
each platform, IMO.

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v2-0001-Use-SSE2-in-is_valid_ascii-where-available.patch (application/x-patch; name=v2-0001-Use-SSE2-in-is_valid_ascii-where-available.patch) — Download
From 69d56a21192ed2f03bc08f078cfff7ba5cb0d80b Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Fri, 5 Aug 2022 13:30:47 +0700
Subject: [PATCH v2] Use SSE2 in is_valid_ascii where available.

This is ~40% faster than portable C on contemporary Intel hardware.
---
 src/common/wchar.c                       | 18 +++------
 src/include/mb/pg_wchar.h                | 50 ++++++++++++++++++++++--
 src/test/regress/expected/conversion.out |  3 +-
 src/test/regress/sql/conversion.sql      |  3 +-
 4 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 1e6e198bf2..a305e0e66b 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1918,26 +1918,20 @@ pg_utf8_verifystr(const unsigned char *s, int len)
 	const int	orig_len = len;
 	uint32		state = BGN;
 
-/*
- * Sixteen seems to give the best balance of performance across different
- * byte distributions.
- */
-#define STRIDE_LENGTH 16
-
-	if (len >= STRIDE_LENGTH)
+	if (len >= ASCII_CHECK_LEN)
 	{
-		while (len >= STRIDE_LENGTH)
+		while (len >= ASCII_CHECK_LEN)
 		{
 			/*
 			 * If the chunk is all ASCII, we can skip the full UTF-8 check,
 			 * but we must first check for a non-END state, which means the
 			 * previous chunk ended in the middle of a multibyte sequence.
 			 */
-			if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
-				utf8_advance(s, &state, STRIDE_LENGTH);
+			if (state != END || !is_valid_ascii(s, ASCII_CHECK_LEN))
+				utf8_advance(s, &state, ASCII_CHECK_LEN);
 
-			s += STRIDE_LENGTH;
-			len -= STRIDE_LENGTH;
+			s += ASCII_CHECK_LEN;
+			len -= ASCII_CHECK_LEN;
 		}
 
 		/* The error state persists, so we only need to check for it here. */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 011b0b3abd..cbb1dfe978 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -19,6 +19,8 @@
 #ifndef PG_WCHAR_H
 #define PG_WCHAR_H
 
+#include "port/simd.h"
+
 /*
  * The pg_wchar type
  */
@@ -704,25 +706,57 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
  * Verify a chunk of bytes for valid ASCII.
  *
  * Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
+ * Since the chunk size is platform-specific, we provide the ASCII_CHECK_LEN
+ * convenience macro for callers to pass for len.
  */
 static inline bool
 is_valid_ascii(const unsigned char *s, int len)
 {
 	const unsigned char *const s_end = s + len;
+
+#ifdef USE_SSE2
+	__m128i		chunk,
+				error_cum = _mm_setzero_si128();
+#else
 	uint64		chunk,
 				highbit_cum = UINT64CONST(0),
 				zero_cum = UINT64CONST(0x8080808080808080);
+#endif
+
+	/*
+	 * With two chunks, gcc can unroll the loop. Even if the compiler can
+	 * unroll a longer loop, it's not worth it because callers might have to
+	 * use a byte-wise algorithm if we return false.
+	 */
+#ifdef USE_SSE2
+#define ASCII_CHECK_LEN (2 * sizeof(__m128i))
+#else
+#define ASCII_CHECK_LEN (2 * sizeof(uint64))
+#endif
 
 	Assert(len % sizeof(chunk) == 0);
 
 	while (s < s_end)
 	{
+#ifdef USE_SSE2
+		chunk = _mm_loadu_si128((const __m128i *) s);
+#else
 		memcpy(&chunk, s, sizeof(chunk));
+#endif
+
+		/* Capture any zero bytes in this chunk. */
+#ifdef USE_SSE2
+
+		/*
+		 * Set all bits in each lane of the error accumulator where input
+		 * bytes are zero.
+		 */
+		error_cum = _mm_or_si128(error_cum,
+								 _mm_cmpeq_epi8(chunk, _mm_setzero_si128()));
+#else
 
 		/*
-		 * Capture any zero bytes in this chunk.
-		 *
 		 * First, add 0x7f to each byte. This sets the high bit in each byte,
 		 * unless it was a zero. If any resulting high bits are zero, the
 		 * corresponding high bits in the zero accumulator will be cleared.
@@ -734,13 +768,22 @@ is_valid_ascii(const unsigned char *s, int len)
 		 * because we check for those separately.
 		 */
 		zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
+#endif
 
 		/* Capture all set bits in this chunk. */
+#ifdef USE_SSE2
+		error_cum = _mm_or_si128(error_cum, chunk);
+#else
 		highbit_cum |= chunk;
+#endif
 
 		s += sizeof(chunk);
 	}
 
+#ifdef USE_SSE2
+	/* Check if any lanes in the error accumulator got set. */
+	return _mm_movemask_epi8(error_cum) == 0;
+#else
 	/* Check if any high bits in the high bit accumulator got set. */
 	if (highbit_cum & UINT64CONST(0x8080808080808080))
 		return false;
@@ -750,6 +793,7 @@ is_valid_ascii(const unsigned char *s, int len)
 		return false;
 
 	return true;
+#endif
 }
 
 #endif							/* PG_WCHAR_H */
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 442e7aff2b..434dc4d93c 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -140,7 +140,8 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verificatio
 -- will contain all 4 bytes if they are present, so various
 -- expressions below add 3 ASCII bytes to the end to ensure
 -- consistent error messages.
--- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- The number 64 below needs to equal or a multiple of the largest
+-- possible value of ASCII_CHECK_LEN in mb/pg_wchar.h.
 -- Test multibyte verification in fast path
 with test_bytes as (
   select
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 9a65fca91f..27ef069eaf 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -121,7 +121,8 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verificatio
 -- will contain all 4 bytes if they are present, so various
 -- expressions below add 3 ASCII bytes to the end to ensure
 -- consistent error messages.
--- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- The number 64 below needs to equal or a multiple of the largest
+-- possible value of ASCII_CHECK_LEN in mb/pg_wchar.h.
 
 -- Test multibyte verification in fast path
 with test_bytes as (
-- 
2.36.1

#2Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#1)
Re: use SSE2 for is_valid_ascii

On Wed, Aug 10, 2022 at 01:50:14PM +0700, John Naylor wrote:

Here is an updated patch using the new USE_SSE2 symbol. The style is
different from the last one in that each stanza has platform-specific
code. I wanted to try it this way because is_valid_ascii() is already
written in SIMD-ish style using general purpose registers and bit
twiddling, so it seemed natural to see the two side-by-side. Sometimes
they can share the same comment. If we think this is bad for
readability, I can go back to one block each, but that way leads to
duplication of code and it's difficult to see what's different for
each platform, IMO.

This is a neat patch. I don't know that we need an entirely separate code
block for the USE_SSE2 path, but I do think that a little bit of extra
commentary would improve the readability. IMO the existing comment for the
zero accumulator has the right amount of detail.

+		/*
+		 * Set all bits in each lane of the error accumulator where input
+		 * bytes are zero.
+		 */
+		error_cum = _mm_or_si128(error_cum,
+								 _mm_cmpeq_epi8(chunk, _mm_setzero_si128()));

I wonder if reusing a zero vector (instead of creating a new one every
time) has any noticeable effect on performance.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#3John Naylor
john.naylor@enterprisedb.com
In reply to: Nathan Bossart (#2)
Re: use SSE2 for is_valid_ascii

On Thu, Aug 11, 2022 at 5:31 AM Nathan Bossart <nathandbossart@gmail.com> wrote:

This is a neat patch. I don't know that we need an entirely separate code
block for the USE_SSE2 path, but I do think that a little bit of extra
commentary would improve the readability. IMO the existing comment for the
zero accumulator has the right amount of detail.

+               /*
+                * Set all bits in each lane of the error accumulator where input
+                * bytes are zero.
+                */
+               error_cum = _mm_or_si128(error_cum,
+                                                                _mm_cmpeq_epi8(chunk, _mm_setzero_si128()));

Okay, I will think about the comments, thanks for looking.

I wonder if reusing a zero vector (instead of creating a new one every
time) has any noticeable effect on performance.

Creating a zeroed register is just FOO PXOR FOO, which should get
hoisted out of the (unrolled in this case) loop, and which a recent
CPU will just map to a hard-coded zero in the register file, in which
case the execution latency is 0 cycles. :-)

--
John Naylor
EDB: http://www.enterprisedb.com

#4Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#3)
Re: use SSE2 for is_valid_ascii

On Thu, Aug 11, 2022 at 11:10:34AM +0700, John Naylor wrote:

I wonder if reusing a zero vector (instead of creating a new one every
time) has any noticeable effect on performance.

Creating a zeroed register is just FOO PXOR FOO, which should get
hoisted out of the (unrolled in this case) loop, and which a recent
CPU will just map to a hard-coded zero in the register file, in which
case the execution latency is 0 cycles. :-)

Ah, indeed. At -O2, my compiler seems to zero out two registers before the
loop with either approach:

pxor %xmm0, %xmm0 ; accumulator
pxor %xmm2, %xmm2 ; always zeros

And within the loop, I see the following:

movdqu (%rdi), %xmm1
movdqu (%rdi), %xmm3
addq $16, %rdi
pcmpeqb %xmm2, %xmm1 ; check for zeros
por %xmm3, %xmm0 ; OR data into accumulator
por %xmm1, %xmm0 ; OR zero check results into accumulator
cmpq %rdi, %rsi

So the call to _mm_setzero_si128() within the loop is fine. Apologies for
the noise.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#5John Naylor
john.naylor@enterprisedb.com
In reply to: Nathan Bossart (#4)
1 attachment(s)
Re: use SSE2 for is_valid_ascii

v3 applies on top of the v9 json_lex_string patch in [1] and adds a
bit more to that, resulting in a simpler patch that is more amenable
to additional SIMD-capable platforms.

[1]: /messages/by-id/CAFBsxsFV4v802idV0-Bo=V7wLMHRbOZ4er0hgposhyGCikmVGA@mail.gmail.com

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v3-use-simd-is-valid-ascii.patch (text/x-patch; charset=US-ASCII; name=v3-use-simd-is-valid-ascii.patch) — Download
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 1e6e198bf2..1ca7533f00 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1918,11 +1918,12 @@ pg_utf8_verifystr(const unsigned char *s, int len)
 	const int	orig_len = len;
 	uint32		state = BGN;
 
-/*
- * Sixteen seems to give the best balance of performance across different
- * byte distributions.
- */
-#define STRIDE_LENGTH 16
+	/*
+	 * With a stride of two vector widths, gcc will unroll the loop. Even if
+	 * the compiler can unroll a longer loop, it's not worth it because we
+	 * must fall back to the byte-wise algorithm if we find any non-ASCII.
+	 */
+#define STRIDE_LENGTH (2 * sizeof(Vector8))
 
 	if (len >= STRIDE_LENGTH)
 	{
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 011b0b3abd..aea045aa66 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -19,6 +19,8 @@
 #ifndef PG_WCHAR_H
 #define PG_WCHAR_H
 
+#include "port/simd.h"
+
 /*
  * The pg_wchar type
  */
@@ -704,25 +706,28 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
  * Verify a chunk of bytes for valid ASCII.
  *
  * Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
  */
 static inline bool
 is_valid_ascii(const unsigned char *s, int len)
 {
 	const unsigned char *const s_end = s + len;
-	uint64		chunk,
-				highbit_cum = UINT64CONST(0),
-				zero_cum = UINT64CONST(0x8080808080808080);
+	Vector8		chunk;
+	Vector8		highbit_cum = vector8_broadcast(0);
+#ifdef USE_NO_SIMD
+	Vector8		zero_cum = vector8_broadcast(0x80);
+#endif
 
 	Assert(len % sizeof(chunk) == 0);
 
 	while (s < s_end)
 	{
-		memcpy(&chunk, s, sizeof(chunk));
+		vector8_load(&chunk, s);
+
+		/* Capture any zero bytes in this chunk. */
+#if defined(USE_NO_SIMD)
 
 		/*
-		 * Capture any zero bytes in this chunk.
-		 *
 		 * First, add 0x7f to each byte. This sets the high bit in each byte,
 		 * unless it was a zero. If any resulting high bits are zero, the
 		 * corresponding high bits in the zero accumulator will be cleared.
@@ -734,20 +739,31 @@ is_valid_ascii(const unsigned char *s, int len)
 		 * because we check for those separately.
 		 */
 		zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
+#else
+
+		/*
+		 * Set all bits in each lane of the highbit accumulator where input
+		 * bytes are zero.
+		 */
+		highbit_cum = vector8_or(highbit_cum,
+								 vector8_eq(chunk, vector8_broadcast(0)));
+#endif
 
 		/* Capture all set bits in this chunk. */
-		highbit_cum |= chunk;
+		highbit_cum = vector8_or(highbit_cum, chunk);
 
 		s += sizeof(chunk);
 	}
 
 	/* Check if any high bits in the high bit accumulator got set. */
-	if (highbit_cum & UINT64CONST(0x8080808080808080))
+	if (vector8_is_highbit_set(highbit_cum))
 		return false;
 
+#ifdef USE_NO_SIMD
 	/* Check if any high bits in the zero accumulator got cleared. */
-	if (zero_cum != UINT64CONST(0x8080808080808080))
+	if (zero_cum != vector8_broadcast(0x80))
 		return false;
+#endif
 
 	return true;
 }
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 56df989094..8f85153110 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -38,6 +38,7 @@ typedef __m128i Vector8;
  * If no SIMD instructions are available, we can in some cases emulate vector
  * operations using bitwise operations on unsigned integers.
  */
+#define USE_NO_SIMD
 typedef uint64 Vector8;
 #endif
 
@@ -47,7 +48,11 @@ static inline Vector8 vector8_broadcast(const uint8 c);
 static inline bool vector8_has_zero(const Vector8 v);
 static inline bool vector8_has(const Vector8 v, const uint8 c);
 static inline bool vector8_has_le(const Vector8 v, const uint8 c);
-
+static inline bool vector8_is_highbit_set(const Vector8 v);
+static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
+#ifndef USE_NO_SIMD
+static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+#endif
 
 /*
  * Functions for loading a chunk of memory into a vector.
@@ -181,4 +186,38 @@ vector8_has_le(const Vector8 v, const uint8 c)
 	return result;
 }
 
+static inline bool
+vector8_is_highbit_set(const Vector8 v)
+{
+#ifdef USE_SSE2
+	return _mm_movemask_epi8(v) != 0;
+#else
+	return v & vector8_broadcast(0x80);
+#endif
+}
+
+/* comparisons between vectors */
+
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_eq(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+	return _mm_cmpeq_epi8(v1, v2);
+#endif
+}
+#endif
+
+/* bitwise operations */
+
+static inline Vector8
+vector8_or(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+	return _mm_or_si128(v1, v2);
+#else
+	return v1 | v2;
+#endif
+}
+
 #endif							/* SIMD_H */
#6Nathan Bossart
nathandbossart@gmail.com
In reply to: John Naylor (#5)
Re: use SSE2 for is_valid_ascii

On Thu, Aug 25, 2022 at 04:41:53PM +0700, John Naylor wrote:

v3 applies on top of the v9 json_lex_string patch in [1] and adds a
bit more to that, resulting in a simpler patch that is more amenable
to additional SIMD-capable platforms.

LGTM

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#7John Naylor
john.naylor@enterprisedb.com
In reply to: Nathan Bossart (#6)
Re: use SSE2 for is_valid_ascii

On Fri, Aug 26, 2022 at 10:26 AM Nathan Bossart
<nathandbossart@gmail.com> wrote:

On Thu, Aug 25, 2022 at 04:41:53PM +0700, John Naylor wrote:

v3 applies on top of the v9 json_lex_string patch in [1] and adds a
bit more to that, resulting in a simpler patch that is more amenable
to additional SIMD-capable platforms.

LGTM

Thanks for looking, pushed with some rearrangements.

--
John Naylor
EDB: http://www.enterprisedb.com