From 97362f6bec5a8d6e016d8a6b8700c2d3c7e7b877 Mon Sep 17 00:00:00 2001
From: John Naylor
Date: Wed, 24 Feb 2021 11:39:42 -0400
Subject: [PATCH v7 3/4] Add an ASCII fast path to the fallback UTF-8 validator.

Using bitwise operations, we can check an entire 8-byte chunk for
both valid ASCII and zero bytes.
---
Note: the post-image blob hash on the pg_utf8.h "index" line below has not
been regenerated after revising check_ascii().

 src/include/port/pg_utf8.h  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 src/port/pg_utf8_fallback.c | 11 ++++++++++-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/src/include/port/pg_utf8.h b/src/include/port/pg_utf8.h
index 636c637706..a19fc55c1e 100644
--- a/src/include/port/pg_utf8.h
+++ b/src/include/port/pg_utf8.h
@@ -48,4 +48,50 @@ extern int pg_validate_utf8_fallback(const unsigned char *s, int len);
 #define IS_THREE_BYTE_LEAD(c) (((c) & 0xF0) == 0xE0)
 #define IS_FOUR_BYTE_LEAD(c) (((c) & 0xF8) == 0xF0)
 
+/*
+ * Nonzero iff at least one byte of the 64-bit value "chunk" is zero.
+ * Note that "chunk" is evaluated twice, so it must not have side effects.
+ *
+ * from https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+ */
+#define HAS_ZERO(chunk) ( \
+	((chunk) - UINT64CONST(0x0101010101010101)) & \
+	 ~(chunk) & \
+	 UINT64CONST(0x8080808080808080))
+
+/*
+ * Verify a chunk of bytes for valid ASCII including a zero-byte check.
+ *
+ * Examines the first sizeof(uint64) bytes at "s", provided "len" says that
+ * many are available.  Returns sizeof(uint64) if none of those bytes is zero
+ * or has its high bit set, so the caller may skip over them.  Returns 0
+ * otherwise, including when "len" is too short (or negative), in which case
+ * the caller must verify the input one character at a time.
+ */
+static inline int
+check_ascii(const unsigned char *s, int len)
+{
+	uint64		chunk;
+
+	/*
+	 * Compare as signed: without the cast, a negative len would be converted
+	 * to a huge unsigned value and pass this check.
+	 */
+	if (len < (int) sizeof(uint64))
+		return 0;
+
+	/* Fetch the next chunk into a local variable. */
+	memcpy(&chunk, s, sizeof(uint64));
+
+	/* If there are zero bytes, bail and let the slow path handle it. */
+	if (HAS_ZERO(chunk))
+		return 0;
+
+	/* If any byte has the high bit set, it's not ASCII; use the slow path. */
+	if (chunk & UINT64CONST(0x8080808080808080))
+		return 0;
+
+	return (int) sizeof(uint64);
+}
+
 #endif							/* PG_UTF8_H */
diff --git a/src/port/pg_utf8_fallback.c b/src/port/pg_utf8_fallback.c
index 85a6bbd0eb..9a29d909ef 100644
--- a/src/port/pg_utf8_fallback.c
+++ b/src/port/pg_utf8_fallback.c
@@ -34,7 +34,16 @@ pg_validate_utf8_fallback(const unsigned char *s, int len)
 	{
 		int			l;
 
-		/* ASCII */
+		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
-- 
2.22.0