From b3577cfb84149214fe8a1119a92ee6fb1a362628 Mon Sep 17 00:00:00 2001 From: Aleksander Alekseev Date: Wed, 9 Oct 2024 15:02:58 +0300 Subject: [PATCH v1] Refactor bytea_sortsupport() TODO FIXME write a better commit message Aleksander Alekseev, reviewed by TODO FIXME Discussion: TODO FIXME --- src/backend/utils/adt/varlena.c | 303 +++++++++++++++++++++++++++++--- 1 file changed, 281 insertions(+), 22 deletions(-) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 533bebc1c7..0c6e2922fc 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -96,6 +96,22 @@ typedef struct pg_locale_t locale; } VarStringSortSupport; +typedef struct /* AALEKSEEV TODO FIXME correct comments, remove redundant fields */ +{ + char *buf1; /* 1st string, or abbreviation original string + * buf */ + char *buf2; /* 2nd string, or abbreviation strxfrm() buf */ + int buflen1; /* Allocated length of buf1 */ + int buflen2; /* Allocated length of buf2 */ + int last_len1; /* Length of last buf1 string/strxfrm() input */ + int last_len2; /* Length of last buf2 string/strxfrm() blob */ + int last_returned; /* Last comparison result (cache) */ + // bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */ --- AALEKSEEV always true + hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ + hyperLogLogState full_card; /* Full key cardinality state */ + double prop_card; /* Required cardinality proportion */ +} ByteaSortSupport; + /* * Output data for split_text(): we output either to an array or a table. * tupstore and tupdesc must be set up in advance to output to a table. @@ -116,6 +132,10 @@ typedef struct #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X)) #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X)) +static Datum bytea_abbrev_convert(Datum original, SortSupport ssup); +static bool bytea_abbrev_abort(int memtupcount, SortSupport ssup); +static int bytea_ssup_comparator(Datum x, Datum y, SortSupport ssup); + static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup); static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup); static int namefastcmp_c(Datum x, Datum y, SortSupport ssup); @@ -124,6 +144,7 @@ static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup); static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup); static Datum varstr_abbrev_convert(Datum original, SortSupport ssup); static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup); + static int32 text_length(Datum str); static text *text_catenate(text *t1, text *t2); static text *text_substring(Datum str, @@ -1847,10 +1868,8 @@ bttextsortsupport(PG_FUNCTION_ARGS) * Includes locale support, and support for BpChar semantics (i.e. removing * trailing spaces before comparison). * - * Relies on the assumption that text, VarChar, BpChar, and bytea all have the - * same representation. Callers that always use the C collation (e.g. - * non-collatable type callers like bytea) may have NUL bytes in their strings; - * this will not work with any other collation, though. + * Relies on the assumption that text, VarChar and BpChar all have the + * same representation. */ void varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) @@ -2214,7 +2233,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) * representation. Our encoding strategy is simple -- pack the first 8 bytes * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are * stored in reverse order), and treat it as an unsigned integer. When the "C" - * locale is used, or in case of bytea, just memcpy() from original instead. + * locale is used just memcpy() from original instead. */ static Datum varstr_abbrev_convert(Datum original, SortSupport ssup) @@ -2242,21 +2261,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) /* * If we're using the C collation, use memcpy(), rather than strxfrm(), to * abbreviate keys. The full comparator for the C locale is always - * memcmp(). It would be incorrect to allow bytea callers (callers that - * always force the C collation -- bytea isn't a collatable type, but this - * approach is convenient) to use strxfrm(). This is because bytea - * strings may contain NUL bytes. Besides, this should be faster, too. - * - * More generally, it's okay that bytea callers can have NUL bytes in - * strings because abbreviated cmp need not make a distinction between - * terminating NUL bytes, and NUL bytes representing actual NULs in the - * authoritative representation. Hopefully a comparison at or past one - * abbreviated key's terminating NUL byte will resolve the comparison - * without consulting the authoritative representation; specifically, some - * later non-NUL byte in the longer string can resolve the comparison - * against a subsequent terminating NUL in the shorter string. There will - * usually be what is effectively a "length-wise" resolution there and - * then. + * memcmp(). * * If that doesn't work out -- if all bytes in the longer string * positioned at or past the offset of the smaller string's (first) @@ -2524,6 +2529,235 @@ varstr_abbrev_abort(int memtupcount, SortSupport ssup) return true; } +/* + * sortsupport comparison func for bytea + */ +static int +bytea_ssup_comparator(Datum x, Datum y, SortSupport ssup) +{ + bytea *arg1 = DatumGetByteaPP(x); + bytea *arg2 = DatumGetByteaPP(y); + int len1, + len2, + result; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + if ((result == 0) && (len1 != len2)) + result = (len1 < len2) ? -1 : 1; + + /* We can't afford to leak memory here. */ + /* XXX we should probably add ByteaGetDatum() for type safety - a.alekseev */ + if (PointerGetDatum(arg1) != x) + pfree(arg1); + if (PointerGetDatum(arg2) != y) + pfree(arg2); + + return result; +} + +/* + * Conversion routine for sortsupport. Converts original to abbreviated key + * representation. Our encoding strategy is simple -- pack the first 8 bytes + * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are + * stored in reverse order), and treat it as an unsigned integer. When the "C" + * locale is used just memcpy() from original instead. + */ +// AALEKSEEV TODO FIXME refactor +static Datum +bytea_abbrev_convert(Datum original, SortSupport ssup) +{ + const size_t max_prefix_bytes = sizeof(Datum); + ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra; + VarString *authoritative = DatumGetVarStringPP(original); + char *authoritative_data = VARDATA_ANY(authoritative); + + /* working state */ + Datum res; + char *pres; + int len; + uint32 hash; + + pres = (char *) &res; + /* memset(), so any non-overwritten bytes are NUL */ + memset(pres, 0, max_prefix_bytes); + len = VARSIZE_ANY_EXHDR(authoritative); + + memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); + + /* + * Maintain approximate cardinality of both abbreviated keys and original, + * authoritative keys using HyperLogLog. Used as cheap insurance against + * the worst case, where we do many string transformations for no saving + * in full strcoll()-based comparisons. These statistics are used by + * varstr_abbrev_abort(). + * + * First, Hash key proper, or a significant fraction of it. Mix in length + * in order to compensate for cases where differences are past + * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. + */ + hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, + Min(len, PG_CACHE_LINE_SIZE))); + + if (len > PG_CACHE_LINE_SIZE) + hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + + addHyperLogLog(&bss->full_card, hash); + + /* Hash abbreviated key */ +#if SIZEOF_DATUM == 8 + { + uint32 lohalf, + hihalf; + + lohalf = (uint32) res; + hihalf = (uint32) (res >> 32); + hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf)); + } +#else /* SIZEOF_DATUM != 8 */ + hash = DatumGetUInt32(hash_uint32((uint32) res)); +#endif + + addHyperLogLog(&bss->abbr_card, hash); + + /* Cache result, perhaps saving an expensive strxfrm() call next time */ + /// bss->cache_blob = true; --- AALEKSEEV always true + + /* + * Byteswap on little-endian machines. + * + * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer + * 3-way comparator) works correctly on all platforms. If we didn't do + * this, the comparator would have to call memcmp() with a pair of + * pointers to the first byte of each abbreviated key, which is slower. + */ + res = DatumBigEndianToNative(res); + + /* Don't leak memory here */ + if (PointerGetDatum(authoritative) != original) + pfree(authoritative); + + return res; +} + +/* + * Callback for estimating effectiveness of abbreviated key optimization, using + * heuristic rules. Returns value indicating if the abbreviation optimization + * should be aborted, based on its projected effectiveness. + */ +// AALEKSEEV TODO FIXME refactor +static bool +bytea_abbrev_abort(int memtupcount, SortSupport ssup) +{ + ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra; + double abbrev_distinct, + key_distinct; + + Assert(ssup->abbreviate); + + /* Have a little patience */ + if (memtupcount < 100) + return false; + + abbrev_distinct = estimateHyperLogLog(&bss->abbr_card); + key_distinct = estimateHyperLogLog(&bss->full_card); + + /* + * Clamp cardinality estimates to at least one distinct value. While + * NULLs are generally disregarded, if only NULL values were seen so far, + * that might misrepresent costs if we failed to clamp. + */ + if (abbrev_distinct <= 1.0) + abbrev_distinct = 1.0; + + if (key_distinct <= 1.0) + key_distinct = 1.0; + + /* + * In the worst case all abbreviated keys are identical, while at the same + * time there are differences within full key strings not captured in + * abbreviations. + */ + if (trace_sort) + { + double norm_abbrev_card = abbrev_distinct / (double) memtupcount; + + elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f " + "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)", + memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card, + bss->prop_card); + } + + /* + * If the number of distinct abbreviated keys approximately matches the + * number of distinct authoritative original keys, that's reason enough to + * proceed. We can win even with a very low cardinality set if most + * tie-breakers only memcmp(). This is by far the most important + * consideration. + * + * While comparisons that are resolved at the abbreviated key level are + * considerably cheaper than tie-breakers resolved with memcmp(), both of + * those two outcomes are so much cheaper than a full strcoll() once + * sorting is underway that it doesn't seem worth it to weigh abbreviated + * cardinality against the overall size of the set in order to more + * accurately model costs. Assume that an abbreviated comparison, and an + * abbreviated comparison with a cheap memcmp()-based authoritative + * resolution are equivalent. + */ + if (abbrev_distinct > key_distinct * bss->prop_card) + { + /* + * When we have exceeded 10,000 tuples, decay required cardinality + * aggressively for next call. + * + * This is useful because the number of comparisons required on + * average increases at a linearithmic rate, and at roughly 10,000 + * tuples that factor will start to dominate over the linear costs of + * string transformation (this is a conservative estimate). The decay + * rate is chosen to be a little less aggressive than halving -- which + * (since we're called at points at which memtupcount has doubled) + * would never see the cost model actually abort past the first call + * following a decay. This decay rate is mostly a precaution against + * a sudden, violent swing in how well abbreviated cardinality tracks + * full key cardinality. The decay also serves to prevent a marginal + * case from being aborted too late, when too much has already been + * invested in string transformation. + * + * It's possible for sets of several million distinct strings with + * mere tens of thousands of distinct abbreviated keys to still + * benefit very significantly. This will generally occur provided + * each abbreviated key is a proxy for a roughly uniform number of the + * set's full keys. If it isn't so, we hope to catch that early and + * abort. If it isn't caught early, by the time the problem is + * apparent it's probably not worth aborting. + */ + if (memtupcount > 10000) + bss->prop_card *= 0.65; + + return false; + } + + /* + * Abort abbreviation strategy. + * + * The worst case, where all abbreviated keys are identical while all + * original strings differ will typically only see a regression of about + * 10% in execution time for small to medium sized lists of strings. + * Whereas on modern CPUs where cache stalls are the dominant cost, we can + * often expect very large improvements, particularly with sets of strings + * of moderately high to high abbreviated cardinality. There is little to + * lose but much to gain, which our strategy reflects. + */ + if (trace_sort) + elog(LOG, "varstr_abbrev: aborted abbreviation at %d " + "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)", + memtupcount, abbrev_distinct, key_distinct, bss->prop_card); + + return true; +} + /* * Generic equalimage support function for character type's operator classes. * Disables the use of deduplication with nondeterministic collations. @@ -3977,8 +4211,33 @@ bytea_sortsupport(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - /* Use generic string SortSupport, forcing "C" collation */ - varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID); + ssup->comparator = bytea_ssup_comparator; + + if (ssup->abbreviate) + { + ByteaSortSupport *bss; + + bss = palloc(sizeof(ByteaSortSupport)); + bss->buf1 = palloc(TEXTBUFLEN); + bss->buflen1 = TEXTBUFLEN; + bss->buf2 = palloc(TEXTBUFLEN); + bss->buflen2 = TEXTBUFLEN; + /* Start with invalid values */ + bss->last_len1 = -1; + bss->last_len2 = -1; + /* Initialize */ + bss->last_returned = 0; + + bss->prop_card = 0.20; + initHyperLogLog(&bss->abbr_card, 10); + initHyperLogLog(&bss->full_card, 10); + + ssup->ssup_extra = bss; + ssup->abbrev_full_comparator = ssup->comparator; + ssup->comparator = ssup_datum_unsigned_cmp; + ssup->abbrev_converter = bytea_abbrev_convert; + ssup->abbrev_abort = bytea_abbrev_abort; + } MemoryContextSwitchTo(oldcontext); -- 2.46.0