From: John Naylor Date: Tue, 16 Dec 2025 08:19:16 +0000 (+0700) Subject: Separate out bytea sort support from varlena.c X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=9303d62c6db0207b8c2649205f7bc9350c1f62db;p=postgresql.git Separate out bytea sort support from varlena.c In the wake of commit b45242fd3, bytea_sortsupport() still called out to varstr_sortsupport(). Treating bytea as a kind of text/varchar required varstr_sortsupport() to allow for the possibility of NUL bytes, but only for C collation. This was confusing. For better separation of concerns, create an independent sortsupport implementation in bytea.c. The heuristics for bytea_abbrev_abort() remain the same as for varstr_abbrev_abort(). It's possible that the bytea case warrants different treatment, but that is left for future investigation. In passing, adjust some strange looking comparisons in varstr_abbrev_abort(). Author: Aleksander Alekseev Reviewed-by: John Naylor Reviewed-by: Chao Li Discussion: https://postgr.es/m/CAJ7c6TP1bAbEhUJa6+rgceN6QJWMSsxhg1=mqfSN=Nb-n6DAKg@mail.gmail.com --- diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c index 6e7b914c563..f8524548e46 100644 --- a/src/backend/utils/adt/bytea.c +++ b/src/backend/utils/adt/bytea.c @@ -15,18 +15,19 @@ #include "postgres.h" #include "access/detoast.h" -#include "catalog/pg_collation_d.h" -#include "catalog/pg_type_d.h" +#include "common/hashfn.h" #include "common/int.h" #include "fmgr.h" +#include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "port/pg_bitutils.h" +#include "port/pg_bswap.h" #include "utils/builtins.h" #include "utils/bytea.h" #include "utils/fmgrprotos.h" +#include "utils/guc.h" #include "utils/memutils.h" #include "utils/sortsupport.h" -#include "utils/varlena.h" #include "varatt.h" /* GUC variable */ @@ -37,6 +38,19 @@ static bytea *bytea_substring(Datum str, int S, int L, bool length_not_specified); static bytea 
*bytea_overlay(bytea *t1, bytea *t2, int sp, int sl); +typedef struct +{ + bool abbreviate; /* Should we abbreviate keys? */ + hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ + hyperLogLogState full_card; /* Full key cardinality state */ + double prop_card; /* Required cardinality proportion */ +} ByteaSortSupport; + +/* Static function declarations for sort support */ +static int byteafastcmp(Datum x, Datum y, SortSupport ssup); +static Datum bytea_abbrev_convert(Datum original, SortSupport ssup); +static bool bytea_abbrev_abort(int memtupcount, SortSupport ssup); + /* * bytea_catenate * Guts of byteacat(), broken out so it can be used by other functions @@ -1001,6 +1015,201 @@ bytea_smaller(PG_FUNCTION_ARGS) PG_RETURN_BYTEA_P(result); } +/* + * sortsupport comparison func + */ +static int +byteafastcmp(Datum x, Datum y, SortSupport ssup) +{ + bytea *arg1 = DatumGetByteaPP(x); + bytea *arg2 = DatumGetByteaPP(y); + char *a1p, + *a2p; + int len1, + len2, + result; + + a1p = VARDATA_ANY(arg1); + a2p = VARDATA_ANY(arg2); + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + result = memcmp(a1p, a2p, Min(len1, len2)); + if ((result == 0) && (len1 != len2)) + result = (len1 < len2) ? -1 : 1; + + /* We can't afford to leak memory here. */ + if (PointerGetDatum(arg1) != x) + pfree(arg1); + if (PointerGetDatum(arg2) != y) + pfree(arg2); + + return result; +} + +/* + * Conversion routine for sortsupport. Converts original to abbreviated key + * representation. Our encoding strategy is simple -- pack the first 8 bytes + * of the bytea data into a Datum (on little-endian machines, the bytes are + * stored in reverse order), and treat it as an unsigned integer. 
+ */ +static Datum +bytea_abbrev_convert(Datum original, SortSupport ssup) +{ + const size_t max_prefix_bytes = sizeof(Datum); + ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra; + bytea *authoritative = DatumGetByteaPP(original); + char *authoritative_data = VARDATA_ANY(authoritative); + Datum res; + char *pres; + int len; + uint32 hash; + + pres = (char *) &res; + + /* memset(), so any non-overwritten bytes are NUL */ + memset(pres, 0, max_prefix_bytes); + len = VARSIZE_ANY_EXHDR(authoritative); + + /* + * Short byteas will have terminating NUL bytes in the abbreviated datum. + * Abbreviated comparison need not make a distinction between these NUL + * bytes, and NUL bytes representing actual NULs in the authoritative + * representation. + * + * Hopefully a comparison at or past one abbreviated key's terminating NUL + * byte will resolve the comparison without consulting the authoritative + * representation; specifically, some later non-NUL byte in the longer + * bytea can resolve the comparison against a subsequent terminating NUL + * in the shorter bytea. There will usually be what is effectively a + * "length-wise" resolution there and then. + * + * If that doesn't work out -- if all bytes in the longer bytea positioned + * at or past the offset of the smaller bytea (first) terminating NUL are + * actually representative of NUL bytes in the authoritative binary bytea + * (perhaps with some *terminating* NUL bytes towards the end of the + * longer bytea iff it happens to still be small) -- then an authoritative + * tie-breaker will happen, and do the right thing: explicitly consider + * bytea length. + */ + memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); + + /* + * Maintain approximate cardinality of both abbreviated keys and original, + * authoritative keys using HyperLogLog. Used as cheap insurance against + * the worst case, where we do many string abbreviations for no saving in + * full memcmp()-based comparisons. 
These statistics are used by + * bytea_abbrev_abort(). + * + * First, Hash key proper, or a significant fraction of it. Mix in length + * in order to compensate for cases where differences are past + * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. + */ + hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, + Min(len, PG_CACHE_LINE_SIZE))); + + if (len > PG_CACHE_LINE_SIZE) + hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + + addHyperLogLog(&bss->full_card, hash); + + /* Hash abbreviated key */ + { + uint32 tmp; + + tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); + hash = DatumGetUInt32(hash_uint32(tmp)); + } + + addHyperLogLog(&bss->abbr_card, hash); + + /* + * Byteswap on little-endian machines. + * + * This is needed so that ssup_datum_unsigned_cmp() works correctly on all + * platforms. + */ + res = DatumBigEndianToNative(res); + + /* Don't leak memory here */ + if (PointerGetDatum(authoritative) != original) + pfree(authoritative); + + return res; +} + +/* + * Callback for estimating effectiveness of abbreviated key optimization, using + * heuristic rules. Returns value indicating if the abbreviation optimization + * should be aborted, based on its projected effectiveness. + * + * This is based on varstr_abbrev_abort(), but some comments have been elided + * for brevity. See there for more details. + */ +static bool +bytea_abbrev_abort(int memtupcount, SortSupport ssup) +{ + ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra; + double abbrev_distinct, + key_distinct; + + Assert(ssup->abbreviate); + + /* Have a little patience */ + if (memtupcount < 100) + return false; + + abbrev_distinct = estimateHyperLogLog(&bss->abbr_card); + key_distinct = estimateHyperLogLog(&bss->full_card); + + /* + * Clamp cardinality estimates to at least one distinct value. While + * NULLs are generally disregarded, if only NULL values were seen so far, + * that might misrepresent costs if we failed to clamp. 
+ */ + if (abbrev_distinct < 1.0) + abbrev_distinct = 1.0; + + if (key_distinct < 1.0) + key_distinct = 1.0; + + if (trace_sort) + { + double norm_abbrev_card = abbrev_distinct / (double) memtupcount; + + elog(LOG, "bytea_abbrev: abbrev_distinct after %d: %f " + "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)", + memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card, + bss->prop_card); + } + + /* + * If the number of distinct abbreviated keys approximately matches the + * number of distinct original keys, continue with abbreviation. + */ + if (abbrev_distinct > key_distinct * bss->prop_card) + { + /* + * Decay required cardinality aggressively after 10,000 tuples. + */ + if (memtupcount > 10000) + bss->prop_card *= 0.65; + + return false; + } + + /* + * Abort abbreviation strategy. + */ + if (trace_sort) + elog(LOG, "bytea_abbrev: aborted abbreviation at %d " + "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)", + memtupcount, abbrev_distinct, key_distinct, bss->prop_card); + + return true; +} + Datum bytea_sortsupport(PG_FUNCTION_ARGS) { @@ -1009,8 +1218,27 @@ bytea_sortsupport(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - /* Use generic string SortSupport, forcing "C" collation */ - varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID); + ssup->comparator = byteafastcmp; + + /* + * Set up abbreviation support if requested. 
+ */ + if (ssup->abbreviate) + { + ByteaSortSupport *bss; + + bss = palloc_object(ByteaSortSupport); + bss->abbreviate = true; + bss->prop_card = 0.20; + initHyperLogLog(&bss->abbr_card, 10); + initHyperLogLog(&bss->full_card, 10); + + ssup->ssup_extra = bss; + ssup->abbrev_full_comparator = ssup->comparator; + ssup->comparator = ssup_datum_unsigned_cmp; + ssup->abbrev_converter = bytea_abbrev_convert; + ssup->abbrev_abort = bytea_abbrev_abort; + } MemoryContextSwitchTo(oldcontext); diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index baa5b44ea8d..8adeb8dadc6 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -92,7 +92,7 @@ typedef struct int last_returned; /* Last comparison result (cache) */ bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */ bool collate_c; - Oid typid; /* Actual datatype (text/bpchar/bytea/name) */ + Oid typid; /* Actual datatype (text/bpchar/name) */ hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ hyperLogLogState full_card; /* Full key cardinality state */ double prop_card; /* Required cardinality proportion */ @@ -1617,10 +1617,8 @@ bttextsortsupport(PG_FUNCTION_ARGS) * Includes locale support, and support for BpChar semantics (i.e. removing * trailing spaces before comparison). * - * Relies on the assumption that text, VarChar, BpChar, and bytea all have the - * same representation. Callers that always use the C collation (e.g. - * non-collatable type callers like bytea) may have NUL bytes in their strings; - * this will not work with any other collation, though. + * Relies on the assumption that text, VarChar, and BpChar all have the + * same representation. */ void varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) @@ -1983,7 +1981,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) * representation. 
Our encoding strategy is simple -- pack the first 8 bytes * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are * stored in reverse order), and treat it as an unsigned integer. When the "C" - * locale is used, or in case of bytea, just memcpy() from original instead. + * locale is used just memcpy() from original instead. */ static Datum varstr_abbrev_convert(Datum original, SortSupport ssup) @@ -2010,30 +2008,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) /* * If we're using the C collation, use memcpy(), rather than strxfrm(), to - * abbreviate keys. The full comparator for the C locale is always - * memcmp(). It would be incorrect to allow bytea callers (callers that - * always force the C collation -- bytea isn't a collatable type, but this - * approach is convenient) to use strxfrm(). This is because bytea - * strings may contain NUL bytes. Besides, this should be faster, too. - * - * More generally, it's okay that bytea callers can have NUL bytes in - * strings because abbreviated cmp need not make a distinction between - * terminating NUL bytes, and NUL bytes representing actual NULs in the - * authoritative representation. Hopefully a comparison at or past one - * abbreviated key's terminating NUL byte will resolve the comparison - * without consulting the authoritative representation; specifically, some - * later non-NUL byte in the longer string can resolve the comparison - * against a subsequent terminating NUL in the shorter string. There will - * usually be what is effectively a "length-wise" resolution there and - * then. 
- * - * If that doesn't work out -- if all bytes in the longer string - * positioned at or past the offset of the smaller string's (first) - * terminating NUL are actually representative of NUL bytes in the - * authoritative binary string (perhaps with some *terminating* NUL bytes - * towards the end of the longer string iff it happens to still be small) - * -- then an authoritative tie-breaker will happen, and do the right - * thing: explicitly consider string length. + * abbreviate keys. The full comparator for the C locale is also + * memcmp(). This should be faster than strxfrm(). */ if (sss->collate_c) memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); @@ -2115,9 +2091,6 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * strxfrm() blob is itself NUL terminated, leaving no danger of * misinterpreting any NUL bytes not intended to be interpreted as * logically representing termination. - * - * (Actually, even if there were NUL bytes in the blob it would be - * okay. See remarks on bytea case above.) */ memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize)); } @@ -2198,10 +2171,10 @@ varstr_abbrev_abort(int memtupcount, SortSupport ssup) * NULLs are generally disregarded, if only NULL values were seen so far, * that might misrepresent costs if we failed to clamp. */ - if (abbrev_distinct <= 1.0) + if (abbrev_distinct < 1.0) abbrev_distinct = 1.0; - if (key_distinct <= 1.0) + if (key_distinct < 1.0) key_distinct = 1.0; /* diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 3451538565e..04845d5e680 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -366,6 +366,7 @@ BulkWriteBuffer BulkWriteState BumpBlock BumpContext +ByteaSortSupport CACHESIGN CAC_state CCFastEqualFN