From 19b6de698ade5f12c5058764b6906e2472a44322 Mon Sep 17 00:00:00 2001 From: Brandur Date: Mon, 29 Jul 2019 20:09:43 -0700 Subject: [PATCH v5] SortSupport for inet/cidr types Implements SortSupport for the inet/cidr types in Postgres by devising an abbreviated key representation for them that will faithfully reproduce their existing sorting rules. This has the effect of typically reducing the time taken for inet/cidr sorts by ~50-60%, and should show a good improvement for the vast majority of real-world data sets. --- src/backend/utils/adt/network.c | 369 +++++++++++++++++++++++++++++++++++++ src/include/catalog/pg_amproc.dat | 3 + src/include/catalog/pg_proc.dat | 3 + src/test/regress/expected/inet.out | 104 +++++++++++ src/test/regress/sql/inet.sql | 53 ++++++ 5 files changed, 532 insertions(+) diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index 5e980cf23f..12e9f29b66 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -16,6 +16,7 @@ #include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" #include "common/ip.h" +#include "lib/hyperloglog.h" #include "libpq/libpq-be.h" #include "libpq/pqformat.h" #include "miscadmin.h" @@ -24,12 +25,36 @@ #include "nodes/supportnodes.h" #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/guc.h" #include "utils/hashutils.h" #include "utils/inet.h" #include "utils/lsyscache.h" +#include "utils/sortsupport.h" +/* + * An IPv4 netmask size is a value in the range of 0 - 32, which is + * represented with 6 bits in inet/cidr abbreviated keys where possible. + * + * An IPv4 inet/cidr abbreviated key can use up to 25 bits for subnet + * component. + */ +#define ABBREV_BITS_INET4_NETMASK_SIZE 6 +#define ABBREV_BITS_INET4_SUBNET 25 + +/* sortsupport for inet/cidr */ +typedef struct +{ + int64 input_count; /* number of non-null values seen */ + bool estimating; /* true if estimating cardinality */ + + hyperLogLogState abbr_card; /* cardinality estimator */ +} network_sortsupport_state; static int32 network_cmp_internal(inet *a1, inet *a2); +static int network_fast_cmp(Datum x, Datum y, SortSupport ssup); +static int network_cmp_abbrev(Datum x, Datum y, SortSupport ssup); +static bool network_abbrev_abort(int memtupcount, SortSupport ssup); +static Datum network_abbrev_convert(Datum original, SortSupport ssup); static List *match_network_function(Node *leftop, Node *rightop, int indexarg, @@ -405,6 +430,350 @@ network_cmp(PG_FUNCTION_ARGS) PG_RETURN_INT32(network_cmp_internal(a1, a2)); } +/* + * SortSupport strategy routine + */ +Datum +network_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = network_fast_cmp; + ssup->ssup_extra = NULL; + + if (ssup->abbreviate) + { + network_sortsupport_state *uss; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + uss = palloc(sizeof(network_sortsupport_state)); + uss->input_count = 0; + uss->estimating = true; + initHyperLogLog(&uss->abbr_card, 10); + + ssup->ssup_extra = uss; + + ssup->comparator = network_cmp_abbrev; + ssup->abbrev_converter = network_abbrev_convert; + ssup->abbrev_abort = network_abbrev_abort; + ssup->abbrev_full_comparator = network_fast_cmp; + + MemoryContextSwitchTo(oldcontext); + } + + PG_RETURN_VOID(); +} + +/* + * SortSupport comparison func + */ +static int +network_fast_cmp(Datum x, Datum y, SortSupport ssup) +{ + inet *arg1 = DatumGetInetPP(x); + inet *arg2 = DatumGetInetPP(y); + + return network_cmp_internal(arg1, arg2); +} + +/* + * Abbreviated key comparison func + */ +static int +network_cmp_abbrev(Datum x, Datum y, SortSupport ssup) +{ + if (x > y) + return 1; + else if (x == y) + return 0; + else + return -1; +} + +/* + * Callback for estimating effectiveness of abbreviated key optimization. + * + * We pay no attention to the cardinality of the non-abbreviated data, because + * there is no equality fast-path within authoritative inet comparator. + */ +static bool +network_abbrev_abort(int memtupcount, SortSupport ssup) +{ + network_sortsupport_state *uss = ssup->ssup_extra; + double abbr_card; + + if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating) + return false; + + abbr_card = estimateHyperLogLog(&uss->abbr_card); + + /* + * If we have >100k distinct values, then even if we were sorting many + * billion rows we'd likely still break even, and the penalty of undoing + * that many rows of abbrevs would probably not be worth it. At this point + * we stop counting because we know that we're now fully committed. + */ + if (abbr_card > 100000.0) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "network_abbrev: estimation ends at cardinality %f" + " after " INT64_FORMAT " values (%d rows)", + abbr_card, uss->input_count, memtupcount); +#endif + uss->estimating = false; + return false; + } + + /* + * Target minimum cardinality is 1 per ~2k of non-null inputs. 0.5 row + * fudge factor allows us to abort earlier on genuinely pathological data + * where we've had exactly one abbreviated value in the first 2k + * (non-null) rows. + */ + if (abbr_card < uss->input_count / 2000.0 + 0.5) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "network_abbrev: aborting abbreviation at cardinality %f" + " below threshold %f after " INT64_FORMAT " values (%d rows)", + abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count, + memtupcount); +#endif + return true; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "network_abbrev: cardinality %f after " INT64_FORMAT + " values (%d rows)", abbr_card, uss->input_count, memtupcount); +#endif + + return false; +} + +/* + * SortSupport conversion routine. Converts original inet/cidr representation + * to abbreviated key representation that works with simple 3-way unsigned int + * comparisons. The network_cmp_internal() rules for sorting inet/cidr datums + * are followed by abbreviated comparisons by an encoding scheme that + * conditions keys through careful use of padding. + * + * Some background: inet values have three major components (take for example + * the address 1.2.3.4/24): + * + * * A network, or netmasked bits (1.2.3.0). + * * A netmask size (/24). + * * A subnet, or bits outside of the netmask (0.0.0.4). + * + * cidr values are the same except that with only the first two components -- + * all their subnet bits *must* be zero (1.2.3.0/24). + * + * IPv4 and IPv6 are identical in this makeup, with the difference being that + * IPv4 addresses have a maximum of 32 bits compared to IPv6's 64 bits, so in + * IPv6 each part may be larger. + * + * inet/cdir types compare using these sorting rules. If inequality is detected + * at any step, comparison is finished. If any rule is a tie, the algorithm + * drops through to the next to break it: + * + * 1. IPv4 always appears before IPv6. + * 2. Network bits are compared. + * 3. Netmask size is compared. + * 4. All bits are compared (having made it here, we know that both + * netmasked bits and netmask size are equal, so we're in effect only + * comparing subnet bits). + * + * When generating abbreviated keys for SortSupport, we pack as much as we can + * into a datum while ensuring that when comparing those keys as integers, + * these rules will be respected. Exact contents depend on IP family and datum + * size. + * + * IPv4 + * ---- + * + * 4 byte datums: + * + * Start with 1 bit for the IP family (IPv4 or IPv6; this bit is present in + * every case below) followed by all but 1 of the netmasked bits. + * + * +----------+---------------------+ + * | 1 bit IP | 31 bits network | (1 bit network + * | family | (truncated) | omitted) + * +----------+---------------------+ + * + * 8 byte datums: + * + * We have space to store all netmasked bits, followed by the netmask size, + * followed by 25 bits of the subnet (25 bits is usually more than enough in + * practice). cidr datums always have all-zero subnet bits. + * + * +----------+-----------------------+--------------+--------------------+ + * | 1 bit IP | 32 bits network | 6 bits | 25 bits subnet | + * | family | (full) | network size | (truncated) | + * +----------+-----------------------+--------------+--------------------+ + * + * IPv6 + * ---- + * + * 4 byte datums: + * + * +----------+---------------------+ + * | 1 bit IP | 31 bits network | (up to 97 bits + * | family | (truncated) | network omitted) + * +----------+---------------------+ + * + * 8 byte datums: + * + * +----------+---------------------------------+ + * | 1 bit IP | 63 bits network | (up to 65 bits + * | family | (truncated) | network omitted) + * +----------+---------------------------------+ + */ +static Datum +network_abbrev_convert(Datum original, SortSupport ssup) +{ + network_sortsupport_state *uss = ssup->ssup_extra; + inet *authoritative = DatumGetInetPP(original); + Datum res, + ipaddr_datum, + subnet_bitmask, + network; + int subnet_size; + + Assert(ip_family(authoritative) == PGSQL_AF_INET || + ip_family(authoritative) == PGSQL_AF_INET6); + + + /* + * Get an unsigned integer representation of the IP address by taking its + * first 4 or 8 bytes. Always take all 4 bytes of an IPv4 address. Take + * the first 8 bytes of an IPv6 address with an 8 byte datum and 4 bytes + * otherwise. + * + * We're consuming an array of char, so make sure to byteswap on little + * endian systems (an inet's IP array emulates big endian in that the + * first byte is always the most significant). + */ + if (ip_family(authoritative) == PGSQL_AF_INET) + { + uint32 ipaddr_datum32 = *((uint32 *) ip_addr(authoritative)); + + /* Must byteswap on little-endian machines */ +#ifndef WORDS_BIGENDIAN + ipaddr_datum32 = pg_bswap32(ipaddr_datum32); +#endif + + ipaddr_datum = ipaddr_datum32; + + /* + * Initialize result without setting ipfamily bit (a leading zero bit + * implies IPv4) + */ + res = (Datum) 0; + } + else + { + ipaddr_datum = *((Datum *) ip_addr(authoritative)); + + /* Must byteswap on little-endian machines */ + ipaddr_datum = DatumBigEndianToNative(ipaddr_datum); + + /* Initialize result with ipfamily (most significant) bit set */ + res = ((Datum) 1) << (SIZEOF_DATUM * BITS_PER_BYTE - 1); + } + + /* + * ipaddr_datum must be "split in two": high order bits go in "network" + * component of abbreviated key (often with zeroed bits at the end due to + * masking), while low order bits go in "subnet" component when there is + * space for one. This is accomplished by generating a temp datum subnet + * bitmask, which we may reuse later when generating the subnet bits + * themselves. (Note that subnet bits are only used with IPv4 datums on + * platforms where datum is 8 bytes.) + * + * The number of bits in subnet is used to generate a datum subnet + * bitmask. For example, with a /24 IPv4 datum there are 8 subnet bits (32 + * - 24), so the final subnet bitmask is '1111 1111'. + */ + subnet_size = ip_maxbits(authoritative) - ip_bits(authoritative); + Assert(subnet_size >= 0); + subnet_bitmask = (((Datum) 1) << subnet_size) - 1; + network = ipaddr_datum & ~subnet_bitmask; + +#if SIZEOF_DATUM == 8 + if (ip_family(authoritative) == PGSQL_AF_INET) + { + /* + * IPv4 with 8 byte datums: keep all 32 netmasked bits, netmask size, + * and most significant 25 subnet bits + */ + Datum netmask_size = (Datum) ip_bits(authoritative); + Datum subnet; + + /* Shift left 31 bits: 6 bits netmask size + 25 subnet bits */ + network <<= (ABBREV_BITS_INET4_NETMASK_SIZE + + ABBREV_BITS_INET4_SUBNET); + + /* Shift size to make room for subnet bits at the end */ + netmask_size <<= ABBREV_BITS_INET4_SUBNET; + + /* Extract subnet bits without shifting them */ + subnet = ipaddr_datum & subnet_bitmask; + + /* + * If we have more than 25 subnet bits, we can't fit everything. Shift + * subnet down to avoid clobbering bits that are only supposed to be + * used for netmask_size. + * + * Discarding the least significant subnet bits like this is correct + * because abbreviated comparisons that are resolved at the subnet + * level must have had equal subnet sizes in order to get that far. + */ + if (subnet_size > ABBREV_BITS_INET4_SUBNET) + subnet >>= subnet_size - ABBREV_BITS_INET4_SUBNET; + + /* + * Assemble the final abbreviated key without clobbering the ipfamily + * bit that must remain a zero. + */ + res |= network | netmask_size | subnet; + } + else +#endif + { + /* + * 4 byte datums, or IPv6 with 8 byte datums: Use as many of the + * netmasked bits as will fit in final abbreviated key. Avoid + * clobbering the ipfamily bit that was set earlier. + */ + res |= network >> 1; + } + + uss->input_count += 1; + + /* Hash abbreviated key */ + if (uss->estimating) + { + uint32 tmp; + +#if SIZEOF_DATUM == 8 + tmp = (uint32) res ^ (uint32) ((uint64) res >> 32); +#else /* SIZEOF_DATUM != 8 */ + tmp = (uint32) res; +#endif + + addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + } + + return res; +} + /* * Boolean ordering tests. */ diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index 5567b7ebad..5e705019b4 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -93,6 +93,9 @@ amproc => 'in_range(float4,float4,float8,bool,bool)' }, { amprocfamily => 'btree/network_ops', amproclefttype => 'inet', amprocrighttype => 'inet', amprocnum => '1', amproc => 'network_cmp' }, +{ amprocfamily => 'btree/network_ops', amproclefttype => 'inet', + amprocrighttype => 'inet', amprocnum => '2', + amproc => 'network_sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', amprocrighttype => 'int2', amprocnum => '1', amproc => 'btint2cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 0902dce5f1..58311d31e3 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3954,6 +3954,9 @@ { oid => '3551', proname => 'network_overlap', prorettype => 'bool', proargtypes => 'inet inet', prosrc => 'network_overlap' }, +{ oid => '8190', descr => 'sort support', + proname => 'network_sortsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'network_sortsupport' }, # inet/cidr functions { oid => '598', descr => 'abbreviated display of inet value', diff --git a/src/test/regress/expected/inet.out b/src/test/regress/expected/inet.out index 24202376f1..961808cde1 100644 --- a/src/test/regress/expected/inet.out +++ b/src/test/regress/expected/inet.out @@ -845,3 +845,107 @@ SELECT inet_merge(c, i) FROM INET_TBL WHERE inet_same_family(c, i); ::/24 (17 rows) +-- tests to specifically check some of the potentially sharp edges when it +-- comes to SortSupport implementation +SELECT i FROM (VALUES + ('0.0.0.0/0'::inet), + ('0.0.0.1/0'::inet), + ('192.168.1.4/0'::inet), + ('192.168.1.5/0'::inet), + ('255.0.0.0/0'::inet), + ('255.255.255.255/0'::inet), + ('0.0.0.0/1'::inet), + ('0.0.0.0/32'::inet), + ('0.0.0.1/1'::inet), + ('255.255.255.255/0'::inet), + ('255.255.255.255/1'::inet), + ('192.168.1.3/1'::inet), + ('192.168.1.1/5'::inet), + ('192.168.1.255/5'::inet), + ('192.168.1.1/6'::inet), + ('192.168.1.255/6'::inet), + ('192.168.1.1/23'::inet), + ('192.168.1.2/23'::inet), + ('192.168.1.3/23'::inet), + ('192.168.1.0/24'::inet), + ('192.168.1.0/25'::inet), + ('255.255.255.255/16'::inet), + ('255.255.255.255/31'::inet), + ('255.255.255.254/32'::inet), + ('255.255.255.255/32'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0000/0'::inet), + ('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/0'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0000/128'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0001/128'::inet), + ('::1:ffff:ffff:ffff:ffff/128'::inet), + ('::2:ffff:ffff:ffff:ffff/128'::inet), + ('127::1'::inet), + ('127::2'::inet), + ('8000:0000:0000:0000:0000:0000:0000:0000/1'::inet), + ('ffff:83e7:f118:57dc:6093:6d92:689d:58cf/70'::inet), + ('ffff:84b0:4775:536e:c3ed:7116:a6d6:34f0/44'::inet), + ('ffff:8566:f84:5867:47f1:7867:d2ba:8a1a/69'::inet), + ('ffff:8883:f028:7d2:4d68:d510:7d6b:ac43/73'::inet), + ('ffff:8ae8:7c14:65b3:196:8e4a:89ae:fb30/89'::inet), + ('ffff:8dd0:646:694c:7c16:7e35:6a26:171/104'::inet), + ('ffff:8eef:cbf:700:eda3:ae32:f4b4:318b/121'::inet), + ('ffff:90e7:e744:664:a93:8efe:1f25:7663/122'::inet), + ('ffff:9597:c69c:8b24:57a:8639:ec78:6026/111'::inet), + ('ffff:9e86:79ea:f16e:df31:8e4d:7783:532e/88'::inet), + ('ffff:a0c7:82d3:24de:f762:6e1f:316d:3fb2/23'::inet), + ('ffff:ffff:ffff:fffd::/128'::inet), + ('ffff:ffff:ffff:fffe::/128'::inet), + ('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/128'::inet) +) AS i ORDER BY i; + i +---------------------------------------------- + (0.0.0.0/0) + (0.0.0.1/0) + (192.168.1.4/0) + (192.168.1.5/0) + (255.0.0.0/0) + (255.255.255.255/0) + (255.255.255.255/0) + (0.0.0.0/1) + (0.0.0.1/1) + (0.0.0.0) + (192.168.1.3/1) + (255.255.255.255/1) + (192.168.1.1/5) + (192.168.1.255/5) + (192.168.1.1/6) + (192.168.1.255/6) + (192.168.1.1/23) + (192.168.1.2/23) + (192.168.1.3/23) + (192.168.1.0/24) + (192.168.1.0/25) + (255.255.255.255/16) + (255.255.255.255/31) + (255.255.255.254) + (255.255.255.255) + (::/0) + (ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/0) + (::) + (::1) + (::1:ffff:ffff:ffff:ffff) + (::2:ffff:ffff:ffff:ffff) + (127::1) + (127::2) + (8000::/1) + (ffff:83e7:f118:57dc:6093:6d92:689d:58cf/70) + (ffff:84b0:4775:536e:c3ed:7116:a6d6:34f0/44) + (ffff:8566:f84:5867:47f1:7867:d2ba:8a1a/69) + (ffff:8883:f028:7d2:4d68:d510:7d6b:ac43/73) + (ffff:8ae8:7c14:65b3:196:8e4a:89ae:fb30/89) + (ffff:8dd0:646:694c:7c16:7e35:6a26:171/104) + (ffff:8eef:cbf:700:eda3:ae32:f4b4:318b/121) + (ffff:90e7:e744:664:a93:8efe:1f25:7663/122) + (ffff:9597:c69c:8b24:57a:8639:ec78:6026/111) + (ffff:9e86:79ea:f16e:df31:8e4d:7783:532e/88) + (ffff:a0c7:82d3:24de:f762:6e1f:316d:3fb2/23) + (ffff:ffff:ffff:fffd::) + (ffff:ffff:ffff:fffe::) + (ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff) +(48 rows) + diff --git a/src/test/regress/sql/inet.sql b/src/test/regress/sql/inet.sql index bbfa9d37df..1bd6219d8f 100644 --- a/src/test/regress/sql/inet.sql +++ b/src/test/regress/sql/inet.sql @@ -156,3 +156,56 @@ INSERT INTO INET_TBL (c, i) VALUES ('10', '10::/8'); SELECT inet_merge(c, i) FROM INET_TBL; -- fix it by inet_same_family() condition SELECT inet_merge(c, i) FROM INET_TBL WHERE inet_same_family(c, i); + +-- tests to specifically check some of the potentially sharp edges when it +-- comes to SortSupport implementation +SELECT i FROM (VALUES + ('0.0.0.0/0'::inet), + ('0.0.0.1/0'::inet), + ('192.168.1.4/0'::inet), + ('192.168.1.5/0'::inet), + ('255.0.0.0/0'::inet), + ('255.255.255.255/0'::inet), + ('0.0.0.0/1'::inet), + ('0.0.0.0/32'::inet), + ('0.0.0.1/1'::inet), + ('255.255.255.255/0'::inet), + ('255.255.255.255/1'::inet), + ('192.168.1.3/1'::inet), + ('192.168.1.1/5'::inet), + ('192.168.1.255/5'::inet), + ('192.168.1.1/6'::inet), + ('192.168.1.255/6'::inet), + ('192.168.1.1/23'::inet), + ('192.168.1.2/23'::inet), + ('192.168.1.3/23'::inet), + ('192.168.1.0/24'::inet), + ('192.168.1.0/25'::inet), + ('255.255.255.255/16'::inet), + ('255.255.255.255/31'::inet), + ('255.255.255.254/32'::inet), + ('255.255.255.255/32'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0000/0'::inet), + ('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/0'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0000/128'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0001/128'::inet), + ('::1:ffff:ffff:ffff:ffff/128'::inet), + ('::2:ffff:ffff:ffff:ffff/128'::inet), + ('127::1'::inet), + ('127::2'::inet), + ('8000:0000:0000:0000:0000:0000:0000:0000/1'::inet), + ('ffff:83e7:f118:57dc:6093:6d92:689d:58cf/70'::inet), + ('ffff:84b0:4775:536e:c3ed:7116:a6d6:34f0/44'::inet), + ('ffff:8566:f84:5867:47f1:7867:d2ba:8a1a/69'::inet), + ('ffff:8883:f028:7d2:4d68:d510:7d6b:ac43/73'::inet), + ('ffff:8ae8:7c14:65b3:196:8e4a:89ae:fb30/89'::inet), + ('ffff:8dd0:646:694c:7c16:7e35:6a26:171/104'::inet), + ('ffff:8eef:cbf:700:eda3:ae32:f4b4:318b/121'::inet), + ('ffff:90e7:e744:664:a93:8efe:1f25:7663/122'::inet), + ('ffff:9597:c69c:8b24:57a:8639:ec78:6026/111'::inet), + ('ffff:9e86:79ea:f16e:df31:8e4d:7783:532e/88'::inet), + ('ffff:a0c7:82d3:24de:f762:6e1f:316d:3fb2/23'::inet), + ('ffff:ffff:ffff:fffd::/128'::inet), + ('ffff:ffff:ffff:fffe::/128'::inet), + ('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/128'::inet) +) AS i ORDER BY i; -- 2.16.2