From 47eea6b280c24b3908a696fddb048e5e5cacd2f5 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 26 Jul 2019 09:13:31 -0700 Subject: [PATCH v3] Add sort support for inet/cidr opfamily. Add sort support for inet and cidr, including support for abbreviated keys. Testing has shown that this reduces the time taken to sort medium to large inet/cidr inputs by ~50-60% in realistic cases. Author: Brandur Leach Reviewed-By: Peter Geoghegan, Edmund Horner --- src/backend/utils/adt/network.c | 328 +++++++++++++++++++++++++++++ src/include/catalog/pg_amproc.dat | 3 + src/include/catalog/pg_proc.dat | 3 + src/test/regress/expected/inet.out | 104 +++++++++ src/test/regress/sql/inet.sql | 53 +++++ 5 files changed, 491 insertions(+) diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index 5e980cf23f..aa19fabeb3 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -16,6 +16,7 @@ #include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" #include "common/ip.h" +#include "lib/hyperloglog.h" #include "libpq/libpq-be.h" #include "libpq/pqformat.h" #include "miscadmin.h" @@ -24,12 +25,36 @@ #include "nodes/supportnodes.h" #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/guc.h" #include "utils/hashutils.h" #include "utils/inet.h" #include "utils/lsyscache.h" +#include "utils/sortsupport.h" +/* + * An IPv4 netmask size is a value in the range of 0 - 32, which is + * represented with 6 bits in inet/cidr abbreviated keys where possible. + * + * An IPv4 inet/cidr abbreviated key can use up to 25 bits for subnet + * component. + */ +#define ABBREV_BITS_INET4_NETMASK_SIZE 6 +#define ABBREV_BITS_INET4_SUBNET 25 + +/* sortsupport for inet/cidr */ +typedef struct +{ + int64 input_count; /* number of non-null values seen */ + bool estimating; /* true if estimating cardinality */ + + hyperLogLogState abbr_card; /* cardinality estimator */ +} network_sortsupport_state; static int32 network_cmp_internal(inet *a1, inet *a2); +static int network_fast_cmp(Datum x, Datum y, SortSupport ssup); +static int network_cmp_abbrev(Datum x, Datum y, SortSupport ssup); +static bool network_abbrev_abort(int memtupcount, SortSupport ssup); +static Datum network_abbrev_convert(Datum original, SortSupport ssup); static List *match_network_function(Node *leftop, Node *rightop, int indexarg, @@ -405,6 +430,309 @@ network_cmp(PG_FUNCTION_ARGS) PG_RETURN_INT32(network_cmp_internal(a1, a2)); } +/* + * SortSupport strategy routine + */ +Datum +network_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = network_fast_cmp; + ssup->ssup_extra = NULL; + + if (ssup->abbreviate) + { + network_sortsupport_state *uss; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + uss = palloc(sizeof(network_sortsupport_state)); + uss->input_count = 0; + uss->estimating = true; + initHyperLogLog(&uss->abbr_card, 10); + + ssup->ssup_extra = uss; + + ssup->comparator = network_cmp_abbrev; + ssup->abbrev_converter = network_abbrev_convert; + ssup->abbrev_abort = network_abbrev_abort; + ssup->abbrev_full_comparator = network_fast_cmp; + + MemoryContextSwitchTo(oldcontext); + } + + PG_RETURN_VOID(); +} + +/* + * SortSupport comparison func + */ +static int +network_fast_cmp(Datum x, Datum y, SortSupport ssup) +{ + inet *arg1 = DatumGetInetPP(x); + inet *arg2 = DatumGetInetPP(y); + + return network_cmp_internal(arg1, arg2); +} + +/* + * Abbreviated key comparison func + */ +static int +network_cmp_abbrev(Datum x, Datum y, SortSupport ssup) +{ + if (x > y) + return 1; + else if (x == y) + return 0; + else + return -1; +} + +/* + * Callback for estimating effectiveness of abbreviated key optimization. + * + * We pay no attention to the cardinality of the non-abbreviated data, because + * there is no equality fast-path within authoritative inet comparator. + */ +static bool +network_abbrev_abort(int memtupcount, SortSupport ssup) +{ + network_sortsupport_state *uss = ssup->ssup_extra; + double abbr_card; + + if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating) + return false; + + abbr_card = estimateHyperLogLog(&uss->abbr_card); + + /* + * If we have >100k distinct values, then even if we were sorting many + * billion rows we'd likely still break even, and the penalty of undoing + * that many rows of abbrevs would probably not be worth it. At this point + * we stop counting because we know that we're now fully committed. + */ + if (abbr_card > 100000.0) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "network_abbrev: estimation ends at cardinality %f" + " after " INT64_FORMAT " values (%d rows)", + abbr_card, uss->input_count, memtupcount); +#endif + uss->estimating = false; + return false; + } + + /* + * Target minimum cardinality is 1 per ~2k of non-null inputs. 0.5 row + * fudge factor allows us to abort earlier on genuinely pathological data + * where we've had exactly one abbreviated value in the first 2k + * (non-null) rows. + */ + if (abbr_card < uss->input_count / 2000.0 + 0.5) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "network_abbrev: aborting abbreviation at cardinality %f" + " below threshold %f after " INT64_FORMAT " values (%d rows)", + abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count, + memtupcount); +#endif + return true; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "network_abbrev: cardinality %f after " INT64_FORMAT + " values (%d rows)", abbr_card, uss->input_count, memtupcount); +#endif + + return false; +} + +/* + * SortSupport conversion routine. Converts original inet/cidr representation + * to abbreviated key representation that works with simple 3-way unsigned int + * comparisons. The network_cmp_internal() rules for sorting inet/cidr datums + * are followed by abbreviated comparisons by an encoding scheme that + * conditions keys through careful use of padding. + * + * IPv4 + * ---- + * + * 4 byte datums: + * + * Start with 1 bit for the IP family (IPv4 or IPv6; this bit is present in + * every case below) followed by all but 1 of the netmasked bits. + * + * +----------+---------------------+ + * | 1 bit IP | 31 bits network | (1 bit network + * | family | (truncated) | omitted) + * +----------+---------------------+ + * + * 8 byte datums: + * + * We have space to store all netmasked bits, followed by the netmask size, + * followed by 25 bits of the subnet (25 bits is usually more than enough in + * practice). cidr datums always have all-zero subnet bits. + * + * +----------+-----------------------+--------------+--------------------+ + * | 1 bit IP | 32 bits network | 6 bits | 25 bits subnet | + * | family | (full) | network size | (truncated) | + * +----------+-----------------------+--------------+--------------------+ + * + * IPv6 + * ---- + * + * 4 byte datums: + * + * +----------+---------------------+ + * | 1 bit IP | 31 bits network | (up to 97 bits + * | family | (truncated) | network omitted) + * +----------+---------------------+ + * + * 8 byte datums: + * + * +----------+---------------------------------+ + * | 1 bit IP | 63 bits network | (up to 65 bits + * | family | (truncated) | network omitted) + * +----------+---------------------------------+ + */ +static Datum +network_abbrev_convert(Datum original, SortSupport ssup) +{ + network_sortsupport_state *uss = ssup->ssup_extra; + inet *authoritative = DatumGetInetPP(original); + Datum res, + ipaddr_datum, + subnet_bitmask, + network; + int subnet_size; + + Assert(ip_family(authoritative) == PGSQL_AF_INET || + ip_family(authoritative) == PGSQL_AF_INET6); + + /* + * Get an unsigned integer representation of datum's ippaddr component. + * This may just be a prefix consisting of the `SIZEOF_DATUM` most + * significant bytes + */ + if (ip_family(authoritative) == PGSQL_AF_INET) + { + uint32 ipaddr_datum32 = *((uint32 *) ip_addr(authoritative)); + + /* Must byteswap on little-endian machines */ +#ifndef WORDS_BIGENDIAN + ipaddr_datum = pg_bswap32(ipaddr_datum32); +#endif + + /* Initialize result without setting ipfamily bit */ + res = (Datum) 0; + } + else + { + ipaddr_datum = *((Datum *) ip_addr(authoritative)); + + /* Must byteswap on little-endian machines */ + ipaddr_datum = DatumBigEndianToNative(ipaddr_datum); + + /* Initialize result with ipfamily (most significant) bit set */ + res = ((Datum) 1) << (SIZEOF_DATUM * BITS_PER_BYTE - 1); + } + + /* + * ipaddr_datum must be "split in two": high order bits go in "network" + * component of abbreviated key (often with zeroed bits at the end due to + * masking), while low order bits go in "subnet" component when there is + * space for one. This is accomplished by generating a temp datum subnet + * bitmask, which we may reuse later when generating the subnet bits + * themselves. (Note that subnet bits are only used with IPv4 datums on + * platforms where datum is 8 bytes.) + * + * The number of bits in subnet is used to generate datum subnet bitmask. + * For example, with a /24 IPv4 datum there are 8 subnet bits (32 - 24), + * so final subnet bitmask is '1111 1111'. + */ + subnet_size = ip_maxbits(authoritative) - ip_bits(authoritative); + Assert(subnet_size >= 0); + subnet_bitmask = (((Datum) 1) << subnet_size) - 1; + network = ipaddr_datum & ~subnet_bitmask; + +#if SIZEOF_DATUM == 8 + if (ip_family(authoritative) == PGSQL_AF_INET) + { + /* + * IPv4 with 8 byte datums: keep all 32 netmasked bits, netmask size, + * and most significant 25 subnet bits + */ + Datum netmask_size = (Datum) ip_bits(authoritative); + Datum subnet; + + /* Shift left 31 bits: 6 bits netmask size + 25 subnet bits */ + network <<= (ABBREV_BITS_INET4_NETMASK_SIZE + + ABBREV_BITS_INET4_SUBNET); + + /* Shift size to make room for subnet bits at the end */ + netmask_size <<= ABBREV_BITS_INET4_SUBNET; + /* Don't shift subnet bits (unless we discard below) */ + subnet = ipaddr_datum & subnet_bitmask; + + /* + * If we have more than 25 subnet bits, we can't fit everything. + * Shift subnet down to avoid clobbering bits that are only supposed + * to be used for netmask_size. + * + * Discarding the least significant subnet bits like this is correct + * because abbreviated comparisons that are resolved at the subnet + * level must have had equal subnet sizes in order to get that far. + * Comparisons consistently compare like with like. + */ + if (subnet_size > ABBREV_BITS_INET4_SUBNET) + subnet >>= subnet_size - ABBREV_BITS_INET4_SUBNET; + + /* + * Assemble the final abbreviated key without clobbering the ipfamily + * bit that must remain unset + */ + res |= network | netmask_size | subnet; + } + else +#endif + { + /* + * 4 byte datums, or IPv6 with 8 byte datums: Use as many of the + * netmasked bits as will fit in final abbreviated key. Avoid + * clobbering the ipfamily bit that was set earlier. + */ + res |= network >> 1; + } + + uss->input_count += 1; + + /* Hash abbreviated key */ + if (uss->estimating) + { + uint32 tmp; + +#if SIZEOF_DATUM == 8 + tmp = (uint32) res ^ (uint32) ((uint64) res >> 32); +#else /* SIZEOF_DATUM != 8 */ + tmp = (uint32) res; +#endif + + addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + } + + return res; +} + /* * Boolean ordering tests. */ diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index 5567b7ebad..5e705019b4 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -93,6 +93,9 @@ amproc => 'in_range(float4,float4,float8,bool,bool)' }, { amprocfamily => 'btree/network_ops', amproclefttype => 'inet', amprocrighttype => 'inet', amprocnum => '1', amproc => 'network_cmp' }, +{ amprocfamily => 'btree/network_ops', amproclefttype => 'inet', + amprocrighttype => 'inet', amprocnum => '2', + amproc => 'network_sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', amprocrighttype => 'int2', amprocnum => '1', amproc => 'btint2cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 0902dce5f1..58311d31e3 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3954,6 +3954,9 @@ { oid => '3551', proname => 'network_overlap', prorettype => 'bool', proargtypes => 'inet inet', prosrc => 'network_overlap' }, +{ oid => '8190', descr => 'sort support', + proname => 'network_sortsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'network_sortsupport' }, # inet/cidr functions { oid => '598', descr => 'abbreviated display of inet value', diff --git a/src/test/regress/expected/inet.out b/src/test/regress/expected/inet.out index 24202376f1..961808cde1 100644 --- a/src/test/regress/expected/inet.out +++ b/src/test/regress/expected/inet.out @@ -845,3 +845,107 @@ SELECT inet_merge(c, i) FROM INET_TBL WHERE inet_same_family(c, i); ::/24 (17 rows) +-- tests to specifically check some of the potentially sharp edges when it +-- comes to SortSupport implementation +SELECT i FROM (VALUES + ('0.0.0.0/0'::inet), + ('0.0.0.1/0'::inet), + ('192.168.1.4/0'::inet), + ('192.168.1.5/0'::inet), + ('255.0.0.0/0'::inet), + ('255.255.255.255/0'::inet), + ('0.0.0.0/1'::inet), + ('0.0.0.0/32'::inet), + ('0.0.0.1/1'::inet), + ('255.255.255.255/0'::inet), + ('255.255.255.255/1'::inet), + ('192.168.1.3/1'::inet), + ('192.168.1.1/5'::inet), + ('192.168.1.255/5'::inet), + ('192.168.1.1/6'::inet), + ('192.168.1.255/6'::inet), + ('192.168.1.1/23'::inet), + ('192.168.1.2/23'::inet), + ('192.168.1.3/23'::inet), + ('192.168.1.0/24'::inet), + ('192.168.1.0/25'::inet), + ('255.255.255.255/16'::inet), + ('255.255.255.255/31'::inet), + ('255.255.255.254/32'::inet), + ('255.255.255.255/32'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0000/0'::inet), + ('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/0'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0000/128'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0001/128'::inet), + ('::1:ffff:ffff:ffff:ffff/128'::inet), + ('::2:ffff:ffff:ffff:ffff/128'::inet), + ('127::1'::inet), + ('127::2'::inet), + ('8000:0000:0000:0000:0000:0000:0000:0000/1'::inet), + ('ffff:83e7:f118:57dc:6093:6d92:689d:58cf/70'::inet), + ('ffff:84b0:4775:536e:c3ed:7116:a6d6:34f0/44'::inet), + ('ffff:8566:f84:5867:47f1:7867:d2ba:8a1a/69'::inet), + ('ffff:8883:f028:7d2:4d68:d510:7d6b:ac43/73'::inet), + ('ffff:8ae8:7c14:65b3:196:8e4a:89ae:fb30/89'::inet), + ('ffff:8dd0:646:694c:7c16:7e35:6a26:171/104'::inet), + ('ffff:8eef:cbf:700:eda3:ae32:f4b4:318b/121'::inet), + ('ffff:90e7:e744:664:a93:8efe:1f25:7663/122'::inet), + ('ffff:9597:c69c:8b24:57a:8639:ec78:6026/111'::inet), + ('ffff:9e86:79ea:f16e:df31:8e4d:7783:532e/88'::inet), + ('ffff:a0c7:82d3:24de:f762:6e1f:316d:3fb2/23'::inet), + ('ffff:ffff:ffff:fffd::/128'::inet), + ('ffff:ffff:ffff:fffe::/128'::inet), + ('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/128'::inet) +) AS i ORDER BY i; + i +---------------------------------------------- + (0.0.0.0/0) + (0.0.0.1/0) + (192.168.1.4/0) + (192.168.1.5/0) + (255.0.0.0/0) + (255.255.255.255/0) + (255.255.255.255/0) + (0.0.0.0/1) + (0.0.0.1/1) + (0.0.0.0) + (192.168.1.3/1) + (255.255.255.255/1) + (192.168.1.1/5) + (192.168.1.255/5) + (192.168.1.1/6) + (192.168.1.255/6) + (192.168.1.1/23) + (192.168.1.2/23) + (192.168.1.3/23) + (192.168.1.0/24) + (192.168.1.0/25) + (255.255.255.255/16) + (255.255.255.255/31) + (255.255.255.254) + (255.255.255.255) + (::/0) + (ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/0) + (::) + (::1) + (::1:ffff:ffff:ffff:ffff) + (::2:ffff:ffff:ffff:ffff) + (127::1) + (127::2) + (8000::/1) + (ffff:83e7:f118:57dc:6093:6d92:689d:58cf/70) + (ffff:84b0:4775:536e:c3ed:7116:a6d6:34f0/44) + (ffff:8566:f84:5867:47f1:7867:d2ba:8a1a/69) + (ffff:8883:f028:7d2:4d68:d510:7d6b:ac43/73) + (ffff:8ae8:7c14:65b3:196:8e4a:89ae:fb30/89) + (ffff:8dd0:646:694c:7c16:7e35:6a26:171/104) + (ffff:8eef:cbf:700:eda3:ae32:f4b4:318b/121) + (ffff:90e7:e744:664:a93:8efe:1f25:7663/122) + (ffff:9597:c69c:8b24:57a:8639:ec78:6026/111) + (ffff:9e86:79ea:f16e:df31:8e4d:7783:532e/88) + (ffff:a0c7:82d3:24de:f762:6e1f:316d:3fb2/23) + (ffff:ffff:ffff:fffd::) + (ffff:ffff:ffff:fffe::) + (ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff) +(48 rows) + diff --git a/src/test/regress/sql/inet.sql b/src/test/regress/sql/inet.sql index bbfa9d37df..1bd6219d8f 100644 --- a/src/test/regress/sql/inet.sql +++ b/src/test/regress/sql/inet.sql @@ -156,3 +156,56 @@ INSERT INTO INET_TBL (c, i) VALUES ('10', '10::/8'); SELECT inet_merge(c, i) FROM INET_TBL; -- fix it by inet_same_family() condition SELECT inet_merge(c, i) FROM INET_TBL WHERE inet_same_family(c, i); + +-- tests to specifically check some of the potentially sharp edges when it +-- comes to SortSupport implementation +SELECT i FROM (VALUES + ('0.0.0.0/0'::inet), + ('0.0.0.1/0'::inet), + ('192.168.1.4/0'::inet), + ('192.168.1.5/0'::inet), + ('255.0.0.0/0'::inet), + ('255.255.255.255/0'::inet), + ('0.0.0.0/1'::inet), + ('0.0.0.0/32'::inet), + ('0.0.0.1/1'::inet), + ('255.255.255.255/0'::inet), + ('255.255.255.255/1'::inet), + ('192.168.1.3/1'::inet), + ('192.168.1.1/5'::inet), + ('192.168.1.255/5'::inet), + ('192.168.1.1/6'::inet), + ('192.168.1.255/6'::inet), + ('192.168.1.1/23'::inet), + ('192.168.1.2/23'::inet), + ('192.168.1.3/23'::inet), + ('192.168.1.0/24'::inet), + ('192.168.1.0/25'::inet), + ('255.255.255.255/16'::inet), + ('255.255.255.255/31'::inet), + ('255.255.255.254/32'::inet), + ('255.255.255.255/32'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0000/0'::inet), + ('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/0'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0000/128'::inet), + ('0000:0000:0000:0000:0000:0000:0000:0001/128'::inet), + ('::1:ffff:ffff:ffff:ffff/128'::inet), + ('::2:ffff:ffff:ffff:ffff/128'::inet), + ('127::1'::inet), + ('127::2'::inet), + ('8000:0000:0000:0000:0000:0000:0000:0000/1'::inet), + ('ffff:83e7:f118:57dc:6093:6d92:689d:58cf/70'::inet), + ('ffff:84b0:4775:536e:c3ed:7116:a6d6:34f0/44'::inet), + ('ffff:8566:f84:5867:47f1:7867:d2ba:8a1a/69'::inet), + ('ffff:8883:f028:7d2:4d68:d510:7d6b:ac43/73'::inet), + ('ffff:8ae8:7c14:65b3:196:8e4a:89ae:fb30/89'::inet), + ('ffff:8dd0:646:694c:7c16:7e35:6a26:171/104'::inet), + ('ffff:8eef:cbf:700:eda3:ae32:f4b4:318b/121'::inet), + ('ffff:90e7:e744:664:a93:8efe:1f25:7663/122'::inet), + ('ffff:9597:c69c:8b24:57a:8639:ec78:6026/111'::inet), + ('ffff:9e86:79ea:f16e:df31:8e4d:7783:532e/88'::inet), + ('ffff:a0c7:82d3:24de:f762:6e1f:316d:3fb2/23'::inet), + ('ffff:ffff:ffff:fffd::/128'::inet), + ('ffff:ffff:ffff:fffe::/128'::inet), + ('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/128'::inet) +) AS i ORDER BY i; -- 2.17.1