diff --git a/Makefile b/Makefile index 6d1ccab..53d363e 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,8 @@ MODULES = hashset CFLAGS=`pg_config --includedir-server` -REGRESS = basic cast conversions incremental parallel_query value_count_api trimmed_aggregates +REGRESS = prelude basic random table + REGRESS_OPTS = --inputdir=test PG_CONFIG = pg_config diff --git a/README.md b/README.md index 5805613..ad7f12b 100644 --- a/README.md +++ b/README.md @@ -6,22 +6,92 @@ providing a collection of integer items with fast lookup. ## Usage -FIXME +After installing the extension, you can use the `hashset` data type and +associated functions within your PostgreSQL queries. + +To demonstrate the usage, let's consider a hypothetical table `users` which has +a `user_id` and a `user_likes` of type `hashset`. + +Firstly, let's create the table: + +```sql +CREATE TABLE users( + user_id int PRIMARY KEY, + user_likes hashset DEFAULT hashset_init(2) +); +``` +In the above statement, the `hashset_init(2)` initializes a hashset with initial +capacity for 2 elements. The hashset will automatically resize itself when more +elements are added beyond this initial capacity. + +Now, we can perform operations on this table. Here are some examples: + +```sql +-- Insert a new user with id 1. The user_likes will automatically be initialized +-- as an empty hashset +INSERT INTO users (user_id) VALUES (1); + +-- Add elements (likes) for a user +UPDATE users SET user_likes = hashset_add(user_likes, 101) WHERE user_id = 1; +UPDATE users SET user_likes = hashset_add(user_likes, 202) WHERE user_id = 1; + +-- Check if a user likes a particular item +SELECT hashset_contains(user_likes, 101) FROM users WHERE user_id = 1; -- true + +-- Count the number of likes a user has +SELECT hashset_count(user_likes) FROM users WHERE user_id = 1; -- 2 +``` + +You can also use the aggregate functions to perform operations on multiple rows. +For instance, you can add an integer to a `hashset`. ## Data types -FIXME +- **hashset**: This data type represents a set of integers. Internally, it uses +a combination of a bitmap and a value array to store the elements in a set. It's +a variable-length type. -## Operators +## Functions -FIXME +The extension provides the following functions: +### hashset_add(hashset, int) -> hashset +Adds an integer to a `hashset`. -#### Casts +### hashset_contains(hashset, int) -> boolean +Checks if an integer is contained in a `hashset`. -FIXME +### hashset_count(hashset) -> bigint +Returns the number of elements in a `hashset`. + +### hashset_merge(hashset, hashset) -> hashset +Merges two `hashset`s into a single `hashset`. + +### hashset_to_array(hashset) -> integer[] +Converts a `hashset` to an integer array. + +### hashset_init(int) -> hashset +Initializes an empty `hashset` with a specified initial capacity for maximum +elements. The argument determines the maximum number of elements the `hashset` +can hold before it needs to resize. + +## Aggregate Functions + +### hashset(integer) -> hashset +Generates a `hashset` from a series of integers, keeping only the unique ones. + +### hashset(hashset) -> hashset +Merges multiple `hashset`s into a single `hashset`, preserving unique elements. + + +## Installation + +To install the extension, run `make install` in the project root. Then, in your +PostgreSQL connection, execute `CREATE EXTENSION hashset;`. + +This extension requires PostgreSQL version ?.? or later. ## License diff --git a/hashset.c b/hashset.c index 33b2133..d60ccc8 100644 --- a/hashset.c +++ b/hashset.c @@ -73,7 +73,6 @@ Datum hashset_agg_combine(PG_FUNCTION_ARGS); Datum hashset_to_array(PG_FUNCTION_ARGS); -/* allocate hashset with enough space for a requested number of centroids */ static hashset_t * hashset_allocate(int maxelements) { @@ -85,7 +84,6 @@ hashset_allocate(int maxelements) len += (maxelements + 7) / 8; len += maxelements * sizeof(int32); - /* we pre-allocate the array for all centroids and also the buffer for incoming data */ ptr = palloc0(len); SET_VARSIZE(ptr, len); @@ -95,7 +93,6 @@ hashset_allocate(int maxelements) set->maxelements = maxelements; set->nelements = 0; - /* new tdigest are automatically storing mean */ set->flags |= 0; return set; @@ -104,30 +101,109 @@ hashset_allocate(int maxelements) Datum hashset_in(PG_FUNCTION_ARGS) { -// int i, r; -// char *str = PG_GETARG_CSTRING(0); -// hashset_t *set = NULL; + char *str = PG_GETARG_CSTRING(0); + char *endptr; + int32 len = strlen(str); + hashset_t *set; - PG_RETURN_NULL(); + /* Check the opening and closing braces */ + if (str[0] != '{' || str[len - 1] != '}') + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for hashset: \"%s\"", str), + errdetail("Hashset representation must start with \"{\" and end with \"}\"."))); + } + + /* Start parsing from the first number (after the opening brace) */ + str++; + + /* Initial size based on input length (arbitrary, could be optimized) */ + set = hashset_allocate(len/2); + + while (true) + { + int32 value = strtol(str, &endptr, 10); + + /* Add the value to the hashset, resize if needed */ + if (set->nelements >= set->maxelements) + { + set = hashset_resize(set); + } + set = hashset_add_element(set, value); + + /* Error handling for strtol */ + if (endptr == str) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for integer: \"%s\"", str))); + } + else if (*endptr == ',') + { + str = endptr + 1; /* Move to the next number */ + } + else if (*endptr == '}') + { + break; /* End of the hashset */ + } + } + + PG_RETURN_POINTER(set); } Datum hashset_out(PG_FUNCTION_ARGS) { - //int i; - //tdigest_t *digest = (tdigest_t *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); - StringInfoData str; + hashset_t *set = (hashset_t *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + char *bitmap; + int32 *values; + int i; + StringInfoData str; + /* Calculate the pointer to the bitmap and values array */ + bitmap = set->data; + values = (int32 *) (set->data + (set->maxelements + 7) / 8); + + /* Initialize the StringInfo buffer */ initStringInfo(&str); + /* Append the opening brace for the output hashset string */ + appendStringInfoChar(&str, '{'); + + /* Loop through the elements and append them to the string */ + for (i = 0; i < set->maxelements; i++) + { + int byte = i / 8; + int bit = i % 8; + + /* Check if the bit in the bitmap is set */ + if (bitmap[byte] & (0x01 << bit)) + { + /* Append the value */ + if (str.len > 1) + appendStringInfoChar(&str, ','); + appendStringInfo(&str, "%d", values[i]); + } + } + + /* Append the closing brace for the output hashset string */ + appendStringInfoChar(&str, '}'); + + /* Return the resulting string */ PG_RETURN_CSTRING(str.data); } Datum hashset_recv(PG_FUNCTION_ARGS) { - //StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); - hashset_t *set= NULL; + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + hashset_t *set; + + set = (hashset_t *) palloc0(sizeof(hashset_t)); + set->flags = pq_getmsgint(buf, 4); + set->maxelements = pq_getmsgint64(buf); + set->nelements = pq_getmsgint(buf, 4); PG_RETURN_POINTER(set); } @@ -148,22 +224,6 @@ hashset_send(PG_FUNCTION_ARGS) } -/* - * tdigest_to_array - * Transform the tdigest into an array of double values. - * - * The whole digest is stored in a single "double precision" array, which - * may be a bit confusing and perhaps fragile if more fields need to be - * added in the future. The initial elements are flags, count (number of - * items added to the digest), compression (determines the limit on number - * of centroids) and current number of centroids. Follows stream of values - * encoding the centroids in pairs of (mean, count). - * - * We make sure to always print mean, even for tdigests in the older format - * storing sum for centroids. Otherwise the "mean" key would be confusing. - * But we don't call tdigest_update_format, and instead we simply update the - * flags and convert the sum/mean values. - */ Datum hashset_to_array(PG_FUNCTION_ARGS) { @@ -182,7 +242,7 @@ hashset_to_array(PG_FUNCTION_ARGS) set = (hashset_t *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); sbitmap = set->data; - svalues = (int32 *) (set->data + set->maxelements / 8); + svalues = (int32 *) (set->data + (set->maxelements + 7) / 8); /* number of values to store in the array */ nvalues = set->nelements; @@ -231,13 +291,14 @@ int32_to_array(FunctionCallInfo fcinfo, int32 *d, int len) static hashset_t * hashset_resize(hashset_t * set) { - int i; - hashset_t *new = hashset_allocate(set->maxelements * 2); - char *bitmap; - int32 *values; + int i; + hashset_t *new = hashset_allocate(set->maxelements * 2); + char *bitmap; + int32 *values; + /* Calculate the pointer to the bitmap and values array */ bitmap = set->data; - values = (int32 *) (set->data + set->maxelements / 8); + values = (int32 *) (set->data + (set->maxelements + 7) / 8); for (i = 0; i < set->maxelements; i++) { @@ -266,7 +327,7 @@ hashset_add_element(hashset_t *set, int32 value) hash = ((uint32) value * 7691 + 4201) % set->maxelements; bitmap = set->data; - values = (int32 *) (set->data + set->maxelements / 8); + values = (int32 *) (set->data + (set->maxelements + 7) / 8); while (true) { @@ -308,7 +369,7 @@ hashset_contains_element(hashset_t *set, int32 value) hash = ((uint32) value * 7691 + 4201) % set->maxelements; bitmap = set->data; - values = (int32 *) (set->data + set->maxelements / 8); + values = (int32 *) (set->data + (set->maxelements + 7) / 8); while (true) { @@ -347,7 +408,10 @@ hashset_add(PG_FUNCTION_ARGS) if (PG_ARGISNULL(0)) set = hashset_allocate(64); else - set = (hashset_t *) PG_GETARG_POINTER(0); + { + /* make sure we are working with a non-toasted and non-shared copy of the input */ + set = (hashset_t *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)); + } set = hashset_add_element(set, PG_GETARG_INT32(1)); @@ -377,7 +441,7 @@ hashset_merge(PG_FUNCTION_ARGS) setb = PG_GETARG_HASHSET(1); bitmap = setb->data; - values = (int32 *) (setb->data + setb->maxelements / 8); + values = (int32 *) (setb->data + (setb->maxelements + 7) / 8); for (i = 0; i < setb->maxelements; i++) { @@ -439,7 +503,7 @@ hashset_agg_add(PG_FUNCTION_ARGS) /* * We want to skip NULL values altogether - we return either the existing - * t-digest (if it already exists) or NULL. + * hashset (if it already exists) or NULL. */ if (PG_ARGISNULL(1)) { @@ -450,7 +514,7 @@ hashset_agg_add(PG_FUNCTION_ARGS) PG_RETURN_DATUM(PG_GETARG_DATUM(0)); } - /* if there's no digest allocated, create it now */ + /* if there's no hashset allocated, create it now */ if (PG_ARGISNULL(0)) { oldcontext = MemoryContextSwitchTo(aggcontext); @@ -481,7 +545,7 @@ hashset_agg_add_set(PG_FUNCTION_ARGS) /* * We want to skip NULL values altogether - we return either the existing - * t-digest (if it already exists) or NULL. + * hashset (if it already exists) or NULL. */ if (PG_ARGISNULL(1)) { @@ -492,7 +556,7 @@ hashset_agg_add_set(PG_FUNCTION_ARGS) PG_RETURN_DATUM(PG_GETARG_DATUM(0)); } - /* if there's no digest allocated, create it now */ + /* if there's no hashset allocated, create it now */ if (PG_ARGISNULL(0)) { oldcontext = MemoryContextSwitchTo(aggcontext); @@ -513,7 +577,7 @@ hashset_agg_add_set(PG_FUNCTION_ARGS) value = PG_GETARG_HASHSET(1); bitmap = value->data; - values = (int32 *) (value->data + value->maxelements / 8); + values = (int32 *) (value->data + (value->maxelements + 7) / 8); for (i = 0; i < value->maxelements; i++) { @@ -558,7 +622,7 @@ hashset_agg_combine(PG_FUNCTION_ARGS) int32 *values; if (!AggCheckCallContext(fcinfo, &aggcontext)) - elog(ERROR, "tdigest_combine called in non-aggregate context"); + elog(ERROR, "hashset_agg_combine called in non-aggregate context"); /* if no "merged" state yet, try creating it */ if (PG_ARGISNULL(0)) @@ -570,7 +634,7 @@ hashset_agg_combine(PG_FUNCTION_ARGS) /* the second argument is not NULL, so copy it */ src = (hashset_t *) PG_GETARG_POINTER(1); - /* copy the digest into the right long-lived memory context */ + /* copy the hashset into the right long-lived memory context */ oldcontext = MemoryContextSwitchTo(aggcontext); src = hashset_copy(src); MemoryContextSwitchTo(oldcontext); @@ -590,7 +654,7 @@ hashset_agg_combine(PG_FUNCTION_ARGS) dst = (hashset_t *) PG_GETARG_POINTER(0); bitmap = src->data; - values = (int32 *) (src->data + src->maxelements / 8); + values = (int32 *) (src->data + (src->maxelements + 7) / 8); for (i = 0; i < src->maxelements; i++) { diff --git a/hashset.control b/hashset.control index 3d0b9a4..89bb1ff 100644 --- a/hashset.control +++ b/hashset.control @@ -1,3 +1,3 @@ -comment = 'Provides tdigest aggregate function.' +comment = 'Provides hashset type.' default_version = '1.0.0' relocatable = true diff --git a/test/expected/basic.out b/test/expected/basic.out new file mode 100644 index 0000000..5be2501 --- /dev/null +++ b/test/expected/basic.out @@ -0,0 +1,96 @@ +SELECT hashset_sorted('{1}'::hashset); + hashset_sorted +---------------- + {1} +(1 row) + +SELECT hashset_sorted('{1,2}'::hashset); + hashset_sorted +---------------- + {1,2} +(1 row) + +SELECT hashset_sorted('{1,2,3}'::hashset); + hashset_sorted +---------------- + {1,2,3} +(1 row) + +SELECT hashset_sorted('{1,2,3,4}'::hashset); + hashset_sorted +---------------- + {1,2,3,4} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5}'::hashset); + hashset_sorted +---------------- + {1,2,3,4,5} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6}'::hashset); + hashset_sorted +---------------- + {1,2,3,4,5,6} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6,7}'::hashset); + hashset_sorted +----------------- + {1,2,3,4,5,6,7} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6,7,8}'::hashset); + hashset_sorted +------------------- + {1,2,3,4,5,6,7,8} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9}'::hashset); + hashset_sorted +--------------------- + {1,2,3,4,5,6,7,8,9} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10}'::hashset); + hashset_sorted +------------------------ + {1,2,3,4,5,6,7,8,9,10} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11}'::hashset); + hashset_sorted +--------------------------- + {1,2,3,4,5,6,7,8,9,10,11} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11,12}'::hashset); + hashset_sorted +------------------------------ + {1,2,3,4,5,6,7,8,9,10,11,12} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11,12,13}'::hashset); + hashset_sorted +--------------------------------- + {1,2,3,4,5,6,7,8,9,10,11,12,13} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11,12,13,14}'::hashset); + hashset_sorted +------------------------------------ + {1,2,3,4,5,6,7,8,9,10,11,12,13,14} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}'::hashset); + hashset_sorted +--------------------------------------- + {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15} +(1 row) + +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}'::hashset); + hashset_sorted +------------------------------------------ + {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16} +(1 row) + diff --git a/test/expected/prelude.out b/test/expected/prelude.out new file mode 100644 index 0000000..b094033 --- /dev/null +++ b/test/expected/prelude.out @@ -0,0 +1,7 @@ +CREATE EXTENSION hashset; +CREATE OR REPLACE FUNCTION hashset_sorted(hashset) +RETURNS TEXT AS +$$ +SELECT array_agg(i ORDER BY i::int)::text +FROM regexp_split_to_table(regexp_replace($1::text,'^{|}$','','g'),',') i +$$ LANGUAGE sql; diff --git a/test/expected/random.out b/test/expected/random.out new file mode 100644 index 0000000..889f5ca --- /dev/null +++ b/test/expected/random.out @@ -0,0 +1,38 @@ +SELECT setseed(0.12345); + setseed +--------- + +(1 row) + +\set MAX_INT 2147483647 +CREATE TABLE hashset_random_numbers AS + SELECT + (random()*:MAX_INT)::int AS i + FROM generate_series(1,(random()*10000)::int) +; +SELECT + md5(hashset_sorted) +FROM +( + SELECT + hashset_sorted(hashset(format('{%s}',string_agg(i::text,',')))) + FROM hashset_random_numbers +) q; + md5 +---------------------------------- + 4ad6e4233861becbeb4a665376952a16 +(1 row) + +SELECT + md5(input_sorted) +FROM +( + SELECT + format('{%s}',string_agg(i::text,',' ORDER BY i)) AS input_sorted + FROM hashset_random_numbers +) q; + md5 +---------------------------------- + 4ad6e4233861becbeb4a665376952a16 +(1 row) + diff --git a/test/expected/table.out b/test/expected/table.out new file mode 100644 index 0000000..8d4fbe2 --- /dev/null +++ b/test/expected/table.out @@ -0,0 +1,25 @@ +CREATE TABLE users ( + user_id int PRIMARY KEY, + user_likes hashset DEFAULT hashset_init(2) +); +INSERT INTO users (user_id) VALUES (1); +UPDATE users SET user_likes = hashset_add(user_likes, 101) WHERE user_id = 1; +UPDATE users SET user_likes = hashset_add(user_likes, 202) WHERE user_id = 1; +SELECT hashset_contains(user_likes, 101) FROM users WHERE user_id = 1; + hashset_contains +------------------ + t +(1 row) + +SELECT hashset_count(user_likes) FROM users WHERE user_id = 1; + hashset_count +--------------- + 2 +(1 row) + +SELECT hashset_sorted(user_likes) FROM users WHERE user_id = 1; + hashset_sorted +---------------- + {101,202} +(1 row) + diff --git a/test/sql/basic.sql b/test/sql/basic.sql new file mode 100644 index 0000000..662e65a --- /dev/null +++ b/test/sql/basic.sql @@ -0,0 +1,16 @@ +SELECT hashset_sorted('{1}'::hashset); +SELECT hashset_sorted('{1,2}'::hashset); +SELECT hashset_sorted('{1,2,3}'::hashset); +SELECT hashset_sorted('{1,2,3,4}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6,7}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6,7,8}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11,12}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11,12,13}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11,12,13,14}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}'::hashset); +SELECT hashset_sorted('{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}'::hashset); diff --git a/test/sql/prelude.sql b/test/sql/prelude.sql new file mode 100644 index 0000000..ccc0595 --- /dev/null +++ b/test/sql/prelude.sql @@ -0,0 +1,8 @@ +CREATE EXTENSION hashset; + +CREATE OR REPLACE FUNCTION hashset_sorted(hashset) +RETURNS TEXT AS +$$ +SELECT array_agg(i ORDER BY i::int)::text +FROM regexp_split_to_table(regexp_replace($1::text,'^{|}$','','g'),',') i +$$ LANGUAGE sql; diff --git a/test/sql/random.sql b/test/sql/random.sql new file mode 100644 index 0000000..16c9084 --- /dev/null +++ b/test/sql/random.sql @@ -0,0 +1,27 @@ +SELECT setseed(0.12345); + +\set MAX_INT 2147483647 + +CREATE TABLE hashset_random_numbers AS + SELECT + (random()*:MAX_INT)::int AS i + FROM generate_series(1,(random()*10000)::int) +; + +SELECT + md5(hashset_sorted) +FROM +( + SELECT + hashset_sorted(hashset(format('{%s}',string_agg(i::text,',')))) + FROM hashset_random_numbers +) q; + +SELECT + md5(input_sorted) +FROM +( + SELECT + format('{%s}',string_agg(i::text,',' ORDER BY i)) AS input_sorted + FROM hashset_random_numbers +) q; diff --git a/test/sql/table.sql b/test/sql/table.sql new file mode 100644 index 0000000..d848207 --- /dev/null +++ b/test/sql/table.sql @@ -0,0 +1,10 @@ +CREATE TABLE users ( + user_id int PRIMARY KEY, + user_likes hashset DEFAULT hashset_init(2) +); +INSERT INTO users (user_id) VALUES (1); +UPDATE users SET user_likes = hashset_add(user_likes, 101) WHERE user_id = 1; +UPDATE users SET user_likes = hashset_add(user_likes, 202) WHERE user_id = 1; +SELECT hashset_contains(user_likes, 101) FROM users WHERE user_id = 1; +SELECT hashset_count(user_likes) FROM users WHERE user_id = 1; +SELECT hashset_sorted(user_likes) FROM users WHERE user_id = 1;