diff --git a/doc/src/sgml/hash.sgml b/doc/src/sgml/hash.sgml index d29cd18b8f..ce35d940df 100644 --- a/doc/src/sgml/hash.sgml +++ b/doc/src/sgml/hash.sgml @@ -22,22 +22,29 @@ - Hash indexes support only single-column indexes and do not allow - uniqueness checking. + Hash indexes support uniqueness checking. + Hash indexes support multi-column indexes, but only store the hash value + for the first column, so multiple columns are useful only for uniquness + checking. Hash indexes support only the = operator, so WHERE clauses that specify range operations will not be able to take - advantage of hash indexes. + advantage of hash indexes. The = operator must be + strict which means that NULL values cannot be searched for. As a result, + hash indexes choose not to store index pointers for NULL values, making + them effectively partial indexes WHERE col IS NOT NULL. Each hash index tuple stores just the 4-byte hash value, not the actual column value. As a result, hash indexes may be much smaller than B-trees when indexing longer data items such as UUIDs, URLs, etc. The absence of - the column value also makes all hash index scans lossy. Hash indexes may - take part in bitmap index scans and backward scans. + the column value means that an index may return hash collisions, hence + all hash index scans are marked lossy and require the heap values to be + rechecked. Hash indexes may take part in bitmap index scans, as well as + both forward and backward scans of non-unique data. @@ -61,7 +68,7 @@ that hash value. When scanning a hash bucket during queries, we need to scan through all of the overflow pages. Thus an unbalanced hash index might actually be worse than a B-tree in terms of number of block - accesses required, for some data. + accesses required, for highly skewed non-unique data. @@ -70,7 +77,8 @@ of rows per hash bucket. One possible way to avoid problems is to exclude highly non-unique values from the index using a partial index condition, but this may - not be suitable in many cases. + not be suitable in many cases, though in the case of NULL values is + performed automatically. @@ -94,7 +102,7 @@ - Hash indexes may expand the number of bucket pages as the number of + Hash indexes try to expand the number of bucket pages as the number of rows indexed grows. The hash key-to-bucket-number mapping is chosen so that the index can be incrementally expanded. When a new bucket is to be added to the index, exactly one existing bucket will need to be "split", with some of @@ -103,9 +111,12 @@ - The expansion occurs in the foreground, which could increase execution - time for user inserts. Thus, hash indexes may not be suitable for tables - with rapidly increasing number of rows. + The expansion occurs in the foreground in small incremental splits, so + the cost of expansion is spread across multiple inserts. Although this + causes minimal additional execution time, similar to a b-tree page split, + the activity requires us to track the number of rows in the index, so + reducing scalability for inserts on larger multi-core servers. Thus, hash + indexes may not be suitable for table with rapidly increasing number of rows. diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 0752fb38a9..2e668f3c1a 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -38,6 +38,7 @@ typedef struct HSpool *spool; /* NULL if not using spooling */ double indtuples; /* # tuples accepted into index */ Relation heapRel; /* heap relation descriptor */ + int NumIndexKeyAttrs; /* number of keys in index */ } HashBuildState; static void hashbuildCallback(Relation index, @@ -64,7 +65,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->amcanorderbyop = false; amroutine->amcanbackward = true; amroutine->amcanunique = false; - amroutine->amcanmulticol = false; + amroutine->amcanmulticol = true; amroutine->amoptionalkey = false; amroutine->amsearcharray = false; amroutine->amsearchnulls = false; @@ -164,6 +165,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) /* prepare to build the index */ buildstate.indtuples = 0; buildstate.heapRel = heap; + buildstate.NumIndexKeyAttrs = indexInfo->ii_NumIndexKeyAttrs; /* do the heap scan */ reltuples = table_index_build_scan(heap, index, indexInfo, true, true, @@ -216,7 +218,7 @@ hashbuildCallback(Relation index, IndexTuple itup; /* convert data to a hash key; on failure, do not insert anything */ - if (!_hash_convert_tuple(index, + if (!_hash_convert_tuple(index, buildstate->NumIndexKeyAttrs, values, isnull, index_values, index_isnull)) return; @@ -255,7 +257,7 @@ hashinsert(Relation rel, Datum *values, bool *isnull, IndexTuple itup; /* convert data to a hash key; on failure, do not insert anything */ - if (!_hash_convert_tuple(rel, + if (!_hash_convert_tuple(rel, indexInfo->ii_NumIndexKeyAttrs, values, isnull, index_values, index_isnull)) return false; diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 2ffa28e8f7..021a80ae63 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -317,8 +317,6 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) /* There may be more than one index qual, but we hash only the first */ cur = &scan->keyData[0]; - /* We support only single-column hash indexes */ - Assert(cur->sk_attno == 1); /* And there's only one operator strategy, too */ Assert(cur->sk_strategy == HTEqualStrategyNumber); diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index 519872850e..bcccc1e142 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -316,18 +316,22 @@ _hash_get_indextuple_hashkey(IndexTuple itup) * currently support that. */ bool -_hash_convert_tuple(Relation index, +_hash_convert_tuple(Relation index, int nkeys, Datum *user_values, bool *user_isnull, Datum *index_values, bool *index_isnull) { uint32 hashkey; + int i; /* * We do not insert null values into hash indexes. This is okay because * the only supported search operator is '=', and we assume it is strict. */ - if (user_isnull[0]) - return false; + for (i = 0; i < nkeys; i++) + { + if (user_isnull[i]) + return false; + } hashkey = _hash_datum2hashkey(index, user_values[0]); index_values[0] = UInt32GetDatum(hashkey); diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 1cce865be2..57c346fe28 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -460,7 +460,7 @@ extern uint32 _hash_spareindex(uint32 num_bucket); extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase); extern void _hash_checkpage(Relation rel, Buffer buf, int flags); extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup); -extern bool _hash_convert_tuple(Relation index, +extern bool _hash_convert_tuple(Relation index, int nkeys, Datum *user_values, bool *user_isnull, Datum *index_values, bool *index_isnull); extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value); diff --git a/src/test/regress/expected/amutils.out b/src/test/regress/expected/amutils.out index 7ab6113c61..226aed4afd 100644 --- a/src/test/regress/expected/amutils.out +++ b/src/test/regress/expected/amutils.out @@ -163,7 +163,7 @@ select amname, prop, pg_indexam_has_property(a.oid, prop) as p gist | bogus | hash | can_order | f hash | can_unique | f - hash | can_multi_col | f + hash | can_multi_col | t hash | can_exclude | t hash | can_include | f hash | bogus | diff --git a/src/test/regress/expected/hash_index.out b/src/test/regress/expected/hash_index.out index e23de21b41..ac4c9b99d4 100644 --- a/src/test/regress/expected/hash_index.out +++ b/src/test/regress/expected/hash_index.out @@ -241,3 +241,60 @@ CREATE INDEX hash_f8_index2 ON hash_f8_heap USING hash (random float8_ops) WITH (fillfactor=101); ERROR: value 101 out of bounds for option "fillfactor" DETAIL: Valid values are between "10" and "100". +-- Test multi-column indexes +CREATE TABLE hash_multicol (x integer, y integer, z integer); +CREATE INDEX ON hash_multicol USING hash (x, y, z); +INSERT INTO hash_multicol VALUES (1, 1, 1); +INSERT INTO hash_multicol VALUES (1, 1, 2); +INSERT INTO hash_multicol VALUES (1, 1, 3); +INSERT INTO hash_multicol VALUES (1, 2, 1); +INSERT INTO hash_multicol VALUES (1, 2, 2); +INSERT INTO hash_multicol VALUES (2, 2, 2); +ANALYZE hash_multicol; +SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 2; + x | y | z +---+---+--- + 1 | 2 | 2 +(1 row) + +SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 3; + x | y | z +---+---+--- +(0 rows) + +BEGIN; +SET LOCAL enable_seqscan = off; +EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON) +SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 2; + QUERY PLAN +----------------------------------------------------------------------------------- + Index Scan using hash_multicol_x_y_z_idx on hash_multicol (actual rows=1 loops=1) + Index Cond: ((x = 1) AND (y = 2) AND (z = 2)) + Rows Removed by Index Recheck: 4 +(3 rows) + +ROLLBACK; +-- with multiple indexes +CREATE INDEX ON hash_multicol USING hash (y); +CREATE INDEX ON hash_multicol USING hash (y, z); +BEGIN; +SET LOCAL enable_seqscan = off; +EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON) +SELECT * FROM hash_multicol WHERE y = 2 AND z = 2; + QUERY PLAN +--------------------------------------------------------------------------------- + Index Scan using hash_multicol_y_z_idx on hash_multicol (actual rows=2 loops=1) + Index Cond: ((y = 2) AND (z = 2)) + Rows Removed by Index Recheck: 1 +(3 rows) + +EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON) +SELECT * FROM hash_multicol WHERE y = 1; + QUERY PLAN +--------------------------------------------------------------------------------- + Index Scan using hash_multicol_y_z_idx on hash_multicol (actual rows=3 loops=1) + Index Cond: (y = 1) +(2 rows) + +ROLLBACK; +DROP TABLE hash_multicol; diff --git a/src/test/regress/sql/hash_index.sql b/src/test/regress/sql/hash_index.sql index 4d1aa020a9..aab039e5ec 100644 --- a/src/test/regress/sql/hash_index.sql +++ b/src/test/regress/sql/hash_index.sql @@ -202,3 +202,37 @@ CREATE INDEX hash_f8_index2 ON hash_f8_heap USING hash (random float8_ops) WITH (fillfactor=9); CREATE INDEX hash_f8_index2 ON hash_f8_heap USING hash (random float8_ops) WITH (fillfactor=101); + +-- Test multi-column indexes +CREATE TABLE hash_multicol (x integer, y integer, z integer); +CREATE INDEX ON hash_multicol USING hash (x, y, z); +INSERT INTO hash_multicol VALUES (1, 1, 1); +INSERT INTO hash_multicol VALUES (1, 1, 2); +INSERT INTO hash_multicol VALUES (1, 1, 3); +INSERT INTO hash_multicol VALUES (1, 2, 1); +INSERT INTO hash_multicol VALUES (1, 2, 2); +INSERT INTO hash_multicol VALUES (2, 2, 2); +ANALYZE hash_multicol; +SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 2; +SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 3; +BEGIN; +SET LOCAL enable_seqscan = off; +EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON) +SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 2; +ROLLBACK; + +-- with multiple indexes +CREATE INDEX ON hash_multicol USING hash (y); +CREATE INDEX ON hash_multicol USING hash (y, z); +BEGIN; +SET LOCAL enable_seqscan = off; +EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON) +SELECT * FROM hash_multicol WHERE y = 2 AND z = 2; +EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON) +SELECT * FROM hash_multicol WHERE y = 1; +ROLLBACK; + +-- test NULLs +INSERT INTO hash_multicol VALUES (1, 1, NULL); +INSERT INTO hash_multicol VALUES (1, NULL, 1); +DROP TABLE hash_multicol;