diff --git a/doc/src/sgml/hash.sgml b/doc/src/sgml/hash.sgml
index d29cd18b8f..ce35d940df 100644
--- a/doc/src/sgml/hash.sgml
+++ b/doc/src/sgml/hash.sgml
@@ -22,22 +22,29 @@
- Hash indexes support only single-column indexes and do not allow
- uniqueness checking.
+ Hash indexes support uniqueness checking.
+ Hash indexes support multi-column indexes, but only store the hash value
+ for the first column, so multiple columns are useful only for uniquness
+ checking.
Hash indexes support only the = operator,
so WHERE clauses that specify range operations will not be able to take
- advantage of hash indexes.
+ advantage of hash indexes. The = operator must be
+ strict which means that NULL values cannot be searched for. As a result,
+ hash indexes choose not to store index pointers for NULL values, making
+ them effectively partial indexes WHERE col IS NOT NULL.
Each hash index tuple stores just the 4-byte hash value, not the actual
column value. As a result, hash indexes may be much smaller than B-trees
when indexing longer data items such as UUIDs, URLs, etc. The absence of
- the column value also makes all hash index scans lossy. Hash indexes may
- take part in bitmap index scans and backward scans.
+ the column value means that an index may return hash collisions, hence
+ all hash index scans are marked lossy and require the heap values to be
+ rechecked. Hash indexes may take part in bitmap index scans, as well as
+ both forward and backward scans of non-unique data.
@@ -61,7 +68,7 @@
that hash value. When scanning a hash bucket during queries, we need to
scan through all of the overflow pages. Thus an unbalanced hash index
might actually be worse than a B-tree in terms of number of block
- accesses required, for some data.
+ accesses required, for highly skewed non-unique data.
@@ -70,7 +77,8 @@
of rows per hash bucket.
One possible way to avoid problems is to exclude highly non-unique
values from the index using a partial index condition, but this may
- not be suitable in many cases.
+ not be suitable in many cases, though in the case of NULL values is
+ performed automatically.
@@ -94,7 +102,7 @@
- Hash indexes may expand the number of bucket pages as the number of
+ Hash indexes try to expand the number of bucket pages as the number of
rows indexed grows. The hash key-to-bucket-number mapping is chosen so that
the index can be incrementally expanded. When a new bucket is to be added to
the index, exactly one existing bucket will need to be "split", with some of
@@ -103,9 +111,12 @@
- The expansion occurs in the foreground, which could increase execution
- time for user inserts. Thus, hash indexes may not be suitable for tables
- with rapidly increasing number of rows.
+ The expansion occurs in the foreground in small incremental splits, so
+ the cost of expansion is spread across multiple inserts. Although this
+ causes minimal additional execution time, similar to a b-tree page split,
+ the activity requires us to track the number of rows in the index, so
+ reducing scalability for inserts on larger multi-core servers. Thus, hash
+ indexes may not be suitable for table with rapidly increasing number of rows.
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 0752fb38a9..2e668f3c1a 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -38,6 +38,7 @@ typedef struct
HSpool *spool; /* NULL if not using spooling */
double indtuples; /* # tuples accepted into index */
Relation heapRel; /* heap relation descriptor */
+ int NumIndexKeyAttrs; /* number of keys in index */
} HashBuildState;
static void hashbuildCallback(Relation index,
@@ -64,7 +65,7 @@ hashhandler(PG_FUNCTION_ARGS)
amroutine->amcanorderbyop = false;
amroutine->amcanbackward = true;
amroutine->amcanunique = false;
- amroutine->amcanmulticol = false;
+ amroutine->amcanmulticol = true;
amroutine->amoptionalkey = false;
amroutine->amsearcharray = false;
amroutine->amsearchnulls = false;
@@ -164,6 +165,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
/* prepare to build the index */
buildstate.indtuples = 0;
buildstate.heapRel = heap;
+ buildstate.NumIndexKeyAttrs = indexInfo->ii_NumIndexKeyAttrs;
/* do the heap scan */
reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
@@ -216,7 +218,7 @@ hashbuildCallback(Relation index,
IndexTuple itup;
/* convert data to a hash key; on failure, do not insert anything */
- if (!_hash_convert_tuple(index,
+ if (!_hash_convert_tuple(index, buildstate->NumIndexKeyAttrs,
values, isnull,
index_values, index_isnull))
return;
@@ -255,7 +257,7 @@ hashinsert(Relation rel, Datum *values, bool *isnull,
IndexTuple itup;
/* convert data to a hash key; on failure, do not insert anything */
- if (!_hash_convert_tuple(rel,
+ if (!_hash_convert_tuple(rel, indexInfo->ii_NumIndexKeyAttrs,
values, isnull,
index_values, index_isnull))
return false;
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 2ffa28e8f7..021a80ae63 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -317,8 +317,6 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
/* There may be more than one index qual, but we hash only the first */
cur = &scan->keyData[0];
- /* We support only single-column hash indexes */
- Assert(cur->sk_attno == 1);
/* And there's only one operator strategy, too */
Assert(cur->sk_strategy == HTEqualStrategyNumber);
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 519872850e..bcccc1e142 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -316,18 +316,22 @@ _hash_get_indextuple_hashkey(IndexTuple itup)
* currently support that.
*/
bool
-_hash_convert_tuple(Relation index,
+_hash_convert_tuple(Relation index, int nkeys,
Datum *user_values, bool *user_isnull,
Datum *index_values, bool *index_isnull)
{
uint32 hashkey;
+ int i;
/*
* We do not insert null values into hash indexes. This is okay because
* the only supported search operator is '=', and we assume it is strict.
*/
- if (user_isnull[0])
- return false;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (user_isnull[i])
+ return false;
+ }
hashkey = _hash_datum2hashkey(index, user_values[0]);
index_values[0] = UInt32GetDatum(hashkey);
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 1cce865be2..57c346fe28 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -460,7 +460,7 @@ extern uint32 _hash_spareindex(uint32 num_bucket);
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
-extern bool _hash_convert_tuple(Relation index,
+extern bool _hash_convert_tuple(Relation index, int nkeys,
Datum *user_values, bool *user_isnull,
Datum *index_values, bool *index_isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
diff --git a/src/test/regress/expected/amutils.out b/src/test/regress/expected/amutils.out
index 7ab6113c61..226aed4afd 100644
--- a/src/test/regress/expected/amutils.out
+++ b/src/test/regress/expected/amutils.out
@@ -163,7 +163,7 @@ select amname, prop, pg_indexam_has_property(a.oid, prop) as p
gist | bogus |
hash | can_order | f
hash | can_unique | f
- hash | can_multi_col | f
+ hash | can_multi_col | t
hash | can_exclude | t
hash | can_include | f
hash | bogus |
diff --git a/src/test/regress/expected/hash_index.out b/src/test/regress/expected/hash_index.out
index e23de21b41..78efd3dcf9 100644
--- a/src/test/regress/expected/hash_index.out
+++ b/src/test/regress/expected/hash_index.out
@@ -241,3 +241,63 @@ CREATE INDEX hash_f8_index2 ON hash_f8_heap USING hash (random float8_ops)
WITH (fillfactor=101);
ERROR: value 101 out of bounds for option "fillfactor"
DETAIL: Valid values are between "10" and "100".
+-- Test multi-column indexes
+CREATE TABLE hash_multicol (x integer, y integer, z integer);
+CREATE INDEX ON hash_multicol USING hash (x, y, z);
+INSERT INTO hash_multicol VALUES (1, 1, 1);
+INSERT INTO hash_multicol VALUES (1, 1, 2);
+INSERT INTO hash_multicol VALUES (1, 1, 3);
+INSERT INTO hash_multicol VALUES (1, 2, 1);
+INSERT INTO hash_multicol VALUES (1, 2, 2);
+INSERT INTO hash_multicol VALUES (2, 2, 2);
+ANALYZE hash_multicol;
+SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 2;
+ x | y | z
+---+---+---
+ 1 | 2 | 2
+(1 row)
+
+SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 3;
+ x | y | z
+---+---+---
+(0 rows)
+
+BEGIN;
+SET LOCAL enable_seqscan = off;
+EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON)
+SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 2;
+ QUERY PLAN
+-----------------------------------------------------------------------------------
+ Index Scan using hash_multicol_x_y_z_idx on hash_multicol (actual rows=1 loops=1)
+ Index Cond: ((x = 1) AND (y = 2) AND (z = 2))
+ Rows Removed by Index Recheck: 4
+(3 rows)
+
+ROLLBACK;
+-- with multiple indexes
+CREATE INDEX ON hash_multicol USING hash (y);
+CREATE INDEX ON hash_multicol USING hash (y, z);
+BEGIN;
+SET LOCAL enable_seqscan = off;
+EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON)
+SELECT * FROM hash_multicol WHERE y = 2 AND z = 2;
+ QUERY PLAN
+---------------------------------------------------------------------------------
+ Index Scan using hash_multicol_y_z_idx on hash_multicol (actual rows=2 loops=1)
+ Index Cond: ((y = 2) AND (z = 2))
+ Rows Removed by Index Recheck: 1
+(3 rows)
+
+EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON)
+SELECT * FROM hash_multicol WHERE y = 1;
+ QUERY PLAN
+---------------------------------------------------------------------------------
+ Index Scan using hash_multicol_y_z_idx on hash_multicol (actual rows=3 loops=1)
+ Index Cond: (y = 1)
+(2 rows)
+
+ROLLBACK;
+-- test NULLs
+INSERT INTO hash_multicol VALUES (1, 1, NULL);
+INSERT INTO hash_multicol VALUES (1, NULL, 1);
+DROP TABLE hash_multicol;
diff --git a/src/test/regress/sql/hash_index.sql b/src/test/regress/sql/hash_index.sql
index 4d1aa020a9..aab039e5ec 100644
--- a/src/test/regress/sql/hash_index.sql
+++ b/src/test/regress/sql/hash_index.sql
@@ -202,3 +202,37 @@ CREATE INDEX hash_f8_index2 ON hash_f8_heap USING hash (random float8_ops)
WITH (fillfactor=9);
CREATE INDEX hash_f8_index2 ON hash_f8_heap USING hash (random float8_ops)
WITH (fillfactor=101);
+
+-- Test multi-column indexes
+CREATE TABLE hash_multicol (x integer, y integer, z integer);
+CREATE INDEX ON hash_multicol USING hash (x, y, z);
+INSERT INTO hash_multicol VALUES (1, 1, 1);
+INSERT INTO hash_multicol VALUES (1, 1, 2);
+INSERT INTO hash_multicol VALUES (1, 1, 3);
+INSERT INTO hash_multicol VALUES (1, 2, 1);
+INSERT INTO hash_multicol VALUES (1, 2, 2);
+INSERT INTO hash_multicol VALUES (2, 2, 2);
+ANALYZE hash_multicol;
+SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 2;
+SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 3;
+BEGIN;
+SET LOCAL enable_seqscan = off;
+EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON)
+SELECT * FROM hash_multicol WHERE x = 1 AND y = 2 AND z = 2;
+ROLLBACK;
+
+-- with multiple indexes
+CREATE INDEX ON hash_multicol USING hash (y);
+CREATE INDEX ON hash_multicol USING hash (y, z);
+BEGIN;
+SET LOCAL enable_seqscan = off;
+EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON)
+SELECT * FROM hash_multicol WHERE y = 2 AND z = 2;
+EXPLAIN (SUMMARY OFF, COSTS OFF, TIMING OFF, ANALYZE ON)
+SELECT * FROM hash_multicol WHERE y = 1;
+ROLLBACK;
+
+-- test NULLs
+INSERT INTO hash_multicol VALUES (1, 1, NULL);
+INSERT INTO hash_multicol VALUES (1, NULL, 1);
+DROP TABLE hash_multicol;