PATCH: postpone building buckets to the end of Hash (in HashJoin)
Hi,
attached is v1 of one of the hashjoin improvements mentioned in
September in the lengthy thread [1].
The main objection against simply removing the MaxAllocSize check (and
switching to MemoryContextAllocHuge) is that if the number of rows is
overestimated, we may consume significantly more memory than necessary.
We run into this issue because we allocate the buckets at the very
beginning, based on the estimate. I've noticed we don't really need to
do that - we don't really use the buckets until after the Hash node
completes, and we don't even use it when incrementing the number of
batches (we use the dense allocation for that).
So this patch removes this - it postpones allocating the buckets to the
end of MultiExecHash(), and at that point we know pretty well what is
the optimal number of buckets.
This makes tracking nbuckets_optimal/log2_nbuckets_optimal unnecessary,
as we can simply use nbuckets/log2_nbuckets for that purpose. I've also
removed nbuckets_original, but maybe that's not a good idea and we want
to keep that information (OTOH we never use that number of buckets).
This patch does not change the estimation in ExecChooseHashTableSize()
at all, because we still need to do that to get nbucket/nbatch. Maybe
this is another opportunity for improvement in case of overestimates,
because in that case it may happen that we do batching even when we
could do without it. So we might start with nbuckets=1024 and
nbatches=1, and only switch to the estimated number of batches if really
needed.
This patch also does not mess with the allocation, i.e. it still uses
the MaxAllocSize limit (which amounts to ~256MB due to the doubling,
IIRC), but it should make it easier to do that change.
[1]: /messages/by-id/19746.1443983463@sss.pgh.pa.us
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachments:
hash-delay-buckets-v1.patchtext/x-diff; name=hash-delay-buckets-v1.patchDownload
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 12dae77..8fd9c9f 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2116,21 +2116,17 @@ show_hash_info(HashState *hashstate, ExplainState *es)
if (es->format != EXPLAIN_FORMAT_TEXT)
{
ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
- ExplainPropertyLong("Original Hash Buckets",
- hashtable->nbuckets_original, es);
ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
ExplainPropertyLong("Original Hash Batches",
hashtable->nbatch_original, es);
ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
}
- else if (hashtable->nbatch_original != hashtable->nbatch ||
- hashtable->nbuckets_original != hashtable->nbuckets)
+ else if (hashtable->nbatch_original != hashtable->nbatch)
{
appendStringInfoSpaces(es->str, es->indent * 2);
appendStringInfo(es->str,
- "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n",
+ "Buckets: %d Batches: %d (originally %d) Memory Usage: %ldkB\n",
hashtable->nbuckets,
- hashtable->nbuckets_original,
hashtable->nbatch,
hashtable->nbatch_original,
spacePeakKb);
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 5e05ec3..b0cf97d 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,7 +39,7 @@
static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
-static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
+static void ExecHashBuildBuckets(HashJoinTable hashtable);
static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
int mcvsToUse);
static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -129,9 +129,8 @@ MultiExecHash(HashState *node)
}
}
- /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
- if (hashtable->nbuckets != hashtable->nbuckets_optimal)
- ExecHashIncreaseNumBuckets(hashtable);
+ /* Construct the actual hash table (using the optimal number of buckets). */
+ ExecHashBuildBuckets(hashtable);
/* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
@@ -283,10 +282,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
*/
hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
hashtable->nbuckets = nbuckets;
- hashtable->nbuckets_original = nbuckets;
- hashtable->nbuckets_optimal = nbuckets;
hashtable->log2_nbuckets = log2_nbuckets;
- hashtable->log2_nbuckets_optimal = log2_nbuckets;
hashtable->buckets = NULL;
hashtable->keepNulls = keepNulls;
hashtable->skewEnabled = false;
@@ -372,18 +368,11 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
}
/*
- * Prepare context for the first-scan space allocations; allocate the
- * hashbucket array therein, and set each bucket "empty".
- */
- MemoryContextSwitchTo(hashtable->batchCxt);
-
- hashtable->buckets = (HashJoinTuple *)
- palloc0(nbuckets * sizeof(HashJoinTuple));
-
- /*
* Set up for skew optimization, if possible and there's a need for more
* than one batch. (In a one-batch join, there's no point in it.)
*/
+ MemoryContextSwitchTo(hashtable->batchCxt);
+
if (nbatch > 1)
ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs);
@@ -654,19 +643,6 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
*/
ninmemory = nfreed = 0;
- /* If know we need to resize nbuckets, we can do it while rebatching. */
- if (hashtable->nbuckets_optimal != hashtable->nbuckets)
- {
- /* we never decrease the number of buckets */
- Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
-
- hashtable->nbuckets = hashtable->nbuckets_optimal;
- hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
-
- hashtable->buckets = repalloc(hashtable->buckets,
- sizeof(HashJoinTuple) * hashtable->nbuckets);
- }
-
/*
* We will scan through the chunks directly, so that we can reset the
* buckets now and not have to keep track which tuples in the buckets have
@@ -704,10 +680,6 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
copyTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);
memcpy(copyTuple, hashTuple, hashTupleSize);
-
- /* and add it back to the appropriate bucket */
- copyTuple->next = hashtable->buckets[bucketno];
- hashtable->buckets[bucketno] = copyTuple;
}
else
{
@@ -753,27 +725,20 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
}
/*
- * ExecHashIncreaseNumBuckets
- * increase the original number of buckets in order to reduce
- * number of tuples per bucket
+ * ExecHashBuildBuckets
+ * complete building the hash table by allocating the optimal number of
+ * buckets and filling them with tuples
*/
static void
-ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+ExecHashBuildBuckets(HashJoinTable hashtable)
{
HashMemoryChunk chunk;
- /* do nothing if not an increase (it's called increase for a reason) */
- if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
- return;
-
#ifdef HJDEBUG
- printf("Increasing nbuckets %d => %d\n",
- hashtable->nbuckets, hashtable->nbuckets_optimal);
+ printf("Constructing table with nbuckets %d\n", hashtable->nbuckets);
#endif
- hashtable->nbuckets = hashtable->nbuckets_optimal;
- hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
-
+ Assert(hashtable->buckets == NULL);
Assert(hashtable->nbuckets > 1);
Assert(hashtable->nbuckets <= (INT_MAX / 2));
Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
@@ -785,10 +750,7 @@ ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
* chunks)
*/
hashtable->buckets =
- (HashJoinTuple *) repalloc(hashtable->buckets,
- hashtable->nbuckets * sizeof(HashJoinTuple));
-
- memset(hashtable->buckets, 0, hashtable->nbuckets * sizeof(HashJoinTuple));
+ (HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
/* scan through all tuples in all chunks to rebuild the hash table */
for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
@@ -816,7 +778,7 @@ ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
}
#ifdef HJDEBUG
- printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+ printf("Nbuckets set to %d, average items per bucket %.1f\n",
hashtable->nbuckets, batchTuples / hashtable->nbuckets);
#endif
}
@@ -872,9 +834,15 @@ ExecHashTableInsert(HashJoinTable hashtable,
*/
HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
- /* Push it onto the front of the bucket's list */
- hashTuple->next = hashtable->buckets[bucketno];
- hashtable->buckets[bucketno] = hashTuple;
+ /*
+ * We only do this if we already have buckets allocated, i.e. after
+ * the first batch.
+ */
+ if (hashtable->buckets != NULL)
+ {
+ hashTuple->next = hashtable->buckets[bucketno];
+ hashtable->buckets[bucketno] = hashTuple;
+ }
/*
* Increase the (optimal) number of buckets if we just exceeded the
@@ -882,14 +850,14 @@ ExecHashTableInsert(HashJoinTable hashtable,
* batch.
*/
if (hashtable->nbatch == 1 &&
- ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
+ ntuples > (hashtable->nbuckets * NTUP_PER_BUCKET))
{
/* Guard against integer overflow and alloc size overflow */
- if (hashtable->nbuckets_optimal <= INT_MAX / 2 &&
- hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
+ if (hashtable->nbuckets <= INT_MAX / 2 &&
+ hashtable->nbuckets * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
{
- hashtable->nbuckets_optimal *= 2;
- hashtable->log2_nbuckets_optimal += 1;
+ hashtable->nbuckets *= 2;
+ hashtable->log2_nbuckets += 1;
}
}
@@ -898,7 +866,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
if (hashtable->spaceUsed > hashtable->spacePeak)
hashtable->spacePeak = hashtable->spaceUsed;
if (hashtable->spaceUsed +
- hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
+ hashtable->nbuckets * sizeof(HashJoinTuple)
> hashtable->spaceAllowed)
ExecHashIncreaseNumBatches(hashtable);
}
@@ -1216,7 +1184,6 @@ void
ExecHashTableReset(HashJoinTable hashtable)
{
MemoryContext oldcxt;
- int nbuckets = hashtable->nbuckets;
/*
* Release all the hash buckets and tuples acquired in the prior pass, and
@@ -1226,8 +1193,8 @@ ExecHashTableReset(HashJoinTable hashtable)
oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);
/* Reallocate and reinitialize the hash bucket headers. */
- hashtable->buckets = (HashJoinTuple *)
- palloc0(nbuckets * sizeof(HashJoinTuple));
+ hashtable->buckets =
+ (HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
hashtable->spaceUsed = 0;
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 7a51ea6..255c506 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -128,11 +128,6 @@ typedef struct HashJoinTableData
int nbuckets; /* # buckets in the in-memory hash table */
int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */
- int nbuckets_original; /* # buckets when starting the first
- * hash */
- int nbuckets_optimal; /* optimal # buckets (per batch) */
- int log2_nbuckets_optimal; /* log2(nbuckets_optimal) */
-
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
struct HashJoinTupleData **buckets;
/* buckets array is per-batch storage, as are all the tuples */
On Mon, Dec 14, 2015 at 3:04 PM, Tomas Vondra
<tomas.vondra@2ndquadrant.com> wrote:
attached is v1 of one of the hashjoin improvements mentioned in September in
the lengthy thread [1]. The main objection against simply removing the MaxAllocSize check (and
switching to MemoryContextAllocHuge) is that if the number of rows is
overestimated, we may consume significantly more memory than necessary. We run into this issue because we allocate the buckets at the very
beginning, based on the estimate. I've noticed we don't really need to do
that - we don't really use the buckets until after the Hash node completes,
and we don't even use it when incrementing the number of batches (we use the
dense allocation for that). So this patch removes this - it postpones allocating the buckets to the end
of MultiExecHash(), and at that point we know pretty well what is the
optimal number of buckets. This makes tracking nbuckets_optimal/log2_nbuckets_optimal unnecessary, as
we can simply use nbuckets/log2_nbuckets for that purpose. I've also removed
nbuckets_original, but maybe that's not a good idea and we want to keep that
information (OTOH we never use that number of buckets). This patch does not change the estimation in ExecChooseHashTableSize() at
all, because we still need to do that to get nbucket/nbatch. Maybe this is
another opportunity for improvement in case of overestimates, because in
that case it may happen that we do batching even when we could do without
it. So we might start with nbuckets=1024 and nbatches=1, and only switch to
the estimated number of batches if really needed. This patch also does not mess with the allocation, i.e. it still uses the
MaxAllocSize limit (which amounts to ~256MB due to the doubling, IIRC), but
it should make it easier to do that change.
If this doesn't regress performance in the case where the number of
buckets is estimated accurately to begin with, then I think this is a
great idea. Can you supply some performance tests results for that
case, and maybe some of the other cases also?
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hi,
On 12/17/2015 07:20 PM, Robert Haas wrote:
...
If this doesn't regress performance in the case where the number of
buckets is estimated accurately to begin with, then I think this is
a great idea. Can you supply some performance tests results for that
case, and maybe some of the other cases also?
I don't see how it could regress performance, and the benchmarks I've
done confirm that. I'll do more thorough benchmarking and post the
results here, but not now as this patch is in 2016-01 CF and I want to
put all my time into reviewing patches from the open commitfest.
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hi,
attached is v2 of the patch, with a bugfix and two significant improvements:
1) bugfix: forgotten memset() in ExecHashIncreaseNumBatches()
Caused segfaults whenever we started with a single batch and then
had to increase the number of batches.
2) 0002: postpone the batching (not just build of buckets)
When ExecChooseHashTableSize finds out we'll probably need batching,
we start with nbatch=1 anyway, and only switch to batching (with the
estimated number of batches) after filling work_mem.
This helps in two basic cases - firstly, when we do over-estimates
we may end up doing batching even when we could do with a single
batch (i.e. without writing to temp files at all). That's really
expensive, and this helps with that - not entirely, because we can
only distinguish "no batching vs. batching" case and not the number
of batches, but that's the more interesting case anyway (the
difference between batching with 16 or 32 batches is not large, at
least in my experience).
The other use case is the patch adding bloom filters, because it
helps with properly sizing the bloom filter, which needs the number
of distinct values. So while the filter might be sized using the
estimated number of tuples passed to the Hash, it's likely to be
larger than needed and thus more expensive. So sizing the bloom
filter is crucial, and this change allows accumulating enough
data to estimate the size of the bloom filter first. More discussion in
the other patch.
3) 0003: eliminate the MaxAlloc limit
Currently the size of buckets is limited by MaxAlloc, which means it
can't exceed 512MB (assuming I've done my math correctly), which
means ~67M buckets. This removes the MaxAlloc limit and just keeps
the (INT_MAX/2) limit, so ~2B rows. I don't think it's worth
extending further at this point.
There was a discussion about this, quoting the f2fc98fb message:
Limit the size of the hashtable pointer array to not more than
MaxAllocSize, per reports from Kouhei Kaigai and others of
"invalid memory alloc request size" failures. There was
discussion of allowing the array to get larger than that by using
the "huge" palloc API, but so far no proof that that is actually
a good idea, and at this point in the 9.5 cycle major changes
from old behavior don't seem like the way to go.
I believe the objections were correct, because it'd really waste a
lot of memory in case of over-estimates. I do however think that
doing (1) and (2) fixes this issue, because with those changes we
never allocate the buckets based on the initial estimate. We pretty
much get the exact number of buckets necessary.
I haven't done any performance testing yet (well, I did, but not
reliable enough for sharing here). I'll post more data early January,
once the machine completes other tests it's running right now.
I've also been thinking about what other optimizations might be
possible, and the one thing I'm wondering about is adding HyperLogLog
counter. The thing is that we do size buckets based on number of rows,
but that may be nonsense - to size the hash table, we actually need
number of distinct values. So when the Hash contains duplicate rows (say
10 rows for each value), we end up with only ~10% of buckets containing
any data (about 10 tuples in a list).
If we knew the number of distinct values, we could make the buckets
smaller and thus reduce the memory requirements significantly.
I mention this because the patch with bloom filters actually uses HLL to
size the bloom filters, so this would not be really introducing any new
code.
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachments:
0001-delayed-build-of-hash-buckets-v2.patchtext/x-diff; name=0001-delayed-build-of-hash-buckets-v2.patchDownload
>From 3aaf27f722e6c90702af8f51fd11a95b8c173c38 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Sun, 27 Dec 2015 18:25:51 +0100
Subject: [PATCH 1/5] delayed build of hash buckets v2
Removes forgotten memset in ExecHashIncreaseNumBatches() causing
crashes when we start without batching and find out we need to
do batching later.
---
src/backend/commands/explain.c | 8 +---
src/backend/executor/nodeHash.c | 94 +++++++++++++----------------------------
src/include/executor/hashjoin.h | 5 ---
3 files changed, 32 insertions(+), 75 deletions(-)
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 12dae77..8fd9c9f 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2116,21 +2116,17 @@ show_hash_info(HashState *hashstate, ExplainState *es)
if (es->format != EXPLAIN_FORMAT_TEXT)
{
ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
- ExplainPropertyLong("Original Hash Buckets",
- hashtable->nbuckets_original, es);
ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
ExplainPropertyLong("Original Hash Batches",
hashtable->nbatch_original, es);
ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
}
- else if (hashtable->nbatch_original != hashtable->nbatch ||
- hashtable->nbuckets_original != hashtable->nbuckets)
+ else if (hashtable->nbatch_original != hashtable->nbatch)
{
appendStringInfoSpaces(es->str, es->indent * 2);
appendStringInfo(es->str,
- "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n",
+ "Buckets: %d Batches: %d (originally %d) Memory Usage: %ldkB\n",
hashtable->nbuckets,
- hashtable->nbuckets_original,
hashtable->nbatch,
hashtable->nbatch_original,
spacePeakKb);
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 5e05ec3..7708581 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,7 +39,7 @@
static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
-static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
+static void ExecHashBuildBuckets(HashJoinTable hashtable);
static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
int mcvsToUse);
static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -129,9 +129,8 @@ MultiExecHash(HashState *node)
}
}
- /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
- if (hashtable->nbuckets != hashtable->nbuckets_optimal)
- ExecHashIncreaseNumBuckets(hashtable);
+ /* Construct the actual hash table (using the optimal number of buckets). */
+ ExecHashBuildBuckets(hashtable);
/* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
@@ -283,10 +282,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
*/
hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
hashtable->nbuckets = nbuckets;
- hashtable->nbuckets_original = nbuckets;
- hashtable->nbuckets_optimal = nbuckets;
hashtable->log2_nbuckets = log2_nbuckets;
- hashtable->log2_nbuckets_optimal = log2_nbuckets;
hashtable->buckets = NULL;
hashtable->keepNulls = keepNulls;
hashtable->skewEnabled = false;
@@ -372,18 +368,11 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
}
/*
- * Prepare context for the first-scan space allocations; allocate the
- * hashbucket array therein, and set each bucket "empty".
- */
- MemoryContextSwitchTo(hashtable->batchCxt);
-
- hashtable->buckets = (HashJoinTuple *)
- palloc0(nbuckets * sizeof(HashJoinTuple));
-
- /*
* Set up for skew optimization, if possible and there's a need for more
* than one batch. (In a one-batch join, there's no point in it.)
*/
+ MemoryContextSwitchTo(hashtable->batchCxt);
+
if (nbatch > 1)
ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs);
@@ -654,25 +643,11 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
*/
ninmemory = nfreed = 0;
- /* If know we need to resize nbuckets, we can do it while rebatching. */
- if (hashtable->nbuckets_optimal != hashtable->nbuckets)
- {
- /* we never decrease the number of buckets */
- Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
-
- hashtable->nbuckets = hashtable->nbuckets_optimal;
- hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
-
- hashtable->buckets = repalloc(hashtable->buckets,
- sizeof(HashJoinTuple) * hashtable->nbuckets);
- }
-
/*
* We will scan through the chunks directly, so that we can reset the
* buckets now and not have to keep track which tuples in the buckets have
* already been processed. We will free the old chunks as we go.
*/
- memset(hashtable->buckets, 0, sizeof(HashJoinTuple) * hashtable->nbuckets);
oldchunks = hashtable->chunks;
hashtable->chunks = NULL;
@@ -704,10 +679,6 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
copyTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);
memcpy(copyTuple, hashTuple, hashTupleSize);
-
- /* and add it back to the appropriate bucket */
- copyTuple->next = hashtable->buckets[bucketno];
- hashtable->buckets[bucketno] = copyTuple;
}
else
{
@@ -753,27 +724,20 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
}
/*
- * ExecHashIncreaseNumBuckets
- * increase the original number of buckets in order to reduce
- * number of tuples per bucket
+ * ExecHashBuildBuckets
+ * complete building the hash table by allocating the optimal number of
+ * buckets and filling them with tuples
*/
static void
-ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+ExecHashBuildBuckets(HashJoinTable hashtable)
{
HashMemoryChunk chunk;
- /* do nothing if not an increase (it's called increase for a reason) */
- if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
- return;
-
#ifdef HJDEBUG
- printf("Increasing nbuckets %d => %d\n",
- hashtable->nbuckets, hashtable->nbuckets_optimal);
+ printf("Constructing table with nbuckets %d\n", hashtable->nbuckets);
#endif
- hashtable->nbuckets = hashtable->nbuckets_optimal;
- hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
-
+ Assert(hashtable->buckets == NULL);
Assert(hashtable->nbuckets > 1);
Assert(hashtable->nbuckets <= (INT_MAX / 2));
Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
@@ -785,10 +749,7 @@ ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
* chunks)
*/
hashtable->buckets =
- (HashJoinTuple *) repalloc(hashtable->buckets,
- hashtable->nbuckets * sizeof(HashJoinTuple));
-
- memset(hashtable->buckets, 0, hashtable->nbuckets * sizeof(HashJoinTuple));
+ (HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
/* scan through all tuples in all chunks to rebuild the hash table */
for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
@@ -816,7 +777,7 @@ ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
}
#ifdef HJDEBUG
- printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+ printf("Nbuckets set to %d, average items per bucket %.1f\n",
hashtable->nbuckets, batchTuples / hashtable->nbuckets);
#endif
}
@@ -872,9 +833,15 @@ ExecHashTableInsert(HashJoinTable hashtable,
*/
HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
- /* Push it onto the front of the bucket's list */
- hashTuple->next = hashtable->buckets[bucketno];
- hashtable->buckets[bucketno] = hashTuple;
+ /*
+ * We only do this if we already have buckets allocated, i.e. after
+ * the first batch.
+ */
+ if (hashtable->buckets != NULL)
+ {
+ hashTuple->next = hashtable->buckets[bucketno];
+ hashtable->buckets[bucketno] = hashTuple;
+ }
/*
* Increase the (optimal) number of buckets if we just exceeded the
@@ -882,14 +849,14 @@ ExecHashTableInsert(HashJoinTable hashtable,
* batch.
*/
if (hashtable->nbatch == 1 &&
- ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
+ ntuples > (hashtable->nbuckets * NTUP_PER_BUCKET))
{
/* Guard against integer overflow and alloc size overflow */
- if (hashtable->nbuckets_optimal <= INT_MAX / 2 &&
- hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
+ if (hashtable->nbuckets <= INT_MAX / 2 &&
+ hashtable->nbuckets * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
{
- hashtable->nbuckets_optimal *= 2;
- hashtable->log2_nbuckets_optimal += 1;
+ hashtable->nbuckets *= 2;
+ hashtable->log2_nbuckets += 1;
}
}
@@ -898,7 +865,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
if (hashtable->spaceUsed > hashtable->spacePeak)
hashtable->spacePeak = hashtable->spaceUsed;
if (hashtable->spaceUsed +
- hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
+ hashtable->nbuckets * sizeof(HashJoinTuple)
> hashtable->spaceAllowed)
ExecHashIncreaseNumBatches(hashtable);
}
@@ -1216,7 +1183,6 @@ void
ExecHashTableReset(HashJoinTable hashtable)
{
MemoryContext oldcxt;
- int nbuckets = hashtable->nbuckets;
/*
* Release all the hash buckets and tuples acquired in the prior pass, and
@@ -1226,8 +1192,8 @@ ExecHashTableReset(HashJoinTable hashtable)
oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);
/* Reallocate and reinitialize the hash bucket headers. */
- hashtable->buckets = (HashJoinTuple *)
- palloc0(nbuckets * sizeof(HashJoinTuple));
+ hashtable->buckets =
+ (HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
hashtable->spaceUsed = 0;
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 7a51ea6..255c506 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -128,11 +128,6 @@ typedef struct HashJoinTableData
int nbuckets; /* # buckets in the in-memory hash table */
int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */
- int nbuckets_original; /* # buckets when starting the first
- * hash */
- int nbuckets_optimal; /* optimal # buckets (per batch) */
- int log2_nbuckets_optimal; /* log2(nbuckets_optimal) */
-
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
struct HashJoinTupleData **buckets;
/* buckets array is per-batch storage, as are all the tuples */
--
2.1.0
0002-always-start-in-un-batched-mode.patchtext/x-diff; name=0002-always-start-in-un-batched-mode.patchDownload
>From 8c0c25fc894de49054fb6711af96cbcf61cc2b51 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Wed, 30 Dec 2015 14:02:26 +0100
Subject: [PATCH 2/5] always start in un-batched mode
---
src/backend/executor/nodeHash.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 7708581..e7d4b5f 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -290,7 +290,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
hashtable->skewBucketLen = 0;
hashtable->nSkewBuckets = 0;
hashtable->skewBucketNums = NULL;
- hashtable->nbatch = nbatch;
+ hashtable->nbatch = 1;
hashtable->curbatch = 0;
hashtable->nbatch_original = nbatch;
hashtable->nbatch_outstart = nbatch;
@@ -600,7 +600,15 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
if (oldnbatch > Min(INT_MAX / 2, MaxAllocSize / (sizeof(void *) * 2)))
return;
- nbatch = oldnbatch * 2;
+ /*
+ * If we're incrementing the number of batches for the first time,
+ * let's see if we should start with nbatch_original.
+ */
+ if ((oldnbatch == 1) && (hashtable->nbatch_original > 1))
+ nbatch = hashtable->nbatch_original;
+ else
+ nbatch = oldnbatch * 2;
+
Assert(nbatch > 1);
#ifdef HJDEBUG
--
2.1.0
0003-eliminate-the-MaxAllocSize-limit-now-buckets-can-be-.patchtext/x-diff; name=0003-eliminate-the-MaxAllocSize-limit-now-buckets-can-be-.patchDownload
>From b07280437d5dc3cf60b2755b2562825cd0da0ec6 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Wed, 30 Dec 2015 15:56:56 +0100
Subject: [PATCH 3/5] eliminate the MaxAllocSize limit, now buckets can be up
to ~8GB (INT_MAX/2 items)
Note: don't forget to zero buckets allocated with AllocHuge()
---
src/backend/executor/nodeHash.c | 26 +++++++++++++++-----------
1 file changed, 15 insertions(+), 11 deletions(-)
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index e7d4b5f..c53d485 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -466,22 +466,19 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
/*
* Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
- * memory is filled, assuming a single batch; but limit the value so that
- * the pointer arrays we'll try to allocate do not exceed work_mem nor
- * MaxAllocSize.
+ * memory is filled, assuming a single batch.
*
* Note that both nbuckets and nbatch must be powers of 2 to make
* ExecHashGetBucketAndBatch fast.
*/
max_pointers = (work_mem * 1024L) / sizeof(HashJoinTuple);
- max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
+
/* If max_pointers isn't a power of 2, must round it down to one */
mppow2 = 1L << my_log2(max_pointers);
if (max_pointers != mppow2)
max_pointers = mppow2 / 2;
- /* Also ensure we avoid integer overflow in nbatch and nbuckets */
- /* (this step is redundant given the current value of MaxAllocSize) */
+ /* Ensure we avoid integer overflow in nbatch and nbuckets */
max_pointers = Min(max_pointers, INT_MAX / 2);
dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
@@ -597,7 +594,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
return;
/* safety check to avoid overflow */
- if (oldnbatch > Min(INT_MAX / 2, MaxAllocSize / (sizeof(void *) * 2)))
+ if (oldnbatch > (INT_MAX / 2))
return;
/*
@@ -757,7 +754,11 @@ ExecHashBuildBuckets(HashJoinTable hashtable)
* chunks)
*/
hashtable->buckets =
- (HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
+ (HashJoinTuple *) MemoryContextAllocHuge(hashtable->batchCxt,
+ hashtable->nbuckets * sizeof(HashJoinTuple));
+
+ /* Don't forget to zero the buckets (AllocHuge does not do that). */
+ memset(hashtable->buckets, 0, hashtable->nbuckets * sizeof(HashJoinTuple));
/* scan through all tuples in all chunks to rebuild the hash table */
for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
@@ -860,8 +861,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
ntuples > (hashtable->nbuckets * NTUP_PER_BUCKET))
{
/* Guard against integer overflow and alloc size overflow */
- if (hashtable->nbuckets <= INT_MAX / 2 &&
- hashtable->nbuckets * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
+ if (hashtable->nbuckets <= INT_MAX / 2)
{
hashtable->nbuckets *= 2;
hashtable->log2_nbuckets += 1;
@@ -1201,7 +1201,11 @@ ExecHashTableReset(HashJoinTable hashtable)
/* Reallocate and reinitialize the hash bucket headers. */
hashtable->buckets =
- (HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
+ (HashJoinTuple *) MemoryContextAllocHuge(hashtable->batchCxt,
+ hashtable->nbuckets * sizeof(HashJoinTuple));
+
+ /* Don't forget to zero the buckets (AllocHuge does not do that). */
+ memset(hashtable->buckets, 0, hashtable->nbuckets * sizeof(HashJoinTuple));
hashtable->spaceUsed = 0;
--
2.1.0
Hi,
On 12/17/2015 10:28 PM, Tomas Vondra wrote:
Hi,
On 12/17/2015 07:20 PM, Robert Haas wrote:
...If this doesn't regress performance in the case where the number of
buckets is estimated accurately to begin with, then I think this is
a great idea. Can you supply some performance tests results for that
case, and maybe some of the other cases also? I don't see how it could regress performance, and the benchmarks I've
done confirm that. I'll do more thorough benchmarking and post the
results here, but not now as this patch is in 2016-01 CF and I want to
put all my time into reviewing patches from the open commitfest.
I've finally got to do more thorough benchmarks, and surprisingly there
seems to be a minor regression. The scripts I've used are attached,
along with results, but in short it joins two tables, with different
combinations:
1M x 10M
10M x 100M
5M x 250M
with different work_mem sizes (so that the smallest value uses a bunch
of batches while the largest value uses a single batch).
The 1x10 and 10x100 are designed to fit into RAM on the machine
(including the temporary files in batching mode), while 5x250 is
designed to specifically force the temp files to disk (but it's on fast
SSD array, so not a big deal).
Average timings for current master and the first two patches of the
series (0001 postpones the build of buckets and 0002 always starts
without batching) look like this (the timings are in milliseconds):
size work_mem master postpone no-batch
-----------------------------------------------------
1x10 8 5735 5760 6107
32 5939 5955 6124
128 4852 5038 5175
-----------------------------------------------------
5x250 64 158512 158429 159008
256 144046 144584 144994
1024 111478 117529 116933
-----------------------------------------------------
10x100 64 68389 68278 68530
256 66693 66415 66605
1024 48606 50654 50490
So compared to master, the differences look like this:
size work_mem postpone no-batch
-----------------------------------------
1x10 8 0.44% 6.50%
32 0.28% 3.13%
128 3.83% 6.65%
-----------------------------------------
5x250 64 -0.05% 0.31%
256 0.37% 0.66%
1024 5.43% 4.89%
-----------------------------------------
10x100 64 -0.16% 0.21%
256 -0.42% -0.13%
1024 4.21% 3.88%
So for the first patch (postponing building of buckets) with batching,
the difference is rather negligible, possibly within noise (~0.5%). When
the hash join runs with a single batch, however, the difference gets
much more significant: 4-5%.
I'm not going to discuss the second patch for now, but naturally it has
the same issue and apparently also some additional ones (at least on the
1x10 data set).
I have spent a fair amount of time profiling this, and I can't wrap my
head around where the difference comes from, though. Essentially we
don't do any additional stuff, we've just moved some of the stuff to a
different place in the code.
It might change the memory access pattern a bit, but the original
patterns are not exactly sequential either, as we're performing random
updates to an array of pointers. Or rather, we've moved them into the
BuildBuckets() method, so if something consumes more time, it should
show up in that method, I believe.
So I've measured how much time the function takes for the 1x10 and
10x100 datasets, and it's ~25ms and ~290ms respectively. That's way less
than the actual difference in query runtime, which is ~190ms and ~2050ms.
So even if we entirely skipped the bucket build, we would not recover
the difference. Not sure what's going on here.
I've also done some profiling using perf, but I haven't really found
anything suspicious there.
Any ideas what might be the cause of this?
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachments:
hashes-delayed.odsapplication/vnd.oasis.opendocument.spreadsheet; name=hashes-delayed.odsDownload
PK �3H�l9�. . mimetypeapplication/vnd.oasis.opendocument.spreadsheetPK �3Hxr�4] 4] Thumbnails/thumbnail.png�PNG
IHDR � � +%@ \�IDATx��]w|E����+��[ z�z���Q:H
MZ(�(UED�H��W��zzJ�%����a�;�dgw�b������ww��3g�y����{���5%K�L����v�\ �<��:��r���K��[X!��x�kg�:�W"���{�>�{���,b��ktw�n4�X,VW�Y�A[����Y <Z�p3�:����#�@�@��������5+i��?��?t_�����{O�<��'.�5�/�F�(/���rR�H��P5�T�(�
�#+������ArC�L?��c�t0t%?i�t��}�2:$���8y���L�2��$:��n-Q����������:�R��{���z�j��A�}�Y��is���{���{�{O<�W�^��7/SI������ (y���+V����_p���s������7i���IRZ���B������H:^� "�<x����K�(�pV?��te���oMJ�\�������w�e���7<}���|�����3��C�������/�Z�
v���?^�pa��)q�n�����S���x���o���������v��n�:�p���^�z��w�$ �.
���(b�����o�q���{���u���~*R����?u;v����3�O�<��d�x�b��\�r�*U����g�yH�m��E�%�*hX��%����@��%i���������-[��� Q�i�f�����e�������U�F�>|�A9���e������F���i��c�*V���B���T�R��_�pa���|�������7S�NEc�+Wn�����8�v����;v�8~���/_�\��{��c0v���G��]����-���/�HG�A�;v�����x�� ����4i����� �b��2<�m�.]:h��y���O��Q����3d�����H�b ��;)�b(������� -��o_�J� ���0 �7c�������o����'�4�G�����^��E����C�Pti(�e������s��E��_�>�&$�4�����7o��?�A��w���!C�����M���,Y��Q#�y�QH�:u�0{��~
��o��p1h��
����V�x��g��wA�C����\�r���N�P~��g(;~&O�<:::((X�S����-�u��q+2"Acn�� �-S��V�xz�� ��[@� �7j�7M��jtJQtA�u����!����y)�O���� ;����A++V����h&��������^J��]�P���s�AqY�����/\�F�]�6���-P� �'2�������,Y��#|
ZUB3P�������PX#�"��X�f
�~��:�L�5`pDQ�J�BG���������V3aG�y�7�o�?��x0���������'O��{�����z�~R�'}���x��,Y����������<��+��OZ�3*Z�GC@�`0U�V�����M��������������E` !@-]�4�����qD��B{�(����@��*4�N}�4f~�7p4�����_M������eKt2����X#��#F�.x1�
4 @���?�7x$t��={��A��\�}*���B9�6m����@��l
4��a,j����� ~�0�3�L�:�4lW�F�)%�=���ytb�tFZ�P�^�'�teCm�q���8 +>{�l�t6m;A��&���h��QN��BCC��G�k��5��l��
�o�����p=�L�4UJ��x�U�V�@�q����~�)e @y___"�H �#+
DG�6���"�"dT��R�N�����+��x�aoZ8):I��� �a(��7t����aCT
b��C:S�fM�A{���~I�?[�|����!P�]�t��2�7�7P�xJfo�?r�
�V����D!���2���(����+�~)
�����6Z��XQ�:X@W�n���n��5� ��0��[��$�f�km�F��N�d�-���8 �k*J(�������^1��Q4�L�,���#7��yV;������yb��y(���h���8%������D�����M��O\��T��bbMN��������������F�Ss����+�i��yY����:O�'�c�5������Eg��<�
����u) ���$�)��~��wxF����HD�pO������D8o�O�J��U�V5n�������U�,���H@�RbP�K�.5k�����Q���ax���(ftG�y���3g�}�����"�2
Q�h���,c��-��O����A@%L�8q��1��2+���4��Q._�02g�L~���%$_�cD����z�����B�
����L��k����1��5�o���T"���8 ����~�*`�����&�C�n����������#����� ��UF����]�'{�x�|����<H�""|��������3�h��r�d����$#�S����+������S�~�������i��}�%h����]u�1k`3�AS���bE�Q��k����H4�HAt ���m�����6�Zw����E�$�nX�Y�u���{��x�b����[�9\�x�l��N�b��n�:��k��i���� �I���r�7n<v��P�B�Y#G�,U�T�:u��'����8������C��*U��%KO��y�7l/C���(�1b�?����I�&1���j�y�����?���B���c��<#yQ�� �<)(t�n��.u\�<<���V
�JN�%vY��C���0+��p�d� )C"7�1c�8��B�2�
�mo�4�$j-�: ^�7��"��E��Gl���p�no�v�-XD���
���eox���O���fU����n����3�e�d��G�|�t�@�l4������p/��Zw1����&R�cV�D2�L,lp��������S
���nH��G������K�={v����T!"��sV;8{�l��)s��y���#G��)S�^��U*���EW��Q�`A`���7rU����Xx��f��066>�F����n�m���C�v���o���&l�c��!C��Lt���O�>�j�Z�d��I��e���&h���-Z4e��e��!~�Q{�&�?~|��q��5<xp������$}��9
����7d�+��cG�?�2 /���E� ����p�
���k���_��[76��f~C�A��6*V�U~Cg2e�4y��)Rl���x��P �D`���%Obf�����;e��Y���U� ��!I@��"