Performance of IN-clause with many elements and possible solutions
Hello.
I have a database with events of types from different sources identified by
id. I have a query which filters events by an IN-clause with many ids (1-500
ids). I see poor performance of the IN-clause and am trying to investigate this
problem.
SELECT version();
version
-------------------------------------------------------------------------------------------------------------------
PostgreSQL 10beta2 on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu
5.4.0-6ubuntu1~16.04.4) 5.4.0 20160609, 64-bit
-- Full table can fit in memory
show shared_buffers;
shared_buffers
----------------
2GB
show work_mem;
work_mem
----------
16MB
SET max_parallel_workers_per_gather TO 0;
SET max_parallel_workers TO 0;
-- Create table with 10 000 000 rows with 500 bigints
CREATE TABLE ids AS SELECT trunc(random() * 500)::bigint as id from
generate_series(1, 10000000);
-- IN (...)
SELECT ('(' || string_agg(id::text, ',') || ')') AS in_clause
FROM (SELECT id FROM ids GROUP BY id ORDER BY id LIMIT 200) AS s \gset
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN :in_clause;
Aggregate (cost=2654265.02..2654265.03 rows=1 width=8) (actual
time=17268.831..17268.831 rows=1 loops=1)
Buffers: shared hit=44248
-> Seq Scan on ids (cost=0.00..2644260.48 rows=4001815 width=0)
(actual time=0.066..16722.072 rows=3998646 loops=1)
Filter: (id = ANY
('{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199}'::bigint[]))
Rows Removed by Filter: 6001354
Buffers: shared hit=44248
Planning time: 3.324 ms
Execution time: 17268.907 ms
-- IN (VALUES ...)
SELECT ('(VALUES ' || string_agg('(' || id::text || ')', ',') || ')') AS
values_clause
FROM (SELECT id FROM ids GROUP BY id ORDER BY id LIMIT 200) AS s \gset
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN
:values_clause;
Aggregate (cost=245006.46..245006.47 rows=1 width=8) (actual
time=4086.188..4086.188 rows=1 loops=1)
Buffers: shared hit=44248
-> Hash Join (cost=7.50..235006.42 rows=4000019 width=0) (actual
time=0.978..3557.037 rows=3998646 loops=1)
Hash Cond: (ids.id = "*VALUES*".column1)
Buffers: shared hit=44248
-> Seq Scan on ids (cost=0.00..144248.48 rows=10000048 width=8)
(actual time=0.031..1138.542 rows=10000000 loops=1)
Buffers: shared hit=44248
-> Hash (cost=5.00..5.00 rows=200 width=4) (actual
time=0.923..0.923 rows=200 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 16kB
-> HashAggregate (cost=3.00..5.00 rows=200 width=4)
(actual time=0.606..0.759 rows=200 loops=1)
Group Key: "*VALUES*".column1
-> Values Scan on "*VALUES*" (cost=0.00..2.50
rows=200 width=4) (actual time=0.003..0.330 rows=200 loops=1)
Planning time: 1.094 ms
Execution time: 4086.333 ms
-- '...'::hstore ? id
SELECT ('''' || string_agg(id::text || '=>NULL', ',') || '''::hstore') AS
hstore_clause
FROM (SELECT id FROM ids GROUP BY id ORDER BY id LIMIT 200) AS s \gset
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE :hstore_clause ?
id::text;
Planning time: 0.206 ms
Execution time: 5032.794 ms
-- '...'::jsonb ? id
SELECT ('''{' || string_agg('"' || id::text || '": null', ',') ||
'}''::jsonb') AS jsonb_clause
FROM (SELECT id FROM ids GROUP BY id ORDER BY id LIMIT 200) AS s \gset
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE :jsonb_clause ?
id::text;
Planning time: 0.114 ms
Execution time: 9277.307 ms
The IN-VALUES clause has the best performance. So I have some questions:
- Maybe a better solution exists?
- Does PostgreSQL have support for a hashset structure? An extension (I didn't
find one)?
- The IN-VALUES clause adds a new node to the plan. Does the additional node have
big overhead? How about filtering by two or more IN-VALUES clauses?
Thanks.
On Sun, 23 Jul 2017 14:35:24 +0300
"dilaz03 ." <dilaz03@gmail.com> wrote:
Hello.
I have a database with events of types from different sources identified by
id. I have a query which filters events by an IN-clause with many ids (1-500
ids). I see poor performance of the IN-clause and am trying to investigate this
problem.
SELECT version();
version
-------------------------------------------------------------------------------------------------------------------
PostgreSQL 10beta2 on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu
5.4.0-6ubuntu1~16.04.4) 5.4.0 20160609, 64-bit-- Full table can fit in memory
show shared_buffers;
shared_buffers
----------------
2GBshow work_mem;
work_mem
----------
16MBSET max_parallel_workers_per_gather TO 0;
SET max_parallel_workers TO 0;-- Create table with 10 000 000 rows with 500 bigints
CREATE TABLE ids AS SELECT trunc(random() * 500)::bigint as id from
generate_series(1, 10000000);-- IN (...)
SELECT ('(' || string_agg(id::text, ',') || ')') AS in_clause
FROM (SELECT id FROM ids GROUP BY id ORDER BY id LIMIT 200) AS s \gsetEXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN :in_clause;
Aggregate (cost=2654265.02..2654265.03 rows=1 width=8) (actual
time=17268.831..17268.831 rows=1 loops=1)
Buffers: shared hit=44248
-> Seq Scan on ids (cost=0.00..2644260.48 rows=4001815 width=0)
(actual time=0.066..16722.072 rows=3998646 loops=1)
Filter: (id = ANY
('{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199}'::bigint[]))
Rows Removed by Filter: 6001354
Buffers: shared hit=44248
Planning time: 3.324 ms
Execution time: 17268.907 ms
In this example you count approximately 40,000,000 values, which is
about 40% of the table. That's going to take time, and most of it
is going to be CPU (since the table fits in memory). If your table
must be structured this way and if these are the queries you're going
to run, the only thing I can expect will speed them up is a faster
processor (note, NOT more CPU cores, but faster). You don't mention
what kind of CPU you have, but I'm guessing it's already pretty fast
and you can't really get anything but marginally faster.
If you really need these queries to be faster, I would suggest
materializing the data, i.e. create a table like:
CREATE TABLE id_counts (
id BIGINT PRIMARY KEY,
num BIGINT
)
Then use a trigger or similar technique to keep id_counts in sync
with the id table. You can then run queries of the form:
SELECT sum(num) FROM id_counts WHERE id IN :values:
which I would wager houseboats will be significantly faster.
-- IN (VALUES ...)
SELECT ('(VALUES ' || string_agg('(' || id::text || ')', ',') || ')') AS
values_clause
FROM (SELECT id FROM ids GROUP BY id ORDER BY id LIMIT 200) AS s \gsetEXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN
:values_clause;
Aggregate (cost=245006.46..245006.47 rows=1 width=8) (actual
time=4086.188..4086.188 rows=1 loops=1)
Buffers: shared hit=44248
-> Hash Join (cost=7.50..235006.42 rows=4000019 width=0) (actual
time=0.978..3557.037 rows=3998646 loops=1)
Hash Cond: (ids.id = "*VALUES*".column1)
Buffers: shared hit=44248
-> Seq Scan on ids (cost=0.00..144248.48 rows=10000048 width=8)
(actual time=0.031..1138.542 rows=10000000 loops=1)
Buffers: shared hit=44248
-> Hash (cost=5.00..5.00 rows=200 width=4) (actual
time=0.923..0.923 rows=200 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 16kB
-> HashAggregate (cost=3.00..5.00 rows=200 width=4)
(actual time=0.606..0.759 rows=200 loops=1)
Group Key: "*VALUES*".column1
-> Values Scan on "*VALUES*" (cost=0.00..2.50
rows=200 width=4) (actual time=0.003..0.330 rows=200 loops=1)
Planning time: 1.094 ms
Execution time: 4086.333 ms-- '...'::hstore ? id
SELECT ('''' || string_agg(id::text || '=>NULL', ',') || '''::hstore') AS
hstore_clause
FROM (SELECT id FROM ids GROUP BY id ORDER BY id LIMIT 200) AS s \gsetEXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE :hstore_clause ?
id::text;
Planning time: 0.206 ms
Execution time: 5032.794 ms-- '...'::jsonb ? id
SELECT ('''{' || string_agg('"' || id::text || '": null', ',') ||
'}''::jsonb') AS jsonb_clause
FROM (SELECT id FROM ids GROUP BY id ORDER BY id LIMIT 200) AS s \gsetEXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE :jsonb_clause ?
id::text;
Planning time: 0.114 ms
Execution time: 9277.307 ms
The IN-VALUES clause has the best performance. So I have some questions:
- Maybe a better solution exists?
- Does PostgreSQL have support for a hashset structure? An extension (I didn't
find one)?
- The IN-VALUES clause adds a new node to the plan. Does the additional node have
big overhead? How about filtering by two or more IN-VALUES clauses?
Thanks.
--
PT <wmoran@potentialtech.com>
--
Sent via pgsql-general mailing list (pgsql-general@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general
On 07/24/2017 01:40 AM, PT wrote:
In this example you count approximately 40,000,000 values, which is
about 40% of the table.
4 000 000 (:
If you really need these queries to be faster, I would suggest
materializing the data, i.e. create a table like:CREATE TABLE id_counts (
id BIGINT PRIMARY KEY,
num BIGINT
)Then use a trigger or similar technique to keep id_counts in sync
with the id table. You can then run queries of the form:SELECT sum(num) FROM id_counts WHERE id IN :values:
which I would wager houseboats will be significantly faster.
I use count only as an example because it uses a seqscan. I want to optimize
the IN-clause ;-).
Thanks.
--
Sent via pgsql-general mailing list (pgsql-general@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general
On 07/24/2017 01:40 AM, PT wrote:
In this example you count approximately 40,000,000 values, which is
about 40% of the table.
4 000 000 (:
If you really need these queries to be faster, I would suggest
materializing the data, i.e. create a table like:CREATE TABLE id_counts (
id BIGINT PRIMARY KEY,
num BIGINT
)Then use a trigger or similar technique to keep id_counts in sync
with the id table. You can then run queries of the form:SELECT sum(num) FROM id_counts WHERE id IN :values:
which I would wager houseboats will be significantly faster.
I use count only as an example because it uses a seqscan. I want to optimize
the IN-clause ;-).
Thanks.
--
Sent via pgsql-general mailing list (pgsql-general@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general
On Mon, 24 Jul 2017 13:17:56 +0300
Dmitry Lazurkin <dilaz03@gmail.com> wrote:
On 07/24/2017 01:40 AM, PT wrote:
In this example you count approximately 40,000,000 values, which is
about 40% of the table.4 000 000 (:
If you really need these queries to be faster, I would suggest
materializing the data, i.e. create a table like:CREATE TABLE id_counts (
id BIGINT PRIMARY KEY,
num BIGINT
)Then use a trigger or similar technique to keep id_counts in sync
with the id table. You can then run queries of the form:SELECT sum(num) FROM id_counts WHERE id IN :values:
which I would wager houseboats will be significantly faster.
I use count only as an example because it uses a seqscan. I want to optimize
the IN-clause ;-).
The IN clause is not what's taking all the time. It's the processing of
millions of rows that's taking all the time.
Perhaps you should better describe what it is you really want to accomplish.
Regardless of what it is, if it involves processing many millions of rows,
you're probably going to need to do some sort of materialization.
--
PT <wmoran@potentialtech.com>
--
Sent via pgsql-general mailing list (pgsql-general@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general
On Sun, Jul 23, 2017 at 4:35 AM, dilaz03 . <dilaz03@gmail.com> wrote:
- IN-VALUES clause adds new node to plan. Has additional node big
overhead? How about filter by two or more IN-VALUES clause?
IN-VALUES is just another word for "TABLE" which is another word for
"RELATION". Writing relational database queries that use explicit
relations is generally going to give you the best performance.
Basically you want to write something like:
SELECT *
FROM ids
JOIN ( :values_clause ) vc (vid) ON (vc.vid = ids.id)
or
WITH vc AS (SELECT vid FROM .... ORDER BY ... LIMIT )
SELECT *
FROM ids
JOIN vc ON (vid = ids.id)
"IN ('l1','l2','l3')" is nice and all but as demonstrated the mechanics of
executing that are different, and slower, than processing relations and
tuples. For a small number of items the difference is generally not
meaningful and so the convenience of writing (IN (...)) is worth taking.
David J.
On 25.07.2017 00:17, PT wrote:
The IN clause is not what's taking all the time. It's the processing of
millions of rows that's taking all the time.
IN (...) - 17 sec
IN (VALUES ...) - 4 sec
So the performance issue is with the IN-clause.
Perhaps you should better describe what it is you really want to accomplish.
Regardless of what it is, if it involves processing many millions of rows,
you're probably going to need to do some sort of materialization.
I am trying to find better solutions for the IN-task.
--
Sent via pgsql-general mailing list (pgsql-general@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general
On 25.07.2017 00:31, David G. Johnston wrote:
Basically you want to write something like:
SELECT *
FROM ids
JOIN ( :values_clause ) vc (vid) ON (vc.vid = ids.id <http://ids.id>)or
WITH vc AS (SELECT vid FROM .... ORDER BY ... LIMIT )
SELECT *
FROM ids
JOIN vc ON (vid = ids.id <http://ids.id>)
This query uses JOIN plan node as IN (VALUES ...).
And I have one question. I don't understand why IN-VALUES doesn't use
Semi-Join? PostgreSQL has Hash Semi-Join... For which task the database
has node of this type?
On Mon, Jul 24, 2017 at 3:12 PM, Dmitry Lazurkin <dilaz03@gmail.com> wrote:
And I have one question. I don't understand why IN-VALUES doesn't use
Semi-Join? PostgreSQL has Hash Semi-Join... For which task the database
has node of this type?
Semi-Join is canonically written as:
SELECT *
FROM tbl
WHERE EXISTS (SELECT 1 FROM tbl2 WHERE tbl.id = tbl2.id)
The main difference between IN and EXISTS is NULL semantics.
David J.
On 25.07.2017 01:15, David G. Johnston wrote:
On Mon, Jul 24, 2017 at 3:12 PM, Dmitry Lazurkin <dilaz03@gmail.com
<mailto:dilaz03@gmail.com>>wrote:And I have one question. I don't understand why IN-VALUES doesn't
use Semi-Join? PostgreSQL has Hash Semi-Join... For which task
the database has node of this type?Semi-Join is canonically written as:
SELECT *
FROM tbl
WHERE EXISTS (SELECT 1 FROM tbl2 WHERE tbl.id <http://tbl.id> =
tbl2.id <http://tbl2.id>)The main difference between IN and EXISTS is NULL semantics.
David J.
ALTER TABLE ids ALTER COLUMN id SET NOT NULL;
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN
:values_clause;
Aggregate (cost=245006.46..245006.47 rows=1 width=8) (actual
time=3824.095..3824.095 rows=1 loops=1)
Buffers: shared hit=44248
-> Hash Join (cost=7.50..235006.42 rows=4000019 width=0) (actual
time=1.108..3327.112 rows=3998646 loops=1)
...
Hmmm. No Semi-Join.
PostgreSQL can use Semi-Join for IN too.
On Mon, Jul 24, 2017 at 3:22 PM, Dmitry Lazurkin <dilaz03@gmail.com> wrote:
ALTER TABLE ids ALTER COLUMN id SET NOT NULL;
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN
:values_clause;Aggregate (cost=245006.46..245006.47 rows=1 width=8) (actual
time=3824.095..3824.095 rows=1 loops=1)
Buffers: shared hit=44248
-> Hash Join (cost=7.50..235006.42 rows=4000019 width=0) (actual
time=1.108..3327.112 rows=3998646 loops=1)
...
You haven't constrained the outer relation (i.e., :values_clause) to be
non-null which is what I believe is required for the semi-join algorithm to
be considered.
David J.
On 25.07.2017 01:25, David G. Johnston wrote:
On Mon, Jul 24, 2017 at 3:22 PM, Dmitry Lazurkin <dilaz03@gmail.com
<mailto:dilaz03@gmail.com>>wrote:ALTER TABLE ids ALTER COLUMN id SET NOT NULL;
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN
:values_clause;Aggregate (cost=245006.46..245006.47 rows=1 width=8) (actual
time=3824.095..3824.095 rows=1 loops=1)
Buffers: shared hit=44248
-> Hash Join (cost=7.50..235006.42 rows=4000019 width=0)
(actual time=1.108..3327.112 rows=3998646 loops=1)
...You haven't constrained the outer relation (i.e., :values_clause) to
be non-null which is what I believe is required for the semi-join
algorithm to be considered.David J.
CREATE TABLE second_ids (i bigint);
INSERT INTO second_ids :values_clause;
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN (select
i from second_ids);
Aggregate (cost=225004.36..225004.37 rows=1 width=8) (actual
time=3826.641..3826.641 rows=1 loops=1)
Buffers: shared hit=44249
-> Hash Semi Join (cost=5.50..215004.32 rows=4000019 width=0)
(actual time=0.352..3338.601 rows=3998646 loops=1)
Hash Cond: (ids.id = second_ids.i)
Buffers: shared hit=44249
-> Seq Scan on ids (cost=0.00..144248.48 rows=10000048
width=8) (actual time=0.040..1069.006 rows=10000000 loops=1)
Buffers: shared hit=44248
-> Hash (cost=3.00..3.00 rows=200 width=8) (actual
time=0.288..0.288 rows=200 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 16kB
Buffers: shared hit=1
-> Seq Scan on second_ids (cost=0.00..3.00 rows=200
width=8) (actual time=0.024..0.115 rows=200 loops=1)
Buffers: shared hit=1
Planning time: 0.413 ms
Execution time: 3826.752 ms
Hash Semi-Join without NOT NULL constraint on second table.
"David G. Johnston" <david.g.johnston@gmail.com> writes:
On Mon, Jul 24, 2017 at 3:22 PM, Dmitry Lazurkin <dilaz03@gmail.com> wrote:
ALTER TABLE ids ALTER COLUMN id SET NOT NULL;
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN
:values_clause;Aggregate (cost=245006.46..245006.47 rows=1 width=8) (actual
time=3824.095..3824.095 rows=1 loops=1)
Buffers: shared hit=44248
-> Hash Join (cost=7.50..235006.42 rows=4000019 width=0) (actual
time=1.108..3327.112 rows=3998646 loops=1)
...
You haven't constrained the outer relation (i.e., :values_clause) to be
non-null which is what I believe is required for the semi-join algorithm to
be considered.
No, the planner is thinking about semi-join, it just decides it prefers
to de-dup and then do a plain join. I believe this is mainly because it
lacks statistics about the inner relation and is conservative about what
it assumes about the number of duplicates in the absence of stats.
But you can force it. Taking the original example (and being sure to
have ANALYZE'd ids):
regression=# EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN
:values_clause;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=245006.16..245006.17 rows=1 width=8) (actual time=3550.581..3550.581 rows=1 loops=1)
Buffers: shared hit=2208 read=42040
-> Hash Join (cost=7.50..235006.13 rows=4000013 width=0) (actual time=0.494..3093.100 rows=4002875 loops=1)
Hash Cond: (ids.id = "*VALUES*".column1)
Buffers: shared hit=2208 read=42040
-> Seq Scan on ids (cost=0.00..144248.33 rows=10000033 width=8) (actual time=0.071..1118.278 rows=10000000 loops=1)
Buffers: shared hit=2208 read=42040
-> Hash (cost=5.00..5.00 rows=200 width=4) (actual time=0.404..0.404 rows=200 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 16kB
-> HashAggregate (cost=3.00..5.00 rows=200 width=4) (actual time=0.267..0.332 rows=200 loops=1)
Group Key: "*VALUES*".column1
-> Values Scan on "*VALUES*" (cost=0.00..2.50 rows=200 width=4) (actual time=0.003..0.134 rows=200 loops=1)
Planning time: 0.561 ms
Execution time: 3550.700 ms
(14 rows)
regression=# set enable_hashagg TO 0;
SET
regression=# EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN
:values_clause;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=245012.31..245012.32 rows=1 width=8) (actual time=3553.194..3553.194 rows=1 loops=1)
Buffers: shared hit=2240 read=42008
-> Hash Join (cost=13.64..235012.28 rows=4000013 width=0) (actual time=0.545..3093.434 rows=4002875 loops=1)
Hash Cond: (ids.id = "*VALUES*".column1)
Buffers: shared hit=2240 read=42008
-> Seq Scan on ids (cost=0.00..144248.33 rows=10000033 width=8) (actual time=0.072..1118.853 rows=10000000 loops=1)
Buffers: shared hit=2240 read=42008
-> Hash (cost=11.14..11.14 rows=200 width=4) (actual time=0.452..0.452 rows=200 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 16kB
-> Unique (cost=10.14..11.14 rows=200 width=4) (actual time=0.227..0.384 rows=200 loops=1)
-> Sort (cost=10.14..10.64 rows=200 width=4) (actual time=0.226..0.276 rows=200 loops=1)
Sort Key: "*VALUES*".column1
Sort Method: quicksort Memory: 35kB
-> Values Scan on "*VALUES*" (cost=0.00..2.50 rows=200 width=4) (actual time=0.003..0.134 rows=200 loops=1)
Planning time: 0.567 ms
Execution time: 3553.297 ms
(16 rows)
regression=# set enable_sort TO 0;
SET
regression=# EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM ids WHERE id IN
:values_clause;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=320003.90..320003.91 rows=1 width=8) (actual time=3548.364..3548.364 rows=1 loops=1)
Buffers: shared hit=2272 read=41976
-> Hash Semi Join (cost=5.00..310003.87 rows=4000013 width=0) (actual time=0.331..3091.235 rows=4002875 loops=1)
Hash Cond: (ids.id = "*VALUES*".column1)
Buffers: shared hit=2272 read=41976
-> Seq Scan on ids (cost=0.00..144248.33 rows=10000033 width=8) (actual time=0.071..1117.761 rows=10000000 loops=1)
Buffers: shared hit=2272 read=41976
-> Hash (cost=2.50..2.50 rows=200 width=4) (actual time=0.236..0.236 rows=200 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 16kB
-> Values Scan on "*VALUES*" (cost=0.00..2.50 rows=200 width=4) (actual time=0.003..0.142 rows=200 loops=1)
Planning time: 0.545 ms
Execution time: 3548.463 ms
(12 rows)
The cost to form the inner hash is basically negligible whether it's
de-duped or not, but if it's not (known) de-duped then the cost
estimate for the semijoin is going to rise some, and that discourages
selecting it.
At least in this example, the actual runtimes are basically identical
regardless, so there is no great point in sweating over it.
regards, tom lane
--
Sent via pgsql-general mailing list (pgsql-general@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general
On Jul 24, 2017 14:19, "PT" <wmoran@potentialtech.com> wrote:
On Mon, 24 Jul 2017 13:17:56 +0300
Dmitry Lazurkin <dilaz03@gmail.com> wrote:
On 07/24/2017 01:40 AM, PT wrote:
In this example you count approximately 40,000,000 values, which is
about 40% of the table.4 000 000 (:
If you really need these queries to be faster, I would suggest
materializing the data, i.e. create a table like:CREATE TABLE id_counts (
id BIGINT PRIMARY KEY,
num BIGINT
)Then use a trigger or similar technique to keep id_counts in sync
with the id table. You can then run queries of the form:SELECT sum(num) FROM id_counts WHERE id IN :values:
which I would wager houseboats will be significantly faster.
I use count only as an example because it uses a seqscan. I want to optimize
the IN-clause ;-).
The IN clause is not what's taking all the time. It's the processing of
millions of rows that's taking all the time.
It isn't either-or. It is the processing of millions of rows over the
large in-list which is taking the time. Processing an in-list as a hash
table would be great, but no one has gotten around to it implementing it
yet. Maybe Dmitry will be the one to do that.
Cheers,
Jeff
On Mon, Jul 24, 2017 at 3:46 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
The cost to form the inner hash is basically negligible whether it's
de-duped or not, but if it's not (known) de-duped then the cost
estimate for the semijoin is going to rise some, and that discourages
selecting it.
Why does the "hash semi join" care about duplication of values on the
inner relation? Doesn't it only care whether a given bucket exists
irrespective of its contents?
Looking at those explains it would seem the "hash semi join" is simply an
inherently more expensive to execute compared to a "hash join" and that the
act of de-duping the inner relation would have to be quite expensive to
overcome the gap. I cannot reconcile this with the previous paragraph
though...
Pointing me to the readme or code file (comments) that explains this in
more detail would be welcome. Not sure what to grep for - "Hash Semi Join"
only turns up a couple of expected output results...
Thx.
David J.
On Mon, Jul 24, 2017 at 7:58 PM, David G. Johnston <
david.g.johnston@gmail.com> wrote:
On Mon, Jul 24, 2017 at 3:46 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
The cost to form the inner hash is basically negligible whether it's
de-duped or not, but if it's not (known) de-duped then the cost
estimate for the semijoin is going to rise some, and that discourages
selecting it.Why does the "hash semi join" care about duplication of values on the
inner relation? Doesn't it only care whether a given bucket exists
irrespective of its contents?
Rather, it cares about the contents is-so-far as confirming that at least
one of the tuples in the bucket indeed has the same joining value as the
outer relation (lost track of the fact that two values can share the same
hash). But once it finds one it can move onto the new outer relation tuple
while an inner join would have to spend more time looking for additional
matches.
David J.
"David G. Johnston" <david.g.johnston@gmail.com> writes:
On Mon, Jul 24, 2017 at 3:46 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
The cost to form the inner hash is basically negligible whether it's
de-duped or not, but if it's not (known) de-duped then the cost
estimate for the semijoin is going to rise some, and that discourages
selecting it.
Why does the "hash semi join" care about duplication of values on the
inner relation? Doesn't it only care whether a given bucket exists
irrespective of its contents?
No, it cares about the average bucket size, ie number of entries that
a probe will have to look at. The non-de-duped inner relation can't
be assumed to have a flat distribution among the buckets.
Pointing me to the readme or code file (comments) that explains this in
more detail would be welcome.
Look for estimate_hash_bucketsize in selfuncs.c and its caller in
costsize.c. The key point is this argument in that function's
header comment:
* We are passed the number of buckets the executor will use for the given
* input relation. If the data were perfectly distributed, with the same
* number of tuples going into each available bucket, then the bucketsize
* fraction would be 1/nbuckets. But this happy state of affairs will occur
* only if (a) there are at least nbuckets distinct data values, and (b)
* we have a not-too-skewed data distribution. Otherwise the buckets will
* be nonuniformly occupied. If the other relation in the join has a key
* distribution similar to this one's, then the most-loaded buckets are
* exactly those that will be probed most often. Therefore, the "average"
* bucket size for costing purposes should really be taken as something close
* to the "worst case" bucket size. We try to estimate this by adjusting the
* fraction if there are too few distinct data values, and then scaling up
* by the ratio of the most common value's frequency to the average frequency.
*
* If no statistics are available, use a default estimate of 0.1. This will
* discourage use of a hash rather strongly if the inner relation is large,
* which is what we want. We do not want to hash unless we know that the
* inner rel is well-dispersed (or the alternatives seem much worse).
That's certainly a bit pessimistic, but the thing to remember about any
hashing algorithm is that occasionally it will eat your lunch. We prefer
to go to sort-based algorithms if there seems to be a risk of the hash
degenerating to O(N^2).
regards, tom lane
--
Sent via pgsql-general mailing list (pgsql-general@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general
On Mon, Jul 24, 2017 at 8:11 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
[*docs]
If the data were perfectly distributed, with the same
* number of tuples going into each available bucket, then the bucketsize
* fraction would be 1/nbuckets. But this happy state of affairs will
occur
* only if (a) there are at least nbuckets distinct data values, and (b)
* we have a not-too-skewed data distribution. Otherwise the buckets will
* be nonuniformly occupied.
Thanks, I have a better feel now. Using this example (200 inner relation
rows) is pretty poor since at this scale there doesn't seem to be enough
data to make a noticeable difference.
But anyway, the above comment is only being applied when dealing with a
non-unique inner relation; however, the fraction used is 1/nbuckets for
any unique relation regardless of its size.
if (IsA(inner_path, UniquePath))
innerbucketsize = 1.0 / virtualbuckets;
else
And to clarify for others only reading this...the 200 on the "VALUES" node
is there because there are 200 literal values in the value_list. The 200
on the resulting Hash (and HashAggregate in the example) node is there
because of DEFAULT_NUM_DISTINCT (changing the query limit to 300 only
changed the former). Further, since it is only the default, the fraction
used charged out is 1/10 instead of 1/200 that would used if the 200 were a
real number instead - or 1/1024 if those 200 rows were known to be
themselves unique.
For me, I'm seeing that the expected number of input rows doesn't factor
into the innerbucketsize computation directly (possibly excepting a scaling
factor adjustment).
I can understand better, now, why this seemingly perfect example of a
semi-join query gets executed with an extra distinct/grouping node.
David J.
On 25.07.2017 05:50, Jeff Janes wrote:
It isn't either-or. It is the processing of millions of rows over the
large in-list which is taking the time. Processing an in-list as a
hash table would be great, but no one has gotten around to it
implementing it yet. Maybe Dmitry will be the one to do that.
Thanks. Yes, I want to. But... Is this task too complex for a novice?
--
Sent via pgsql-general mailing list (pgsql-general@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general
On 23.07.2017 14:35, dilaz03 . wrote:
- IN-VALUES clause adds new node to plan. Has additional node big
overhead? How about filter by two or more IN-VALUES clause?
Hmmm. This works.
-- Full table can fit in memory
show shared_buffers;
shared_buffers
----------------
4GB
show work_mem;
work_mem
----------
16MB
SET max_parallel_workers_per_gather TO 0;
SET max_parallel_workers TO 0;
-- 10 000 000 events of 30 types from 500 sources
CREATE TABLE events AS
SELECT trunc(random() * 500)::bigint AS source_id, md5(trunc(random() *
30)::text) AS type
FROM generate_series(1, 10000000);
-- Prepare all clauses
SELECT ('(' || string_agg(source_id::text, ',') || ')') AS
source_id_in_clause
FROM (SELECT source_id FROM events GROUP BY source_id ORDER BY source_id
LIMIT 200) AS s \gset
SELECT ('(' || string_agg(('''' || type || ''''), ',') || ')') AS
type_in_clause
FROM (SELECT type FROM events GROUP BY type ORDER BY type LIMIT 100) AS
s \gset
SELECT ('(VALUES ' || string_agg('(' || source_id::text || ')', ',') ||
')') AS source_id_values_clause
FROM (SELECT source_id FROM events GROUP BY source_id ORDER BY source_id
LIMIT 200) AS s \gset
SELECT ('(VALUES ' || string_agg('(''' || type::text || ''')', ',') ||
')') AS type_values_clause
FROM (SELECT type FROM events GROUP BY type ORDER BY type LIMIT 100) AS
s \gset
-- Run queries
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM events WHERE source_id
IN :source_id_in_clause AND type IN :type_in_clause;
Execution time: 21314.277 ms
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM events WHERE source_id
IN :source_id_values_clause AND type IN :type_in_clause;
Execution time: 9421.592 ms
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM events WHERE source_id
IN :source_id_in_clause AND type IN :type_values_clause;
Execution time: 17598.467 ms
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM events WHERE source_id
IN :source_id_values_clause AND type IN :type_values_clause;
Execution time: 5589.925 ms
--
Sent via pgsql-general mailing list (pgsql-general@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general