BUG #18330: The query planner chooses the wrong plan when using the parallel aggregation function
The following bug has been logged on the website:
Bug reference: 18330
Logged by: Michael Efremov
Email address: efremov20081@gmail.com
PostgreSQL version: 15.1
Operating system: Alpine 12.2.1_git20220924-r4
Description:
Below is an example of queries. It shows that the query planner chooses
sequential aggregation instead of parallel, although limit should not have
affected this.
-- FIRSTLY - create parallel agg function
CREATE TYPE top1_units_weights_state_v1_parallel AS (
test jsonb
);
CREATE OR REPLACE FUNCTION agg_top1_units_weights_transition_v1_parallel(
state top1_units_weights_state_v1_parallel,
test jsonb
) RETURNS top1_units_weights_state_v1_parallel AS $$
begin
RETURN state;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION agg_top1_units_weights_final_v1_parallel(
state top1_units_weights_state_v1_parallel
) RETURNS top1_units_weights_state_v1_parallel AS $$
begin
RAISE NOTICE 'agg_top1_units_weights_final_v1_parallel:% ', state;
RETURN state;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION agg_top1_units_weights_combinefunc_v1_parallel(
first top1_units_weights_state_v1_parallel,
second top1_units_weights_state_v1_parallel
) RETURNS top1_units_weights_state_v1_parallel AS $$
begin
RAISE NOTICE 'agg_top1_units_weights_combinefunc_v1_parallel:% ', first;
RETURN first;
END;
$$ LANGUAGE plpgsql PARALLEL SAFE;
CREATE OR REPLACE AGGREGATE agg_top1_units_weights_v1_parallel(jsonb) (
sfunc = agg_top1_units_weights_transition_v1_parallel,
stype = top1_units_weights_state_v1_parallel,
finalfunc = agg_top1_units_weights_final_v1_parallel,
initcond = '({})',
COMBINEFUNC = agg_top1_units_weights_combinefunc_v1_parallel,
parallel = SAFE
);
-- SECONDLY - create test data
CREATE TABLE public.test_jsonb_agg(
test_jsonb jsonb not null
);
CREATE OR REPLACE FUNCTION random_between(low INT ,high INT)
RETURNS INT AS
$$
BEGIN
RETURN floor(random()* (high-low + 1) + low);
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION random_int_array(dim integer, min integer, max
integer) RETURNS integer[] AS $BODY$
begin
return (select array_agg(random_between(min,max)) from generate_series (0,
dim));
end
$BODY$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION random_jsonb_array_of_array(dim integer, min
integer, max integer) RETURNS jsonb AS $BODY$
declare
res jsonb = '[]'::jsonb;
cnt integer = random_between(1,4);
begin
loop
res = jsonb_build_array(random_int_array(dim,min,max)) || res;
IF cnt > 3 THEN
return res;
END IF;
cnt = cnt + 1;
END LOOP;
end
$BODY$ LANGUAGE plpgsql;
INSERT into test_jsonb_agg
select
jsonb_array as jsonb_array
from (
select
random_jsonb_array_of_array(random_between(1,5), 1, 500) as
jsonb_array
from generate_series(0,100000)
) as data_t;
-- THIRDLY - check two times
analyze test_jsonb_agg;
explain (analyze,verbose,timing,costs,buffers)
with test_speed as not MATERIALIZED(
select test_jsonb as col
from test_jsonb_agg -- table size is 100_000
limit 900000 -- comment out this line and the parallel aggregation plan will be used.
) select agg_top1_units_weights_v1_parallel(col)
from test_speed
-> output
Aggregate (cost=29174.52..29174.53 rows=1 width=32) (actual
time=50.872..50.872 rows=1 loops=1)
Output: agg_top1_units_weights_v1_parallel(test_jsonb_agg.test_jsonb)
Buffers: shared hit=2174
-> Limit (cost=0.00..3174.01 rows=100001 width=145) (actual
time=0.007..10.412 rows=100001 loops=1)
Output: test_jsonb_agg.test_jsonb
Buffers: shared hit=2174
-> Seq Scan on public.test_jsonb_agg (cost=0.00..3174.01
rows=100001 width=145) (actual time=0.007..4.928 rows=100001 loops=1)
Output: test_jsonb_agg.test_jsonb
Buffers: shared hit=2174
Query Identifier: 8741670630168910811
Planning Time: 0.051 ms
Execution Time: 50.920 ms
explain (analyze,verbose,timing,costs,buffers)
with test_speed as not MATERIALIZED(
select test_jsonb as col
from test_jsonb_agg -- table size is 100_000
--limit 900000 -- with this line commented out, the parallel aggregation plan is used.
) select agg_top1_units_weights_v1_parallel(col)
from test_speed
-> output
Finalize Aggregate (cost=14008.38..14008.39 rows=1 width=32) (actual
time=28.777..32.664 rows=1 loops=1)
Output: agg_top1_units_weights_v1_parallel(test_jsonb_agg.test_jsonb)
Buffers: shared hit=2364
-> Gather (cost=14007.42..14007.63 rows=2 width=32) (actual
time=28.619..32.508 rows=3 loops=1)
Output: (PARTIAL
agg_top1_units_weights_v1_parallel(test_jsonb_agg.test_jsonb))
Workers Planned: 2
Workers Launched: 2
Buffers: shared hit=2364
-> Partial Aggregate (cost=13007.42..13007.43 rows=1 width=32)
(actual time=15.567..15.568 rows=1 loops=3)
Output: PARTIAL
agg_top1_units_weights_v1_parallel(test_jsonb_agg.test_jsonb)
Buffers: shared hit=2364
Worker 0: actual time=9.197..9.198 rows=1 loops=1
Buffers: shared hit=501
Worker 1: actual time=9.201..9.202 rows=1 loops=1
Buffers: shared hit=520
-> Parallel Seq Scan on public.test_jsonb_agg
(cost=0.00..2590.67 rows=41667 width=145) (actual time=0.008..1.872
rows=33334 loops=3)
Output: test_jsonb_agg.test_jsonb
Buffers: shared hit=2174
Worker 0: actual time=0.009..1.195 rows=18612 loops=1
Buffers: shared hit=406
Worker 1: actual time=0.009..1.205 rows=19462 loops=1
Buffers: shared hit=425
Query Identifier: 4368818925053284440
Planning Time: 0.054 ms
Execution Time: 32.718 ms
PG Bug reporting form <noreply@postgresql.org> writes:
Below is an example of queries. It shows that the query planner chooses
sequential aggregation instead of parallel, although limit should not have
affected this.
On what grounds do you assert that LIMIT can be parallelized?
AFAICS it has to be done by a single process to ensure that the
limit is enforced accurately. So LIMIT below an aggregation
removes the opportunity to parallelize the aggregation.
regards, tom lane