effect of JIT tuple deform?

Started by Pavel Stehule over 7 years ago - 12 messages
#1 Pavel Stehule
pavel.stehule@gmail.com

Hi

I tried to measure the effect of JIT tuple deforming, but I don't see any
effect.

Is this feature active in the master branch?

Is it possible to see this feature in EXPLAIN ANALYZE?

Regards

Pavel

#2 Pavel Stehule
pavel.stehule@gmail.com
In reply to: Pavel Stehule (#1)
Re: effect of JIT tuple deform?

2018-06-23 8:35 GMT+02:00 Pavel Stehule <pavel.stehule@gmail.com>:

Hi

I tried to measure the effect of JIT tuple deforming, but I don't see any
effect.

Is this feature active in the master branch?

Is it possible to see this feature in EXPLAIN ANALYZE?

Unfortunately, I got a slowdown:

0. shared buffers = 1GB
1. create table with 50 int columns
2. insert into this table 4M rows
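
For reference, a minimal sketch of such a setup (assuming the columns are
named c1 .. c50, matching the sum(c45) query below; not the exact script
that was used):

-- generate and run: create table wt(c1 int, ..., c50 int)
select 'create table wt(' ||
       string_agg(format('%I int', 'c' || i), ', ') || ')'
  from generate_series(1, 50) g(i) \gexec

-- generate and run a single INSERT filling all 50 columns with random ints
select 'insert into wt select ' ||
       string_agg('(random()*10000)::int', ', ') ||
       ' from generate_series(1, 4000000)'
  from generate_series(1, 50) \gexec

analyze wt;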

postgres=# \dt+ wt
                   List of relations
+--------+------+-------+-------+--------+-------------+
| Schema | Name | Type  | Owner |  Size  | Description |
+--------+------+-------+-------+--------+-------------+
| public | wt   | table | pavel | 893 MB |             |
+--------+------+-------+-------+--------+-------------+
(1 row)

default settings:

postgres=# explain analyze select sum(c45) from wt;
                                                               QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------
 Finalize Aggregate  (cost=136120.69..136120.70 rows=1 width=8) (actual time=879.547..879.547 rows=1 loops=1)
   ->  Gather  (cost=136120.47..136120.68 rows=2 width=8) (actual time=879.514..879.538 rows=3 loops=1)
         Workers Planned: 2
         Workers Launched: 2
         ->  Partial Aggregate  (cost=135120.47..135120.48 rows=1 width=8) (actual time=850.283..850.284 rows=1 loops=3)
               ->  Parallel Seq Scan on wt  (cost=0.00..130953.77 rows=1666678 width=4) (actual time=0.071..223.338 rows=1333347 loops=3)
 Planning Time: 0.158 ms
 JIT:
   Functions: 6
   Generation Time: 4.267 ms
   Inlining: false
   Inlining Time: 0.000 ms
   Optimization: false
   Optimization Time: 2.472 ms
   Emission Time: 15.929 ms
 Execution Time: 899.496 ms
(16 rows)
postgres=# set jit_tuple_deforming to off;
SET
postgres=# explain analyze select sum(c45) from wt;
                                                               QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------
 Finalize Aggregate  (cost=136120.69..136120.70 rows=1 width=8) (actual time=743.667..743.667 rows=1 loops=1)
   ->  Gather  (cost=136120.47..136120.68 rows=2 width=8) (actual time=743.654..743.657 rows=3 loops=1)
         Workers Planned: 2
         Workers Launched: 2
         ->  Partial Aggregate  (cost=135120.47..135120.48 rows=1 width=8) (actual time=715.532..715.533 rows=1 loops=3)
               ->  Parallel Seq Scan on wt  (cost=0.00..130953.77 rows=1666678 width=4) (actual time=0.067..216.245 rows=1333347 loops=3)
 Planning Time: 0.157 ms
 JIT:
   Functions: 4
   Generation Time: 1.989 ms
   Inlining: false
   Inlining Time: 0.000 ms
   Optimization: false
   Optimization Time: 0.449 ms
   Emission Time: 7.254 ms
 Execution Time: 761.549 ms
(16 rows)

When jit_tuple_deforming is enabled, the query is roughly 140 ms slower here
(899 ms vs. 762 ms), which looks like a performance regression.


Regards

Pavel

#3 Dmitry Dolgov
9erthalion6@gmail.com
In reply to: Pavel Stehule (#2)
Re: effect of JIT tuple deform?

On 23 June 2018 at 08:47, Pavel Stehule <pavel.stehule@gmail.com> wrote:

2018-06-23 8:35 GMT+02:00 Pavel Stehule <pavel.stehule@gmail.com>:

Hi

I tried to measure the effect of JIT tuple deforming, but I don't see any
effect.

Is this feature active in the master branch?

Is it possible to see this feature in EXPLAIN ANALYZE?

Unfortunately, I got a slowdown:

0. shared buffers = 1GB
1. create table with 50 int columns
2. insert into this table 4M rows

Hi,

Looks like I can reproduce the situation you're talking about (with some minor
differences)

=# explain analyze select sum(data45) from test_deform;
                                  QUERY PLAN
-------------------------------------------------------------------------------
 Finalize Aggregate  (cost=127097.71..127097.72 rows=1 width=8) (actual time=813.957..813.957 rows=1 loops=1)
   ->  Gather  (cost=127097.50..127097.71 rows=2 width=8) (actual time=813.946..813.950 rows=3 loops=1)
         Workers Planned: 2
         Workers Launched: 2
         ->  Partial Aggregate  (cost=126097.50..126097.51 rows=1 width=8) (actual time=802.585..802.585 rows=1 loops=3)
               ->  Parallel Seq Scan on test_deform  (cost=0.00..121930.80 rows=1666680 width=4) (actual time=0.040..207.326 rows=1333333 loops=3)
 Planning Time: 0.212 ms
 JIT:
   Functions: 6
   Generation Time: 3.076 ms
   Inlining: false
   Inlining Time: 0.000 ms
   Optimization: false
   Optimization Time: 1.328 ms
   Emission Time: 8.601 ms
 Execution Time: 820.127 ms
(16 rows)

=# set jit_tuple_deforming to off;

=# explain analyze select sum(data45) from test_deform;
                                  QUERY PLAN
-------------------------------------------------------------------------------
 Finalize Aggregate  (cost=127097.71..127097.72 rows=1 width=8) (actual time=742.578..742.578 rows=1 loops=1)
   ->  Gather  (cost=127097.50..127097.71 rows=2 width=8) (actual time=742.529..742.569 rows=3 loops=1)
         Workers Planned: 2
         Workers Launched: 2
         ->  Partial Aggregate  (cost=126097.50..126097.51 rows=1 width=8) (actual time=727.876..727.876 rows=1 loops=3)
               ->  Parallel Seq Scan on test_deform  (cost=0.00..121930.80 rows=1666680 width=4) (actual time=0.044..204.097 rows=1333333 loops=3)
 Planning Time: 0.361 ms
 JIT:
   Functions: 4
   Generation Time: 2.840 ms
   Inlining: false
   Inlining Time: 0.000 ms
   Optimization: false
   Optimization Time: 0.751 ms
   Emission Time: 6.436 ms
 Execution Time: 749.827 ms
(16 rows)

But at the same time on the bigger dataset (20M rows instead of 4M) the very
same query gets better:

=# explain analyze select sum(data45) from test_deform;
                                  QUERY PLAN
-------------------------------------------------------------------------------
 Finalize Aggregate  (cost=631482.92..631482.93 rows=1 width=8) (actual time=2350.288..2350.288 rows=1 loops=1)
   ->  Gather  (cost=631482.71..631482.92 rows=2 width=8) (actual time=2350.276..2350.279 rows=3 loops=1)
         Workers Planned: 2
         Workers Launched: 2
         ->  Partial Aggregate  (cost=630482.71..630482.72 rows=1 width=8) (actual time=2328.378..2328.379 rows=1 loops=3)
               ->  Parallel Seq Scan on test_deform  (cost=0.00..609649.37 rows=8333337 width=4) (actual time=0.027..1175.960 rows=6666667 loops=3)
 Planning Time: 0.600 ms
 JIT:
   Functions: 6
   Generation Time: 3.661 ms
   Inlining: true
   Inlining Time: 65.373 ms
   Optimization: true
   Optimization Time: 120.885 ms
   Emission Time: 58.836 ms
 Execution Time: 2543.280 ms
(16 rows)

=# set jit_tuple_deforming to off;

=# explain analyze select sum(data45) from test_deform;
                                  QUERY PLAN
-------------------------------------------------------------------------------
 Finalize Aggregate  (cost=631482.92..631482.93 rows=1 width=8) (actual time=3616.977..3616.977 rows=1 loops=1)
   ->  Gather  (cost=631482.71..631482.92 rows=2 width=8) (actual time=3616.959..3616.971 rows=3 loops=1)
         Workers Planned: 2
         Workers Launched: 2
         ->  Partial Aggregate  (cost=630482.71..630482.72 rows=1 width=8) (actual time=3593.220..3593.220 rows=1 loops=3)
               ->  Parallel Seq Scan on test_deform  (cost=0.00..609649.37 rows=8333337 width=4) (actual time=0.049..1027.353 rows=6666667 loops=3)
 Planning Time: 0.149 ms
 JIT:
   Functions: 4
   Generation Time: 1.803 ms
   Inlining: true
   Inlining Time: 23.529 ms
   Optimization: true
   Optimization Time: 18.843 ms
   Emission Time: 9.307 ms
 Execution Time: 3625.674 ms
(16 rows)

`perf diff` indeed shows that in the first case (with the 4M rows dataset) the
jitted version has some noticeable delta for one call, and unfortunately so far
I couldn't figure out which one exactly because of JIT (btw, who can explain
how to see a correct full `perf report` in this case? Somehow
`perf inject --jit -o perf.data.jitted` and jit_profiling_support didn't help).

But since on the bigger dataset I've got expected results, maybe it's just a
sign that JIT kicks in too early in this case and what's necessary is to adjust
jit_above_cost/jit_optimize_above_cost/jit_inline_above_cost?
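
For reference, the three thresholds involved and a sketch of adjusting them
(the defaults shown in the comments are, if I'm not mistaken, the current
ones):

show jit_above_cost;            -- 100000 by default
show jit_inline_above_cost;     -- 500000 by default
show jit_optimize_above_cost;   -- 500000 by default

-- e.g. make JIT kick in later for mid-sized scans like the 4M-row case
set jit_above_cost = 200000;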

#4 Andres Freund
andres@anarazel.de
In reply to: Dmitry Dolgov (#3)
Re: effect of JIT tuple deform?

Hi,

On 2018-06-24 22:32:07 +0200, Dmitry Dolgov wrote:

`perf diff` indeed shows that in the first case (with the 4M rows dataset) the
jitted version has some noticeable delta for one call, and unfortunately so far
I couldn't figure out which one exactly because of JIT (btw, who can explain
how to see a correct full `perf report` in this case? Somehow
`perf inject --jit -o perf.data.jitted` and jit_profiling_support didn't help).

jit_profiling_support currently requires a patch (that I'm about to
merge into their trunk) in LLVM. I've posted it a couple times to the
list.

But since on the bigger dataset I've got expected results, maybe it's just a
sign that JIT kicks in too early in this case and what's necessary is to adjust
jit_above_cost/jit_optimize_above_cost/jit_inline_above_cost?

Yea, that's very likely the problem. I wonder if we should multiply the
cost w/ the number of targeted workers to reduce problems with
parallelism...
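
For illustration (just a sketch of the idea, not the actual planner change):
the 4M-row plans upthread have total costs of roughly 127000-136000, which
already crosses the default jit_above_cost of 100000, and each of the leader
and the two workers then compiles its own copy of the generated functions.
Scaled by the three processes, the threshold would be roughly
100000 * 3 = 300000, which those plans would no longer reach:

set jit_above_cost = 300000;   -- default 100000, scaled by ~3 processes
explain analyze select sum(data45) from test_deform;   -- no JIT block expected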

Greetings,

Andres Freund

#5 Pavel Stehule
pavel.stehule@gmail.com
In reply to: Dmitry Dolgov (#3)
Re: effect of JIT tuple deform?

2018-06-24 22:32 GMT+02:00 Dmitry Dolgov <9erthalion6@gmail.com>:

But since on the bigger dataset I've got expected results, maybe it's just a
sign that JIT kicks in too early in this case and what's necessary is to
adjust jit_above_cost/jit_optimize_above_cost/jit_inline_above_cost?

Maybe LLVM does the real compilation too late. It is strange, because I
thought of the JIT cost as an initial (fixed) cost. Now it looks like this
cost is related to the number of rows, and then the situation is much more
complex.

Regards

Pavel

#6 Andres Freund
andres@anarazel.de
In reply to: Pavel Stehule (#5)
Re: effect of JIT tuple deform?

On 2018-06-25 05:32:37 +0200, Pavel Stehule wrote:

Maybe LLVM does the real compilation too late. It is strange, because I
thought of the JIT cost as an initial (fixed) cost. Now it looks like this
cost is related to the number of rows, and then the situation is much more
complex.

I don't understand what you're trying to say here. Compilation happens
on the first execution of the function. It's a fixed cost. The point
with more rows showing a benefit is that the benefit of JITing is bigger
in total if the function is going to be executed more. And thus at some
point the execution benefit is bigger than the cost of doing JITing.

Greetings,

Andres Freund

#7 Pavel Stehule
pavel.stehule@gmail.com
In reply to: Andres Freund (#6)
Re: effect of JIT tuple deform?

2018-06-25 5:41 GMT+02:00 Andres Freund <andres@anarazel.de>:

I don't understand what you're trying to say here. Compilation happens
on the first execution of the function. It's a fixed cost. The point
with more rows showing a benefit is that the benefit of JITing is bigger
in total if the function is going to be executed more. And thus at some
point the execution benefit is bigger than the cost of doing JITing.

Maybe I am being a little unclear: I see a slowdown of about 60 ms, but I
don't see this value anywhere in the JIT statistics.

Regards

Pavel


#8 Pavel Stehule
pavel.stehule@gmail.com
In reply to: Pavel Stehule (#7)
Re: effect of JIT tuple deform?

Hi

I played with it and maybe got an interesting result.

When I played with a really big table, I got IO waits and the effect of
jit_tuple_deforming was near zero (as expected).

When I played with a smaller table that fits in RAM, I can see a positive
effect of JIT_TD, but only when optimization is active. When optimization is
false, JIT_TD causes a slowdown.

Shared buffers 1GB
table wt .. 100 columns, 2M rows, approx. 823MB
set max_parallel_workers to 0;

query:

select sum(c99) from wt;

default (jit active) ... 1853ms
set jit_tuple_deforming to off .. 1397ms
set jit off .. 1400ms

== enforced inline ==
jit_tuple_deforming on .. 1610ms
jit_tuple_deforming off .. 1420ms

== enforced optimize ==
jit_tuple_deforming on .. 842ms
jit_tuple_deforming off .. 1420ms

I played with the settings and got the best speed with (roughly the SET
commands sketched below):

jit_inline_above_cost ..some pretty high number
jit_optimize_above_cost 0
jit_tuple_deforming on
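
Roughly, as SET commands (a sketch; values picked just for this test):

set jit = on;
set jit_tuple_deforming = on;
set jit_optimize_above_cost = 0;              -- always optimize when JIT is used
set jit_inline_above_cost = 10000000000000;   -- effectively never inline
select sum(c99) from wt;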

So I am able to see the effect of jit_tuple_deforming, and quite clearly, but
only if optimization is active. When optimization is not active,
jit_tuple_deforming causes a slowdown.

So maybe the use of jit_tuple_deforming could be conditioned on JIT
optimization?

Regards

Pavel

#9 Andres Freund
andres@anarazel.de
In reply to: Pavel Stehule (#8)
Re: effect of JIT tuple deform?

On 2018-06-26 21:25:54 +0200, Pavel Stehule wrote:

So maybe the use of jit_tuple_deforming could be conditioned on JIT
optimization?

No, it's definitely useful outside of that. I don't think we should
optimize for one individual query.

Greetings,

Andres Freund

#10 Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Pavel Stehule (#8)
Re: effect of JIT tuple deform?

On 06/26/2018 09:25 PM, Pavel Stehule wrote:

Hi

...

So I am able to see the effect of jit_tuple_deforming, and quite clearly, but
only if optimization is active. When optimization is not active,
jit_tuple_deforming causes a slowdown.

So maybe the use of jit_tuple_deforming could be conditioned on JIT
optimization?

Can you share the test case and some detail about the hardware and
PostgreSQL configuration?

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#11 Pavel Stehule
pavel.stehule@gmail.com
In reply to: Tomas Vondra (#10)
Re: effect of JIT tuple deform?

2018-06-27 17:19 GMT+02:00 Tomas Vondra <tomas.vondra@2ndquadrant.com>:

Can you share the test case and some detail about the hardware and
PostgreSQL configuration?

I did a very simple test:

0.

master branch without asserts, shared_buffers set to 1GB
tested on a Lenovo T520, 8GB RAM, 8 CPUs, i7
Fedora 28, gcc, CFLAGS="-ggdb -Og -g3 -fno-omit-frame-pointer", --with-llvm

1.

select 'create table wt(' || string_agg(format('%I int', 'c' || i),',') ||
')' from generate_series(1,100) g(i) \gexec

2.

begin;
select 'insert into wt values(' || (select
string_agg((random()*10000)::int::text,',') from generate_series(1,j - j +
100) g(i)) || ')' from generate_series(1,1000000) gg(j) \gexec
insert into wt select * from wt;
commit;

3.

set max_parallel_workers to 0; -- the effect of JIT will be more visible

analyze wt;
\timing

select sum(c99) from wt;

I tested some combination of:

jit: off on
jit_inline_above_cost: 0, 10000000000000
jit_optimize_above_cost: 0, 10000000000000
jit_tuple_deforming: on, off

My primitive tests show a nice positive effect of jit_tuple_deforming if JIT
optimization is active. When JIT optimization is not active,
jit_tuple_deforming caused a slowdown in my test.

So there is a range of costs between 100000 and 500000 where
jit_tuple_deforming didn't work well (without optimization).

I am limited by the small memory of my notebook - when I created a table
larger than 3GB, I got IO waits on my encrypted disk, and any effect of JIT
was eliminated.

Regards

Pavel


#12 Pierre Ducroquet
p.psql@pinaraf.info
In reply to: Pavel Stehule (#11)
1 attachment(s)
Re: effect of JIT tuple deform?

On Wednesday, June 27, 2018 5:38:31 PM CEST Pavel Stehule wrote:

My primitive tests show a nice positive effect of jit_tuple_deforming if JIT
optimization is active. When JIT optimization is not active,
jit_tuple_deforming caused a slowdown in my test.

So there is a range of costs between 100000 and 500000 where
jit_tuple_deforming didn't work well (without optimization).


Hi

I have studied this case a bit, and I think too that there is something wrong
here.
Right now, jit_optimize is a -O3. It's really expensive, and triggering it
here is not the right solution. In the attached patch, I force a -O1 for tuple
deforming. With your test case, on my computer, the results are:
- no jit: 850ms
- jit with tuple deforming without optimizations: 1650ms (1.5ms spent optimizing)
- jit without tuple deforming: 820ms (0.2ms)
- jit with tuple deforming with optimization (-O3): 770ms (105ms)
- jit with tuple deforming with patch (-O1): 725ms (54ms)

I will look at the generated code for tuple deforming, but I think we should
pre-optimize the LLVM bitcode if we do not want to involve the LLVM
optimization passes.

Regards

Pierre

Attachments:

0001-Introduce-opt1-in-LLVM-JIT-and-force-it-with-deformi.patch (text/x-patch)
From c2e70c8fbb7715283d3d53bdf5a70e4db18c99a9 Mon Sep 17 00:00:00 2001
From: Pierre Ducroquet <p.psql@pinaraf.info>
Date: Mon, 2 Jul 2018 13:44:10 +0200
Subject: [PATCH] Introduce opt1 in LLVM/JIT, and force it with deforming

---
 src/backend/jit/llvm/llvmjit.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/backend/jit/llvm/llvmjit.c b/src/backend/jit/llvm/llvmjit.c
index 5d0cdab1fc..025319e9c1 100644
--- a/src/backend/jit/llvm/llvmjit.c
+++ b/src/backend/jit/llvm/llvmjit.c
@@ -91,10 +91,12 @@ static const char *llvm_layout = NULL;
 
 
 static LLVMTargetMachineRef llvm_opt0_targetmachine;
+static LLVMTargetMachineRef llvm_opt1_targetmachine;
 static LLVMTargetMachineRef llvm_opt3_targetmachine;
 
 static LLVMTargetRef llvm_targetref;
 static LLVMOrcJITStackRef llvm_opt0_orc;
+static LLVMOrcJITStackRef llvm_opt1_orc;
 static LLVMOrcJITStackRef llvm_opt3_orc;
 
 
@@ -277,6 +279,8 @@ llvm_get_function(LLVMJitContext *context, const char *funcname)
 #if LLVM_VERSION_MAJOR < 5
 	if ((addr = LLVMOrcGetSymbolAddress(llvm_opt0_orc, funcname)))
 		return (void *) (uintptr_t) addr;
+	if ((addr = LLVMOrcGetSymbolAddress(llvm_opt1_orc, funcname)))
+		return (void *) (uintptr_t) addr;
 	if ((addr = LLVMOrcGetSymbolAddress(llvm_opt3_orc, funcname)))
 		return (void *) (uintptr_t) addr;
 #else
@@ -284,6 +288,10 @@ llvm_get_function(LLVMJitContext *context, const char *funcname)
 		elog(ERROR, "failed to look up symbol \"%s\"", funcname);
 	if (addr)
 		return (void *) (uintptr_t) addr;
+	if (LLVMOrcGetSymbolAddress(llvm_opt1_orc, &addr, funcname))
+		elog(ERROR, "failed to look up symbol \"%s\"", funcname);
+	if (addr)
+		return (void *) (uintptr_t) addr;
 	if (LLVMOrcGetSymbolAddress(llvm_opt3_orc, &addr, funcname))
 		elog(ERROR, "failed to look up symbol \"%s\"", funcname);
 	if (addr)
@@ -420,6 +428,8 @@ llvm_optimize_module(LLVMJitContext *context, LLVMModuleRef module)
 
 	if (context->base.flags & PGJIT_OPT3)
 		compile_optlevel = 3;
+	else if (context->base.flags & PGJIT_DEFORM)
+		compile_optlevel = 1;
 	else
 		compile_optlevel = 0;
 
@@ -491,6 +501,8 @@ llvm_compile_module(LLVMJitContext *context)
 
 	if (context->base.flags & PGJIT_OPT3)
 		compile_orc = llvm_opt3_orc;
+	else if (context->base.flags & PGJIT_DEFORM)
+		compile_orc = llvm_opt1_orc;
 	else
 		compile_orc = llvm_opt0_orc;
 
@@ -646,6 +658,11 @@ llvm_session_initialize(void)
 								LLVMCodeGenLevelNone,
 								LLVMRelocDefault,
 								LLVMCodeModelJITDefault);
+	llvm_opt1_targetmachine =
+		LLVMCreateTargetMachine(llvm_targetref, llvm_triple, cpu, features,
+								LLVMCodeGenLevelLess,
+								LLVMRelocDefault,
+								LLVMCodeModelJITDefault);
 	llvm_opt3_targetmachine =
 		LLVMCreateTargetMachine(llvm_targetref, llvm_triple, cpu, features,
 								LLVMCodeGenLevelAggressive,
@@ -661,12 +678,14 @@ llvm_session_initialize(void)
 	LLVMLoadLibraryPermanently(NULL);
 
 	llvm_opt0_orc = LLVMOrcCreateInstance(llvm_opt0_targetmachine);
+	llvm_opt1_orc = LLVMOrcCreateInstance(llvm_opt1_targetmachine);
 	llvm_opt3_orc = LLVMOrcCreateInstance(llvm_opt3_targetmachine);
 
 #if defined(HAVE_DECL_LLVMORCREGISTERGDB) && HAVE_DECL_LLVMORCREGISTERGDB
 	if (jit_debugging_support)
 	{
 		LLVMOrcRegisterGDB(llvm_opt0_orc);
+		LLVMOrcRegisterGDB(llvm_opt1_orc);
 		LLVMOrcRegisterGDB(llvm_opt3_orc);
 	}
 #endif
@@ -674,6 +693,7 @@ llvm_session_initialize(void)
 	if (jit_profiling_support)
 	{
 		LLVMOrcRegisterPerf(llvm_opt0_orc);
+		LLVMOrcRegisterPerf(llvm_opt1_orc);
 		LLVMOrcRegisterPerf(llvm_opt3_orc);
 	}
 #endif
@@ -700,6 +720,16 @@ llvm_shutdown(int code, Datum arg)
 		llvm_opt3_orc = NULL;
 	}
 
+	if (llvm_opt1_orc)
+	{
+#if defined(HAVE_DECL_LLVMORCREGISTERPERF) && HAVE_DECL_LLVMORCREGISTERPERF
+		if (jit_profiling_support)
+			LLVMOrcUnregisterPerf(llvm_opt1_orc);
+#endif
+		LLVMOrcDisposeInstance(llvm_opt1_orc);
+		llvm_opt1_orc = NULL;
+	}
+
 	if (llvm_opt0_orc)
 	{
 #if defined(HAVE_DECL_LLVMORCREGISTERPERF) && HAVE_DECL_LLVMORCREGISTERPERF
-- 
2.18.0