Parallel Seq Scan vs kernel read ahead
Hello hackers,
Parallel sequential scan relies on the kernel detecting sequential
access, but we don't make the job easy. The resulting striding
pattern works terribly on strict next-block systems like FreeBSD UFS,
and degrades rapidly when you add too many workers on sliding window
systems like Linux.
Demonstration using FreeBSD on UFS on a virtual machine, taking ball
park figures from iostat:
create table t as select generate_series(1, 200000000)::int i;
set max_parallel_workers_per_gather = 0;
select count(*) from t;
-> execution time 13.3s, average read size = ~128kB, ~500MB/s
set max_parallel_workers_per_gather = 1;
select count(*) from t;
-> execution time 24.9s, average read size = ~32kB, ~250MB/s
Note the small read size, which means that there was no read
clustering happening at all: that's the logical block size of this
filesystem.
That explains some complaints I've heard about PostgreSQL performance
on that filesystem: parallel query destroys I/O performance.
As a quick experiment, I tried teaching the block allocator to
allocate ranges of up to 64 blocks at a time, ramping up incrementally,
and ramping down at the end, and I got:
set max_parallel_workers_per_gather = 1;
select count(*) from t;
-> execution time 7.5s, average read size = ~128kB, ~920MB/s
set max_parallel_workers_per_gather = 3;
select count(*) from t;
-> execution time 5.2s, average read size = ~128kB, ~1.2GB/s
I've attached the quick and dirty patch I used for that.
Attachments:
0001-Use-larger-step-sizes-for-Parallel-Seq-Scan.patch (text/x-patch)
From 65726902f42a19c319623eb7194b0040517008a6 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 19 May 2020 09:16:07 +1200
Subject: [PATCH] Use larger step sizes for Parallel Seq Scan.
Instead of handing out a single block at a time, which
confuses the read ahead heuristics of many systems, use
an increasing step size.
XXX Proof of concept grade code only, needs better ramp
down algorithm and contains a magic number 16
---
src/backend/access/heap/heapam.c | 22 +++++++++++++------
src/backend/access/table/tableam.c | 34 +++++++++++++++++++++++++++---
src/include/access/relscan.h | 13 +++++++++++-
src/include/access/tableam.h | 2 ++
4 files changed, 61 insertions(+), 10 deletions(-)
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 0d4ed602d7..8982d59ff0 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -520,12 +520,14 @@ heapgettup(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
/* Other processes might have already finished the scan. */
if (page == InvalidBlockNumber)
@@ -720,9 +722,11 @@ heapgettup(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
finished = (page == InvalidBlockNumber);
}
else
@@ -834,12 +838,14 @@ heapgettup_pagemode(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
/* Other processes might have already finished the scan. */
if (page == InvalidBlockNumber)
@@ -1019,9 +1025,11 @@ heapgettup_pagemode(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
finished = (page == InvalidBlockNumber);
}
else
@@ -1155,6 +1163,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
scan->rs_base.rs_nkeys = nkeys;
scan->rs_base.rs_flags = flags;
scan->rs_base.rs_parallel = parallel_scan;
+ scan->rs_base.rs_parallel_work =
+ palloc0(sizeof(ParallelBlockTableScanWorkData));
scan->rs_strategy = NULL; /* set in initscan */
/*
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index c814733b22..6d72589673 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -404,10 +404,15 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
* to set the startblock once.
*/
void
-table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_startblock_init(Relation rel,
+ ParallelBlockTableScanWork workspace,
+ ParallelBlockTableScanDesc pbscan)
{
BlockNumber sync_startpage = InvalidBlockNumber;
+ /* Reset the state we use for controlling allocation size. */
+ memset(workspace, 0, sizeof(*workspace));
+
retry:
/* Grab the spinlock. */
SpinLockAcquire(&pbscan->phs_mutex);
@@ -447,7 +452,9 @@ retry:
* backend gets an InvalidBlockNumber return.
*/
BlockNumber
-table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_nextpage(Relation rel,
+ ParallelBlockTableScanWork workspace,
+ ParallelBlockTableScanDesc pbscan)
{
BlockNumber page;
uint64 nallocated;
@@ -467,7 +474,28 @@ table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbsca
* The actual page to return is calculated by adding the counter to the
* starting block number, modulo nblocks.
*/
- nallocated = pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, 1);
+ if (workspace->phsw_nallocated_range > 0)
+ {
+ nallocated = ++workspace->phsw_nallocated;
+ workspace->phsw_nallocated_range--;
+ }
+ else
+ {
+ /*
+ * Ramp up the step size as we go, until we approach the end of the
+ * scan, and then ramp it back down again to avoid unfair allocation.
+ * XXX Come up with a better algorithm!
+ */
+ if (workspace->phsw_nallocated > pbscan->phs_nblocks - 64)
+ workspace->phsw_step_size = 1;
+ else if (workspace->phsw_step_size < 64)
+ workspace->phsw_step_size++;
+
+ nallocated = workspace->phsw_nallocated =
+ pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
+ workspace->phsw_step_size);
+ workspace->phsw_nallocated_range = workspace->phsw_step_size - 1;
+ }
if (nallocated >= pbscan->phs_nblocks)
page = InvalidBlockNumber; /* all blocks have been allocated */
else
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 6f0258831f..0d440c9de3 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -42,9 +42,9 @@ typedef struct TableScanDescData
*/
uint32 rs_flags;
+ void *rs_parallel_work;
struct ParallelTableScanDescData *rs_parallel; /* parallel scan
* information */
-
} TableScanDescData;
typedef struct TableScanDescData *TableScanDesc;
@@ -81,6 +81,17 @@ typedef struct ParallelBlockTableScanDescData
} ParallelBlockTableScanDescData;
typedef struct ParallelBlockTableScanDescData *ParallelBlockTableScanDesc;
+/*
+ * Per backend state for parallel table scan, for block oriented storage.
+ */
+typedef struct ParallelBlockTableScanWorkData
+{
+ uint64 phsw_nallocated;
+ int phsw_nallocated_range;
+ int phsw_step_size;
+} ParallelBlockTableScanWorkData;
+typedef struct ParallelBlockTableScanWorkData *ParallelBlockTableScanWork;
+
/*
* Base class for fetches from a table via an index. This is the base-class
* for such scans, which needs to be embedded in the respective struct for
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 94903dd8de..d0a854a692 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -1790,8 +1790,10 @@ extern Size table_block_parallelscan_initialize(Relation rel,
extern void table_block_parallelscan_reinitialize(Relation rel,
ParallelTableScanDesc pscan);
extern BlockNumber table_block_parallelscan_nextpage(Relation rel,
+ ParallelBlockTableScanWork pbscanwork,
ParallelBlockTableScanDesc pbscan);
extern void table_block_parallelscan_startblock_init(Relation rel,
+ ParallelBlockTableScanWork pbscanwork,
ParallelBlockTableScanDesc pbscan);
--
2.20.1
On Wed, May 20, 2020 at 7:24 AM Thomas Munro <thomas.munro@gmail.com> wrote:
Hello hackers,
Parallel sequential scan relies on the kernel detecting sequential
access, but we don't make the job easy. The resulting striding
pattern works terribly on strict next-block systems like FreeBSD UFS,
and degrades rapidly when you add too many workers on sliding window
systems like Linux.
Demonstration using FreeBSD on UFS on a virtual machine, taking ball
park figures from iostat:
create table t as select generate_series(1, 200000000)::int i;
set max_parallel_workers_per_gather = 0;
select count(*) from t;
-> execution time 13.3s, average read size = ~128kB, ~500MB/s
set max_parallel_workers_per_gather = 1;
select count(*) from t;
-> execution time 24.9s, average read size = ~32kB, ~250MB/s
Note the small read size, which means that there was no read
clustering happening at all: that's the logical block size of this
filesystem.
That explains some complaints I've heard about PostgreSQL performance
on that filesystem: parallel query destroys I/O performance.
As a quick experiment, I tried teaching the block allocator to
allocate ranges of up to 64 blocks at a time, ramping up incrementally,
and ramping down at the end, and I got:
Good experiment. IIRC, we have discussed a similar idea during the
development of this feature but we haven't seen any better results by
allocating in ranges on the systems we have tried. So, we went with
the current approach which is more granular and seems to allow better
parallelism. I feel we need to ensure that we don't regress
parallelism in existing cases, otherwise, the idea sounds promising to
me.
--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com
On Wed, May 20, 2020 at 2:23 PM Amit Kapila <amit.kapila16@gmail.com> wrote:
Good experiment. IIRC, we have discussed a similar idea during the
development of this feature but we haven't seen any better results by
allocating in ranges on the systems we have tried. So, we went with
the current approach which is more granular and seems to allow better
parallelism. I feel we need to ensure that we don't regress
parallelism in existing cases, otherwise, the idea sounds promising to
me.
Yeah, Linux seems to do pretty well at least with smallish numbers of
workers, and when you use large numbers you can probably tune your way
out of the problem. ZFS seems to do fine. I wonder how well the
other OSes cope.
On Wed, May 20, 2020 at 00:09, Thomas Munro <thomas.munro@gmail.com> wrote:
On Wed, May 20, 2020 at 2:23 PM Amit Kapila <amit.kapila16@gmail.com> wrote:
Good experiment. IIRC, we have discussed a similar idea during the
development of this feature but we haven't seen any better results by
allocating in ranges on the systems we have tried. So, we went with
the current approach which is more granular and seems to allow better
parallelism. I feel we need to ensure that we don't regress
parallelism in existing cases, otherwise, the idea sounds promising to
me.
Yeah, Linux seems to do pretty well at least with smallish numbers of
workers, and when you use large numbers you can probably tune your way
out of the problem. ZFS seems to do fine. I wonder how well the
other OSes cope.
Windows 10 (64bits, i5, 8GB, SSD)
postgres=# set max_parallel_workers_per_gather = 0;
SET
Time: 2,537 ms
postgres=# select count(*) from t;
count
-----------
200000000
(1 row)
Time: 47767,916 ms (00:47,768)
postgres=# set max_parallel_workers_per_gather = 1;
SET
Time: 4,889 ms
postgres=# select count(*) from t;
count
-----------
200000000
(1 row)
Time: 32645,448 ms (00:32,645)
How do you display " -> execution time 5.2s, average read size ="?
regards,
Ranier VIlela
On Wed, May 20, 2020 at 11:03 PM Ranier Vilela <ranier.vf@gmail.com> wrote:
Time: 47767,916 ms (00:47,768)
Time: 32645,448 ms (00:32,645)
Just to make sure kernel caching isn't helping here, maybe try making
the table 2x or 4x bigger? My test was on a virtual machine with only
4GB RAM, so the table couldn't be entirely cached.
How do you display " -> execution time 5.2s, average read size ="?
Execution time is what you showed, and average read size should be
inside the Windows performance window somewhere (not sure what it's
called).
On Wed, May 20, 2020 at 18:49, Thomas Munro <thomas.munro@gmail.com> wrote:
On Wed, May 20, 2020 at 11:03 PM Ranier Vilela <ranier.vf@gmail.com> wrote:
Time: 47767,916 ms (00:47,768)
Time: 32645,448 ms (00:32,645)
Just to make sure kernel caching isn't helping here, maybe try making
the table 2x or 4x bigger? My test was on a virtual machine with only
4GB RAM, so the table couldn't be entirely cached.
4x bigger.
Postgres defaults settings.
postgres=# create table t as select generate_series(1, 800000000)::int i;
SELECT 800000000
postgres=# \timing
Timing is on.
postgres=# set max_parallel_workers_per_gather = 0;
SET
Time: 8,622 ms
postgres=# select count(*) from t;
count
-----------
800000000
(1 row)
Time: 227238,445 ms (03:47,238)
postgres=# set max_parallel_workers_per_gather = 1;
SET
Time: 20,975 ms
postgres=# select count(*) from t;
count
-----------
800000000
(1 row)
Time: 138027,351 ms (02:18,027)
regards,
Ranier Vilela
On Thu, May 21, 2020 at 11:15 AM Ranier Vilela <ranier.vf@gmail.com> wrote:
postgres=# set max_parallel_workers_per_gather = 0;
Time: 227238,445 ms (03:47,238)
postgres=# set max_parallel_workers_per_gather = 1;
Time: 138027,351 ms (02:18,027)
Ok, so it looks like NT/NTFS isn't suffering from this problem.
Thanks for testing!
On Wed, May 20, 2020 at 20:48, Thomas Munro <thomas.munro@gmail.com> wrote:
On Thu, May 21, 2020 at 11:15 AM Ranier Vilela <ranier.vf@gmail.com> wrote:
postgres=# set max_parallel_workers_per_gather = 0;
Time: 227238,445 ms (03:47,238)
postgres=# set max_parallel_workers_per_gather = 1;
Time: 138027,351 ms (02:18,027)
Ok, so it looks like NT/NTFS isn't suffering from this problem.
Thanks for testing!
Maybe it wasn’t clear, the tests were done with your patch applied.
regards,
Ranier Vilela
On Thu, May 21, 2020 at 11:51 AM Ranier Vilela <ranier.vf@gmail.com> wrote:
On Wed, May 20, 2020 at 20:48, Thomas Munro <thomas.munro@gmail.com> wrote:
On Thu, May 21, 2020 at 11:15 AM Ranier Vilela <ranier.vf@gmail.com> wrote:
postgres=# set max_parallel_workers_per_gather = 0;
Time: 227238,445 ms (03:47,238)
postgres=# set max_parallel_workers_per_gather = 1;
Time: 138027,351 ms (02:18,027)
Ok, so it looks like NT/NTFS isn't suffering from this problem.
Thanks for testing!
Maybe it wasn’t clear, the tests were done with your patch applied.
Oh! And how do the times look without it?
On Wed, May 20, 2020 at 21:03, Thomas Munro <thomas.munro@gmail.com> wrote:
On Thu, May 21, 2020 at 11:51 AM Ranier Vilela <ranier.vf@gmail.com> wrote:
On Wed, May 20, 2020 at 20:48, Thomas Munro <thomas.munro@gmail.com> wrote:
On Thu, May 21, 2020 at 11:15 AM Ranier Vilela <ranier.vf@gmail.com> wrote:
postgres=# set max_parallel_workers_per_gather = 0;
Time: 227238,445 ms (03:47,238)
postgres=# set max_parallel_workers_per_gather = 1;
Time: 138027,351 ms (02:18,027)
Ok, so it looks like NT/NTFS isn't suffering from this problem.
Thanks for testing!
Maybe it wasn’t clear, the tests were done with your patch applied.
Oh! And how do the times look without it?
Vanilla Postgres (latest)
create table t as select generate_series(1, 800000000)::int i;
set max_parallel_workers_per_gather = 0;
Time: 210524,317 ms (03:30,524)
set max_parallel_workers_per_gather = 1;
Time: 146982,737 ms (02:26,983)
regards,
Ranier Vilela
On Thu, May 21, 2020 at 1:38 PM Ranier Vilela <ranier.vf@gmail.com> wrote:
On Thu, May 21, 2020 at 11:15 AM Ranier Vilela <ranier.vf@gmail.com> wrote:
postgres=# set max_parallel_workers_per_gather = 0;
Time: 227238,445 ms (03:47,238)
postgres=# set max_parallel_workers_per_gather = 1;
Time: 138027,351 ms (02:18,027)
Vanilla Postgres (latest)
create table t as select generate_series(1, 800000000)::int i;
set max_parallel_workers_per_gather = 0;
Time: 210524,317 ms (03:30,524)
set max_parallel_workers_per_gather = 1;
Time: 146982,737 ms (02:26,983)
Thanks. So it seems like Linux, Windows and anything using ZFS are
OK, which probably explains why we hadn't heard complaints about it.
On Thu, 21 May 2020 at 14:32, Thomas Munro <thomas.munro@gmail.com> wrote:
Thanks. So it seems like Linux, Windows and anything using ZFS are
OK, which probably explains why we hadn't heard complaints about it.
I tried out a different test on a Windows 8.1 machine I have here. I
was concerned that the test that was used here ends up with tuples
that are too narrow and that the executor would spend quite a bit of
time going between nodes and performing the actual aggregation. I
thought it might be good to add some padding so that there are far
fewer tuples on the page.
I ended up with:
create table t (a int, b text);
-- create a table of 100GB in size.
insert into t select x,md5(x::text) from
generate_series(1,1000000*1572.7381809)x; -- took 1 hr 18 mins
vacuum freeze t;
query = select count(*) from t;
Disk = Samsung SSD 850 EVO mSATA 1TB.
Master:
workers = 0 : Time: 269104.281 ms (04:29.104) 380MB/s
workers = 1 : Time: 741183.646 ms (12:21.184) 138MB/s
workers = 2 : Time: 656963.754 ms (10:56.964) 155MB/s
Patched:
workers = 0 : Should be the same as before as the code for this didn't change.
workers = 1 : Time: 300299.364 ms (05:00.299) 340MB/s
workers = 2 : Time: 270213.726 ms (04:30.214) 379MB/s
(A better query would likely have been just: SELECT * FROM t WHERE a =
1; but I'd run the test by the time I thought of that.)
So, this shows that Windows, at least 8.1, does suffer from this too.
For the patch. I know you just put it together quickly, but I don't
think you can do that ramp up the way you have. It looks like there's
a risk of torn reads and torn writes and I'm unsure how much that
could affect the test results here. It looks like there's a risk that
a worker gets some garbage number of pages to read rather than what
you think it will. I also don't quite understand the need for a
ramp-up in pages per serving. Shouldn't you instantly start at some
size and hold that, then only maybe ramp down at the end so that
workers all finish at close to the same time? However, I did have
other ideas which I'll explain below.
From my previous work on that function to add the atomics, I did think
that it would be better to dish out more than 1 page at a time.
However, there is the risk that the workload is not evenly distributed
between the workers. My thoughts were that we could divide the total
pages by the number of workers then again by 100 and dish out blocks
based on that. That way workers will get about a 100th of their fair
share of pages at once, so assuming there's an even amount of work to
do per serving of pages, then the last worker should only run on for at
most 1% longer. Perhaps that 100 should be 1000, then the run-on time
for the last worker is just 0.1%. Perhaps the serving size can also
be capped at some maximum like 64. We'll certainly need to ensure it's
at least 1! I imagine that will eliminate the need for any ramp down
of pages per serving near the end of the scan.
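For illustration, here is a tiny standalone sketch of that serving-size
calculation (the helper name and the clamp to the 1-64 block range are just
assumptions for illustration, not anything from the patch):

#include "postgres.h"
#include "storage/block.h"		/* BlockNumber */

/*
 * Hypothetical helper: hand out roughly a 100th of each worker's fair
 * share of the relation per serving, clamped to the range [1, 64] blocks.
 */
static BlockNumber
choose_serving_size(BlockNumber nblocks, int nworkers)
{
	BlockNumber	serving;

	if (nworkers < 1)
		nworkers = 1;
	serving = nblocks / nworkers / 100;	/* ~1/100th of a worker's share */
	if (serving < 1)
		serving = 1;			/* always hand out at least one block */
	else if (serving > 64)
		serving = 64;			/* optional cap, to keep servings modest */
	return serving;
}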
David
On Thu, 21 May 2020 at 17:06, David Rowley <dgrowleyml@gmail.com> wrote:
For the patch. I know you just put it together quickly, but I don't
think you can do that ramp up the way you have. It looks like there's
a risk of torn reads and torn writes and I'm unsure how much that
could affect the test results here.
Oops. On closer inspection, I see that memory is per worker, not
global to the scan.
On Fri, May 22, 2020 at 10:00 AM David Rowley <dgrowleyml@gmail.com> wrote:
On Thu, 21 May 2020 at 17:06, David Rowley <dgrowleyml@gmail.com> wrote:
For the patch. I know you just put it together quickly, but I don't
think you can do that ramp up the way you have. It looks like there's
a risk of torn reads and torn writes and I'm unsure how much that
could affect the test results here.Oops. On closer inspection, I see that memory is per worker, not
global to the scan.
Right, I think it's safe. I think you were probably right that
ramp-up isn't actually useful though, it's only the end of the scan
that requires special treatment so we don't get unfair allocation as
the work runs out, due to coarse grain. I suppose that even if you
have a scheme that falls back to fine grained allocation for the final
N pages, it's still possible that a highly distracted process (most
likely the leader given its double duties) can finish up sitting on a
large range of pages and eventually have to process them all at the
end after the other workers have already knocked off and gone for a
pint.
Hi Thomas,
Some more data points:
create table t_heap as select generate_series(1, 100000000) i;
Query: select count(*) from t_heap;
shared_buffers=32MB (so that I don't have to clear buffers, OS page
cache)
OS: FreeBSD 12.1 with UFS on GCP
4 vCPUs, 4GB RAM Intel Skylake
22G Google PersistentDisk
Time is measured with \timing on.
Without your patch:
max_parallel_workers_per_gather Time(seconds)
0 33.88s
1 57.62s
2 62.01s
6 222.94s
With your patch:
max_parallel_workers_per_gather Time(seconds)
0 29.04s
1 29.17s
2 28.78s
6 291.27s
I checked with explain analyze to ensure that the number of workers
planned = max_parallel_workers_per_gather
Apart from the last result (max_parallel_workers_per_gather=6), all
the other results seem favorable.
Could the last result be down to the fact that the number of workers
planned exceeded the number of vCPUs?
I also wanted to evaluate Zedstore with your patch.
I used the same setup as above.
No discernible difference though, maybe I'm missing something:
Without your patch:
max_parallel_workers_per_gather Time(seconds)
0 25.86s
1 15.70s
2 12.60s
6 12.41s
With your patch:
max_parallel_workers_per_gather Time(seconds)
0 26.96s
1 15.73s
2 12.46s
6 12.10s
--
Soumyadeep
On Fri, May 22, 2020 at 1:14 PM Soumyadeep Chakraborty
<sochakraborty@pivotal.io> wrote:
Some more data points:
Thanks!
max_parallel_workers_per_gather Time(seconds)
0 29.04s
1 29.17s
2 28.78s
6 291.27s
I checked with explain analyze to ensure that the number of workers
planned = max_parallel_workers_per_gather
Apart from the last result (max_parallel_workers_per_gather=6), all
the other results seem favorable.
Could the last result be down to the fact that the number of workers
planned exceeded the number of vCPUs?
Interesting. I guess it has to do with patterns emerging from various
parameters like that magic number 64 I hard coded into the test patch,
and other unknowns in your storage stack. I see a small drop off that
I can't explain yet, but not that.
I also wanted to evaluate Zedstore with your patch.
I used the same setup as above.
No discernible difference though, maybe I'm missing something:
It doesn't look like it's using table_block_parallelscan_nextpage() as
a block allocator so it's not affected by the patch. It has its own
thing zs_parallelscan_nextrange(), which does
pg_atomic_fetch_add_u64(&pzscan->pzs_allocatedtids,
ZS_PARALLEL_CHUNK_SIZE), and that macro is 0x100000.
On Tue, May 19, 2020 at 10:23 PM Amit Kapila <amit.kapila16@gmail.com> wrote:
Good experiment. IIRC, we have discussed a similar idea during the
development of this feature but we haven't seen any better results by
allocating in ranges on the systems we have tried. So, we went with
the current approach which is more granular and seems to allow better
parallelism. I feel we need to ensure that we don't regress
parallelism in existing cases, otherwise, the idea sounds promising to
me.
I think there's a significant difference. The idea I remember being
discussed at the time was to divide the relation into equal parts at
the very start and give one part to each worker. I think that carries
a lot of risk of some workers finishing much sooner than others. This
idea, AIUI, is to divide the relation into chunks that are small
compared to the size of the relation, but larger than 1 block. That
carries some risk of an unequal division of work, as has already been
noted, but it's much less, especially if we use smaller chunk sizes
once we get close to the end, as proposed here.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Thu, May 21, 2020 at 6:28 PM Thomas Munro <thomas.munro@gmail.com> wrote:
Right, I think it's safe. I think you were probably right that
ramp-up isn't actually useful though, it's only the end of the scan
that requires special treatment so we don't get unfair allocation as
the work runs out, due to coarse grain.
The ramp-up seems like it might be useful if the query involves a LIMIT.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Sat, May 23, 2020 at 12:00 AM Robert Haas <robertmhaas@gmail.com> wrote:
On Tue, May 19, 2020 at 10:23 PM Amit Kapila <amit.kapila16@gmail.com> wrote:
Good experiment. IIRC, we have discussed a similar idea during the
development of this feature but we haven't seen any better results by
allocating in ranges on the systems we have tried. So, we went with
the current approach which is more granular and seems to allow better
parallelism. I feel we need to ensure that we don't regress
parallelism in existing cases, otherwise, the idea sounds promising to
me.
I think there's a significant difference. The idea I remember being
discussed at the time was to divide the relation into equal parts at
the very start and give one part to each worker.
I have checked the archives and found that we have done some testing
by allowing each worker to work on a block-by-block basis and by
having a fixed number of chunks for each worker. See the results [1]
(the program used is attached in another email [2]). The conclusion
was that we didn't find much difference with any of those approaches.
Now, the reason could be that we have tested on a machine (I
think it was hydra (Power-7)) where the chunk-size doesn't matter but
I think it can show some difference in the machines on which Thomas
and David are testing. At that time there was also a discussion to
chunk on the basis of "each worker processes one 1GB-sized segment"
which Tom and Stephen seem to support [3]. I think an idea to divide
the relation into segments based on workers for a parallel scan has
been used by another database (DynamoDB) as well [4], so it is not
completely without merit. I understand that larger sized chunks can
lead to unequal work distribution but they have their own advantages,
so we might want to get the best of both worlds where in the
beginning we have larger sized chunks and then slowly reduce the
chunk-size towards the end of the scan. I am not sure what is the
best thing to do here but maybe some experiments can shed light on
this mystery.
[1]: /messages/by-id/CAA4eK1JHCmN2X1LjQ4bOmLApt+btOuid5Vqqk5G6dDFV69iyHg@mail.gmail.com
[2]: /messages/by-id/CAA4eK1JyVNEBE8KuxKd3bJhkG6tSbpBYX_+ZtP34ZSTCSucA1A@mail.gmail.com
[3]: /messages/by-id/30549.1422459647@sss.pgh.pa.us
[4]: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Scan.html#Scan.ParallelScan
--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com
On Sat, 23 May 2020 at 06:31, Robert Haas <robertmhaas@gmail.com> wrote:
On Thu, May 21, 2020 at 6:28 PM Thomas Munro <thomas.munro@gmail.com> wrote:
Right, I think it's safe. I think you were probably right that
ramp-up isn't actually useful though, it's only the end of the scan
that requires special treatment so we don't get unfair allocation as
the work runs out, due to coarse grain.
The ramp-up seems like it might be useful if the query involves a LIMIT.
That's true, but I think the intelligence there would need to go
beyond, "if there's a LIMIT clause, do ramp-up", as we might have
already fully ramped up well before the LIMIT is reached.
David
It doesn't look like it's using table_block_parallelscan_nextpage() as
a block allocator so it's not affected by the patch. It has its own
thing zs_parallelscan_nextrange(), which does
pg_atomic_fetch_add_u64(&pzscan->pzs_allocatedtids,
ZS_PARALLEL_CHUNK_SIZE), and that macro is 0x100000.
My apologies, I was too hasty. Indeed, you are correct. Zedstore's
unit of work is chunks of the logical zstid space. There is a
correlation between the zstid and blocks: zstids near each other are
likely to lie in the same block or in neighboring blocks. It would be
interesting to try something like this patch for Zedstore.
Regards,
Soumyadeep
On Sat, May 23, 2020 at 12:00 AM Robert Haas
<robertmhaas@gmail.com> wrote:
I think there's a significant difference. The idea I remember being
discussed at the time was to divide the relation into equal parts at
the very start and give one part to each worker. I think that carries
a lot of risk of some workers finishing much sooner than others.
Was the idea of work-stealing considered? Here is what I have been
thinking about:
Each worker would be assigned a contiguous chunk of blocks at init time.
Then if a worker is finished with its work, it can inspect other
workers' remaining work and "steal" some of the blocks from the end of
the victim worker's allocation.
Considerations for such a scheme:
1. Victim selection: Who will be the victim worker? It can be selected at
random if nothing else.
2. How many blocks to steal? Stealing half of the victim's remaining
blocks seems to be fair.
3. Stealing threshold: We should disallow stealing if the amount of
remaining work is not enough in the victim worker.
4. Additional parallel state: Representing the chunk of "work". I guess
one variable for the current block and one for the last block in the
chunk allocated. The latter would have to be protected with atomic
fetches as it would be decremented by the stealing worker.
5. Point 4 implies that there might be more atomic fetch operations as
compared to this patch. Idk if that is a lesser evil than the workers
being idle..probably not? A way to salvage that a little would be to
forego atomic fetches when the amount of work remaining is less than the
threshold discussed in 3 as there is no possibility of work stealing then.
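To make points 2-5 a bit more concrete, here is a rough, hypothetical sketch
(not a worked-out patch, and it ignores victim selection from point 1). As a
variation on the two variables in point 4, it packs the current and last
block into a single 64-bit atomic so that the owner claiming a block and a
thief claiming the upper half of a range are each one compare-and-swap and
can never hand out the same block twice; all names here are invented:

#include "postgres.h"
#include "port/atomics.h"
#include "storage/block.h"

/* Per-worker remaining range, shared so that other workers can steal. */
typedef struct ParallelScanWorkerRange
{
	pg_atomic_uint64	range;	/* next in high 32 bits, last in low 32 bits */
} ParallelScanWorkerRange;

#define RANGE_PACK(next, last)	(((uint64) (next) << 32) | (uint32) (last))
#define RANGE_NEXT(r)			((BlockNumber) ((r) >> 32))
#define RANGE_LAST(r)			((BlockNumber) (r))
#define STEAL_THRESHOLD			32	/* point 3: don't steal tiny remainders */

/* Owner: claim the next block of our own range, or fail if it is empty. */
static bool
range_take_next(ParallelScanWorkerRange *mine, BlockNumber *block)
{
	uint64		cur = pg_atomic_read_u64(&mine->range);

	for (;;)
	{
		BlockNumber next = RANGE_NEXT(cur);
		BlockNumber last = RANGE_LAST(cur);

		if (next > last)
			return false;		/* empty: time to go stealing */
		if (pg_atomic_compare_exchange_u64(&mine->range, &cur,
										   RANGE_PACK(next + 1, last)))
		{
			*block = next;
			return true;
		}
		/* on failure, cur was refreshed with the current value; retry */
	}
}

/* Thief: move the upper half of a victim's remaining blocks into "mine". */
static bool
range_try_steal(ParallelScanWorkerRange *victim, ParallelScanWorkerRange *mine)
{
	uint64		cur = pg_atomic_read_u64(&victim->range);
	BlockNumber next = RANGE_NEXT(cur);
	BlockNumber last = RANGE_LAST(cur);
	uint32		remaining = (next <= last) ? last - next + 1 : 0;
	uint32		keep;

	if (remaining < STEAL_THRESHOLD)
		return false;			/* point 3: not worth the contention */
	keep = remaining / 2;		/* point 2: victim keeps the lower half */
	if (!pg_atomic_compare_exchange_u64(&victim->range, &cur,
										RANGE_PACK(next, next + keep - 1)))
		return false;			/* raced with the victim or another thief */
	pg_atomic_write_u64(&mine->range, RANGE_PACK(next + keep, last));
	return true;
}

Note that this does pay a compare-and-swap per block even in the common case,
which is exactly the extra atomic traffic point 5 worries about.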
Regards,
Soumyadeep
On Wed, Jun 3, 2020 at 3:18 PM Soumyadeep Chakraborty
<soumyadeep2007@gmail.com> wrote:
Idk if that is a lesser evil than the workers
being idle..probably not?
Apologies, I meant that the extra atomic fetches is probably a lesser
evil than the workers being idle.
Soumyadeep
On Thu, 21 May 2020 at 17:06, David Rowley <dgrowleyml@gmail.com> wrote:
create table t (a int, b text);
-- create a table of 100GB in size.
insert into t select x,md5(x::text) from
generate_series(1,1000000*1572.7381809)x; -- took 1 hr 18 mins
vacuum freeze t;
query = select count(*) from t;
Disk = Samsung SSD 850 EVO mSATA 1TB.
Master:
workers = 0 : Time: 269104.281 ms (04:29.104) 380MB/s
workers = 1 : Time: 741183.646 ms (12:21.184) 138MB/s
workers = 2 : Time: 656963.754 ms (10:56.964) 155MB/s
Patched:
workers = 0 : Should be the same as before as the code for this didn't change.
workers = 1 : Time: 300299.364 ms (05:00.299) 340MB/s
workers = 2 : Time: 270213.726 ms (04:30.214) 379MB/s
(A better query would likely have been just: SELECT * FROM t WHERE a =
1; but I'd run the test by the time I thought of that.)
So, this shows that Windows, at least 8.1, does suffer from this too.
I repeated this test on an up-to-date Windows 10 machine to see if the
later kernel is any better at the readahead.
Results for the same test are:
Master:
max_parallel_workers_per_gather = 0: Time: 148481.244 ms (02:28.481)
(706.2MB/sec)
max_parallel_workers_per_gather = 1: Time: 327556.121 ms (05:27.556)
(320.1MB/sec)
max_parallel_workers_per_gather = 2: Time: 329055.530 ms (05:29.056)
(318.6MB/sec)
Patched:
max_parallel_workers_per_gather = 0: Time: 141363.991 ms (02:21.364)
(741.7MB/sec)
max_parallel_workers_per_gather = 1: Time: 144982.202 ms (02:24.982)
(723.2MB/sec)
max_parallel_workers_per_gather = 2: Time: 143355.656 ms (02:23.356)
(731.4MB/sec)
David
On Wed, Jun 10, 2020 at 5:06 PM David Rowley <dgrowleyml@gmail.com> wrote:
I repeated this test on an up-to-date Windows 10 machine to see if the
later kernel is any better at the readahead.
Results for the same test are:
Master:
max_parallel_workers_per_gather = 0: Time: 148481.244 ms (02:28.481)
(706.2MB/sec)
max_parallel_workers_per_gather = 1: Time: 327556.121 ms (05:27.556)
(320.1MB/sec)
max_parallel_workers_per_gather = 2: Time: 329055.530 ms (05:29.056)
(318.6MB/sec)
Patched:
max_parallel_workers_per_gather = 0: Time: 141363.991 ms (02:21.364)
(741.7MB/sec)
max_parallel_workers_per_gather = 1: Time: 144982.202 ms (02:24.982)
(723.2MB/sec)
max_parallel_workers_per_gather = 2: Time: 143355.656 ms (02:23.356)
(731.4MB/sec)
Thanks!
I also heard from Andres that he likes this patch with his AIO
prototype, because of the way request merging works. So it seems like
there are several reasons to want it.
But ... where should we get the maximum step size from? A GUC?
On Wed, 10 Jun 2020 at 17:21, Thomas Munro <thomas.munro@gmail.com> wrote:
I also heard from Andres that he likes this patch with his AIO
prototype, because of the way request merging works. So it seems like
there are several reasons to want it.
But ... where should we get the maximum step size from? A GUC?
I guess we'd need to determine if other step sizes were better under
any conditions. I guess one condition would be if there was a LIMIT
clause. I could check if setting it to 1024 makes any difference, but
I'm thinking it won't since I got fairly consistent results on all
worker settings with the patched version.
David
On Wed, 10 Jun 2020 at 17:39, David Rowley <dgrowleyml@gmail.com> wrote:
On Wed, 10 Jun 2020 at 17:21, Thomas Munro <thomas.munro@gmail.com> wrote:
I also heard from Andres that he likes this patch with his AIO
prototype, because of the way request merging works. So it seems like
there are several reasons to want it.
But ... where should we get the maximum step size from? A GUC?
I guess we'd need to determine if other step sizes were better under
any conditions. I guess one condition would be if there was a LIMIT
clause. I could check if setting it to 1024 makes any difference, but
I'm thinking it won't since I got fairly consistent results on all
worker settings with the patched version.
I did another round of testing on the same machine trying some step
sizes larger than 64 blocks. I can confirm that it does improve the
situation further going bigger than 64.
I got up as far as 16384, but made a couple of additional changes for
that run only. Instead of increasing the ramp-up 1 block at a time, I
initialised phsw_step_size to 1 and multiplied it by 2 until I reached
the chosen step size. With numbers that big, ramping up 1 block at a
time was slow enough that I'd never have reached the target step size.
Here are the results of the testing:
Master:
max_parallel_workers_per_gather = 0: Time: 148481.244 ms (02:28.481)
(706.2MB/sec)
max_parallel_workers_per_gather = 1: Time: 327556.121 ms (05:27.556)
(320.1MB/sec)
max_parallel_workers_per_gather = 2: Time: 329055.530 ms (05:29.056)
(318.6MB/sec)
Patched stepsize = 64:
max_parallel_workers_per_gather = 0: Time: 141363.991 ms (02:21.364)
(741.7MB/sec)
max_parallel_workers_per_gather = 1: Time: 144982.202 ms (02:24.982)
(723.2MB/sec)
max_parallel_workers_per_gather = 2: Time: 143355.656 ms (02:23.356)
(731.4MB/sec)
Patched stepsize = 1024:
max_parallel_workers_per_gather = 0: Time: 152599.159 ms (02:32.599)
(687.1MB/sec)
max_parallel_workers_per_gather = 1: Time: 104227.232 ms (01:44.227)
(1006.04MB/sec)
max_parallel_workers_per_gather = 2: Time: 97149.343 ms (01:37.149)
(1079.3MB/sec)
Patched stepsize = 8192:
max_parallel_workers_per_gather = 0: Time: 143524.038 ms (02:23.524)
(730.59MB/sec)
max_parallel_workers_per_gather = 1: Time: 102899.288 ms (01:42.899)
(1019.0MB/sec)
max_parallel_workers_per_gather = 2: Time: 91148.340 ms (01:31.148)
(1150.4MB/sec)
Patched stepsize = 16384 (power 2 ramp-up)
max_parallel_workers_per_gather = 0: Time: 144598.502 ms (02:24.599)
(725.16MB/sec)
max_parallel_workers_per_gather = 1: Time: 97344.160 ms (01:37.344)
(1077.1MB/sec)
max_parallel_workers_per_gather = 2: Time: 88025.420 ms (01:28.025)
(1191.2MB/sec)
I thought about what you mentioned about a GUC, and I think it's a bad
idea to do that. I think it would be better to choose based on the
relation size. For smaller relations, we want to keep the step size
small. Someone may enable parallel query on such a small relation if
they're doing something like calling an expensive function on the
results, so we do need to avoid going large for small relations.
I considered something like:
create function nextpower2(a bigint) returns bigint as $$
declare
  n bigint := 1;
begin
  while n < a loop
    n := n * 2;
  end loop;
  return n;
end;
$$ language plpgsql;

select pg_size_pretty(power(2,p)::numeric * 8192) rel_size,
       nextpower2(power(2,p)::bigint / 1024) as stepsize
from generate_series(1,32) p;
rel_size | stepsize
----------+----------
16 kB | 1
32 kB | 1
64 kB | 1
128 kB | 1
256 kB | 1
512 kB | 1
1024 kB | 1
2048 kB | 1
4096 kB | 1
8192 kB | 1
16 MB | 2
32 MB | 4
64 MB | 8
128 MB | 16
256 MB | 32
512 MB | 64
1024 MB | 128
2048 MB | 256
4096 MB | 512
8192 MB | 1024
16 GB | 2048
32 GB | 4096
64 GB | 8192
128 GB | 16384
256 GB | 32768
512 GB | 65536
1024 GB | 131072
2048 GB | 262144
4096 GB | 524288
8192 GB | 1048576
16 TB | 2097152
32 TB | 4194304
So with that algorithm and the 100GB table that I've been using in
my test, we'd go with a step size of 16384. I think we'd want to avoid
going any higher than that. The above code means we'll do between just
below 0.1% and 0.2% of the relation per step. If I divided the number
of blocks by say 128 instead of 1024, then that would be about 0.78%
and 1.56% of the relation each time. It's not unrealistic today that
someone might throw that many workers at a job, so, I'd say dividing
by 1024 or even 2048 would likely be about right.
David
On Wed, Jun 10, 2020 at 6:04 PM David Rowley <dgrowleyml@gmail.com> wrote:
On Wed, 10 Jun 2020 at 17:39, David Rowley <dgrowleyml@gmail.com> wrote:
On Wed, 10 Jun 2020 at 17:21, Thomas Munro <thomas.munro@gmail.com> wrote:
I also heard from Andres that he likes this patch with his AIO
prototype, because of the way request merging works. So it seems like
there are several reasons to want it.
But ... where should we get the maximum step size from? A GUC?
I guess we'd need to determine if other step sizes were better under
any conditions. I guess one condition would be if there was a LIMIT
clause. I could check if setting it to 1024 makes any difference, but
I'm thinking it won't since I got fairly consistent results on all
worker settings with the patched version.
I did another round of testing on the same machine trying some step
sizes larger than 64 blocks. I can confirm that it does improve the
situation further going bigger than 64.
Can we try the same test with 4, 8, 16 workers as well? I don't
foresee any problem with a higher number of workers but it might be
better to check once, if it is not too much additional work.
--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com
On Thu, 11 Jun 2020 at 01:24, Amit Kapila <amit.kapila16@gmail.com> wrote:
Can we try the same test with 4, 8, 16 workers as well? I don't
foresee any problem with a higher number of workers but it might be
better to check once, if it is not too much additional work.
I ran the tests again with up to 7 workers. The CPU here only has 8
cores (a laptop), so I'm not sure if there's much sense in going
higher than that?
CPU = Intel i7-8565U. 16GB RAM.
Note that I did the power2 ramp-up with each of the patched tests this
time. Thomas' version ramps up 1 page at a time, which is ok when
only ramping up to 64 pages, but not for these higher numbers I'm
testing with. (Patch attached)
Results attached in a graph format, or in text below:
Master:
workers=0: Time: 141175.935 ms (02:21.176) (742.7MB/sec)
workers=1: Time: 316854.538 ms (05:16.855) (330.9MB/sec)
workers=2: Time: 323471.791 ms (05:23.472) (324.2MB/sec)
workers=3: Time: 321637.945 ms (05:21.638) (326MB/sec)
workers=4: Time: 308689.599 ms (05:08.690) (339.7MB/sec)
workers=5: Time: 289014.709 ms (04:49.015) (362.8MB/sec)
workers=6: Time: 267785.27 ms (04:27.785) (391.6MB/sec)
workers=7: Time: 248735.817 ms (04:08.736) (421.6MB/sec)
Patched 64: (power 2 ramp-up)
workers=0: Time: 152752.558 ms (02:32.753) (686.5MB/sec)
workers=1: Time: 149940.841 ms (02:29.941) (699.3MB/sec)
workers=2: Time: 136534.043 ms (02:16.534) (768MB/sec)
workers=3: Time: 119387.248 ms (01:59.387) (878.3MB/sec)
workers=4: Time: 114080.131 ms (01:54.080) (919.2MB/sec)
workers=5: Time: 111472.144 ms (01:51.472) (940.7MB/sec)
workers=6: Time: 108290.608 ms (01:48.291) (968.3MB/sec)
workers=7: Time: 104349.947 ms (01:44.350) (1004.9MB/sec)
Patched 1024: (power 2 ramp-up)
workers=0: Time: 146106.086 ms (02:26.106) (717.7MB/sec)
workers=1: Time: 109832.773 ms (01:49.833) (954.7MB/sec)
workers=2: Time: 98921.515 ms (01:38.922) (1060MB/sec)
workers=3: Time: 94259.243 ms (01:34.259) (1112.4MB/sec)
workers=4: Time: 93275.637 ms (01:33.276) (1124.2MB/sec)
workers=5: Time: 93921.452 ms (01:33.921) (1116.4MB/sec)
workers=6: Time: 93988.386 ms (01:33.988) (1115.6MB/sec)
workers=7: Time: 92096.414 ms (01:32.096) (1138.6MB/sec)
Patched 8192: (power 2 ramp-up)
workers=0: Time: 143367.057 ms (02:23.367) (731.4MB/sec)
workers=1: Time: 103138.918 ms (01:43.139) (1016.7MB/sec)
workers=2: Time: 93368.573 ms (01:33.369) (1123.1MB/sec)
workers=3: Time: 89464.529 ms (01:29.465) (1172.1MB/sec)
workers=4: Time: 89921.393 ms (01:29.921) (1166.1MB/sec)
workers=5: Time: 93575.401 ms (01:33.575) (1120.6MB/sec)
workers=6: Time: 93636.584 ms (01:33.637) (1119.8MB/sec)
workers=7: Time: 93682.21 ms (01:33.682) (1119.3MB/sec)
Patched 16384 (power 2 ramp-up)
workers=0: Time: 144598.502 ms (02:24.599) (725.2MB/sec)
workers=1: Time: 97344.16 ms (01:37.344) (1077.2MB/sec)
workers=2: Time: 88025.42 ms (01:28.025) (1191.2MB/sec)
workers=3: Time: 97711.521 ms (01:37.712) (1073.1MB/sec)
workers=4: Time: 88877.913 ms (01:28.878) (1179.8MB/sec)
workers=5: Time: 96985.978 ms (01:36.986) (1081.2MB/sec)
workers=6: Time: 92368.543 ms (01:32.369) (1135.2MB/sec)
workers=7: Time: 87498.156 ms (01:27.498) (1198.4MB/sec)
David
Attachments:
parallel_step_size.patch (application/octet-stream)
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 94eb37d48d..20a823bbad 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -520,12 +520,14 @@ heapgettup(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
/* Other processes might have already finished the scan. */
if (page == InvalidBlockNumber)
@@ -720,9 +722,11 @@ heapgettup(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
finished = (page == InvalidBlockNumber);
}
else
@@ -834,12 +838,14 @@ heapgettup_pagemode(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
/* Other processes might have already finished the scan. */
if (page == InvalidBlockNumber)
@@ -1019,9 +1025,11 @@ heapgettup_pagemode(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
finished = (page == InvalidBlockNumber);
}
else
@@ -1155,6 +1163,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
scan->rs_base.rs_nkeys = nkeys;
scan->rs_base.rs_flags = flags;
scan->rs_base.rs_parallel = parallel_scan;
+ scan->rs_base.rs_parallel_work =
+ palloc0(sizeof(ParallelBlockTableScanWorkData));
scan->rs_strategy = NULL; /* set in initscan */
/*
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index c814733b22..be84ac4947 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -404,10 +404,16 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
* to set the startblock once.
*/
void
-table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_startblock_init(Relation rel,
+ ParallelBlockTableScanWork workspace,
+ ParallelBlockTableScanDesc pbscan)
{
BlockNumber sync_startpage = InvalidBlockNumber;
+ /* Reset the state we use for controlling allocation size. */
+ memset(workspace, 0, sizeof(*workspace));
+ workspace->phsw_step_size = 1;
+
retry:
/* Grab the spinlock. */
SpinLockAcquire(&pbscan->phs_mutex);
@@ -447,7 +453,9 @@ retry:
* backend gets an InvalidBlockNumber return.
*/
BlockNumber
-table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_nextpage(Relation rel,
+ ParallelBlockTableScanWork workspace,
+ ParallelBlockTableScanDesc pbscan)
{
BlockNumber page;
uint64 nallocated;
@@ -467,7 +475,28 @@ table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbsca
* The actual page to return is calculated by adding the counter to the
* starting block number, modulo nblocks.
*/
- nallocated = pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, 1);
+ if (workspace->phsw_nallocated_range > 0)
+ {
+ nallocated = ++workspace->phsw_nallocated;
+ workspace->phsw_nallocated_range--;
+ }
+ else
+ {
+ /*
+ * Ramp up the step size as we go, until we approach the end of the
+ * scan, and then ramp it back down again to avoid unfair allocation.
+ * XXX Come up with a better algorithm!
+ */
+ if (workspace->phsw_nallocated > pbscan->phs_nblocks - 64)
+ workspace->phsw_step_size = 1;
+ else if (workspace->phsw_step_size < 64)
+ workspace->phsw_step_size *= 2;
+
+ nallocated = workspace->phsw_nallocated =
+ pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
+ workspace->phsw_step_size);
+ workspace->phsw_nallocated_range = workspace->phsw_step_size - 1;
+ }
if (nallocated >= pbscan->phs_nblocks)
page = InvalidBlockNumber; /* all blocks have been allocated */
else
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 6f0258831f..0d440c9de3 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -42,9 +42,9 @@ typedef struct TableScanDescData
*/
uint32 rs_flags;
+ void *rs_parallel_work;
struct ParallelTableScanDescData *rs_parallel; /* parallel scan
* information */
-
} TableScanDescData;
typedef struct TableScanDescData *TableScanDesc;
@@ -81,6 +81,17 @@ typedef struct ParallelBlockTableScanDescData
} ParallelBlockTableScanDescData;
typedef struct ParallelBlockTableScanDescData *ParallelBlockTableScanDesc;
+/*
+ * Per backend state for parallel table scan, for block oriented storage.
+ */
+typedef struct ParallelBlockTableScanWorkData
+{
+ uint64 phsw_nallocated;
+ int phsw_nallocated_range;
+ int phsw_step_size;
+} ParallelBlockTableScanWorkData;
+typedef struct ParallelBlockTableScanWorkData *ParallelBlockTableScanWork;
+
/*
* Base class for fetches from a table via an index. This is the base-class
* for such scans, which needs to be embedded in the respective struct for
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index eb18739c36..f0cefd7e26 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -1790,8 +1790,10 @@ extern Size table_block_parallelscan_initialize(Relation rel,
extern void table_block_parallelscan_reinitialize(Relation rel,
ParallelTableScanDesc pscan);
extern BlockNumber table_block_parallelscan_nextpage(Relation rel,
+ ParallelBlockTableScanWork pbscanwork,
ParallelBlockTableScanDesc pbscan);
extern void table_block_parallelscan_startblock_init(Relation rel,
+ ParallelBlockTableScanWork pbscanwork,
ParallelBlockTableScanDesc pbscan);
On Thu, Jun 11, 2020 at 7:18 AM David Rowley <dgrowleyml@gmail.com> wrote:
On Thu, 11 Jun 2020 at 01:24, Amit Kapila <amit.kapila16@gmail.com> wrote:
Can we try the same test with 4, 8, 16 workers as well? I don't
foresee any problem with a higher number of workers but it might be
better to check once, if it is not too much additional work.
I ran the tests again with up to 7 workers. The CPU here only has 8
cores (a laptop), so I'm not sure if there's much sense in going
higher than that?
I think it proves your point that there is a value in going for step
size greater than 64. However, I think the difference at higher sizes
is not significant. For example, the difference between 8192 and
16384 doesn't seem much if we leave out the higher worker counts, where
the data could be a bit misleading due to variation. I could see that there is
a clear and significant difference till 1024 but after that difference
is not much.
--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com
On Thu, 11 Jun 2020 at 14:09, Amit Kapila <amit.kapila16@gmail.com> wrote:
On Thu, Jun 11, 2020 at 7:18 AM David Rowley <dgrowleyml@gmail.com> wrote:
On Thu, 11 Jun 2020 at 01:24, Amit Kapila <amit.kapila16@gmail.com> wrote:
Can we try the same test with 4, 8, 16 workers as well? I don't
foresee any problem with a higher number of workers but it might be
better to check once, if it is not too much additional work.
I ran the tests again with up to 7 workers. The CPU here only has 8
cores (a laptop), so I'm not sure if there's much sense in going
higher than that?
I think it proves your point that there is a value in going for step
size greater than 64. However, I think the difference at higher sizes
is not significant. For example, the difference between 8192 and
16384 doesn't seem much if we leave out the higher worker counts, where
the data could be a bit misleading due to variation. I could see that there is
a clear and significant difference till 1024 but after that difference
is not much.
I guess the danger with going too big is that we have some Seqscan
filter that causes some workers to do very little to nothing with the
rows beyond discarding them, while other workers are left with rows
that are not filtered out and require some expensive processing. Keeping
the number of blocks on the smaller side would reduce the chances of
someone being hit by that. The algorithm I proposed above still can
be capped by doing something like nblocks = Min(1024,
pg_nextpower2_32(pbscan->phs_nblocks / 1024)); That way we'll end up
with:
rel_size | stepsize
----------+----------
16 kB | 1
32 kB | 1
64 kB | 1
128 kB | 1
256 kB | 1
512 kB | 1
1024 kB | 1
2048 kB | 1
4096 kB | 1
8192 kB | 1
16 MB | 2
32 MB | 4
64 MB | 8
128 MB | 16
256 MB | 32
512 MB | 64
1024 MB | 128
2048 MB | 256
4096 MB | 512
8192 MB | 1024
16 GB | 1024
32 GB | 1024
64 GB | 1024
128 GB | 1024
256 GB | 1024
512 GB | 1024
1024 GB | 1024
2048 GB | 1024
4096 GB | 1024
8192 GB | 1024
16 TB | 1024
32 TB | 1024
(32 rows)
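For reference, a minimal sketch of that capped calculation in C (hypothetical
helper name, not code from the patch; pg_nextpower2_32() requires a non-zero
argument, hence the small-relation guard):

#include "postgres.h"
#include "port/pg_bitutils.h"	/* pg_nextpower2_32() */
#include "storage/block.h"		/* BlockNumber */

/*
 * Hypothetical helper: aim for roughly 512-1024 chunks per relation,
 * capped at 1024 blocks per chunk and never less than 1 block.
 */
static uint32
choose_step_size(BlockNumber nblocks)
{
	if (nblocks < 1024)
		return 1;				/* avoid pg_nextpower2_32(0) */
	return Min(1024, pg_nextpower2_32(nblocks / 1024));
}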
David
On Thu, Jun 11, 2020 at 8:35 AM David Rowley <dgrowleyml@gmail.com> wrote:
On Thu, 11 Jun 2020 at 14:09, Amit Kapila <amit.kapila16@gmail.com> wrote:
On Thu, Jun 11, 2020 at 7:18 AM David Rowley <dgrowleyml@gmail.com> wrote:
On Thu, 11 Jun 2020 at 01:24, Amit Kapila <amit.kapila16@gmail.com> wrote:
Can we try the same test with 4, 8, 16 workers as well? I don't
foresee any problem with a higher number of workers but it might be
better to check once, if it is not too much additional work.
I ran the tests again with up to 7 workers. The CPU here only has 8
cores (a laptop), so I'm not sure if there's much sense in going
higher than that?
I think it proves your point that there is a value in going for step
size greater than 64. However, I think the difference at higher sizes
is not significant. For example, the difference between 8192 and
16384 doesn't seem much if we leave out the higher worker counts, where
the data could be a bit misleading due to variation. I could see that there is
a clear and significant difference till 1024 but after that difference
is not much.
I guess the danger with going too big is that we have some Seqscan
filter that causes some workers to do very little to nothing with the
rows beyond discarding them, while other workers are left with rows
that are not filtered out and require some expensive processing. Keeping
the number of blocks on the smaller side would reduce the chances of
someone being hit by that.
Right and good point.
The algorithm I proposed above still can
be capped by doing something like nblocks = Min(1024,
pg_nextpower2_32(pbscan->phs_nblocks / 1024)); That way we'll end up
with:
I think something on these lines would be a good idea, especially
keeping the step-size proportional to relation size. However, I am not
completely sure if doubling the step-size with equal increase in
relation size (ex. what is happening between 16MB~8192MB) is the best
idea. Why not double the step-size when relation size increases by
four times? Will some more tests help us to identify this? I also
don't know what is the right answer here so just trying to brainstorm.
--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com
On Thu, 11 Jun 2020 at 16:03, Amit Kapila <amit.kapila16@gmail.com> wrote:
I think something on these lines would be a good idea especially
keeping step-size proportional to relation size. However, I am not
completely sure if doubling the step-size with equal increase in
relation size (ex. what is happening between 16MB~8192MB) is the best
idea. Why not double the step-size when relation size increases by
four times? Will some more tests help us to identify this? I also
don't know what is the right answer here so just trying to brainstorm.
Brainstorming sounds good. I'm by no means under any illusion that the
formula is correct.
But, why four times? The way I did it tries to keep the number of
chunks roughly the same each time. I think the key is the number of
chunks more than the size of the chunks. Having fewer chunks increases
the chances of an imbalance of work between workers, and with what you
mention, the number of chunks will vary more than what I have proposed.
The code I showed above will produce something between 512-1024 chunks
for all cases until we reach 2^20 pages, then we start capping the chunk
size to 1024. I could probably get onboard with making it depend on
the number of parallel workers, but perhaps it would be better just to
divide by, say, 16384 rather than 1024, as I proposed above. That way
we'll be more fine-grained, but we'll still read in larger than 1024
chunk sizes when the relation gets beyond 128GB.
David
On Thu, Jun 11, 2020 at 10:13 AM David Rowley <dgrowleyml@gmail.com> wrote:
On Thu, 11 Jun 2020 at 16:03, Amit Kapila <amit.kapila16@gmail.com> wrote:
I think something on these lines would be a good idea especially
keeping step-size proportional to relation size. However, I am not
completely sure if doubling the step-size with equal increase in
relation size (ex. what is happening between 16MB~8192MB) is the best
idea. Why not double the step-size when relation size increases by
four times? Will some more tests help us to identify this? I also
don't know what is the right answer here so just trying to brainstorm.
Brainstorming sounds good. I'm by no means under any illusion that the
formula is correct.
But, why four times?
Just trying to see if we can optimize such that we use bigger
step-size for bigger relations and smaller step-size for smaller
relations.
The way I did it tries to keep the number of
chunks roughly the same each time. I think the key is the number of
chunks more than the size of the chunks. Having fewer chunks increases
the chances of an imbalance of work between workers, and with what you
mention, the number of chunks will vary more than what I have proposed.
But I think it will lead to a larger number of chunks for smaller relations.
The code I showed above will produce something between 512-1024 chunks
for all cases until we reach 2^20 pages, then we start capping the chunk
size to 1024. I could probably get onboard with making it depend on
the number of parallel workers, but perhaps it would be better just to
divide by, say, 16384 rather than 1024, as I proposed above. That way
we'll be more fine-grained, but we'll still read in larger than 1024
chunk sizes when the relation gets beyond 128GB.
I think increasing step-size might be okay for very large relations.
Another point I am thinking is that whatever formula we come up here
might not be a good fit for every case. For ex. as you mentioned
above that larger step-size can impact the performance based on
qualification, similarly there could be other things like having a
target list or qual having some function which takes more time for
certain tuples and lesser for others especially if function evaluation
is based on some column values. So, can we think of providing a
rel_option for step-size?
--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com
On Thu, 11 Jun 2020 at 23:35, Amit Kapila <amit.kapila16@gmail.com> wrote:
Another point I am thinking is that whatever formula we come up here
might not be a good fit for every case. For ex. as you mentioned
above that larger step-size can impact the performance based on
qualification, similarly there could be other things like having a
target list or qual having some function which takes more time for
certain tuples and lesser for others especially if function evaluation
is based on some column values. So, can we think of providing a
rel_option for step-size?
I think someone at some point is not going to like the automatic
choice. So perhaps a reloption to allow users to overwrite it is a
good idea. -1 should most likely mean use the automatic choice based
on relation size. I think for parallel seq scans that filter a large
portion of the records most likely need some sort of index, but there
are perhaps some genuine cases for not having one. e.g perhaps the
query is just not run often enough for an index to be worthwhile. In
that case, the performance is likely less critical, but at least the
reloption would allow users to get the old behaviour.
David
On Fri, Jun 12, 2020 at 2:24 AM David Rowley <dgrowleyml@gmail.com> wrote:
On Thu, 11 Jun 2020 at 23:35, Amit Kapila <amit.kapila16@gmail.com> wrote:
Another point I am thinking is that whatever formula we come up here
might not be a good fit for every case. For ex. as you mentioned
above that larger step-size can impact the performance based on
qualification, similarly there could be other things like having a
target list or qual having some function which takes more time for
certain tuples and lesser for others especially if function evaluation
is based on some column values. So, can we think of providing a
rel_option for step-size?
I think someone at some point is not going to like the automatic
choice. So perhaps a reloption to allow users to overwrite it is a
good idea. -1 should most likely mean use the automatic choice based
on relation size. I think for parallel seq scans that filter a large
portion of the records most likely need some sort of index, but there
are perhaps some genuine cases for not having one. e.g perhaps the
query is just not run often enough for an index to be worthwhile. In
that case, the performance is likely less critical, but at least the
reloption would allow users to get the old behaviour.
makes sense to me.
--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com
On Thu, Jun 11, 2020 at 4:54 PM David Rowley <dgrowleyml@gmail.com> wrote:
I think someone at some point is not going to like the automatic
choice. So perhaps a reloption to allow users to overwrite it is a
good idea. -1 should most likely mean use the automatic choice based
on relation size. I think for parallel seq scans that filter a large
portion of the records most likely need some sort of index, but there
are perhaps some genuine cases for not having one. e.g perhaps the
query is just not run often enough for an index to be worthwhile. In
that case, the performance is likely less critical, but at least the
reloption would allow users to get the old behaviour.
Let me play the devil's advocate here. I feel like if the step size is
limited by the relation size and there is ramp-up and ramp-down, or
maybe even if you don't have all 3 of those but perhaps say 2 of them,
the chances of there being a significant downside from using this seem
quite small. At that point I wonder whether you really need an option.
It's true that someone might not like it, but there are all sorts of
things that at least one person doesn't like and one can't cater to
all of them.
To put that another way, in what scenario do we suppose that a
reasonable person would wish to use this reloption? If there's none,
we don't need it. If there is one, can we develop a mitigation that
solves their problem automatically instead?
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Fri, Jun 12, 2020 at 11:28 PM Robert Haas <robertmhaas@gmail.com> wrote:
On Thu, Jun 11, 2020 at 4:54 PM David Rowley <dgrowleyml@gmail.com> wrote:
I think someone at some point is not going to like the automatic
choice. So perhaps a reloption to allow users to overwrite it is a
good idea. -1 should most likely mean use the automatic choice based
on relation size. I think for parallel seq scans that filter a large
portion of the records most likely need some sort of index, but there
are perhaps some genuine cases for not having one. e.g perhaps the
query is just not run often enough for an index to be worthwhile. In
that case, the performance is likely less critical, but at least the
reloption would allow users to get the old behaviour.
Let me play the devil's advocate here. I feel like if the step size is
limited by the relation size and there is ramp-up and ramp-down, or
maybe even if you don't have all 3 of those but perhaps say 2 of them,
the chances of there being a significant downside from using this seem
quite small. At that point I wonder whether you really need an option.
It's true that someone might not like it, but there are all sorts of
things that at least one person doesn't like and one can't cater to
all of them.
To put that another way, in what scenario do we suppose that a
reasonable person would wish to use this reloption?
The performance can vary based on qualification where some workers
discard more rows as compared to others, with the current system with
step-size as one, the probability of unequal work among workers is
quite low as compared to larger step-sizes.
--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com
On Sat, Jun 13, 2020 at 2:13 AM Amit Kapila <amit.kapila16@gmail.com> wrote:
The performance can vary based on qualification where some workers
discard more rows as compared to others, with the current system with
step-size as one, the probability of unequal work among workers is
quite low as compared to larger step-sizes.
It seems like this would require incredibly bad luck, though. If the
step size is less than 1/1024 of the relation size, and we ramp down
for, say, the last 5% of the relation, then the worst case is that
chunk 972 of 1024 is super-slow compared to all the other blocks, so
that it takes longer to process chunk 972 only than it does to process
chunks 973-1024 combined. It is not impossible, but that chunk has to
be like 50x worse than all the others, which doesn't seem like
something that is going to happen often enough to be worth worrying
about very much. I'm not saying it will never happen. I'm just
skeptical about the benefit of adding a GUC or reloption for a corner
case like this. I think people will fiddle with it when it isn't
really needed, and won't realize it exists in the scenarios where it
would have helped. And then, because we have the setting, we'll have
to keep it around forever, even as we improve the algorithm in other
ways, which could become a maintenance burden. I think it's better to
treat stuff like this as an implementation detail rather than
something we expect users to adjust.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Tue, 16 Jun 2020 at 03:29, Robert Haas <robertmhaas@gmail.com> wrote:
On Sat, Jun 13, 2020 at 2:13 AM Amit Kapila <amit.kapila16@gmail.com> wrote:
The performance can vary based on qualification where some workers
discard more rows as compared to others, with the current system with
step-size as one, the probability of unequal work among workers is
quite low as compared to larger step-sizes.
It seems like this would require incredibly bad luck, though. If the
step size is less than 1/1024 of the relation size, and we ramp down
for, say, the last 5% of the relation, then the worst case is that
chunk 972 of 1024 is super-slow compared to all the other blocks, so
that it takes longer to process chunk 972 only than it does to process
chunks 973-1024 combined. It is not impossible, but that chunk has to
be like 50x worse than all the others, which doesn't seem like
something that is going to happen often enough to be worth worrying
about very much. I'm not saying it will never happen. I'm just
skeptical about the benefit of adding a GUC or reloption for a corner
case like this. I think people will fiddle with it when it isn't
really needed, and won't realize it exists in the scenarios where it
would have helped.
I'm trying to think of likely scenarios where "lots of work at the
end" is going to be common. I can think of queue processing, but
everything I can think about there requires an UPDATE to the processed
flag, which won't be using parallel query anyway. There's then
processing something based on some period of time like "the last
hour", "today". For append-only tables the latest information is
likely to be at the end of the heap. For that, anyone that's getting
a SeqScan on a large relation should likely have added an index. If a
btree is too costly, then BRIN is pretty perfect for that case.
FWIW, I'm not really keen on adding a reloption or a GUC. I've also
voiced here that I'm not even keen on the ramp-up.
To summarise what's all been proposed so far:
1. Use a constant, (e.g. 64) as the parallel step size
2. Ramp up the step size over time
3. Ramp down the step size towards the end of the scan.
4. Auto-determine a good stepsize based on the size of the relation.
5. Add GUC to allow users to control or override the step size.
6. Add reloption to allow users to control or override the step size.
Here are my thoughts on each of those:
#1 is a bad idea as there are legitimate use-cases for using parallel
query on small tables. e.g calling some expensive parallel safe
function. Small tables are more likely to be cached.
#2 I don't quite understand why this is useful
#3 I understand this is to try to make it so workers all complete
around about the same time.
#4 We really should be doing it this way.
#5 Having a global knob to control something that is very specific to
the size of a relation does not make much sense to me.
#6. I imagine someone will have some weird use-case that works better
when parallel workers get 1 page at a time. I'm not convinced that
they're not doing something else wrong.
So my vote is for 4 with possibly 3, if we can come up with something
smart enough * that works well in parallel. I think there's less of a
need for this if we divided the relation into more chunks, e.g. 8192
or 16384.
* Perhaps when there are less than 2 full chunks remaining, workers
can just take half of what is left. Or more specifically
Max(pg_next_power2(remaining_blocks) / 2, 1), which ideally would work
out allocating an amount of pages proportional to the amount of beer
each mathematician receives in the "An infinite number of
mathematicians walk into a bar" joke, obviously with the exception
that we stop dividing when we get to 1. However, I'm not quite sure
how well that can be made to work with multiple bartenders working in
parallel.
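To show what that allocation rule does in practice, here is a minimal sketch (pg_nextpower2_32() standing in for the pg_next_power2 mentioned above; the zero check is an addition of mine):

/* Hypothetical: each request takes half the next power of two of what's left. */
static BlockNumber
rampdown_alloc_size(BlockNumber remaining_blocks)
{
    if (remaining_blocks == 0)
        return 0;               /* nothing left to hand out */
    return Max(pg_nextpower2_32(remaining_blocks) / 2, 1);
}

For example, with 128 blocks left the successive allocations go 64, 32, 16, 8, 4, 2, 1, 1.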
David
On Mon, Jun 15, 2020 at 8:59 PM Robert Haas <robertmhaas@gmail.com> wrote:
On Sat, Jun 13, 2020 at 2:13 AM Amit Kapila <amit.kapila16@gmail.com> wrote:
The performance can vary based on qualification where some workers
discard more rows as compared to others, with the current system with
step-size as one, the probability of unequal work among workers is
quite low as compared to larger step-sizes.
It seems like this would require incredibly bad luck, though.
I agree that won't be a common scenario but apart from that also I am
not sure if we can conclude that the proposed patch won't cause any
regressions. See one of the tests [1] done by Soumyadeep where the
patch has caused regression in one of the cases, now we can either try
to improve the patch and see we didn't cause any regressions or assume
that those are some minority cases which we don't care. Another point
is that this thread has started with a theory that this idea can give
benefits on certain filesystems and AFAICS we have tested it on one
other type of system, so not sure if that is sufficient.
Having said that, I just want to clarify that I am positive about this
work but just not very sure that it is a universal win based on the
testing done till now.
[1]: /messages/by-id/CADwEdoqirzK3H8bB=xyJ192EZCNwGfcCa_WJ5GHVM7Sv8oenuA@mail.gmail.com
--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com
On Mon, Jun 15, 2020 at 5:09 PM David Rowley <dgrowleyml@gmail.com> wrote:
To summarise what's all been proposed so far:
1. Use a constant, (e.g. 64) as the parallel step size
2. Ramp up the step size over time
3. Ramp down the step size towards the end of the scan.
4. Auto-determine a good stepsize based on the size of the relation.
5. Add GUC to allow users to control or override the step size.
6. Add reloption to allow users to control or override the step size.
Here are my thoughts on each of those:
#1 is a bad idea as there are legitimate use-cases for using parallel
query on small tables. e.g calling some expensive parallel safe
function. Small tables are more likely to be cached.
I agree.
#2 I don't quite understand why this is useful
I was thinking that if the query had a small LIMIT, you'd want to
avoid handing out excessively large chunks, but actually that seems
like it might just be fuzzy thinking on my part. We're not committing
to scanning the entirety of the chunk just because we've assigned it
to a worker.
#3 I understand this is to try to make it so workers all complete
around about the same time.
#4 We really should be doing it this way.
#5 Having a global knob to control something that is very specific to
the size of a relation does not make much sense to me.
#6. I imagine someone will have some weird use-case that works better
when parallel workers get 1 page at a time. I'm not convinced that
they're not doing something else wrong.
Agree with all of that.
So my vote is for 4 with possibly 3, if we can come up with something
smart enough * that works well in parallel. I think there's less of a
need for this if we divided the relation into more chunks, e.g. 8192
or 16384.
I agree with that too.
* Perhaps when there are less than 2 full chunks remaining, workers
can just take half of what is left. Or more specifically
Max(pg_next_power2(remaining_blocks) / 2, 1), which ideally would work
out allocating an amount of pages proportional to the amount of beer
each mathematician receives in the "An infinite number of
mathematicians walk into a bar" joke, obviously with the exception
that we stop dividing when we get to 1. However, I'm not quite sure
how well that can be made to work with multiple bartenders working in
parallel.
That doesn't sound nearly aggressive enough to me. I mean, let's
suppose that we're concerned about the scenario where one chunk takes
50x as long as all the other chunks. Well, if we have 1024 chunks
total, and we hit the problem chunk near the beginning, there will be
no problem. In effect, there are 1073 units of work instead of 1024,
and we accidentally assigned one guy 50 units of work when we thought
we were assigning 1 unit of work. If there's enough work left that we
can assign each other worker 49 units more than what we would have
done had that chunk been the same cost as all the others, then there's
no problem. So for instance if there are 4 workers, we can still even
things out if we hit the problematic chunk more than ~150 chunks from
the end. If we're closer to the end than that, there's no way to avoid
the slow chunk delaying the overall completion time, and the problem
gets worse as the problem chunk gets closer to the end.
How can we improve? Well, if when we're less than 150 chunks from the
end, we reduce the chunk size by 2x, then instead of having 1 chunk
that is 50x as expensive, hopefully we'll have 2 smaller chunks that
are each 50x as expensive. They'll get assigned to 2 different
workers, and the remaining 2 workers now need enough extra work from
other chunks to even out the work distribution, which should still be
possible. It gets tough though if breaking the one expensive chunk in
half produces 1 regular-price half-chunk and one half-chunk that is
50x as expensive as all the others. Because we have <150 chunks left,
there's no way to keep everybody else busy until the sole expensive
half-chunk completes. In a sufficiently-extreme scenario, assigning
even a single full block to a worker is too much, and you really want
to hand the tuples out individually.
Anyway, if we don't do anything special until we get down to the last
2 chunks, it's only going to make a difference when one of those last
2 chunks happens to be the expensive one. If say the third-to-last
chunk is the expensive one, subdividing the last 2 chunks lets all the
workers who didn't get the expensive chunk fight over the scraps, but
that's not an improvement. If anything it's worse, because there's
more communication overhead and you don't gain anything vs. just
assigning each chunk to a worker straight up.
In a real example we don't know that we have a single expensive chunk
-- each chunk just has its own cost, and they could all be the same,
or some could be much more expensive. When we have a lot of work left,
we can be fairly cavalier in handing out larger chunks of it with the
full confidence that even if some of those chunks turn out to be way
more expensive than others, we'll still be able to equalize the
finishing times by our choice of how to distribute the remaining work.
But as there's less and less work left, I think you need to hand out
the work in smaller increments to maximize the chances of obtaining an
equal work distribution.
So maybe one idea is to base the chunk size on the number of blocks
remaining to be scanned. Say that the chunk size is limited to 1/512
of the *remaining* blocks in the relation, probably with some upper
limit. I doubt going beyond 1GB makes any sense just because that's
how large the files are, for example, and that might be too big for
other reasons. But let's just use that as an example. Say you have a
20TB relation. You hand out 1GB segments until you get down to 512GB
remaining. Then you hand out 512MB segments until you get down to
256GB remaining, and then 256MB segments until you get down to 128GB
remaining, and so on. Once you get down to the last 4MB you're handing
out individual blocks, just as you would do from the beginning if the
whole relation size was 4MB.
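A rough sketch of that policy, using the numbers from the example above (a divisor of 512 and a 1GB ceiling, which with the default 8kB BLCKSZ is 131072 blocks; the helper name is mine):

/* Hypothetical: chunk size as a fraction of the remaining blocks, capped at ~1GB. */
static BlockNumber
remaining_based_chunk_size(BlockNumber remaining_blocks)
{
    BlockNumber size = Max(remaining_blocks / 512, 1);

    return Min(size, (BlockNumber) 131072);     /* 131072 * 8kB = 1GB */
}

That reproduces the 20TB walk-through: 1GB chunks until 512GB remain, then progressively smaller pieces, down to single blocks for the last few MB.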
This kind of thing is a bit overcomplicated and doesn't really help if
the first 1GB you hand out at the very beginning turns out to be the
1GB chunk of death, and it takes a bazillion times longer than
anything else, and it's just going to be the last worker to finish no
matter what you do about anything else. The increasing granularity
near the end is just fighting over scraps in that case. The only thing
you can do to avoid this kind of problem is use a lower maximum chunk
size from the beginning, and I think we might want to consider doing
that, because I suspect that the incremental benefits from 64MB chunks
to 1GB chunks are pretty small, for example.
But, in more normal cases where you have some somewhat-expensive
chunks mixed in with the regular-price chunks, I think this sort of
thing should work pretty well. If you never give out more than 1/512
of the remaining blocks, then you can still achieve an equal work
distribution as long as you don't hit a chunk whose cost relative to
others is more than 512/(# of processes you have - 1). So for example
with 6 processes, you need a single chunk that's more than 100x as
expensive as the others to break it. That can definitely happen,
because we can construct arbitrarily bad cases for this sort of thing,
but hopefully they wouldn't come up all that frequently...
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Tue, Jun 16, 2020 at 6:57 AM Amit Kapila <amit.kapila16@gmail.com> wrote:
I agree that won't be a common scenario but apart from that also I am
not sure if we can conclude that the proposed patch won't cause any
regressions. See one of the tests [1] done by Soumyadeep where the
patch has caused regression in one of the cases, now we can either try
to improve the patch and see we didn't cause any regressions or assume
that those are some minority cases which we don't care. Another point
is that this thread has started with a theory that this idea can give
benefits on certain filesystems and AFAICS we have tested it on one
other type of system, so not sure if that is sufficient.
Yeah, it seems like those cases might need some more investigation,
but they're also not necessarily an argument for a configuration
setting. It's not so much that I dislike the idea of being able to
configure something here; it's really that I don't want a reloption
that feels like magic. For example, we know that work_mem can be
really hard to configure because there may be no value that's high
enough to make your queries run fast during normal periods but low
enough to avoid running out of memory during busy periods. That kind
of thing sucks, and we should avoid creating more such cases.
One problem here is that the best value might depend not only on the
relation but on the individual query. A GUC could be changed
per-query, but different tables in the query might need different
values. Changing a reloption requires locking, and you wouldn't want
to have to keep changing it for each different query. Now if we figure
out that something is hardware-dependent -- like we come up with a
good formula that adjusts the value automatically most of the time,
but say it needs to be more on SSDs than on spinning disks, or the
other way around, well then that's a good candidate for some kind of
setting, maybe a tablespace option. But if it seems to depend on the
query, we need a better idea, not a user-configurable setting.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Wed, 17 Jun 2020 at 03:20, Robert Haas <robertmhaas@gmail.com> wrote:
On Mon, Jun 15, 2020 at 5:09 PM David Rowley <dgrowleyml@gmail.com> wrote:
* Perhaps when there are less than 2 full chunks remaining, workers
can just take half of what is left. Or more specifically
Max(pg_next_power2(remaining_blocks) / 2, 1), which ideally would work
out allocating an amount of pages proportional to the amount of beer
each mathematician receives in the "An infinite number of
mathematicians walk into a bar" joke, obviously with the exception
that we stop dividing when we get to 1. However, I'm not quite sure
how well that can be made to work with multiple bartenders working in
parallel.
That doesn't sound nearly aggressive enough to me. I mean, let's
suppose that we're concerned about the scenario where one chunk takes
50x as long as all the other chunks. Well, if we have 1024 chunks
total, and we hit the problem chunk near the beginning, there will be
no problem. In effect, there are 1073 units of work instead of 1024,
and we accidentally assigned one guy 50 units of work when we thought
we were assigning 1 unit of work. If there's enough work left that we
can assign each other worker 49 units more than what we would have
done had that chunk been the same cost as all the others, then there's
no problem. So for instance if there are 4 workers, we can still even
things out if we hit the problematic chunk more than ~150 chunks from
the end. If we're closer to the end than that, there's no way to avoid
the slow chunk delaying the overall completion time, and the problem
gets worse as the problem chunk gets closer to the end.
I've got something like that in the attached. Currently, I've set the
number of chunks to 2048 and I'm starting the ramp down when 64 chunks
remain, which means we'll start the ramp-down when there's about 3.1%
of the scan remaining. I didn't see the point of going with the larger
number of chunks and having ramp-down code.
Attached is the patch and an .sql file with a function which can be
used to demonstrate what chunk sizes the patch will choose and demo
the ramp-down.
e.g.
# select show_parallel_scan_chunks(1000000, 2048, 64);
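The attached .sql file isn't reproduced here, but as a rough stand-in, the following self-contained C sketch walks the same chunk-size/ramp-down logic with the patch's constants (2048 target chunks, ramp-down once 64 chunks remain); it only shows the sequence of chunk sizes, not how they interleave across workers:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    uint64_t    nblocks = 1000000;  /* pages in the relation */
    uint64_t    nchunks = 2048;     /* PARALLEL_SEQSCAN_NCHUNKS */
    uint64_t    rampdown = 64;      /* PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS */
    uint64_t    allocated = 0;
    uint64_t    chunk_size = 1;
    uint64_t    target = nblocks / nchunks > 0 ? nblocks / nchunks : 1;

    /* initial chunk size: next power of two >= nblocks / nchunks */
    while (chunk_size < target)
        chunk_size <<= 1;

    while (allocated < nblocks)
    {
        /* halve the chunk size once only ~rampdown chunks remain */
        if (chunk_size > 1 &&
            allocated > nblocks - chunk_size * rampdown)
            chunk_size >>= 1;

        printf("start=%llu size=%llu\n",
               (unsigned long long) allocated,
               (unsigned long long) chunk_size);
        allocated += chunk_size;
    }
    return 0;
}

As in the patch, the final chunk can run past nblocks; block numbers beyond the end simply aren't scanned.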
It would be really good if people could test this using the test case
mentioned in [1]. We really need to get a good idea of how this
behaves on various operating systems.
With a 32TB relation, the code will make the chunk size 16GB. Perhaps
I should change the code to cap that at 1GB.
David
[1]: /messages/by-id/CAApHDvrfJfYH51_WY-iQqPw8yGR4fDoTxAQKqn+Sa7NTKEVWtg@mail.gmail.com
Attachments:
bigger_io_chunks_for_parallel_seqscan.patchapplication/octet-stream; name=bigger_io_chunks_for_parallel_seqscan.patchDownload
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 537913d1bb..5792040bc1 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -520,12 +520,14 @@ heapgettup(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
/* Other processes might have already finished the scan. */
if (page == InvalidBlockNumber)
@@ -720,9 +722,11 @@ heapgettup(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
finished = (page == InvalidBlockNumber);
}
else
@@ -834,12 +838,14 @@ heapgettup_pagemode(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
/* Other processes might have already finished the scan. */
if (page == InvalidBlockNumber)
@@ -1019,9 +1025,11 @@ heapgettup_pagemode(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
finished = (page == InvalidBlockNumber);
}
else
@@ -1155,6 +1163,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
scan->rs_base.rs_nkeys = nkeys;
scan->rs_base.rs_flags = flags;
scan->rs_base.rs_parallel = parallel_scan;
+ scan->rs_base.rs_parallel_work =
+ palloc0(sizeof(ParallelBlockTableScanWorkData));
scan->rs_strategy = NULL; /* set in initscan */
/*
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index c814733b22..d388a744b9 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -25,10 +25,15 @@
#include "access/tableam.h"
#include "access/xact.h"
#include "optimizer/plancat.h"
+#include "port/pg_bitutils.h"
#include "storage/bufmgr.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
+/* The number of I/O chunks we try to break a parallel seqscan down into */
+#define PARALLEL_SEQSCAN_NCHUNKS 2048
+/* Ramp down size of allocations when we've only this number of chunks left */
+#define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS 64
/* GUC variables */
char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
@@ -404,10 +409,21 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
* to set the startblock once.
*/
void
-table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_startblock_init(Relation rel,
+ ParallelBlockTableScanWork workspace,
+ ParallelBlockTableScanDesc pbscan)
{
BlockNumber sync_startpage = InvalidBlockNumber;
+ /* Reset the state we use for controlling allocation size. */
+ memset(workspace, 0, sizeof(*workspace));
+
+ StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
+ "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
+
+ workspace->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
+ PARALLEL_SEQSCAN_NCHUNKS, 1));
+
retry:
/* Grab the spinlock. */
SpinLockAcquire(&pbscan->phs_mutex);
@@ -447,12 +463,35 @@ retry:
* backend gets an InvalidBlockNumber return.
*/
BlockNumber
-table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_nextpage(Relation rel,
+ ParallelBlockTableScanWork workspace,
+ ParallelBlockTableScanDesc pbscan)
{
BlockNumber page;
uint64 nallocated;
/*
+ * The logic below allocates block numbers out to parallel workers in a
+ * way that each worker will receive a set of consecutive block numbers to
+ * scan. Earlier versions of this would allocate the next highest block
+ * number to the next worker to call this function. This would generally
+ * result in the scan being divided up in stripes of 1 block in size.
+ * Some operating systems would not detect the sequential I/O pattern due
+ * to each backend being a different process which could result in poor
+ * performance due to inefficient or no readahead. To work around this
+ * issue, we now allocate a range of block numbers for each worker and
+ * when they come back for another block, we give them the next one in
+ * that range until the range is complete, we then allocate them another
+ * range of blocks to scan and return the first block number from that
+ * range.
+ *
+ * Here we name these ranges of blocks "chunks". The initial size of
+ * these chunks is determined in table_block_parallelscan_startblock_init
+ * based on the size of the relation. Towards the end of the scan, we
+ * start making reductions in the size of the chunks in order to attempt
+ * to divide the remaining work evenly over all workers as evenly as
+ * possible.
+ *
* phs_nallocated tracks how many pages have been allocated to workers
* already. When phs_nallocated >= rs_nblocks, all blocks have been
* allocated.
@@ -467,7 +506,48 @@ table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbsca
* The actual page to return is calculated by adding the counter to the
* starting block number, modulo nblocks.
*/
- nallocated = pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, 1);
+
+ /*
+ * First check if we have any remaining blocks in a previous chunk for
+ * this worker. We must consume all of the blocks from that before we
+ * allocate another chunk for the worker.
+ */
+ if (workspace->phsw_alloc_remaining > 0)
+ {
+ /*
+ * Give them the next page in the range and update the remaining pages
+ */
+ nallocated = ++workspace->phsw_nallocated;
+ workspace->phsw_alloc_remaining--;
+ }
+ else
+ {
+ /*
+ * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
+ * remaining in the scan, we half the chunk size. Since we reduce
+ * the chunk size here, we'll hit this again after doing
+ * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few
+ * iterations of this, we'll end up doing the last few blocks with the
+ * chunk size set to 1.
+ */
+ if (workspace->phsw_chunk_size > 1 &&
+ workspace->phsw_nallocated > pbscan->phs_nblocks -
+ (workspace->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
+ workspace->phsw_chunk_size >>= 1;
+
+ nallocated = workspace->phsw_nallocated =
+ pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
+ workspace->phsw_chunk_size);
+
+#ifdef TBPN_DEBUG
+ elog(NOTICE, "chunksize = %u, allocated = %u", workspace->phsw_chunk_size, nallocated);
+#endif
+ /*
+ * Set the remaining number of pages in this chunk so that subsequent
+ * calls from this worker continue on with this chunk until it's done.
+ */
+ workspace->phsw_alloc_remaining = workspace->phsw_chunk_size - 1;
+ }
if (nallocated >= pbscan->phs_nblocks)
page = InvalidBlockNumber; /* all blocks have been allocated */
else
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 6f0258831f..78d563aab1 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -42,9 +42,9 @@ typedef struct TableScanDescData
*/
uint32 rs_flags;
+ void *rs_parallel_work;
struct ParallelTableScanDescData *rs_parallel; /* parallel scan
* information */
-
} TableScanDescData;
typedef struct TableScanDescData *TableScanDesc;
@@ -81,6 +81,18 @@ typedef struct ParallelBlockTableScanDescData
} ParallelBlockTableScanDescData;
typedef struct ParallelBlockTableScanDescData *ParallelBlockTableScanDesc;
+/*
+ * Per backend state for parallel table sacan, for block oriented storage.
+ */
+typedef struct ParallelBlockTableScanWorkData
+{
+ uint64 phsw_nallocated; /* Current # of pages into the scan */
+ uint32 phsw_alloc_remaining; /* Pages left for this allocation */
+ uint32 phsw_chunk_size; /* The number of pages to take in each
+ * I/O chunk for the scan */
+} ParallelBlockTableScanWorkData;
+typedef struct ParallelBlockTableScanWorkData *ParallelBlockTableScanWork;
+
/*
* Base class for fetches from a table via an index. This is the base-class
* for such scans, which needs to be embedded in the respective struct for
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index eb18739c36..f0cefd7e26 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -1790,8 +1790,10 @@ extern Size table_block_parallelscan_initialize(Relation rel,
extern void table_block_parallelscan_reinitialize(Relation rel,
ParallelTableScanDesc pscan);
extern BlockNumber table_block_parallelscan_nextpage(Relation rel,
+ ParallelBlockTableScanWork pbscanwork,
ParallelBlockTableScanDesc pbscan);
extern void table_block_parallelscan_startblock_init(Relation rel,
+ ParallelBlockTableScanWork pbscanwork,
ParallelBlockTableScanDesc pbscan);
On Thu, Jun 18, 2020 at 6:15 AM David Rowley <dgrowleyml@gmail.com> wrote:
With a 32TB relation, the code will make the chunk size 16GB. Perhaps
I should change the code to cap that at 1GB.
It seems pretty hard to believe there's any significant advantage to a
chunk size >1GB, so I would be in favor of that change.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Fri, 19 Jun 2020 at 03:26, Robert Haas <robertmhaas@gmail.com> wrote:
On Thu, Jun 18, 2020 at 6:15 AM David Rowley <dgrowleyml@gmail.com> wrote:
With a 32TB relation, the code will make the chunk size 16GB. Perhaps
I should change the code to cap that at 1GB.
It seems pretty hard to believe there's any significant advantage to a
chunk size >1GB, so I would be in favor of that change.
I could certainly make that change. With the standard page size, 1GB
is 131072 pages and a power of 2. That would change for non-standard
page sizes, so we'd need to decide if we want to keep the chunk size a
power of 2, or just cap it exactly at whatever number of pages 1GB is.
I'm not sure how much of a difference it'll make, but I also just want
to note that synchronous scans can mean we'll start the scan anywhere
within the table, so capping to 1GB does not mean we read an entire
extent. It's more likely to span 2 extents.
David
On Fri, 19 Jun 2020 at 11:34, David Rowley <dgrowleyml@gmail.com> wrote:
On Fri, 19 Jun 2020 at 03:26, Robert Haas <robertmhaas@gmail.com> wrote:
On Thu, Jun 18, 2020 at 6:15 AM David Rowley <dgrowleyml@gmail.com> wrote:
With a 32TB relation, the code will make the chunk size 16GB. Perhaps
I should change the code to cap that at 1GB.It seems pretty hard to believe there's any significant advantage to a
chunk size >1GB, so I would be in favor of that change.I could certainly make that change. With the standard page size, 1GB
is 131072 pages and a power of 2. That would change for non-standard
page sizes, so we'd need to decide if we want to keep the chunk size a
power of 2, or just cap it exactly at whatever number of pages 1GB is.
I'm not sure how much of a difference it'll make, but I also just want
to note that synchronous scans can mean we'll start the scan anywhere
within the table, so capping to 1GB does not mean we read an entire
extent. It's more likely to span 2 extents.
Here's a patch which caps the maximum chunk size to 131072. If
someone doubles the page size then that'll be 2GB instead of 1GB. I'm
not personally worried about that.
I tested the performance on a Windows 10 laptop using the test case from [1]
Master:
workers=0: Time: 141175.935 ms (02:21.176)
workers=1: Time: 316854.538 ms (05:16.855)
workers=2: Time: 323471.791 ms (05:23.472)
workers=3: Time: 321637.945 ms (05:21.638)
workers=4: Time: 308689.599 ms (05:08.690)
workers=5: Time: 289014.709 ms (04:49.015)
workers=6: Time: 267785.270 ms (04:27.785)
workers=7: Time: 248735.817 ms (04:08.736)
Patched:
workers=0: Time: 155985.204 ms (02:35.985)
workers=1: Time: 112238.741 ms (01:52.239)
workers=2: Time: 105861.813 ms (01:45.862)
workers=3: Time: 91874.311 ms (01:31.874)
workers=4: Time: 92538.646 ms (01:32.539)
workers=5: Time: 93012.902 ms (01:33.013)
workers=6: Time: 94269.076 ms (01:34.269)
workers=7: Time: 90858.458 ms (01:30.858)
David
[1]: /messages/by-id/CAApHDvrfJfYH51_WY-iQqPw8yGR4fDoTxAQKqn+Sa7NTKEVWtg@mail.gmail.com
Attachments:
bigger_io_chunks_for_parallel_seqscan_v2.patchapplication/x-patch; name=bigger_io_chunks_for_parallel_seqscan_v2.patchDownload
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 537913d1bb..5792040bc1 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -520,12 +520,14 @@ heapgettup(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
/* Other processes might have already finished the scan. */
if (page == InvalidBlockNumber)
@@ -720,9 +722,11 @@ heapgettup(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
finished = (page == InvalidBlockNumber);
}
else
@@ -834,12 +838,14 @@ heapgettup_pagemode(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
/* Other processes might have already finished the scan. */
if (page == InvalidBlockNumber)
@@ -1019,9 +1025,11 @@ heapgettup_pagemode(HeapScanDesc scan,
{
ParallelBlockTableScanDesc pbscan =
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+ ParallelBlockTableScanWork pbscanwork =
+ (ParallelBlockTableScanWork) scan->rs_base.rs_parallel_work;
page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
- pbscan);
+ pbscanwork, pbscan);
finished = (page == InvalidBlockNumber);
}
else
@@ -1155,6 +1163,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
scan->rs_base.rs_nkeys = nkeys;
scan->rs_base.rs_flags = flags;
scan->rs_base.rs_parallel = parallel_scan;
+ scan->rs_base.rs_parallel_work =
+ palloc0(sizeof(ParallelBlockTableScanWorkData));
scan->rs_strategy = NULL; /* set in initscan */
/*
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index c814733b22..c2ee42e795 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -25,10 +25,17 @@
#include "access/tableam.h"
#include "access/xact.h"
#include "optimizer/plancat.h"
+#include "port/pg_bitutils.h"
#include "storage/bufmgr.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
+/* The number of I/O chunks we try to break a parallel seqscan down into */
+#define PARALLEL_SEQSCAN_NCHUNKS 2048
+/* Ramp down size of allocations when we've only this number of chunks left */
+#define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS 64
+/* Cap the size of parallel I/O chunks to this number of blocks */
+#define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE 131072
/* GUC variables */
char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
@@ -404,10 +411,25 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
* to set the startblock once.
*/
void
-table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_startblock_init(Relation rel,
+ ParallelBlockTableScanWork workspace,
+ ParallelBlockTableScanDesc pbscan)
{
BlockNumber sync_startpage = InvalidBlockNumber;
+ /* Reset the state we use for controlling allocation size. */
+ memset(workspace, 0, sizeof(*workspace));
+
+ StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
+ "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
+
+ workspace->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
+ PARALLEL_SEQSCAN_NCHUNKS, 1));
+
+ /* Ensure we don't go over the maximum size with larger rels */
+ workspace->phsw_chunk_size = Min(workspace->phsw_chunk_size,
+ PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
+
retry:
/* Grab the spinlock. */
SpinLockAcquire(&pbscan->phs_mutex);
@@ -447,12 +469,35 @@ retry:
* backend gets an InvalidBlockNumber return.
*/
BlockNumber
-table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_nextpage(Relation rel,
+ ParallelBlockTableScanWork workspace,
+ ParallelBlockTableScanDesc pbscan)
{
BlockNumber page;
uint64 nallocated;
/*
+ * The logic below allocates block numbers out to parallel workers in a
+ * way that each worker will receive a set of consecutive block numbers to
+ * scan. Earlier versions of this would allocate the next highest block
+ * number to the next worker to call this function. This would generally
+ * result in the scan being divided up in stripes of 1 block in size.
+ * Some operating systems would not detect the sequential I/O pattern due
+ * to each backend being a different process which could result in poor
+ * performance due to inefficient or no readahead. To work around this
+ * issue, we now allocate a range of block numbers for each worker and
+ * when they come back for another block, we give them the next one in
+ * that range until the range is complete, we then allocate them another
+ * range of blocks to scan and return the first block number from that
+ * range.
+ *
+ * Here we name these ranges of blocks "chunks". The initial size of
+ * these chunks is determined in table_block_parallelscan_startblock_init
+ * based on the size of the relation. Towards the end of the scan, we
+ * start making reductions in the size of the chunks in order to attempt
+ * to divide the remaining work evenly over all workers as evenly as
+ * possible.
+ *
* phs_nallocated tracks how many pages have been allocated to workers
* already. When phs_nallocated >= rs_nblocks, all blocks have been
* allocated.
@@ -467,7 +512,48 @@ table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbsca
* The actual page to return is calculated by adding the counter to the
* starting block number, modulo nblocks.
*/
- nallocated = pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, 1);
+
+ /*
+ * First check if we have any remaining blocks in a previous chunk for
+ * this worker. We must consume all of the blocks from that before we
+ * allocate another chunk for the worker.
+ */
+ if (workspace->phsw_alloc_remaining > 0)
+ {
+ /*
+ * Give them the next page in the range and update the remaining pages
+ */
+ nallocated = ++workspace->phsw_nallocated;
+ workspace->phsw_alloc_remaining--;
+ }
+ else
+ {
+ /*
+ * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
+ * remaining in the scan, we half the chunk size. Since we reduce
+ * the chunk size here, we'll hit this again after doing
+ * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few
+ * iterations of this, we'll end up doing the last few blocks with the
+ * chunk size set to 1.
+ */
+ if (workspace->phsw_chunk_size > 1 &&
+ workspace->phsw_nallocated > pbscan->phs_nblocks -
+ (workspace->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
+ workspace->phsw_chunk_size >>= 1;
+
+ nallocated = workspace->phsw_nallocated =
+ pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
+ workspace->phsw_chunk_size);
+
+#ifdef TBPN_DEBUG
+ elog(NOTICE, "chunksize = %u, allocated = %u", workspace->phsw_chunk_size, nallocated);
+#endif
+ /*
+ * Set the remaining number of pages in this chunk so that subsequent
+ * calls from this worker continue on with this chunk until it's done.
+ */
+ workspace->phsw_alloc_remaining = workspace->phsw_chunk_size - 1;
+ }
if (nallocated >= pbscan->phs_nblocks)
page = InvalidBlockNumber; /* all blocks have been allocated */
else
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 6f0258831f..78d563aab1 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -42,9 +42,9 @@ typedef struct TableScanDescData
*/
uint32 rs_flags;
+ void *rs_parallel_work;
struct ParallelTableScanDescData *rs_parallel; /* parallel scan
* information */
-
} TableScanDescData;
typedef struct TableScanDescData *TableScanDesc;
@@ -81,6 +81,18 @@ typedef struct ParallelBlockTableScanDescData
} ParallelBlockTableScanDescData;
typedef struct ParallelBlockTableScanDescData *ParallelBlockTableScanDesc;
+/*
+ * Per backend state for parallel table sacan, for block oriented storage.
+ */
+typedef struct ParallelBlockTableScanWorkData
+{
+ uint64 phsw_nallocated; /* Current # of pages into the scan */
+ uint32 phsw_alloc_remaining; /* Pages left for this allocation */
+ uint32 phsw_chunk_size; /* The number of pages to take in each
+ * I/O chunk for the scan */
+} ParallelBlockTableScanWorkData;
+typedef struct ParallelBlockTableScanWorkData *ParallelBlockTableScanWork;
+
/*
* Base class for fetches from a table via an index. This is the base-class
* for such scans, which needs to be embedded in the respective struct for
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index eb18739c36..f0cefd7e26 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -1790,8 +1790,10 @@ extern Size table_block_parallelscan_initialize(Relation rel,
extern void table_block_parallelscan_reinitialize(Relation rel,
ParallelTableScanDesc pscan);
extern BlockNumber table_block_parallelscan_nextpage(Relation rel,
+ ParallelBlockTableScanWork pbscanwork,
ParallelBlockTableScanDesc pbscan);
extern void table_block_parallelscan_startblock_init(Relation rel,
+ ParallelBlockTableScanWork pbscanwork,
ParallelBlockTableScanDesc pbscan);
On Thu, Jun 18, 2020 at 10:10 PM David Rowley <dgrowleyml@gmail.com> wrote:
Here's a patch which caps the maximum chunk size to 131072. If
someone doubles the page size then that'll be 2GB instead of 1GB. I'm
not personally worried about that.
Maybe use RELSEG_SIZE?
I tested the performance on a Windows 10 laptop using the test case from [1]
Master:
workers=0: Time: 141175.935 ms (02:21.176)
workers=1: Time: 316854.538 ms (05:16.855)
workers=2: Time: 323471.791 ms (05:23.472)
workers=3: Time: 321637.945 ms (05:21.638)
workers=4: Time: 308689.599 ms (05:08.690)
workers=5: Time: 289014.709 ms (04:49.015)
workers=6: Time: 267785.270 ms (04:27.785)
workers=7: Time: 248735.817 ms (04:08.736)
Patched:
workers=0: Time: 155985.204 ms (02:35.985)
workers=1: Time: 112238.741 ms (01:52.239)
workers=2: Time: 105861.813 ms (01:45.862)
workers=3: Time: 91874.311 ms (01:31.874)
workers=4: Time: 92538.646 ms (01:32.539)
workers=5: Time: 93012.902 ms (01:33.013)
workers=6: Time: 94269.076 ms (01:34.269)
workers=7: Time: 90858.458 ms (01:30.858)
Nice results. I wonder if these stack with the gains Thomas was
discussing with his DSM-from-the-main-shmem-segment patch.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Sat, 20 Jun 2020 at 08:00, Robert Haas <robertmhaas@gmail.com> wrote:
On Thu, Jun 18, 2020 at 10:10 PM David Rowley <dgrowleyml@gmail.com> wrote:
Here's a patch which caps the maximum chunk size to 131072. If
someone doubles the page size then that'll be 2GB instead of 1GB. I'm
not personally worried about that.
Maybe use RELSEG_SIZE?
I was hoping to keep the guarantee that the chunk size is always a
power of 2. If, for example, someone configured PostgreSQL
--with-segsize=3, then RELSEG_SIZE would be 393216 with the standard
BLCKSZ.
Not having it a power of 2 does mean the ramp-down is more uneven when
the sizes become very small:
postgres=# select 393216>>x from generate_Series(0,18)x;
?column?
----------
393216
196608
98304
49152
24576
12288
6144
3072
1536
768
384
192
96
48
24
12
6
3
1
(19 rows)
Perhaps that's not a problem though, but then again, perhaps just
keeping it at 131072 regardless of RELSEG_SIZE and BLCKSZ is also ok.
The benchmarks I did on Windows [1] showed that the returns diminished
once we started making the step size some decent amount so my thoughts
are that I've set PARALLEL_SEQSCAN_MAX_CHUNK_SIZE to something large
enough that it'll make no difference to the performance anyway. So
there's probably not much point in giving it too much thought.
Perhaps pg_nextpower2_32(RELSEG_SIZE) would be okay though.
David
[1]: /messages/by-id/CAApHDvopPkA+q5y_k_6CUV4U6DPhmz771VeUMuzLs3D3mWYMOg@mail.gmail.com
On Fri, 19 Jun 2020 at 14:10, David Rowley <dgrowleyml@gmail.com> wrote:
Here's a patch which caps the maximum chunk size to 131072. If
someone doubles the page size then that'll be 2GB instead of 1GB. I'm
not personally worried about that.
I tested the performance on a Windows 10 laptop using the test case from [1]
I also tested this on an AMD machine running Ubuntu 20.04 on kernel
version 5.4.0-37. I used the same 100GB table I mentioned in [1], but
with the query "select * from t where a < 0;", which saves having to
do any aggregate work.
There seems to be quite a big win with Linux too. See the attached
graphs. Both graphs are based on the same results, just the MB/sec
one takes the query time in milliseconds and converts that into MB/sec
for the 100 GB table. i.e. 100*1024/(<milliseconds> /1000)
The machine is a 64core / 128 thread AMD machine (3990x) with a 1TB
Samsung 970 Pro evo plus SSD, 64GB RAM
[1]: /messages/by-id/CAApHDvrfJfYH51_WY-iQqPw8yGR4fDoTxAQKqn+Sa7NTKEVWtg@mail.gmail.com
On Mon, 22 Jun 2020 at 16:54, David Rowley <dgrowleyml@gmail.com> wrote:
I also tested this on an AMD machine running Ubuntu 20.04 on kernel
version 5.4.0-37. I used the same 100GB table I mentioned in [1], but
with the query "select * from t where a < 0;", which saves having to
do any aggregate work.
I just wanted to add a note here that Thomas and I just discussed this
a bit offline. He recommended I try setting the kernel readahead a bit
higher.
It was set to 128kB, so I cranked it up to 2MB with:
sudo blockdev --setra 4096 /dev/nvme0n1p2
I didn't want to run the full test again as it took quite a long time,
so I just tried with 32 workers.
The first two results here are taken from the test results I just
posted 1 hour ago.
Master readahead=128kB = 89921.283 ms
v2 patch readahead=128kB = 36085.642 ms
master readahead=2MB = 60984.905 ms
v2 patch readahead=2MB = 22611.264 ms
There must be a fairly large element of reading from the page cache
there since 22.6 seconds means 4528MB/sec over the 100GB table. The
maximum for a PCIe 3.0 x4 slot is 3940MB/s
David
On Sun, Jun 21, 2020 at 6:52 PM David Rowley <dgrowleyml@gmail.com> wrote:
Perhaps that's not a problem though, but then again, perhaps just
keeping it at 131072 regardless of RELSEG_SIZE and BLCKSZ is also ok.
The benchmarks I did on Windows [1] showed that the returns diminished
once we started making the step size some decent amount so my thoughts
are that I've set PARALLEL_SEQSCAN_MAX_CHUNK_SIZE to something large
enough that it'll make no difference to the performance anyway. So
there's probably not much point in giving it too much thought.
Perhaps pg_nextpower2_32(RELSEG_SIZE) would be okay though.
I guess I don't care that much; it was just a thought. Maybe tying it
to RELSEG_SIZE is a bad idea anyway. After all, what if we find cases
where 1GB is too much? Like, how much benefit do we get from making it
1GB rather than 64MB, say? I don't think we should be making this
value big just because we can.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Mon, Jun 22, 2020 at 02:53, David Rowley <dgrowleyml@gmail.com> wrote:
On Mon, 22 Jun 2020 at 16:54, David Rowley <dgrowleyml@gmail.com> wrote:
I also tested this on an AMD machine running Ubuntu 20.04 on kernel
version 5.4.0-37. I used the same 100GB table I mentioned in [1], but
with the query "select * from t where a < 0;", which saves having to
do any aggregate work.
I just wanted to add a note here that Thomas and I just discussed this
a bit offline. He recommended I try setting the kernel readahead a bit
higher.
It was set to 128kB, so I cranked it up to 2MB with:
sudo blockdev --setra 4096 /dev/nvme0n1p2
I didn't want to run the full test again as it took quite a long time,
so I just tried with 32 workers.
The first two results here are taken from the test results I just
posted 1 hour ago.
Master readahead=128kB = 89921.283 ms
v2 patch readahead=128kB = 36085.642 ms
master readahead=2MB = 60984.905 ms
v2 patch readahead=2MB = 22611.264 ms
Hi, redoing the tests with v2 here.
Notebook with i5, 8GB RAM, 256GB SSD
Windows 10 64 bits (2004)
msvc 2019 64 bits
PostgreSQL head (with v2 patch)
Configuration: none
Connection: local ipv4 (not localhost)
create table t (a int, b text);
insert into t select x,md5(x::text) from
generate_series(1,1000000*1572.7381809)x;
vacuum freeze t;
set max_parallel_workers_per_gather = 0;
Time: 354211,826 ms (05:54,212)
set max_parallel_workers_per_gather = 1;
Time: 332805,773 ms (05:32,806)
set max_parallel_workers_per_gather = 2;
Time: 282566,711 ms (04:42,567)
set max_parallel_workers_per_gather = 3;
Time: 263383,945 ms (04:23,384)
set max_parallel_workers_per_gather = 4;
Time: 255728,259 ms (04:15,728)
set max_parallel_workers_per_gather = 5;
Time: 238288,720 ms (03:58,289)
set max_parallel_workers_per_gather = 6;
Time: 238647,792 ms (03:58,648)
set max_parallel_workers_per_gather = 7;
Time: 231295,763 ms (03:51,296)
set max_parallel_workers_per_gather = 8;
Time: 232502,828 ms (03:52,503)
set max_parallel_workers_per_gather = 9;
Time: 230970,604 ms (03:50,971)
set max_parallel_workers_per_gather = 10;
Time: 232104,182 ms (03:52,104)
set max_parallel_workers_per_gather = 8;
postgres=# explain select count(*) from t;
                                         QUERY PLAN
---------------------------------------------------------------------------------------------
 Finalize Aggregate  (cost=15564556.43..15564556.44 rows=1 width=8)
   ->  Gather  (cost=15564555.60..15564556.41 rows=8 width=8)
         Workers Planned: 8
         ->  Partial Aggregate  (cost=15563555.60..15563555.61 rows=1 width=8)
               ->  Parallel Seq Scan on t  (cost=0.00..15072074.88 rows=196592288 width=0)
(5 rows)
Questions:
1. Why acquire and release the lock inside the retry: loop?
Wouldn't this be better?
	/* Grab the spinlock. */
	SpinLockAcquire(&pbscan->phs_mutex);

retry:

	/*
	 * If the scan's startblock has not yet been initialized, we must do so
	 * now.  If this is not a synchronized scan, we just start at block 0,
	 * but if it is a synchronized scan, we must get the starting position
	 * from the synchronized scan machinery.  We can't hold the spinlock
	 * while doing that, though, so release the spinlock, get the
	 * information we need, and retry.  If nobody else has initialized the
	 * scan in the meantime, we'll fill in the value we fetched on the
	 * second time through.
	 */
	if (pbscan->phs_startblock == InvalidBlockNumber)
	{
		if (!pbscan->base.phs_syncscan)
			pbscan->phs_startblock = 0;
		else if (sync_startpage != InvalidBlockNumber)
			pbscan->phs_startblock = sync_startpage;
		else
		{
			sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
			goto retry;
		}
	}
	SpinLockRelease(&pbscan->phs_mutex);
}
Acquire lock once, before retry?
2. Is there any configuration to improve performance?
regards,
Ranier Vilela
Ranier,
This topic is largely unrelated to the current thread. Also...
On Mon, Jun 22, 2020 at 12:47 PM Ranier Vilela <ranier.vf@gmail.com> wrote:
Questions:
1. Why acquire and release the lock inside the retry: loop?
This is a super-bad idea. Note the coding rule mentioned in spin.h.
There are many discussions on this mailing list about the importance of
keeping the critical section for a spinlock to a few instructions.
Calling another function that *itself acquires an LWLock* is
definitely not OK.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
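For contrast with the excerpt above, the unmodified logic looks roughly
like this (a paraphrase of the pattern in tableam.c, not a verbatim
quote): the spinlock covers only the few instructions that touch
phs_startblock, and it is released before ss_get_location() is called,
since that function takes an LWLock internally.

retry:
	/* Grab the spinlock. */
	SpinLockAcquire(&pbscan->phs_mutex);

	if (pbscan->phs_startblock == InvalidBlockNumber)
	{
		if (!pbscan->base.phs_syncscan)
			pbscan->phs_startblock = 0;
		else if (sync_startpage != InvalidBlockNumber)
			pbscan->phs_startblock = sync_startpage;
		else
		{
			/* must not call ss_get_location() while holding the spinlock */
			SpinLockRelease(&pbscan->phs_mutex);
			sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
			goto retry;
		}
	}
	SpinLockRelease(&pbscan->phs_mutex);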
On Mon, 22 Jun 2020 at 16:33, Robert Haas <robertmhaas@gmail.com> wrote:
Ranier,
This topic is largely unrelated to the current thread. Also...
Well, I was trying to improve the patch for the current thread.
Or perhaps you are referring to something else, which I may not have
understood.
On Mon, Jun 22, 2020 at 12:47 PM Ranier Vilela <ranier.vf@gmail.com> wrote:
Questions:
1. Why acquire and release the lock inside the retry: loop?
This is a super-bad idea. Note the coding rule mentioned in spin.h.
There are many discussions on this mailing list about the importance of
keeping the critical section for a spinlock to a few instructions.
Calling another function that *itself acquires an LWLock* is
definitely not OK.
Perhaps I was not clear and there is another misunderstanding.
I am not suggesting using a function to acquire the lock.
By the way, I did run the tests with this change and it worked perfectly.
But, as it is someone else's patch, I asked in order to learn.
Also, my suggestion uses fewer instructions than the patch.
The only thing I asked is why acquire and release the lock repeatedly
within the goto retry loop, when you already hold it.
If I can acquire the lock before retry: and release it only at the end,
when leaving table_block_parallelscan_startblock_init, why not do it?
I will attach the suggested excerpt so that there is no doubt about what
I am asking.
regards,
Ranier Vilela
On Fri, Jun 19, 2020 at 2:10 PM David Rowley <dgrowleyml@gmail.com> wrote:
Here's a patch which caps the maximum chunk size to 131072. If
someone doubles the page size then that'll be 2GB instead of 1GB. I'm
not personally worried about that.
I wonder how this interacts with the sync scan feature. It has a
conflicting goal...
On Tue, 23 Jun 2020 at 09:52, Thomas Munro <thomas.munro@gmail.com> wrote:
On Fri, Jun 19, 2020 at 2:10 PM David Rowley <dgrowleyml@gmail.com> wrote:
Here's a patch which caps the maximum chunk size to 131072. If
someone doubles the page size then that'll be 2GB instead of 1GB. I'm
not personally worried about that.
I wonder how this interacts with the sync scan feature. It has a
conflicting goal...
Of course, syncscan relies on subsequent scanners finding buffers
cached, either in (ideally) shared buffers or the kernel cache. The
scans need to be roughly synchronised for that to work. If we go and
make the chunk size too big, then that'll reduce the chances of useful
buffers being found by subsequent scans. It sounds like a good reason
to try and find the smallest chunk size that allows readahead to work
well. The benchmarks I did on Windows certainly show that there are
diminishing returns when the chunk size gets larger, so capping it at
some number of megabytes would probably be a good idea. It would just
take a bit of work to figure out how many megabytes that should be.
Likely it's going to depend on the size of shared buffers and how much
memory the machine has got, but also what other work is going on that
might be evicting buffers at the same time. Perhaps something in the
range of 2-16MB would be ok. I can do some tests with that and see if
I can get the same performance as with the larger chunks.
David
On Tue, 23 Jun 2020 at 07:33, Robert Haas <robertmhaas@gmail.com> wrote:
On Mon, Jun 22, 2020 at 12:47 PM Ranier Vilela <ranier.vf@gmail.com> wrote:
Questions:
1. Why acquire and release the lock inside the retry: loop?
This is a super-bad idea. Note the coding rule mentioned in spin.h.
There are many discussions on this mailing list about the importance of
keeping the critical section for a spinlock to a few instructions.
Calling another function that *itself acquires an LWLock* is
definitely not OK.
Just a short history lesson for Ranier to help clear up any confusion:
Back before 3cda10f41 there was some merit in improving the
performance of these functions. Before that, we used to dish out pages
under a lock. With that old method, if given enough workers and a
simple enough query, we could start to see workers waiting on the lock
just to obtain the next block number they're to work on. After the
atomics were added in that commit, we didn't really see that again.
What we're trying to fix here is the I/O pattern that these functions
induce and that's all we should be doing here. Changing this is
tricky to get right as we need to consider so many operating systems
and how they deal with I/O readahead.
David
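To make the history lesson concrete, the lock-free hand-out that
replaced the old locked approach looks roughly like this (a paraphrase
of table_block_parallelscan_nextpage(), not a verbatim quote): each
worker claims the next block with a single atomic fetch-and-add rather
than under a lock.

	uint64		nallocated;
	BlockNumber page;

	/* claim the next block number with one atomic instruction */
	nallocated = pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, 1);
	if (nallocated >= pbscan->phs_nblocks)
		page = InvalidBlockNumber;	/* all blocks have been allocated */
	else
		page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;

The patches in this thread keep this shared counter but advance it by a
whole chunk at a time, so each worker then walks a run of consecutive
blocks on its own.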
On Mon, 22 Jun 2020 at 23:29, David Rowley <dgrowleyml@gmail.com> wrote:
On Tue, 23 Jun 2020 at 07:33, Robert Haas <robertmhaas@gmail.com> wrote:
On Mon, Jun 22, 2020 at 12:47 PM Ranier Vilela <ranier.vf@gmail.com> wrote:
Questions:
1. Why acquire and release the lock inside the retry: loop?
This is a super-bad idea. Note the coding rule mentioned in spin.h.
There are many discussions on this mailing list about the importance of
keeping the critical section for a spinlock to a few instructions.
Calling another function that *itself acquires an LWLock* is
definitely not OK.
Just a short history lesson for Ranier to help clear up any confusion:
Back before 3cda10f41 there was some merit in improving the
performance of these functions. Before that, we used to dish out pages
under a lock. With that old method, if given enough workers and a
simple enough query, we could start to see workers waiting on the lock
just to obtain the next block number they're to work on. After the
atomics were added in that commit, we didn't really see that again.
That is a good explanation. I had already imagined it could be to help
other processes, but I still wasn't sure.
However, I did run a test with this modification (lock before retry),
and it worked.
What we're trying to fix here is the I/O pattern that these functions
induce and that's all we should be doing here. Changing this is
tricky to get right as we need to consider so many operating systems
and how they deal with I/O readahead.
Yes, I understand that the focus here is I/O.
Sorry for the noise.
regards,
Ranier Vilela
On Tue, 23 Jun 2020 at 10:50, David Rowley <dgrowleyml@gmail.com> wrote:
On Tue, 23 Jun 2020 at 09:52, Thomas Munro <thomas.munro@gmail.com> wrote:
On Fri, Jun 19, 2020 at 2:10 PM David Rowley <dgrowleyml@gmail.com> wrote:
Here's a patch which caps the maximum chunk size to 131072. If
someone doubles the page size then that'll be 2GB instead of 1GB. I'm
not personally worried about that.
I wonder how this interacts with the sync scan feature. It has a
conflicting goal...
Of course, syncscan relies on subsequent scanners finding buffers
cached, either in (ideally) shared buffers or the kernel cache. The
scans need to be roughly synchronised for that to work. If we go and
make the chunk size too big, then that'll reduce the chances useful
buffers being found by subsequent scans. It sounds like a good reason
to try and find the smallest chunk size that allows readahead to work
well. The benchmarks I did on Windows certainly show that there are
diminishing returns when the chunk size gets larger, so capping it at
some number of megabytes would probably be a good idea. It would just
take a bit of work to figure out how many megabytes that should be.
Likely it's going to depend on the size of shared buffers and how much
memory the machine has got, but also what other work is going on that
might be evicting buffers at the same time. Perhaps something in the
range of 2-16MB would be ok. I can do some tests with that and see if
I can get the same performance as with the larger chunks.
I did some further benchmarking on both Windows 10 and on Linux with
the 5.4.0-37 kernel running on Ubuntu 20.04. I started by reducing
PARALLEL_SEQSCAN_MAX_CHUNK_SIZE down to 256 and ran the test multiple
times, each time doubling the PARALLEL_SEQSCAN_MAX_CHUNK_SIZE. On the
Linux test, I used the standard kernel readahead of 128kB. Thomas and I
discovered earlier that increasing that increases the throughput all
round.
These tests were done with PARALLEL_SEQSCAN_NCHUNKS set to 2048, which
means that with the 100GB table I used for testing, the uncapped chunk
size of 8192 blocks would be selected (aka 64MB). The performance is quite
a bit slower when the chunk size is capped to 256 blocks and it does
increase again with larger maximum chunk sizes, but the returns do get
smaller and smaller with each doubling of
PARALLEL_SEQSCAN_MAX_CHUNK_SIZE. Uncapped, or 8192 did give the best
performance on both Windows and Linux. I didn't test with anything
higher than that.
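For anyone trying to reproduce the arithmetic, here is a reconstruction
of how the chunk size appears to be selected in these tests (my sketch
based on the constants above, not the v2 patch verbatim;
choose_chunk_size() is a hypothetical helper name):

#include "postgres.h"
#include "port/pg_bitutils.h"	/* pg_nextpower2_32() */

#define PARALLEL_SEQSCAN_NCHUNKS			2048
#define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE		131072	/* blocks; v2's cap,
													 * lowered/doubled in
													 * the tests above */

/*
 * Reconstruction (not the patch itself): split the relation into about
 * PARALLEL_SEQSCAN_NCHUNKS chunks, round the per-chunk block count up to
 * a power of two, then clamp it to the cap.  A 100GB table is 13107200
 * 8kB blocks; 13107200 / 2048 = 6400, rounded up to 8192 blocks, i.e.
 * 64MB per chunk, which is well under the cap so it is used as-is.
 */
static uint32
choose_chunk_size(uint32 nblocks)
{
	uint32		chunk_size;

	chunk_size = pg_nextpower2_32(Max(nblocks / PARALLEL_SEQSCAN_NCHUNKS, 1));

	return Min(chunk_size, PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
}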
So, based on these results, it seems 64MB is not a bad value to cap
the chunk size at. If that turns out to be true for other tests too,
then likely 64MB is not too unrealistic a value to cap the size of the
block chunks to.
Please see the attached v2_on_linux.png and v2_on_windows.png for the
results of that.
I also performed another test to see how the performance looks with
both synchronize_seqscans on and off. To test this I decided that a
100GB table on a 64GB RAM machine was just not large enough, so I
increased the table size to 800GB. I set parallel_workers for the
relation to 10 and ran:
drowley@amd3990x:~$ cat bench.sql
select * from t where a < 0;
pgbench -n -f bench.sql -T 10800 -P 600 -c 6 -j 6 postgres
(This query returns 0 rows).
So each query had 11 backends (including the main process) and there
were 6 of those running concurrently, i.e. 66 backends busy working on
the problem in total.
The results of that were:
Auto chunk size selection without any cap (for an 800GB table that's
65536 blocks)
synchronize_seqscans = on: latency average = 372738.134 ms (2197.7 MB/s) <-- bad
synchronize_seqscans = off: latency average = 320204.028 ms (2558.3 MB/s)
So here it seems that synchronize_seqscans = on slows things down.
Trying again after capping the number of blocks per chunk to 8192:
synchronize_seqscans = on: latency average = 321969.172 ms (2544.3 MB/s)
synchronize_seqscans = off: latency average = 321389.523 ms (2548.9 MB/s)
So the performance there is about the same.
I was surprised to see that synchronize_seqscans = off didn't slow
down the performance by about 6x. So I tested to see what master does,
and:
synchronize_seqscans = on: latency average = 1070226.162 ms (765.4MB/s)
synchronize_seqscans = off: latency average = 1085846.859 ms (754.4MB/s)
It does pretty poorly in both cases.
The full results of that test are in the attached
800gb_table_synchronize_seqscans_test.txt file.
In summary, based on these tests, I don't think we're making anything
worse in regards to synchronize_seqscans if we cap the maximum number
of blocks to allocate to each worker at once to 8192. Perhaps there's
some argument for using something smaller than that for servers with
very little RAM, but I don't personally think so, as it still depends
on the table size, and it's hard to imagine tables in the hundreds of
GBs on servers that struggle with chunk allocations of 64MB. The
table needs to be at least ~70GB to get an 8192-block chunk size with
the current v2 patch settings.
David
Attachments:
v2_on_windows.png (image/png)
(F�
.�B!� *�T��L��������M,�P��@r���=��k�I��WZ���v���d������l���@�����b�3V��
���g�����^}lq}���}
�j �Z/�@���!�}��U|AV
�R����[o�U-���*�������U��G��~���(~�b&$�'���j�'�yC����#W^}�|��7��n���x�������������`)*��z��3_�+5q>b=2W3X,k���m�{����`����X������)���r������qcnDn�l��������g@�1��K�1Y������D�L-���y[����A����������QW5��Ix�"�x���n���Xw@��w_��u�H�3�JB.U��j�>�*��c]j�G�%9������yxx�w$d�@�@��x��d��bD��>�X�����Q��V� �c�v��9www�L���9�b2D"""|�]�v�����c� q&�Up������>