Creating foreign key on partitioned table is too slow
Hello
To benchmark with tpcb model, I tried to create a foreign key in the partitioned history table, but backend process killed by OOM.
the number of partitions is 8192. I tried in master(commit: ad4b7aeb84).
I did the same thing in another server which has 200GB memory, but creating foreign key did not end in 24 hours.
Is the community aware of this? is anyone working on this?
If you are discussing, please let me know the thread.
Table definition and pstack are as follows.
* table definition *
CREATE TABLE accounts (aid INTEGER, bid INTEGER, abalance INTEGER, filler CHAR(84)) PARTITION BY HASH(aid);
CREATE TABLE history (tid INTEGER, bid INTEGER, aid INTEGER, delta INTEGER, mtime TIMESTAMP, filler CHAR(22)) PARTITION BY HASH(aid);
\o /dev/null
SELECT 'CREATE TABLE accounts_' || p || ' PARTITION OF accounts FOR VALUES WITH (modulus 8192, remainder ' || p || ');' FROM generate_series(0, 8191) p;
\gexec
SELECT 'CREATE TABLE history_' || p || ' PARTITION OF history FOR VALUES WITH (modulus 8192, remainder ' || p || ');' FROM generate_series(0, 8191) p;
\gexec
\o
ALTER TABLE accounts ADD CONSTRAINT accounts_pk PRIMARY KEY (aid);
ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);
* pstack before killed by OOM *
#0 0x0000000000a84aec in ReleaseSysCache (tuple=0x7fbb0a15dc28) at syscache.c:1175
#1 0x0000000000a7135d in get_rel_relkind (relid=164628) at lsyscache.c:1816
#2 0x0000000000845f0a in RelationBuildPartitionDesc (rel=0x7fbadb9bfb10) at partdesc.c:230
#3 0x0000000000a78b9a in RelationBuildDesc (targetRelId=139268, insertIt=false) at relcache.c:1173
#4 0x0000000000a7b52e in RelationClearRelation (relation=0x7fbb0a1393e8, rebuild=true) at relcache.c:2534
#5 0x0000000000a7bacf in RelationFlushRelation (relation=0x7fbb0a1393e8) at relcache.c:2692
#6 0x0000000000a7bbe1 in RelationCacheInvalidateEntry (relationId=139268) at relcache.c:2744
#7 0x0000000000a6e11d in LocalExecuteInvalidationMessage (msg=0x7fbadb62e480) at inval.c:589
#8 0x0000000000a6de7d in ProcessInvalidationMessages (hdr=0x1d36d48, func=0xa6e01a <LocalExecuteInvalidationMessage>) at inval.c:460
#9 0x0000000000a6e94e in CommandEndInvalidationMessages () at inval.c:1095
#10 0x0000000000559c93 in AtCCI_LocalCache () at xact.c:1458
#11 0x00000000005596ac in CommandCounterIncrement () at xact.c:1040
#12 0x00000000006b1811 in addFkRecurseReferenced (wqueue=0x7fffcb0a0588, fkconstraint=0x20cf6a0, rel=0x7fbb0a1393e8, pkrel=0x7fbadb9bbe90, indexOid=189582, parentConstr=204810, numfks=1, pkattnum=0x7fffcb0a0190, fkattnum=0x7fffcb0a0150, pfeqoperators=0x7fffcb09ff50, ppeqoperators=0x7fffcb09fed0, ffeqoperators=0x7fffcb09fe50, old_check_ok=false) at tablecmds.c:8168
#13 0x00000000006b1a0b in addFkRecurseReferenced (wqueue=0x7fffcb0a0588, fkconstraint=0x20cf6a0, rel=0x7fbb0a1393e8, pkrel=0x7fbadc188840, indexOid=188424, parentConstr=0, numfks=1, pkattnum=0x7fffcb0a0190, fkattnum=0x7fffcb0a0150, pfeqoperators=0x7fffcb09ff50, ppeqoperators=0x7fffcb09fed0, ffeqoperators=0x7fffcb09fe50, old_check_ok=false) at tablecmds.c:8219
#14 0x00000000006b13e0 in ATAddForeignKeyConstraint (wqueue=0x7fffcb0a0588, tab=0x20cf4d8, rel=0x7fbb0a1393e8, fkconstraint=0x20cf6a0, parentConstr=0, recurse=true, recursing=false, lockmode=6) at tablecmds.c:8005
#15 0x00000000006afa0c in ATExecAddConstraint (wqueue=0x7fffcb0a0588, tab=0x20cf4d8, rel=0x7fbb0a1393e8, newConstraint=0x20cf6a0, recurse=true, is_readd=false, lockmode=6) at tablecmds.c:7419
#16 0x00000000006a8a7a in ATExecCmd (wqueue=0x7fffcb0a0588, tab=0x20cf4d8, rel=0x7fbb0a1393e8, cmd=0x20cf648, lockmode=6) at tablecmds.c:4300
#17 0x00000000006a8448 in ATRewriteCatalogs (wqueue=0x7fffcb0a0588, lockmode=6) at tablecmds.c:4185
#18 0x00000000006a7bf9 in ATController (parsetree=0x1cb4350, rel=0x7fbb0a1393e8, cmds=0x20cf428, recurse=true, lockmode=6) at tablecmds.c:3843
#19 0x00000000006a78a4 in AlterTable (relid=139268, lockmode=6, stmt=0x1cb4350) at tablecmds.c:3504
#20 0x0000000000914999 in ProcessUtilitySlow (pstate=0x1cb3a10, pstmt=0x1c91380, queryString=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at utility.c:1131
#21 0x0000000000914490 in standard_ProcessUtility (pstmt=0x1c91380, queryString=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at utility.c:927
#22 0x0000000000913534 in ProcessUtility (pstmt=0x1c91380, queryString=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at utility.c:360
#23 0x000000000091245a in PortalRunUtility (portal=0x1cf5ee0, pstmt=0x1c91380, isTopLevel=true, setHoldSnapshot=false, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at pquery.c:1175
#24 0x0000000000912671 in PortalRunMulti (portal=0x1cf5ee0, isTopLevel=true, setHoldSnapshot=false, dest=0x1c91470, altdest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at pquery.c:1321
#25 0x0000000000911ba6 in PortalRun (portal=0x1cf5ee0, count=9223372036854775807, isTopLevel=true, run_once=true, dest=0x1c91470, altdest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at pquery.c:796
#26 0x000000000090b9ad in exec_simple_query (query_string=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);") at postgres.c:1231
#27 0x000000000090fd13 in PostgresMain (argc=1, argv=0x1cb9fb8, dbname=0x1cb9ed0 "postgres", username=0x1cb9eb0 "k5user") at postgres.c:4256
#28 0x0000000000864cbd in BackendRun (port=0x1cb1e90) at postmaster.c:4498
#29 0x000000000086449b in BackendStartup (port=0x1cb1e90) at postmaster.c:4189
#30 0x00000000008608d7 in ServerLoop () at postmaster.c:1727
#31 0x000000000086018d in PostmasterMain (argc=1, argv=0x1c8aa40) at postmaster.c:1400
#32 0x0000000000770835 in main (argc=1, argv=0x1c8aa40) at main.c:210
regards,
Kato Sho
On 2019-Oct-23, kato-sho@fujitsu.com wrote:
Hello
To benchmark with tpcb model, I tried to create a foreign key in the partitioned history table, but backend process killed by OOM.
the number of partitions is 8192. I tried in master(commit: ad4b7aeb84).I did the same thing in another server which has 200GB memory, but creating foreign key did not end in 24 hours.
Thanks for reporting. It sounds like there must be a memory leak here.
I am fairly pressed for time at present so I won't be able to
investigate this until, at least, mid November.
--
�lvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On Thu, Oct 24, 2019 at 03:48:57PM -0300, Alvaro Herrera wrote:
On 2019-Oct-23, kato-sho@fujitsu.com wrote:
Hello
To benchmark with tpcb model, I tried to create a foreign key in the partitioned history table, but backend process killed by OOM.
the number of partitions is 8192. I tried in master(commit: ad4b7aeb84).I did the same thing in another server which has 200GB memory, but creating foreign key did not end in 24 hours.
Thanks for reporting. It sounds like there must be a memory leak here.
I am fairly pressed for time at present so I won't be able to
investigate this until, at least, mid November.
I've briefly looked into this, and I think the main memory leak is in
RelationBuildPartitionDesc. It gets called with PortalContext, it
allocates a lot of memory building the descriptor, copies it into
CacheContext but does not even try to free anything. So we end up with
something like this:
TopMemoryContext: 215344 total in 11 blocks; 47720 free (12 chunks); 167624 used
pgstat TabStatusArray lookup hash table: 32768 total in 3 blocks; 9160 free (4 chunks); 23608 used
TopTransactionContext: 4194304 total in 10 blocks; 1992968 free (18 chunks); 2201336 used
RowDescriptionContext: 8192 total in 1 blocks; 6880 free (0 chunks); 1312 used
MessageContext: 8192 total in 1 blocks; 3256 free (1 chunks); 4936 used
Operator class cache: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
smgr relation table: 32768 total in 3 blocks; 16768 free (8 chunks); 16000 used
TransactionAbortContext: 32768 total in 1 blocks; 32504 free (0 chunks); 264 used
Portal hash: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
TopPortalContext: 8192 total in 1 blocks; 7648 free (0 chunks); 544 used
PortalContext: 1557985728 total in 177490 blocks; 9038656 free (167645 chunks); 1548947072 used:
Relcache by OID: 16384 total in 2 blocks; 3424 free (3 chunks); 12960 used
CacheMemoryContext: 17039424 total in 13 blocks; 7181480 free (9 chunks); 9857944 used
partition key: 1024 total in 1 blocks; 168 free (0 chunks); 856 used: history
index info: 2048 total in 2 blocks; 568 free (1 chunks); 1480 used: pg_class_tblspc_relfilenode_index
...
index info: 2048 total in 2 blocks; 872 free (0 chunks); 1176 used: pg_class_oid_index
WAL record construction: 49776 total in 2 blocks; 6344 free (0 chunks); 43432 used
PrivateRefCount: 8192 total in 1 blocks; 2584 free (0 chunks); 5608 used
MdSmgr: 8192 total in 1 blocks; 5976 free (0 chunks); 2216 used
LOCALLOCK hash: 65536 total in 4 blocks; 18584 free (12 chunks); 46952 used
Timezones: 104128 total in 2 blocks; 2584 free (0 chunks); 101544 used
ErrorContext: 8192 total in 1 blocks; 6840 free (4 chunks); 1352 used
Grand total: 1580997216 bytes in 177834 blocks; 18482808 free (167857 chunks); 1562514408 used
(At which point I simply interrupted the query, it'd allocate more and
more memory until an OOM).
The attached patch trivially fixes that by adding a memory context
tracking all the temporary data, and then just deletes it as a whole at
the end of the function. This significantly reduces the memory usage for
me, not sure it's 100% correct.
FWIW, even with this fix it still takes an awful lot to create the
foreign key, because the CPU is stuck doing this
60.78% 60.78% postgres postgres [.] bms_equal
32.58% 32.58% postgres postgres [.] get_eclass_for_sort_expr
3.83% 3.83% postgres postgres [.] add_child_rel_equivalences
0.23% 0.00% postgres [unknown] [.] 0x0000000000000005
0.22% 0.00% postgres [unknown] [.] 0000000000000000
0.18% 0.18% postgres postgres [.] AllocSetCheck
...
Haven't looked into the details yet.
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachments:
memleak-fix.patchtext/plain; charset=us-asciiDownload+12-0
On Fri, Oct 25, 2019 at 12:17:58AM +0200, Tomas Vondra wrote:
...
FWIW, even with this fix it still takes an awful lot to create the
foreign key, because the CPU is stuck doing this60.78% 60.78% postgres postgres [.] bms_equal
32.58% 32.58% postgres postgres [.] get_eclass_for_sort_expr
3.83% 3.83% postgres postgres [.] add_child_rel_equivalences
0.23% 0.00% postgres [unknown] [.] 0x0000000000000005
0.22% 0.00% postgres [unknown] [.] 0000000000000000
0.18% 0.18% postgres postgres [.] AllocSetCheck
...Haven't looked into the details yet.
OK, a bit more info. A better perf report (with framepointers etc) looks
like this:
+ 99.98% 0.00% postgres [unknown] [.] 0x495641002c4d133d
+ 99.98% 0.00% postgres libc-2.29.so [.] __libc_start_main
+ 99.98% 0.00% postgres postgres [.] startup_hacks
+ 99.98% 0.00% postgres postgres [.] PostmasterMain
+ 99.98% 0.00% postgres postgres [.] ServerLoop
+ 99.98% 0.00% postgres postgres [.] BackendStartup
+ 99.98% 0.00% postgres postgres [.] ExitPostmaster
+ 99.98% 0.00% postgres postgres [.] PostgresMain
+ 99.98% 0.00% postgres postgres [.] exec_simple_query
+ 99.98% 0.00% postgres postgres [.] PortalRun
+ 99.98% 0.00% postgres postgres [.] PortalRunMulti
+ 99.98% 0.00% postgres postgres [.] PortalRunUtility
+ 99.98% 0.00% postgres postgres [.] ProcessUtility
+ 99.98% 0.00% postgres postgres [.] standard_ProcessUtility
+ 99.98% 0.00% postgres postgres [.] ProcessUtilitySlow
+ 99.98% 0.00% postgres postgres [.] AlterTable
+ 99.98% 0.00% postgres postgres [.] ATController
+ 99.98% 0.00% postgres postgres [.] ATRewriteTables
+ 99.98% 0.00% postgres postgres [.] validateForeignKeyConstraint
+ 99.98% 0.00% postgres postgres [.] RI_Initial_Check
+ 99.96% 0.00% postgres postgres [.] SPI_execute_snapshot
+ 99.86% 0.00% postgres postgres [.] _SPI_execute_plan
+ 99.70% 0.00% postgres postgres [.] GetCachedPlan
+ 99.70% 0.00% postgres postgres [.] BuildCachedPlan
+ 99.66% 0.00% postgres postgres [.] pg_plan_queries
+ 99.66% 0.00% postgres postgres [.] pg_plan_query
+ 99.66% 0.00% postgres postgres [.] planner
+ 99.66% 0.00% postgres postgres [.] standard_planner
+ 99.62% 0.00% postgres postgres [.] subquery_planner
+ 99.62% 0.00% postgres postgres [.] grouping_planner
+ 99.62% 0.00% postgres postgres [.] query_planner
+ 99.31% 0.00% postgres postgres [.] make_one_rel
+ 97.53% 0.00% postgres postgres [.] set_base_rel_pathlists
+ 97.53% 0.00% postgres postgres [.] set_rel_pathlist
+ 97.53% 0.01% postgres postgres [.] set_append_rel_pathlist
+ 97.42% 0.00% postgres postgres [.] set_plain_rel_pathlist
+ 97.40% 0.02% postgres postgres [.] create_index_paths
+ 97.16% 0.01% postgres postgres [.] get_index_paths
+ 97.12% 0.02% postgres postgres [.] build_index_paths
+ 96.67% 0.01% postgres postgres [.] build_index_pathkeys
+ 96.61% 0.01% postgres postgres [.] make_pathkey_from_sortinfo
+ 95.70% 21.27% postgres postgres [.] get_eclass_for_sort_expr
+ 75.21% 75.21% postgres postgres [.] bms_equal
+ 48.72% 0.00% postgres postgres [.] consider_index_join_clauses
+ 48.72% 0.00% postgres postgres [.] consider_index_join_outer_rels
+ 48.72% 0.02% postgres postgres [.] get_join_index_paths
+ 1.78% 0.00% postgres postgres [.] set_base_rel_sizes
+ 1.78% 0.00% postgres postgres [.] set_rel_size
+ 1.78% 0.01% postgres postgres [.] set_append_rel_size
+ 1.66% 1.34% postgres postgres [.] add_child_rel_equivalences
It is (pretty much) a single callstack, i.e. each function is simply
calling the one below it (with some minor exceptions at the end, but
that's pretty negligible here).
This essentially says that planning queries executed by RI_Initial_Check
with many partitions is damn expensive. An example query is this one:
test=# \timing
test=# SELECT fk."aid" FROM ONLY "public"."history_60" fk LEFT OUTER
JOIN "public"."accounts" pk ON ( pk."aid" OPERATOR(pg_catalog.=)
fk."aid") WHERE pk."aid" IS NULL AND (fk."aid" IS NOT NULL);
aid
-----
(0 rows)
Time: 28791.492 ms (00:28.791)
Bear in mind those are *empty* tables, so the execution is pretty cheap
(explain analyze says the execution takes ~65ms, but the planning itself
takes ~28 seconds). And we have 8192 such partitions, which means we'd
spend ~230k seconds just planning the RI queries. That's 64 hours.
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Hi,
On 2019-10-23 05:59:01 +0000, kato-sho@fujitsu.com wrote:
To benchmark with tpcb model, I tried to create a foreign key in the partitioned history table, but backend process killed by OOM.
the number of partitions is 8192. I tried in master(commit: ad4b7aeb84).
Obviously this should be improved. But I think it's also worthwhile to
note that using 8k partitions is very unlikely to be a good choice for
anything. The metadata, partition pruning, etc overhead is just going to
be very substantial.
Greetings,
Andres Freund
On Thu, Oct 24, 2019 at 04:28:38PM -0700, Andres Freund wrote:
Hi,
On 2019-10-23 05:59:01 +0000, kato-sho@fujitsu.com wrote:
To benchmark with tpcb model, I tried to create a foreign key in the partitioned history table, but backend process killed by OOM.
the number of partitions is 8192. I tried in master(commit: ad4b7aeb84).Obviously this should be improved. But I think it's also worthwhile to
note that using 8k partitions is very unlikely to be a good choice for
anything. The metadata, partition pruning, etc overhead is just going to
be very substantial.
True. Especially with two partitioned tables, each with 8k partitions.
I do think it makes sense to reduce the memory usage, because just
eating all available memory (in the extreme case) is not very nice. I've
added that patch to the CF, although the patch I shared is very crude
and I'm by no means suggesting it's how it should be done ultimately.
The other bit (speed of planning with 8k partitions) is probably a more
general issue, and I suppose we'll improve that over time. I don't think
there's a simple change magically improving that.
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On Thu, 31 Oct 2019 at 07:30, Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:
On Thu, Oct 24, 2019 at 04:28:38PM -0700, Andres Freund wrote:
Hi,
On 2019-10-23 05:59:01 +0000, kato-sho@fujitsu.com wrote:
To benchmark with tpcb model, I tried to create a foreign key in the partitioned history table, but backend process killed by OOM.
the number of partitions is 8192. I tried in master(commit: ad4b7aeb84).Obviously this should be improved. But I think it's also worthwhile to
note that using 8k partitions is very unlikely to be a good choice for
anything. The metadata, partition pruning, etc overhead is just going to
be very substantial.True. Especially with two partitioned tables, each with 8k partitions.
In Ottawa this year, Andres and I briefly talked about the possibility
of making a series of changes to how equalfuncs.c works. The idea was
to make it easy by using some pre-processor magic to allow us to
create another version of equalfuncs which would let us have an equal
comparison function that returns -1 / 0 / +1 rather than just true or
false. This would allow us to Binary Search Trees of objects. I
identified that EquivalenceClass.ec_members would be better written as
a BST to allow much faster lookups in get_eclass_for_sort_expr().
The implementation I had in mind for the BST was a compact tree that
instead of using pointers for the left and right children, it just
uses an integer to reference the array element number. This would
allow us to maintain very fast insert-order traversals. Deletes would
need to decrement all child references greater than the deleted index.
This is sort of on-par with how the new List implementation in master.
i.e deletes take additional effort, but inserts are fast if there's
enough space in the array for a new element, traversals are
cache-friendly, etc. I think trees might be better than hash tables
for this as a hash function needs to hash all fields, whereas a
comparison function can stop when it finds the first non-match.
This may also be able to help simplify the code in setrefs.c to get
rid of the complex code around indexed tlists. tlist_member() would
become O(log n) instead of O(n), so perhaps there'd be not much point
in having both search_indexed_tlist_for_var() and
search_indexed_tlist_for_non_var().
--
David Rowley http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
Hi,
On 2019-10-31 11:19:05 +1300, David Rowley wrote:
In Ottawa this year, Andres and I briefly talked about the possibility
of making a series of changes to how equalfuncs.c works. The idea was
to make it easy by using some pre-processor magic to allow us to
create another version of equalfuncs which would let us have an equal
comparison function that returns -1 / 0 / +1 rather than just true or
false.
See also the thread at
/messages/by-id/20190920051857.2fhnvhvx4qdddviz@alap3.anarazel.de
which would make this fairly easy, without having to compile equalfuncs
twice or such.
Greetings,
Andres Freund
David Rowley <david.rowley@2ndquadrant.com> writes:
In Ottawa this year, Andres and I briefly talked about the possibility
of making a series of changes to how equalfuncs.c works. The idea was
to make it easy by using some pre-processor magic to allow us to
create another version of equalfuncs which would let us have an equal
comparison function that returns -1 / 0 / +1 rather than just true or
false. This would allow us to Binary Search Trees of objects. I
identified that EquivalenceClass.ec_members would be better written as
a BST to allow much faster lookups in get_eclass_for_sort_expr().
This seems like a good idea, but why would we want to maintain two
versions? Just change equalfuncs.c into comparefuncs.c, full stop.
equal() would be a trivial wrapper for (compare_objects(a,b) == 0).
Andres' ideas about autogenerating all that boilerplate aren't
bad, but that's no justification for carrying two full sets of
per-node logic when one set would do.
regards, tom lane
On Thu, 31 Oct 2019 at 17:56, Tom Lane <tgl@sss.pgh.pa.us> wrote:
David Rowley <david.rowley@2ndquadrant.com> writes:
In Ottawa this year, Andres and I briefly talked about the possibility
of making a series of changes to how equalfuncs.c works. The idea was
to make it easy by using some pre-processor magic to allow us to
create another version of equalfuncs which would let us have an equal
comparison function that returns -1 / 0 / +1 rather than just true or
false. This would allow us to Binary Search Trees of objects. I
identified that EquivalenceClass.ec_members would be better written as
a BST to allow much faster lookups in get_eclass_for_sort_expr().This seems like a good idea, but why would we want to maintain two
versions? Just change equalfuncs.c into comparefuncs.c, full stop.
equal() would be a trivial wrapper for (compare_objects(a,b) == 0).
If we can do that without slowing down the comparison, then sure.
Checking which node sorts earlier is a bit more expensive than just
checking for equality. But if that's not going to be noticeable in
real-world test cases, then I agree.
--
David Rowley http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
Hello,
On Fri, Oct 25, 2019 at 7:18 AM Tomas Vondra
<tomas.vondra@2ndquadrant.com> wrote:
On Thu, Oct 24, 2019 at 03:48:57PM -0300, Alvaro Herrera wrote:
On 2019-Oct-23, kato-sho@fujitsu.com wrote:
Hello
To benchmark with tpcb model, I tried to create a foreign key in the partitioned history table, but backend process killed by OOM.
the number of partitions is 8192. I tried in master(commit: ad4b7aeb84).I did the same thing in another server which has 200GB memory, but creating foreign key did not end in 24 hours.
Thanks for reporting.
Thank you Kato-san.
It sounds like there must be a memory leak here.
I am fairly pressed for time at present so I won't be able to
investigate this until, at least, mid November.I've briefly looked into this, and I think the main memory leak is in
RelationBuildPartitionDesc. It gets called with PortalContext, it
allocates a lot of memory building the descriptor, copies it into
CacheContext but does not even try to free anything. So we end up with
something like this:
...
The attached patch trivially fixes that by adding a memory context
tracking all the temporary data, and then just deletes it as a whole at
the end of the function. This significantly reduces the memory usage for
me, not sure it's 100% correct.
Thank you Tomas. I think we have considered this temporary context
fix a number of times before, but it got stalled for one reason or
another ([1]/messages/by-id/CA+TgmoY3bRmGB6-DUnoVy5fJoreiBJ43rwMrQRCdPXuKt4Ykaw@mail.gmail.com comes to mind as the last thread where this came up).
Another angle to look at this is that our design where PartitionDesc
is rebuilt on relcache reload of the parent relation is not a great
one after all. It seems that we're rightly (?) invalidating the
parent's relcache 8192 times in this case, because its cacheable
foreign key descriptor changes on processing each partition, but
PartitionDesc itself doesn't change. Having to pointlessly rebuild it
8192 times seems really wasteful.
I recall a discussion where it was proposed to build PartitionDesc
only when needed as opposed on every relcache reload of the parent
relation. Attached PoC-at-best patch that does that seems to go
through without OOM and passes make check-world. I think this should
have a very minor impact on select queries.
But...
FWIW, even with this fix it still takes an awful lot to create the
foreign key, because the CPU is stuck doing this60.78% 60.78% postgres postgres [.] bms_equal
32.58% 32.58% postgres postgres [.] get_eclass_for_sort_expr
3.83% 3.83% postgres postgres [.] add_child_rel_equivalences
0.23% 0.00% postgres [unknown] [.] 0x0000000000000005
0.22% 0.00% postgres [unknown] [.] 0000000000000000
0.18% 0.18% postgres postgres [.] AllocSetCheck
...we have many problems to solve here. :-(
Thanks,
Amit
[1]: /messages/by-id/CA+TgmoY3bRmGB6-DUnoVy5fJoreiBJ43rwMrQRCdPXuKt4Ykaw@mail.gmail.com
Attachments:
build-PartitionDesc-when-needed-PoC.patchapplication/octet-stream; name=build-PartitionDesc-when-needed-PoC.patchDownload+21-7
On 2019-Oct-25, Tomas Vondra wrote:
The attached patch trivially fixes that by adding a memory context
tracking all the temporary data, and then just deletes it as a whole at
the end of the function. This significantly reduces the memory usage for
me, not sure it's 100% correct.
FWIW we already had this code (added by commit 2455ab48844c), but it was
removed by commit d3f48dfae42f. I think we should put it back. (I
think it may be useful to use a static MemoryContext that we can just
reset each time, instead of creating and deleting each time, to save on
memcxt churn. That'd make the function non-reentrant, but I don't see
that we'd make the catalogs partitioned any time soon. This may be
premature optimization though -- not really wedded to it.)
With Amit's patch to make RelationBuildPartitionDesc called lazily, the
time to plan the RI_InitialCheck query (using Kato Sho's test case) goes
from 30 seconds to 14 seconds on my laptop. Obviously there's more that
we'd need to fix to make the scenario fully supported, but it seems a
decent step forward.
--
�lvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Alvaro Herrera <alvherre@2ndquadrant.com> writes:
On 2019-Oct-25, Tomas Vondra wrote:
The attached patch trivially fixes that by adding a memory context
tracking all the temporary data, and then just deletes it as a whole at
the end of the function. This significantly reduces the memory usage for
me, not sure it's 100% correct.
FWIW we already had this code (added by commit 2455ab48844c), but it was
removed by commit d3f48dfae42f. I think we should put it back.
I disagree. The point of d3f48dfae42f is that the management of that
leakage is now being done at the caller level, and I'm quite firmly
against having RelationBuildPartitionDesc duplicate that. If we
don't like the amount of space RelationBuildPartitionDesc is leaking,
we aren't going to like the amount of space that sibling routines
such as RelationBuildTriggers leak, either.
What we ought to be thinking about instead is adjusting the
RECOVER_RELATION_BUILD_MEMORY heuristic in relcache.c. I am not
sure what it ought to look like, but I doubt that "do it all the
time" has suddenly become the right answer, when it wasn't the
right answer for 20-something years.
It's conceivable that "do it if CCA is on, or if the current
table is a partition child table" is a reasonable approach.
But I'm not sure whether we can know the relation relkind
early enough for that :-(
(BTW, a different question one could ask is exactly why
RelationBuildPartitionDesc is so profligate of leaked memory.)
regards, tom lane
On 2019-Nov-13, Tom Lane wrote:
(BTW, a different question one could ask is exactly why
RelationBuildPartitionDesc is so profligate of leaked memory.)
The original partitioning code (f0e44751d717) decided that it didn't
want to bother with adding a "free" routine for PartitionBoundInfo
structs, maybe because it had too many pointers, so there's no way for
RelationBuildPartitionDesc to free everything it allocates anyway. We
could add a couple of pfrees and list_frees here and there, but for the
main thing being leaked we'd need to improve that API.
--
�lvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On 2019-Nov-13, Alvaro Herrera wrote:
On 2019-Nov-13, Tom Lane wrote:
(BTW, a different question one could ask is exactly why
RelationBuildPartitionDesc is so profligate of leaked memory.)The original partitioning code (f0e44751d717) decided that it didn't
want to bother with adding a "free" routine for PartitionBoundInfo
structs, maybe because it had too many pointers, so there's no way for
RelationBuildPartitionDesc to free everything it allocates anyway. We
could add a couple of pfrees and list_frees here and there, but for the
main thing being leaked we'd need to improve that API.
Ah, we also leak an array of PartitionBoundSpec, which is a Node. Do we
have any way to free those? I don't think we do.
In short, it looks to me as if this function was explicitly designed
with the idea that it'd be called in a temp mem context.
I looked at d3f48dfae42f again per your earlier suggestion. Doing that
memory context dance for partitioned relations does seem to fix the
problem too; we just need to move the context creation to just after
ScanPgRelation, at which point we have the relkind. (Note: I think the
problematic case is the partitioned table, not the partitions
themselves. At least, with the attached patch the problem goes away. I
guess it would be sensible to research whether we need to do this for
relispartition=true as well, but I haven't done that.)
There is indeed some leakage for relations that have triggers too (or
rules), but in order for those to become significant you would have to
have thousands of triggers or rules ... and in reasonable designs, you
just don't because it doesn't make sense. But it is not totally
unreasonable to have lots of partitions, and as we improve the system,
more and more people will want to.
Aside: while messing with this I noticed that how significant pg_strtok
is as a resource hog when building partition descs (from the
stringToNode that's applied to each partition's partbound.)
--
�lvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachments:
v2-0001-build-partdesc-memcxt.patchtext/x-diff; charset=us-asciiDownload+37-32
On Wed, Nov 13, 2019 at 4:46 PM Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
But it is not totally
unreasonable to have lots of partitions, and as we improve the system,
more and more people will want to.
Yep.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 11/13/19 4:45 PM, Alvaro Herrera wrote:
But it is not totally
unreasonable to have lots of partitions, and as we improve the system,
more and more people will want to.
+1
This patch still applies but there seems to be some disagreement on how
to proceed.
Any thoughts?
Regards,
--
-David
david@pgmasters.net
On 2020-Mar-24, David Steele wrote:
This patch still applies but there seems to be some disagreement on
how to proceed.
Actually, I don't think there's any disagreement regarding the patch I
last posted. (There was disagreement on the previous patches, which
were very different). Tom suggested to look at the heuristics used for
RECOVER_RELATION_BUILD_MEMORY, and the patch does exactly that. It
would be great if Kato Sho can try the original test case with my latest
patch (the one in /messages/by-id/20191113214544.GA16060@alvherre.pgsql )
and let us know if it improves things.
The patch as posted generates these warnings in my current GCC that it
didn't when I checked last, but they're harmless -- if/when I push,
it'll be without the parens.
/pgsql/source/master/src/backend/utils/cache/relcache.c:1064:21: warning: equality comparison with extraneous parentheses [-Wparentheses-equality]
if ((relp->relkind == RELKIND_PARTITIONED_TABLE)
~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1064:21: note: remove extraneous parentheses around the comparison to silence this warning
if ((relp->relkind == RELKIND_PARTITIONED_TABLE)
~ ^ ~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1064:21: note: use '=' to turn this equality comparison into an assignment
if ((relp->relkind == RELKIND_PARTITIONED_TABLE)
^~
=
/pgsql/source/master/src/backend/utils/cache/relcache.c:1242:33: warning: equality comparison with extraneous parentheses [-Wparentheses-equality]
if ((relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1242:33: note: remove extraneous parentheses around the comparison to silence this warning
if ((relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
~ ^ ~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1242:33: note: use '=' to turn this equality comparison into an assignment
if ((relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
^~
=
2 warnings generated.
--
�lvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On 24 Mar 2020, at 16:26, Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
It
would be great if Kato Sho can try the original test case with my latest
patch (the one in /messages/by-id/20191113214544.GA16060@alvherre.pgsql )
and let us know if it improves things.
Hi!,
Are you able to test Alvaros latest patch to see if that solves the originally
reported problem, so that we can reach closure on this item during the
commitfest?
cheers ./daniel
On Tuesday, July 14, 2020 11:29 PM, Daniel Fustafsson wrote:
Are you able to test Alvaros latest patch to see if that solves the originally reported problem, so that we can reach >closure on this item during the commitfest?
Sorry for the too late replay. I missed this mail.
And, thanks for writing patches. I start test now.
I'll report the result before the end of August .
Regards,
Sho kato