-- Script to test btree bloating and compression BEGIN; CREATE TABLE test_btree ( id serial PRIMARY KEY, ts timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP, txt text ); -- The primary key index is created before the data is added... -- Create this index before the data gets added... -- For inserts that increase monatomicly this should result -- in btree pages ~70% full. CREATE INDEX test_btree_ts_idx ON test_btree(ts); -- generate_series was added in pg8.x it returns a set of integers, -- use it to generate a suitable number or rows in increasing -- ts ending 'now'. -- If you don't have generate_series() you'll need to replace this -- with another method of inserting the required number or rows. \echo 'Inserting 8*32*1024 rows...' INSERT INTO test_btree (ts) SELECT date_trunc('seconds',current_timestamp) - cast(generate_series||' seconds' as interval) FROM generate_series (8*32*1024,1,-1); -- Create these indicies after the data is added... -- This is similar what would be done by a REINDEX and -- should result in btree pages ~90% full. CREATE INDEX test_btree_id_idx2 ON test_btree(id); CREATE INDEX test_btree_ts_idx2 ON test_btree(ts); COMMIT; -- Now we remove some fraction of 'old' rows simulating an expiration -- policy where we only need to keep some shrinking fraction of rows -- as they get older. -- The specific values used here are bogus but illustrate the point. \echo 'Leave 1/2 of the rows < 6*32*1024' DELETE FROM test_btree WHERE id&1 != 0 AND id < 6*32*1024; \echo 'Leave 1/4 of the rows < 5*32*1024' DELETE FROM test_btree WHERE id&3 != 0 AND id < 5*32*1024; \echo 'Leave 1/8 of the rows < 4*32*1024' DELETE FROM test_btree WHERE id&7 != 0 AND id < 4*32*1024; \echo 'Leave 1/16 of the rows < 3*32*1024' DELETE FROM test_btree WHERE id&15 != 0 AND id < 3*32*1024; \echo 'Leave 1/32 of the rows < 2*32*1024' DELETE FROM test_btree WHERE id&31 != 0 AND id < 2*32*1024; -- VACUUM FULL isn't required here. In a real application if a normal -- VACUUM were to occur after each expiration run and expiration where -- to happen reqularly during the inserts then the table would end up -- approximatly the same size (although with the rows in different -- locations). \echo 'Doing VACUUM FULL ...' VACUUM FULL test_btree; -- This is similar what would be done by a REINDEX and -- should result in btree pages ~90% full. CREATE INDEX test_btree_id_idx3 ON test_btree(id); CREATE INDEX test_btree_ts_idx3 ON test_btree(ts); \echo 'Now compare the on disk sizes of the various test_btree indices...' -- The pre-insert indices should end up ~4.6 times as large as the last. -- The post-insert indices should end up ~3.3 times as large as the last. SELECT pg_class.relname, pg_class.relpages FROM pg_class WHERE pg_class.relname ~ '^test_btree' ORDER BY pg_class.relpages DESC; -- At this point various in-place compression strategies can be tried -- on the indices to see how close to the size of *_idx3 they can get -- without locking the whole table for the duration of a reindex. -- On PostgreSQL 8.0.1 on FreeBSD with a pagesize of 8k this is what -- I get: -- (relsize is just relpages*8KB, avg_page_cap is the average page -- capacity as a percentage and was arrived at by using pg_filedump) -- -- relname | relpages | relsize | avg_page_cap -- --------------------+----------+--------------+-------------- -- test_btree_ts_idx | 1252 | 9.781 MB | 20.11% -- test_btree_pkey | 1001 | 7.820 MB | 20.03% -- test_btree_ts_idx2 | 901 | 7.039 MB | 27.59% -- test_btree_id_idx2 | 720 | 5.625 MB | 27.53% -- test_btree | 579 | 4.523 MB | -- test_btree_ts_idx3 | 271 | 2.117 MB | 90.04% -- test_btree_id_idx3 | 217 | 1.695 MB | 90.09% -- Running btree_index_compress('xx'::regclass::oid, 0.9, 0, 2000) -- (i.e. with a target capacity <= 90%) -- on each of the indices gives: -- (free is the number of currently reusable pages reported by vacuum, -- used is relpages-free, non_empty_avg is the average capacity of -- just the non-empty pages) -- -- relname | relpages | free | used | non_empty_avg -- --------------------+----------+------+------+--------------- -- test_btree_ts_idx | 1252 | 886 | 366 | 66.68% -- test_btree_pkey | 1001 | 709 | 292 | 67.03% -- test_btree_ts_idx2 | 900 | 319 | 581 | 52.32% -- test_btree_id_idx2 | 719 | 432 | 287 | 49.07% -- test_btree | 579 | | | -- test_btree_ts_idx3 | 271 | 0 | 271 | -- test_btree_id_idx3 | 217 | 0 | 217 | -- -- In test_btree_id_idx2 (as an example) there are 347 pages at between -- 43% and 48% but btree_index_compress() doesn't try and merge these as -- they are not adjacent to pages they can merge with and stay below the -- 90% target. -- Used page count in the resulting indices are ~1.35 times as large as -- with a REINDEXed index. -- Recreating the table and indices from scratch and then running -- btree_index_compress('xx'::regclass::oid, 1, 0 2000) on each gives: -- (i.e. with a target capacity <= 100%) -- (full is the number of pages > 95% full, e.g. likely to be split, bad) -- -- relname | relpages | free | used | full | non_empty_avg -- --------------------+----------+------+------+------+--------------- -- test_btree_ts_idx | 1251 | 912 | 339 | 80 | 71.93% -- test_btree_pkey | 1001 | 730 | 271 | 63 | 72.20% -- test_btree_ts_idx2 | 901 | 627 | 274 | 12 | 88.83% -- test_btree_id_idx2 | 720 | 501 | 219 | 11 | 89.31% -- test_btree | 579 | | | | -- test_btree_ts_idx3 | 271 | 0 | 271 | | -- test_btree_id_idx3 | 217 | 0 | 217 | | -- -- Used page count in the resulting indices are ~1.24 times as large as -- with a REINDEXed index and the numver of nearly-full pages isn't too bad. -- Although the above are small tables where the time the REINDEX holds -- an exclusive lock is small, and the btree_index_compress() call does -- the whole table in a single scan... It's assumed that these numbers -- would hold for larger tables where the btree_index_compress() could -- be run repeated doing say 1000 pages a run and thus only holding the -- exclusive lock briedly unlike a REINDEX.