-- Script to test btree bloating and compression

BEGIN;

CREATE TABLE test_btree (
        id      serial          PRIMARY KEY,
        ts      timestamptz     NOT NULL DEFAULT CURRENT_TIMESTAMP,
        txt     text
);
-- The primary key index is created before the data is added...
-- Create this index before the data gets added...
-- For inserts that increase monatomicly this should result
-- in btree pages ~70% full.
CREATE INDEX test_btree_ts_idx ON test_btree(ts);

-- generate_series was added in pg8.x it returns a set of integers,
-- use it to generate a suitable number or rows in increasing
-- ts ending 'now'.
-- If you don't have generate_series() you'll need to replace this
-- with another method of inserting the required number or rows.
\echo 'Inserting 8*32*1024 rows...'
INSERT INTO test_btree (ts)
    SELECT date_trunc('seconds',current_timestamp)
            - cast(generate_series||' seconds' as interval)
        FROM generate_series (8*32*1024,1,-1);

-- Create these indicies after the data is added...
-- This is similar what would be done by a REINDEX and
-- should result in btree pages ~90% full.
CREATE INDEX test_btree_id_idx2 ON test_btree(id);
CREATE INDEX test_btree_ts_idx2 ON test_btree(ts);

COMMIT;

-- Now we remove some fraction of 'old' rows simulating an expiration
-- policy where we only need to keep some shrinking fraction of rows
-- as they get older.
-- The specific values used here are bogus but illustrate the point.

\echo 'Leave 1/2 of the rows < 6*32*1024'
DELETE FROM test_btree
    WHERE id&1 != 0 AND id < 6*32*1024;

\echo 'Leave 1/4 of the rows < 5*32*1024'
DELETE FROM test_btree
    WHERE id&3 != 0 AND id < 5*32*1024;

\echo 'Leave 1/8 of the rows < 4*32*1024'
DELETE FROM test_btree
    WHERE id&7 != 0 AND id < 4*32*1024;

\echo 'Leave 1/16 of the rows < 3*32*1024'
DELETE FROM test_btree
    WHERE id&15 != 0 AND id < 3*32*1024;

\echo 'Leave 1/32 of the rows < 2*32*1024'
DELETE FROM test_btree
    WHERE id&31 != 0 AND id < 2*32*1024;

-- VACUUM FULL isn't required here.  In a real application if a normal
-- VACUUM were to occur after each expiration run and expiration where
-- to happen reqularly during the inserts then the table would end up
-- approximatly the same size (although with the rows in different
-- locations).
\echo 'Doing VACUUM FULL ...'
VACUUM FULL test_btree;

-- This is similar what would be done by a REINDEX and
-- should result in btree pages ~90% full.
CREATE INDEX test_btree_id_idx3 ON test_btree(id);
CREATE INDEX test_btree_ts_idx3 ON test_btree(ts);

\echo 'Now compare the on disk sizes of the various test_btree indices...'
-- The  pre-insert indices should end up ~4.6 times as large as the last.
-- The post-insert indices should end up ~3.3 times as large as the last.
SELECT pg_class.relname, pg_class.relpages
    FROM pg_class
    WHERE pg_class.relname ~ '^test_btree'
    ORDER BY pg_class.relpages DESC;

-- At this point various in-place compression strategies can be tried
-- on the indices to see how close to the size of *_idx3 they can get
-- without locking the whole table for the duration of a reindex.


-- On PostgreSQL 8.0.1 on FreeBSD with a pagesize of 8k this is what
-- I get:
-- (relsize is just relpages*8KB, avg_page_cap is the average page
--  capacity as a percentage and was arrived at by using pg_filedump)
--
--       relname       | relpages |   relsize    | avg_page_cap
-- --------------------+----------+--------------+--------------
--  test_btree_ts_idx  |     1252 |    9.781 MB  |  20.11%
--  test_btree_pkey    |     1001 |    7.820 MB  |  20.03%
--  test_btree_ts_idx2 |      901 |    7.039 MB  |  27.59%
--  test_btree_id_idx2 |      720 |    5.625 MB  |  27.53%
--  test_btree         |      579 |    4.523 MB  |
--  test_btree_ts_idx3 |      271 |    2.117 MB  |  90.04%
--  test_btree_id_idx3 |      217 |    1.695 MB  |  90.09%

-- Running btree_index_compress('xx'::regclass::oid, 0.9, 0, 2000)
-- (i.e. with a target capacity <= 90%)
-- on each of the indices gives:
-- (free is the number of currently reusable pages reported by vacuum,
--  used is relpages-free, non_empty_avg is the average capacity of
--  just the non-empty pages)
--
--       relname       | relpages | free | used | non_empty_avg
-- --------------------+----------+------+------+---------------
--  test_btree_ts_idx  |     1252 |  886 |  366 |  66.68%
--  test_btree_pkey    |     1001 |  709 |  292 |  67.03%
--  test_btree_ts_idx2 |      900 |  319 |  581 |  52.32%
--  test_btree_id_idx2 |      719 |  432 |  287 |  49.07%
--  test_btree         |      579 |      |      |
--  test_btree_ts_idx3 |      271 |    0 |  271 | 
--  test_btree_id_idx3 |      217 |    0 |  217 |
--
-- In test_btree_id_idx2 (as an example) there are 347 pages at between
-- 43% and 48% but btree_index_compress() doesn't try and merge these as
-- they are not adjacent to pages they can merge with and stay below the
-- 90% target.
-- Used page count in the resulting indices are ~1.35 times as large as
-- with a REINDEXed index.

-- Recreating the table and indices from scratch and then running
-- btree_index_compress('xx'::regclass::oid, 1, 0 2000) on each gives:
-- (i.e. with a target capacity <= 100%)
-- (full is the number of pages > 95% full, e.g. likely to be split, bad)
--
--       relname       | relpages | free | used | full | non_empty_avg
-- --------------------+----------+------+------+------+---------------
--  test_btree_ts_idx  |     1251 |  912 |  339 |   80 |  71.93%
--  test_btree_pkey    |     1001 |  730 |  271 |   63 |  72.20%
--  test_btree_ts_idx2 |      901 |  627 |  274 |   12 |  88.83%
--  test_btree_id_idx2 |      720 |  501 |  219 |   11 |  89.31%
--  test_btree         |      579 |      |      |      |
--  test_btree_ts_idx3 |      271 |    0 |  271 |      |
--  test_btree_id_idx3 |      217 |    0 |  217 |      |
--
-- Used page count in the resulting indices are ~1.24 times as large as
-- with a REINDEXed index and the numver of nearly-full pages isn't too bad.

-- Although the above are small tables where the time the REINDEX holds
-- an exclusive lock is small, and the btree_index_compress() call does
-- the whole table in a single scan...  It's assumed that these numbers
-- would hold for larger tables where the btree_index_compress() could
-- be run repeated doing say 1000 pages a run and thus only holding the
-- exclusive lock briedly unlike a REINDEX.