CREATE TABLE orders ( order_id bigint NOT NULL, tracking_code bigint, region int NOT NULL ); CREATE TABLE shipments ( shipment_id bigint NOT NULL, tracking_code bigint NOT NULL ); -- Skewed tracking_code distribution: -- 10% of rows share a single "hot" tracking_code (value 1), -- the rest get unique codes. -- This ensures mcv_freq > avgfreq, triggering the skew adjustment -- even in the fixed version of estimate_hash_bucket_stats. INSERT INTO orders SELECT g, CASE WHEN g > 150000 THEN NULL WHEN g <= 15000 THEN 1 ELSE g END, (g % 1000) + 1 FROM generate_series(1, 200000) g; INSERT INTO shipments SELECT g, CASE WHEN g <= 15000 THEN 1 ELSE g END FROM generate_series(1, 150000) g; CREATE INDEX ON orders (region); ANALYZE orders; ANALYZE shipments; -- Ground truth of the skew_ratio for orders WHERE region = 42: WITH base AS ( SELECT count(*)::numeric AS total, count(tracking_code)::numeric AS nonnull, count(DISTINCT tracking_code) AS ndistinct FROM orders WHERE region = 42 ), mcv AS ( SELECT count(*)::numeric AS mcv_count FROM orders WHERE region = 42 AND tracking_code IS NOT NULL GROUP BY tracking_code ORDER BY count(*) DESC LIMIT 1 ) SELECT round(mcv_count / total, 6) AS mcv_freq, round(nonnull / total / ndistinct, 6) AS avgfreq, round((mcv_count / total) / (nonnull / total / ndistinct), 6) AS skew_ratio FROM base, mcv; -- Ground truth: mcv_freq | avgfreq | skew_ratio ----------+----------+------------ 0.075000 | 0.005515 | 13.600000 (1 row) -- Query 1: join on unique columns (histogram only, no MCVs) -- Demonstrates the rows-vs-tuples mcv_freq bug. EXPLAIN ANALYZE SELECT * FROM orders o JOIN shipments s ON s.shipment_id = o.order_id WHERE o.region = 42; -- Below is the diff of the elog debugging added, -- showing the effect of the mcv_freq fix for Query 1: -- -- -mcv_freq=6.66667e-06 [from histogram fallback, 1/rows=150000] -- +mcv_freq=6.66667e-06 [from histogram fallback, 1/tuples=150000] -- ndistinct_raw=150000, avgfreq=6.66667e-06, stanullfrac=0 -- ndistinct adjusted 150000 -> 150000 (rows=150000, tuples=150000) -- estfract_base=6.66667e-06 (nbuckets=524288, ndistinct=150000) -- no skew adjustment (avgfreq=6.66667e-06, mcv_freq=6.66667e-06) -- final bucketsize_frac=6.66667e-06 -- -mcv_freq=0.00502513 [from histogram fallback, 1/rows=199] -- +mcv_freq=5e-06 [from histogram fallback, 1/tuples=200000] -- ndistinct_raw=200000, avgfreq=5e-06, stanullfrac=0 -- ndistinct adjusted 200000 -> 199 (rows=199, tuples=200000) -- estfract_base=0.00502513 (nbuckets=1024, ndistinct=199) -- -skew adjustment: mcv_freq=0.00502513 / avgfreq=5e-06 = skew_ratio=1005.03, estfract 0.00502513 -> 5.05038 -- -final bucketsize_frac=1 -- -- As we can see, this fix causes the skew adjustment to be skipped over, -- which fixes the plan to correctly hash the smaller filtered orders -- table, below is the diff of the query plan: -- QUERY PLAN -- ---------------------------------------------------------------------------------------------------------------------------------------- -- - Hash Join (cost=4924.84..6190.97 rows=149 width=36) (actual time=8.833..10.412 rows=150.00 loops=1) -- - Hash Cond: (o.order_id = s.shipment_id) -- - Buffers: shared hit=1014, temp read=330 written=330 -- - -> Bitmap Heap Scan on orders o (cost=5.84..532.73 rows=199 width=20) (actual time=0.018..0.059 rows=200.00 loops=1) -- +--------------------------------------------------------------------------------------------------------------------------------------------- -- + Hash Join (cost=535.22..3410.21 rows=149 width=36) (actual time=0.061..6.939 rows=150.00 loops=1) -- + Hash Cond: (s.shipment_id = o.order_id) -- + Buffers: shared hit=1014 -- + -> Seq Scan on shipments s (cost=0.00..2311.00 rows=150000 width=16) (actual time=0.002..3.049 rows=150000.00 loops=1) -- + Buffers: shared hit=811 -- + -> Hash (cost=532.73..532.73 rows=199 width=20) (actual time=0.056..0.056 rows=200.00 loops=1) -- + Buckets: 1024 Batches: 1 Memory Usage: 18kB -- + Buffers: shared hit=203 -- + -> Bitmap Heap Scan on orders o (cost=5.84..532.73 rows=199 width=20) (actual time=0.013..0.048 rows=200.00 loops=1) -- Recheck Cond: (region = 42) -- Heap Blocks: exact=200 -- Buffers: shared hit=203 -- - -> Bitmap Index Scan on orders_region_idx (cost=0.00..5.79 rows=199 width=0) (actual time=0.007..0.007 rows=200.00 loops=1) -- + -> Bitmap Index Scan on orders_region_idx (cost=0.00..5.79 rows=199 width=0) (actual time=0.005..0.005 rows=200.00 loops=1) -- Index Cond: (region = 42) -- Index Searches: 1 -- Buffers: shared hit=3 -- - -> Hash (cost=2311.00..2311.00 rows=150000 width=16) (actual time=8.779..8.779 rows=150000.00 loops=1) -- - Buckets: 262144 Batches: 2 Memory Usage: 5572kB -- - Buffers: shared hit=811, temp written=328 -- - -> Seq Scan on shipments s (cost=0.00..2311.00 rows=150000 width=16) (actual time=0.002..2.871 rows=150000.00 loops=1) -- - Buffers: shared hit=811 -- - Planning Time: 0.054 ms -- - Execution Time: 10.423 ms -- + Planning Time: 0.048 ms -- + Execution Time: 6.948 ms -- (18 rows) -- Query 2: join on skewed column (has MCVs) -- Demonstrates the avgfreq ordering bug. EXPLAIN ANALYZE SELECT * FROM orders o JOIN shipments s ON s.tracking_code = o.tracking_code WHERE o.region = 42; -- Below is the diff of the elog debugging added, -- showing the effect of the avgfreq fix for Query 2: ```diff -mcv_freq=0.100667 [from MCV stats] +mcv_freq=0.0991333 [from MCV stats] -ndistinct_raw=96178, avgfreq=1.03974e-05, stanullfrac=0 +ndistinct_raw=96765, stanullfrac=0 -ndistinct adjusted 96178 -> 96178 (rows=150000, tuples=150000) +ndistinct adjusted 96765 -> 96765 (rows=150000, tuples=150000) +avgfreq=1.03343e-05 (stanullfrac=0, ndistinct=96765) -estfract_base=1.03974e-05 (nbuckets=524288, ndistinct=96178) +estfract_base=1.03343e-05 (nbuckets=524288, ndistinct=96765) -skew adjustment: mcv_freq=0.100667 / avgfreq=1.03974e-05 = skew_ratio=9681.92, estfract 1.03974e-05 -> 0.100667 +skew adjustment: mcv_freq=0.0991333 / avgfreq=1.03343e-05 = skew_ratio=9592.64, estfract 1.03343e-05 -> 0.0991333 -final bucketsize_frac=0.100667 +final bucketsize_frac=0.0991333 -mcv_freq=0.0743333 [from MCV stats] +mcv_freq=0.0742667 [from MCV stats] -ndistinct_raw=86317, avgfreq=8.67384e-06, stanullfrac=0.2513 +ndistinct_raw=86337, stanullfrac=0.2514 -ndistinct adjusted 86317 -> 86 (rows=199, tuples=200000) +ndistinct adjusted 86337 -> 86 (rows=199, tuples=200000) +avgfreq=0.00870465 (stanullfrac=0.2514, ndistinct=86) estfract_base=0.0116279 (nbuckets=1024, ndistinct=86) -skew adjustment: mcv_freq=0.0743333 / avgfreq=8.67384e-06 = skew_ratio=8569.83, estfract 0.0116279 -> 99.6492 +skew adjustment: mcv_freq=0.0742667 / avgfreq=0.00870465 = skew_ratio=8.53184, estfract 0.0116279 -> 0.0992074 -final bucketsize_frac=1 +final bucketsize_frac=0.0992074 ``` -- As we can see, the skew ratio is now 8.53184 much closer to the ground truth 13.6, -- which causes the final bucketsize_frac to be 0.0992074 instead of clamped at 1, -- which fixes the plan to correctly hash the smaller table orders, -- below is the diff of the query plan: -- -- QUERY PLAN -- ---------------------------------------------------------------------------------------------------------------------------------------- -- - Hash Join (cost=4924.84..12180.87 rows=223552 width=36) (actual time=10.055..18.535 rows=225135.00 loops=1) -- - Hash Cond: (o.tracking_code = s.tracking_code) -- - Buffers: shared hit=1014, temp read=297 written=297 -- - -> Bitmap Heap Scan on orders o (cost=5.84..532.73 rows=199 width=20) (actual time=0.011..0.057 rows=200.00 loops=1) -- +--------------------------------------------------------------------------------------------------------------------------------------------- -- + Hash Join (cost=535.22..9170.74 rows=219952 width=36) (actual time=0.054..14.671 rows=225135.00 loops=1) -- + Hash Cond: (s.tracking_code = o.tracking_code) -- + Buffers: shared hit=1014 -- + -> Seq Scan on shipments s (cost=0.00..2311.00 rows=150000 width=16) (actual time=0.002..3.050 rows=150000.00 loops=1) -- + Buffers: shared hit=811 -- + -> Hash (cost=532.73..532.73 rows=199 width=20) (actual time=0.051..0.051 rows=150.00 loops=1) -- + Buckets: 1024 Batches: 1 Memory Usage: 16kB -- + Buffers: shared hit=203 -- + -> Bitmap Heap Scan on orders o (cost=5.84..532.73 rows=199 width=20) (actual time=0.012..0.043 rows=200.00 loops=1) -- Recheck Cond: (region = 42) -- Heap Blocks: exact=200 -- Buffers: shared hit=203 -- -> Bitmap Index Scan on orders_region_idx (cost=0.00..5.79 rows=199 width=0) (actual time=0.004..0.004 rows=200.00 loops=1) -- Index Cond: (region = 42) -- Index Searches: 1 -- Buffers: shared hit=3 -- - -> Hash (cost=2311.00..2311.00 rows=150000 width=16) (actual time=10.012..10.012 rows=150000.00 loops=1) -- - Buckets: 262144 Batches: 2 Memory Usage: 5927kB -- - Buffers: shared hit=811, temp written=295 -- - -> Seq Scan on shipments s (cost=0.00..2311.00 rows=150000 width=16) (actual time=0.003..3.384 rows=150000.00 loops=1) -- - Buffers: shared hit=811 -- Planning: -- Buffers: shared hit=26 -- - Planning Time: 0.074 ms -- - Execution Time: 21.582 ms -- + Planning Time: 0.077 ms -- + Execution Time: 17.728 ms -- (20 rows)