diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 5fd7f1e..f977db8 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -87,7 +87,8 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, bool allow_pagemode, bool is_bitmapscan, bool is_samplescan, - bool temp_snap); + bool temp_snap, + bool skip_all_visible); static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan); static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options); @@ -1033,6 +1034,45 @@ heapgettup_pagemode(HeapScanDesc scan, return; } + /* + * Skip all-visible pages, if we are told. + * + * We skip pages only if we can skip a bunch of them. Otherwise OS's + * prefetch for sequential scan will do a much better job. This is + * similar to what we do in vacuumlazy.c + */ + if (scan->rs_skip_all_visible && scan->rs_next_unskippable_block < page) + { + scan->rs_next_unskippable_block = page; /* restart the visibility-map probe at the current page */ + + Assert(!backward); + Assert(scan->rs_startblock == 0); + Assert(scan->rs_nblocks == scan->rs_numblocks); + + while (scan->rs_next_unskippable_block < scan->rs_nblocks) + { + scan->rs_all_visible_checked++; + if (VM_ALL_VISIBLE(scan->rs_rd, scan->rs_next_unskippable_block, &scan->rs_vmbuf)) + scan->rs_next_unskippable_block++; + else + break; + } +#define SKIP_PAGES_THRESHOLD 32 + if (scan->rs_next_unskippable_block - page > SKIP_PAGES_THRESHOLD) + { + scan->rs_skipped_total += (scan->rs_next_unskippable_block - page); + scan->rs_skipped_times++; + scan->rs_numblocks -= (scan->rs_next_unskippable_block - page); /* must run before page is advanced, otherwise the difference is always zero */ + page = scan->rs_next_unskippable_block; + } + + /* + * If we reached the end, just return. 
+ */ + if (page >= scan->rs_nblocks) + return; /* NOTE(review): this exit does not visibly reset scan state (buffer, tuple->t_data, rs_inited); confirm callers still observe end-of-scan */ + } + scan->rs_scanned_total++; /* statistics: one more page actually fetched */ heapgetpage(scan, page); dp = BufferGetPage(scan->rs_cbuf); @@ -1394,7 +1434,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key) { return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL, - true, true, true, false, false, false); + true, true, true, false, false, false, false); } HeapScanDesc @@ -1404,7 +1444,7 @@ heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key) Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL, - true, true, true, false, false, true); + true, true, true, false, false, true, false); } HeapScanDesc @@ -1414,7 +1454,7 @@ heap_beginscan_strat(Relation relation, Snapshot snapshot, { return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL, allow_strat, allow_sync, true, - false, false, false); + false, false, false, false); } HeapScanDesc @@ -1422,7 +1462,8 @@ heap_beginscan_bm(Relation relation, Snapshot snapshot, int nkeys, ScanKey key) { return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL, - false, false, true, true, false, false); + false, false, true, true, false, false, + false); /* skip_all_visible: never for bitmap scans */ } HeapScanDesc @@ -1432,7 +1473,16 @@ heap_beginscan_sampling(Relation relation, Snapshot snapshot, { return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL, allow_strat, allow_sync, allow_pagemode, - false, true, false); + false, true, false, false); +} + +HeapScanDesc +heap_beginscan_skip_all_visible(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key) +{ + return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL, + true, false, true, /* allow_strat, allow_sync (off: heap_beginscan_internal asserts it), allow_pagemode */ + false, false, false, true); /* is_bitmapscan, is_samplescan, temp_snap, skip_all_visible */ } static HeapScanDesc @@ -1444,11 +1494,18 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot, bool allow_pagemode, bool is_bitmapscan, bool is_samplescan, - bool temp_snap) + bool temp_snap, + bool skip_all_visible) { 
HeapScanDesc scan; /* + * No support for parallel or synchronous scans when skip_all_visible is + * true. + */ + Assert(!skip_all_visible || (parallel_scan == NULL && !allow_sync)); + + /* * increment relation ref count while scanning relation * * This is just to make really sure the relcache entry won't go away while @@ -1472,6 +1529,12 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot, scan->rs_allow_sync = allow_sync; scan->rs_temp_snap = temp_snap; scan->rs_parallel = parallel_scan; + scan->rs_skip_all_visible = skip_all_visible; + scan->rs_next_unskippable_block = InvalidBlockNumber; /* NOTE(review): heapgettup_pagemode only probes the visibility map when this is < page; if InvalidBlockNumber is the largest BlockNumber that test never fires -- confirm the intended start value */ + scan->rs_all_visible_checked = 0; /* plain counter of visibility-map lookups, so it starts at zero */ + scan->rs_skipped_total = scan->rs_scanned_total = 0; + scan->rs_skipped_times = 0; + scan->rs_vmbuf = InvalidBuffer; /* * we can use page-at-a-time mode if it's an MVCC-safe snapshot @@ -1587,6 +1650,10 @@ heap_endscan(HeapScanDesc scan) if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); + /* unpin visibility map buffer too */ + if (BufferIsValid(scan->rs_vmbuf)) + ReleaseBuffer(scan->rs_vmbuf); + /* * decrement relation reference count and free scan descriptor storage */ @@ -1601,6 +1668,13 @@ heap_endscan(HeapScanDesc scan) if (scan->rs_temp_snap) UnregisterSnapshot(scan->rs_snapshot); + if (scan->rs_skip_all_visible) + { + elog(LOG, "heap_endscan: scanned (%.0f), checked (%u), skipped (%.0f), avg skipped (%.0f)", + scan->rs_scanned_total, scan->rs_all_visible_checked, + scan->rs_skipped_total, + (scan->rs_skipped_times > 0) ? scan->rs_skipped_total / scan->rs_skipped_times : 0.0); /* report 0 instead of dividing by zero when no run was skipped */ + } pfree(scan); } @@ -1658,7 +1732,7 @@ heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan) RegisterSnapshot(snapshot); return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan, - true, true, true, false, false, true); + true, true, true, false, false, true, false); } /* ---------------- diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 815a694..35c378d 100644 --- 
a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2806,6 +2806,9 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot) Oid save_userid; int save_sec_context; int save_nestlevel; + PGRUsage ru0; /* initialized on entry; passed to pg_rusage_show() in the DEBUG2 summary */ + + pg_rusage_init(&ru0); /* Open and lock the parent heap relation */ heapRelation = heap_open(heapId, ShareUpdateExclusiveLock); @@ -2873,8 +2876,9 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot) tuplesort_end(state.tuplesort); elog(DEBUG2, - "validate_index found %.0f heap tuples, %.0f index tuples; inserted %.0f missing tuples", - state.htups, state.itups, state.tups_inserted); + "validate_index found %.0f heap tuples, %.0f index tuples; inserted %.0f missing tuples: %s", + state.htups, state.itups, state.tups_inserted, + pg_rusage_show(&ru0)); /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -2994,16 +2998,26 @@ validate_index_heapscan(Relation heapRelation, /* * Prepare for scan of the base relation. We need just those tuples - * satisfying the passed-in reference snapshot. We must disable syncscan - * here, because it's critical that we read from block zero forward to - * match the sorted TIDs. + * satisfying the passed-in reference snapshot. But we are only interested + * in tuples which were either inserted or updated after we took reference + * snapshot for the initial build. Such tuples can only exist in pages + * which are not marked as "all-visible". So we can easily skip + * "all-visible" pages. This can be a huge win for very large tables, which + * do not fit in shared buffers or OS cache and are less frequently + * updated. + * + * Note that since VACUUM conflicts with CIC on lock, there is no way new + * pages will be marked "all-visible" after CIC started. We don't care if + * pages are cleared with "all-visible" flag after we scanned them during + * the upcoming heap scan. 
+ * + * We must disable syncscan here, because it's critical that we read from + * block zero forward to match the sorted TIDs. */ - scan = heap_beginscan_strat(heapRelation, /* relation */ + scan = heap_beginscan_skip_all_visible(heapRelation, /* relation */ snapshot, /* snapshot */ 0, /* number of keys */ - NULL, /* scan key */ - true, /* buffer access strategy OK */ - false); /* syncscan not OK */ + NULL); /* scan key */ /* * Scan all tuples matching the snapshot. diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index a864f78..aa10cd9 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -118,6 +118,8 @@ extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot, extern HeapScanDesc heap_beginscan_sampling(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, bool allow_strat, bool allow_sync, bool allow_pagemode); +extern HeapScanDesc heap_beginscan_skip_all_visible(Relation relation, + Snapshot snapshot, int nkeys, ScanKey key); extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber endBlk); extern void heapgetpage(HeapScanDesc scan, BlockNumber page); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index ce3ca8d..e7b073c 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -54,6 +54,13 @@ typedef struct HeapScanDescData bool rs_allow_strat; /* allow or disallow use of access strategy */ bool rs_allow_sync; /* allow or disallow use of syncscan */ bool rs_temp_snap; /* unregister snapshot at scan end? */ + bool rs_skip_all_visible; /* skip all-visible pages? */ + double rs_skipped_total; /* statistics: total pages skipped */ + int rs_skipped_times; /* statistics: number of skip runs taken */ + double rs_scanned_total; /* statistics: pages fetched after the skip check */ + BlockNumber rs_next_unskippable_block; /* where the last visibility-map probe stopped */ + BlockNumber rs_all_visible_checked; /* statistics: visibility-map lookups made */ + Buffer rs_vmbuf; /* visibility map buffer */ /* state set up at initscan time */ BlockNumber rs_nblocks; /* total number of blocks in rel */