diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0f2f2bf..e0251b7 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1592,6 +1592,35 @@ include 'filename' Asynchronous Behavior + + prune_page_dirty_limit (integer) + + prune_page_dirty_limit configuration parameter + + + + Sets the limit on the number of pages dirtied by page pruning during + foreground processing for plain SELECT queries. Maintenance operations + and write operations are not limited by this parameter, nor are + SELECT queries that require writes to occur, such as row locking or + writable common table expressions. + + + This allows larger SELECT queries and pg_dump to avoid generating + significant write I/O as they execute, aiding query performance during + mixed workload processing. The limit is reset for each new statement. + + + A setting of -1 disables the limit, which was the behavior prior to 9.4. + + + The limit is overridden and set to zero for any table where pruning is + disabled. + + + + + effective_io_concurrency (integer) diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index a422edd..61915ff 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -895,6 +895,20 @@ CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXI + allow_buffer_cleanup (boolean) + + + Determines whether user SQL will perform cleanup of shared buffers + for this table. Does not prevent setting of tuple hint bits. + See also prune_page_dirty_limit. + Disabling this will prevent HOT from working effectively, which + will in most cases be a performance degradation for writes to this + table. This parameter cannot be set for TOAST tables. 
+ + + + + autovacuum_enabled, toast.autovacuum_enabled (boolean) diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index fa08c45..d466819 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -85,6 +85,14 @@ static relopt_bool boolRelOpts[] = }, false }, + { + { + "allow_buffer_cleanup", + "Allow cleanup of shared buffers by foreground processes, allowing later cleanup by VACUUM", + RELOPT_KIND_HEAP + }, + true + }, /* list terminator */ {{NULL}} }; @@ -1175,7 +1183,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) {"check_option", RELOPT_TYPE_STRING, offsetof(StdRdOptions, check_option_offset)}, {"user_catalog_table", RELOPT_TYPE_BOOL, - offsetof(StdRdOptions, user_catalog_table)} + offsetof(StdRdOptions, user_catalog_table)}, + {"allow_buffer_cleanup", RELOPT_TYPE_BOOL, + offsetof(StdRdOptions, allow_prune)} }; options = parseRelOptions(reloptions, validate, kind, &numoptions); diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index a771ccb..5ddd07d 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -273,6 +273,7 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) scan->rs_startblock = 0; } + scan->rs_allow_prune = RelationAllowPrune(scan->rs_rd); scan->rs_inited = false; scan->rs_ctup.t_data = NULL; ItemPointerSetInvalid(&scan->rs_ctup.t_self); @@ -348,7 +349,10 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) * Prune and repair fragmentation for the whole page, if possible. 
*/ Assert(TransactionIdIsValid(RecentGlobalXmin)); - heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + if (scan->rs_allow_prune && + (PrunePageDirtyLimit == -1 || + PrunePageDirtyLimit > PrunePageDirty)) + heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 1aba2f0..6b1f04e 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -67,7 +67,9 @@ #include "access/relscan.h" #include "access/transam.h" +#include "access/xact.h" #include "catalog/index.h" +#include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -249,6 +251,7 @@ index_beginscan(Relation heapRelation, * up by RelationGetIndexScan. */ scan->heapRelation = heapRelation; + scan->xs_allow_prune = RelationAllowPrune(heapRelation); scan->xs_snapshot = snapshot; return scan; @@ -275,6 +278,12 @@ index_beginscan_bitmap(Relation indexRelation, */ scan->xs_snapshot = snapshot; + /* + * Bitmap index scans don't touch the heap, so this is effectively + * ignored. We set pruning correctly when we do the bitmap heap scan. 
+ */ + scan->xs_allow_prune = true; + return scan; } @@ -519,7 +528,10 @@ index_fetch_heap(IndexScanDesc scan) /* * Prune page, but only if we weren't already on this page */ - if (prev_buf != scan->xs_cbuf) + if (prev_buf != scan->xs_cbuf && + scan->xs_allow_prune && + (PrunePageDirtyLimit == -1 || + PrunePageDirtyLimit > PrunePageDirty)) heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, RecentGlobalXmin); } diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 3455a0b..993cb70 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -219,6 +219,8 @@ vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast, VacuumPageHit = 0; VacuumPageMiss = 0; VacuumPageDirty = 0; + PrunePageDirty = 0; + PrunePageDirtyLimit = -1; /* VACUUM forces pruning always */ /* * Loop to process each selected relation. diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index e0000e6..f6424af 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -320,6 +320,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, TimestampDifferenceExceeds(starttime, endtime, Log_autovacuum_min_duration)) { + int NumPageDirty = PrunePageDirty + VacuumPageDirty; + TimestampDifference(starttime, endtime, &secs, &usecs); read_rate = 0; @@ -328,7 +330,7 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, { read_rate = (double) BLCKSZ *VacuumPageMiss / (1024 * 1024) / (secs + usecs / 1000000.0); - write_rate = (double) BLCKSZ *VacuumPageDirty / (1024 * 1024) / + write_rate = (double) BLCKSZ *NumPageDirty / (1024 * 1024) / (secs + usecs / 1000000.0); } ereport(LOG, @@ -348,7 +350,7 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, vacrelstats->new_rel_tuples, VacuumPageHit, VacuumPageMiss, - VacuumPageDirty, + NumPageDirty, read_rate, write_rate, pg_rusage_show(&ru0)))); } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 6b5f198..4075f4c 100644 
--- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -158,6 +158,11 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData)); /* + * Initialise the page pruning limit to "prune all pages" + */ + PrunePageDirtyLimit = -1; + + /* * If non-read-only query, set the command ID to mark output tuples with */ switch (queryDesc->operation) @@ -171,6 +176,14 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) if (queryDesc->plannedstmt->rowMarks != NIL || queryDesc->plannedstmt->hasModifyingCTE) estate->es_output_cid = GetCurrentCommandId(true); + else + { + /* + * If we aren't marking tuples then we apply the + * page pruning policy for this query. + */ + PrunePageDirtyLimit = PrunePageDirtyLimitPolicy; + } /* * A SELECT without modifying CTEs can't possibly queue triggers, diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 1a8d4e5..5585851 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -39,6 +39,7 @@ #include "access/transam.h" #include "executor/execdebug.h" #include "executor/nodeBitmapHeapscan.h" +#include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/predicate.h" @@ -337,7 +338,10 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) * Prune and repair fragmentation for the whole page, if possible. 
*/ Assert(TransactionIdIsValid(RecentGlobalXmin)); - heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + if (scan->rs_allow_prune && + (PrunePageDirtyLimit == -1 || + PrunePageDirtyLimit > PrunePageDirty)) + heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index c6bae12..51f07ec 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1009,7 +1009,16 @@ MarkBufferDirty(Buffer buffer) */ if (!(bufHdr->flags & BM_DIRTY)) { - VacuumPageDirty++; + /* + * If we are tracking pruning in SELECTs then we can only get + * here by heap_page_prune_opt() call that cleans a block, + * so in that case, register it as a pruning operation. + * Make sure we don't double count during VACUUMs. + */ + if (PrunePageDirtyLimit > -1) + PrunePageDirty++; + else + VacuumPageDirty++; pgBufferUsage.shared_blks_dirtied++; if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; @@ -2601,6 +2610,12 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) return; } + /* + * Stop if we have reached the limit; a limit of -1 means no limit. 
+ */ + if (PrunePageDirtyLimit <= PrunePageDirty && PrunePageDirtyLimit > -1) + return; + bufHdr = &BufferDescriptors[buffer - 1]; Assert(PrivateRefCount[buffer - 1] > 0); @@ -2703,7 +2718,11 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) if (dirtied) { - VacuumPageDirty++; + /* + * Keep counter accurate even if we aren't limiting pruning + */ + PrunePageDirty++; + if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index f85bd03..1af2a44 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -732,6 +732,12 @@ PortalRun(Portal portal, long count, bool isTopLevel, } /* + * Reset execution counters for each new top-level statement + */ + if (isTopLevel) + PrunePageDirty = 0; + + /* * Check for improper portal use, and mark portal active. */ if (portal->status != PORTAL_READY) @@ -1246,6 +1252,12 @@ PortalRunMulti(Portal portal, bool isTopLevel, */ CHECK_FOR_INTERRUPTS(); + /* + * Reset execution counters for each new top-level statement + */ + if (isTopLevel) + PrunePageDirty = 0; + if (IsA(stmt, PlannedStmt) && ((PlannedStmt *) stmt)->utilityStmt == NULL) { diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 63c951e..738a3bb 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -119,6 +119,9 @@ int VacuumCostDelay = 0; int VacuumPageHit = 0; int VacuumPageMiss = 0; int VacuumPageDirty = 0; +int PrunePageDirty = 0; +int PrunePageDirtyLimit = 0; +int PrunePageDirtyLimitPolicy = 0; int VacuumCostBalance = 0; /* working state for vacuum */ bool VacuumCostActive = false; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 1217098..5a88f07 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1791,6 +1791,16 @@ static struct config_int ConfigureNamesInt[] = }, { + {"prune_page_dirty_limit", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + 
gettext_noop("Limit of foreground page cleaning by read-only statements."), + NULL + }, + &PrunePageDirtyLimitPolicy, + 4, -1, 10000, /* -1 is equivalent to the pre-9.4 default */ + NULL, NULL, NULL + }, + + { {"vacuum_cost_page_hit", PGC_USERSET, RESOURCES_VACUUM_DELAY, gettext_noop("Vacuum cost for a page found in the buffer cache."), NULL diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 8a57698..341326b 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -33,6 +33,7 @@ typedef struct HeapScanDescData bool rs_allow_strat; /* allow or disallow use of access strategy */ bool rs_allow_sync; /* allow or disallow use of syncscan */ bool rs_temp_snap; /* unregister snapshot at scan end? */ + bool rs_allow_prune; /* allow pruning during page-at-a-time access */ /* state set up at initscan time */ BlockNumber rs_nblocks; /* number of blocks to scan */ @@ -77,6 +78,7 @@ typedef struct IndexScanDescData bool ignore_killed_tuples; /* do not return killed entries */ bool xactStartedInRecovery; /* prevents killing/seeing killed * tuples */ + bool xs_allow_prune; /* allow pruning during index access */ /* index access method's private state */ void *opaque; /* access-method-specific info */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index b145a19..4f5605a 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -235,6 +235,9 @@ extern int VacuumCostDelay; extern int VacuumPageHit; extern int VacuumPageMiss; extern int VacuumPageDirty; +extern int PrunePageDirty; +extern int PrunePageDirtyLimit; +extern int PrunePageDirtyLimitPolicy; extern int VacuumCostBalance; extern bool VacuumCostActive; diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 9b8a4c9..e0e00a6 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -218,6 +218,7 @@ typedef struct StdRdOptions bool security_barrier; /* for views */ int check_option_offset; /* for views */ bool 
user_catalog_table; /* use as an additional catalog relation */ + bool allow_prune; /* whether user pruning is enabled on relation */ } StdRdOptions; #define HEAP_MIN_FILLFACTOR 10 @@ -232,6 +233,14 @@ typedef struct StdRdOptions ((StdRdOptions *) (relation)->rd_options)->fillfactor : (defaultff)) /* + * RelationAllowPrune + * Returns the relation's allow_prune option (true if no options are set). Note multiple eval of argument! + */ +#define RelationAllowPrune(relation) \ + ((relation)->rd_options ? \ + ((StdRdOptions *) (relation)->rd_options)->allow_prune : true) + +/* * RelationGetTargetPageUsage * Returns the relation's desired space usage per page in bytes. */