diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 0f2f2bf..e0251b7 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1592,6 +1592,35 @@ include 'filename'
Asynchronous Behavior
+
+ prune_page_dirty_limit (integer)
+
+ prune_page_dirty_limit configuration parameter
+
+
+
+ Sets the limit on the number of pages dirtied by page pruning during
+ foreground processing for plain SELECT queries. Maintenance operations
+ and write operations are not limited by this parameter, nor are
+ SELECT queries that require writes to occur, such as row locking or
+ writable common table expressions.
+
+
+ This allows larger SELECT queries and pg_dump to avoid generating
+ significant write I/O as they execute, aiding query performance during
+ mixed workload processing. The limit is reset for each new statement.
+
+
+ A setting of -1 removes the limit, which was the behavior prior to 9.4.
+
+
+ Regardless of this limit, no pruning is performed on any table where it
+ has been disabled with the allow_buffer_cleanup storage parameter.
+
+
+
+
+
effective_io_concurrency (integer)
diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml
index a422edd..61915ff 100644
--- a/doc/src/sgml/ref/create_table.sgml
+++ b/doc/src/sgml/ref/create_table.sgml
@@ -895,6 +895,20 @@ CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXI
+ allow_buffer_cleanup (boolean)
+
+
+ Determines whether user SQL will perform cleanup of shared buffers
+ for this table. Does not prevent setting of tuple hint bits.
+ See also prune_page_dirty_limit.
+ Disabling this prevents HOT from working effectively, which in most
+ cases degrades the performance of writes to this table.
+ This parameter defaults to true and cannot be set for TOAST tables.
+
+
+
+
+
autovacuum_enabled>, toast.autovacuum_enabled (boolean>)
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index fa08c45..d466819 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -85,6 +85,14 @@ static relopt_bool boolRelOpts[] =
},
false
},
+ {
+ {
+ "allow_buffer_cleanup",
+ "Allow cleanup of shared buffers by foreground processes, allowing later cleanup by VACUUM",
+ RELOPT_KIND_HEAP
+ },
+ true
+ },
/* list terminator */
{{NULL}}
};
@@ -1175,7 +1183,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
{"check_option", RELOPT_TYPE_STRING,
offsetof(StdRdOptions, check_option_offset)},
{"user_catalog_table", RELOPT_TYPE_BOOL,
- offsetof(StdRdOptions, user_catalog_table)}
+ offsetof(StdRdOptions, user_catalog_table)},
+ {"allow_buffer_cleanup", RELOPT_TYPE_BOOL,
+ offsetof(StdRdOptions, allow_prune)}
};
options = parseRelOptions(reloptions, validate, kind, &numoptions);
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index a771ccb..5ddd07d 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -273,6 +273,7 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
scan->rs_startblock = 0;
}
+ scan->rs_allow_prune = RelationAllowPrune(scan->rs_rd);
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL;
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
@@ -348,7 +349,10 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
* Prune and repair fragmentation for the whole page, if possible.
*/
Assert(TransactionIdIsValid(RecentGlobalXmin));
- heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+ if (scan->rs_allow_prune &&
+ (PrunePageDirtyLimit == -1 ||
+ PrunePageDirtyLimit > PrunePageDirty))
+ heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
/*
* We must hold share lock on the buffer content while examining tuple
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 1aba2f0..6b1f04e 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -67,7 +67,9 @@
#include "access/relscan.h"
#include "access/transam.h"
+#include "access/xact.h"
#include "catalog/index.h"
+#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
@@ -249,6 +251,7 @@ index_beginscan(Relation heapRelation,
* up by RelationGetIndexScan.
*/
scan->heapRelation = heapRelation;
+ scan->xs_allow_prune = RelationAllowPrune(heapRelation);
scan->xs_snapshot = snapshot;
return scan;
@@ -275,6 +278,12 @@ index_beginscan_bitmap(Relation indexRelation,
*/
scan->xs_snapshot = snapshot;
+ /*
+ * Bitmap index scans don't touch the heap, so this is effectively
+ * ignored. We set pruning correctly when we do the bitmap heap scan.
+ */
+ scan->xs_allow_prune = true;
+
return scan;
}
@@ -519,7 +528,10 @@ index_fetch_heap(IndexScanDesc scan)
/*
* Prune page, but only if we weren't already on this page
*/
- if (prev_buf != scan->xs_cbuf)
+ if (prev_buf != scan->xs_cbuf &&
+ scan->xs_allow_prune &&
+ (PrunePageDirtyLimit == -1 ||
+ PrunePageDirtyLimit > PrunePageDirty))
heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
RecentGlobalXmin);
}
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 3455a0b..993cb70 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -219,6 +219,8 @@ vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast,
VacuumPageHit = 0;
VacuumPageMiss = 0;
VacuumPageDirty = 0;
+ PrunePageDirty = 0;
+ PrunePageDirtyLimit = -1; /* VACUUM forces pruning always */
/*
* Loop to process each selected relation.
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index e0000e6..f6424af 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -320,6 +320,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
TimestampDifferenceExceeds(starttime, endtime,
Log_autovacuum_min_duration))
{
+ int NumPageDirty = PrunePageDirty + VacuumPageDirty;
+
TimestampDifference(starttime, endtime, &secs, &usecs);
read_rate = 0;
@@ -328,7 +330,7 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
{
read_rate = (double) BLCKSZ *VacuumPageMiss / (1024 * 1024) /
(secs + usecs / 1000000.0);
- write_rate = (double) BLCKSZ *VacuumPageDirty / (1024 * 1024) /
+ write_rate = (double) BLCKSZ *NumPageDirty / (1024 * 1024) /
(secs + usecs / 1000000.0);
}
ereport(LOG,
@@ -348,7 +350,7 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
vacrelstats->new_rel_tuples,
VacuumPageHit,
VacuumPageMiss,
- VacuumPageDirty,
+ NumPageDirty,
read_rate, write_rate,
pg_rusage_show(&ru0))));
}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 6b5f198..4075f4c 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -158,6 +158,11 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData));
/*
+ * Initialise the page pruning limit to "prune all pages"
+ */
+ PrunePageDirtyLimit = -1;
+
+ /*
* If non-read-only query, set the command ID to mark output tuples with
*/
switch (queryDesc->operation)
@@ -171,6 +176,14 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
if (queryDesc->plannedstmt->rowMarks != NIL ||
queryDesc->plannedstmt->hasModifyingCTE)
estate->es_output_cid = GetCurrentCommandId(true);
+ else
+ {
+ /*
+ * If we aren't marking tuples then we apply the
+ * page pruning policy for this query.
+ */
+ PrunePageDirtyLimit = PrunePageDirtyLimitPolicy;
+ }
/*
* A SELECT without modifying CTEs can't possibly queue triggers,
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 1a8d4e5..5585851 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -39,6 +39,7 @@
#include "access/transam.h"
#include "executor/execdebug.h"
#include "executor/nodeBitmapHeapscan.h"
+#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/predicate.h"
@@ -337,7 +338,10 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
* Prune and repair fragmentation for the whole page, if possible.
*/
Assert(TransactionIdIsValid(RecentGlobalXmin));
- heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+ if (scan->rs_allow_prune &&
+ (PrunePageDirtyLimit == -1 ||
+ PrunePageDirtyLimit > PrunePageDirty))
+ heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
/*
* We must hold share lock on the buffer content while examining tuple
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index c6bae12..51f07ec 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1009,7 +1009,16 @@ MarkBufferDirty(Buffer buffer)
*/
if (!(bufHdr->flags & BM_DIRTY))
{
- VacuumPageDirty++;
+ /*
+ * If we are tracking pruning in SELECTs then we can only get
+ * here via a heap_page_prune_opt() call that cleans a block,
+ * so in that case, register it as a pruning operation.
+ * Make sure we don't double count during VACUUMs.
+ */
+ if (PrunePageDirtyLimit > -1)
+ PrunePageDirty++;
+ else
+ VacuumPageDirty++;
pgBufferUsage.shared_blks_dirtied++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
@@ -2601,6 +2610,12 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
return;
}
+ /*
+ * Stop if we reached the limit, unless the limit is -1 (no limit).
+ */
+ if (PrunePageDirtyLimit <= PrunePageDirty && PrunePageDirtyLimit > -1)
+ return;
+
bufHdr = &BufferDescriptors[buffer - 1];
Assert(PrivateRefCount[buffer - 1] > 0);
@@ -2703,7 +2718,11 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
if (dirtied)
{
- VacuumPageDirty++;
+ /*
+ * Keep counter accurate even if we aren't limiting pruning
+ */
+ PrunePageDirty++;
+
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c
index f85bd03..1af2a44 100644
--- a/src/backend/tcop/pquery.c
+++ b/src/backend/tcop/pquery.c
@@ -732,6 +732,12 @@ PortalRun(Portal portal, long count, bool isTopLevel,
}
/*
+ * Reset execution counters for each new top-level statement
+ */
+ if (isTopLevel)
+ PrunePageDirty = 0;
+
+ /*
* Check for improper portal use, and mark portal active.
*/
if (portal->status != PORTAL_READY)
@@ -1246,6 +1252,12 @@ PortalRunMulti(Portal portal, bool isTopLevel,
*/
CHECK_FOR_INTERRUPTS();
+ /*
+ * Reset execution counters for each new top-level statement
+ */
+ if (isTopLevel)
+ PrunePageDirty = 0;
+
if (IsA(stmt, PlannedStmt) &&
((PlannedStmt *) stmt)->utilityStmt == NULL)
{
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 63c951e..738a3bb 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -119,6 +119,9 @@ int VacuumCostDelay = 0;
int VacuumPageHit = 0;
int VacuumPageMiss = 0;
int VacuumPageDirty = 0;
+int PrunePageDirty = 0;
+int PrunePageDirtyLimit = 0;
+int PrunePageDirtyLimitPolicy = 0;
int VacuumCostBalance = 0; /* working state for vacuum */
bool VacuumCostActive = false;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 1217098..5a88f07 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1791,6 +1791,16 @@ static struct config_int ConfigureNamesInt[] =
},
{
+ {"prune_page_dirty_limit", PGC_USERSET, RESOURCES_ASYNCHRONOUS,
+ gettext_noop("Limit of foreground page cleaning by read-only statements."),
+ NULL
+ },
+ &PrunePageDirtyLimitPolicy,
+ 4, -1, 10000, /* -1 is equivalent to the pre-9.4 default */
+ NULL, NULL, NULL
+ },
+
+ {
{"vacuum_cost_page_hit", PGC_USERSET, RESOURCES_VACUUM_DELAY,
gettext_noop("Vacuum cost for a page found in the buffer cache."),
NULL
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 8a57698..341326b 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -33,6 +33,7 @@ typedef struct HeapScanDescData
bool rs_allow_strat; /* allow or disallow use of access strategy */
bool rs_allow_sync; /* allow or disallow use of syncscan */
bool rs_temp_snap; /* unregister snapshot at scan end? */
+ bool rs_allow_prune; /* allow pruning during page-at-a-time access */
/* state set up at initscan time */
BlockNumber rs_nblocks; /* number of blocks to scan */
@@ -77,6 +78,7 @@ typedef struct IndexScanDescData
bool ignore_killed_tuples; /* do not return killed entries */
bool xactStartedInRecovery; /* prevents killing/seeing killed
* tuples */
+ bool xs_allow_prune; /* allow pruning during index access */
/* index access method's private state */
void *opaque; /* access-method-specific info */
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index b145a19..4f5605a 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -235,6 +235,9 @@ extern int VacuumCostDelay;
extern int VacuumPageHit;
extern int VacuumPageMiss;
extern int VacuumPageDirty;
+extern int PrunePageDirty;
+extern int PrunePageDirtyLimit;
+extern int PrunePageDirtyLimitPolicy;
extern int VacuumCostBalance;
extern bool VacuumCostActive;
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 9b8a4c9..e0e00a6 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -218,6 +218,7 @@ typedef struct StdRdOptions
bool security_barrier; /* for views */
int check_option_offset; /* for views */
bool user_catalog_table; /* use as an additional catalog relation */
+ bool allow_prune; /* whether user pruning is enabled on relation */
} StdRdOptions;
#define HEAP_MIN_FILLFACTOR 10
@@ -232,6 +233,14 @@ typedef struct StdRdOptions
((StdRdOptions *) (relation)->rd_options)->fillfactor : (defaultff))
/*
+ * RelationAllowPrune
+ * Returns the relation's prune state. Note multiple eval of argument!
+ */
+#define RelationAllowPrune(relation) \
+ ((relation)->rd_options ? \
+ ((StdRdOptions *) (relation)->rd_options)->allow_prune : true)
+
+/*
* RelationGetTargetPageUsage
* Returns the relation's desired space usage per page in bytes.
*/