From 4c944d4e7f4bbae7bdccd1949074e414a7b56b2e Mon Sep 17 00:00:00 2001
From: Andrew Dunstan <andrew@dunslane.net>
Date: Mon, 30 Mar 2026 08:00:23 -0400
Subject: Scale parallel_tuple_cost by tuple width at Gather nodes

The parallel_tuple_cost GUC applies a flat per-tuple penalty to all
Gather and Gather Merge nodes, regardless of how wide or narrow the
tuples passing through the shared-memory queue actually are.  This
overcharges for narrow tuples (such as partial aggregate results with
a few integer columns) and undercharges for wide tuples.

The physical cost of the tuple queue is dominated by memcpy, which is
proportional to tuple width.  Introduce a width-based scaling factor
so that parallel_tuple_cost represents the cost at a reference width
of 100 bytes, with a 10% fixed floor for irreducible per-tuple queue
synchronization overhead.

For a Gather passing 12-byte partial aggregate tuples, the effective
per-tuple cost drops from 0.1 to ~0.02 (assuming the default
parallel_tuple_cost of 0.1), which lets the planner choose parallel
plans for aggregation-heavy queries.

Tuples at the reference width (100 bytes) cost the same as before.
---
 src/backend/optimizer/path/costsize.c | 20 ++++++++++++--------
 src/backend/optimizer/plan/planner.c  |  4 +++-
 src/include/optimizer/cost.h          | 24 ++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 1c575e56ff6..695cded910a 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -11,7 +11,9 @@
  *	cpu_tuple_cost		Cost of typical CPU time to process a tuple
  *	cpu_index_tuple_cost  Cost of typical CPU time to process an index tuple
  *	cpu_operator_cost	Cost of CPU time to execute an operator or function
- *	parallel_tuple_cost Cost of CPU time to pass a tuple from worker to leader backend
+ *	parallel_tuple_cost Cost of CPU time to pass a tuple from worker to leader
+ *						backend.  Scaled by tuple width relative to a reference
+ *						width (see width_adjusted_parallel_tuple_cost).
  *	parallel_setup_cost Cost of setting up shared memory for parallelism
  *
  * We expect that the kernel will typically do some amount of read-ahead
@@ -446,9 +448,10 @@ cost_gather(GatherPath *path, PlannerInfo *root,

 	run_cost = path->subpath->total_cost - path->subpath->startup_cost;

-	/* Parallel setup and communication cost. */
+	/* Parallel setup and communication cost, scaled by tuple width. */
 	startup_cost += parallel_setup_cost;
-	run_cost += parallel_tuple_cost * path->path.rows;
+	run_cost += width_adjusted_parallel_tuple_cost(path->path.pathtarget->width) *
+		path->path.rows;

 	path->path.disabled_nodes = path->subpath->disabled_nodes
 		+ ((rel->pgs_mask & PGS_GATHER) != 0 ? 0 : 1);
@@ -509,13 +512,14 @@ cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
 	run_cost += cpu_operator_cost * path->path.rows;

 	/*
-	 * Parallel setup and communication cost.  Since Gather Merge, unlike
-	 * Gather, requires us to block until a tuple is available from every
-	 * worker, we bump the IPC cost up a little bit as compared with Gather.
-	 * For lack of a better idea, charge an extra 5%.
+	 * Parallel setup and communication cost, scaled by tuple width.  Since
+	 * Gather Merge, unlike Gather, requires us to block until a tuple is
+	 * available from every worker, we bump the IPC cost up a little bit as
+	 * compared with Gather.  For lack of a better idea, charge an extra 5%.
 	 */
 	startup_cost += parallel_setup_cost;
-	run_cost += parallel_tuple_cost * path->path.rows * 1.05;
+	run_cost += width_adjusted_parallel_tuple_cost(path->path.pathtarget->width) *
+		path->path.rows * 1.05;

 	path->path.disabled_nodes = path->subpath->disabled_nodes
 		+ ((rel->pgs_mask & PGS_GATHER_MERGE) != 0 ? 0 : 1);
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index d19800ad6a5..88d03ecfb4d 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -580,7 +580,9 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 		gather->plan.startup_cost = top_plan->startup_cost +
 			parallel_setup_cost;
 		gather->plan.total_cost = top_plan->total_cost +
-			parallel_setup_cost + parallel_tuple_cost * top_plan->plan_rows;
+			parallel_setup_cost +
+			width_adjusted_parallel_tuple_cost(top_plan->plan_width) *
+			top_plan->plan_rows;
 		gather->plan.plan_rows = top_plan->plan_rows;
 		gather->plan.plan_width = top_plan->plan_width;
 		gather->plan.parallel_aware = false;
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index f2fd5d31507..d7997779b3e 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -175,6 +175,30 @@ extern void initial_cost_hashjoin(PlannerInfo *root,
 extern void final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 								JoinCostWorkspace *workspace,
 								JoinPathExtraData *extra);
+
+/*
+ * Width-adjusted parallel tuple cost.
+ *
+ * The cost of passing a tuple through the shared-memory tuple queue has a
+ * fixed component (queue synchronization, slot operations) and a variable
+ * component proportional to tuple width (memcpy into/out of the ring buffer).
+ * parallel_tuple_cost is calibrated for PARALLEL_TUPLE_COST_REF_WIDTH bytes;
+ * we scale proportionally so narrow tuples (e.g. partial aggregate results)
+ * are cheaper and wide tuples are more expensive.
+ *
+ * PARALLEL_TUPLE_COST_FIXED_FRAC is the irreducible per-tuple overhead
+ * (queue synchronization) as a fraction of the total cost at the reference
+ * width.
+ */
+#define PARALLEL_TUPLE_COST_REF_WIDTH	100 /* bytes */
+#define PARALLEL_TUPLE_COST_FIXED_FRAC	0.10	/* fixed overhead fraction */
+
+#define width_adjusted_parallel_tuple_cost(width) \
+	(parallel_tuple_cost * \
+	 (PARALLEL_TUPLE_COST_FIXED_FRAC + \
+	  (1.0 - PARALLEL_TUPLE_COST_FIXED_FRAC) * \
+	  (double) Max((width), 1) / PARALLEL_TUPLE_COST_REF_WIDTH))
+
 extern void cost_gather(GatherPath *path, PlannerInfo *root,
 						RelOptInfo *rel, ParamPathInfo *param_info, double *rows);
 extern void cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
--
2.43.0
