*** a/src/backend/postmaster/autovacuum.c
--- b/src/backend/postmaster/autovacuum.c
***************
*** 223,231 **** typedef struct WorkerInfoData
  	Oid			wi_tableoid;
  	PGPROC	   *wi_proc;
  	TimestampTz wi_launchtime;
! 	int			wi_cost_delay;
! 	int			wi_cost_limit;
  	int			wi_cost_limit_base;
  } WorkerInfoData;
  
  typedef struct WorkerInfoData *WorkerInfo;
--- 223,234 ----
  	Oid			wi_tableoid;
  	PGPROC	   *wi_proc;
  	TimestampTz wi_launchtime;
! 	/* the "base" values are the configured values */
  	int			wi_cost_limit_base;
+ 	int			wi_cost_delay_base;
+ 	/* these are the ones actually in effect, considering balancing */
+ 	int			wi_cost_limit;
+ 	int			wi_cost_delay;
  } WorkerInfoData;
  
  typedef struct WorkerInfoData *WorkerInfo;
***************
*** 248,253 **** typedef enum
--- 251,257 ----
   *
   * av_signal		set by other processes to indicate various conditions
   * av_launcherpid	the PID of the autovacuum launcher
+  * av_vacuum_cost_*	globally configured values for the cost_delay feature
   * av_freeWorkers	the WorkerInfo freelist
   * av_runningWorkers the WorkerInfo non-free queue
   * av_startingWorker pointer to WorkerInfo currently being started (cleared by
***************
*** 261,266 **** typedef struct
--- 265,272 ----
  {
  	sig_atomic_t av_signal[AutoVacNumSignals];
  	pid_t		av_launcherpid;
+ 	int			av_vacuum_cost_delay;
+ 	int			av_vacuum_cost_limit;
  	dlist_head	av_freeWorkers;
  	dlist_head	av_runningWorkers;
  	WorkerInfo	av_startingWorker;
***************
*** 296,301 **** static List *get_database_list(void);
--- 302,309 ----
  static void rebuild_database_list(Oid newdb);
  static int	db_comparator(const void *a, const void *b);
  static void autovac_balance_cost(void);
+ static int choose_vacuum_cost_delay(AutoVacOpts *avopts);
+ static int choose_vacuum_cost_limit(AutoVacOpts *avopts);
  
  static void do_autovacuum(void);
  static void FreeWorkerInfo(int code, Datum arg);
***************
*** 566,571 **** AutoVacLauncherMain(int argc, char *argv[])
--- 574,583 ----
  	SetConfigOption("default_transaction_isolation", "read committed",
  					PGC_SUSET, PGC_S_OVERRIDE);
  
+ 	/* Set up initial values for cost delay balancing algorithm */
+ 	AutoVacuumShmem->av_vacuum_cost_limit = choose_vacuum_cost_limit(NULL);
+ 	AutoVacuumShmem->av_vacuum_cost_delay = choose_vacuum_cost_delay(NULL);
+ 
  	/* in emergency mode, just start a worker and go away */
  	if (!AutoVacuumingActive())
  	{
***************
*** 638,643 **** AutoVacLauncherMain(int argc, char *argv[])
--- 650,657 ----
  
  			/* rebalance in case the default cost parameters changed */
  			LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
+ 			AutoVacuumShmem->av_vacuum_cost_limit = choose_vacuum_cost_limit(NULL);
+ 			AutoVacuumShmem->av_vacuum_cost_delay = choose_vacuum_cost_delay(NULL);
  			autovac_balance_cost();
  			LWLockRelease(AutovacuumLock);
  
***************
*** 1717,1722 **** FreeWorkerInfo(int code, Datum arg)
--- 1731,1737 ----
  		MyWorkerInfo->wi_proc = NULL;
  		MyWorkerInfo->wi_launchtime = 0;
  		MyWorkerInfo->wi_cost_delay = 0;
+ 		MyWorkerInfo->wi_cost_delay_base = 0;
  		MyWorkerInfo->wi_cost_limit = 0;
  		MyWorkerInfo->wi_cost_limit_base = 0;
  		dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
***************
*** 1742,1749 **** AutoVacuumUpdateDelay(void)
--- 1757,1766 ----
  {
  	if (MyWorkerInfo)
  	{
+ 		LWLockAcquire(AutovacuumLock, LW_SHARED);
  		VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
  		VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
+ 		LWLockRelease(AutovacuumLock);
  	}
  }
  
***************
*** 1756,1830 **** AutoVacuumUpdateDelay(void)
  static void
  autovac_balance_cost(void)
  {
- 	/*
- 	 * The idea here is that we ration out I/O equally.  The amount of I/O
- 	 * that a worker can consume is determined by cost_limit/cost_delay, so we
- 	 * try to equalize those ratios rather than the raw limit settings.
- 	 *
- 	 * note: in cost_limit, zero also means use value from elsewhere, because
- 	 * zero is not a valid value.
- 	 */
- 	int			vac_cost_limit = (autovacuum_vac_cost_limit > 0 ?
- 								autovacuum_vac_cost_limit : VacuumCostLimit);
- 	int			vac_cost_delay = (autovacuum_vac_cost_delay >= 0 ?
- 								autovacuum_vac_cost_delay : VacuumCostDelay);
- 	double		cost_total;
- 	double		cost_avail;
  	dlist_iter	iter;
! 
! 	/* not set? nothing to do */
! 	if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
  		return;
  
! 	/* caculate the total base cost limit of active workers */
! 	cost_total = 0.0;
  	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
  	{
  		WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
  
! 		if (worker->wi_proc != NULL &&
! 			worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
! 			cost_total +=
! 				(double) worker->wi_cost_limit_base / worker->wi_cost_delay;
  	}
- 	/* there are no cost limits -- nothing to do */
- 	if (cost_total <= 0)
- 		return;
  
  	/*
! 	 * Adjust cost limit of each active worker to balance the total of cost
! 	 * limit to autovacuum_vacuum_cost_limit.
  	 */
- 	cost_avail = (double) vac_cost_limit / vac_cost_delay;
  	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
  	{
  		WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
  
! 		if (worker->wi_proc != NULL &&
! 			worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
  		{
- 			int			limit = (int)
- 			(cost_avail * worker->wi_cost_limit_base / cost_total);
  
! 			/*
! 			 * We put a lower bound of 1 on the cost_limit, to avoid division-
! 			 * by-zero in the vacuum code.  Also, in case of roundoff trouble
! 			 * in these calculations, let's be sure we don't ever set
! 			 * cost_limit to more than the base value.
! 			 */
! 			worker->wi_cost_limit = Max(Min(limit,
! 											worker->wi_cost_limit_base),
! 										1);
! 
! 			elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, cost_limit=%d, cost_limit_base=%d, cost_delay=%d)",
! 				 worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
! 				 worker->wi_cost_limit, worker->wi_cost_limit_base,
! 				 worker->wi_cost_delay);
  		}
  	}
  }
  
  /*
   * get_database_list
   *		Return a list of all databases found in pg_database.
   *
--- 1773,1954 ----
  static void
  autovac_balance_cost(void)
  {
  	dlist_iter	iter;
! 	int			num_regular_workers = 0;
! 	int			num_fast_workers = 0;
! 	float4		global_equiv_delay;
! 	float4		fast_equiv_delay = 0;
! 	float4		regular_total_equiv_delay = 0;
! 	float4		fast_total_equiv_delay = 0;
! 
! 	/* not set in this worker? nothing to do */
! 	if (MyWorkerInfo && (MyWorkerInfo->wi_cost_limit_base <= 0 ||
! 						 MyWorkerInfo->wi_cost_delay_base <= 0))
  		return;
  
! 	/* global settings unset? nothing to balance, and avoids dividing by 0 */
! 	if (AutoVacuumShmem->av_vacuum_cost_limit <= 0 ||
! 		AutoVacuumShmem->av_vacuum_cost_delay <= 0)
! 		return;
! 
! 	/*
! 	 * "Equivalent delay" is cost_limit / cost_delay.  The "global" value is
! 	 * computed from the GUC settings saved in shared memory (XXX values set
! 	 * in ALTER DATABASE have no effect here; probably need to fix that).
! 	 * Regular workers have an equiv delay <= the global one, fast workers a
! 	 * greater one.  Divide in floating point so the ratio is not truncated.
! 	 */
! 	global_equiv_delay = (float4) AutoVacuumShmem->av_vacuum_cost_limit /
! 		AutoVacuumShmem->av_vacuum_cost_delay;
! 
! 	/*
! 	 * Find the sum of equiv delay values in each class; also find the equiv
! 	 * delay of the fastest among all the fast workers.
! 	 */
  	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
  	{
  		WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
+ 		float4		this_equiv_delay;
+ 
+ 		/* ignore inactive workers, and workers not using cost_delay */
+ 		if (worker->wi_proc == NULL ||
+ 			worker->wi_cost_limit_base <= 0 || worker->wi_cost_delay_base <= 0)
+ 			continue;
+ 
+ 		this_equiv_delay = (float4) worker->wi_cost_limit_base / worker->wi_cost_delay_base;
  
! 		if (this_equiv_delay <= global_equiv_delay)
! 		{
! 			/* A regular worker -- count it and add its equiv delay as such */
! 			num_regular_workers++;
! 			regular_total_equiv_delay += this_equiv_delay;
! 		}
! 		else
! 		{
! 			/*
! 			 * A fast worker.  Count and add its equiv delay in a separate
! 			 * total; we also use these to determine the fast_equiv_delay which
! 			 * is the value which we will distribute to all of them, separately
! 			 * from global_equiv_delay.
! 			 */
! 			if (num_fast_workers == 0)
! 				fast_equiv_delay = this_equiv_delay;
! 			else if (this_equiv_delay > fast_equiv_delay)
! 				fast_equiv_delay = this_equiv_delay;
! 			num_fast_workers++;
! 			fast_total_equiv_delay += this_equiv_delay;
! 		}
  	}
  
  	/*
! 	 * FIXME We shouldn't actually distribute all of global equiv delay to
! 	 * regular workers and all of (fast equiv delay - global equiv delay) to
! 	 * fast workers; that makes fast workers much slower if the fast equiv
! 	 * delay is only slightly faster than regular workers.  For example,
! 	 * consider the scenario with one regular worker with equiv_delay=10 and
! 	 * one fast worker with equiv_delay=11; after subtraction the fast worker
! 	 * will have equiv_delay=1.  This part needs more fiddling to avoid this
! 	 * problem.
! 	 *
! 	 * Perhaps the way to solve this is to consider that all workers (including
! 	 * fast ones) get their fraction of global delay, and then fast workers get
! 	 * their pro-rated share of fast equiv delay *added* to that.  This would
! 	 * make regular workers slower.
! 	 *
! 	 * Another idea is to subtract only a fraction of global_equiv_delay from
! 	 * fast_equiv_delay here, rather than all of it (and of course decrease
! 	 * global_equiv_delay accordingly).  The fraction would be variable: if
! 	 * fast_equiv_delay is much higher than global_equiv_delay, then subtract
! 	 * all of it; if both values are close enough, subtract only half (or
! 	 * rather than subtracting half, subtract a pro-rated fraction according to
! 	 * the number of workers in each class.)
! 	 */
! 	if (num_regular_workers > 0 && num_fast_workers > 0)
! 		fast_equiv_delay -= global_equiv_delay;
! 
! 	/*
! 	 * Now we have all parameters we need; compute the values for individual
! 	 * workers.
  	 */
  	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
  	{
  		WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
+ 		float4		this_equiv_delay_base;
+ 		float4		this_equiv_delay_frac;
+ 
+ 		/* again, ignore inactive workers, and workers not using cost_delay */
+ 		if (worker->wi_proc == NULL ||
+ 			worker->wi_cost_limit_base <= 0 || worker->wi_cost_delay_base <= 0)
+ 			continue;
  
! 		this_equiv_delay_base = (float4) worker->wi_cost_limit_base / worker->wi_cost_delay_base;
! 
! 		/*
! 		 * If it's a regular worker, use pro-rated fraction of global_equiv_delay;
! 		 * otherwise, use pro-rated fraction of fast_total_equiv_delay
! 		 */
! 		if (this_equiv_delay_base <= global_equiv_delay)
  		{
  
! 			/* A regular worker; use a pro-rated fraction of global_equiv_delay */
! 			this_equiv_delay_frac =
! 				this_equiv_delay_base * global_equiv_delay / regular_total_equiv_delay;
  		}
+ 		else
+ 		{
+ 			/* a fast worker: use a pro-rated fraction of fast_equiv_delay */
+ 			this_equiv_delay_frac =
+ 				this_equiv_delay_base * fast_equiv_delay / fast_total_equiv_delay;
+ 		}
+ 
+ 		/*
+ 		 * Convert back into cost_limit and cost_delay values, and set them
+ 		 * into the worker's shmem struct.  Never exceed the configured base
+ 		 * limit; the lower bound of 1 avoids division-by-zero in vacuum code.
+ 		 */
+ 		worker->wi_cost_limit =
+ 			Max(Min((int) (this_equiv_delay_frac * worker->wi_cost_delay_base),
+ 					worker->wi_cost_limit_base), 1);
+ 		worker->wi_cost_delay = worker->wi_cost_delay_base;
+ 
+ 		elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, cost_limit=%d, cost_delay=%d)",
+ 			 worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
+ 			 worker->wi_cost_limit, worker->wi_cost_delay);
  	}
  }
  
  /*
+  * Determine the vacuum_cost_delay value to use: if this is a table and it has
+  * reloptions, use the value from there; otherwise the autovacuum parameter,
+  * unless it's -1 in which case we use plain vacuum_cost_delay.
+  */
+ static int
+ choose_vacuum_cost_delay(AutoVacOpts *avopts)
+ {
+ 	/* a per-table reloption, when non-negative, takes precedence */
+ 	if (avopts != NULL && avopts->vacuum_cost_delay >= 0)
+ 		return avopts->vacuum_cost_delay;
+ 	return (autovacuum_vac_cost_delay < 0) ?
+ 		VacuumCostDelay : autovacuum_vac_cost_delay;
+ }
+ 
+ /*
+  * Determine the vacuum_cost_limit value to use: if this is a table and it has
+  * reloptions, use the value from there; otherwise the autovacuum parameter,
+  * unless it's -1 or 0 in which case we use plain vacuum_cost_limit.
+  */
+ static int
+ choose_vacuum_cost_limit(AutoVacOpts *avopts)
+ {
+ 	/* only strictly positive settings count; zero is not a valid limit */
+ 	if (avopts != NULL && avopts->vacuum_cost_limit > 0)
+ 		return avopts->vacuum_cost_limit;
+ 
+ 	return (autovacuum_vac_cost_limit <= 0) ?
+ 		VacuumCostLimit : autovacuum_vac_cost_limit;
+ }
+ 
+ /*
   * get_database_list
   *		Return a list of all databases found in pg_database.
   *
***************
*** 2202,2209 **** do_autovacuum(void)
  		Oid			relid = lfirst_oid(cell);
  		autovac_table *tab;
  		bool		skipit;
- 		int			stdVacuumCostDelay;
- 		int			stdVacuumCostLimit;
  		dlist_iter	iter;
  
  		CHECK_FOR_INTERRUPTS();
--- 2326,2331 ----
***************
*** 2272,2302 **** do_autovacuum(void)
  		MyWorkerInfo->wi_tableoid = relid;
  		LWLockRelease(AutovacuumScheduleLock);
  
- 		/*
- 		 * Remember the prevailing values of the vacuum cost GUCs.  We have to
- 		 * restore these at the bottom of the loop, else we'll compute wrong
- 		 * values in the next iteration of autovac_balance_cost().
- 		 */
- 		stdVacuumCostDelay = VacuumCostDelay;
- 		stdVacuumCostLimit = VacuumCostLimit;
- 
  		/* Must hold AutovacuumLock while mucking with cost balance info */
  		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
  
  		/* advertise my cost delay parameters for the balancing algorithm */
  		MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
  		MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
  		MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
  
  		/* do a balance */
  		autovac_balance_cost();
  
- 		/* set the active cost parameters from the result of that */
- 		AutoVacuumUpdateDelay();
- 
  		/* done */
  		LWLockRelease(AutovacuumLock);
  
  		/* clean up memory before each iteration */
  		MemoryContextResetAndDeleteChildren(PortalContext);
  
--- 2394,2417 ----
  		MyWorkerInfo->wi_tableoid = relid;
  		LWLockRelease(AutovacuumScheduleLock);
  
  		/* Must hold AutovacuumLock while mucking with cost balance info */
  		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
  
  		/* advertise my cost delay parameters for the balancing algorithm */
  		MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
+ 		MyWorkerInfo->wi_cost_delay_base = tab->at_vacuum_cost_delay;
  		MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
  		MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
  
  		/* do a balance */
  		autovac_balance_cost();
  
  		/* done */
  		LWLockRelease(AutovacuumLock);
  
+ 		/* set the active cost parameters from the result of that */
+ 		AutoVacuumUpdateDelay();
+ 
  		/* clean up memory before each iteration */
  		MemoryContextResetAndDeleteChildren(PortalContext);
  
***************
*** 2381,2390 **** deleted:
  		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
  		MyWorkerInfo->wi_tableoid = InvalidOid;
  		LWLockRelease(AutovacuumLock);
- 
- 		/* restore vacuum cost GUCs for the next iteration */
- 		VacuumCostDelay = stdVacuumCostDelay;
- 		VacuumCostLimit = stdVacuumCostLimit;
  	}
  
  	/*
--- 2496,2501 ----
***************
*** 2532,2550 **** table_recheck_autovac(Oid relid, HTAB *table_toast_map,
  		 * defaults, autovacuum's own first and plain vacuum second.
  		 */
  
! 		/* -1 in autovac setting means use plain vacuum_cost_delay */
! 		vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= 0)
! 			? avopts->vacuum_cost_delay
! 			: (autovacuum_vac_cost_delay >= 0)
! 			? autovacuum_vac_cost_delay
! 			: VacuumCostDelay;
! 
! 		/* 0 or -1 in autovac setting means use plain vacuum_cost_limit */
! 		vac_cost_limit = (avopts && avopts->vacuum_cost_limit > 0)
! 			? avopts->vacuum_cost_limit
! 			: (autovacuum_vac_cost_limit > 0)
! 			? autovacuum_vac_cost_limit
! 			: VacuumCostLimit;
  
  		/* these do not have autovacuum-specific settings */
  		freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
--- 2643,2650 ----
  		 * defaults, autovacuum's own first and plain vacuum second.
  		 */
  
! 		vac_cost_delay = choose_vacuum_cost_delay(avopts);
! 		vac_cost_limit = choose_vacuum_cost_limit(avopts);
  
  		/* these do not have autovacuum-specific settings */
  		freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
***************
*** 2935,2940 **** AutoVacuumShmemInit(void)
--- 3035,3042 ----
  		dlist_init(&AutoVacuumShmem->av_freeWorkers);
  		dlist_init(&AutoVacuumShmem->av_runningWorkers);
  		AutoVacuumShmem->av_startingWorker = NULL;
+ 		AutoVacuumShmem->av_vacuum_cost_limit = 0;
+ 		AutoVacuumShmem->av_vacuum_cost_delay = 0;
  
  		worker = (WorkerInfo) ((char *) AutoVacuumShmem +
  							   MAXALIGN(sizeof(AutoVacuumShmemStruct)));
