diff -rcpb a/backend/postmaster/autovacuum.c b/backend/postmaster/autovacuum.c
*** a/backend/postmaster/autovacuum.c	2015-10-05 12:09:44.000000000 -0700
--- b/backend/postmaster/autovacuum.c	2015-10-26 05:52:16.857928285 -0700
*************** db_comparator(const void *a, const void
*** 1063,1068 ****
--- 1063,1083 ----
  		return (((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1;
  }
  
+ 
+ /* qsort comparison function for oids: returns -1, 0 or 1 for ascending order */
+ static int
+ oid_cmp(const void *p1, const void *p2)
+ {
+ 	Oid		v1 = *((const Oid *) p1);
+ 	Oid		v2 = *((const Oid *) p2);
+ 
+ 	if (v1 < v2)
+ 		return -1;
+ 	if (v1 > v2)
+ 		return 1;
+ 	return 0;
+ }
+ 
  /*
   * do_start_worker
   *
*************** do_autovacuum(void)
*** 1884,1899 ****
  	HeapTuple	tuple;
  	HeapScanDesc relScan;
  	Form_pg_database dbForm;
- 	List	   *table_oids = NIL;
  	HASHCTL		ctl;
  	HTAB	   *table_toast_map;
- 	ListCell   *volatile cell;
  	PgStat_StatDBEntry *shared;
  	PgStat_StatDBEntry *dbentry;
  	BufferAccessStrategy bstrategy;
  	ScanKeyData key;
  	TupleDesc	pg_class_desc;
  	int			effective_multixact_freeze_max_age;
  
  	/*
  	 * StartTransactionCommand and CommitTransactionCommand will automatically
--- 1899,1916 ----
  	HeapTuple	tuple;
  	HeapScanDesc relScan;
  	Form_pg_database dbForm;
  	HASHCTL		ctl;
  	HTAB	   *table_toast_map;
  	PgStat_StatDBEntry *shared;
  	PgStat_StatDBEntry *dbentry;
  	BufferAccessStrategy bstrategy;
  	ScanKeyData key;
  	TupleDesc	pg_class_desc;
  	int			effective_multixact_freeze_max_age;
+ 	Oid			*table_oids;
+ 	int			maxoids,
+ 				numoids,
+ 				oid_idx;
  
  	/*
  	 * StartTransactionCommand and CommitTransactionCommand will automatically
*************** do_autovacuum(void)
*** 1978,1983 ****
--- 1995,2005 ----
  								  &ctl,
  								  HASH_ELEM | HASH_BLOBS);
  
+ 	/* create an array of all the tables that might need action */
+ 	maxoids = 32;
+ 	table_oids = (Oid *) palloc(maxoids * sizeof(Oid));
+ 	numoids = 0;
+ 
  	/*
  	 * Scan pg_class to determine which tables to vacuum.
  	 *
*************** do_autovacuum(void)
*** 2071,2077 ****
  		{
  			/* relations that need work are added to table_oids */
  			if (dovacuum || doanalyze)
! 				table_oids = lappend_oid(table_oids, relid);
  
  			/*
  			 * Remember the association for the second pass.  Note: we must do
--- 2093,2107 ----
  		{
  			/* relations that need work are added to table_oids */
  			if (dovacuum || doanalyze)
! 			{
! 				if (numoids >= maxoids)
! 				{
! 					maxoids *= 2;
! 					table_oids = (Oid *) repalloc(table_oids,
! 												  maxoids * sizeof(Oid));
! 				}
! 				table_oids[numoids++] = relid;
! 			}
  
  			/*
  			 * Remember the association for the second pass.  Note: we must do
*************** do_autovacuum(void)
*** 2155,2167 ****
  
  		/* ignore analyze for toast tables */
  		if (dovacuum)
! 			table_oids = lappend_oid(table_oids, relid);
  	}
  
  	heap_endscan(relScan);
  	heap_close(classRel, AccessShareLock);
  
  	/*
  	 * Create a buffer access strategy object for VACUUM to use.  We want to
  	 * use the same one across all the vacuum operations we perform, since the
  	 * point is for VACUUM not to blow out the shared cache.
--- 2185,2214 ----
  
  		/* ignore analyze for toast tables */
  		if (dovacuum)
! 		{
! 			if (numoids >= maxoids)
! 			{
! 				maxoids *= 2;
! 				table_oids = (Oid *) repalloc(table_oids,
! 											  maxoids * sizeof(Oid));
! 			}
! 			table_oids[numoids++] = relid;
! 		}
! 
  	}
  
  	heap_endscan(relScan);
  	heap_close(classRel, AccessShareLock);
  
  	/*
+ 	 * Sort the oids so that all workers process tables in the same order.
+ 	 * Each worker then skips past the highest oid already claimed by any
+ 	 * worker in this database, so workers avoid competing for the same table.
+ 	 */
+ 	if (numoids > 1)
+ 		qsort(table_oids, numoids, sizeof(Oid), oid_cmp);
+ 
+ 	/*
  	 * Create a buffer access strategy object for VACUUM to use.  We want to
  	 * use the same one across all the vacuum operations we perform, since the
  	 * point is for VACUUM not to blow out the shared cache.
*************** do_autovacuum(void)
*** 2181,2191 ****
  	/*
  	 * Perform operations on collected tables.
  	 */
! 	foreach(cell, table_oids)
  	{
- 		Oid			relid = lfirst_oid(cell);
  		autovac_table *tab;
! 		bool		skipit;
  		int			stdVacuumCostDelay;
  		int			stdVacuumCostLimit;
  		dlist_iter	iter;
--- 2228,2238 ----
  	/*
  	 * Perform operations on collected tables.
  	 */
! 	for(oid_idx = 0; oid_idx < numoids; )
  	{
  		autovac_table *tab;
! 		Oid			relid;
! 		Oid			highwateroid;
  		int			stdVacuumCostDelay;
  		int			stdVacuumCostLimit;
  		dlist_iter	iter;
*************** do_autovacuum(void)
*** 2209,2277 ****
  		}
  
  		/*
! 		 * hold schedule lock from here until we're sure that this table still
! 		 * needs vacuuming.  We also need the AutovacuumLock to walk the
! 		 * worker array, but we'll let go of that one quickly.
  		 */
! 		LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
! 		LWLockAcquire(AutovacuumLock, LW_SHARED);
! 
  		/*
! 		 * Check whether the table is being vacuumed concurrently by another
! 		 * worker.
  		 */
! 		skipit = false;
  		dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
  		{
  			WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
  
! 			/* ignore myself */
! 			if (worker == MyWorkerInfo)
! 				continue;
  
! 			/* ignore workers in other databases */
! 			if (worker->wi_dboid != MyDatabaseId)
! 				continue;
  
- 			if (worker->wi_tableoid == relid)
- 			{
- 				skipit = true;
- 				break;
- 			}
- 		}
  		LWLockRelease(AutovacuumLock);
! 		if (skipit)
! 		{
! 			LWLockRelease(AutovacuumScheduleLock);
  			continue;
- 		}
  
  		/*
  		 * Check whether pgstat data still says we need to vacuum this table.
! 		 * It could have changed if something else processed the table while
! 		 * we weren't looking.
! 		 *
! 		 * Note: we have a special case in pgstat code to ensure that the
! 		 * stats we read are as up-to-date as possible, to avoid the problem
! 		 * that somebody just finished vacuuming this table.  The window to
! 		 * the race condition is not closed but it is very small.
  		 */
  		MemoryContextSwitchTo(AutovacMemCxt);
  		tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc,
  									effective_multixact_freeze_max_age);
  		if (tab == NULL)
- 		{
  			/* someone else vacuumed the table, or it went away */
- 			LWLockRelease(AutovacuumScheduleLock);
  			continue;
- 		}
- 
- 		/*
- 		 * Ok, good to go.  Store the table in shared memory before releasing
- 		 * the lock so that other workers don't vacuum it concurrently.
- 		 */
- 		MyWorkerInfo->wi_tableoid = relid;
- 		LWLockRelease(AutovacuumScheduleLock);
  
  		/*
  		 * Remember the prevailing values of the vacuum cost GUCs.  We have to
--- 2256,2313 ----
  		}
  
  		/*
! 		 * Scan the sorted array of table oids looking for the first one that
! 		 * has not already been claimed for consideration by some autovacuum
! 		 * worker.  We hold AutovacuumLock exclusively to walk the worker array
! 		 * and to update our WorkerInfo with the chosen candidate table.
  		 */
! 		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
  		/*
! 		 * Find the highest table oid claimed by any worker in our database.
  		 */
! 		highwateroid = relid = InvalidOid;
  		dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
  		{
  			WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
  
! 			if (worker->wi_dboid == MyDatabaseId
! 				&& worker->wi_tableoid >= highwateroid)
! 				highwateroid = worker->wi_tableoid;
! 		}
  
! 		/*
! 		 * Skip past the highwater oid to find the next table to consider
! 		 * vacuuming. There may not be one, but either way, store our choice
! 		 * in shared memory before releasing the lock so that other workers
! 		 * can see it. This avoids workers racing to vacuum the same table.
! 		 */
! 		while (oid_idx < numoids && relid <= highwateroid)
! 			relid = table_oids[oid_idx++];
! 		if (relid <= highwateroid)
! 			relid = InvalidOid;
! 		MyWorkerInfo->wi_tableoid = relid;
  
  		LWLockRelease(AutovacuumLock);
! 
! 		/* Is there a table to work on? */
! 		if (relid == InvalidOid)
  		    continue;
  
  		/*
  		 * Check whether pgstat data still says we need to vacuum this table.
! 		 * It could have changed if something else processed the table or even
! 		 * dropped it while we weren't looking.
! 		 * FIXME: This re-check is not really needed since workers won't try to
! 		 * vacuum the same table. It could protect against a manual vacuum if
! 		 * that was started after we built the table list and before now, but it
! 		 * is a lot of overhead to protect against a low probability scenario.
  		 */
  		MemoryContextSwitchTo(AutovacMemCxt);
  		tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc,
  									effective_multixact_freeze_max_age);
  		if (tab == NULL)
  			/* someone else vacuumed the table, or it went away */
  			continue;
  
  		/*
  		 * Remember the prevailing values of the vacuum cost GUCs.  We have to
*************** table_recheck_autovac(Oid relid, HTAB *t
*** 2479,2487 ****
  	bool		wraparound;
  	AutoVacOpts *avopts;
  
! 	/* use fresh stats */
! 	autovac_refresh_stats();
! 
  	shared = pgstat_fetch_stat_dbentry(InvalidOid);
  	dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
  
--- 2515,2529 ----
  	bool		wraparound;
  	AutoVacOpts *avopts;
  
! 	/*
! 	 * FIXME: For the immediate fix, just drop the call to
! 	 * autovac_refresh_stats(), as it drives the stats collector wild.
! 	 * We don't need it because workers no longer compete for tables
! 	 * to vacuum. Longer term, most of this re-checking duplicates work
! 	 * done to build the original list of tables to vacuum.  All that is
! 	 * needed here is to make sure the table was not dropped and to
! 	 * retrieve the table details.
! 	 */
  	shared = pgstat_fetch_stat_dbentry(InvalidOid);
  	dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
  
*************** AutoVacuumShmemInit(void)
*** 2961,2986 ****
   * such refreshing in the autovacuum launcher.  This is mostly to avoid
   * rereading the pgstats files too many times in quick succession when there
   * are many databases.
-  *
-  * Note: we avoid throttling in the autovac worker, as it would be
-  * counterproductive in the recheck logic.
   */
  static void
  autovac_refresh_stats(void)
  {
- 	if (IsAutoVacuumLauncherProcess())
- 	{
  		static TimestampTz last_read = 0;
  		TimestampTz current_time;
  
  		current_time = GetCurrentTimestamp();
- 
  		if (!TimestampDifferenceExceeds(last_read, current_time,
  										STATS_READ_DELAY))
  			return;
  
  		last_read = current_time;
- 	}
- 
  	pgstat_clear_snapshot();
  }
--- 3003,3020 ----
diff -rcpb a/backend/postmaster/pgstat.c b/backend/postmaster/pgstat.c
*** a/backend/postmaster/pgstat.c	2015-10-05 12:09:44.000000000 -0700
--- b/backend/postmaster/pgstat.c	2015-10-22 06:12:43.356049410 -0700
*************** backend_read_statsfile(void)
*** 4545,4554 ****
  			/*
  			 * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
  			 * msec before now.  This indirectly ensures that the collector
! 			 * needn't write the file more often than PGSTAT_STAT_INTERVAL. In
! 			 * an autovacuum worker, however, we want a lower delay to avoid
! 			 * using stale data, so we use PGSTAT_RETRY_DELAY (since the
! 			 * number of workers is low, this shouldn't be a problem).
  			 *
  			 * We don't recompute min_ts after sleeping, except in the
  			 * unlikely case that cur_ts went backwards.  So we might end up
--- 4545,4551 ----
  			/*
  			 * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
  			 * msec before now.  This indirectly ensures that the collector
! 			 * needn't write the file more often than PGSTAT_STAT_INTERVAL.
  			 *
  			 * We don't recompute min_ts after sleeping, except in the
  			 * unlikely case that cur_ts went backwards.  So we might end up
*************** backend_read_statsfile(void)
*** 4559,4570 ****
  			 * actually accept.
  			 */
  			ref_ts = cur_ts;
! 			if (IsAutoVacuumWorkerProcess())
! 				min_ts = TimestampTzPlusMilliseconds(ref_ts,
! 													 -PGSTAT_RETRY_DELAY);
! 			else
! 				min_ts = TimestampTzPlusMilliseconds(ref_ts,
! 													 -PGSTAT_STAT_INTERVAL);
  		}
  
  		/*
--- 4556,4562 ----
  			 * actually accept.
  			 */
  			ref_ts = cur_ts;
! 			min_ts = TimestampTzPlusMilliseconds(ref_ts, -PGSTAT_STAT_INTERVAL);
  		}
  
  		/*
diff -rcpb a/include/storage/lwlock.h b/include/storage/lwlock.h
*** a/include/storage/lwlock.h	2015-10-05 12:09:44.000000000 -0700
--- b/include/storage/lwlock.h	2015-10-22 03:40:14.715893232 -0700
*************** extern PGDLLIMPORT LWLockPadded *MainLWL
*** 117,123 ****
  #define BtreeVacuumLock				(&MainLWLockArray[20].lock)
  #define AddinShmemInitLock			(&MainLWLockArray[21].lock)
  #define AutovacuumLock				(&MainLWLockArray[22].lock)
- #define AutovacuumScheduleLock		(&MainLWLockArray[23].lock)
  #define SyncScanLock				(&MainLWLockArray[24].lock)
  #define RelationMappingLock			(&MainLWLockArray[25].lock)
  #define AsyncCtlLock				(&MainLWLockArray[26].lock)
--- 117,122 ----
