diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 861048f..8669bd8 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -97,6 +97,8 @@ static void update_attstats(Oid relid, bool inh,
 static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
 static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
 
+static int perform_density_correction(int samplerows, HeapTuple *rows,
+									  BlockSampler bs);
 
 /*
  *	analyze_rel() -- analyze one relation
@@ -974,18 +976,15 @@ acquire_sample_rows(Relation onerel, int elevel,
 					HeapTuple *rows, int targrows,
 					double *totalrows, double *totaldeadrows)
 {
-	int			numrows = 0;	/* # rows now in reservoir */
-	double		samplerows = 0; /* total # rows collected */
+	int			samplerows = 0; /* total # rows collected */
 	double		liverows = 0;	/* # live rows seen */
 	double		deadrows = 0;	/* # dead rows seen */
-	double		rowstoskip = -1;	/* -1 means not set yet */
 	BlockNumber totalblocks;
 	TransactionId OldestXmin;
 	BlockSamplerData bs;
-	ReservoirStateData rstate;
 
 	Assert(targrows > 0);
 
 	totalblocks = RelationGetNumberOfBlocks(onerel);
 
 	/* Need a cutoff xmin for HeapTupleSatisfiesVacuum */
@@ -993,20 +992,28 @@ acquire_sample_rows(Relation onerel, int elevel,
 
 	/* Prepare for sampling block numbers */
 	BlockSampler_Init(&bs, totalblocks, targrows, random());
-	/* Prepare for sampling rows */
-	reservoir_init_selection_state(&rstate, targrows);
 
 	/* Outer loop over blocks to sample */
 	while (BlockSampler_HasMore(&bs))
 	{
-		BlockNumber targblock = BlockSampler_Next(&bs);
+		SampledBlock targblock = BlockSampler_Next(&bs);
 		Buffer		targbuffer;
 		Page		targpage;
 		OffsetNumber targoffset,
 					maxoffset;
 
+		HeapTuple  *blockrows;
+		int			nblockrows;
+
 		vacuum_delay_point();
 
+		/* prefetch */
+		{
+			int b;
+			for (b = bs.cblock; (b < Min(bs.nblocks, bs.cblock+16)); b++)
+				PrefetchBuffer(onerel, MAIN_FORKNUM, bs.blocks[b].blockno);
+		}
+
 		/*
 		 * We must maintain a pin on the target page's buffer to ensure that
 		 * the maxoffset value stays good (else concurrent VACUUM might delete
@@ -1016,12 +1023,15 @@ acquire_sample_rows(Relation onerel, int elevel,
 		 * tuple, but since we aren't doing much work per tuple, the extra
 		 * lock traffic is probably better avoided.
 		 */
-		targbuffer = ReadBufferExtended(onerel, MAIN_FORKNUM, targblock,
+		targbuffer = ReadBufferExtended(onerel, MAIN_FORKNUM, targblock->blockno,
 										RBM_NORMAL, vac_strategy);
 		LockBuffer(targbuffer, BUFFER_LOCK_SHARE);
 		targpage = BufferGetPage(targbuffer);
 		maxoffset = PageGetMaxOffsetNumber(targpage);
 
+		blockrows = (HeapTuple*)palloc0(maxoffset * sizeof(HeapTuple));
+		nblockrows = 0;
+
 		/* Inner loop over all tuples on the selected page */
 		for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; targoffset++)
 		{
@@ -1044,7 +1054,7 @@ acquire_sample_rows(Relation onerel, int elevel,
 				continue;
 			}
 
-			ItemPointerSet(&targtuple.t_self, targblock, targoffset);
+			ItemPointerSet(&targtuple.t_self, targblock->blockno, targoffset);
 
 			targtuple.t_tableOid = RelationGetRelid(onerel);
 			targtuple.t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
@@ -1118,66 +1128,49 @@ acquire_sample_rows(Relation onerel, int elevel,
 			}
 
 			if (sample_it)
-			{
-				/*
-				 * The first targrows sample rows are simply copied into the
-				 * reservoir. Then we start replacing tuples in the sample
-				 * until we reach the end of the relation.  This algorithm is
-				 * from Jeff Vitter's paper (see full citation below). It
-				 * works by repeatedly computing the number of tuples to skip
-				 * before selecting a tuple, which replaces a randomly chosen
-				 * element of the reservoir (current set of tuples).  At all
-				 * times the reservoir is a true random sample of the tuples
-				 * we've passed over so far, so when we fall off the end of
-				 * the relation we're done.
-				 */
-				if (numrows < targrows)
-					rows[numrows++] = heap_copytuple(&targtuple);
-				else
-				{
-					/*
-					 * t in Vitter's paper is the number of records already
-					 * processed.  If we need to compute a new S value, we
-					 * must use the not-yet-incremented value of samplerows as
-					 * t.
-					 */
-					if (rowstoskip < 0)
-						rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows);
-
-					if (rowstoskip <= 0)
-					{
-						/*
-						 * Found a suitable tuple, so save it, replacing one
-						 * old tuple at random
-						 */
-						int			k = (int) (targrows * sampler_random_fract(rstate.randstate));
+				blockrows[nblockrows++] = heap_copytuple(&targtuple);
+		}
 
-						Assert(k >= 0 && k < targrows);
-						heap_freetuple(rows[k]);
-						rows[k] = heap_copytuple(&targtuple);
-					}
+		/* Now release the lock and pin on the page */
+		UnlockReleaseBuffer(targbuffer);
 
-					rowstoskip -= 1;
-				}
+		/* remember how many tuples were on the block */
+		targblock->ntuples = nblockrows;
 
-				samplerows += 1;
-			}
+		/* if we have more tuples than needed, we'll evict some first */
+		while (targblock->ntarget < nblockrows)
+		{
+			int r = pg_lrand48() % nblockrows;
+			heap_freetuple(blockrows[r]);
+			blockrows[r] = blockrows[nblockrows - 1];
+			nblockrows--;
 		}
 
-		/* Now release the lock and pin on the page */
-		UnlockReleaseBuffer(targbuffer);
+		/* now just copy the tuples into the sample */
+		{
+			int i;
+
+			/* never should be able to overflow the sample here */
+			Assert(nblockrows + samplerows <= targrows);
+
+			for (i = 0; i < nblockrows; i++)
+				rows[samplerows++] = blockrows[i];
+		}
 	}
 
 	/*
-	 * If we didn't find as many tuples as we wanted then we're done. No sort
-	 * is needed, since they're already in order.
-	 *
-	 * Otherwise we need to sort the collected tuples by position
-	 * (itempointer). It's not worth worrying about corner cases where the
-	 * tuples are already sorted.
+	 * We need to sort the collected tuples by position (itempointer).
+	 * It's not worth worrying about corner cases where the tuples are
+	 * already sorted.
+	 */
+	if (samplerows == targrows)
+		qsort((void *) rows, samplerows, sizeof(HeapTuple), compare_rows);
+
+	/*
+	 * Perform tuple-density correction - find the block with the
+	 * highest density, scale samples for the less dense blocks.
 	 */
-	if (numrows == targrows)
-		qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
+	samplerows = perform_density_correction(samplerows, rows, &bs);
 
 	/*
 	 * Estimate total numbers of rows in relation.  For live rows, use
@@ -1187,10 +1180,10 @@ acquire_sample_rows(Relation onerel, int elevel,
 	 */
 	*totalrows = vac_estimate_reltuples(onerel, true,
 										totalblocks,
-										bs.m,
+										bs.nblocks,
 										liverows);
-	if (bs.m > 0)
-		*totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5);
+	if (bs.nblocks > 0)
+		*totaldeadrows = floor((deadrows / bs.nblocks) * totalblocks + 0.5);
 	else
 		*totaldeadrows = 0.0;
 
@@ -1202,11 +1195,11 @@ acquire_sample_rows(Relation onerel, int elevel,
 					"containing %.0f live rows and %.0f dead rows; "
 					"%d rows in sample, %.0f estimated total rows",
 					RelationGetRelationName(onerel),
-					bs.m, totalblocks,
+					bs.nblocks, totalblocks,
 					liverows, deadrows,
-					numrows, *totalrows)));
+					samplerows, *totalrows)));
 
-	return numrows;
+	return samplerows;
 }
 
 /*
@@ -2675,3 +2670,69 @@ compare_mcvs(const void *a, const void *b)
 
 	return da - db;
 }
+
+static int
+perform_density_correction(int samplerows, HeapTuple *rows, BlockSampler bs)
+{
+	int	i, j;
+	int	starttuple = 0;
+	int	max_tuples = 0;
+
+	/* walk through blocks, find highest density */
+	for (i = 0; i < bs->nblocks; i++)
+		max_tuples = Max(bs->blocks[i].ntuples, max_tuples);
+
+	/*
+	 * Correct the sample - for each block check whether the density
+	 * is lower than the maximum density, and discard some of the
+	 * tuples using the (density/maxdensity) probability.
+	 */
+	for (i = 0; i < bs->nblocks; i++)
+	{
+		double discard_ratio = (bs->blocks[i].ntuples / (double)max_tuples);
+
+		/* not really needed (no tuples will get evicted) */
+		if (bs->blocks[i].ntuples == max_tuples)
+			continue;
+
+		/*
+		 * We know both the blocks and tuples are sorted in the
+		 * same way (by tid), so we'll use that to efficiently
+		 * match those two arrays.
+		 */
+		for (j = starttuple; j < samplerows; j++)
+		{
+			int tupleblock = ItemPointerGetBlockNumber(&rows[j]->t_self);
+
+			/* first tuple for the next block, so terminate */
+			if (tupleblock > bs->blocks[i].blockno)
+				break;
+
+			/* remember the next starting position */
+			starttuple++;
+
+			/*
+			 * If it's really a tuple for the current block, do the
+			 * correction, i.e. discard part of the tuples randomly.
+			 * We'll just put NULL there and compact the array in
+			 * one go later (although it might probably be done here).
+			 */
+			if (tupleblock == bs->blocks[i].blockno)
+			{
+				if (pg_erand48(bs->randstate) > discard_ratio)
+				{
+					heap_freetuple(rows[j]);
+					rows[j] = NULL;
+				}
+			}
+		}
+	}
+
+	/* compact the array of sampled rows (remove the NULLs) */
+	j = 0;
+	for (i = 0; i < samplerows; i++)
+		if (rows[i] != NULL)
+			rows[j++] = rows[i];
+
+	return j;
+}
diff --git a/src/backend/utils/misc/sampling.c b/src/backend/utils/misc/sampling.c
index aaf1d6c..836250a 100644
--- a/src/backend/utils/misc/sampling.c
+++ b/src/backend/utils/misc/sampling.c
@@ -19,6 +19,7 @@
 
 #include "utils/sampling.h"
 
+static int compare_sampled_blocks(const void *a, const void *b);
 
 /*
  * BlockSampler_Init -- prepare for random sampling of blocknumbers
@@ -37,6 +38,8 @@ void
 BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
 				  long randseed)
 {
+	int i;
+
 	bs->N = nblocks;			/* measured table size */
 
 	/*
@@ -44,71 +47,56 @@ BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
 	 * more than samplesize blocks, here is the place to do it.
 	 */
 	bs->n = samplesize;
-	bs->t = 0;					/* blocks scanned so far */
-	bs->m = 0;					/* blocks selected so far */
 
 	sampler_random_init_state(randseed, bs->randstate);
+
+	/* generate random blocks */
+	bs->blocks = NULL;
+	bs->nblocks = 0;
+
+	if (bs->N > 0)
+	{
+		bs->blocks = (SampledBlock)palloc0(sizeof(SampledBlockData) * bs->n);
+
+		for (i = 0; i < bs->n; i++)
+		{
+			bs->blocks[i].blockno = pg_lrand48() % bs->N;
+			bs->blocks[i].nsampled = 0;
+			bs->blocks[i].ntarget  = 1;
+		}
+
+		qsort((void *) bs->blocks, bs->n, sizeof(SampledBlockData),
+			  compare_sampled_blocks);
+
+		bs->nblocks = 1;
+		for (i = 1; i < bs->n; i++)
+		{
+			/* existing block, so just increment number of target tuples */
+			if (bs->blocks[bs->nblocks-1].blockno == bs->blocks[i].blockno)
+			{
+				bs->blocks[bs->nblocks-1].ntarget += 1;
+				continue;
+			}
+
+			/* otherwise we have a new block number, so move it */
+			bs->nblocks++;
+			bs->blocks[bs->nblocks-1] = bs->blocks[i];
+		}
+	}
+
+	bs->cblock = 0;
 }
 
 bool
 BlockSampler_HasMore(BlockSampler bs)
 {
-	return (bs->t < bs->N) && (bs->m < bs->n);
+	return (bs->cblock < bs->nblocks);
 }
 
-BlockNumber
+SampledBlock
 BlockSampler_Next(BlockSampler bs)
 {
-	BlockNumber K = bs->N - bs->t;		/* remaining blocks */
-	int			k = bs->n - bs->m;		/* blocks still to sample */
-	double		p;				/* probability to skip block */
-	double		V;				/* random */
-
-	Assert(BlockSampler_HasMore(bs));	/* hence K > 0 and k > 0 */
-
-	if ((BlockNumber) k >= K)
-	{
-		/* need all the rest */
-		bs->m++;
-		return bs->t++;
-	}
-
-	/*----------
-	 * It is not obvious that this code matches Knuth's Algorithm S.
-	 * Knuth says to skip the current block with probability 1 - k/K.
-	 * If we are to skip, we should advance t (hence decrease K), and
-	 * repeat the same probabilistic test for the next block.  The naive
-	 * implementation thus requires a sampler_random_fract() call for each
-	 * block number.  But we can reduce this to one sampler_random_fract()
-	 * call per selected block, by noting that each time the while-test
-	 * succeeds, we can reinterpret V as a uniform random number in the range
-	 * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
-	 * the appropriate fraction of its former value, and our next loop
-	 * makes the appropriate probabilistic test.
-	 *
-	 * We have initially K > k > 0.  If the loop reduces K to equal k,
-	 * the next while-test must fail since p will become exactly zero
-	 * (we assume there will not be roundoff error in the division).
-	 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
-	 * to be doubly sure about roundoff error.)  Therefore K cannot become
-	 * less than k, which means that we cannot fail to select enough blocks.
-	 *----------
-	 */
-	V = sampler_random_fract(bs->randstate);
-	p = 1.0 - (double) k / (double) K;
-	while (V < p)
-	{
-		/* skip */
-		bs->t++;
-		K--;					/* keep K == N - t */
-
-		/* adjust p to be new cutoff point in reduced range */
-		p *= 1.0 - (double) k / (double) K;
-	}
-
-	/* select */
-	bs->m++;
-	return bs->t++;
+	return &bs->blocks[bs->cblock++];
 }
 
 /*
@@ -283,3 +271,18 @@ anl_get_next_S(double t, int n, double *stateptr)
 	*stateptr = oldrs.W;
 	return result;
 }
+
+static int
+compare_sampled_blocks(const void *a, const void *b)
+{
+	SampledBlock sa = (SampledBlock)a;
+	SampledBlock sb = (SampledBlock)b;
+
+	if (sa->blockno < sb->blockno)
+		return -1;
+
+	if (sa->blockno > sb->blockno)
+		return 1;
+
+	return 0;
+}
diff --git a/src/include/utils/sampling.h b/src/include/utils/sampling.h
index 1653ed0..9c03190 100644
--- a/src/include/utils/sampling.h
+++ b/src/include/utils/sampling.h
@@ -24,15 +24,29 @@ extern void sampler_random_init_state(long seed,
 extern double sampler_random_fract(SamplerRandomState randstate);
 
 /* Block sampling methods */
+typedef struct
+{
+	BlockNumber	blockno;		/* number of sampled block */
+	int			ntarget;		/* tuples to sample (in this round) */
+	int			nsampled;		/* number of sampled tuples */
+	int			ntuples;		/* number of tuples on block */
+} SampledBlockData;
+
+typedef SampledBlockData* SampledBlock;
 
 /* Data structure for Algorithm S from Knuth 3.4.2 */
 typedef struct
 {
-	BlockNumber N;				/* number of blocks, known in advance */
-	int			n;				/* desired sample size */
-	BlockNumber t;				/* current block number */
-	int			m;				/* blocks selected so far */
-	SamplerRandomState randstate;		/* random generator state */
+	BlockNumber N;					/* number of blocks, known in advance */
+	int			n;					/* desired sample size (tuples) */
+
+	/* block sample */
+	SampledBlock	blocks;		/* preallocated to have 'n' blocks */
+	int				nblocks;	/* number of valid blocks */
+	int				nsampled;	/* sampled tuples */
+	int				cblock;		/* current block (index into blocks) */
+
+	SamplerRandomState randstate;	/* random generator state */
 } BlockSamplerData;
 
 typedef BlockSamplerData *BlockSampler;
@@ -40,14 +54,13 @@ typedef BlockSamplerData *BlockSampler;
 extern void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
 				  int samplesize, long randseed);
 extern bool BlockSampler_HasMore(BlockSampler bs);
-extern BlockNumber BlockSampler_Next(BlockSampler bs);
+extern SampledBlock BlockSampler_Next(BlockSampler bs);
 
 /* Reservoir sampling methods */
-
 typedef struct
 {
 	double		W;
 	SamplerRandomState randstate;		/* random generator state */
 } ReservoirStateData;
 
 typedef ReservoirStateData *ReservoirState;
