[PoC] pgstattuple2: block sampling to reduce physical read
Hi,
I've been working on new pgstattuple function to allow
block sampling [1] in order to reduce block reads while
scanning a table. A PoC patch is attached.
[1]: Re: [RFC] pgstattuple/pgstatindex enhancement
/messages/by-id/CA+TgmoaxJhGZ2c4AYfbr9muUVNhGWU4co-cthqpZRwwDtamvhw@mail.gmail.com
This new function, pgstattuple2(), samples only 3,000 blocks
(which accounts for 24MB) from the table randomly, and estimates
several parameters of the entire table.
The function calculates the averages of the samples, estimates
the parameters (averages and SDs), and shows "standard errors
(SE)" to allow estimating status of the table with statistical
approach.
And, of course, it reduces number of physical block reads
while scanning a bigger table.
The following example shows that new pgstattuple2 function
runs x100 faster than the original pgstattuple function with
well-estimated results.
----------------------------------------------
postgres=# select * from pgstattuple('pgbench_accounts');
-[ RECORD 1 ]------+-----------
table_len | 1402642432
tuple_count | 10000000
tuple_len | 1210000000
tuple_percent | 86.27
dead_tuple_count | 182895
dead_tuple_len | 22130295
dead_tuple_percent | 1.58
free_space | 21012328
free_percent | 1.5
Time: 1615.651 ms
postgres=# select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 2376.47, tuple_len 287552.58,
dead_tuple_count 497.63, dead_tuple_len 60213.08, free_space 289752.38
-[ RECORD 1 ]------+-----------
table_len | 1402642432
tuple_count | 9978074
tuple_len | 1207347074
tuple_percent | 86.08
dead_tuple_count | 187315
dead_tuple_len | 22665208
dead_tuple_percent | 1.62
free_space | 23400431
free_percent | 1.67
Time: 15.026 ms
postgres=#
----------------------------------------------
In addition to that, see attached chart to know how pgstattuple2
estimates well during repeating (long-running) pgbench.
I understand that pgbench would generate "random" transactions,
and those update operations might not have any skew over the table,
so estimating table status seems to be easy in this test.
However, I'm still curious to know whether it would work in
"real-world" workload.
Is it worth having this? Any comment or suggestion?
Regards,
--
Satoshi Nagayasu <snaga@uptime.jp>
Uptime Technologies, LLC. http://www.uptime.jp
Attachments:
pgstattuple2_v1.difftext/plain; charset=Shift_JIS; name=pgstattuple2_v1.diffDownload
diff --git a/contrib/pgstattuple/pgstattuple--1.1--1.2.sql b/contrib/pgstattuple/pgstattuple--1.1--1.2.sql
index 2783a63..8ebec6f 100644
--- a/contrib/pgstattuple/pgstattuple--1.1--1.2.sql
+++ b/contrib/pgstattuple/pgstattuple--1.1--1.2.sql
@@ -37,3 +37,17 @@ CREATE FUNCTION pg_relpages(IN relname regclass)
RETURNS BIGINT
AS 'MODULE_PATHNAME', 'pg_relpagesbyid'
LANGUAGE C STRICT;
+
+CREATE FUNCTION pgstattuple2(IN relname regclass,
+ OUT table_len BIGINT, -- physical table length in bytes
+ OUT tuple_count BIGINT, -- number of live tuples
+ OUT tuple_len BIGINT, -- total tuples length in bytes
+ OUT tuple_percent FLOAT8, -- live tuples in %
+ OUT dead_tuple_count BIGINT, -- number of dead tuples
+ OUT dead_tuple_len BIGINT, -- total dead tuples length in bytes
+ OUT dead_tuple_percent FLOAT8, -- dead tuples in %
+ OUT free_space BIGINT, -- free space in bytes
+ OUT free_percent FLOAT8) -- free space in %
+AS 'MODULE_PATHNAME', 'pgstattuple2'
+LANGUAGE C STRICT;
+
diff --git a/contrib/pgstattuple/pgstattuple--1.2.sql b/contrib/pgstattuple/pgstattuple--1.2.sql
index e5fa2f5..963eb00 100644
--- a/contrib/pgstattuple/pgstattuple--1.2.sql
+++ b/contrib/pgstattuple/pgstattuple--1.2.sql
@@ -77,3 +77,17 @@ CREATE FUNCTION pg_relpages(IN relname regclass)
RETURNS BIGINT
AS 'MODULE_PATHNAME', 'pg_relpagesbyid'
LANGUAGE C STRICT;
+
+CREATE FUNCTION pgstattuple2(IN relname regclass,
+ OUT table_len BIGINT, -- physical table length in bytes
+ OUT tuple_count BIGINT, -- number of live tuples
+ OUT tuple_len BIGINT, -- total tuples length in bytes
+ OUT tuple_percent FLOAT8, -- live tuples in %
+ OUT dead_tuple_count BIGINT, -- number of dead tuples
+ OUT dead_tuple_len BIGINT, -- total dead tuples length in bytes
+ OUT dead_tuple_percent FLOAT8, -- dead tuples in %
+ OUT free_space BIGINT, -- free space in bytes
+ OUT free_percent FLOAT8) -- free space in %
+AS 'MODULE_PATHNAME', 'pgstattuple2'
+LANGUAGE C STRICT;
+
diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c
index 7f41ec3..c2adb75 100644
--- a/contrib/pgstattuple/pgstattuple.c
+++ b/contrib/pgstattuple/pgstattuple.c
@@ -41,9 +41,22 @@ PG_MODULE_MAGIC;
PG_FUNCTION_INFO_V1(pgstattuple);
PG_FUNCTION_INFO_V1(pgstattuplebyid);
+PG_FUNCTION_INFO_V1(pgstattuple2);
extern Datum pgstattuple(PG_FUNCTION_ARGS);
extern Datum pgstattuplebyid(PG_FUNCTION_ARGS);
+extern Datum pgstattuple2(PG_FUNCTION_ARGS);
+
+#define SAMPLE_SIZE 3000
+
+typedef struct pgstattuple_block_stats
+{
+ uint64 tuple_count;
+ uint64 tuple_len;
+ uint64 dead_tuple_count;
+ uint64 dead_tuple_len;
+ uint64 free_space; /* free/reusable space in bytes */
+} pgstattuple_block_stats;
/*
* struct pgstattuple_type
@@ -66,8 +79,9 @@ typedef void (*pgstat_page) (pgstattuple_type *, Relation, BlockNumber,
static Datum build_pgstattuple_type(pgstattuple_type *stat,
FunctionCallInfo fcinfo);
-static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo);
+static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo, bool enable_sample);
static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo);
+static Datum pgstat_heap_sample(Relation rel, FunctionCallInfo fcinfo);
static void pgstat_btree_page(pgstattuple_type *stat,
Relation rel, BlockNumber blkno,
BufferAccessStrategy bstrategy);
@@ -81,6 +95,11 @@ static Datum pgstat_index(Relation rel, BlockNumber start,
pgstat_page pagefn, FunctionCallInfo fcinfo);
static void pgstat_index_page(pgstattuple_type *stat, Page page,
OffsetNumber minoff, OffsetNumber maxoff);
+static void compute_parameters(pgstattuple_block_stats *block_stats,
+ BlockNumber sample_size, BlockNumber nblocks,
+ uint64 *tuple_count, uint64 *tuple_len,
+ uint64 *dead_tuple_count, uint64 *dead_tuple_len,
+ uint64 *free_space);
/*
* build_pgstattuple_type -- build a pgstattuple_type tuple
@@ -175,7 +194,7 @@ pgstattuple(PG_FUNCTION_ARGS)
relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
rel = relation_openrv(relrv, AccessShareLock);
- PG_RETURN_DATUM(pgstat_relation(rel, fcinfo));
+ PG_RETURN_DATUM(pgstat_relation(rel, fcinfo, false));
}
Datum
@@ -192,14 +211,14 @@ pgstattuplebyid(PG_FUNCTION_ARGS)
/* open relation */
rel = relation_open(relid, AccessShareLock);
- PG_RETURN_DATUM(pgstat_relation(rel, fcinfo));
+ PG_RETURN_DATUM(pgstat_relation(rel, fcinfo, false));
}
/*
* pgstat_relation
*/
static Datum
-pgstat_relation(Relation rel, FunctionCallInfo fcinfo)
+pgstat_relation(Relation rel, FunctionCallInfo fcinfo, bool enable_sample)
{
const char *err;
@@ -219,6 +238,9 @@ pgstat_relation(Relation rel, FunctionCallInfo fcinfo)
case RELKIND_MATVIEW:
case RELKIND_TOASTVALUE:
case RELKIND_SEQUENCE:
+ if ( enable_sample )
+ return pgstat_heap_sample(rel, fcinfo);
+
return pgstat_heap(rel, fcinfo);
case RELKIND_INDEX:
switch (rel->rd_rel->relam)
@@ -535,3 +557,225 @@ pgstat_index_page(pgstattuple_type *stat, Page page,
}
}
}
+
+static void
+compute_parameters(pgstattuple_block_stats *block_stats, BlockNumber sample_size, BlockNumber nblocks,
+ uint64 *tuple_count, uint64 *tuple_len,
+ uint64 *dead_tuple_count, uint64 *dead_tuple_len,
+ uint64 *free_space)
+{
+ double tuple_count_avg;
+ double tuple_len_avg;
+ double dead_tuple_count_avg;
+ double dead_tuple_len_avg;
+ double free_space_avg;
+
+ double tuple_count_sd;
+ double tuple_len_sd;
+ double dead_tuple_count_sd;
+ double dead_tuple_len_sd;
+ double free_space_sd;
+
+ double tuple_count_se;
+ double tuple_len_se;
+ double dead_tuple_count_se;
+ double dead_tuple_len_se;
+ double free_space_se;
+
+ int i;
+
+ /*
+ * sample average
+ */
+ tuple_count_avg = 0;
+ tuple_len_avg = 0;
+ dead_tuple_count_avg = 0;
+ dead_tuple_len_avg = 0;
+ free_space_avg = 0;
+
+ for (i=0 ; i<sample_size ; i++)
+ {
+ tuple_count_avg += block_stats[i].tuple_count;
+ tuple_len_avg += block_stats[i].tuple_len;
+ dead_tuple_count_avg += block_stats[i].dead_tuple_count;
+ dead_tuple_len_avg += block_stats[i].dead_tuple_len;
+ free_space_avg += block_stats[i].free_space;
+ }
+
+ tuple_count_avg = tuple_count_avg / sample_size;
+ tuple_len_avg = tuple_len_avg / sample_size;
+ dead_tuple_count_avg = dead_tuple_count_avg / sample_size;
+ dead_tuple_len_avg = dead_tuple_len_avg / sample_size;
+ free_space_avg = free_space_avg / sample_size;
+
+#ifdef NOT_USED
+ elog(NOTICE, "pgstattuple2: AVG tuple_count %.2f, tuple_len %.2f, dead_tuple_count %.2f, dead_tuple_len %.2f, free_space %.2f",
+ tuple_count_avg,
+ tuple_len_avg,
+ dead_tuple_count_avg,
+ dead_tuple_len_avg,
+ free_space_avg);
+#endif
+
+ /*
+ * sample standard deviation
+ */
+ tuple_count_sd = 0;
+ tuple_len_sd = 0;
+ dead_tuple_count_sd = 0;
+ dead_tuple_len_sd = 0;
+ free_space_sd = 0;
+
+ for (i=0 ; i<sample_size ; i++)
+ {
+ tuple_count_sd += pow(block_stats[i].tuple_count - tuple_count_avg, 2);
+ tuple_len_sd += pow(block_stats[i].tuple_len - tuple_len_avg, 2);
+ dead_tuple_count_sd += pow(block_stats[i].dead_tuple_count - dead_tuple_count_avg, 2);
+ dead_tuple_len_sd += pow(block_stats[i].dead_tuple_len - dead_tuple_len_avg, 2);
+ free_space_sd += pow(block_stats[i].free_space - free_space_avg, 2);
+ }
+
+ tuple_count_sd = sqrt(tuple_count_sd / (sample_size - 1));
+ tuple_len_sd = sqrt(tuple_len_sd / (sample_size - 1));
+ dead_tuple_count_sd = sqrt(dead_tuple_count_sd / (sample_size - 1));
+ dead_tuple_len_sd = sqrt(dead_tuple_len_sd / (sample_size - 1));
+ free_space_sd = sqrt(free_space_sd / (sample_size - 1));
+
+#ifdef NOT_USED
+ elog(NOTICE, "pgstattuple2: SD tuple_count %.2f, tuple_len %.2f, dead_tuple_count %.2f, dead_tuple_len %.2f, free_space %.2f",
+ tuple_count_sd,
+ tuple_len_sd,
+ dead_tuple_count_sd,
+ dead_tuple_len_sd,
+ free_space_sd);
+#endif
+
+ /*
+ * standard error
+ */
+ {
+ double tmp = ((double)nblocks - (double)sample_size) / (double)nblocks;
+
+ tuple_count_se = tmp * tuple_count_sd / sqrt(nblocks);
+ tuple_len_se = tmp * tuple_len_sd / sqrt(nblocks);
+ dead_tuple_count_se = tmp * dead_tuple_count_sd / sqrt(nblocks);
+ dead_tuple_len_se = tmp * dead_tuple_len_sd / sqrt(nblocks);
+ free_space_se = tmp * free_space_sd / sqrt(nblocks);
+ }
+
+ elog(NOTICE, "pgstattuple2: SE tuple_count %.2f, tuple_len %.2f, dead_tuple_count %.2f, dead_tuple_len %.2f, free_space %.2f",
+ tuple_count_se * nblocks,
+ tuple_len_se * nblocks,
+ dead_tuple_count_se * nblocks,
+ dead_tuple_len_se * nblocks,
+ free_space_se * nblocks);
+
+ *tuple_count = tuple_count_avg * nblocks;
+ *tuple_len = tuple_len_avg * nblocks;
+ *dead_tuple_count = dead_tuple_count_avg * nblocks;
+ *dead_tuple_len = dead_tuple_len_avg * nblocks;
+ *free_space = free_space_avg * nblocks;
+}
+
+/*
+ * pgstat_heap_sample -- returns live/dead tuples info in a heap
+ */
+static Datum
+pgstat_heap_sample(Relation rel, FunctionCallInfo fcinfo)
+{
+ BlockNumber nblocks;
+ BlockNumber block;
+ pgstattuple_type stat = {0};
+ pgstattuple_block_stats block_stats[SAMPLE_SIZE];
+ int i;
+
+ nblocks = RelationGetNumberOfBlocks(rel);
+
+ for (i=0 ; i<SAMPLE_SIZE ; i++)
+ {
+ Buffer buffer;
+ Page page;
+ OffsetNumber pageoff;
+
+ /*
+ * sample random blocks
+ */
+ block = (double)random() / RAND_MAX * nblocks;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Read and lock buffer */
+ buffer = ReadBuffer(rel, block);
+ LockBuffer(buffer, BUFFER_LOCK_SHARE);
+ page = BufferGetPage(buffer);
+
+ /* get block stats of the sample block */
+ block_stats[i].tuple_count = 0;
+ block_stats[i].tuple_len = 0;
+ block_stats[i].dead_tuple_count = 0;
+ block_stats[i].dead_tuple_len = 0;
+ block_stats[i].free_space = 0;
+
+ block_stats[i].free_space = PageGetFreeSpace(page);
+
+ for (pageoff = FirstOffsetNumber; pageoff <= PageGetMaxOffsetNumber(page); pageoff = OffsetNumberNext(pageoff))
+ {
+ ItemId lp = PageGetItemId(page, pageoff);
+ HeapTupleData tup;
+
+ memset(&tup, 0, sizeof(HeapTupleData));
+
+ tup.t_data = (HeapTupleHeader)PageGetItem(page, lp);
+ tup.t_len = ItemIdGetLength(lp);
+ tup.t_tableOid = RelationGetRelid(rel);
+ ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), pageoff);
+
+ if (ItemIdIsNormal(lp))
+ {
+ if (HeapTupleSatisfiesVisibility(&tup, SnapshotNow, buffer))
+ {
+ block_stats[i].tuple_count++;
+ block_stats[i].tuple_len += tup.t_len;
+ }
+ else
+ {
+ block_stats[i].dead_tuple_count++;
+ block_stats[i].dead_tuple_len += tup.t_len;
+ }
+ }
+ }
+
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ ReleaseBuffer(buffer);
+ }
+
+ relation_close(rel, AccessShareLock);
+
+ compute_parameters(block_stats, SAMPLE_SIZE, nblocks,
+ &stat.tuple_count,
+ &stat.tuple_len,
+ &stat.dead_tuple_count,
+ &stat.dead_tuple_len,
+ &stat.free_space);
+
+ stat.table_len = (uint64) nblocks *BLCKSZ;
+
+ return build_pgstattuple_type(&stat, fcinfo);
+}
+
+Datum
+pgstattuple2(PG_FUNCTION_ARGS)
+{
+ Oid relid = PG_GETARG_OID(0);
+ Relation rel;
+
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ (errmsg("must be superuser to use pgstattuple functions"))));
+
+ /* open relation */
+ rel = relation_open(relid, AccessShareLock);
+
+ PG_RETURN_DATUM(pgstat_relation(rel, fcinfo, true));
+}
On 7/23/13 2:16 AM, Satoshi Nagayasu wrote:
I've been working on new pgstattuple function to allow
block sampling [1] in order to reduce block reads while
scanning a table. A PoC patch is attached.
Take a look at all of the messages linked in
https://commitfest.postgresql.org/action/patch_view?id=778
Jaime and I tried to do what you're working on then, including a random
block sampling mechanism modeled on the stats_target mechanism. We
didn't do that as part of pgstattuple though, which was a mistake.
Noah created some test cases as part of his thorough review that were
not computing the correct results. Getting the results correct for all
of the various types of PostgreSQL tables and indexes ended up being
much harder than the sampling part. See
/messages/by-id/20120222052747.GE8592@tornado.leadboat.com
in particular for that.
This new function, pgstattuple2(), samples only 3,000 blocks
(which accounts 24MB) from the table randomly, and estimates
several parameters of the entire table.
There should be an input parameter to the function for how much sampling
to do, and if it's possible to make the scale for it to look like
ANALYZE that's helpful too.
I have a project for this summer that includes reviving this topic and
making sure it works on some real-world systems. If you want to work on
this too, I can easily combine that project into what you're doing.
--
Greg Smith 2ndQuadrant US greg@2ndQuadrant.com Baltimore, MD
PostgreSQL Training, Services, and 24x7 Support www.2ndQuadrant.com
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
(2013/07/23 20:02), Greg Smith wrote:
On 7/23/13 2:16 AM, Satoshi Nagayasu wrote:
I've been working on new pgstattuple function to allow
block sampling [1] in order to reduce block reads while
scanning a table. A PoC patch is attached.Take a look at all of the messages linked in
https://commitfest.postgresql.org/action/patch_view?id=778Jaime and I tried to do what you're working on then, including a random
block sampling mechanism modeled on the stats_target mechanism. We
didn't do that as part of pgstattuple though, which was a mistake.Noah created some test cases as part of his thorough review that were
not computing the correct results. Getting the results correct for all
of the various types of PostgreSQL tables and indexes ended up being
much harder than the sampling part. See
/messages/by-id/20120222052747.GE8592@tornado.leadboat.com
in particular for that.
Thanks for the info. I have read the previous discussion.
I'm looking forward to seeing more feedback on this approach,
in terms of design and performance improvement.
So, I have submitted this for the next CF.
This new function, pgstattuple2(), samples only 3,000 blocks
(which accounts 24MB) from the table randomly, and estimates
several parameters of the entire table.There should be an input parameter to the function for how much sampling
to do, and if it's possible to make the scale for it to look like
ANALYZE that's helpful too.I have a project for this summer that includes reviving this topic and
making sure it works on some real-world systems. If you want to work on
this too, I can easily combine that project into what you're doing.
Yeah, I'm interested in that. Something can be shared?
Regards,
--
Satoshi Nagayasu <snaga@uptime.jp>
Uptime Technologies, LLC. http://www.uptime.jp
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Sat, 2013-09-14 at 16:18 +0900, Satoshi Nagayasu wrote:
I'm looking forward to seeing more feedback on this approach,
in terms of design and performance improvement.
So, I have submitted this for the next CF.
Your patch fails to build:
pgstattuple.c: In function ‘pgstat_heap_sample’:
pgstattuple.c:737:13: error: ‘SnapshotNow’ undeclared (first use in this function)
pgstattuple.c:737:13: note: each undeclared identifier is reported only once for each function it appears in
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
(2013/09/15 11:07), Peter Eisentraut wrote:
On Sat, 2013-09-14 at 16:18 +0900, Satoshi Nagayasu wrote:
I'm looking forward to seeing more feedback on this approach,
in terms of design and performance improvement.
So, I have submitted this for the next CF.Your patch fails to build:
pgstattuple.c: In function ‘pgstat_heap_sample’:
pgstattuple.c:737:13: error: ‘SnapshotNow’ undeclared (first use in this function)
pgstattuple.c:737:13: note: each undeclared identifier is reported only once for each function it appears in
Thanks for checking. Fixed to eliminate SnapshotNow.
Regards,
--
Satoshi Nagayasu <snaga@uptime.jp>
Uptime Technologies, LLC. http://www.uptime.jp
Attachments:
pgstattuple2_v2.difftext/plain; charset=Shift_JIS; name=pgstattuple2_v2.diffDownload
diff --git a/contrib/pgstattuple/pgstattuple--1.1--1.2.sql b/contrib/pgstattuple/pgstattuple--1.1--1.2.sql
index 2783a63..8ebec6f 100644
--- a/contrib/pgstattuple/pgstattuple--1.1--1.2.sql
+++ b/contrib/pgstattuple/pgstattuple--1.1--1.2.sql
@@ -37,3 +37,17 @@ CREATE FUNCTION pg_relpages(IN relname regclass)
RETURNS BIGINT
AS 'MODULE_PATHNAME', 'pg_relpagesbyid'
LANGUAGE C STRICT;
+
+CREATE FUNCTION pgstattuple2(IN relname regclass,
+ OUT table_len BIGINT, -- physical table length in bytes
+ OUT tuple_count BIGINT, -- number of live tuples
+ OUT tuple_len BIGINT, -- total tuples length in bytes
+ OUT tuple_percent FLOAT8, -- live tuples in %
+ OUT dead_tuple_count BIGINT, -- number of dead tuples
+ OUT dead_tuple_len BIGINT, -- total dead tuples length in bytes
+ OUT dead_tuple_percent FLOAT8, -- dead tuples in %
+ OUT free_space BIGINT, -- free space in bytes
+ OUT free_percent FLOAT8) -- free space in %
+AS 'MODULE_PATHNAME', 'pgstattuple2'
+LANGUAGE C STRICT;
+
diff --git a/contrib/pgstattuple/pgstattuple--1.2.sql b/contrib/pgstattuple/pgstattuple--1.2.sql
index e5fa2f5..963eb00 100644
--- a/contrib/pgstattuple/pgstattuple--1.2.sql
+++ b/contrib/pgstattuple/pgstattuple--1.2.sql
@@ -77,3 +77,17 @@ CREATE FUNCTION pg_relpages(IN relname regclass)
RETURNS BIGINT
AS 'MODULE_PATHNAME', 'pg_relpagesbyid'
LANGUAGE C STRICT;
+
+CREATE FUNCTION pgstattuple2(IN relname regclass,
+ OUT table_len BIGINT, -- physical table length in bytes
+ OUT tuple_count BIGINT, -- number of live tuples
+ OUT tuple_len BIGINT, -- total tuples length in bytes
+ OUT tuple_percent FLOAT8, -- live tuples in %
+ OUT dead_tuple_count BIGINT, -- number of dead tuples
+ OUT dead_tuple_len BIGINT, -- total dead tuples length in bytes
+ OUT dead_tuple_percent FLOAT8, -- dead tuples in %
+ OUT free_space BIGINT, -- free space in bytes
+ OUT free_percent FLOAT8) -- free space in %
+AS 'MODULE_PATHNAME', 'pgstattuple2'
+LANGUAGE C STRICT;
+
diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c
index f9ba0a6..2b29d44 100644
--- a/contrib/pgstattuple/pgstattuple.c
+++ b/contrib/pgstattuple/pgstattuple.c
@@ -41,9 +41,22 @@ PG_MODULE_MAGIC;
PG_FUNCTION_INFO_V1(pgstattuple);
PG_FUNCTION_INFO_V1(pgstattuplebyid);
+PG_FUNCTION_INFO_V1(pgstattuple2);
extern Datum pgstattuple(PG_FUNCTION_ARGS);
extern Datum pgstattuplebyid(PG_FUNCTION_ARGS);
+extern Datum pgstattuple2(PG_FUNCTION_ARGS);
+
+#define SAMPLE_SIZE 3000
+
+typedef struct pgstattuple_block_stats
+{
+ uint64 tuple_count;
+ uint64 tuple_len;
+ uint64 dead_tuple_count;
+ uint64 dead_tuple_len;
+ uint64 free_space; /* free/reusable space in bytes */
+} pgstattuple_block_stats;
/*
* struct pgstattuple_type
@@ -66,8 +79,9 @@ typedef void (*pgstat_page) (pgstattuple_type *, Relation, BlockNumber,
static Datum build_pgstattuple_type(pgstattuple_type *stat,
FunctionCallInfo fcinfo);
-static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo);
+static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo, bool enable_sample);
static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo);
+static Datum pgstat_heap_sample(Relation rel, FunctionCallInfo fcinfo);
static void pgstat_btree_page(pgstattuple_type *stat,
Relation rel, BlockNumber blkno,
BufferAccessStrategy bstrategy);
@@ -81,6 +95,11 @@ static Datum pgstat_index(Relation rel, BlockNumber start,
pgstat_page pagefn, FunctionCallInfo fcinfo);
static void pgstat_index_page(pgstattuple_type *stat, Page page,
OffsetNumber minoff, OffsetNumber maxoff);
+static void compute_parameters(pgstattuple_block_stats *block_stats,
+ BlockNumber sample_size, BlockNumber nblocks,
+ uint64 *tuple_count, uint64 *tuple_len,
+ uint64 *dead_tuple_count, uint64 *dead_tuple_len,
+ uint64 *free_space);
/*
* build_pgstattuple_type -- build a pgstattuple_type tuple
@@ -175,7 +194,7 @@ pgstattuple(PG_FUNCTION_ARGS)
relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
rel = relation_openrv(relrv, AccessShareLock);
- PG_RETURN_DATUM(pgstat_relation(rel, fcinfo));
+ PG_RETURN_DATUM(pgstat_relation(rel, fcinfo, false));
}
Datum
@@ -192,14 +211,14 @@ pgstattuplebyid(PG_FUNCTION_ARGS)
/* open relation */
rel = relation_open(relid, AccessShareLock);
- PG_RETURN_DATUM(pgstat_relation(rel, fcinfo));
+ PG_RETURN_DATUM(pgstat_relation(rel, fcinfo, false));
}
/*
* pgstat_relation
*/
static Datum
-pgstat_relation(Relation rel, FunctionCallInfo fcinfo)
+pgstat_relation(Relation rel, FunctionCallInfo fcinfo, bool enable_sample)
{
const char *err;
@@ -219,6 +238,9 @@ pgstat_relation(Relation rel, FunctionCallInfo fcinfo)
case RELKIND_MATVIEW:
case RELKIND_TOASTVALUE:
case RELKIND_SEQUENCE:
+ if ( enable_sample )
+ return pgstat_heap_sample(rel, fcinfo);
+
return pgstat_heap(rel, fcinfo);
case RELKIND_INDEX:
switch (rel->rd_rel->relam)
@@ -537,3 +559,235 @@ pgstat_index_page(pgstattuple_type *stat, Page page,
}
}
}
+
+static void
+compute_parameters(pgstattuple_block_stats *block_stats, BlockNumber sample_size, BlockNumber nblocks,
+ uint64 *tuple_count, uint64 *tuple_len,
+ uint64 *dead_tuple_count, uint64 *dead_tuple_len,
+ uint64 *free_space)
+{
+ double tuple_count_avg;
+ double tuple_len_avg;
+ double dead_tuple_count_avg;
+ double dead_tuple_len_avg;
+ double free_space_avg;
+
+ double tuple_count_sd;
+ double tuple_len_sd;
+ double dead_tuple_count_sd;
+ double dead_tuple_len_sd;
+ double free_space_sd;
+
+ double tuple_count_se;
+ double tuple_len_se;
+ double dead_tuple_count_se;
+ double dead_tuple_len_se;
+ double free_space_se;
+
+ int i;
+
+ /*
+ * sample average
+ */
+ tuple_count_avg = 0;
+ tuple_len_avg = 0;
+ dead_tuple_count_avg = 0;
+ dead_tuple_len_avg = 0;
+ free_space_avg = 0;
+
+ for (i=0 ; i<sample_size ; i++)
+ {
+ tuple_count_avg += block_stats[i].tuple_count;
+ tuple_len_avg += block_stats[i].tuple_len;
+ dead_tuple_count_avg += block_stats[i].dead_tuple_count;
+ dead_tuple_len_avg += block_stats[i].dead_tuple_len;
+ free_space_avg += block_stats[i].free_space;
+ }
+
+ tuple_count_avg = tuple_count_avg / sample_size;
+ tuple_len_avg = tuple_len_avg / sample_size;
+ dead_tuple_count_avg = dead_tuple_count_avg / sample_size;
+ dead_tuple_len_avg = dead_tuple_len_avg / sample_size;
+ free_space_avg = free_space_avg / sample_size;
+
+#ifdef NOT_USED
+ elog(NOTICE, "pgstattuple2: AVG tuple_count %.2f, tuple_len %.2f, dead_tuple_count %.2f, dead_tuple_len %.2f, free_space %.2f",
+ tuple_count_avg,
+ tuple_len_avg,
+ dead_tuple_count_avg,
+ dead_tuple_len_avg,
+ free_space_avg);
+#endif
+
+ /*
+ * sample standard deviation
+ */
+ tuple_count_sd = 0;
+ tuple_len_sd = 0;
+ dead_tuple_count_sd = 0;
+ dead_tuple_len_sd = 0;
+ free_space_sd = 0;
+
+ for (i=0 ; i<sample_size ; i++)
+ {
+ tuple_count_sd += pow(block_stats[i].tuple_count - tuple_count_avg, 2);
+ tuple_len_sd += pow(block_stats[i].tuple_len - tuple_len_avg, 2);
+ dead_tuple_count_sd += pow(block_stats[i].dead_tuple_count - dead_tuple_count_avg, 2);
+ dead_tuple_len_sd += pow(block_stats[i].dead_tuple_len - dead_tuple_len_avg, 2);
+ free_space_sd += pow(block_stats[i].free_space - free_space_avg, 2);
+ }
+
+ tuple_count_sd = sqrt(tuple_count_sd / (sample_size - 1));
+ tuple_len_sd = sqrt(tuple_len_sd / (sample_size - 1));
+ dead_tuple_count_sd = sqrt(dead_tuple_count_sd / (sample_size - 1));
+ dead_tuple_len_sd = sqrt(dead_tuple_len_sd / (sample_size - 1));
+ free_space_sd = sqrt(free_space_sd / (sample_size - 1));
+
+#ifdef NOT_USED
+ elog(NOTICE, "pgstattuple2: SD tuple_count %.2f, tuple_len %.2f, dead_tuple_count %.2f, dead_tuple_len %.2f, free_space %.2f",
+ tuple_count_sd,
+ tuple_len_sd,
+ dead_tuple_count_sd,
+ dead_tuple_len_sd,
+ free_space_sd);
+#endif
+
+ /*
+ * standard error
+ */
+ {
+ double tmp = ((double)nblocks - (double)sample_size) / (double)nblocks;
+
+ tuple_count_se = tmp * tuple_count_sd / sqrt(nblocks);
+ tuple_len_se = tmp * tuple_len_sd / sqrt(nblocks);
+ dead_tuple_count_se = tmp * dead_tuple_count_sd / sqrt(nblocks);
+ dead_tuple_len_se = tmp * dead_tuple_len_sd / sqrt(nblocks);
+ free_space_se = tmp * free_space_sd / sqrt(nblocks);
+ }
+
+ elog(NOTICE, "pgstattuple2: SE tuple_count %.2f, tuple_len %.2f, dead_tuple_count %.2f, dead_tuple_len %.2f, free_space %.2f",
+ tuple_count_se * nblocks,
+ tuple_len_se * nblocks,
+ dead_tuple_count_se * nblocks,
+ dead_tuple_len_se * nblocks,
+ free_space_se * nblocks);
+
+ *tuple_count = tuple_count_avg * nblocks;
+ *tuple_len = tuple_len_avg * nblocks;
+ *dead_tuple_count = dead_tuple_count_avg * nblocks;
+ *dead_tuple_len = dead_tuple_len_avg * nblocks;
+ *free_space = free_space_avg * nblocks;
+}
+
+/*
+ * pgstat_heap_sample -- returns live/dead tuples info in a heap
+ */
+static Datum
+pgstat_heap_sample(Relation rel, FunctionCallInfo fcinfo)
+{
+ BlockNumber nblocks;
+ BlockNumber block;
+ pgstattuple_type stat = {0};
+ pgstattuple_block_stats block_stats[SAMPLE_SIZE];
+ int i;
+
+ nblocks = RelationGetNumberOfBlocks(rel);
+
+ for (i=0 ; i<SAMPLE_SIZE ; i++)
+ {
+ Buffer buffer;
+ Page page;
+ OffsetNumber pageoff;
+
+ /*
+ * sample random blocks
+ */
+ block = (double)random() / RAND_MAX * nblocks;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Read and lock buffer */
+ buffer = ReadBuffer(rel, block);
+ LockBuffer(buffer, BUFFER_LOCK_SHARE);
+ page = BufferGetPage(buffer);
+
+ /* get block stats of the sample block */
+ block_stats[i].tuple_count = 0;
+ block_stats[i].tuple_len = 0;
+ block_stats[i].dead_tuple_count = 0;
+ block_stats[i].dead_tuple_len = 0;
+ block_stats[i].free_space = 0;
+
+ block_stats[i].free_space = PageGetFreeSpace(page);
+
+ for (pageoff = FirstOffsetNumber; pageoff <= PageGetMaxOffsetNumber(page); pageoff = OffsetNumberNext(pageoff))
+ {
+ ItemId lp = PageGetItemId(page, pageoff);
+ HeapTupleData tup;
+
+ memset(&tup, 0, sizeof(HeapTupleData));
+
+ AssertMacro(PageIsValid(page));
+ if (!ItemIdHasStorage(lp))
+ {
+ /*
+ * if an itemid doesn't have storage,
+ * it means the item was already vacuumed.
+ */
+ continue;
+ }
+
+ tup.t_data = (HeapTupleHeader)PageGetItem(page, lp);
+ tup.t_len = ItemIdGetLength(lp);
+ tup.t_tableOid = RelationGetRelid(rel);
+ ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), pageoff);
+
+ if (ItemIdIsNormal(lp))
+ {
+ if (HeapTupleSatisfiesVisibility(&tup, SnapshotAny, buffer))
+ {
+ block_stats[i].tuple_count++;
+ block_stats[i].tuple_len += tup.t_len;
+ }
+ else
+ {
+ block_stats[i].dead_tuple_count++;
+ block_stats[i].dead_tuple_len += tup.t_len;
+ }
+ }
+ }
+
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ ReleaseBuffer(buffer);
+ }
+
+ relation_close(rel, AccessShareLock);
+
+ compute_parameters(block_stats, SAMPLE_SIZE, nblocks,
+ &stat.tuple_count,
+ &stat.tuple_len,
+ &stat.dead_tuple_count,
+ &stat.dead_tuple_len,
+ &stat.free_space);
+
+ stat.table_len = (uint64) nblocks *BLCKSZ;
+
+ return build_pgstattuple_type(&stat, fcinfo);
+}
+
+Datum
+pgstattuple2(PG_FUNCTION_ARGS)
+{
+ Oid relid = PG_GETARG_OID(0);
+ Relation rel;
+
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ (errmsg("must be superuser to use pgstattuple functions"))));
+
+ /* open relation */
+ rel = relation_open(relid, AccessShareLock);
+
+ PG_RETURN_DATUM(pgstat_relation(rel, fcinfo, true));
+}
On 16/09/13 16:20, Satoshi Nagayasu wrote:
(2013/09/15 11:07), Peter Eisentraut wrote:
On Sat, 2013-09-14 at 16:18 +0900, Satoshi Nagayasu wrote:
I'm looking forward to seeing more feedback on this approach,
in terms of design and performance improvement.
So, I have submitted this for the next CF.Your patch fails to build:
pgstattuple.c: In function 'pgstat_heap_sample':
pgstattuple.c:737:13: error: 'SnapshotNow' undeclared (first use in
this function)
pgstattuple.c:737:13: note: each undeclared identifier is reported
only once for each function it appears in
Thanks for checking. Fixed to eliminate SnapshotNow.
This seems like a cool idea! I took a quick look, and initally
replicated the sort of improvement you saw:
bench=# explain analyze select * from pgstattuple('pgbench_accounts');
QUERY PLAN
--------------------------------------------------------------------------------
Function Scan on pgstattuple (cost=0.00..0.01 rows=1 width=72) (actual
time=786.368..786.369 rows=1 loops=1)
Total runtime: 786.384 ms
(2 rows)
bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN
--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=12.004..12.005 rows=1 loops=1)
Total runtime: 12.019 ms
(2 rows)
I wondered what sort of difference eliminating caching would make:
$ sudo sysctl -w vm.drop_caches=3
Repeating the above queries:
bench=# explain analyze select * from pgstattuple('pgbench_accounts');
QUERY PLAN
--------------------------------------------------------------------------------
Function Scan on pgstattuple (cost=0.00..0.01 rows=1 width=72) (actual
time=9503.774..9503.776 rows=1 loops=1)
Total runtime: 9504.523 ms
(2 rows)
bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN
--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=12330.630..12330.631 rows=1 loops=1)
Total runtime: 12331.353 ms
(2 rows)
So the sampling code seems *slower* when the cache is completely cold -
is that expected? (I have not looked at how the code works yet - I'll
dive in later if I get a chance)!
Regards
Mark
On 11/10/13 11:09, Mark Kirkwood wrote:
On 16/09/13 16:20, Satoshi Nagayasu wrote:
(2013/09/15 11:07), Peter Eisentraut wrote:
On Sat, 2013-09-14 at 16:18 +0900, Satoshi Nagayasu wrote:
I'm looking forward to seeing more feedback on this approach,
in terms of design and performance improvement.
So, I have submitted this for the next CF.Your patch fails to build:
pgstattuple.c: In function 'pgstat_heap_sample':
pgstattuple.c:737:13: error: 'SnapshotNow' undeclared (first use in
this function)
pgstattuple.c:737:13: note: each undeclared identifier is reported
only once for each function it appears in
Thanks for checking. Fixed to eliminate SnapshotNow.
This seems like a cool idea! I took a quick look, and initally
replicated the sort of improvement you saw:bench=# explain analyze select * from pgstattuple('pgbench_accounts');
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple (cost=0.00..0.01 rows=1 width=72) (actual
time=786.368..786.369 rows=1 loops=1)
Total runtime: 786.384 ms
(2 rows)bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=12.004..12.005 rows=1 loops=1)
Total runtime: 12.019 ms
(2 rows)I wondered what sort of difference eliminating caching would make:
$ sudo sysctl -w vm.drop_caches=3
Repeating the above queries:
bench=# explain analyze select * from pgstattuple('pgbench_accounts');
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple (cost=0.00..0.01 rows=1 width=72) (actual
time=9503.774..9503.776 rows=1 loops=1)
Total runtime: 9504.523 ms
(2 rows)bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=12330.630..12330.631 rows=1 loops=1)
Total runtime: 12331.353 ms
(2 rows)So the sampling code seems *slower* when the cache is completely cold -
is that expected? (I have not looked at how the code works yet - I'll
dive in later if I get a chance)!
Quietly replying to myself - looking at the code the sampler does 3000
random page reads... I guess this is slower than 163935 (number of pages
in pgbench_accounts) sequential page reads thanks to os readahead on my
type of disk (WD Velociraptor). Tweaking the number of random reads (i.e
the sample size) down helps - but obviously that can impact estimation
accuracy.
Thinking about this a bit more, I guess the elapsed runtime is not the
*only* thing to consider - the sampling code will cause way less
disruption to the os page cache (3000 pages vs possibly lots more than
3000 for reading an entire relation).
Thoughts?
Cheers
Mark
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
(2013/10/11 7:32), Mark Kirkwood wrote:
On 11/10/13 11:09, Mark Kirkwood wrote:
On 16/09/13 16:20, Satoshi Nagayasu wrote:
(2013/09/15 11:07), Peter Eisentraut wrote:
On Sat, 2013-09-14 at 16:18 +0900, Satoshi Nagayasu wrote:
I'm looking forward to seeing more feedback on this approach,
in terms of design and performance improvement.
So, I have submitted this for the next CF.Your patch fails to build:
pgstattuple.c: In function 'pgstat_heap_sample':
pgstattuple.c:737:13: error: 'SnapshotNow' undeclared (first use in
this function)
pgstattuple.c:737:13: note: each undeclared identifier is reported
only once for each function it appears inThanks for checking. Fixed to eliminate SnapshotNow.
This seems like a cool idea! I took a quick look, and initially
replicated the sort of improvement you saw:bench=# explain analyze select * from pgstattuple('pgbench_accounts');
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple (cost=0.00..0.01 rows=1 width=72) (actual
time=786.368..786.369 rows=1 loops=1)
Total runtime: 786.384 ms
(2 rows)bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=12.004..12.005 rows=1 loops=1)
Total runtime: 12.019 ms
(2 rows)I wondered what sort of difference eliminating caching would make:
$ sudo sysctl -w vm.drop_caches=3
Repeating the above queries:
bench=# explain analyze select * from pgstattuple('pgbench_accounts');
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple (cost=0.00..0.01 rows=1 width=72) (actual
time=9503.774..9503.776 rows=1 loops=1)
Total runtime: 9504.523 ms
(2 rows)bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=12330.630..12330.631 rows=1 loops=1)
Total runtime: 12331.353 ms
(2 rows)So the sampling code seems *slower* when the cache is completely cold -
is that expected? (I have not looked at how the code works yet - I'll
dive in later if I get a chance)!
Thanks for testing that. It would be very helpful to improve the
performance.
Quietly replying to myself - looking at the code the sampler does 3000
random page reads... I guess this is slower than 163935 (number of pages
in pgbench_accounts) sequential page reads thanks to os readahead on my
type of disk (WD Velociraptor). Tweaking the number of random reads (i.e
the sample size) down helps - but obviously that can impact estimation
accuracy.Thinking about this a bit more, I guess the elapsed runtime is not the
*only* thing to consider - the sampling code will cause way less
disruption to the os page cache (3000 pages vs possibly lots more than
3000 for reading an entire relation). Thoughts?
I think it could be improved by sorting sample block numbers
*before* physical block reads in order to eliminate random access
on the disk.
pseudo code:
--------------------------------------
for (i=0 ; i<SAMPLE_SIZE ; i++)
{
sample_block[i] = random();
}
qsort(sample_block);
for (i=0 ; i<SAMPLE_SIZE ; i++)
{
buf = ReadBuffer(rel, sample_block[i]);
do_some_stats_stuff(buf);
}
--------------------------------------
I guess it would be helpful for reducing random access thing.
Any comments?
--
Satoshi Nagayasu <snaga@uptime.jp>
Uptime Technologies, LLC. http://www.uptime.jp
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Thu, Oct 10, 2013 at 5:32 PM, Mark Kirkwood
<mark.kirkwood@catalyst.net.nz> wrote:
Quietly replying to myself - looking at the code the sampler does 3000
random page reads...
FWIW, something that bothers me is that there is 3000 random page
reads... i mean, why 3000? how do you get that number as absolute for
good accuracy in every relation? why not a percentage, maybe an
argument to the function?
also the name pgstattuple2, doesn't convince me... maybe you can use
pgstattuple() if you use a second argument (percentage of the sample)
to overload the function
--
Jaime Casanova www.2ndQuadrant.com
Professional PostgreSQL: Soporte 24x7 y capacitación
Phone: +593 4 5107566 Cell: +593 987171157
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 11/10/13 17:08, Satoshi Nagayasu wrote:
(2013/10/11 7:32), Mark Kirkwood wrote:
On 11/10/13 11:09, Mark Kirkwood wrote:
On 16/09/13 16:20, Satoshi Nagayasu wrote:
(2013/09/15 11:07), Peter Eisentraut wrote:
On Sat, 2013-09-14 at 16:18 +0900, Satoshi Nagayasu wrote:
I'm looking forward to seeing more feedback on this approach,
in terms of design and performance improvement.
So, I have submitted this for the next CF.Your patch fails to build:
pgstattuple.c: In function 'pgstat_heap_sample':
pgstattuple.c:737:13: error: 'SnapshotNow' undeclared (first use in
this function)
pgstattuple.c:737:13: note: each undeclared identifier is reported
only once for each function it appears inThanks for checking. Fixed to eliminate SnapshotNow.
This seems like a cool idea! I took a quick look, and initially
replicated the sort of improvement you saw:bench=# explain analyze select * from pgstattuple('pgbench_accounts');
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple (cost=0.00..0.01 rows=1 width=72) (actual
time=786.368..786.369 rows=1 loops=1)
Total runtime: 786.384 ms
(2 rows)bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=12.004..12.005 rows=1 loops=1)
Total runtime: 12.019 ms
(2 rows)I wondered what sort of difference eliminating caching would make:
$ sudo sysctl -w vm.drop_caches=3
Repeating the above queries:
bench=# explain analyze select * from pgstattuple('pgbench_accounts');
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple (cost=0.00..0.01 rows=1 width=72) (actual
time=9503.774..9503.776 rows=1 loops=1)
Total runtime: 9504.523 ms
(2 rows)bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=12330.630..12330.631 rows=1 loops=1)
Total runtime: 12331.353 ms
(2 rows)So the sampling code seems *slower* when the cache is completely cold -
is that expected? (I have not looked at how the code works yet - I'll
dive in later if I get a chance)!Thanks for testing that. It would be very helpful to improve the
performance.Quietly replying to myself - looking at the code the sampler does 3000
random page reads... I guess this is slower than 163935 (number of pages
in pgbench_accounts) sequential page reads thanks to os readahead on my
type of disk (WD Velociraptor). Tweaking the number of random reads (i.e
the sample size) down helps - but obviously that can impact estimation
accuracy.Thinking about this a bit more, I guess the elapsed runtime is not the
*only* thing to consider - the sampling code will cause way less
disruption to the os page cache (3000 pages vs possibly lots more than
3000 for reading an entire relation). Thoughts?
I think it could be improved by sorting sample block numbers
*before* physical block reads in order to eliminate random access
on the disk.pseudo code:
--------------------------------------
for (i=0 ; i<SAMPLE_SIZE ; i++)
{
sample_block[i] = random();
}qsort(sample_block);
for (i=0 ; i<SAMPLE_SIZE ; i++)
{
buf = ReadBuffer(rel, sample_block[i]);do_some_stats_stuff(buf);
}
--------------------------------------I guess it would be helpful for reducing random access thing.
Any comments?
Ah yes - that's a good idea (rough patch to your patch attached)!
bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN
--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=9968.318..9968.319 rows=1 loops=1)
Total runtime: 9968.443 ms
(2 rows)
It would appear that we are now not any worse than a complete sampling...
Cheers
Mark
Attachments:
pgstattuple.c.difftext/x-patch; name=pgstattuple.c.diffDownload
*** pgstattuple.c.orig 2013-10-11 17:46:00.592666316 +1300
--- pgstattuple.c 2013-10-11 17:46:08.616450284 +1300
***************
*** 100,105 ****
--- 100,123 ----
uint64 *tuple_count, uint64 *tuple_len,
uint64 *dead_tuple_count, uint64 *dead_tuple_len,
uint64 *free_space);
+ static int compare_blocknumbers(const void *a, const void *b);
+
+ /*
+ * compare for qsort
+ */
+ static int
+ compare_blocknumbers(const void *a, const void *b)
+ {
+ BlockNumber ba = *(const BlockNumber *)a;
+ BlockNumber bb = *(const BlockNumber *)b;
+
+ if (ba > bb)
+ return -1;
+ if (ba < bb)
+ return 1;
+ return 0;
+ }
+
/*
* build_pgstattuple_type -- build a pgstattuple_type tuple
***************
*** 686,692 ****
pgstat_heap_sample(Relation rel, FunctionCallInfo fcinfo)
{
BlockNumber nblocks;
! BlockNumber block;
pgstattuple_type stat = {0};
pgstattuple_block_stats block_stats[SAMPLE_SIZE];
int i;
--- 704,710 ----
pgstat_heap_sample(Relation rel, FunctionCallInfo fcinfo)
{
BlockNumber nblocks;
! BlockNumber sample_block[SAMPLE_SIZE];
pgstattuple_type stat = {0};
pgstattuple_block_stats block_stats[SAMPLE_SIZE];
int i;
***************
*** 695,700 ****
--- 713,725 ----
for (i=0 ; i<SAMPLE_SIZE ; i++)
{
+ sample_block[i] = (double)random() / RAND_MAX * nblocks;
+ }
+
+ qsort(sample_block, SAMPLE_SIZE, sizeof(BlockNumber), compare_blocknumbers);
+
+ for (i=0 ; i<SAMPLE_SIZE ; i++)
+ {
Buffer buffer;
Page page;
OffsetNumber pageoff;
***************
*** 702,713 ****
/*
* sample random blocks
*/
- block = (double)random() / RAND_MAX * nblocks;
-
CHECK_FOR_INTERRUPTS();
/* Read and lock buffer */
! buffer = ReadBuffer(rel, block);
LockBuffer(buffer, BUFFER_LOCK_SHARE);
page = BufferGetPage(buffer);
--- 727,736 ----
/*
* sample random blocks
*/
CHECK_FOR_INTERRUPTS();
/* Read and lock buffer */
! buffer = ReadBuffer(rel, sample_block[i]);
LockBuffer(buffer, BUFFER_LOCK_SHARE);
page = BufferGetPage(buffer);
On 11/10/13 17:33, Jaime Casanova wrote:
On Thu, Oct 10, 2013 at 5:32 PM, Mark Kirkwood
<mark.kirkwood@catalyst.net.nz> wrote:Quietly replying to myself - looking at the code the sampler does 3000
random page reads...FWIW, something that bothers me is that there is 3000 random page
reads... i mean, why 3000? how do you get that number as absolute for
good accuracy in every relation? why not a percentage, maybe an
argument to the function?
Right,
Looking at http://en.wikipedia.org/wiki/Sample_size_determination maybe
it is not such a bad setting - tho 400 or 1000 seem to be good magic
numbers too (if we are gonna punt on single number that is).
Perhaps it should reuse (some of) the code from acquire_sample_rows in
src/commands/analyze.c (we can't use exactly the same logic, as we need
to keep block data together in this case).
Cheers
Mark
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 11/10/13 17:49, Mark Kirkwood wrote:
On 11/10/13 17:08, Satoshi Nagayasu wrote:
(2013/10/11 7:32), Mark Kirkwood wrote:
On 11/10/13 11:09, Mark Kirkwood wrote:
On 16/09/13 16:20, Satoshi Nagayasu wrote:
(2013/09/15 11:07), Peter Eisentraut wrote:
On Sat, 2013-09-14 at 16:18 +0900, Satoshi Nagayasu wrote:
I'm looking forward to seeing more feedback on this approach,
in terms of design and performance improvement.
So, I have submitted this for the next CF.Your patch fails to build:
pgstattuple.c: In function 'pgstat_heap_sample':
pgstattuple.c:737:13: error: 'SnapshotNow' undeclared (first use in
this function)
pgstattuple.c:737:13: note: each undeclared identifier is reported
only once for each function it appears inThanks for checking. Fixed to eliminate SnapshotNow.
This seems like a cool idea! I took a quick look, and initially
replicated the sort of improvement you saw:bench=# explain analyze select * from pgstattuple('pgbench_accounts');
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple (cost=0.00..0.01 rows=1 width=72) (actual
time=786.368..786.369 rows=1 loops=1)
Total runtime: 786.384 ms
(2 rows)bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=12.004..12.005 rows=1 loops=1)
Total runtime: 12.019 ms
(2 rows)I wondered what sort of difference eliminating caching would make:
$ sudo sysctl -w vm.drop_caches=3
Repeating the above queries:
bench=# explain analyze select * from pgstattuple('pgbench_accounts');
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple (cost=0.00..0.01 rows=1 width=72) (actual
time=9503.774..9503.776 rows=1 loops=1)
Total runtime: 9504.523 ms
(2 rows)bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=12330.630..12330.631 rows=1 loops=1)
Total runtime: 12331.353 ms
(2 rows)So the sampling code seems *slower* when the cache is completely cold -
is that expected? (I have not looked at how the code works yet - I'll
dive in later if I get a chance)!Thanks for testing that. It would be very helpful to improve the
performance.Quietly replying to myself - looking at the code the sampler does 3000
random page reads... I guess this is slower than 163935 (number of pages
in pgbench_accounts) sequential page reads thanks to os readahead on my
type of disk (WD Velociraptor). Tweaking the number of random reads (i.e
the sample size) down helps - but obviously that can impact estimation
accuracy.Thinking about this a bit more, I guess the elapsed runtime is not the
*only* thing to consider - the sampling code will cause way less
disruption to the os page cache (3000 pages vs possibly lots more than
3000 for reading an entire relation). Thoughts?
I think it could be improved by sorting sample block numbers
*before* physical block reads in order to eliminate random access
on the disk.pseudo code:
--------------------------------------
for (i=0 ; i<SAMPLE_SIZE ; i++)
{
sample_block[i] = random();
}qsort(sample_block);
for (i=0 ; i<SAMPLE_SIZE ; i++)
{
buf = ReadBuffer(rel, sample_block[i]);do_some_stats_stuff(buf);
}
--------------------------------------I guess it would be helpful for reducing random access thing.
Any comments?
Ah yes - that's a good idea (rough patch to your patch attached)!
bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN--------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=9968.318..9968.319 rows=1 loops=1)
Total runtime: 9968.443 ms
(2 rows)
Actually - correcting my compare function to sort the blocks in
*increasing* order (doh), gets a better result:
bench=# explain analyze select * from pgstattuple2('pgbench_accounts');
NOTICE: pgstattuple2: SE tuple_count 0.00, tuple_len 0.00,
dead_tuple_count 0.00, dead_tuple_len 0.00, free_space 0.00
QUERY PLAN
------------------------------------------------------------------------------------------------------------------
Function Scan on pgstattuple2 (cost=0.00..0.01 rows=1 width=72) (actual
time=7055.840..7055.841 rows=1 loops=1)
Total runtime: 7055.954 ms
(2 rows)
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 11/10/13 17:33, Jaime Casanova wrote:
also the name pgstattuple2, doesn't convince me... maybe you can use
pgstattuple() if you use a second argument (percentage of the sample)
to overload the function
+1, that seems much nicer.
Regards
Mark
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 16/10/13 11:51, Mark Kirkwood wrote:
On 11/10/13 17:33, Jaime Casanova wrote:
also the name pgstattuple2, doesn't convince me... maybe you can use
pgstattuple() if you use a second argument (percentage of the sample)
to overload the function+1, that seems much nicer.
Oh - and if you produce a new version I'd be happy to review it (I guess
it will need to go in the next CF, given the discussion happening now).
Regards
Mark
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers