Define STATS_MIN_ROWS for minimum rows of stats in ANALYZE

Started by Ilia Evdokimovabout 1 year ago3 messages
#1Ilia Evdokimov
ilya.evdokimov@tantorlabs.com
1 attachment(s)

Hi hackers,

The repeated use of the number 300 in the ANALYZE-related code creates
redundancy and relies on scattered, sometimes unclear, comments to
explain its purpose. This can make the code harder to understand,
especially for new contributors who might not immediately understand its
significance. To address this, I propose introducing a macro
STATS_MIN_ROWS to represent this value and consolidating its explanation
in a single place, making the code more consistent and readable.

--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.

Attachments:

v1-0001-Define-STATS_MIN_ROWS-for-minimum-rows-of-stats.patchtext/x-patch; charset=UTF-8; name=v1-0001-Define-STATS_MIN_ROWS-for-minimum-rows-of-stats.patchDownload
From d3cbea3875fdc9866c86c64c0d0e0d838887040c Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdokimov@tantorlabs.com>
Date: Mon, 9 Dec 2024 15:50:40 +0300
Subject: [PATCH v1] Define STATS_MIN_ROWS for minimum rows of stats in ANALYZE

This introduces a macro STATS_MIN_ROWS to represent
the default minimum number of rows (300) sampled in ANALYZE.
---
 src/backend/commands/analyze.c                | 26 ++-----------------
 src/backend/statistics/extended_stats.c       |  2 +-
 src/backend/tsearch/ts_typanalyze.c           |  4 +--
 src/backend/utils/adt/rangetypes_typanalyze.c |  7 +++--
 src/include/statistics/statistics.h           | 23 ++++++++++++++++
 5 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 9a56de2282..ff7b6ffe8f 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -1897,42 +1897,20 @@ std_typanalyze(VacAttrStats *stats)
 	{
 		/* Seems to be a scalar datatype */
 		stats->compute_stats = compute_scalar_stats;
-		/*--------------------
-		 * The following choice of minrows is based on the paper
-		 * "Random sampling for histogram construction: how much is enough?"
-		 * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
-		 * Proceedings of ACM SIGMOD International Conference on Management
-		 * of Data, 1998, Pages 436-447.  Their Corollary 1 to Theorem 5
-		 * says that for table size n, histogram size k, maximum relative
-		 * error in bin size f, and error probability gamma, the minimum
-		 * random sample size is
-		 *		r = 4 * k * ln(2*n/gamma) / f^2
-		 * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
-		 *		r = 305.82 * k
-		 * Note that because of the log function, the dependence on n is
-		 * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
-		 * bin size error with probability 0.99.  So there's no real need to
-		 * scale for n, which is a good thing because we don't necessarily
-		 * know it at this point.
-		 *--------------------
-		 */
-		stats->minrows = 300 * stats->attstattarget;
 	}
 	else if (OidIsValid(eqopr))
 	{
 		/* We can still recognize distinct values */
 		stats->compute_stats = compute_distinct_stats;
-		/* Might as well use the same minrows as above */
-		stats->minrows = 300 * stats->attstattarget;
 	}
 	else
 	{
 		/* Can't do much but the trivial stuff */
 		stats->compute_stats = compute_trivial_stats;
-		/* Might as well use the same minrows as above */
-		stats->minrows = 300 * stats->attstattarget;
 	}
 
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
+
 	return true;
 }
 
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 99fdf208db..451e2d1e9c 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -320,7 +320,7 @@ ComputeExtStatisticsRows(Relation onerel,
 	MemoryContextDelete(cxt);
 
 	/* compute sample size based on the statistics target */
-	return (300 * result);
+	return (STATS_MIN_ROWS * result);
 }
 
 /*
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index ccafe42729..befd90c9d5 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -17,6 +17,7 @@
 #include "catalog/pg_operator.h"
 #include "commands/vacuum.h"
 #include "common/hashfn.h"
+#include "statistics/statistics.h"
 #include "tsearch/ts_type.h"
 #include "utils/builtins.h"
 #include "varatt.h"
@@ -64,8 +65,7 @@ ts_typanalyze(PG_FUNCTION_ARGS)
 		stats->attstattarget = default_statistics_target;
 
 	stats->compute_stats = compute_tsvector_stats;
-	/* see comment about the choice of minrows in commands/analyze.c */
-	stats->minrows = 300 * stats->attstattarget;
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 
 	PG_RETURN_BOOL(true);
 }
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c
index 3773f98115..7fcac5621c 100644
--- a/src/backend/utils/adt/rangetypes_typanalyze.c
+++ b/src/backend/utils/adt/rangetypes_typanalyze.c
@@ -26,6 +26,7 @@
 
 #include "catalog/pg_operator.h"
 #include "commands/vacuum.h"
+#include "statistics/statistics.h"
 #include "utils/float.h"
 #include "utils/fmgrprotos.h"
 #include "utils/lsyscache.h"
@@ -56,8 +57,7 @@ range_typanalyze(PG_FUNCTION_ARGS)
 
 	stats->compute_stats = compute_range_stats;
 	stats->extra_data = typcache;
-	/* same as in std_typanalyze */
-	stats->minrows = 300 * stats->attstattarget;
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 
 	PG_RETURN_BOOL(true);
 }
@@ -82,8 +82,7 @@ multirange_typanalyze(PG_FUNCTION_ARGS)
 
 	stats->compute_stats = compute_range_stats;
 	stats->extra_data = typcache;
-	/* same as in std_typanalyze */
-	stats->minrows = 300 * stats->attstattarget;
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 
 	PG_RETURN_BOOL(true);
 }
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h
index 7f2bf18716..f578f7fd1d 100644
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -22,6 +22,29 @@
 #define STATS_NDISTINCT_MAGIC		0xA352BFA4	/* struct identifier */
 #define STATS_NDISTINCT_TYPE_BASIC	1	/* struct version */
 
+/*--------------------
+ * Minimum of rows wanted for stats
+ *
+ * The following choice of minrows is based on the paper
+ * "Random sampling for histogram construction: how much is enough?"
+ * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
+ * Proceedings of ACM SIGMOD International Conference on Management
+ * of Data, 1998, Pages 436-447.  Their Corollary 1 to Theorem 5
+ * says that for table size n, histogram size k, maximum relative
+ * error in bin size f, and error probability gamma, the minimum
+ * random sample size is
+ *		r = 4 * k * ln(2*n/gamma) / f^2
+ * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
+ *		r = 305.82 * k
+ * Note that because of the log function, the dependence on n is
+ * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
+ * bin size error with probability 0.99.  So there's no real need to
+ * scale for n, which is a good thing because we don't necessarily
+ * know it at this point.
+ *--------------------
+ */
+#define STATS_MIN_ROWS 	300
+
 /* MVNDistinctItem represents a single combination of columns */
 typedef struct MVNDistinctItem
 {
-- 
2.34.1

#2Ilia Evdokimov
ilya.evdokimov@tantorlabs.com
In reply to: Ilia Evdokimov (#1)
1 attachment(s)
Re: Define STATS_MIN_ROWS for minimum rows of stats in ANALYZE

On 09.12.2024 16:10, Ilia Evdokimov wrote:

Hi hackers,

The repeated use of the number 300 in the ANALYZE-related code creates
redundancy and relies on scattered, sometimes unclear, comments to
explain its purpose. This can make the code harder to understand,
especially for new contributors who might not immediately understand
its significance. To address this, I propose introducing a macro
STATS_MIN_ROWS to represent this value and consolidating its
explanation in a single place, making the code more consistent and
readable.

--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.

Hi everyone,

Currently, the value 300 is used as the basis for determining the number
of rows sampled during ANALYZE, both for single-column and extended
statistics. While this value has a well-established rationale for
single-column statistics, its suitability for extended statistics
remains uncertain, as no specific research has confirmed that this is an
optimal choice for them. To better reflect this distinction, I propose
introducing two macros: STATS_MIN_ROWS for single-column statistics and
EXT_STATS_MIN_ROWS for extended statistics.

This change separates the concerns of single-column and extended
statistics sampling, making the code more explicit and easier to adapt
if future research suggests a different approach for extended
statistics. The values remain the same for now, but the introduction of
distinct macros improves clarity and prepares the codebase for potential
refinements.

Does this seem like a reasonable approach to handling these differences?

--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.

Attachments:

v2-0001-Define-macros-for-minimum-rows-of-stats.patchtext/x-patch; charset=UTF-8; name=v2-0001-Define-macros-for-minimum-rows-of-stats.patchDownload
From 164535616040447207c36ea47c52b206f4d77dcf Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdokimov@tantorlabs.ru>
Date: Tue, 10 Dec 2024 16:25:29 +0300
Subject: [PATCH v2] Define macros for minimum rows of stats in ANALYZE

This introduces two macros, STATS_MIN_ROWS and EXT_STATS_MIN_ROWS,
to represent the default minimum number of rows sampled in ANALYZE.
STATS_MIN_ROWS is used for single-column statistics,
while EXT_STATS_MIN_ROWS is intended for extended statistics.
Both macros replace the hardcoded value of 300, improving clarity.
---
 src/backend/commands/analyze.c                | 31 +++++--------------
 src/backend/statistics/extended_stats.c       |  2 +-
 src/backend/tsearch/ts_typanalyze.c           |  4 +--
 src/backend/utils/adt/rangetypes_typanalyze.c |  7 ++---
 .../statistics/extended_stats_internal.h      | 11 +++++++
 src/include/statistics/statistics.h           | 23 ++++++++++++++
 6 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 9a56de2282..e358b0d828 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -1897,42 +1897,25 @@ std_typanalyze(VacAttrStats *stats)
 	{
 		/* Seems to be a scalar datatype */
 		stats->compute_stats = compute_scalar_stats;
-		/*--------------------
-		 * The following choice of minrows is based on the paper
-		 * "Random sampling for histogram construction: how much is enough?"
-		 * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
-		 * Proceedings of ACM SIGMOD International Conference on Management
-		 * of Data, 1998, Pages 436-447.  Their Corollary 1 to Theorem 5
-		 * says that for table size n, histogram size k, maximum relative
-		 * error in bin size f, and error probability gamma, the minimum
-		 * random sample size is
-		 *		r = 4 * k * ln(2*n/gamma) / f^2
-		 * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
-		 *		r = 305.82 * k
-		 * Note that because of the log function, the dependence on n is
-		 * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
-		 * bin size error with probability 0.99.  So there's no real need to
-		 * scale for n, which is a good thing because we don't necessarily
-		 * know it at this point.
-		 *--------------------
-		 */
-		stats->minrows = 300 * stats->attstattarget;
 	}
 	else if (OidIsValid(eqopr))
 	{
 		/* We can still recognize distinct values */
 		stats->compute_stats = compute_distinct_stats;
-		/* Might as well use the same minrows as above */
-		stats->minrows = 300 * stats->attstattarget;
 	}
 	else
 	{
 		/* Can't do much but the trivial stuff */
 		stats->compute_stats = compute_trivial_stats;
-		/* Might as well use the same minrows as above */
-		stats->minrows = 300 * stats->attstattarget;
 	}
 
+	/*
+	 * For the scalar types, STATS_MIN_ROWS is derived from research on
+	 * optimal sample sizes for histograms. For other cases,
+	 * we assume the same value.
+	 */
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
+
 	return true;
 }
 
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 99fdf208db..8cd024b5ce 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -320,7 +320,7 @@ ComputeExtStatisticsRows(Relation onerel,
 	MemoryContextDelete(cxt);
 
 	/* compute sample size based on the statistics target */
-	return (300 * result);
+	return (EXT_STATS_MIN_ROWS * result);
 }
 
 /*
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index ccafe42729..befd90c9d5 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -17,6 +17,7 @@
 #include "catalog/pg_operator.h"
 #include "commands/vacuum.h"
 #include "common/hashfn.h"
+#include "statistics/statistics.h"
 #include "tsearch/ts_type.h"
 #include "utils/builtins.h"
 #include "varatt.h"
@@ -64,8 +65,7 @@ ts_typanalyze(PG_FUNCTION_ARGS)
 		stats->attstattarget = default_statistics_target;
 
 	stats->compute_stats = compute_tsvector_stats;
-	/* see comment about the choice of minrows in commands/analyze.c */
-	stats->minrows = 300 * stats->attstattarget;
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 
 	PG_RETURN_BOOL(true);
 }
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c
index 3773f98115..7fcac5621c 100644
--- a/src/backend/utils/adt/rangetypes_typanalyze.c
+++ b/src/backend/utils/adt/rangetypes_typanalyze.c
@@ -26,6 +26,7 @@
 
 #include "catalog/pg_operator.h"
 #include "commands/vacuum.h"
+#include "statistics/statistics.h"
 #include "utils/float.h"
 #include "utils/fmgrprotos.h"
 #include "utils/lsyscache.h"
@@ -56,8 +57,7 @@ range_typanalyze(PG_FUNCTION_ARGS)
 
 	stats->compute_stats = compute_range_stats;
 	stats->extra_data = typcache;
-	/* same as in std_typanalyze */
-	stats->minrows = 300 * stats->attstattarget;
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 
 	PG_RETURN_BOOL(true);
 }
@@ -82,8 +82,7 @@ multirange_typanalyze(PG_FUNCTION_ARGS)
 
 	stats->compute_stats = compute_range_stats;
 	stats->extra_data = typcache;
-	/* same as in std_typanalyze */
-	stats->minrows = 300 * stats->attstattarget;
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 
 	PG_RETURN_BOOL(true);
 }
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index 8eed9b338d..69c7fcca02 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -17,6 +17,17 @@
 #include "statistics/statistics.h"
 #include "utils/sortsupport.h"
 
+/* Minimum of rows wanted for extended stats
+ *
+ * Based on research, a fixed size proportional to statistics_target
+ * (300 * statistics_target) is generally sufficient for accurate single-column
+ * histograms and MCV lists. Its suitability for extended statistics.
+ *
+ * XXX While this value works reasonably well for individual columns, its
+ * suitability for extended statistics is still an open question.
+ */
+#define EXT_STATS_MIN_ROWS	300
+
 typedef struct
 {
 	Oid			eqopr;			/* '=' operator for datatype, if any */
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h
index 7f2bf18716..f578f7fd1d 100644
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -22,6 +22,29 @@
 #define STATS_NDISTINCT_MAGIC		0xA352BFA4	/* struct identifier */
 #define STATS_NDISTINCT_TYPE_BASIC	1	/* struct version */
 
+/*--------------------
+ * Minimum of rows wanted for stats
+ *
+ * The following choice of minrows is based on the paper
+ * "Random sampling for histogram construction: how much is enough?"
+ * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
+ * Proceedings of ACM SIGMOD International Conference on Management
+ * of Data, 1998, Pages 436-447.  Their Corollary 1 to Theorem 5
+ * says that for table size n, histogram size k, maximum relative
+ * error in bin size f, and error probability gamma, the minimum
+ * random sample size is
+ *		r = 4 * k * ln(2*n/gamma) / f^2
+ * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
+ *		r = 305.82 * k
+ * Note that because of the log function, the dependence on n is
+ * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
+ * bin size error with probability 0.99.  So there's no real need to
+ * scale for n, which is a good thing because we don't necessarily
+ * know it at this point.
+ *--------------------
+ */
+#define STATS_MIN_ROWS 	300
+
 /* MVNDistinctItem represents a single combination of columns */
 typedef struct MVNDistinctItem
 {
-- 
2.34.1

#3Ilia Evdokimov
ilya.evdokimov@tantorlabs.com
In reply to: Ilia Evdokimov (#2)
1 attachment(s)
Re: Define STATS_MIN_ROWS for minimum rows of stats in ANALYZE

On 10.12.2024 16:32, Ilia Evdokimov wrote:

On 09.12.2024 16:10, Ilia Evdokimov wrote:

Hi hackers,

The repeated use of the number 300 in the ANALYZE-related code
creates redundancy and relies on scattered, sometimes unclear,
comments to explain its purpose. This can make the code harder to
understand, especially for new contributors who might not immediately
understand its significance. To address this, I propose introducing a
macro STATS_MIN_ROWS to represent this value and consolidating its
explanation in a single place, making the code more consistent and
readable.

--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.

Hi everyone,

Currently, the value 300 is used as the basis for determining the
number of rows sampled during ANALYZE, both for single-column and
extended statistics. While this value has a well-established rationale
for single-column statistics, its suitability for extended statistics
remains uncertain, as no specific research has confirmed that this is
an optimal choice for them. To better reflect this distinction, I
propose introducing two macros: STATS_MIN_ROWS for single-column
statistics and EXT_STATS_MIN_ROWS for extended statistics.

This change separates the concerns of single-column and extended
statistics sampling, making the code more explicit and easier to adapt
if future research suggests a different approach for extended
statistics. The values remain the same for now, but the introduction
of distinct macros improves clarity and prepares the codebase for
potential refinements.

Does this seem like a reasonable approach to handling these differences?

--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.

Hi everyone,

In my opinion, it is more appropriate to define||EXT_STATS_MIN_ROWS as
STATS_MIN_ROWS. I also reverted some of the code comments and rewrote
others. I attached patch.

Any thoughts?

--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.

Attachments:

v3-0001-Define-macros-for-minimum-rows-of-stats.patchtext/x-patch; charset=UTF-8; name=v3-0001-Define-macros-for-minimum-rows-of-stats.patchDownload
From d8e38a78857006189a149acc5b36df3d77fe1e40 Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdokimov@tantorlabs.com>
Date: Fri, 3 Jan 2025 16:32:43 +0300
Subject: [PATCH] Define macros for minimum rows of stats in ANALYZE

This introduces two macros, STATS_MIN_ROWS and EXT_STATS_MIN_ROWS,
to represent the default minimum number of rows sampled in ANALYZE.
STATS_MIN_ROWS is used for single-column statistics,
while EXT_STATS_MIN_ROWS is intended for extended statistics.
Both macros replace the hardcoded value of 300, improving clarity.
---
 src/backend/commands/analyze.c                | 26 +++----------------
 src/backend/statistics/extended_stats.c       |  2 +-
 src/backend/tsearch/ts_typanalyze.c           |  5 ++--
 src/backend/utils/adt/rangetypes_typanalyze.c |  5 ++--
 .../statistics/extended_stats_internal.h      | 11 ++++++++
 src/include/statistics/statistics.h           | 23 ++++++++++++++++
 6 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 2a7769b1fd..4c02e4f18a 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -1897,40 +1897,22 @@ std_typanalyze(VacAttrStats *stats)
 	{
 		/* Seems to be a scalar datatype */
 		stats->compute_stats = compute_scalar_stats;
-		/*--------------------
-		 * The following choice of minrows is based on the paper
-		 * "Random sampling for histogram construction: how much is enough?"
-		 * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
-		 * Proceedings of ACM SIGMOD International Conference on Management
-		 * of Data, 1998, Pages 436-447.  Their Corollary 1 to Theorem 5
-		 * says that for table size n, histogram size k, maximum relative
-		 * error in bin size f, and error probability gamma, the minimum
-		 * random sample size is
-		 *		r = 4 * k * ln(2*n/gamma) / f^2
-		 * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
-		 *		r = 305.82 * k
-		 * Note that because of the log function, the dependence on n is
-		 * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
-		 * bin size error with probability 0.99.  So there's no real need to
-		 * scale for n, which is a good thing because we don't necessarily
-		 * know it at this point.
-		 *--------------------
-		 */
-		stats->minrows = 300 * stats->attstattarget;
+		/* see comment about the choice of minrows in statistics/statistics.h */
+		stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 	}
 	else if (OidIsValid(eqopr))
 	{
 		/* We can still recognize distinct values */
 		stats->compute_stats = compute_distinct_stats;
 		/* Might as well use the same minrows as above */
-		stats->minrows = 300 * stats->attstattarget;
+		stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 	}
 	else
 	{
 		/* Can't do much but the trivial stuff */
 		stats->compute_stats = compute_trivial_stats;
 		/* Might as well use the same minrows as above */
-		stats->minrows = 300 * stats->attstattarget;
+		stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 	}
 
 	return true;
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index aeec3ece41..05f3dd7fbf 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -317,7 +317,7 @@ ComputeExtStatisticsRows(Relation onerel,
 	MemoryContextDelete(cxt);
 
 	/* compute sample size based on the statistics target */
-	return (300 * result);
+	return (EXT_STATS_MIN_ROWS * result);
 }
 
 /*
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 1494da1c9d..e20e114335 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -17,6 +17,7 @@
 #include "catalog/pg_operator.h"
 #include "commands/vacuum.h"
 #include "common/hashfn.h"
+#include "statistics/statistics.h"
 #include "tsearch/ts_type.h"
 #include "utils/builtins.h"
 #include "varatt.h"
@@ -64,8 +65,8 @@ ts_typanalyze(PG_FUNCTION_ARGS)
 		stats->attstattarget = default_statistics_target;
 
 	stats->compute_stats = compute_tsvector_stats;
-	/* see comment about the choice of minrows in commands/analyze.c */
-	stats->minrows = 300 * stats->attstattarget;
+	/* see comment about the choice of minrows in statistics/statistics.h */
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 
 	PG_RETURN_BOOL(true);
 }
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c
index 9dc73af199..a78012ed89 100644
--- a/src/backend/utils/adt/rangetypes_typanalyze.c
+++ b/src/backend/utils/adt/rangetypes_typanalyze.c
@@ -26,6 +26,7 @@
 
 #include "catalog/pg_operator.h"
 #include "commands/vacuum.h"
+#include "statistics/statistics.h"
 #include "utils/float.h"
 #include "utils/fmgrprotos.h"
 #include "utils/lsyscache.h"
@@ -57,7 +58,7 @@ range_typanalyze(PG_FUNCTION_ARGS)
 	stats->compute_stats = compute_range_stats;
 	stats->extra_data = typcache;
 	/* same as in std_typanalyze */
-	stats->minrows = 300 * stats->attstattarget;
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 
 	PG_RETURN_BOOL(true);
 }
@@ -83,7 +84,7 @@ multirange_typanalyze(PG_FUNCTION_ARGS)
 	stats->compute_stats = compute_range_stats;
 	stats->extra_data = typcache;
 	/* same as in std_typanalyze */
-	stats->minrows = 300 * stats->attstattarget;
+	stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
 
 	PG_RETURN_BOOL(true);
 }
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index efcb7dc354..981211186a 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -17,6 +17,17 @@
 #include "statistics/statistics.h"
 #include "utils/sortsupport.h"
 
+/* Minimum of rows wanted for extended stats
+ *
+ * Based on research, a fixed size proportional to statistics_target
+ * (300 * statistics_target) is generally sufficient for accurate single-column
+ * histograms and MCV lists. Its suitability for extended statistics.
+ *
+ * XXX While this value works reasonably well for individual columns, its
+ * suitability for extended statistics is still an open question.
+ */
+#define EXT_STATS_MIN_ROWS	STATS_MIN_ROWS
+
 typedef struct
 {
 	Oid			eqopr;			/* '=' operator for datatype, if any */
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h
index 7dd0f97554..0863e029b1 100644
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -22,6 +22,29 @@
 #define STATS_NDISTINCT_MAGIC		0xA352BFA4	/* struct identifier */
 #define STATS_NDISTINCT_TYPE_BASIC	1	/* struct version */
 
+/*--------------------
+ * Minimum of rows wanted for stats
+ *
+ * The following choice of minrows is based on the paper
+ * "Random sampling for histogram construction: how much is enough?"
+ * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
+ * Proceedings of ACM SIGMOD International Conference on Management
+ * of Data, 1998, Pages 436-447.  Their Corollary 1 to Theorem 5
+ * says that for table size n, histogram size k, maximum relative
+ * error in bin size f, and error probability gamma, the minimum
+ * random sample size is
+ *		r = 4 * k * ln(2*n/gamma) / f^2
+ * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
+ *		r = 305.82 * k
+ * Note that because of the log function, the dependence on n is
+ * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
+ * bin size error with probability 0.99.  So there's no real need to
+ * scale for n, which is a good thing because we don't necessarily
+ * know it at this point.
+ *--------------------
+ */
+#define STATS_MIN_ROWS 	300
+
 /* MVNDistinctItem represents a single combination of columns */
 typedef struct MVNDistinctItem
 {
-- 
2.34.1