[PATCH] random_normal function
Just a utility function to generate random numbers from a normal
distribution. I find myself doing this several times a year, and I am
sure I must not be the only one.
random_normal(stddev float8 DEFAULT 1.0, mean float8 DEFAULT 0.0)
Attachments:
random_normal_01.patchapplication/octet-stream; name=random_normal_01.patchDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index e57ffce971..fddca8595c 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1815,6 +1815,27 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>random_normal</primary>
+ </indexterm>
+
+ <function>random_normal</function> (
+ <optional> <parameter>stddev</parameter> <type>double precision</type>
+ <optional>, <parameter>mean</parameter> <type>double precision</type></optional></optional> )
+ <returnvalue>double precision</returnvalue>
+ </para>
+ <para>
+ Returns a random value from a normal distribution, with default
+ <literal>stddev</literal> of 1.0 and default <literal>mean</literal> of 0.0.
+ </para>
+ <para>
+ <literal>random_normal(1.0, 0,0)</literal>
+ <returnvalue>0.051285419</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -1824,7 +1845,8 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
<returnvalue>void</returnvalue>
</para>
<para>
- Sets the seed for subsequent <literal>random()</literal> calls;
+ Sets the seed for subsequent <literal>random()</literal> and
+ <literal>random_normal()</literal> calls;
argument must be between -1.0 and 1.0, inclusive
</para>
<para>
diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql
index 52517a6531..1917480419 100644
--- a/src/backend/catalog/system_functions.sql
+++ b/src/backend/catalog/system_functions.sql
@@ -620,6 +620,13 @@ CREATE OR REPLACE FUNCTION
STABLE PARALLEL SAFE
AS 'sql_localtimestamp';
+CREATE OR REPLACE FUNCTION
+ random_normal(stddev float8 DEFAULT 1.0, mean float8 DEFAULT 0.0)
+RETURNS float8
+LANGUAGE INTERNAL
+STRICT VOLATILE PARALLEL SAFE
+AS 'make_interval';
+
--
-- The default permissions for functions mean that anyone can execute them.
-- A number of functions shouldn't be executable by just anyone, but rather
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index da97538ebe..b471689435 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2743,14 +2743,9 @@ datanh(PG_FUNCTION_ARGS)
}
-/*
- * drandom - returns a random number
- */
-Datum
-drandom(PG_FUNCTION_ARGS)
+static void
+drandom_check_default_seed()
{
- float8 result;
-
/* Initialize random seed, if not done yet in this process */
if (unlikely(!drandom_seed_set))
{
@@ -2770,6 +2765,17 @@ drandom(PG_FUNCTION_ARGS)
}
drandom_seed_set = true;
}
+}
+
+/*
+ * drandom - returns a random number
+ */
+Datum
+drandom(PG_FUNCTION_ARGS)
+{
+ float8 result;
+
+ drandom_check_default_seed();
/* pg_prng_double produces desired result range [0.0 - 1.0) */
result = pg_prng_double(&drandom_seed);
@@ -2777,6 +2783,34 @@ drandom(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * random_normal - returns a random number from the a normal distribution
+ *
+ */
+Datum
+drandom_normal(PG_FUNCTION_ARGS)
+{
+ float8 result;
+ float8 mean = 0.0;
+ float8 stddev = 1.0;
+
+ /* Read optional mean */
+ if (PG_NARGS() >= 2)
+ mean = PG_GETARG_FLOAT8(1);
+
+ /* Read optional stddev */
+ if (PG_NARGS() >= 1)
+ stddev = PG_GETARG_FLOAT8(0);
+
+ drandom_check_default_seed();
+
+ /* Get random value from normal(mean = 0.0, stddev = 1.0) */
+ result = pg_prng_double_normal(&drandom_seed);
+ /* Apply optional user parameters */
+ result = (stddev * result) + mean;
+
+ PG_RETURN_FLOAT8(result);
+}
/*
* setseed - set seed for the random number generator
diff --git a/src/common/pg_prng.c b/src/common/pg_prng.c
index 3d2f42724e..bebdae1ca0 100644
--- a/src/common/pg_prng.c
+++ b/src/common/pg_prng.c
@@ -20,6 +20,7 @@
#include "c.h"
#include <math.h> /* for ldexp() */
+#include <float.h> /* for DBL_EPSILON */
#include "common/pg_prng.h"
#include "port/pg_bitutils.h"
@@ -245,3 +246,30 @@ pg_prng_bool(pg_prng_state *state)
return (bool) (v >> 63);
}
+
+
+/*
+ * Select a random double from the normal distribution with
+ * mean = 0.0 and stddev = 1.0.
+ *
+ * To get a result for a different normal distribution use
+ * STDDEV * pg_prng_double_normal + MEAN
+ *
+ * Using https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
+ */
+double
+pg_prng_double_normal(pg_prng_state *state)
+{
+ double u1, u2, z0;
+ /* Ensure u1 is at least as big as epsilon */
+ do
+ {
+ u1 = pg_prng_double(state);
+ }
+ while (u1 <= DBL_EPSILON);
+ u2 = pg_prng_double(state);
+
+ /* Apply Box-Muller calculation for one normal-valued output */
+ z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
+ return z0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f9301b2627..078499935b 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3362,6 +3362,10 @@
{ oid => '1599', descr => 'set random seed',
proname => 'setseed', provolatile => 'v', proparallel => 'r',
prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' },
+{ oid => '5151', descr => 'random value from normal distribution',
+ proname => 'random_normal', provolatile => 'v', proparallel => 'r',
+ prorettype => 'float8', proargtypes => 'float8 float8',
+ proargnames => '{stddev,mean}', prosrc => 'drandom_normal' },
# OIDS 1600 - 1699
diff --git a/src/include/common/pg_prng.h b/src/include/common/pg_prng.h
index d9895b495c..5b3ef7cd83 100644
--- a/src/include/common/pg_prng.h
+++ b/src/include/common/pg_prng.h
@@ -55,6 +55,7 @@ extern uint32 pg_prng_uint32(pg_prng_state *state);
extern int32 pg_prng_int32(pg_prng_state *state);
extern int32 pg_prng_int32p(pg_prng_state *state);
extern double pg_prng_double(pg_prng_state *state);
+extern double pg_prng_double_normal(pg_prng_state *state);
extern bool pg_prng_bool(pg_prng_state *state);
#endif /* PG_PRNG_H */
On Thu, Dec 8, 2022 at 2:53 PM Paul Ramsey <pramsey@cleverelephant.ca>
wrote:
random_normal(stddev float8 DEFAULT 1.0, mean float8 DEFAULT 0.0)
Any particular justification for placing stddev before mean? A brief
survey seems to indicate other libraries, as well as (at least for me)
learned convention, has the mean be supplied first, then the standard
deviation. The implementation/commentary seems to use that convention as
well.
Some suggestions:
/* Apply optional user parameters */ - that isn't important or even what is
happening though, and the body of the function shouldn't care about the
source of the values for the variables it uses.
Instead:
/* Transform the normal standard variable (z) using the target normal
distribution parameters */
Personally I'd probably make that even more explicit:
+ float8 z
...
* z = pg_prng_double_normal(&drandom_seed)
+ /* ... */
* result = (stddev * z) + mean
And a possible micro-optimization...
+ bool rescale = true
+ if (PG_NARGS() = 0)
+ rescale = false
...
+ if (rescale)
... result = (stddev * z) + mean
+ else
+ result = z
David J.
On Thu, Dec 08, 2022 at 01:53:23PM -0800, Paul Ramsey wrote:
Just a utility function to generate random numbers from a normal
distribution. I find myself doing this several times a year, and I am
sure I must not be the only one.random_normal(stddev float8 DEFAULT 1.0, mean float8 DEFAULT 0.0)
+++ b/src/backend/catalog/system_functions.sql
@@ -620,6 +620,13 @@ CREATE OR REPLACE FUNCTION
STABLE PARALLEL SAFE
AS 'sql_localtimestamp';
+CREATE OR REPLACE FUNCTION
+ random_normal(stddev float8 DEFAULT 1.0, mean float8 DEFAULT 0.0)
+RETURNS float8
+LANGUAGE INTERNAL
+STRICT VOLATILE PARALLEL SAFE
+AS 'make_interval';
I guess make_interval is a typo ?
This is causing it to fail tests:
http://cfbot.cputube.org/paul-ramsey.html
BTW you can run the same tests as CFBOT does from your own github
account; see:
/messages/by-id/20221116232507.GO26337@telsasoft.com
--
Justin
On Dec 8, 2022, at 2:40 PM, David G. Johnston <david.g.johnston@gmail.com> wrote:
On Thu, Dec 8, 2022 at 2:53 PM Paul Ramsey <pramsey@cleverelephant.ca> wrote:
random_normal(stddev float8 DEFAULT 1.0, mean float8 DEFAULT 0.0)
Any particular justification for placing stddev before mean? A brief survey seems to indicate other libraries, as well as (at least for me) learned convention, has the mean be supplied first, then the standard deviation. The implementation/commentary seems to use that convention as well.
No, I'm not sure what was going through my head, but I'm sure it made sense at the time (maybe something like "people will tend to jimmy with the stddev more frequently than the mean"). I've reversed the order
Some suggestions:
Thanks! Taken :)
And a possible micro-optimization...
+ bool rescale = true + if (PG_NARGS() = 0) + rescale = false ... + if (rescale) ... result = (stddev * z) + mean + else + result = z
Feels a little too micro to me (relative to the overall cost of the transform from uniform to normal distribution). I'm going to leave it out unless you violently want it.
Revised patch attached.
Thanks!
P
Attachments:
random_normal_02.patchapplication/octet-stream; name=random_normal_02.patch; x-unix-mode=0644Download
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index e57ffce971..c467e73c2f 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1815,6 +1815,27 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>random_normal</primary>
+ </indexterm>
+
+ <function>random_normal</function> (
+ <optional> <parameter>mean</parameter> <type>double precision</type>
+ <optional>, <parameter>stddev</parameter> <type>double precision</type></optional></optional> )
+ <returnvalue>double precision</returnvalue>
+ </para>
+ <para>
+ Returns a random value from a normal distribution, with default
+ <literal>mean</literal> of 0.0 and default <literal>stddev</literal> of 1.0.
+ </para>
+ <para>
+ <literal>random_normal(0.0, 1.0)</literal>
+ <returnvalue>0.051285419</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -1824,7 +1845,8 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
<returnvalue>void</returnvalue>
</para>
<para>
- Sets the seed for subsequent <literal>random()</literal> calls;
+ Sets the seed for subsequent <literal>random()</literal> and
+ <literal>random_normal()</literal> calls;
argument must be between -1.0 and 1.0, inclusive
</para>
<para>
diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql
index 52517a6531..c800fdea90 100644
--- a/src/backend/catalog/system_functions.sql
+++ b/src/backend/catalog/system_functions.sql
@@ -620,6 +620,13 @@ CREATE OR REPLACE FUNCTION
STABLE PARALLEL SAFE
AS 'sql_localtimestamp';
+CREATE OR REPLACE FUNCTION
+ random_normal(mean float8 DEFAULT 0.0, stddev float8 DEFAULT 1.0)
+RETURNS float8
+LANGUAGE INTERNAL
+STRICT VOLATILE PARALLEL SAFE
+AS 'drandom_normal';
+
--
-- The default permissions for functions mean that anyone can execute them.
-- A number of functions shouldn't be executable by just anyone, but rather
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index da97538ebe..bf72825a54 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2743,14 +2743,9 @@ datanh(PG_FUNCTION_ARGS)
}
-/*
- * drandom - returns a random number
- */
-Datum
-drandom(PG_FUNCTION_ARGS)
+static void
+drandom_check_default_seed()
{
- float8 result;
-
/* Initialize random seed, if not done yet in this process */
if (unlikely(!drandom_seed_set))
{
@@ -2770,6 +2765,17 @@ drandom(PG_FUNCTION_ARGS)
}
drandom_seed_set = true;
}
+}
+
+/*
+ * drandom - returns a random number
+ */
+Datum
+drandom(PG_FUNCTION_ARGS)
+{
+ float8 result;
+
+ drandom_check_default_seed();
/* pg_prng_double produces desired result range [0.0 - 1.0) */
result = pg_prng_double(&drandom_seed);
@@ -2777,6 +2783,33 @@ drandom(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * random_normal - returns a random number from the a normal distribution
+ *
+ */
+Datum
+drandom_normal(PG_FUNCTION_ARGS)
+{
+ float8 result;
+ float8 mean = 0.0;
+ float8 stddev = 1.0;
+
+ /* Read optional stddev */
+ if (PG_NARGS() >= 2)
+ stddev = PG_GETARG_FLOAT8(1);
+
+ /* Read optional mean */
+ if (PG_NARGS() >= 1)
+ mean = PG_GETARG_FLOAT8(0);
+
+ drandom_check_default_seed();
+
+ /* Get random value from normal(mean = 0.0, stddev = 1.0) */
+ result = pg_prng_double_normal(&drandom_seed);
+ result = (stddev * result) + mean;
+
+ PG_RETURN_FLOAT8(result);
+}
/*
* setseed - set seed for the random number generator
diff --git a/src/common/pg_prng.c b/src/common/pg_prng.c
index 3d2f42724e..bebdae1ca0 100644
--- a/src/common/pg_prng.c
+++ b/src/common/pg_prng.c
@@ -20,6 +20,7 @@
#include "c.h"
#include <math.h> /* for ldexp() */
+#include <float.h> /* for DBL_EPSILON */
#include "common/pg_prng.h"
#include "port/pg_bitutils.h"
@@ -245,3 +246,30 @@ pg_prng_bool(pg_prng_state *state)
return (bool) (v >> 63);
}
+
+
+/*
+ * Select a random double from the normal distribution with
+ * mean = 0.0 and stddev = 1.0.
+ *
+ * To get a result for a different normal distribution use
+ * STDDEV * pg_prng_double_normal + MEAN
+ *
+ * Using https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
+ */
+double
+pg_prng_double_normal(pg_prng_state *state)
+{
+ double u1, u2, z0;
+ /* Ensure u1 is at least as big as epsilon */
+ do
+ {
+ u1 = pg_prng_double(state);
+ }
+ while (u1 <= DBL_EPSILON);
+ u2 = pg_prng_double(state);
+
+ /* Apply Box-Muller calculation for one normal-valued output */
+ z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
+ return z0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f9301b2627..f1633d476c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3362,6 +3362,10 @@
{ oid => '1599', descr => 'set random seed',
proname => 'setseed', provolatile => 'v', proparallel => 'r',
prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' },
+{ oid => '5151', descr => 'random value from normal distribution',
+ proname => 'random_normal', provolatile => 'v', proparallel => 'r',
+ prorettype => 'float8', proargtypes => 'float8 float8',
+ proargnames => '{mean,stddev}', prosrc => 'drandom_normal' },
# OIDS 1600 - 1699
diff --git a/src/include/common/pg_prng.h b/src/include/common/pg_prng.h
index d9895b495c..5b3ef7cd83 100644
--- a/src/include/common/pg_prng.h
+++ b/src/include/common/pg_prng.h
@@ -55,6 +55,7 @@ extern uint32 pg_prng_uint32(pg_prng_state *state);
extern int32 pg_prng_int32(pg_prng_state *state);
extern int32 pg_prng_int32p(pg_prng_state *state);
extern double pg_prng_double(pg_prng_state *state);
+extern double pg_prng_double_normal(pg_prng_state *state);
extern bool pg_prng_bool(pg_prng_state *state);
#endif /* PG_PRNG_H */
On Dec 8, 2022, at 2:46 PM, Justin Pryzby <pryzby@telsasoft.com> wrote:
I guess make_interval is a typo ?
This is causing it to fail tests:
http://cfbot.cputube.org/paul-ramsey.html
Yep, dumb typo, thanks! This bot is amazeballs, thank you!
P.
Revised patch attached.
And again, because I think I missed one change in the last one.
Attachments:
random_normal_03.patchapplication/octet-stream; name=random_normal_03.patch; x-unix-mode=0644Download
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index e57ffce971..c467e73c2f 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1815,6 +1815,27 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>random_normal</primary>
+ </indexterm>
+
+ <function>random_normal</function> (
+ <optional> <parameter>mean</parameter> <type>double precision</type>
+ <optional>, <parameter>stddev</parameter> <type>double precision</type></optional></optional> )
+ <returnvalue>double precision</returnvalue>
+ </para>
+ <para>
+ Returns a random value from a normal distribution, with default
+ <literal>mean</literal> of 0.0 and default <literal>stddev</literal> of 1.0.
+ </para>
+ <para>
+ <literal>random_normal(0.0, 1.0)</literal>
+ <returnvalue>0.051285419</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -1824,7 +1845,8 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
<returnvalue>void</returnvalue>
</para>
<para>
- Sets the seed for subsequent <literal>random()</literal> calls;
+ Sets the seed for subsequent <literal>random()</literal> and
+ <literal>random_normal()</literal> calls;
argument must be between -1.0 and 1.0, inclusive
</para>
<para>
diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql
index 52517a6531..c800fdea90 100644
--- a/src/backend/catalog/system_functions.sql
+++ b/src/backend/catalog/system_functions.sql
@@ -620,6 +620,13 @@ CREATE OR REPLACE FUNCTION
STABLE PARALLEL SAFE
AS 'sql_localtimestamp';
+CREATE OR REPLACE FUNCTION
+ random_normal(mean float8 DEFAULT 0.0, stddev float8 DEFAULT 1.0)
+RETURNS float8
+LANGUAGE INTERNAL
+STRICT VOLATILE PARALLEL SAFE
+AS 'drandom_normal';
+
--
-- The default permissions for functions mean that anyone can execute them.
-- A number of functions shouldn't be executable by just anyone, but rather
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index da97538ebe..6149200598 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2743,14 +2743,9 @@ datanh(PG_FUNCTION_ARGS)
}
-/*
- * drandom - returns a random number
- */
-Datum
-drandom(PG_FUNCTION_ARGS)
+static void
+drandom_check_default_seed()
{
- float8 result;
-
/* Initialize random seed, if not done yet in this process */
if (unlikely(!drandom_seed_set))
{
@@ -2770,6 +2765,17 @@ drandom(PG_FUNCTION_ARGS)
}
drandom_seed_set = true;
}
+}
+
+/*
+ * drandom - returns a random number
+ */
+Datum
+drandom(PG_FUNCTION_ARGS)
+{
+ float8 result;
+
+ drandom_check_default_seed();
/* pg_prng_double produces desired result range [0.0 - 1.0) */
result = pg_prng_double(&drandom_seed);
@@ -2777,6 +2783,35 @@ drandom(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * random_normal - returns a random number from the a normal distribution
+ *
+ */
+Datum
+drandom_normal(PG_FUNCTION_ARGS)
+{
+ float8 z, result;
+ float8 mean = 0.0;
+ float8 stddev = 1.0;
+
+ /* Read optional stddev */
+ if (PG_NARGS() >= 2)
+ stddev = PG_GETARG_FLOAT8(1);
+
+ /* Read optional mean */
+ if (PG_NARGS() >= 1)
+ mean = PG_GETARG_FLOAT8(0);
+
+ drandom_check_default_seed();
+
+ /* Get random value from standard normal(mean = 0.0, stddev = 1.0) */
+ z = pg_prng_double_normal(&drandom_seed);
+ /* Transform the normal standard variable (z) */
+ /* using the target normal distribution parameters */
+ result = (stddev * z) + mean;
+
+ PG_RETURN_FLOAT8(result);
+}
/*
* setseed - set seed for the random number generator
diff --git a/src/common/pg_prng.c b/src/common/pg_prng.c
index 3d2f42724e..bebdae1ca0 100644
--- a/src/common/pg_prng.c
+++ b/src/common/pg_prng.c
@@ -20,6 +20,7 @@
#include "c.h"
#include <math.h> /* for ldexp() */
+#include <float.h> /* for DBL_EPSILON */
#include "common/pg_prng.h"
#include "port/pg_bitutils.h"
@@ -245,3 +246,30 @@ pg_prng_bool(pg_prng_state *state)
return (bool) (v >> 63);
}
+
+
+/*
+ * Select a random double from the normal distribution with
+ * mean = 0.0 and stddev = 1.0.
+ *
+ * To get a result for a different normal distribution use
+ * STDDEV * pg_prng_double_normal + MEAN
+ *
+ * Using https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
+ */
+double
+pg_prng_double_normal(pg_prng_state *state)
+{
+ double u1, u2, z0;
+ /* Ensure u1 is at least as big as epsilon */
+ do
+ {
+ u1 = pg_prng_double(state);
+ }
+ while (u1 <= DBL_EPSILON);
+ u2 = pg_prng_double(state);
+
+ /* Apply Box-Muller calculation for one normal-valued output */
+ z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
+ return z0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f9301b2627..f1633d476c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3362,6 +3362,10 @@
{ oid => '1599', descr => 'set random seed',
proname => 'setseed', provolatile => 'v', proparallel => 'r',
prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' },
+{ oid => '5151', descr => 'random value from normal distribution',
+ proname => 'random_normal', provolatile => 'v', proparallel => 'r',
+ prorettype => 'float8', proargtypes => 'float8 float8',
+ proargnames => '{mean,stddev}', prosrc => 'drandom_normal' },
# OIDS 1600 - 1699
diff --git a/src/include/common/pg_prng.h b/src/include/common/pg_prng.h
index d9895b495c..5b3ef7cd83 100644
--- a/src/include/common/pg_prng.h
+++ b/src/include/common/pg_prng.h
@@ -55,6 +55,7 @@ extern uint32 pg_prng_uint32(pg_prng_state *state);
extern int32 pg_prng_int32(pg_prng_state *state);
extern int32 pg_prng_int32p(pg_prng_state *state);
extern double pg_prng_double(pg_prng_state *state);
+extern double pg_prng_double_normal(pg_prng_state *state);
extern bool pg_prng_bool(pg_prng_state *state);
#endif /* PG_PRNG_H */
On Dec 8, 2022, at 3:25 PM, Paul Ramsey <pramsey@cleverelephant.ca> wrote:
Revised patch attached.
And again, because I think I missed one change in the last one.
<random_normal_03.patch>
Final tme, with fixes from cirrusci.
Attachments:
random_normal_04.patchapplication/octet-stream; name=random_normal_04.patch; x-unix-mode=0644Download
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index e57ffce971..c467e73c2f 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1815,6 +1815,27 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>random_normal</primary>
+ </indexterm>
+
+ <function>random_normal</function> (
+ <optional> <parameter>mean</parameter> <type>double precision</type>
+ <optional>, <parameter>stddev</parameter> <type>double precision</type></optional></optional> )
+ <returnvalue>double precision</returnvalue>
+ </para>
+ <para>
+ Returns a random value from a normal distribution, with default
+ <literal>mean</literal> of 0.0 and default <literal>stddev</literal> of 1.0.
+ </para>
+ <para>
+ <literal>random_normal(0.0, 1.0)</literal>
+ <returnvalue>0.051285419</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -1824,7 +1845,8 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
<returnvalue>void</returnvalue>
</para>
<para>
- Sets the seed for subsequent <literal>random()</literal> calls;
+ Sets the seed for subsequent <literal>random()</literal> and
+ <literal>random_normal()</literal> calls;
argument must be between -1.0 and 1.0, inclusive
</para>
<para>
diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql
index 52517a6531..e645c2c7a6 100644
--- a/src/backend/catalog/system_functions.sql
+++ b/src/backend/catalog/system_functions.sql
@@ -620,6 +620,13 @@ CREATE OR REPLACE FUNCTION
STABLE PARALLEL SAFE
AS 'sql_localtimestamp';
+CREATE OR REPLACE FUNCTION
+ random_normal(mean float8 DEFAULT 0.0, stddev float8 DEFAULT 1.0)
+RETURNS float8
+LANGUAGE INTERNAL
+STRICT VOLATILE PARALLEL SAFE
+AS 'drandom_normal';
+
--
-- The default permissions for functions mean that anyone can execute them.
-- A number of functions shouldn't be executable by just anyone, but rather
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index da97538ebe..6149200598 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2743,14 +2743,9 @@ datanh(PG_FUNCTION_ARGS)
}
-/*
- * drandom - returns a random number
- */
-Datum
-drandom(PG_FUNCTION_ARGS)
+static void
+drandom_check_default_seed()
{
- float8 result;
-
/* Initialize random seed, if not done yet in this process */
if (unlikely(!drandom_seed_set))
{
@@ -2770,6 +2765,17 @@ drandom(PG_FUNCTION_ARGS)
}
drandom_seed_set = true;
}
+}
+
+/*
+ * drandom - returns a random number
+ */
+Datum
+drandom(PG_FUNCTION_ARGS)
+{
+ float8 result;
+
+ drandom_check_default_seed();
/* pg_prng_double produces desired result range [0.0 - 1.0) */
result = pg_prng_double(&drandom_seed);
@@ -2777,6 +2783,35 @@ drandom(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * random_normal - returns a random number from the a normal distribution
+ *
+ */
+Datum
+drandom_normal(PG_FUNCTION_ARGS)
+{
+ float8 z, result;
+ float8 mean = 0.0;
+ float8 stddev = 1.0;
+
+ /* Read optional stddev */
+ if (PG_NARGS() >= 2)
+ stddev = PG_GETARG_FLOAT8(1);
+
+ /* Read optional mean */
+ if (PG_NARGS() >= 1)
+ mean = PG_GETARG_FLOAT8(0);
+
+ drandom_check_default_seed();
+
+ /* Get random value from standard normal(mean = 0.0, stddev = 1.0) */
+ z = pg_prng_double_normal(&drandom_seed);
+ /* Transform the normal standard variable (z) */
+ /* using the target normal distribution parameters */
+ result = (stddev * z) + mean;
+
+ PG_RETURN_FLOAT8(result);
+}
/*
* setseed - set seed for the random number generator
diff --git a/src/common/pg_prng.c b/src/common/pg_prng.c
index 3d2f42724e..2c70b4a121 100644
--- a/src/common/pg_prng.c
+++ b/src/common/pg_prng.c
@@ -20,9 +20,11 @@
#include "c.h"
#include <math.h> /* for ldexp() */
+#include <float.h> /* for DBL_EPSILON */
#include "common/pg_prng.h"
#include "port/pg_bitutils.h"
+#include "utils/float.h"
/* process-wide state vector */
pg_prng_state pg_global_prng_state;
@@ -245,3 +247,30 @@ pg_prng_bool(pg_prng_state *state)
return (bool) (v >> 63);
}
+
+
+/*
+ * Select a random double from the normal distribution with
+ * mean = 0.0 and stddev = 1.0.
+ *
+ * To get a result for a different normal distribution use
+ * STDDEV * pg_prng_double_normal + MEAN
+ *
+ * Using https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
+ */
+double
+pg_prng_double_normal(pg_prng_state *state)
+{
+ double u1, u2, z0;
+ /* Ensure u1 is at least as big as epsilon */
+ do
+ {
+ u1 = pg_prng_double(state);
+ }
+ while (u1 <= DBL_EPSILON);
+ u2 = pg_prng_double(state);
+
+ /* Apply Box-Muller calculation for one normal-valued output */
+ z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
+ return z0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f9301b2627..f1633d476c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3362,6 +3362,10 @@
{ oid => '1599', descr => 'set random seed',
proname => 'setseed', provolatile => 'v', proparallel => 'r',
prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' },
+{ oid => '5151', descr => 'random value from normal distribution',
+ proname => 'random_normal', provolatile => 'v', proparallel => 'r',
+ prorettype => 'float8', proargtypes => 'float8 float8',
+ proargnames => '{mean,stddev}', prosrc => 'drandom_normal' },
# OIDS 1600 - 1699
diff --git a/src/include/common/pg_prng.h b/src/include/common/pg_prng.h
index d9895b495c..5b3ef7cd83 100644
--- a/src/include/common/pg_prng.h
+++ b/src/include/common/pg_prng.h
@@ -55,6 +55,7 @@ extern uint32 pg_prng_uint32(pg_prng_state *state);
extern int32 pg_prng_int32(pg_prng_state *state);
extern int32 pg_prng_int32p(pg_prng_state *state);
extern double pg_prng_double(pg_prng_state *state);
+extern double pg_prng_double_normal(pg_prng_state *state);
extern bool pg_prng_bool(pg_prng_state *state);
#endif /* PG_PRNG_H */
On Thu, Dec 08, 2022 at 04:44:56PM -0800, Paul Ramsey wrote:
Final tme, with fixes from cirrusci.
Well, why not. Seems like you would use that a lot with PostGIS.
#include <math.h> /* for ldexp() */
+#include <float.h> /* for DBL_EPSILON */
And be careful with the order here.
+static void
+drandom_check_default_seed()
We always use (void) rather than empty parenthesis sets.
I would not leave that unchecked, so I think that you should add
something in ramdom.sql. Or would you prefer switching some of
the regression tests be switched so as they use the new normal
function?
(Ahem. Bonus points for a random_string() returning a bytea, based on
pg_strong_random().)
--
Michael
On Dec 8, 2022, at 8:29 PM, Michael Paquier <michael@paquier.xyz> wrote:
On Thu, Dec 08, 2022 at 04:44:56PM -0800, Paul Ramsey wrote:
Final tme, with fixes from cirrusci.
Well, why not. Seems like you would use that a lot with PostGIS.
#include <math.h> /* for ldexp() */
+#include <float.h> /* for DBL_EPSILON */
And be careful with the order here.
Should be ... alphabetical?
+static void +drandom_check_default_seed() We always use (void) rather than empty parenthesis sets.
OK
I would not leave that unchecked, so I think that you should add
something in ramdom.sql. Or would you prefer switching some of
the regression tests be switched so as they use the new normal
function?
Reading through those tests... seems like they will (rarely) fail. Is that... OK?
The tests seem to be mostly worried that random() starts returning constants, which seems like a good thing to test for (is the random number generating returning randomness).
An obvious test for this function is that the mean and stddev converge on the supplied parameters, given enough inputs, which is actually kind of the opposite test. I use the same random number generator as the uniform distribution, so that aspect is already covered by the existing tests.
(Ahem. Bonus points for a random_string() returning a bytea, based on
pg_strong_random().)
Would love to. Separate patch of bundled into this one?
P
Show quoted text
--
Michael
On Dec 8, 2022, at 1:53 PM, Paul Ramsey <pramsey@cleverelephant.ca> wrote:
Just a utility function to generate random numbers from a normal
distribution. I find myself doing this several times a year, and I am
sure I must not be the only one.
Thanks for the patch. What do you think about these results?
+-- The semantics of a negative stddev are not well defined
+SELECT random_normal(mean := 0, stddev := -1);
+ random_normal
+---------------------
+ -1.0285744583010896
+(1 row)
+
+SELECT random_normal(mean := 0, stddev := '-Inf');
+ random_normal
+---------------
+ Infinity
+(1 row)
+
+-- This result may be defensible...
+SELECT random_normal(mean := '-Inf', stddev := 'Inf');
+ random_normal
+---------------
+ -Infinity
+(1 row)
+
+-- but if so, why is this NaN?
+SELECT random_normal(mean := 'Inf', stddev := 'Inf');
+ random_normal
+---------------
+ NaN
+(1 row)
+
—
Mark Dilger
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Dec 9, 2022, at 10:39 AM, Mark Dilger <mark.dilger@enterprisedb.com> wrote:
On Dec 8, 2022, at 1:53 PM, Paul Ramsey <pramsey@cleverelephant.ca> wrote:
Just a utility function to generate random numbers from a normal
distribution. I find myself doing this several times a year, and I am
sure I must not be the only one.Thanks for the patch. What do you think about these results?
Angels on pins time! :)
+-- The semantics of a negative stddev are not well defined +SELECT random_normal(mean := 0, stddev := -1); + random_normal +--------------------- + -1.0285744583010896 +(1 row)
Question is does a negative stddev make enough sense? It is functionally using fabs(stddev),
SELECT avg(random_normal(mean := 0, stddev := -1)) from generate_series(1,1000);
avg
---------------------
0.03156106778729526
So could toss an invalid parameter on negative? Not sure if that's more helpful than just being mellow about this input.
+ +SELECT random_normal(mean := 0, stddev := '-Inf'); + random_normal +--------------- + Infinity +(1 row)
The existing logic around means and stddevs and Inf is hard to tease out:
SELECT avg(v),stddev(v) from (VALUES ('Inf'::float8, '-Inf'::float8)) a(v);
avg | stddev
----------+--------
Infinity |
The return of NULL of stddev would seem to argue that null in this case means "does not compute" at some level. So return NULL on Inf stddev?
+ +-- This result may be defensible... +SELECT random_normal(mean := '-Inf', stddev := 'Inf'); + random_normal +--------------- + -Infinity +(1 row) + +-- but if so, why is this NaN? +SELECT random_normal(mean := 'Inf', stddev := 'Inf'); + random_normal +--------------- + NaN +(1 row)
An Inf mean only implies that one value in the distribution is Inf, but running the function in reverse (generating values) and only generating one value from the distribution implies we have to always return Inf (except in this case stddev is also Inf, so I'd go with NULL, assuming we accept the NULL premise above.
How do you read the tea leaves?
P.
On 12/9/22 13:51, Paul Ramsey wrote:
On Dec 9, 2022, at 10:39 AM, Mark Dilger <mark.dilger@enterprisedb.com> wrote:
On Dec 8, 2022, at 1:53 PM, Paul Ramsey <pramsey@cleverelephant.ca> wrote:
Just a utility function to generate random numbers from a normal
distribution. I find myself doing this several times a year, and I am
sure I must not be the only one.Thanks for the patch. What do you think about these results?
Angels on pins time! :)
I just noticed this thread -- what is lacking in the normal_rand()
function in the tablefunc contrib?
https://www.postgresql.org/docs/current/tablefunc.html#id-1.11.7.52.5
--
Joe Conway
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On Dec 9, 2022, at 11:01 AM, Joe Conway <mail@joeconway.com> wrote:
On 12/9/22 13:51, Paul Ramsey wrote:
On Dec 9, 2022, at 10:39 AM, Mark Dilger <mark.dilger@enterprisedb.com> wrote:
On Dec 8, 2022, at 1:53 PM, Paul Ramsey <pramsey@cleverelephant.ca> wrote:
Just a utility function to generate random numbers from a normal
distribution. I find myself doing this several times a year, and I am
sure I must not be the only one.Thanks for the patch. What do you think about these results?
Angels on pins time! :)
I just noticed this thread -- what is lacking in the normal_rand() function in the tablefunc contrib?
https://www.postgresql.org/docs/current/tablefunc.html#id-1.11.7.52.5
Simplicity I guess mostly. random_normal() has a direct analogue in random() which is also a core function. I mean it could equally be pointed out that a user can implement their own Box-Muller calculation pretty trivially. Part of this submission is a personal wondering to what extent the community values convenience vs composibility. The set-returning nature of normal_rand() may be a bit of a red herring to people who just want one value (even though normal_rand (1, 0.0, 1.0) does exactly what they want).
P.
On Dec 9, 2022, at 11:10 AM, Paul Ramsey <pramsey@cleverelephant.ca> wrote:
On Dec 9, 2022, at 11:01 AM, Joe Conway <mail@joeconway.com> wrote:
On 12/9/22 13:51, Paul Ramsey wrote:
On Dec 9, 2022, at 10:39 AM, Mark Dilger <mark.dilger@enterprisedb.com> wrote:
On Dec 8, 2022, at 1:53 PM, Paul Ramsey <pramsey@cleverelephant.ca> wrote:
Just a utility function to generate random numbers from a normal
distribution. I find myself doing this several times a year, and I am
sure I must not be the only one.Thanks for the patch. What do you think about these results?
Angels on pins time! :)
I just noticed this thread -- what is lacking in the normal_rand() function in the tablefunc contrib?
https://www.postgresql.org/docs/current/tablefunc.html#id-1.11.7.52.5
Simplicity I guess mostly. random_normal() has a direct analogue in random() which is also a core function. I mean it could equally be pointed out that a user can implement their own Box-Muller calculation pretty trivially. Part of this submission is a personal wondering to what extent the community values convenience vs composibility. The set-returning nature of normal_rand() may be a bit of a red herring to people who just want one value (even though normal_rand (1, 0.0, 1.0) does exactly what they want).
No related to the "reason to exist", but normal_rand() has some interesting behaviour under Mark's test cases!
select normal_rand (1, 'Inf', 'Inf'), a from generate_series(1,2) a;
normal_rand | a
-------------+---
NaN | 1
Infinity | 2
(2 rows)
On Dec 9, 2022, at 9:17 AM, Paul Ramsey <pramsey@cleverelephant.ca> wrote:
On Dec 8, 2022, at 8:29 PM, Michael Paquier <michael@paquier.xyz> wrote:
On Thu, Dec 08, 2022 at 04:44:56PM -0800, Paul Ramsey wrote:
Final tme, with fixes from cirrusci.
Well, why not. Seems like you would use that a lot with PostGIS.
#include <math.h> /* for ldexp() */
+#include <float.h> /* for DBL_EPSILON */
And be careful with the order here.Should be ... alphabetical?
+static void +drandom_check_default_seed() We always use (void) rather than empty parenthesis sets.OK
I would not leave that unchecked, so I think that you should add
something in ramdom.sql. Or would you prefer switching some of
the regression tests be switched so as they use the new normal
function?Reading through those tests... seems like they will (rarely) fail. Is that... OK?
The tests seem to be mostly worried that random() starts returning constants, which seems like a good thing to test for (is the random number generating returning randomness).
An obvious test for this function is that the mean and stddev converge on the supplied parameters, given enough inputs, which is actually kind of the opposite test. I use the same random number generator as the uniform distribution, so that aspect is already covered by the existing tests.(Ahem. Bonus points for a random_string() returning a bytea, based on
pg_strong_random().)Would love to. Separate patch of bundled into this one?
Here's the original with suggestions applied and a random_string that applies on top of it.
Thanks!
P
Attachments:
random_normal_05a.patchapplication/octet-stream; name=random_normal_05a.patch; x-unix-mode=0644Download
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index e57ffce971..c467e73c2f 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1815,6 +1815,27 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>random_normal</primary>
+ </indexterm>
+
+ <function>random_normal</function> (
+ <optional> <parameter>mean</parameter> <type>double precision</type>
+ <optional>, <parameter>stddev</parameter> <type>double precision</type></optional></optional> )
+ <returnvalue>double precision</returnvalue>
+ </para>
+ <para>
+ Returns a random value from a normal distribution, with default
+ <literal>mean</literal> of 0.0 and default <literal>stddev</literal> of 1.0.
+ </para>
+ <para>
+ <literal>random_normal(0.0, 1.0)</literal>
+ <returnvalue>0.051285419</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -1824,7 +1845,8 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
<returnvalue>void</returnvalue>
</para>
<para>
- Sets the seed for subsequent <literal>random()</literal> calls;
+ Sets the seed for subsequent <literal>random()</literal> and
+ <literal>random_normal()</literal> calls;
argument must be between -1.0 and 1.0, inclusive
</para>
<para>
diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql
index 52517a6531..e645c2c7a6 100644
--- a/src/backend/catalog/system_functions.sql
+++ b/src/backend/catalog/system_functions.sql
@@ -620,6 +620,13 @@ CREATE OR REPLACE FUNCTION
STABLE PARALLEL SAFE
AS 'sql_localtimestamp';
+CREATE OR REPLACE FUNCTION
+ random_normal(mean float8 DEFAULT 0.0, stddev float8 DEFAULT 1.0)
+RETURNS float8
+LANGUAGE INTERNAL
+STRICT VOLATILE PARALLEL SAFE
+AS 'drandom_normal';
+
--
-- The default permissions for functions mean that anyone can execute them.
-- A number of functions shouldn't be executable by just anyone, but rather
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index da97538ebe..d443aa985a 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2743,14 +2743,9 @@ datanh(PG_FUNCTION_ARGS)
}
-/*
- * drandom - returns a random number
- */
-Datum
-drandom(PG_FUNCTION_ARGS)
+static void
+drandom_check_default_seed(void)
{
- float8 result;
-
/* Initialize random seed, if not done yet in this process */
if (unlikely(!drandom_seed_set))
{
@@ -2770,6 +2765,17 @@ drandom(PG_FUNCTION_ARGS)
}
drandom_seed_set = true;
}
+}
+
+/*
+ * drandom - returns a random number
+ */
+Datum
+drandom(PG_FUNCTION_ARGS)
+{
+ float8 result;
+
+ drandom_check_default_seed();
/* pg_prng_double produces desired result range [0.0 - 1.0) */
result = pg_prng_double(&drandom_seed);
@@ -2777,6 +2783,35 @@ drandom(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * drandom_normal - returns a random number from a normal distribution
+ *
+ */
+Datum
+drandom_normal(PG_FUNCTION_ARGS)
+{
+ float8 z, result;
+ float8 mean = 0.0;
+ float8 stddev = 1.0;
+
+ /* Read optional stddev */
+ if (PG_NARGS() >= 2)
+ stddev = PG_GETARG_FLOAT8(1);
+
+ /* Read optional mean */
+ if (PG_NARGS() >= 1)
+ mean = PG_GETARG_FLOAT8(0);
+
+ drandom_check_default_seed();
+
+ /* Get random value from standard normal(mean = 0.0, stddev = 1.0) */
+ z = pg_prng_double_normal(&drandom_seed);
+ /* Transform the normal standard variable (z) */
+ /* using the target normal distribution parameters */
+ result = (stddev * z) + mean;
+
+ PG_RETURN_FLOAT8(result);
+}
/*
* setseed - set seed for the random number generator
diff --git a/src/common/pg_prng.c b/src/common/pg_prng.c
index 3d2f42724e..b93ee46c68 100644
--- a/src/common/pg_prng.c
+++ b/src/common/pg_prng.c
@@ -19,10 +19,12 @@
#include "c.h"
+#include <float.h> /* for DBL_EPSILON */
#include <math.h> /* for ldexp() */
#include "common/pg_prng.h"
#include "port/pg_bitutils.h"
+#include "utils/float.h"
/* process-wide state vector */
pg_prng_state pg_global_prng_state;
@@ -245,3 +247,30 @@ pg_prng_bool(pg_prng_state *state)
return (bool) (v >> 63);
}
+
+
+/*
+ * Select a random double from the normal distribution with
+ * mean = 0.0 and stddev = 1.0.
+ *
+ * To get a result for a different normal distribution use
+ * STDDEV * pg_prng_double_normal + MEAN
+ *
+ * Using https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
+ */
+double
+pg_prng_double_normal(pg_prng_state *state)
+{
+ double u1, u2, z0;
+ /* Ensure u1 is at least as big as epsilon */
+ do
+ {
+ u1 = pg_prng_double(state);
+ }
+ while (u1 <= DBL_EPSILON);
+ u2 = pg_prng_double(state);
+
+ /* Apply Box-Muller calculation for one normal-valued output */
+ z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
+ return z0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f9301b2627..f1633d476c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3362,6 +3362,10 @@
{ oid => '1599', descr => 'set random seed',
proname => 'setseed', provolatile => 'v', proparallel => 'r',
prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' },
+{ oid => '5151', descr => 'random value from normal distribution',
+ proname => 'random_normal', provolatile => 'v', proparallel => 'r',
+ prorettype => 'float8', proargtypes => 'float8 float8',
+ proargnames => '{mean,stddev}', prosrc => 'drandom_normal' },
# OIDS 1600 - 1699
diff --git a/src/include/common/pg_prng.h b/src/include/common/pg_prng.h
index d9895b495c..5b3ef7cd83 100644
--- a/src/include/common/pg_prng.h
+++ b/src/include/common/pg_prng.h
@@ -55,6 +55,7 @@ extern uint32 pg_prng_uint32(pg_prng_state *state);
extern int32 pg_prng_int32(pg_prng_state *state);
extern int32 pg_prng_int32p(pg_prng_state *state);
extern double pg_prng_double(pg_prng_state *state);
+extern double pg_prng_double_normal(pg_prng_state *state);
extern bool pg_prng_bool(pg_prng_state *state);
#endif /* PG_PRNG_H */
diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out
index a919b28d8d..eb31f93cf7 100644
--- a/src/test/regress/expected/random.out
+++ b/src/test/regress/expected/random.out
@@ -51,3 +51,19 @@ SELECT AVG(random) FROM RANDOM_TBL
-----
(0 rows)
+-- normal values converge on mean == 2.0
+SELECT round(avg(random_normal(2, 2)))
+ FROM generate_series(1, 10000);
+ round
+-------
+ 2
+(1 row)
+
+-- normal values converge on stddev == 2.0
+SELECT round(stddev(random_normal(2, 2)))
+ FROM generate_series(1, 10000);
+ round
+-------
+ 2
+(1 row)
+
diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql
index 8187b2c288..8ed737f3e5 100644
--- a/src/test/regress/sql/random.sql
+++ b/src/test/regress/sql/random.sql
@@ -42,3 +42,11 @@ SELECT random, count(random) FROM RANDOM_TBL
SELECT AVG(random) FROM RANDOM_TBL
HAVING AVG(random) NOT BETWEEN 80 AND 120;
+
+-- normal values converge on mean == 2.0
+SELECT round(avg(random_normal(2, 2)))
+ FROM generate_series(1, 10000);
+
+-- normal values converge on stddev == 2.0
+SELECT round(stddev(random_normal(2, 2)))
+ FROM generate_series(1, 10000);
random_normal_05b.patchapplication/octet-stream; name=random_normal_05b.patch; x-unix-mode=0644Download
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index c467e73c2f..2ca545b375 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1836,6 +1836,25 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry">
+ <para role="func_signature">
+ <indexterm>
+ <primary>random_string</primary>
+ </indexterm>
+ <function>random_string</function> (
+ <parameter>nbytes</parameter> <type>integer</type> )
+ <returnvalue>bytea</returnvalue>
+ </para>
+ <para>
+ Returns a bytea of the specified size, containing random values.
+ </para>
+ <para>
+ <literal>random_string(3)</literal>
+ <returnvalue>\x78da35</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index d443aa985a..2b444a4129 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2813,6 +2813,35 @@ drandom_normal(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * drandom_string - returns a bytea filled with random content
+ *
+ */
+Datum
+drandom_string(PG_FUNCTION_ARGS)
+{
+ int32 nbytes = PG_GETARG_INT32(0);;
+ if (nbytes < 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("return size must be non-negative")));
+ }
+ else
+ {
+ size_t result_size = VARHDRSZ + nbytes;
+ bytea *result = palloc(result_size);
+
+ if (nbytes > 0)
+ {
+ drandom_check_default_seed();
+ pg_strong_random(VARDATA(result), nbytes);
+ }
+ SET_VARSIZE(result, result_size);
+ PG_RETURN_BYTEA_P(result);
+ }
+}
+
/*
* setseed - set seed for the random number generator
*/
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f1633d476c..c14780be94 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3366,6 +3366,9 @@
proname => 'random_normal', provolatile => 'v', proparallel => 'r',
prorettype => 'float8', proargtypes => 'float8 float8',
proargnames => '{mean,stddev}', prosrc => 'drandom_normal' },
+{ oid => '5152', descr => 'fill bytea with random values',
+ proname => 'random_string', provolatile => 'v', proparallel => 'r',
+ prorettype => 'bytea', proargtypes => 'int4', prosrc => 'drandom_string' },
# OIDS 1600 - 1699
diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out
index eb31f93cf7..926e760af1 100644
--- a/src/test/regress/expected/random.out
+++ b/src/test/regress/expected/random.out
@@ -67,3 +67,11 @@ SELECT round(stddev(random_normal(2, 2)))
2
(1 row)
+-- should practically never happen
+SELECT bool_and(random_string(16) != random_string(16)) AS same
+ FROM generate_series(1,8);
+ same
+------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql
index 8ed737f3e5..3bf3db087e 100644
--- a/src/test/regress/sql/random.sql
+++ b/src/test/regress/sql/random.sql
@@ -50,3 +50,7 @@ SELECT round(avg(random_normal(2, 2)))
-- normal values converge on stddev == 2.0
SELECT round(stddev(random_normal(2, 2)))
FROM generate_series(1, 10000);
+
+-- should practically never happen
+SELECT bool_and(random_string(16) != random_string(16)) AS same
+ FROM generate_series(1,8);
On Dec 9, 2022, at 3:20 PM, Paul Ramsey <pramsey@cleverelephant.ca> wrote:
On Dec 9, 2022, at 9:17 AM, Paul Ramsey <pramsey@cleverelephant.ca> wrote:
On Dec 8, 2022, at 8:29 PM, Michael Paquier <michael@paquier.xyz> wrote:
On Thu, Dec 08, 2022 at 04:44:56PM -0800, Paul Ramsey wrote:
Final tme, with fixes from cirrusci.
Well, why not. Seems like you would use that a lot with PostGIS.
#include <math.h> /* for ldexp() */
+#include <float.h> /* for DBL_EPSILON */
And be careful with the order here.Should be ... alphabetical?
+static void +drandom_check_default_seed() We always use (void) rather than empty parenthesis sets.OK
I would not leave that unchecked, so I think that you should add
something in ramdom.sql. Or would you prefer switching some of
the regression tests be switched so as they use the new normal
function?Reading through those tests... seems like they will (rarely) fail. Is that... OK?
The tests seem to be mostly worried that random() starts returning constants, which seems like a good thing to test for (is the random number generating returning randomness).
An obvious test for this function is that the mean and stddev converge on the supplied parameters, given enough inputs, which is actually kind of the opposite test. I use the same random number generator as the uniform distribution, so that aspect is already covered by the existing tests.(Ahem. Bonus points for a random_string() returning a bytea, based on
pg_strong_random().)Would love to. Separate patch of bundled into this one?
Here's the original with suggestions applied and a random_string that applies on top of it.
Thanks!
P
Clearing up one CI failure.
Attachments:
random_normal_06a.patchapplication/octet-stream; name=random_normal_06a.patch; x-unix-mode=0644Download
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index e57ffce971..c467e73c2f 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1815,6 +1815,27 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>random_normal</primary>
+ </indexterm>
+
+ <function>random_normal</function> (
+ <optional> <parameter>mean</parameter> <type>double precision</type>
+ <optional>, <parameter>stddev</parameter> <type>double precision</type></optional></optional> )
+ <returnvalue>double precision</returnvalue>
+ </para>
+ <para>
+ Returns a random value from a normal distribution, with default
+ <literal>mean</literal> of 0.0 and default <literal>stddev</literal> of 1.0.
+ </para>
+ <para>
+ <literal>random_normal(0.0, 1.0)</literal>
+ <returnvalue>0.051285419</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -1824,7 +1845,8 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
<returnvalue>void</returnvalue>
</para>
<para>
- Sets the seed for subsequent <literal>random()</literal> calls;
+ Sets the seed for subsequent <literal>random()</literal> and
+ <literal>random_normal()</literal> calls;
argument must be between -1.0 and 1.0, inclusive
</para>
<para>
diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql
index 52517a6531..e645c2c7a6 100644
--- a/src/backend/catalog/system_functions.sql
+++ b/src/backend/catalog/system_functions.sql
@@ -620,6 +620,13 @@ CREATE OR REPLACE FUNCTION
STABLE PARALLEL SAFE
AS 'sql_localtimestamp';
+CREATE OR REPLACE FUNCTION
+ random_normal(mean float8 DEFAULT 0.0, stddev float8 DEFAULT 1.0)
+RETURNS float8
+LANGUAGE INTERNAL
+STRICT VOLATILE PARALLEL SAFE
+AS 'drandom_normal';
+
--
-- The default permissions for functions mean that anyone can execute them.
-- A number of functions shouldn't be executable by just anyone, but rather
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index da97538ebe..d443aa985a 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2743,14 +2743,9 @@ datanh(PG_FUNCTION_ARGS)
}
-/*
- * drandom - returns a random number
- */
-Datum
-drandom(PG_FUNCTION_ARGS)
+static void
+drandom_check_default_seed(void)
{
- float8 result;
-
/* Initialize random seed, if not done yet in this process */
if (unlikely(!drandom_seed_set))
{
@@ -2770,6 +2765,17 @@ drandom(PG_FUNCTION_ARGS)
}
drandom_seed_set = true;
}
+}
+
+/*
+ * drandom - returns a random number
+ */
+Datum
+drandom(PG_FUNCTION_ARGS)
+{
+ float8 result;
+
+ drandom_check_default_seed();
/* pg_prng_double produces desired result range [0.0 - 1.0) */
result = pg_prng_double(&drandom_seed);
@@ -2777,6 +2783,35 @@ drandom(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * drandom_normal - returns a random number from a normal distribution
+ *
+ */
+Datum
+drandom_normal(PG_FUNCTION_ARGS)
+{
+ float8 z, result;
+ float8 mean = 0.0;
+ float8 stddev = 1.0;
+
+ /* Read optional stddev */
+ if (PG_NARGS() >= 2)
+ stddev = PG_GETARG_FLOAT8(1);
+
+ /* Read optional mean */
+ if (PG_NARGS() >= 1)
+ mean = PG_GETARG_FLOAT8(0);
+
+ drandom_check_default_seed();
+
+ /* Get random value from standard normal(mean = 0.0, stddev = 1.0) */
+ z = pg_prng_double_normal(&drandom_seed);
+ /* Transform the normal standard variable (z) */
+ /* using the target normal distribution parameters */
+ result = (stddev * z) + mean;
+
+ PG_RETURN_FLOAT8(result);
+}
/*
* setseed - set seed for the random number generator
diff --git a/src/common/pg_prng.c b/src/common/pg_prng.c
index 3d2f42724e..59a532fda2 100644
--- a/src/common/pg_prng.c
+++ b/src/common/pg_prng.c
@@ -19,11 +19,17 @@
#include "c.h"
+#include <float.h> /* for DBL_EPSILON */
#include <math.h> /* for ldexp() */
#include "common/pg_prng.h"
#include "port/pg_bitutils.h"
+/* For environments that do not define M_PI in math.h */
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
/* process-wide state vector */
pg_prng_state pg_global_prng_state;
@@ -245,3 +251,30 @@ pg_prng_bool(pg_prng_state *state)
return (bool) (v >> 63);
}
+
+
+/*
+ * Select a random double from the normal distribution with
+ * mean = 0.0 and stddev = 1.0.
+ *
+ * To get a result for a different normal distribution use
+ * STDDEV * pg_prng_double_normal + MEAN
+ *
+ * Using https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
+ */
+double
+pg_prng_double_normal(pg_prng_state *state)
+{
+ double u1, u2, z0;
+ /* Ensure u1 is at least as big as epsilon */
+ do
+ {
+ u1 = pg_prng_double(state);
+ }
+ while (u1 <= DBL_EPSILON);
+ u2 = pg_prng_double(state);
+
+ /* Apply Box-Muller calculation for one normal-valued output */
+ z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
+ return z0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f9301b2627..f1633d476c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3362,6 +3362,10 @@
{ oid => '1599', descr => 'set random seed',
proname => 'setseed', provolatile => 'v', proparallel => 'r',
prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' },
+{ oid => '5151', descr => 'random value from normal distribution',
+ proname => 'random_normal', provolatile => 'v', proparallel => 'r',
+ prorettype => 'float8', proargtypes => 'float8 float8',
+ proargnames => '{mean,stddev}', prosrc => 'drandom_normal' },
# OIDS 1600 - 1699
diff --git a/src/include/common/pg_prng.h b/src/include/common/pg_prng.h
index d9895b495c..5b3ef7cd83 100644
--- a/src/include/common/pg_prng.h
+++ b/src/include/common/pg_prng.h
@@ -55,6 +55,7 @@ extern uint32 pg_prng_uint32(pg_prng_state *state);
extern int32 pg_prng_int32(pg_prng_state *state);
extern int32 pg_prng_int32p(pg_prng_state *state);
extern double pg_prng_double(pg_prng_state *state);
+extern double pg_prng_double_normal(pg_prng_state *state);
extern bool pg_prng_bool(pg_prng_state *state);
#endif /* PG_PRNG_H */
diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out
index a919b28d8d..eb31f93cf7 100644
--- a/src/test/regress/expected/random.out
+++ b/src/test/regress/expected/random.out
@@ -51,3 +51,19 @@ SELECT AVG(random) FROM RANDOM_TBL
-----
(0 rows)
+-- normal values converge on mean == 2.0
+SELECT round(avg(random_normal(2, 2)))
+ FROM generate_series(1, 10000);
+ round
+-------
+ 2
+(1 row)
+
+-- normal values converge on stddev == 2.0
+SELECT round(stddev(random_normal(2, 2)))
+ FROM generate_series(1, 10000);
+ round
+-------
+ 2
+(1 row)
+
diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql
index 8187b2c288..8ed737f3e5 100644
--- a/src/test/regress/sql/random.sql
+++ b/src/test/regress/sql/random.sql
@@ -42,3 +42,11 @@ SELECT random, count(random) FROM RANDOM_TBL
SELECT AVG(random) FROM RANDOM_TBL
HAVING AVG(random) NOT BETWEEN 80 AND 120;
+
+-- normal values converge on mean == 2.0
+SELECT round(avg(random_normal(2, 2)))
+ FROM generate_series(1, 10000);
+
+-- normal values converge on stddev == 2.0
+SELECT round(stddev(random_normal(2, 2)))
+ FROM generate_series(1, 10000);
random_normal_06b.patchapplication/octet-stream; name=random_normal_06b.patch; x-unix-mode=0644Download
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index c467e73c2f..2ca545b375 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1836,6 +1836,25 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry">
+ <para role="func_signature">
+ <indexterm>
+ <primary>random_string</primary>
+ </indexterm>
+ <function>random_string</function> (
+ <parameter>nbytes</parameter> <type>integer</type> )
+ <returnvalue>bytea</returnvalue>
+ </para>
+ <para>
+ Returns a bytea of the specified size, containing random values.
+ </para>
+ <para>
+ <literal>random_string(3)</literal>
+ <returnvalue>\x78da35</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index d443aa985a..2b444a4129 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2813,6 +2813,35 @@ drandom_normal(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * drandom_string - returns a bytea filled with random content
+ *
+ */
+Datum
+drandom_string(PG_FUNCTION_ARGS)
+{
+ int32 nbytes = PG_GETARG_INT32(0);;
+ if (nbytes < 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("return size must be non-negative")));
+ }
+ else
+ {
+ size_t result_size = VARHDRSZ + nbytes;
+ bytea *result = palloc(result_size);
+
+ if (nbytes > 0)
+ {
+ drandom_check_default_seed();
+ pg_strong_random(VARDATA(result), nbytes);
+ }
+ SET_VARSIZE(result, result_size);
+ PG_RETURN_BYTEA_P(result);
+ }
+}
+
/*
* setseed - set seed for the random number generator
*/
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f1633d476c..c14780be94 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3366,6 +3366,9 @@
proname => 'random_normal', provolatile => 'v', proparallel => 'r',
prorettype => 'float8', proargtypes => 'float8 float8',
proargnames => '{mean,stddev}', prosrc => 'drandom_normal' },
+{ oid => '5152', descr => 'fill bytea with random values',
+ proname => 'random_string', provolatile => 'v', proparallel => 'r',
+ prorettype => 'bytea', proargtypes => 'int4', prosrc => 'drandom_string' },
# OIDS 1600 - 1699
diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out
index eb31f93cf7..926e760af1 100644
--- a/src/test/regress/expected/random.out
+++ b/src/test/regress/expected/random.out
@@ -67,3 +67,11 @@ SELECT round(stddev(random_normal(2, 2)))
2
(1 row)
+-- should practically never happen
+SELECT bool_and(random_string(16) != random_string(16)) AS same
+ FROM generate_series(1,8);
+ same
+------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql
index 8ed737f3e5..3bf3db087e 100644
--- a/src/test/regress/sql/random.sql
+++ b/src/test/regress/sql/random.sql
@@ -50,3 +50,7 @@ SELECT round(avg(random_normal(2, 2)))
-- normal values converge on stddev == 2.0
SELECT round(stddev(random_normal(2, 2)))
FROM generate_series(1, 10000);
+
+-- should practically never happen
+SELECT bool_and(random_string(16) != random_string(16)) AS same
+ FROM generate_series(1,8);
On Tue, Dec 13, 2022 at 03:51:11PM -0800, Paul Ramsey wrote:
Clearing up one CI failure.
+-- normal values converge on stddev == 2.0
+SELECT round(stddev(random_normal(2, 2)))
+ FROM generate_series(1, 10000);
I am not sure that it is a good idea to make a test based on a random
behavior that should tend to a normalized value. This is costly in
cycles, requiring a lot of work just for generate_series(). You could
do the same kind of thing as random() a few lines above?
+SELECT bool_and(random_string(16) != random_string(16)) AS same
+ FROM generate_series(1,8);
That should be fine in terms of impossible chances :)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("return size must be non-negative")))
This could have a test, same for 0.
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
Postgres' float.h includes one version of that.
--
Michael
On Dec 14, 2022, at 9:17 PM, Michael Paquier <michael@paquier.xyz> wrote:
On Tue, Dec 13, 2022 at 03:51:11PM -0800, Paul Ramsey wrote:
Clearing up one CI failure.
+-- normal values converge on stddev == 2.0 +SELECT round(stddev(random_normal(2, 2))) + FROM generate_series(1, 10000);I am not sure that it is a good idea to make a test based on a random
behavior that should tend to a normalized value. This is costly in
cycles, requiring a lot of work just for generate_series(). You could
do the same kind of thing as random() a few lines above?+SELECT bool_and(random_string(16) != random_string(16)) AS same + FROM generate_series(1,8); That should be fine in terms of impossible chances :)+ ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("return size must be non-negative"))) This could have a test, same for 0.+#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif Postgres' float.h includes one version of that.
Thanks again!
P
Attachments:
random_normal_07a.patchapplication/octet-stream; name=random_normal_07a.patch; x-unix-mode=0644Download
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 1cd8b11334..cd0b4db07c 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1815,6 +1815,27 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>random_normal</primary>
+ </indexterm>
+
+ <function>random_normal</function> (
+ <optional> <parameter>mean</parameter> <type>double precision</type>
+ <optional>, <parameter>stddev</parameter> <type>double precision</type></optional></optional> )
+ <returnvalue>double precision</returnvalue>
+ </para>
+ <para>
+ Returns a random value from a normal distribution, with default
+ <literal>mean</literal> of 0.0 and default <literal>stddev</literal> of 1.0.
+ </para>
+ <para>
+ <literal>random_normal(0.0, 1.0)</literal>
+ <returnvalue>0.051285419</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -1824,7 +1845,8 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
<returnvalue>void</returnvalue>
</para>
<para>
- Sets the seed for subsequent <literal>random()</literal> calls;
+ Sets the seed for subsequent <literal>random()</literal> and
+ <literal>random_normal()</literal> calls;
argument must be between -1.0 and 1.0, inclusive
</para>
<para>
diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql
index 52517a6531..e645c2c7a6 100644
--- a/src/backend/catalog/system_functions.sql
+++ b/src/backend/catalog/system_functions.sql
@@ -620,6 +620,13 @@ CREATE OR REPLACE FUNCTION
STABLE PARALLEL SAFE
AS 'sql_localtimestamp';
+CREATE OR REPLACE FUNCTION
+ random_normal(mean float8 DEFAULT 0.0, stddev float8 DEFAULT 1.0)
+RETURNS float8
+LANGUAGE INTERNAL
+STRICT VOLATILE PARALLEL SAFE
+AS 'drandom_normal';
+
--
-- The default permissions for functions mean that anyone can execute them.
-- A number of functions shouldn't be executable by just anyone, but rather
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index b02a19be24..36a9712b0e 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2715,14 +2715,9 @@ datanh(PG_FUNCTION_ARGS)
}
-/*
- * drandom - returns a random number
- */
-Datum
-drandom(PG_FUNCTION_ARGS)
+static void
+drandom_check_default_seed(void)
{
- float8 result;
-
/* Initialize random seed, if not done yet in this process */
if (unlikely(!drandom_seed_set))
{
@@ -2742,6 +2737,17 @@ drandom(PG_FUNCTION_ARGS)
}
drandom_seed_set = true;
}
+}
+
+/*
+ * drandom - returns a random number
+ */
+Datum
+drandom(PG_FUNCTION_ARGS)
+{
+ float8 result;
+
+ drandom_check_default_seed();
/* pg_prng_double produces desired result range [0.0 - 1.0) */
result = pg_prng_double(&drandom_seed);
@@ -2749,6 +2755,35 @@ drandom(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * drandom_normal - returns a random number from a normal distribution
+ *
+ */
+Datum
+drandom_normal(PG_FUNCTION_ARGS)
+{
+ float8 z, result;
+ float8 mean = 0.0;
+ float8 stddev = 1.0;
+
+ /* Read optional stddev */
+ if (PG_NARGS() >= 2)
+ stddev = PG_GETARG_FLOAT8(1);
+
+ /* Read optional mean */
+ if (PG_NARGS() >= 1)
+ mean = PG_GETARG_FLOAT8(0);
+
+ drandom_check_default_seed();
+
+ /* Get random value from standard normal(mean = 0.0, stddev = 1.0) */
+ z = pg_prng_double_normal(&drandom_seed);
+ /* Transform the normal standard variable (z) */
+ /* using the target normal distribution parameters */
+ result = (stddev * z) + mean;
+
+ PG_RETURN_FLOAT8(result);
+}
/*
* setseed - set seed for the random number generator
diff --git a/src/common/pg_prng.c b/src/common/pg_prng.c
index 3d2f42724e..23bdb235ab 100644
--- a/src/common/pg_prng.c
+++ b/src/common/pg_prng.c
@@ -19,10 +19,13 @@
#include "c.h"
+#include <float.h> /* for DBL_EPSILON */
#include <math.h> /* for ldexp() */
#include "common/pg_prng.h"
#include "port/pg_bitutils.h"
+#include "utils/float.h"
+
/* process-wide state vector */
pg_prng_state pg_global_prng_state;
@@ -245,3 +248,30 @@ pg_prng_bool(pg_prng_state *state)
return (bool) (v >> 63);
}
+
+
+/*
+ * Select a random double from the normal distribution with
+ * mean = 0.0 and stddev = 1.0.
+ *
+ * To get a result for a different normal distribution use
+ * STDDEV * pg_prng_double_normal + MEAN
+ *
+ * Using https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
+ */
+double
+pg_prng_double_normal(pg_prng_state *state)
+{
+ double u1, u2, z0;
+ /* Ensure u1 is at least as big as epsilon */
+ do
+ {
+ u1 = pg_prng_double(state);
+ }
+ while (u1 <= DBL_EPSILON);
+ u2 = pg_prng_double(state);
+
+ /* Apply Box-Muller calculation for one normal-valued output */
+ z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
+ return z0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 719599649a..b923db6fda 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3362,6 +3362,10 @@
{ oid => '1599', descr => 'set random seed',
proname => 'setseed', provolatile => 'v', proparallel => 'r',
prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' },
+{ oid => '5151', descr => 'random value from normal distribution',
+ proname => 'random_normal', provolatile => 'v', proparallel => 'r',
+ prorettype => 'float8', proargtypes => 'float8 float8',
+ proargnames => '{mean,stddev}', prosrc => 'drandom_normal' },
# OIDS 1600 - 1699
diff --git a/src/include/common/pg_prng.h b/src/include/common/pg_prng.h
index d9895b495c..5b3ef7cd83 100644
--- a/src/include/common/pg_prng.h
+++ b/src/include/common/pg_prng.h
@@ -55,6 +55,7 @@ extern uint32 pg_prng_uint32(pg_prng_state *state);
extern int32 pg_prng_int32(pg_prng_state *state);
extern int32 pg_prng_int32p(pg_prng_state *state);
extern double pg_prng_double(pg_prng_state *state);
+extern double pg_prng_double_normal(pg_prng_state *state);
extern bool pg_prng_bool(pg_prng_state *state);
#endif /* PG_PRNG_H */
diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out
index a919b28d8d..784001480b 100644
--- a/src/test/regress/expected/random.out
+++ b/src/test/regress/expected/random.out
@@ -51,3 +51,31 @@ SELECT AVG(random) FROM RANDOM_TBL
-----
(0 rows)
+-- now test the random_normal()
+TRUNCATE random_tbl;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 1) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 1) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 1) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 1) < 0;
+-- expect similar, but not identical values
+SELECT random, count(random) FROM random_tbl
+ GROUP BY random HAVING count(random) > 3;
+ random | count
+--------+-------
+(0 rows)
+
+-- approximately check expected distribution
+SELECT AVG(random) FROM random_tbl
+ HAVING AVG(random) NOT BETWEEN 400 AND 600;
+ avg
+-----
+(0 rows)
+
diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql
index 8187b2c288..4e0a91c3e4 100644
--- a/src/test/regress/sql/random.sql
+++ b/src/test/regress/sql/random.sql
@@ -42,3 +42,28 @@ SELECT random, count(random) FROM RANDOM_TBL
SELECT AVG(random) FROM RANDOM_TBL
HAVING AVG(random) NOT BETWEEN 80 AND 120;
+
+-- now test the random_normal()
+TRUNCATE random_tbl;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 1) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 1) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 1) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 1) < 0;
+
+-- expect similar, but not identical values
+SELECT random, count(random) FROM random_tbl
+ GROUP BY random HAVING count(random) > 3;
+
+-- approximately check expected distribution
+SELECT AVG(random) FROM random_tbl
+ HAVING AVG(random) NOT BETWEEN 400 AND 600;
+
+
random_normal_07b.patchapplication/octet-stream; name=random_normal_07b.patch; x-unix-mode=0644Download
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index cd0b4db07c..8e56faaf9f 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1836,6 +1836,25 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry">
+ <para role="func_signature">
+ <indexterm>
+ <primary>random_string</primary>
+ </indexterm>
+ <function>random_string</function> (
+ <parameter>nbytes</parameter> <type>integer</type> )
+ <returnvalue>bytea</returnvalue>
+ </para>
+ <para>
+ Returns a bytea of the specified size, containing random values.
+ </para>
+ <para>
+ <literal>random_string(3)</literal>
+ <returnvalue>\x78da35</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index 36a9712b0e..1590546759 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2785,6 +2785,35 @@ drandom_normal(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * drandom_string - returns a bytea filled with random content
+ *
+ */
+Datum
+drandom_string(PG_FUNCTION_ARGS)
+{
+ int32 nbytes = PG_GETARG_INT32(0);;
+ if (nbytes < 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("string size must be non-negative")));
+ }
+ else
+ {
+ size_t result_size = VARHDRSZ + nbytes;
+ bytea *result = palloc(result_size);
+
+ if (nbytes > 0)
+ {
+ drandom_check_default_seed();
+ pg_strong_random(VARDATA(result), nbytes);
+ }
+ SET_VARSIZE(result, result_size);
+ PG_RETURN_BYTEA_P(result);
+ }
+}
+
/*
* setseed - set seed for the random number generator
*/
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index b923db6fda..42ddb2f827 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3366,6 +3366,9 @@
proname => 'random_normal', provolatile => 'v', proparallel => 'r',
prorettype => 'float8', proargtypes => 'float8 float8',
proargnames => '{mean,stddev}', prosrc => 'drandom_normal' },
+{ oid => '5152', descr => 'fill bytea with random values',
+ proname => 'random_string', provolatile => 'v', proparallel => 'r',
+ prorettype => 'bytea', proargtypes => 'int4', prosrc => 'drandom_string' },
# OIDS 1600 - 1699
diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out
index 784001480b..34e42a9f66 100644
--- a/src/test/regress/expected/random.out
+++ b/src/test/regress/expected/random.out
@@ -79,3 +79,21 @@ SELECT AVG(random) FROM random_tbl
-----
(0 rows)
+-- not allowed
+SELECT random_string(-1);
+ERROR: string size must be non-negative
+-- zero length bytea
+SELECT random_string(0);
+ random_string
+---------------
+ \x
+(1 row)
+
+-- should practically never happen
+SELECT bool_and(random_string(16) != random_string(16)) AS same
+ FROM generate_series(1,8);
+ same
+------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql
index 4e0a91c3e4..7be48d83ae 100644
--- a/src/test/regress/sql/random.sql
+++ b/src/test/regress/sql/random.sql
@@ -66,4 +66,12 @@ SELECT random, count(random) FROM random_tbl
SELECT AVG(random) FROM random_tbl
HAVING AVG(random) NOT BETWEEN 400 AND 600;
+-- not allowed
+SELECT random_string(-1);
+-- zero length bytea
+SELECT random_string(0);
+
+-- should practically never happen
+SELECT bool_and(random_string(16) != random_string(16)) AS same
+ FROM generate_series(1,8);
Hello Paul,
My 0.02€ about the patch:
Patch did not apply with "git apply", I had to "patch -p1 <" and there was
a bunch of warnings.
Patches compile and make check is okay.
The first patch adds empty lines at the end of "sql/random.sql", I think
that it should not.
# About random_normal:
I'm fine with providing a random_normal implementation at prng and SQL
levels.
There is already such an implementation in "pgbench.c", which outputs
integers, I'd suggest that it should also use the new prng function, there
should not be two box-muller transformations in pg.
# About pg_prng_double_normal:
On the comment, I'd write "mean + stddev * val" instead of starting with
the stddev part.
Usually there is an empty line between the variable declarations and the
first statement.
There should be a comment about why it needs u1
larger than some epsilon. This constraint seems to generate a small bias?
I'd suggest to add the UNLIKELY() compiler hint on the loop.
# About random_string:
Should the doc about random_string tell that the output bytes are expected
to be uniformly distributed? Does it return "random values" or "pseudo
random values"?
I do not understand why the "drandom_string" function is in "float.c", as
it is not really related to floats. Also it does not return a string but a
bytea, so why call it "_string" in the first place? I'm do not think that
it should use pg_strong_random which may be very costly on some platform.
Also, pg_strong_random does not use the seed, so I do not understand why
it needs to be checked. I'd suggest that the output should really be
uniform pseudo-random, possibly based on the drandom() state, or maybe
not.
Overall, I think that there should be a clearer discussion and plan about
which random functionS postgres should provide to complement the standard
instead of going there… randomly:-)
--
Fabien.
On Sat, Dec 17, 2022 at 05:49:15PM +0100, Fabien COELHO wrote:
Overall, I think that there should be a clearer discussion and plan about
which random functionS postgres should provide to complement the standard
instead of going there… randomly:-)
So, what does the specification tells about seeds, normal and random
functions? A bunch of DBMSs implement RAND, sometimes RANDOM, SEED or
even NORMAL using from time to time specific SQL keywords to do the
work.
Note that SQLValueFunction made the addition of more returning data
types a bit more complicated (not much, still) than the new
COERCE_SQL_SYNTAX by going through a mapping function, so the
keyword/function mapping is straight-forward.
--
Michael
Bonjour Michaël,
Overall, I think that there should be a clearer discussion and plan about
which random functionS postgres should provide to complement the standard
instead of going there… randomly:-)So, what does the specification tells about seeds, normal and random
functions? A bunch of DBMSs implement RAND, sometimes RANDOM, SEED or
even NORMAL using from time to time specific SQL keywords to do the
work.
I do not have the SQL standard, so I have no idea about what is in there.
From a typical use case point of view, I'd say uniform, normal and
exponential would make sense for floats. I'm also okay with generating a
uniform bytes pseudo-randomly.
I'd be more at ease to add simple functions rather than a special
heavy-on-keywords syntax, even if standard.
Note that SQLValueFunction made the addition of more returning data
types a bit more complicated (not much, still) than the new
COERCE_SQL_SYNTAX by going through a mapping function, so the
keyword/function mapping is straight-forward.
I'm unclear about why this paragraph is here.
--
Fabien.
On 12/19/22 04:36, Michael Paquier wrote:
On Sat, Dec 17, 2022 at 05:49:15PM +0100, Fabien COELHO wrote:
Overall, I think that there should be a clearer discussion and plan about
which random functionS postgres should provide to complement the standard
instead of going there… randomly:-)So, what does the specification tells about seeds, normal and random
functions?
Nothing at all.
--
Vik Fearing
On Wed, Dec 21, 2022 at 08:47:32AM +0100, Fabien COELHO wrote:
From a typical use case point of view, I'd say uniform, normal and
exponential would make sense for floats. I'm also okay with generating a
uniform bytes pseudo-randomly.
I'd agree with this set.
I'd be more at ease to add simple functions rather than a special
heavy-on-keywords syntax, even if standard.
Okay.
Note that SQLValueFunction made the addition of more returning data
types a bit more complicated (not much, still) than the new
COERCE_SQL_SYNTAX by going through a mapping function, so the
keyword/function mapping is straight-forward.I'm unclear about why this paragraph is here.
Just saying that using COERCE_SQL_SYNTAX for SQL keywords is easier
than the older style. If the SQL specification mentions no SQL
keywords for such things, this is irrelevant, of course :)
--
Michael
On Thu, Dec 08, 2022 at 02:58:02PM -0800, Paul Ramsey wrote:
On Dec 8, 2022, at 2:46 PM, Justin Pryzby <pryzby@telsasoft.com> wrote:
I guess make_interval is a typo ?
This is causing it to fail tests:
http://cfbot.cputube.org/paul-ramsey.htmlYep, dumb typo, thanks! This bot is amazeballs, thank you!
This is still failing tests - did you enable cirrusci on your own github
account to run available checks on the patch ?
--
Justin
On Fri, Dec 30, 2022 at 09:58:04PM -0600, Justin Pryzby wrote:
This is still failing tests - did you enable cirrusci on your own github
account to run available checks on the patch ?
FYI, here is the failure:
[21:23:10.814] In file included from pg_prng.c:27:
[21:23:10.814] ../../src/include/utils/float.h:46:16: error: ‘struct
Node’ declared inside parameter list will not be visible outside of
this definition or declaration [-Werror]
[21:23:10.814] 46 | struct Node *escontext);
And a link to it, from the CF bot:
https://cirrus-ci.com/task/5969961391226880?logs=gcc_warning#L452
--
Michael
Michael Paquier <michael@paquier.xyz> writes:
FYI, here is the failure:
[21:23:10.814] In file included from pg_prng.c:27:
[21:23:10.814] ../../src/include/utils/float.h:46:16: error: ‘struct
Node’ declared inside parameter list will not be visible outside of
this definition or declaration [-Werror]
[21:23:10.814] 46 | struct Node *escontext);
Hmm ... this looks an awful lot like it is the fault of ccff2d20e
not of the random_normal patch; that is, we probably need a
"struct Node" stub declaration in float.h. However, why are we
not seeing any reports of this from elsewhere? I'm concerned now
that there are more places also needing stub declarations, but
my test process didn't expose it.
regards, tom lane
I wrote:
Michael Paquier <michael@paquier.xyz> writes:
FYI, here is the failure:
[21:23:10.814] In file included from pg_prng.c:27:
[21:23:10.814] ../../src/include/utils/float.h:46:16: error: ‘struct
Node’ declared inside parameter list will not be visible outside of
this definition or declaration [-Werror]
[21:23:10.814] 46 | struct Node *escontext);
Hmm ... this looks an awful lot like it is the fault of ccff2d20e
not of the random_normal patch; that is, we probably need a
"struct Node" stub declaration in float.h.
[ ... some head-scratching later ... ]
No, we don't per our normal headerscheck rules, which are that
headers such as utils/float.h need to be compilable after including
just postgres.h. The "struct Node" stub declaration in elog.h will
be visible, making the declaration of float8in_internal kosher.
So the problem in this patch is that it's trying to include
utils/float.h in a src/common file, where we have not included
postgres.h. Question is, why did you do that? I see nothing in
pg_prng_double_normal() that looks like it should require that header.
If it did, it'd be questionable whether it's okay to be in src/common.
regards, tom lane
I wrote:
So the problem in this patch is that it's trying to include
utils/float.h in a src/common file, where we have not included
postgres.h. Question is, why did you do that?
(Ah, for M_PI ... but our practice is just to duplicate that #define
where needed outside the backend.)
I spent some time reviewing this patch. I'm on board with
inventing random_normal(): the definition seems solid and
the use-case for it seems reasonably well established.
I'm not necessarily against inventing similar functions for
other distributions, but this patch is not required to do so.
We can leave that discussion until somebody is motivated to
submit a patch for one.
On the other hand, I'm much less on board with inventing
random_string(): we don't have any clear field demand for it
and the appropriate definitional details are a lot less obvious
(for example, whether it needs to be based on pg_strong_random()
rather than the random() sequence). I think we should leave that
out, and I have done so in the attached updated patch.
I noted several errors in the submitted patch. It was creating
the function as PARALLEL SAFE which is just wrong, and the whole
business with checking PG_NARGS is useless because it will always
be 2. (That's not how default arguments work.)
The business with checking against DBL_EPSILON seems wrong too.
All we need is to ensure that u1 > 0 so that log(u1) will not
choke; per spec, log() is defined for any positive input. I see that
that seems to have been modeled on the C++ code in the Wikipedia
page, but I'm not sure that C++'s epsilon means the same thing, and
if it does then their example code is just wrong. See the discussion
about "tails truncation" immediately above it: artificially
constraining the range of u1 just limits how much of the tail
of the distribution we can reproduce. So that led me to doing
it the same way as in the existing Box-Muller code in pgbench,
which I then deleted per Fabien's advice.
BTW, the pgbench code was using sin() not cos(), which I duplicated
because using cos() causes the expected output of the pgbench tests
to change. I'm not sure whether there was any hard reason to prefer
one or the other, and we can certainly change the expected output
if there's some reason to prefer cos().
I concur with not worrying about the Inf/NaN cases that Mark
pointed out. It's not obvious that the results the proposed code
produces are wrong, and it's even less obvious that anyone will
ever care.
Also, I tried running the new random.sql regression cases over
and over, and found that the "not all duplicates" test fails about
one time in 100000 or so. We could probably tolerate that given
that the random test is marked "ignore" in parallel_schedule, but
I thought it best to add one more iteration so we could knock the
odds down. Also I changed the test iterations so they weren't
all invoking random_normal() in exactly the same way.
This version seems committable to me, barring objections.
regards, tom lane
Attachments:
random_normal_8.patchtext/x-diff; charset=UTF-8; name=random_normal_8.patchDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 3bf8d021c3..b67dc26a35 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1815,6 +1815,28 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>random_normal</primary>
+ </indexterm>
+
+ <function>random_normal</function> (
+ <optional> <parameter>mean</parameter> <type>double precision</type>
+ <optional>, <parameter>stddev</parameter> <type>double precision</type> </optional></optional> )
+ <returnvalue>double precision</returnvalue>
+ </para>
+ <para>
+ Returns a random value from the normal distribution with the given
+ parameters; <parameter>mean</parameter> defaults to 0.0
+ and <parameter>stddev</parameter> defaults to 1.0
+ </para>
+ <para>
+ <literal>random_normal(0.0, 1.0)</literal>
+ <returnvalue>0.051285419</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -1824,7 +1846,8 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
<returnvalue>void</returnvalue>
</para>
<para>
- Sets the seed for subsequent <literal>random()</literal> calls;
+ Sets the seed for subsequent <literal>random()</literal> and
+ <literal>random_normal()</literal> calls;
argument must be between -1.0 and 1.0, inclusive
</para>
<para>
@@ -1848,6 +1871,7 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
Without any prior <function>setseed()</function> call in the same
session, the first <function>random()</function> call obtains a seed
from a platform-dependent source of random bits.
+ These remarks hold equally for <function>random_normal()</function>.
</para>
<para>
diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql
index f2470708e9..83ca893444 100644
--- a/src/backend/catalog/system_functions.sql
+++ b/src/backend/catalog/system_functions.sql
@@ -66,6 +66,13 @@ CREATE OR REPLACE FUNCTION bit_length(text)
IMMUTABLE PARALLEL SAFE STRICT COST 1
RETURN octet_length($1) * 8;
+CREATE OR REPLACE FUNCTION
+ random_normal(mean float8 DEFAULT 0, stddev float8 DEFAULT 1)
+ RETURNS float8
+ LANGUAGE internal
+ VOLATILE PARALLEL RESTRICTED STRICT COST 1
+AS 'drandom_normal';
+
CREATE OR REPLACE FUNCTION log(numeric)
RETURNS numeric
LANGUAGE sql
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index 56e349b888..d290b4ca67 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2743,13 +2743,11 @@ datanh(PG_FUNCTION_ARGS)
/*
- * drandom - returns a random number
+ * initialize_drandom_seed - initialize drandom_seed if not yet done
*/
-Datum
-drandom(PG_FUNCTION_ARGS)
+static void
+initialize_drandom_seed(void)
{
- float8 result;
-
/* Initialize random seed, if not done yet in this process */
if (unlikely(!drandom_seed_set))
{
@@ -2769,6 +2767,17 @@ drandom(PG_FUNCTION_ARGS)
}
drandom_seed_set = true;
}
+}
+
+/*
+ * drandom - returns a random number
+ */
+Datum
+drandom(PG_FUNCTION_ARGS)
+{
+ float8 result;
+
+ initialize_drandom_seed();
/* pg_prng_double produces desired result range [0.0 - 1.0) */
result = pg_prng_double(&drandom_seed);
@@ -2776,6 +2785,27 @@ drandom(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(result);
}
+/*
+ * drandom_normal - returns a random number from a normal distribution
+ */
+Datum
+drandom_normal(PG_FUNCTION_ARGS)
+{
+ float8 mean = PG_GETARG_FLOAT8(0);
+ float8 stddev = PG_GETARG_FLOAT8(1);
+ float8 result,
+ z;
+
+ initialize_drandom_seed();
+
+ /* Get random value from standard normal(mean = 0.0, stddev = 1.0) */
+ z = pg_prng_double_normal(&drandom_seed);
+ /* Transform the normal standard variable (z) */
+ /* using the target normal distribution parameters */
+ result = (stddev * z) + mean;
+
+ PG_RETURN_FLOAT8(result);
+}
/*
* setseed - set seed for the random number generator
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 18d9c94ebd..9c12ffaea9 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -1136,8 +1136,8 @@ getGaussianRand(pg_prng_state *state, int64 min, int64 max,
Assert(parameter >= MIN_GAUSSIAN_PARAM);
/*
- * Get user specified random number from this loop, with -parameter <
- * stdev <= parameter
+ * Get normally-distributed random number in the range -parameter <= stdev
+ * < parameter.
*
* This loop is executed until the number is in the expected range.
*
@@ -1149,25 +1149,7 @@ getGaussianRand(pg_prng_state *state, int64 min, int64 max,
*/
do
{
- /*
- * pg_prng_double generates [0, 1), but for the basic version of the
- * Box-Muller transform the two uniformly distributed random numbers
- * are expected to be in (0, 1] (see
- * https://en.wikipedia.org/wiki/Box-Muller_transform)
- */
- double rand1 = 1.0 - pg_prng_double(state);
- double rand2 = 1.0 - pg_prng_double(state);
-
- /* Box-Muller basic form transform */
- double var_sqrt = sqrt(-2.0 * log(rand1));
-
- stdev = var_sqrt * sin(2.0 * M_PI * rand2);
-
- /*
- * we may try with cos, but there may be a bias induced if the
- * previous value fails the test. To be on the safe side, let us try
- * over.
- */
+ stdev = pg_prng_double_normal(state);
}
while (stdev < -parameter || stdev >= parameter);
diff --git a/src/common/pg_prng.c b/src/common/pg_prng.c
index e58b471cff..6e07d1c810 100644
--- a/src/common/pg_prng.c
+++ b/src/common/pg_prng.c
@@ -19,11 +19,17 @@
#include "c.h"
-#include <math.h> /* for ldexp() */
+#include <math.h>
#include "common/pg_prng.h"
#include "port/pg_bitutils.h"
+/* X/Open (XSI) requires <math.h> to provide M_PI, but core POSIX does not */
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+
/* process-wide state vector */
pg_prng_state pg_global_prng_state;
@@ -235,6 +241,35 @@ pg_prng_double(pg_prng_state *state)
return ldexp((double) (v >> (64 - 52)), -52);
}
+/*
+ * Select a random double from the normal distribution with
+ * mean = 0.0 and stddev = 1.0.
+ *
+ * To get a result from a different normal distribution use
+ * STDDEV * pg_prng_double_normal() + MEAN
+ *
+ * Uses https://en.wikipedia.org/wiki/Box–Muller_transform
+ */
+double
+pg_prng_double_normal(pg_prng_state *state)
+{
+ double u1,
+ u2,
+ z0;
+
+ /*
+ * pg_prng_double generates [0, 1), but for the basic version of the
+ * Box-Muller transform the two uniformly distributed random numbers are
+ * expected to be in (0, 1]; in particular we'd better not compute log(0).
+ */
+ u1 = 1.0 - pg_prng_double(state);
+ u2 = 1.0 - pg_prng_double(state);
+
+ /* Apply Box-Muller transform to get one normal-valued output */
+ z0 = sqrt(-2.0 * log(u1)) * sin(2.0 * M_PI * u2);
+ return z0;
+}
+
/*
* Select a random boolean value.
*/
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 7be9a50147..3810de7b22 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3359,6 +3359,10 @@
{ oid => '1598', descr => 'random value',
proname => 'random', provolatile => 'v', proparallel => 'r',
prorettype => 'float8', proargtypes => '', prosrc => 'drandom' },
+{ oid => '8074', descr => 'random value from normal distribution',
+ proname => 'random_normal', provolatile => 'v', proparallel => 'r',
+ prorettype => 'float8', proargtypes => 'float8 float8',
+ prosrc => 'drandom_normal' },
{ oid => '1599', descr => 'set random seed',
proname => 'setseed', provolatile => 'v', proparallel => 'r',
prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' },
diff --git a/src/include/common/pg_prng.h b/src/include/common/pg_prng.h
index 9e11e8fffd..b5c0b8d288 100644
--- a/src/include/common/pg_prng.h
+++ b/src/include/common/pg_prng.h
@@ -55,6 +55,7 @@ extern uint32 pg_prng_uint32(pg_prng_state *state);
extern int32 pg_prng_int32(pg_prng_state *state);
extern int32 pg_prng_int32p(pg_prng_state *state);
extern double pg_prng_double(pg_prng_state *state);
+extern double pg_prng_double_normal(pg_prng_state *state);
extern bool pg_prng_bool(pg_prng_state *state);
#endif /* PG_PRNG_H */
diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out
index a919b28d8d..547b9c9b2b 100644
--- a/src/test/regress/expected/random.out
+++ b/src/test/regress/expected/random.out
@@ -51,3 +51,34 @@ SELECT AVG(random) FROM RANDOM_TBL
-----
(0 rows)
+-- now test random_normal()
+TRUNCATE random_tbl;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 1) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal() < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 10) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0;
+-- expect similar, but not identical values
+SELECT random, count(random) FROM random_tbl
+ GROUP BY random HAVING count(random) > 4;
+ random | count
+--------+-------
+(0 rows)
+
+-- approximately check expected distribution
+SELECT AVG(random) FROM random_tbl
+ HAVING AVG(random) NOT BETWEEN 400 AND 600;
+ avg
+-----
+(0 rows)
+
diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql
index 8187b2c288..56eb9b045c 100644
--- a/src/test/regress/sql/random.sql
+++ b/src/test/regress/sql/random.sql
@@ -42,3 +42,30 @@ SELECT random, count(random) FROM RANDOM_TBL
SELECT AVG(random) FROM RANDOM_TBL
HAVING AVG(random) NOT BETWEEN 80 AND 120;
+
+-- now test random_normal()
+
+TRUNCATE random_tbl;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 1) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal() < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(0, 10) < 0;
+INSERT INTO random_tbl (random)
+ SELECT count(*)
+ FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0;
+
+-- expect similar, but not identical values
+SELECT random, count(random) FROM random_tbl
+ GROUP BY random HAVING count(random) > 4;
+
+-- approximately check expected distribution
+SELECT AVG(random) FROM random_tbl
+ HAVING AVG(random) NOT BETWEEN 400 AND 600;
I wrote:
Also, I tried running the new random.sql regression cases over
and over, and found that the "not all duplicates" test fails about
one time in 100000 or so. We could probably tolerate that given
that the random test is marked "ignore" in parallel_schedule, but
I thought it best to add one more iteration so we could knock the
odds down.
Hmm ... it occurred to me to try the same check on the existing
random() tests (attached), and darn if they don't fail even more
often, usually within 50K iterations. So maybe we should rethink
that whole thing.
regards, tom lane
Attachments:
On Mon, 9 Jan 2023 at 00:20, Tom Lane <tgl@sss.pgh.pa.us> wrote:
This version seems committable to me, barring objections.
Whilst I have no objection to adding random_normal(), ISTM that we're
at risk of adding an arbitrary set of random functions without a clear
idea of where we'll end up, and how they'll affect one another (shared
state or not).
For example, random_normal() uses a PRNG and it shares the same state
as random()/setseed(). That's fine, and presumably if we add other
continuous distributions like exponential, they'll follow suit.
But what if we add something like random_int() to return a random
integer in a range (something that has been suggested before, and I
think would be very useful), or other discrete random functions? Would
they use a PRNG and share the same state? If so, having that state in
float.c wouldn't make sense anymore. If not, will they have their own
seed-setting functions, and how many seed-setting functions will we
end up with?
Over on [1]/messages/by-id/9d160a44-7675-51e8-60cf-6d64b76db831@aboutsource.net we're currently heading towards adding array_shuffle() and
array_sample() using a PRNG with a separate state shared just by those
2 functions, with no way to seed it, and not marking them as PARALLEL
RESTRICTED, so they can't be made deterministic.
ISTM that random functions should fall into 1 of 2 clearly defined
categories -- strong random functions and pseudorandom functions. IMO
all pseudorandom functions should be PARALLEL RESTRICTED and have a
way to set their seed, so that they can be made deterministic --
something that's very useful for users writing tests.
Now maybe having multiple seed-setting functions (one per datatype?)
is OK, but it seems unnecessary and cumbersome to me. Why would you
ever want to seed the random_int() sequence and the random() sequence
differently? No other library of random functions I know of behaves
that way.
So IMO all pseudorandom functions should share the same PRNG state and
seed-setting functions. That would mean they should all be in the same
(new) C file, so that the PRNG state can be kept private to that file.
I think it would also make sense to add a new "Random Functions"
section to the docs, and move the descriptions of random(),
random_normal() and setseed() there. That way, all the functions
affected by setseed() can be kept together on one page (future random
functions may not all be sensibly classified as "mathematical").
Regards,
Dean
[1]: /messages/by-id/9d160a44-7675-51e8-60cf-6d64b76db831@aboutsource.net
Dean Rasheed <dean.a.rasheed@gmail.com> writes:
So IMO all pseudorandom functions should share the same PRNG state and
seed-setting functions. That would mean they should all be in the same
(new) C file, so that the PRNG state can be kept private to that file.
I agree with this in principle, but I feel no need to actually reshuffle
the code before we accept a proposal for such a function that wouldn't
logically belong in float.c.
I think it would also make sense to add a new "Random Functions"
section to the docs, and move the descriptions of random(),
random_normal() and setseed() there.
Likewise, this'll just add confusion in the short run. A <sect1>
with only three functions in it is going to look mighty odd.
regards, tom lane
On Mon, 9 Jan 2023 at 15:26, Tom Lane <tgl@sss.pgh.pa.us> wrote:
Dean Rasheed <dean.a.rasheed@gmail.com> writes:
So IMO all pseudorandom functions should share the same PRNG state and
seed-setting functions. That would mean they should all be in the same
(new) C file, so that the PRNG state can be kept private to that file.I agree with this in principle, but I feel no need to actually reshuffle
the code before we accept a proposal for such a function that wouldn't
logically belong in float.c.I think it would also make sense to add a new "Random Functions"
section to the docs, and move the descriptions of random(),
random_normal() and setseed() there.Likewise, this'll just add confusion in the short run. A <sect1>
with only three functions in it is going to look mighty odd.
OK, that's fair enough, while we're just adding random_normal().
BTW, "UUID Functions" only has 1 function in it :-)
Regards,
Dean
I wrote:
Hmm ... it occurred to me to try the same check on the existing
random() tests (attached), and darn if they don't fail even more
often, usually within 50K iterations. So maybe we should rethink
that whole thing.
I pushed Paul's patch with the previously-discussed tests, but
the more I look at random.sql the less I like it. I propose
that we nuke the existing tests from orbit and replace with
something more or less as attached. This is faster than what
we have, removes the unnecessary dependency on the "onek" table,
and I believe it to be a substantially more thorough test of the
random functions' properties. (We could probably go further
than this, like trying to verify distribution properties. But
it's been too long since college statistics for me to want to
write such tests myself, and I'm not real sure we need them.)
BTW, if this does bring the probability of failure down to the
one-in-a-billion range, I think we could also nuke the whole
"ignore:" business, simplifying pg_regress and allowing the
random test to be run in parallel with others.
regards, tom lane
Attachments:
improve-random-tests-1.patchtext/x-diff; charset=us-ascii; name=improve-random-tests-1.patchDownload
diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out
index 30bd866138..2a9249f3cb 100644
--- a/src/test/regress/expected/random.out
+++ b/src/test/regress/expected/random.out
@@ -1,81 +1,110 @@
--
-- RANDOM
--- Test the random function
+-- Test random() and allies
--
--- count the number of tuples originally, should be 1000
-SELECT count(*) FROM onek;
- count
--------
- 1000
+-- Tests in this file may have a small probability of failure,
+-- since we are dealing with randomness. Try to keep the failure
+-- risk for any one test case under 1e-9.
+--
+-- There should be no duplicates in 1000 random() values.
+-- (Assuming 52 random bits in the float8 results, we could
+-- take as many as 3000 values and still have less than 1e-9 chance
+-- of failure, per https://en.wikipedia.org/wiki/Birthday_problem)
+SELECT r, count(*)
+FROM (SELECT random() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
+ r | count
+---+-------
+(0 rows)
+
+-- The range should be [0, 1). We can expect that at least one out of 2000
+-- random values is in the lowest or highest 1% of the range with failure
+-- probability less than about 1e-9.
+SELECT count(*) FILTER (WHERE r < 0 OR r >= 1) AS out_of_range,
+ (count(*) FILTER (WHERE r < 0.01)) > 0 AS has_small,
+ (count(*) FILTER (WHERE r > 0.99)) > 0 AS has_large
+FROM (SELECT random() r FROM generate_series(1, 2000)) ss;
+ out_of_range | has_small | has_large
+--------------+-----------+-----------
+ 0 | t | t
(1 row)
--- pick three random rows, they shouldn't match
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1);
- random
---------
-(0 rows)
-
--- count roughly 1/10 of the tuples
-CREATE TABLE RANDOM_TBL AS
- SELECT count(*) AS random
- FROM onek WHERE random() < 1.0/10;
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
--- now test that they are different counts
-SELECT random, count(random) FROM RANDOM_TBL
- GROUP BY random HAVING count(random) > 3;
- random | count
---------+-------
-(0 rows)
-
-SELECT AVG(random) FROM RANDOM_TBL
- HAVING AVG(random) NOT BETWEEN 80 AND 120;
- avg
------
-(0 rows)
-
-- now test random_normal()
-TRUNCATE random_tbl;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(0, 1) < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(0) < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal() < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0;
--- expect similar, but not identical values
-SELECT random, count(random) FROM random_tbl
- GROUP BY random HAVING count(random) > 3;
- random | count
---------+-------
+-- As above, there should be no duplicates in 1000 random_normal() values.
+SELECT r, count(*)
+FROM (SELECT random_normal() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
+ r | count
+---+-------
(0 rows)
--- approximately check expected distribution
-SELECT AVG(random) FROM random_tbl
- HAVING AVG(random) NOT BETWEEN 400 AND 600;
- avg
------
-(0 rows)
+-- ... unless we force the range (standard deviation) to zero.
+-- This is a good place to check that the mean input does something, too.
+SELECT r, count(*)
+FROM (SELECT random_normal(10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
+ r | count
+----+-------
+ 10 | 100
+(1 row)
+
+SELECT r, count(*)
+FROM (SELECT random_normal(-10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
+ r | count
+-----+-------
+ -10 | 100
+(1 row)
+
+-- setseed() should produce a reproducible series of numbers.
+SELECT setseed(0.5);
+ setseed
+---------
+
+(1 row)
+
+SELECT random() FROM generate_series(1, 10);
+ random
+---------------------
+ 0.9851677175347999
+ 0.825301858027981
+ 0.12974610012450416
+ 0.16356291958601088
+ 0.6476186144084
+ 0.8822771983038762
+ 0.1404566845227775
+ 0.15619865764623442
+ 0.5145227426983392
+ 0.7712969548127826
+(10 rows)
+
+SELECT random_normal() FROM generate_series(1, 10);
+ random_normal
+----------------------
+ 0.20853464493837737
+ 0.2645302405409627
+ -0.606752467900428
+ 0.8257994278526545
+ 1.7011161173535707
+ -0.22344546371618862
+ 0.24971241919099813
+ -1.2494722990669047
+ 0.1256271520436768
+ 0.47539161454401274
+(10 rows)
+
+SELECT random_normal(mean => 1, stddev => 0.1) r FROM generate_series(1, 10);
+ r
+--------------------
+ 1.006059728117318
+ 1.0968545301500248
+ 1.0286920613200707
+ 0.9094756767123359
+ 0.9837247631342648
+ 0.9393445495776231
+ 1.1871350020636253
+ 0.9622576842929335
+ 0.9144412068004066
+ 0.9640310555754329
+(10 rows)
diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql
index 3104af46b7..adfd7b3261 100644
--- a/src/test/regress/sql/random.sql
+++ b/src/test/regress/sql/random.sql
@@ -1,68 +1,49 @@
--
-- RANDOM
--- Test the random function
+-- Test random() and allies
+--
+-- Tests in this file may have a small probability of failure,
+-- since we are dealing with randomness. Try to keep the failure
+-- risk for any one test case under 1e-9.
--
--- count the number of tuples originally, should be 1000
-SELECT count(*) FROM onek;
-
--- pick three random rows, they shouldn't match
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1);
-
--- count roughly 1/10 of the tuples
-CREATE TABLE RANDOM_TBL AS
- SELECT count(*) AS random
- FROM onek WHERE random() < 1.0/10;
-
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
-
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
-
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
-
--- now test that they are different counts
-SELECT random, count(random) FROM RANDOM_TBL
- GROUP BY random HAVING count(random) > 3;
-
-SELECT AVG(random) FROM RANDOM_TBL
- HAVING AVG(random) NOT BETWEEN 80 AND 120;
+-- There should be no duplicates in 1000 random() values.
+-- (Assuming 52 random bits in the float8 results, we could
+-- take as many as 3000 values and still have less than 1e-9 chance
+-- of failure, per https://en.wikipedia.org/wiki/Birthday_problem)
+SELECT r, count(*)
+FROM (SELECT random() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
+
+-- The range should be [0, 1). We can expect that at least one out of 2000
+-- random values is in the lowest or highest 1% of the range with failure
+-- probability less than about 1e-9.
+
+SELECT count(*) FILTER (WHERE r < 0 OR r >= 1) AS out_of_range,
+ (count(*) FILTER (WHERE r < 0.01)) > 0 AS has_small,
+ (count(*) FILTER (WHERE r > 0.99)) > 0 AS has_large
+FROM (SELECT random() r FROM generate_series(1, 2000)) ss;
-- now test random_normal()
-TRUNCATE random_tbl;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(0, 1) < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(0) < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal() < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0;
+-- As above, there should be no duplicates in 1000 random_normal() values.
+SELECT r, count(*)
+FROM (SELECT random_normal() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
--- expect similar, but not identical values
-SELECT random, count(random) FROM random_tbl
- GROUP BY random HAVING count(random) > 3;
+-- ... unless we force the range (standard deviation) to zero.
+-- This is a good place to check that the mean input does something, too.
+SELECT r, count(*)
+FROM (SELECT random_normal(10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
+SELECT r, count(*)
+FROM (SELECT random_normal(-10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
--- approximately check expected distribution
-SELECT AVG(random) FROM random_tbl
- HAVING AVG(random) NOT BETWEEN 400 AND 600;
+-- setseed() should produce a reproducible series of numbers.
+
+SELECT setseed(0.5);
+
+SELECT random() FROM generate_series(1, 10);
+SELECT random_normal() FROM generate_series(1, 10);
+SELECT random_normal(mean => 1, stddev => 0.1) r FROM generate_series(1, 10);
On Mon, 9 Jan 2023 at 18:52, Tom Lane <tgl@sss.pgh.pa.us> wrote:
I pushed Paul's patch with the previously-discussed tests, but
the more I look at random.sql the less I like it. I propose
that we nuke the existing tests from orbit and replace with
something more or less as attached.
Looks like a definite improvement.
(We could probably go further
than this, like trying to verify distribution properties. But
it's been too long since college statistics for me to want to
write such tests myself, and I'm not real sure we need them.)
I played around with the Kolmogorov–Smirnov test, to test random() for
uniformity. The following passes roughly 99.9% of the time:
CREATE OR REPLACE FUNCTION ks_test_uniform_random()
RETURNS boolean AS
$$
DECLARE
n int := 1000; -- Number of samples
c float8 := 1.94947; -- Critical value for 99.9% confidence
/* c float8 := 1.62762; -- Critical value for 99% confidence */
/* c float8 := 1.22385; -- Critical value for 90% confidence */
ok boolean;
BEGIN
ok := (
WITH samples AS (
SELECT random() r FROM generate_series(1, n) ORDER BY 1
), indexed_samples AS (
SELECT (row_number() OVER())-1.0 i, r FROM samples
)
SELECT max(abs(i/n-r)) < c / sqrt(n) FROM indexed_samples
);
RETURN ok;
END
$$
LANGUAGE plpgsql;
and is very fast. So that gives decent confidence that random() is
indeed uniform.
With a one-in-a-thousand chance of failing, if you wanted something
with around a one-in-a-billion chance of failing, you could just try
it 3 times:
SELECT ks_test_uniform_random() OR
ks_test_uniform_random() OR
ks_test_uniform_random();
but it feels pretty hacky, and probably not really necessary.
Rigorous tests for other distributions are harder, but also probably
not necessary if we have confidence that the underlying PRNG is
uniform.
BTW, if this does bring the probability of failure down to the
one-in-a-billion range, I think we could also nuke the whole
"ignore:" business, simplifying pg_regress and allowing the
random test to be run in parallel with others.
I didn't check the one-in-a-billion claim, but +1 for that.
Regards,
Dean
Dean Rasheed <dean.a.rasheed@gmail.com> writes:
On Mon, 9 Jan 2023 at 18:52, Tom Lane <tgl@sss.pgh.pa.us> wrote:
(We could probably go further
than this, like trying to verify distribution properties. But
it's been too long since college statistics for me to want to
write such tests myself, and I'm not real sure we need them.)
I played around with the Kolmogorov–Smirnov test, to test random() for
uniformity. The following passes roughly 99.9% of the time:
Ah, cool, thanks for this code.
With a one-in-a-thousand chance of failing, if you wanted something
with around a one-in-a-billion chance of failing, you could just try
it 3 times:
SELECT ks_test_uniform_random() OR
ks_test_uniform_random() OR
ks_test_uniform_random();
but it feels pretty hacky, and probably not really necessary.
That seems like a good way, because I'm not satisfied with
one-in-a-thousand odds if we want to remove the "ignore" marker.
It's still plenty fast enough: on my machine, the v2 patch below
takes about 19ms, versus 22ms for the script as it stands in HEAD.
Rigorous tests for other distributions are harder, but also probably
not necessary if we have confidence that the underlying PRNG is
uniform.
Agreed.
BTW, if this does bring the probability of failure down to the
one-in-a-billion range, I think we could also nuke the whole
"ignore:" business, simplifying pg_regress and allowing the
random test to be run in parallel with others.
I didn't check the one-in-a-billion claim, but +1 for that.
I realized that we do already run random in a parallel group;
the "ignore: random" line in parallel_schedule just marks it
as failure-ignorable, it doesn't schedule it. (The comment is a
bit misleading about this, but I want to remove that not rewrite it.)
Nonetheless, nuking the whole ignore-failures mechanism seems like
good cleanup to me.
Also, I tried this on some 32-bit big-endian hardware (NetBSD on macppc)
to verify my thesis that the results of random() are now machine
independent. That part works, but the random_normal() tests didn't;
I saw low-order-bit differences from the results on x86_64 Linux.
Presumably, that's because one or more of sqrt()/log()/sin() are
rounding off a bit differently there. v2 attached deals with this by
backing off to "extra_float_digits = 0" for that test. Once it hits the
buildfarm we might find we have to reduce extra_float_digits some more,
but that was enough to make NetBSD/macppc happy.
regards, tom lane
Attachments:
improve-random-tests-2.patchtext/x-diff; charset=UTF-8; name=improve-random-tests-2.patchDownload
diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out
index 30bd866138..8785c88ad2 100644
--- a/src/test/regress/expected/random.out
+++ b/src/test/regress/expected/random.out
@@ -1,81 +1,146 @@
--
-- RANDOM
--- Test the random function
+-- Test random() and allies
--
--- count the number of tuples originally, should be 1000
-SELECT count(*) FROM onek;
- count
--------
- 1000
-(1 row)
-
--- pick three random rows, they shouldn't match
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1);
- random
---------
+-- Tests in this file may have a small probability of failure,
+-- since we are dealing with randomness. Try to keep the failure
+-- risk for any one test case under 1e-9.
+--
+-- There should be no duplicates in 1000 random() values.
+-- (Assuming 52 random bits in the float8 results, we could
+-- take as many as 3000 values and still have less than 1e-9 chance
+-- of failure, per https://en.wikipedia.org/wiki/Birthday_problem)
+SELECT r, count(*)
+FROM (SELECT random() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
+ r | count
+---+-------
(0 rows)
--- count roughly 1/10 of the tuples
-CREATE TABLE RANDOM_TBL AS
- SELECT count(*) AS random
- FROM onek WHERE random() < 1.0/10;
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
--- now test that they are different counts
-SELECT random, count(random) FROM RANDOM_TBL
- GROUP BY random HAVING count(random) > 3;
- random | count
---------+-------
-(0 rows)
+-- The range should be [0, 1). We can expect that at least one out of 2000
+-- random values is in the lowest or highest 1% of the range with failure
+-- probability less than about 1e-9.
+SELECT count(*) FILTER (WHERE r < 0 OR r >= 1) AS out_of_range,
+ (count(*) FILTER (WHERE r < 0.01)) > 0 AS has_small,
+ (count(*) FILTER (WHERE r > 0.99)) > 0 AS has_large
+FROM (SELECT random() r FROM generate_series(1, 2000)) ss;
+ out_of_range | has_small | has_large
+--------------+-----------+-----------
+ 0 | t | t
+(1 row)
-SELECT AVG(random) FROM RANDOM_TBL
- HAVING AVG(random) NOT BETWEEN 80 AND 120;
- avg
------
-(0 rows)
+-- Check for uniform distribution using the Kolmogorov–Smirnov test.
+CREATE FUNCTION ks_test_uniform_random()
+RETURNS boolean AS
+$$
+DECLARE
+ n int := 1000; -- Number of samples
+ c float8 := 1.94947; -- Critical value for 99.9% confidence
+ ok boolean;
+BEGIN
+ ok := (
+ WITH samples AS (
+ SELECT random() r FROM generate_series(1, n) ORDER BY 1
+ ), indexed_samples AS (
+ SELECT (row_number() OVER())-1.0 i, r FROM samples
+ )
+ SELECT max(abs(i/n-r)) < c / sqrt(n) FROM indexed_samples
+ );
+ RETURN ok;
+END
+$$
+LANGUAGE plpgsql;
+-- As written, ks_test_uniform_random() returns true about 99.9%
+-- of the time. To get down to a roughly 1e-9 test failure rate,
+-- just run it 3 times and accept if any one of them passes.
+SELECT ks_test_uniform_random() OR
+ ks_test_uniform_random() OR
+ ks_test_uniform_random() AS uniform;
+ uniform
+---------
+ t
+(1 row)
-- now test random_normal()
-TRUNCATE random_tbl;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(0, 1) < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(0) < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal() < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0;
--- expect similar, but not identical values
-SELECT random, count(random) FROM random_tbl
- GROUP BY random HAVING count(random) > 3;
- random | count
---------+-------
+-- As above, there should be no duplicates in 1000 random_normal() values.
+SELECT r, count(*)
+FROM (SELECT random_normal() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
+ r | count
+---+-------
(0 rows)
--- approximately check expected distribution
-SELECT AVG(random) FROM random_tbl
- HAVING AVG(random) NOT BETWEEN 400 AND 600;
- avg
------
-(0 rows)
+-- ... unless we force the range (standard deviation) to zero.
+-- This is a good place to check that the mean input does something, too.
+SELECT r, count(*)
+FROM (SELECT random_normal(10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
+ r | count
+----+-------
+ 10 | 100
+(1 row)
+
+SELECT r, count(*)
+FROM (SELECT random_normal(-10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
+ r | count
+-----+-------
+ -10 | 100
+(1 row)
+
+-- setseed() should produce a reproducible series of random() values.
+SELECT setseed(0.5);
+ setseed
+---------
+
+(1 row)
+
+SELECT random() FROM generate_series(1, 10);
+ random
+---------------------
+ 0.9851677175347999
+ 0.825301858027981
+ 0.12974610012450416
+ 0.16356291958601088
+ 0.6476186144084
+ 0.8822771983038762
+ 0.1404566845227775
+ 0.15619865764623442
+ 0.5145227426983392
+ 0.7712969548127826
+(10 rows)
+
+-- Likewise for random_normal(); however, since its implementation relies
+-- on libm functions that have different roundoff behaviors on different
+-- machines, we have to round off the results a bit to get consistent output.
+SET extra_float_digits = 0;
+SELECT random_normal() FROM generate_series(1, 10);
+ random_normal
+--------------------
+ 0.208534644938377
+ 0.264530240540963
+ -0.606752467900428
+ 0.825799427852654
+ 1.70111611735357
+ -0.223445463716189
+ 0.249712419190998
+ -1.2494722990669
+ 0.125627152043677
+ 0.475391614544013
+(10 rows)
+
+SELECT random_normal(mean => 1, stddev => 0.1) r FROM generate_series(1, 10);
+ r
+-------------------
+ 1.00605972811732
+ 1.09685453015002
+ 1.02869206132007
+ 0.909475676712336
+ 0.983724763134265
+ 0.939344549577623
+ 1.18713500206363
+ 0.962257684292933
+ 0.914441206800407
+ 0.964031055575433
+(10 rows)
diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql
index 3104af46b7..ce1cc37176 100644
--- a/src/test/regress/sql/random.sql
+++ b/src/test/regress/sql/random.sql
@@ -1,68 +1,85 @@
--
-- RANDOM
--- Test the random function
+-- Test random() and allies
+--
+-- Tests in this file may have a small probability of failure,
+-- since we are dealing with randomness. Try to keep the failure
+-- risk for any one test case under 1e-9.
--
--- count the number of tuples originally, should be 1000
-SELECT count(*) FROM onek;
-
--- pick three random rows, they shouldn't match
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
- FROM onek ORDER BY random() LIMIT 1);
-
--- count roughly 1/10 of the tuples
-CREATE TABLE RANDOM_TBL AS
- SELECT count(*) AS random
- FROM onek WHERE random() < 1.0/10;
-
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
-
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
-
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
- SELECT count(*)
- FROM onek WHERE random() < 1.0/10;
-
--- now test that they are different counts
-SELECT random, count(random) FROM RANDOM_TBL
- GROUP BY random HAVING count(random) > 3;
-
-SELECT AVG(random) FROM RANDOM_TBL
- HAVING AVG(random) NOT BETWEEN 80 AND 120;
+-- There should be no duplicates in 1000 random() values.
+-- (Assuming 52 random bits in the float8 results, we could
+-- take as many as 3000 values and still have less than 1e-9 chance
+-- of failure, per https://en.wikipedia.org/wiki/Birthday_problem)
+SELECT r, count(*)
+FROM (SELECT random() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
+
+-- The range should be [0, 1). We can expect that at least one out of 2000
+-- random values is in the lowest or highest 1% of the range with failure
+-- probability less than about 1e-9.
+
+SELECT count(*) FILTER (WHERE r < 0 OR r >= 1) AS out_of_range,
+ (count(*) FILTER (WHERE r < 0.01)) > 0 AS has_small,
+ (count(*) FILTER (WHERE r > 0.99)) > 0 AS has_large
+FROM (SELECT random() r FROM generate_series(1, 2000)) ss;
+
+-- Check for uniform distribution using the Kolmogorov–Smirnov test.
+
+CREATE FUNCTION ks_test_uniform_random()
+RETURNS boolean AS
+$$
+DECLARE
+ n int := 1000; -- Number of samples
+ c float8 := 1.94947; -- Critical value for 99.9% confidence
+ ok boolean;
+BEGIN
+ ok := (
+ WITH samples AS (
+ SELECT random() r FROM generate_series(1, n) ORDER BY 1
+ ), indexed_samples AS (
+ SELECT (row_number() OVER())-1.0 i, r FROM samples
+ )
+ SELECT max(abs(i/n-r)) < c / sqrt(n) FROM indexed_samples
+ );
+ RETURN ok;
+END
+$$
+LANGUAGE plpgsql;
+
+-- As written, ks_test_uniform_random() returns true about 99.9%
+-- of the time. To get down to a roughly 1e-9 test failure rate,
+-- just run it 3 times and accept if any one of them passes.
+SELECT ks_test_uniform_random() OR
+ ks_test_uniform_random() OR
+ ks_test_uniform_random() AS uniform;
-- now test random_normal()
-TRUNCATE random_tbl;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(0, 1) < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(0) < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal() < 0;
-INSERT INTO random_tbl (random)
- SELECT count(*)
- FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0;
+-- As above, there should be no duplicates in 1000 random_normal() values.
+SELECT r, count(*)
+FROM (SELECT random_normal() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
--- expect similar, but not identical values
-SELECT random, count(random) FROM random_tbl
- GROUP BY random HAVING count(random) > 3;
+-- ... unless we force the range (standard deviation) to zero.
+-- This is a good place to check that the mean input does something, too.
+SELECT r, count(*)
+FROM (SELECT random_normal(10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
+SELECT r, count(*)
+FROM (SELECT random_normal(-10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
--- approximately check expected distribution
-SELECT AVG(random) FROM random_tbl
- HAVING AVG(random) NOT BETWEEN 400 AND 600;
+-- setseed() should produce a reproducible series of random() values.
+
+SELECT setseed(0.5);
+
+SELECT random() FROM generate_series(1, 10);
+
+-- Likewise for random_normal(); however, since its implementation relies
+-- on libm functions that have different roundoff behaviors on different
+-- machines, we have to round off the results a bit to get consistent output.
+SET extra_float_digits = 0;
+
+SELECT random_normal() FROM generate_series(1, 10);
+SELECT random_normal(mean => 1, stddev => 0.1) r FROM generate_series(1, 10);
On Mon, 9 Jan 2023 at 23:38, Tom Lane <tgl@sss.pgh.pa.us> wrote:
I tried this on some 32-bit big-endian hardware (NetBSD on macppc)
to verify my thesis that the results of random() are now machine
independent. That part works, but the random_normal() tests didn't;
Ah yes, I wondered about that.
I saw low-order-bit differences from the results on x86_64 Linux.
Presumably, that's because one or more of sqrt()/log()/sin() are
rounding off a bit differently there. v2 attached deals with this by
backing off to "extra_float_digits = 0" for that test.
Makes sense.
I double-checked the one-in-a-billion claim, and it looks about right
for each test.
The one I wasn't sure about was the chance of duplicates for
random_normal(). Analysing it more closely, it actually has a smaller
chance of duplicates, since the difference between 2 standard normal
distributions is another normal distribution with a standard deviation
of sqrt(2), and so the probability of a pair of random_normal()'s
being the same is about 2*sqrt(pi) ~ 3.5449 times lower than for
random(). So you can call random_normal() around 5600 times (rather
than 3000 times) before having a 1e-9 chance of duplicates. So, as
with the random() duplicates test, the probability of failure with
just 1000 values should be well below 1e-9. Intuitively, that was
always going to be true, but I wanted to know the details.
The rest looks good to me, except there's a random non-ASCII character
instead of a hyphen in "Kolmogorov-Smirnov" (because I copy-pasted the
name from some random website).
Regards,
Dean
On Tue, 10 Jan 2023 at 08:33, Dean Rasheed <dean.a.rasheed@gmail.com> wrote:
The rest looks good to me, except there's a random non-ASCII character
instead of a hyphen in "Kolmogorov-Smirnov" (because I copy-pasted the
name from some random website).
Oh, never mind. I see you already fixed that.
I should finish reading all my mail before hitting reply.
Regards,
Dean
Dean Rasheed <dean.a.rasheed@gmail.com> writes:
I double-checked the one-in-a-billion claim, and it looks about right
for each test.
Thanks for double-checking my arithmetic.
The rest looks good to me, except there's a random non-ASCII character
instead of a hyphen in "Kolmogorov-Smirnov" (because I copy-pasted the
name from some random website).
Yeah, I caught that before committing.
The AIX buildfarm members were still showing low-order diffs in the
random_normal results at extra_float_digits = 0, but they seem OK
after reducing it to -1.
regards, tom lane
On Tue, Jan 10, 2023 at 6:34 AM Tom Lane <tgl@sss.pgh.pa.us> wrote:
Dean Rasheed <dean.a.rasheed@gmail.com> writes:
I double-checked the one-in-a-billion claim, and it looks about right
for each test.Thanks for double-checking my arithmetic.
The rest looks good to me, except there's a random non-ASCII character
instead of a hyphen in "Kolmogorov-Smirnov" (because I copy-pasted the
name from some random website).Yeah, I caught that before committing.
The AIX buildfarm members were still showing low-order diffs in the
random_normal results at extra_float_digits = 0, but they seem OK
after reducing it to -1.
I should leave the country more often... thanks for cleaning up my
patch and committing it Tom! It's a Christmas miracle (at least, for
me :)
P.
On 1/9/23 23:52, Tom Lane wrote:
BTW, if this does bring the probability of failure down to the
one-in-a-billion range, I think we could also nuke the whole
"ignore:" business, simplifying pg_regress and allowing the
random test to be run in parallel with others.
With 'ignore' option we get used to cover by tests some of the time
dependent features, such as "statement_timeout",
"idle_in_transaction_session_timeout", usage of user timeouts in
extensions and so on.
We have used the pg_sleep() function to interrupt a query at certain
execution phase. But on some platforms, especially in containers, the
query can vary execution time in so widely that the pg_sleep() timeout,
required to get rid of dependency on a query execution time, has become
unacceptable. So, the "ignore" option was the best choice.
For Now, Do we only have the "isolation tests" option to create stable
execution time-dependent tests now? Or I'm not aware about some test
machinery?
--
Regards
Andrey Lepikhov
Postgres Professional
Andrey Lepikhov <a.lepikhov@postgrespro.ru> writes:
On 1/9/23 23:52, Tom Lane wrote:
BTW, if this does bring the probability of failure down to the
one-in-a-billion range, I think we could also nuke the whole
"ignore:" business, simplifying pg_regress and allowing the
random test to be run in parallel with others.
We have used the pg_sleep() function to interrupt a query at certain
execution phase. But on some platforms, especially in containers, the
query can vary execution time in so widely that the pg_sleep() timeout,
required to get rid of dependency on a query execution time, has become
unacceptable. So, the "ignore" option was the best choice.
But does such a test have any actual value? If your test infrastructure
ignores the result, what makes you think you'd notice if the test did
indeed detect a problem?
I think "ignore:" was a kluge we put in twenty-plus years ago when our
testing standards were a lot lower, and it's way past time we got
rid of it.
regards, tom lane
On 1/19/23 11:01, Tom Lane wrote:
Andrey Lepikhov <a.lepikhov@postgrespro.ru> writes:
On 1/9/23 23:52, Tom Lane wrote:
BTW, if this does bring the probability of failure down to the
one-in-a-billion range, I think we could also nuke the whole
"ignore:" business, simplifying pg_regress and allowing the
random test to be run in parallel with others.We have used the pg_sleep() function to interrupt a query at certain
execution phase. But on some platforms, especially in containers, the
query can vary execution time in so widely that the pg_sleep() timeout,
required to get rid of dependency on a query execution time, has become
unacceptable. So, the "ignore" option was the best choice.But does such a test have any actual value? If your test infrastructure
ignores the result, what makes you think you'd notice if the test did
indeed detect a problem?
Yes, it is good to catch SEGFAULTs and assertions which may be frequent
because of a logic complexity in the case of timeouts.
I think "ignore:" was a kluge we put in twenty-plus years ago when our
testing standards were a lot lower, and it's way past time we got
rid of it.
Ok, I will try to invent alternative way for deep (and stable) testing
of timeouts. Thank you for the answer.
--
Regards
Andrey Lepikhov
Postgres Professional
Hello old thread.
On Jan 19, 2023, at 01:01, Tom Lane <tgl@sss.pgh.pa.us> wrote:
I think "ignore:" was a kluge we put in twenty-plus years ago when our
testing standards were a lot lower, and it's way past time we got
rid of it.
Today I discovered a bad regex in the pgTAP Makefile that matched 18.0 as an 8.0 release and used it `ignore:` a test requiring RLS. I found it after I saw this test failure on Postgres 18:
# syntax error in schedule file "test/build/run.sch" line 3: ignore: policy
I fixed the regex (removed any detection of pre-9.0 matching, in fact), so all good there. But I wanted to point out that `ignore:` is documented in the wiki[1]https://wiki.postgresql.org/wiki/Regression_test_authoring, so other extensions may use it. If no one else notices perhaps it’s not a big deal, but I want to call out that pg_regress features, “documented” or not, might be used outside the core.
Maybe the proper way to address this issue is to add formal docs for pg_regress. I always have the damnedest time finding its docs and usually end up on the Wiki. If there was official documentation for it like there is for `psql`, etc., it might help to prevent such issues in the future. Minor as they may be, admittedly.
Thoughts?
Best,
David
[1]: https://wiki.postgresql.org/wiki/Regression_test_authoring
"David E. Wheeler" <david@justatheory.com> writes:
Hello old thread.
On Jan 19, 2023, at 01:01, Tom Lane <tgl@sss.pgh.pa.us> wrote:I think "ignore:" was a kluge we put in twenty-plus years ago when our
testing standards were a lot lower, and it's way past time we got
rid of it.
I fixed the regex (removed any detection of pre-9.0 matching, in fact), so all good there. But I wanted to point out that `ignore:` is documented in the wiki[1], so other extensions may use it. If no one else notices perhaps it’s not a big deal, but I want to call out that pg_regress features, “documented” or not, might be used outside the core.
I didn't know about that page, but I'm happy to go remove its
mention of "ignore:" ...
regards, tom lane
I wrote:
I didn't know about that page, but I'm happy to go remove its
mention of "ignore:" ...
Actually, there is a *ton* of obsolete information on that page:
there's no serial_schedule file anymore, and we don't have
dynamically-generated test files anymore (input/output subdirectories)
and therefore there's no substituted variables. The age of the whole
thing can be dated by its references to CVS :-(
Maybe we should indeed try to replace it with something in the main
docs, which are a bit more likely to get maintained. There is an
existing chapter "Regression Tests", but it presently only talks
about how to run the tests not how to write new ones.
regards, tom lane
On Sat, Oct 04, 2025 at 02:07:45PM -0400, Tom Lane wrote:
Actually, there is a *ton* of obsolete information on that page:
there's no serial_schedule file anymore, and we don't have
dynamically-generated test files anymore (input/output subdirectories)
and therefore there's no substituted variables. The age of the whole
thing can be dated by its references to CVS :-(Maybe we should indeed try to replace it with something in the main
docs, which are a bit more likely to get maintained. There is an
existing chapter "Regression Tests", but it presently only talks
about how to run the tests not how to write new ones.
Yes, most of the content of this page could be deleted.
IMO, the main docs would benefit greatly from the addition of
PG_ABS_BUILDDIR, PG_ABS_BUILDDIR, PG_LIBDIR and PG_DLSUFFIX, which are
still missing. These are available to extension developers through
installcheck, and are a great way to write more flexible tests with
arbitrary inputs, like a dependency to on-disk files. I would
certainly +1 this addition, at least.
--
Michael
On Oct 6, 2025, at 06:30, Michael Paquier <michael@paquier.xyz> wrote:
IMO, the main docs would benefit greatly from the addition of
PG_ABS_BUILDDIR, PG_ABS_BUILDDIR, PG_LIBDIR and PG_DLSUFFIX, which are
still missing. These are available to extension developers through
installcheck, and are a great way to write more flexible tests with
arbitrary inputs, like a dependency to on-disk files. I would
certainly +1 this addition, at least.
I’ve never heard of these! Seems like there’s an awful lot of stuff in the testing infrastructure that would be useful to document.
D