Weighted Stats
Folks,
I'd like to add weighted statistics to PostgreSQL. While the included
weighted_avg() is trivial to calculate using existing machinery, the
included weighted_stddev_*() functions are not.
I've only done the float8 versions, but if we decide to move forward,
I'd be delighted to add the rest of the numeric types and maybe others
as make sense.
What say?
Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
Attachments:
weighted_stats_001.difftext/plain; charset=us-asciiDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 4d482ec..2174594 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -12443,6 +12443,29 @@ NULL baz</literallayout>(3 rows)</entry>
<row>
<entry>
<indexterm>
+ <primary>weighted_average</primary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_avg</primary>
+ </indexterm>
+ <function>weighted_avg(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, <type>numeric</type>, or <type>interval</type>
+ </entry>
+ <entry>
+ <type>numeric</type> for any integer-type argument,
+ <type>double precision</type> for a floating-point argument,
+ otherwise the same as the argument data type
+ </entry>
+ <entry>the average (arithmetic mean) of all input values, weighted by the input weights</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>bit_and</primary>
</indexterm>
<function>bit_and(<replaceable class="parameter">expression</replaceable>)</function>
@@ -13086,6 +13109,29 @@ SELECT xmlagg(x) FROM (SELECT x FROM test ORDER BY y DESC) AS tab;
<row>
<entry>
<indexterm>
+ <primary>weighted standard deviation</primary>
+ <secondary>population</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_stddev_pop</primary>
+ </indexterm>
+ <function>weighted_stddev_pop(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, or <type>numeric</type>
+ </entry>
+ <entry>
+ <type>double precision</type> for floating-point arguments,
+ otherwise <type>numeric</type>
+ </entry>
+ <entry>weighted population standard deviation of the input values</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>standard deviation</primary>
<secondary>sample</secondary>
</indexterm>
@@ -13109,6 +13155,29 @@ SELECT xmlagg(x) FROM (SELECT x FROM test ORDER BY y DESC) AS tab;
<row>
<entry>
<indexterm>
+ <primary>weighted standard deviation</primary>
+ <secondary>sample</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_stddev_samp</primary>
+ </indexterm>
+ <function>weighted_stddev_samp(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, or <type>numeric</type>
+ </entry>
+ <entry>
+ <type>double precision</type> for floating-point arguments,
+ otherwise <type>numeric</type>
+ </entry>
+ <entry>weighted sample standard deviation of the input values</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>variance</primary>
</indexterm>
<function>variance</function>(<replaceable class="parameter">expression</replaceable>)
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index 4e927d8..533ce0a 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -1774,6 +1774,7 @@ setseed(PG_FUNCTION_ARGS)
* float8_accum - accumulate for AVG(), variance aggregates, etc.
* float4_accum - same, but input data is float4
* float8_avg - produce final result for float AVG()
+ * float8_weighted_avg - produce final result for float WEIGHTED_AVG()
* float8_var_samp - produce final result for float VAR_SAMP()
* float8_var_pop - produce final result for float VAR_POP()
* float8_stddev_samp - produce final result for float STDDEV_SAMP()
@@ -1929,6 +1930,28 @@ float8_avg(PG_FUNCTION_ARGS)
}
Datum
+float8_weighted_avg(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ sumWX,
+ sumW;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_avg", 6);
+ N = transvalues[0];
+ sumW = transvalues[1];
+ sumWX = transvalues[5];
+
+ if (N < 1.0)
+ PG_RETURN_NULL();
+
+ CHECKFLOATVAL(N, isinf(1.0/sumW) || isinf(sumWX), true);
+
+ PG_RETURN_FLOAT8(sumWX/sumW);
+}
+
+Datum
float8_var_pop(PG_FUNCTION_ARGS)
{
ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
@@ -2467,6 +2490,119 @@ float8_regr_intercept(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(numeratorXXY / numeratorX);
}
+/*
+ * ===================
+ * WEIGHTED AGGREGATES
+ * ===================
+ *
+ * The transition datatype for these aggregates is a 4-element array
+ * of float8, holding the values N, sum(W), sum(W*X), and sum(W*X*X)
+ * in that order.
+ *
+ * First, an accumulator function for those we can't pirate from the
+ * other accumulators. This accumulator function takes out some of
+ * the rounding error inherent in the general one.
+ * https://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
+ *
+ * It consists of a four-element array which includes:
+ *
+ * N, the number of non-zero-weighted values seen thus far,
+ * W, the running sum of weights,
+ * A, an intermediate value used in the calculation, and
+ * Q, another intermediate value.
+ *
+ */
+Datum
+float8_weighted_accum(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 newvalX = PG_GETARG_FLOAT8(1);
+ float8 newvalW = PG_GETARG_FLOAT8(2);
+ float8 *transvalues;
+ float8 N,
+ W,
+ A,
+ Q;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_stddev_accum", 4);
+
+ if (newvalW <= 0.0) /* We only care about positive weights */
+ PG_RETURN_NULL();
+
+ N = transvalues[0];
+ W = transvalues[1];
+ A = transvalues[2];
+ Q = transvalues[3];
+
+ N += 1.0;
+ CHECKFLOATVAL(N, isinf(transvalues[0]), true);
+ W += newvalW;
+ CHECKFLOATVAL(W, isinf(transvalues[1]) || isinf(newvalW), true);
+ A += newvalW * ( newvalX - transvalues[2] ) / W;
+ CHECKFLOATVAL(A, isinf(newvalW) || isinf(transvalues[2]) || isinf(1.0/W), true);
+ Q += newvalW * (newvalX - transvalues[2]) * (newvalX - A);
+ CHECKFLOATVAL(A, isinf(newvalX - transvalues[3]) || isinf(newvalX - A) || isinf(1.0/W), true);
+
+ if (AggCheckCallContext(fcinfo, NULL)) /* Update in place is safe in Agg context */
+ {
+ transvalues[0] = N;
+ transvalues[1] = W;
+ transvalues[2] = A;
+ transvalues[3] = Q;
+
+ PG_RETURN_ARRAYTYPE_P(transarray);
+ }
+ else /* You do not need to call this directly. */
+ ereport(ERROR,
+ (errmsg("float8_weighted_accum called outside agg context")));
+}
+
+Datum
+float8_weighted_stddev_samp(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ W,
+ /* Skip A. Not used in the calculation */
+ Q;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_stddev_samp", 4);
+ N = transvalues[0];
+ W = transvalues[1];
+ Q = transvalues[3];
+
+ if (N < 2.0) /* Must have at least two samples to get a stddev */
+ PG_RETURN_NULL();
+
+ PG_RETURN_FLOAT8(
+ sqrt(
+ N * Q /
+ ( (N-1) * W )
+ )
+ );
+}
+
+Datum
+float8_weighted_stddev_pop(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ W,
+ /* Skip A. Not used in the calculation */
+ Q;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_stddev_pop", 4);
+ N = transvalues[0];
+ W = transvalues[1];
+ Q = transvalues[3];
+
+ if (N < 2.0) /* Must have at least two samples to get a stddev */
+ PG_RETURN_NULL();
+
+ PG_RETURN_FLOAT8( sqrt( Q / W ) );
+}
/*
* ====================================
diff --git a/src/include/catalog/pg_aggregate.h b/src/include/catalog/pg_aggregate.h
index dd6079f..6d2f9d4 100644
--- a/src/include/catalog/pg_aggregate.h
+++ b/src/include/catalog/pg_aggregate.h
@@ -133,6 +133,7 @@ DATA(insert ( 2103 n 0 numeric_avg_accum numeric_avg numeric_avg_accum numeric_a
DATA(insert ( 2104 n 0 float4_accum float8_avg - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2105 n 0 float8_accum float8_avg - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2106 n 0 interval_accum interval_avg interval_accum interval_accum_inv interval_avg f f 0 1187 0 1187 0 "{0 second,0 second}" "{0 second,0 second}" ));
+DATA(insert ( 3998 n 0 float8_regr_accum float8_weighted_avg - - - f f 0 1022 0 0 0 "{0,0,0,0,0,0}" _null_ ));
/* sum */
DATA(insert ( 2107 n 0 int8_avg_accum numeric_poly_sum int8_avg_accum int8_avg_accum_inv numeric_poly_sum f f 0 2281 48 2281 48 _null_ _null_ ));
@@ -225,6 +226,7 @@ DATA(insert ( 2726 n 0 int2_accum numeric_poly_stddev_pop int2_accum int2_accum_
DATA(insert ( 2727 n 0 float4_accum float8_stddev_pop - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2728 n 0 float8_accum float8_stddev_pop - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2729 n 0 numeric_accum numeric_stddev_pop numeric_accum numeric_accum_inv numeric_stddev_pop f f 0 2281 128 2281 128 _null_ _null_ ));
+DATA(insert ( 4066 n 0 float8_weighted_accum float8_weighted_stddev_pop - - - f f 0 1022 0 0 0 "{0,0,0,0}" _null_ ));
/* stddev_samp */
DATA(insert ( 2712 n 0 int8_accum numeric_stddev_samp int8_accum int8_accum_inv numeric_stddev_samp f f 0 2281 128 2281 128 _null_ _null_ ));
@@ -232,6 +234,7 @@ DATA(insert ( 2713 n 0 int4_accum numeric_poly_stddev_samp int4_accum int4_accum
DATA(insert ( 2714 n 0 int2_accum numeric_poly_stddev_samp int2_accum int2_accum_inv numeric_poly_stddev_samp f f 0 2281 48 2281 48 _null_ _null_ ));
DATA(insert ( 2715 n 0 float4_accum float8_stddev_samp - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2716 n 0 float8_accum float8_stddev_samp - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
+DATA(insert ( 4083 n 0 float8_weighted_accum float8_weighted_stddev_samp - - - f f 0 1022 0 0 0 "{0,0,0,0}" _null_ ));
DATA(insert ( 2717 n 0 numeric_accum numeric_stddev_samp numeric_accum numeric_accum_inv numeric_stddev_samp f f 0 2281 128 2281 128 _null_ _null_ ));
/* stddev: historical Postgres syntax for stddev_samp */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index f688454..83c4b64 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2502,6 +2502,12 @@ DESCR("join selectivity of case-insensitive regex non-match");
/* Aggregate-related functions */
DATA(insert OID = 1830 ( float8_avg PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_avg _null_ _null_ _null_ ));
DESCR("aggregate final function");
+DATA(insert OID = 3997 ( float8_weighted_avg PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_avg _null_ _null_ _null_ ));
+DESCR("aggregate final function");
+DATA(insert OID = 4099 ( float8_weighted_stddev_pop PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_stddev_pop _null_ _null_ _null_ ));
+DESCR("aggregate final function");
+DATA(insert OID = 4100 ( float8_weighted_stddev_samp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_stddev_samp _null_ _null_ _null_ ));
+DESCR("aggregate final function");
DATA(insert OID = 2512 ( float8_var_pop PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_var_pop _null_ _null_ _null_ ));
DESCR("aggregate final function");
DATA(insert OID = 1831 ( float8_var_samp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_var_samp _null_ _null_ _null_ ));
@@ -2585,6 +2591,8 @@ DATA(insert OID = 2805 ( int8inc_float8_float8 PGNSP PGUID 12 1 0 0 0 f f f f
DESCR("aggregate transition function");
DATA(insert OID = 2806 ( float8_regr_accum PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 1022 "1022 701 701" _null_ _null_ _null_ _null_ _null_ float8_regr_accum _null_ _null_ _null_ ));
DESCR("aggregate transition function");
+DATA(insert OID = 3999 ( float8_weighted_accum PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 1022 "1022 701 701" _null_ _null_ _null_ _null_ _null_ float8_weighted_accum _null_ _null_ _null_ ));
+DESCR("aggregate transition function");
DATA(insert OID = 2807 ( float8_regr_sxx PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_regr_sxx _null_ _null_ _null_ ));
DESCR("aggregate final function");
DATA(insert OID = 2808 ( float8_regr_syy PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_regr_syy _null_ _null_ _null_ ));
@@ -3229,6 +3237,8 @@ DATA(insert OID = 2104 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 701
DESCR("the average (arithmetic mean) as float8 of all float4 values");
DATA(insert OID = 2105 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 701 "701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("the average (arithmetic mean) as float8 of all float8 values");
+DATA(insert OID = 3998 ( weighted_avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("the weighted average (arithmetic mean) as float8 of all float8 values");
DATA(insert OID = 2106 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1186 "1186" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("the average (arithmetic mean) as interval of all interval values");
@@ -3389,6 +3399,8 @@ DATA(insert OID = 2728 ( stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0
DESCR("population standard deviation of float8 input values");
DATA(insert OID = 2729 ( stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "1700" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("population standard deviation of numeric input values");
+DATA(insert OID = 4066 ( weighted_stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("population weighted standard deviation of float8 input values");
DATA(insert OID = 2712 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "20" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("sample standard deviation of bigint input values");
@@ -3402,6 +3414,8 @@ DATA(insert OID = 2716 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1
DESCR("sample standard deviation of float8 input values");
DATA(insert OID = 2717 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "1700" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("sample standard deviation of numeric input values");
+DATA(insert OID = 4083 ( weighted_stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("sample weighted standard deviation of float8 input values");
DATA(insert OID = 2154 ( stddev PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "20" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("historical alias for stddev_samp");
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index fc1679e..333d538 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -413,8 +413,12 @@ extern Datum radians(PG_FUNCTION_ARGS);
extern Datum drandom(PG_FUNCTION_ARGS);
extern Datum setseed(PG_FUNCTION_ARGS);
extern Datum float8_accum(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_accum(PG_FUNCTION_ARGS);
extern Datum float4_accum(PG_FUNCTION_ARGS);
extern Datum float8_avg(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_avg(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_stddev_pop(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_stddev_samp(PG_FUNCTION_ARGS);
extern Datum float8_var_pop(PG_FUNCTION_ARGS);
extern Datum float8_var_samp(PG_FUNCTION_ARGS);
extern Datum float8_stddev_pop(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index de826b5..a19fd1d 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -247,6 +247,18 @@ SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
653.62895538751 | 871.505273850014
(1 row)
+SELECT weighted_avg(a, b) FROM aggtest;
+ weighted_avg
+------------------
+ 55.5553072763149
+(1 row)
+
+SELECT weighted_stddev_pop(a, b), weighted_stddev_samp(a, b) FROM aggtest;
+ weighted_stddev_pop | weighted_stddev_samp
+---------------------+----------------------
+ 24.3364627240769 | 28.1013266097382
+(1 row)
+
SELECT corr(b, a) FROM aggtest;
corr
-------------------
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 8d501dc..77b6102 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -60,6 +60,8 @@ SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
SELECT regr_r2(b, a) FROM aggtest;
SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
+SELECT weighted_avg(a, b) FROM aggtest;
+SELECT weighted_stddev_pop(a, b), weighted_stddev_samp(a, b) FROM aggtest;
SELECT corr(b, a) FROM aggtest;
SELECT count(four) AS cnt_1000 FROM onek;
On 11/2/15 5:46 PM, David Fetter wrote:
I'd like to add weighted statistics to PostgreSQL
Anything happen with this? If community isn't interested, ISTM it'd be
good to put this in PGXN.
--
Jim Nasby, Data Architect, Blue Treble Consulting, Austin TX
Experts in Analytics, Data Architecture and PostgreSQL
Data in Trouble? Get it in Treble! http://BlueTreble.com
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Sun, Dec 20, 2015 at 06:13:33PM -0600, Jim Nasby wrote:
On 11/2/15 5:46 PM, David Fetter wrote:
I'd like to add weighted statistics to PostgreSQL
Anything happen with this? If community isn't interested, ISTM it'd be good
to put this in PGXN.
I think it's already in PGXN as an extension, and I'll get another
version out this early this week, as it involves mostly adding some
tests.
I'll do the float8 ones for core this week, too, and unless there's a
really great reason to do more data types on the first pass, it should
be in committable shape.
Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Mon, Dec 21, 2015 at 1:50 PM, David Fetter <david@fetter.org> wrote:
On Sun, Dec 20, 2015 at 06:13:33PM -0600, Jim Nasby wrote:
On 11/2/15 5:46 PM, David Fetter wrote:
I'd like to add weighted statistics to PostgreSQL
Anything happen with this? If community isn't interested, ISTM it'd be good
to put this in PGXN.I think it's already in PGXN as an extension, and I'll get another
version out this early this week, as it involves mostly adding some
tests.I'll do the float8 ones for core this week, too, and unless there's a
really great reason to do more data types on the first pass, it should
be in committable shape.
I reviewed the patch, following are my observations.
1. + precision</type>, <type>numeric</type>, or <type>interval</type>
with interval type it is giving problem. As interval data type is not supported,
so remove it in the list of supported inputs.
postgres=# select weighted_avg(f7,f1) from tbl;
ERROR: function weighted_avg(interval, smallint) does not exist at character 8
HINT: No function matches the given name and argument types. You
might need to add explicit type casts.
2. +float8_weighted_avg(PG_FUNCTION_ARGS)
It will be helpful, if you provide some information as a function header,
how the weighted average is calculated similar like other weighted functions.
3. + transvalues = check_float8_array(transarray,
"float8_weighted_stddev_accum", 4);
The second parameter to check_float8_array should be "float8_weighted_accum".
4. There is an OID conflict of 4066 with latest master code.
5.+ A += newvalW * ( newvalX - transvalues[2] ) / W;
+ CHECKFLOATVAL(A, isinf(newvalW) || isinf(newvalX - transvalues[2])
|| isinf(1.0/W), true);
+ Q += newvalW * (newvalX - transvalues[2]) * (newvalX - A);
+ CHECKFLOATVAL(A, isinf(newvalX - transvalues[3]) || isinf(newvalX -
A) || isinf(1.0/W), true);
Is the need of calculation also needs to be passed to CHECKFLOATVAL?
Just passing
the variables involved in the calculation isn't enough? If expressions
are required then
it should be something as follows?
CHECKFLOATVAL(A, isinf(transvalues[2]) || isinf(newvalW) ||
isinf(newvalX - transvalues[2]) || isinf(1.0/W), true);
CHECKFLOATVAL(Q, isinf(transvalues[3]) || isinf(newvalX -
transvalues[2]) || isinf(newvalX - A) || isinf(1.0/W), true);
I verified the stddev transition and final function calculations
according to wikipedia
and they are fine.
Regards,
Hari Babu
Fujitsu Australia
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Fri, Jan 08, 2016 at 04:37:36PM +1100, Haribabu Kommi wrote:
On Mon, Dec 21, 2015 at 1:50 PM, David Fetter <david@fetter.org> wrote:
On Sun, Dec 20, 2015 at 06:13:33PM -0600, Jim Nasby wrote:
On 11/2/15 5:46 PM, David Fetter wrote:
I'd like to add weighted statistics to PostgreSQL
Anything happen with this? If community isn't interested, ISTM it'd be good
to put this in PGXN.I think it's already in PGXN as an extension, and I'll get another
version out this early this week, as it involves mostly adding some
tests.I'll do the float8 ones for core this week, too, and unless there's a
really great reason to do more data types on the first pass, it should
be in committable shape.I reviewed the patch, following are my observations.
1. + precision</type>, <type>numeric</type>, or <type>interval</type>
with interval type it is giving problem. As interval data type is not supported,
so remove it in the list of supported inputs.
I'd meant to add more, but will make sure that the next version
documents exactly the types it supports.
postgres=# select weighted_avg(f7,f1) from tbl;
ERROR: function weighted_avg(interval, smallint) does not exist at character 8
HINT: No function matches the given name and argument types. You
might need to add explicit type casts.2. +float8_weighted_avg(PG_FUNCTION_ARGS)
It will be helpful, if you provide some information as a function header,
how the weighted average is calculated similar like other weighted functions.
Will do.
3. + transvalues = check_float8_array(transarray,
"float8_weighted_stddev_accum", 4);The second parameter to check_float8_array should be "float8_weighted_accum".
Oops. Will fix.
4. There is an OID conflict of 4066 with latest master code.
Will fix.
5.+ A += newvalW * ( newvalX - transvalues[2] ) / W;
+ CHECKFLOATVAL(A, isinf(newvalW) || isinf(newvalX - transvalues[2])
|| isinf(1.0/W), true);+ Q += newvalW * (newvalX - transvalues[2]) * (newvalX - A); + CHECKFLOATVAL(A, isinf(newvalX - transvalues[3]) || isinf(newvalX - A) || isinf(1.0/W), true);Is the need of calculation also needs to be passed to CHECKFLOATVAL?
Just passing
the variables involved in the calculation isn't enough? If expressions
are required then
it should be something as follows?CHECKFLOATVAL(A, isinf(transvalues[2]) || isinf(newvalW) ||
isinf(newvalX - transvalues[2]) || isinf(1.0/W), true);CHECKFLOATVAL(Q, isinf(transvalues[3]) || isinf(newvalX -
transvalues[2]) || isinf(newvalX - A) || isinf(1.0/W), true);
Will fix.
I verified the stddev transition and final function calculations
according to wikipedia
and they are fine.
Thanks for reviewing this!
Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
I'm closing this for the current commitfest as returned-with-feedback.
Please resubmit for the 2016-03 CF once you have it.
Thanks!
--
�lvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Fri, Jan 08, 2016 at 04:37:36PM +1100, Haribabu Kommi wrote:
On Mon, Dec 21, 2015 at 1:50 PM, David Fetter <david@fetter.org> wrote:
On Sun, Dec 20, 2015 at 06:13:33PM -0600, Jim Nasby wrote:
On 11/2/15 5:46 PM, David Fetter wrote:
I'd like to add weighted statistics to PostgreSQL
Anything happen with this? If community isn't interested, ISTM it'd be good
to put this in PGXN.I think it's already in PGXN as an extension, and I'll get another
version out this early this week, as it involves mostly adding some
tests.I'll do the float8 ones for core this week, too, and unless there's a
really great reason to do more data types on the first pass, it should
be in committable shape.I reviewed the patch, following are my observations.
1. + precision</type>, <type>numeric</type>, or <type>interval</type>
with interval type it is giving problem. As interval data type is not supported,
so remove it in the list of supported inputs.
Done.
2. +float8_weighted_avg(PG_FUNCTION_ARGS)
It will be helpful, if you provide some information as a function header,
how the weighted average is calculated similar like other weighted functions.
Done.
3. + transvalues = check_float8_array(transarray,
"float8_weighted_stddev_accum", 4);The second parameter to check_float8_array should be "float8_weighted_accum".
Done.
4. There is an OID conflict of 4066 with latest master code.
Fixed.
5.+ A += newvalW * ( newvalX - transvalues[2] ) / W;
+ CHECKFLOATVAL(A, isinf(newvalW) || isinf(newvalX - transvalues[2])
|| isinf(1.0/W), true);+ Q += newvalW * (newvalX - transvalues[2]) * (newvalX - A); + CHECKFLOATVAL(A, isinf(newvalX - transvalues[3]) || isinf(newvalX - A) || isinf(1.0/W), true);Is the need of calculation also needs to be passed to CHECKFLOATVAL?
Just passing
the variables involved in the calculation isn't enough? If expressions
are required then
it should be something as follows?CHECKFLOATVAL(A, isinf(transvalues[2]) || isinf(newvalW) ||
isinf(newvalX - transvalues[2]) || isinf(1.0/W), true);CHECKFLOATVAL(Q, isinf(transvalues[3]) || isinf(newvalX -
transvalues[2]) || isinf(newvalX - A) || isinf(1.0/W), true);
Done.
Please find attached a patch that uses the float8 version to cover the
numeric types.
Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
Attachments:
weighted_stats_002.difftext/plain; charset=us-asciiDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 000489d..09ada7e 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -12645,6 +12645,29 @@ NULL baz</literallayout>(3 rows)</entry>
<row>
<entry>
<indexterm>
+ <primary>weighted_average</primary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_avg</primary>
+ </indexterm>
+ <function>weighted_avg(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, <type>numeric</type>, or <type>interval</type>
+ </entry>
+ <entry>
+ <type>numeric</type> for any integer-type argument,
+ <type>double precision</type> for a floating-point argument,
+ otherwise the same as the argument data type
+ </entry>
+ <entry>the average (arithmetic mean) of all input values, weighted by the input weights</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>bit_and</primary>
</indexterm>
<function>bit_and(<replaceable class="parameter">expression</replaceable>)</function>
@@ -13288,6 +13311,29 @@ SELECT xmlagg(x) FROM (SELECT x FROM test ORDER BY y DESC) AS tab;
<row>
<entry>
<indexterm>
+ <primary>weighted standard deviation</primary>
+ <secondary>population</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_stddev_pop</primary>
+ </indexterm>
+ <function>weighted_stddev_pop(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, or <type>numeric</type>
+ </entry>
+ <entry>
+ <type>double precision</type> for floating-point arguments,
+ otherwise <type>numeric</type>
+ </entry>
+ <entry>weighted population standard deviation of the input values</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>standard deviation</primary>
<secondary>sample</secondary>
</indexterm>
@@ -13311,6 +13357,29 @@ SELECT xmlagg(x) FROM (SELECT x FROM test ORDER BY y DESC) AS tab;
<row>
<entry>
<indexterm>
+ <primary>weighted standard deviation</primary>
+ <secondary>sample</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_stddev_samp</primary>
+ </indexterm>
+ <function>weighted_stddev_samp(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, or <type>numeric</type>
+ </entry>
+ <entry>
+ <type>double precision</type> for floating-point arguments,
+ otherwise <type>numeric</type>
+ </entry>
+ <entry>weighted sample standard deviation of the input values</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>variance</primary>
</indexterm>
<function>variance</function>(<replaceable class="parameter">expression</replaceable>)
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index d4e5d55..c03f712 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2364,6 +2364,7 @@ setseed(PG_FUNCTION_ARGS)
* float8_accum - accumulate for AVG(), variance aggregates, etc.
* float4_accum - same, but input data is float4
* float8_avg - produce final result for float AVG()
+ * float8_weighted_avg - produce final result for float WEIGHTED_AVG()
* float8_var_samp - produce final result for float VAR_SAMP()
* float8_var_pop - produce final result for float VAR_POP()
* float8_stddev_samp - produce final result for float STDDEV_SAMP()
@@ -3057,6 +3058,155 @@ float8_regr_intercept(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(numeratorXXY / numeratorX);
}
+/*
+ * ===================
+ * WEIGHTED AGGREGATES
+ * ===================
+ *
+ * The transition datatype for these aggregates is a 4-element array
+ * of float8, holding the values N, sum(W), sum(W*X), and sum(W*X*X)
+ * in that order.
+ *
+ * First, an accumulator function for those we can't pirate from the
+ * other accumulators. This accumulator function takes out some of
+ * the rounding error inherent in the general one.
+ * https://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
+ *
+ * It consists of a four-element array which includes:
+ *
+ * N, the number of non-zero-weighted values seen thus far,
+ * W, the running sum of weights,
+ * A, an intermediate value used in the calculation, and
+ * Q, another intermediate value.
+ *
+ */
+
+Datum
+float8_weighted_accum(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 newvalX = PG_GETARG_FLOAT8(1);
+ float8 newvalW = PG_GETARG_FLOAT8(2);
+ float8 *transvalues;
+ float8 N,
+ W,
+ A,
+ Q;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_stddev_accum", 4);
+
+ if (newvalW <= 0.0) /* We only care about positive weights */
+ PG_RETURN_NULL();
+
+ N = transvalues[0];
+ W = transvalues[1];
+ A = transvalues[2];
+ Q = transvalues[3];
+
+ N += 1.0;
+ CHECKFLOATVAL(N, isinf(transvalues[0]), true);
+ W += newvalW;
+ CHECKFLOATVAL(W, isinf(transvalues[1]) || isinf(newvalW), true);
+ A += newvalW * ( newvalX - transvalues[2] ) / W;
+ CHECKFLOATVAL(A, isinf(newvalW) || isinf(transvalues[2]) || isinf(1.0/W), true);
+ Q += newvalW * (newvalX - transvalues[2]) * (newvalX - A);
+ CHECKFLOATVAL(A, isinf(newvalX - transvalues[3]) || isinf(newvalX - A) || isinf(1.0/W), true);
+
+ if (AggCheckCallContext(fcinfo, NULL)) /* Update in place is safe in Agg context */
+ {
+ transvalues[0] = N;
+ transvalues[1] = W;
+ transvalues[2] = A;
+ transvalues[3] = Q;
+
+ PG_RETURN_ARRAYTYPE_P(transarray);
+ }
+ else /* You do not need to call this directly. */
+ ereport(ERROR,
+ (errmsg("float8_weighted_accum called outside agg context")));
+}
+
+/*
+ * This is the final function for the weighted mean, having borrowed
+ * the 6-element accumulator used in the binary aggregates below to
+ * get
+ *
+ * N, the number of elements with non-zero weights,
+ * sumW, the sum of the weights, and
+ * sumWX, the dot product of elements and weights.
+ *
+ * While it might be possible to optimize this further by making a
+ * more compact accumulator, the performance gain is likely marginal.
+ *
+ */
+Datum
+float8_weighted_avg(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ sumWX,
+ sumW;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_avg", 6);
+ N = transvalues[0];
+ sumW = transvalues[1];
+ sumWX = transvalues[5];
+
+ if (N < 1.0)
+ PG_RETURN_NULL();
+
+ CHECKFLOATVAL(N, isinf(1.0/sumW) || isinf(sumWX), true);
+
+ PG_RETURN_FLOAT8(sumWX/sumW);
+}
+
+Datum
+float8_weighted_stddev_samp(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ W,
+ /* Skip A. Not used in the calculation */
+ Q;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_stddev_samp", 4);
+ N = transvalues[0];
+ W = transvalues[1];
+ Q = transvalues[3];
+
+ if (N < 2.0) /* Must have at least two samples to get a stddev */
+ PG_RETURN_NULL();
+
+ PG_RETURN_FLOAT8(
+ sqrt(
+ N * Q /
+ ( (N-1) * W )
+ )
+ );
+}
+
+Datum
+float8_weighted_stddev_pop(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ W,
+ /* Skip A. Not used in the calculation */
+ Q;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_stddev_pop", 4);
+ N = transvalues[0];
+ W = transvalues[1];
+ Q = transvalues[3];
+
+ if (N < 2.0) /* Must have at least two samples to get a stddev */
+ PG_RETURN_NULL();
+
+ PG_RETURN_FLOAT8( sqrt( Q / W ) );
+}
/*
* ====================================
diff --git a/src/include/catalog/pg_aggregate.h b/src/include/catalog/pg_aggregate.h
index 441db30..9b9c1ee 100644
--- a/src/include/catalog/pg_aggregate.h
+++ b/src/include/catalog/pg_aggregate.h
@@ -136,6 +136,7 @@ DATA(insert ( 2103 n 0 numeric_avg_accum numeric_avg - numeric_avg_accum numeri
DATA(insert ( 2104 n 0 float4_accum float8_avg - - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2105 n 0 float8_accum float8_avg - - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2106 n 0 interval_accum interval_avg - interval_accum interval_accum_inv interval_avg f f 0 1187 0 1187 0 "{0 second,0 second}" "{0 second,0 second}" ));
+DATA(insert ( 3998 n 0 float8_regr_accum float8_weighted_avg - - - - f f 0 1022 0 0 0 "{0,0,0,0,0,0}" _null_ ));
/* sum */
DATA(insert ( 2107 n 0 int8_avg_accum numeric_poly_sum - int8_avg_accum int8_avg_accum_inv numeric_poly_sum f f 0 2281 48 2281 48 _null_ _null_ ));
@@ -228,6 +229,7 @@ DATA(insert ( 2726 n 0 int2_accum numeric_poly_stddev_pop - int2_accum int2_accu
DATA(insert ( 2727 n 0 float4_accum float8_stddev_pop - - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2728 n 0 float8_accum float8_stddev_pop - - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2729 n 0 numeric_accum numeric_stddev_pop - numeric_accum numeric_accum_inv numeric_stddev_pop f f 0 2281 128 2281 128 _null_ _null_ ));
+DATA(insert ( 4032 n 0 float8_weighted_accum float8_weighted_stddev_pop - - - - f f 0 1022 0 0 0 "{0,0,0,0}" _null_ ));
/* stddev_samp */
DATA(insert ( 2712 n 0 int8_accum numeric_stddev_samp - int8_accum int8_accum_inv numeric_stddev_samp f f 0 2281 128 2281 128 _null_ _null_ ));
@@ -235,6 +237,7 @@ DATA(insert ( 2713 n 0 int4_accum numeric_poly_stddev_samp - int4_accum int4_acc
DATA(insert ( 2714 n 0 int2_accum numeric_poly_stddev_samp - int2_accum int2_accum_inv numeric_poly_stddev_samp f f 0 2281 48 2281 48 _null_ _null_ ));
DATA(insert ( 2715 n 0 float4_accum float8_stddev_samp - - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2716 n 0 float8_accum float8_stddev_samp - - - - f f 0 1022 0 0 0 "{0,0,0}" _null_ ));
+DATA(insert ( 4083 n 0 float8_weighted_accum float8_weighted_stddev_samp - - - - f f 0 1022 0 0 0 "{0,0,0,0}" _null_ ));
DATA(insert ( 2717 n 0 numeric_accum numeric_stddev_samp - numeric_accum numeric_accum_inv numeric_stddev_samp f f 0 2281 128 2281 128 _null_ _null_ ));
/* stddev: historical Postgres syntax for stddev_samp */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index ceb8129..b1b7fc2 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2431,6 +2431,12 @@ DESCR("join selectivity of case-insensitive regex non-match");
/* Aggregate-related functions */
DATA(insert OID = 1830 ( float8_avg PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_avg _null_ _null_ _null_ ));
DESCR("aggregate final function");
+DATA(insert OID = 3997 ( float8_weighted_avg PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_avg _null_ _null_ _null_ ));
+DESCR("aggregate final function");
+DATA(insert OID = 4099 ( float8_weighted_stddev_pop PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_stddev_pop _null_ _null_ _null_ ));
+DESCR("aggregate final function");
+DATA(insert OID = 4100 ( float8_weighted_stddev_samp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_stddev_samp _null_ _null_ _null_ ));
+DESCR("aggregate final function");
DATA(insert OID = 2512 ( float8_var_pop PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_var_pop _null_ _null_ _null_ ));
DESCR("aggregate final function");
DATA(insert OID = 1831 ( float8_var_samp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_var_samp _null_ _null_ _null_ ));
@@ -2514,6 +2520,8 @@ DATA(insert OID = 2805 ( int8inc_float8_float8 PGNSP PGUID 12 1 0 0 0 f f f f
DESCR("aggregate transition function");
DATA(insert OID = 2806 ( float8_regr_accum PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 1022 "1022 701 701" _null_ _null_ _null_ _null_ _null_ float8_regr_accum _null_ _null_ _null_ ));
DESCR("aggregate transition function");
+DATA(insert OID = 3999 ( float8_weighted_accum PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 1022 "1022 701 701" _null_ _null_ _null_ _null_ _null_ float8_weighted_accum _null_ _null_ _null_ ));
+DESCR("aggregate transition function");
DATA(insert OID = 2807 ( float8_regr_sxx PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_regr_sxx _null_ _null_ _null_ ));
DESCR("aggregate final function");
DATA(insert OID = 2808 ( float8_regr_syy PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_regr_syy _null_ _null_ _null_ ));
@@ -3170,6 +3178,8 @@ DATA(insert OID = 2104 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 701
DESCR("the average (arithmetic mean) as float8 of all float4 values");
DATA(insert OID = 2105 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 701 "701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("the average (arithmetic mean) as float8 of all float8 values");
+DATA(insert OID = 3998 ( weighted_avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("the weighted average (arithmetic mean) as float8 of all float8 values");
DATA(insert OID = 2106 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1186 "1186" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("the average (arithmetic mean) as interval of all interval values");
@@ -3330,6 +3340,8 @@ DATA(insert OID = 2728 ( stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0
DESCR("population standard deviation of float8 input values");
DATA(insert OID = 2729 ( stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "1700" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("population standard deviation of numeric input values");
+DATA(insert OID = 4032 ( weighted_stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("population weighted standard deviation of float8 input values");
DATA(insert OID = 2712 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "20" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("sample standard deviation of bigint input values");
@@ -3343,6 +3355,8 @@ DATA(insert OID = 2716 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1
DESCR("sample standard deviation of float8 input values");
DATA(insert OID = 2717 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "1700" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("sample standard deviation of numeric input values");
+DATA(insert OID = 4083 ( weighted_stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("sample weighted standard deviation of float8 input values");
DATA(insert OID = 2154 ( stddev PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "20" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("historical alias for stddev_samp");
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 59a00bb..78f28f9 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -425,8 +425,12 @@ extern Datum radians(PG_FUNCTION_ARGS);
extern Datum drandom(PG_FUNCTION_ARGS);
extern Datum setseed(PG_FUNCTION_ARGS);
extern Datum float8_accum(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_accum(PG_FUNCTION_ARGS);
extern Datum float4_accum(PG_FUNCTION_ARGS);
extern Datum float8_avg(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_avg(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_stddev_pop(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_stddev_samp(PG_FUNCTION_ARGS);
extern Datum float8_var_pop(PG_FUNCTION_ARGS);
extern Datum float8_var_samp(PG_FUNCTION_ARGS);
extern Datum float8_stddev_pop(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 601bdb4..d9fc408 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -247,6 +247,18 @@ SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
653.62895538751 | 871.505273850014
(1 row)
+SELECT weighted_avg(a, b) FROM aggtest;
+ weighted_avg
+------------------
+ 55.5553072763149
+(1 row)
+
+SELECT weighted_stddev_pop(a, b), weighted_stddev_samp(a, b) FROM aggtest;
+ weighted_stddev_pop | weighted_stddev_samp
+---------------------+----------------------
+ 24.3364627240769 | 28.1013266097382
+(1 row)
+
SELECT corr(b, a) FROM aggtest;
corr
-------------------
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 80ef14c..6f236a1 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -60,6 +60,8 @@ SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
SELECT regr_r2(b, a) FROM aggtest;
SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
+SELECT weighted_avg(a, b) FROM aggtest;
+SELECT weighted_stddev_pop(a, b), weighted_stddev_samp(a, b) FROM aggtest;
SELECT corr(b, a) FROM aggtest;
SELECT count(four) AS cnt_1000 FROM onek;
On Tue, Mar 15, 2016 at 8:36 AM, David Fetter <david@fetter.org> wrote:
Please find attached a patch that uses the float8 version to cover the
numeric types.
Is there a well-defined meaning for having a negative weight? If no,
should it be disallowed?
I don't know what I was expecting, but not this:
select weighted_avg(x,10000000-2*x) from generate_series(1,10000000) f(x);
weighted_avg
------------------
16666671666717.1
Also, I think it might not give the correct answer even without
negative weights:
create table foo as select floor(random()*10000)::int val from
generate_series(1,10000000);
create table foo2 as select val, count(*) from foo group by val;
Shouldn't these then give the same result:
select stddev_samp(val) from foo;
stddev_samp
-------------------
2887.054977297105
select weighted_stddev_samp(val,count) from foo2;
weighted_stddev_samp
----------------------
2887.19919651336
The 5th digit seems too early to be seeing round-off error.
Cheers,
Jeff
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Fri, Mar 18, 2016 at 06:12:12PM -0700, Jeff Janes wrote:
On Tue, Mar 15, 2016 at 8:36 AM, David Fetter <david@fetter.org> wrote:
Please find attached a patch that uses the float8 version to cover the
numeric types.Is there a well-defined meaning for having a negative weight? If no,
should it be disallowed?
Opinions on this appear to vary. A Wikipedia article defines weights
as non-negative, while a manual to which it refers only uses non-zero.
https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Mathematical_definition
https://www.gnu.org/software/gsl/manual/html_node/Weighted-Samples.html
I'm not sure which if either would be authoritative, but I could
certainly make up variants for each assumption.
The assumption they have in common about weights is that a zero weight
is not part of the calculation, which assumption is implemented in the
previously submitted code.
I don't know what I was expecting, but not this:
select weighted_avg(x,10000000-2*x) from generate_series(1,10000000) f(x);
weighted_avg
------------------
16666671666717.1
I'm guessing that negative weights can cause bizarre outcomes,
assuming it turns out we should allow them.
Also, I think it might not give the correct answer even without
negative weights:create table foo as select floor(random()*10000)::int val from
generate_series(1,10000000);create table foo2 as select val, count(*) from foo group by val;
Shouldn't these then give the same result:
select stddev_samp(val) from foo;
stddev_samp
-------------------
2887.054977297105select weighted_stddev_samp(val,count) from foo2;
weighted_stddev_samp
----------------------
2887.19919651336The 5th digit seems too early to be seeing round-off error.
Please pardon me if I've misunderstood, but you appear to be assuming
that
SELECT val, count(*) FROM foo GROUP BY val
will produce precisely identical count(*)s at each row, which it
overwhelmingly likely won't, producing the difference you see above.
What have I misunderstood?
Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Fri, Mar 18, 2016 at 11:34 PM, David Fetter <david@fetter.org> wrote:
On Fri, Mar 18, 2016 at 06:12:12PM -0700, Jeff Janes wrote:
Also, I think it might not give the correct answer even without
negative weights:create table foo as select floor(random()*10000)::int val from
generate_series(1,10000000);create table foo2 as select val, count(*) from foo group by val;
Shouldn't these then give the same result:
select stddev_samp(val) from foo;
stddev_samp
-------------------
2887.054977297105select weighted_stddev_samp(val,count) from foo2;
weighted_stddev_samp
----------------------
2887.19919651336The 5th digit seems too early to be seeing round-off error.
Please pardon me if I've misunderstood, but you appear to be assuming
thatSELECT val, count(*) FROM foo GROUP BY val
will produce precisely identical count(*)s at each row, which it
overwhelmingly likely won't, producing the difference you see above.
I think the count for each val that gets put in foo2.count should be
the same as the weight of that val as it occurs in foo. Surely they
shouldn't all have the same weight in foo2, unless they all have the
same number of appearances in foo. Which, as you say, they are not
likely to. But still, the foo2.count that they do individually get
should be equal to their weight, shouldn't it?
The other two methods (*avg and *stddev_pop) do give the same answers
using the two different methods (unweighted against foo, weighted
against foo2)
Cheers,
Jeff
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hi,
On 03/19/2016 07:34 AM, David Fetter wrote:
On Fri, Mar 18, 2016 at 06:12:12PM -0700, Jeff Janes wrote:
On Tue, Mar 15, 2016 at 8:36 AM, David Fetter <david@fetter.org> wrote:
Please find attached a patch that uses the float8 version to cover the
numeric types.Is there a well-defined meaning for having a negative weight? If no,
should it be disallowed?Opinions on this appear to vary. A Wikipedia article defines weights
as non-negative, while a manual to which it refers only uses non-zero.https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Mathematical_definition
https://www.gnu.org/software/gsl/manual/html_node/Weighted-Samples.html
I don't think that actually allows negative weights. It says that
w_i = 1/\sigma_i^2
and variance is always > 0, thus w_i > 0. The zero is used as a special
flag to remove the sample from the data set in a simple way.
I'm not sure which if either would be authoritative, but I could
certainly make up variants for each assumption.The assumption they have in common about weights is that a zero
weight is not part of the calculation, which assumption is
implemented in the previously submitted code.
I think that if we're not sure what should happen with negative weights,
then we should disallow them. It's easy to allow them later once we have
a reasonable definition, but if we allow them now and later realize it
should behave differently, we'll be in trouble because of breaking
existing uses.
I can't really come up with a reasonable example that would actually use
negative weights. Can you? That would probably help with defining the
behavior correctly.
Allowing negative weights has other consequences. For example, what if
sum(W) ends up being 0? For example
CREATE TABLE t (a float, b float);
INSERT INTO t SELECT i, 1 FROM generate_series(1,1000) s(i);
INSERT INTO t SELECT i, -1 FROM generate_series(1,1000) s(i);
SELECT weighted_avg(a,b) FROM t;
weighted_avg
--------------
NaN
(1 row)
Is that the correct behavior? Why?
So -1 to allowing negative weights, unless we can come up with proper
definition or at least good examples demonstrating the usefulness.
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Sat, Mar 19, 2016 at 05:04:08PM +0100, Tomas Vondra wrote:
Hi,
On 03/19/2016 07:34 AM, David Fetter wrote:
On Fri, Mar 18, 2016 at 06:12:12PM -0700, Jeff Janes wrote:
On Tue, Mar 15, 2016 at 8:36 AM, David Fetter <david@fetter.org> wrote:
Please find attached a patch that uses the float8 version to cover the
numeric types.Is there a well-defined meaning for having a negative weight? If no,
should it be disallowed?Opinions on this appear to vary. A Wikipedia article defines weights
as non-negative, while a manual to which it refers only uses non-zero.https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Mathematical_definition
https://www.gnu.org/software/gsl/manual/html_node/Weighted-Samples.htmlI don't think that actually allows negative weights. It says that
w_i = 1/\sigma_i^2
and variance is always > 0, thus w_i > 0. The zero is used as a special flag
to remove the sample from the data set in a simple way.I'm not sure which if either would be authoritative, but I could
certainly make up variants for each assumption.The assumption they have in common about weights is that a zero
weight is not part of the calculation, which assumption is
implemented in the previously submitted code.I think that if we're not sure what should happen with negative weights,
then we should disallow them. It's easy to allow them later once we have a
reasonable definition, but if we allow them now and later realize it should
behave differently, we'll be in trouble because of breaking existing uses.
OK
I can't really come up with a reasonable example that would actually use
negative weights. Can you? That would probably help with defining the
behavior correctly.
No, but I'm not a statistician. I've seen them mentioned in contexts
that appear to be discussions among same, and again opinions vary.
Allowing negative weights has other consequences. For example, what if
sum(W) ends up being 0? For exampleCREATE TABLE t (a float, b float);
INSERT INTO t SELECT i, 1 FROM generate_series(1,1000) s(i);
INSERT INTO t SELECT i, -1 FROM generate_series(1,1000) s(i);SELECT weighted_avg(a,b) FROM t;
weighted_avg
--------------
NaN
(1 row)Is that the correct behavior? Why?
It's not, and you're right.
I will send a patch that disallows negative weights this evening or
tomorrow. It will be slightly more complicated as I believe I will
need to create a new accumulator function for the weighted_avg() case
where I had been using an extant one before.
Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Sun, Mar 20, 2016 at 03:38:40PM -0700, David Fetter wrote:
On Sat, Mar 19, 2016 at 05:04:08PM +0100, Tomas Vondra wrote:
Hi,
On 03/19/2016 07:34 AM, David Fetter wrote:
On Fri, Mar 18, 2016 at 06:12:12PM -0700, Jeff Janes wrote:
On Tue, Mar 15, 2016 at 8:36 AM, David Fetter <david@fetter.org> wrote:
Please find attached a patch that uses the float8 version to cover the
numeric types.Is there a well-defined meaning for having a negative weight? If no,
should it be disallowed?Opinions on this appear to vary. A Wikipedia article defines weights
as non-negative, while a manual to which it refers only uses non-zero.https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Mathematical_definition
https://www.gnu.org/software/gsl/manual/html_node/Weighted-Samples.htmlI don't think that actually allows negative weights. It says that
w_i = 1/\sigma_i^2
and variance is always > 0, thus w_i > 0. The zero is used as a special flag
to remove the sample from the data set in a simple way.I'm not sure which if either would be authoritative, but I could
certainly make up variants for each assumption.The assumption they have in common about weights is that a zero
weight is not part of the calculation, which assumption is
implemented in the previously submitted code.I think that if we're not sure what should happen with negative weights,
then we should disallow them. It's easy to allow them later once we have a
reasonable definition, but if we allow them now and later realize it should
behave differently, we'll be in trouble because of breaking existing uses.OK
I can't really come up with a reasonable example that would actually use
negative weights. Can you? That would probably help with defining the
behavior correctly.No, but I'm not a statistician. I've seen them mentioned in contexts
that appear to be discussions among same, and again opinions vary.Allowing negative weights has other consequences. For example, what if
sum(W) ends up being 0? For exampleCREATE TABLE t (a float, b float);
INSERT INTO t SELECT i, 1 FROM generate_series(1,1000) s(i);
INSERT INTO t SELECT i, -1 FROM generate_series(1,1000) s(i);SELECT weighted_avg(a,b) FROM t;
weighted_avg
--------------
NaN
(1 row)Is that the correct behavior? Why?
It's not, and you're right.
I will send a patch that disallows negative weights this evening or
tomorrow. It will be slightly more complicated as I believe I will
need to create a new accumulator function for the weighted_avg() case
where I had been using an extant one before.Cheers,
David.
Sorry about the delay. This patch disallows negative weights,
although it still has that odd difference Jeff found.
Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
Attachments:
weighted_stats_003.difftext/plain; charset=us-asciiDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 88145c5..b939340 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -12750,6 +12750,29 @@ NULL baz</literallayout>(3 rows)</entry>
<row>
<entry>
<indexterm>
+ <primary>weighted_average</primary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_avg</primary>
+ </indexterm>
+ <function>weighted_avg(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, <type>numeric</type>, or <type>interval</type>
+ </entry>
+ <entry>
+ <type>numeric</type> for any integer-type argument,
+ <type>double precision</type> for a floating-point argument,
+ otherwise the same as the argument data type
+ </entry>
+ <entry>the average (arithmetic mean) of all input values, weighted by the input weights</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>bit_and</primary>
</indexterm>
<function>bit_and(<replaceable class="parameter">expression</replaceable>)</function>
@@ -13430,6 +13453,29 @@ SELECT xmlagg(x) FROM (SELECT x FROM test ORDER BY y DESC) AS tab;
<row>
<entry>
<indexterm>
+ <primary>weighted standard deviation</primary>
+ <secondary>population</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_stddev_pop</primary>
+ </indexterm>
+ <function>weighted_stddev_pop(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, or <type>numeric</type>
+ </entry>
+ <entry>
+ <type>double precision</type> for floating-point arguments,
+ otherwise <type>numeric</type>
+ </entry>
+ <entry>weighted population standard deviation of the input values</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>standard deviation</primary>
<secondary>sample</secondary>
</indexterm>
@@ -13454,6 +13500,29 @@ SELECT xmlagg(x) FROM (SELECT x FROM test ORDER BY y DESC) AS tab;
<row>
<entry>
<indexterm>
+ <primary>weighted standard deviation</primary>
+ <secondary>sample</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_stddev_samp</primary>
+ </indexterm>
+ <function>weighted_stddev_samp(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, or <type>numeric</type>
+ </entry>
+ <entry>
+ <type>double precision</type> for floating-point arguments,
+ otherwise <type>numeric</type>
+ </entry>
+ <entry>weighted sample standard deviation of the input values</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>variance</primary>
</indexterm>
<function>variance</function>(<replaceable class="parameter">expression</replaceable>)
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index c7c0b58..cd7b10a 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2405,6 +2405,7 @@ setseed(PG_FUNCTION_ARGS)
* float8_accum - accumulate for AVG(), variance aggregates, etc.
* float4_accum - same, but input data is float4
* float8_avg - produce final result for float AVG()
+ * float8_weighted_avg - produce final result for float WEIGHTED_AVG()
* float8_var_samp - produce final result for float VAR_SAMP()
* float8_var_pop - produce final result for float VAR_POP()
* float8_stddev_samp - produce final result for float STDDEV_SAMP()
@@ -3205,6 +3206,164 @@ float8_regr_intercept(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(numeratorXXY / numeratorX);
}
+/*
+ * ===================
+ * WEIGHTED AGGREGATES
+ * ===================
+ *
+ * The transition datatype for these aggregates is a 5-element array
+ * of float8, holding the values N, sum(W), sum(W*X), and sum(W*X*X)
+ * in that order.
+ *
+ * First, an accumulator function for those we can't pirate from the
+ * other accumulators. This accumulator function takes out some of
+ * the rounding error inherent in the general one.
+ * https://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
+ *
+ * It consists of a five-element array which includes:
+ *
+ * N, the number of non-zero-weighted values seen thus far,
+ * W, the running sum of weights,
+ * WX, the running dot product of weights and values,
+ * A, an intermediate value used in the calculation, and
+ * Q, another intermediate value.
+ *
+ */
+
+Datum
+float8_weighted_accum(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 newvalX = PG_GETARG_FLOAT8(1);
+ float8 newvalW = PG_GETARG_FLOAT8(2);
+ float8 *transvalues;
+ float8 N, /* common */
+ W, /* common */
+ WX, /* Used in avg */
+ A, /* Used in stddev_* */
+ Q; /* Used in stddev_* */
+
+ transvalues = check_float8_array(transarray, "float8_weighted_accum", 5);
+
+ if (newvalW == 0.0) /* Discard zero weights */
+ PG_RETURN_NULL();
+
+ if (newvalW < 0.0) /* Negative weights are an error. */
+ ereport(ERROR,
+ (errmsg("negative weights are not allowed")));
+
+ N = transvalues[0];
+ W = transvalues[1];
+ WX = transvalues[2];
+ A = transvalues[3];
+ Q = transvalues[4];
+
+ N += 1.0;
+ CHECKFLOATVAL(N, isinf(transvalues[0]), true);
+ W += newvalW;
+ CHECKFLOATVAL(W, isinf(transvalues[1]) || isinf(newvalW), true);
+ WX += newvalW * newvalX;
+ CHECKFLOATVAL(WX, isinf(transvalues[1]) || isinf(newvalW), true);
+ A += newvalW * ( newvalX - transvalues[3] ) / W;
+ CHECKFLOATVAL(A, isinf(newvalW) || isinf(transvalues[3]) || isinf(1.0/W), true);
+ Q += newvalW * (newvalX - transvalues[3]) * (newvalX - A);
+ CHECKFLOATVAL(A, isinf(newvalX - transvalues[4]) || isinf(newvalX - A) || isinf(1.0/W), true);
+
+ if (AggCheckCallContext(fcinfo, NULL)) /* Update in place is safe in Agg context */
+ {
+ transvalues[0] = N;
+ transvalues[1] = W;
+ transvalues[2] = WX;
+ transvalues[3] = A;
+ transvalues[4] = Q;
+
+ PG_RETURN_ARRAYTYPE_P(transarray);
+ }
+ else /* You do not need to call this directly. */
+ ereport(ERROR,
+ (errmsg("float8_weighted_accum called outside agg context")));
+}
+
+/*
+ * This is the final function for the weighted mean. It uses the
+ * 5-element accumulator common to weighted aggregates.
+ *
+ * N, the number of elements with non-zero weights,
+ * sumW, the sum of the weights, and
+ * sumWX, the dot product of elements and weights.
+ *
+ * While it might be possible to optimize this further by making a
+ * more compact accumulator, the performance gain is likely marginal.
+ *
+ */
+Datum
+float8_weighted_avg(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ sumWX,
+ sumW;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_avg", 5);
+ N = transvalues[0];
+ sumW = transvalues[1];
+ sumWX = transvalues[2];
+
+ if (N < 1.0)
+ PG_RETURN_NULL();
+
+ CHECKFLOATVAL(N, isinf(1.0/sumW) || isinf(sumWX), true);
+
+ PG_RETURN_FLOAT8(sumWX/sumW);
+}
+
+Datum
+float8_weighted_stddev_samp(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ W,
+ /* Skip A. Not used in the calculation */
+ Q;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_stddev_samp", 5);
+ N = transvalues[0];
+ W = transvalues[1];
+ Q = transvalues[4];
+
+ if (N < 2.0) /* Must have at least two samples to get a stddev */
+ PG_RETURN_NULL();
+
+ PG_RETURN_FLOAT8(
+ sqrt(
+ N * Q /
+ ( (N-1) * W )
+ )
+ );
+}
+
+Datum
+float8_weighted_stddev_pop(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ W,
+ /* Skip A. Not used in the calculation */
+ Q;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_stddev_pop", 5);
+ N = transvalues[0];
+ W = transvalues[1];
+ Q = transvalues[4];
+
+ if (N < 2.0) /* Must have at least two samples to get a stddev */
+ PG_RETURN_NULL();
+
+ PG_RETURN_FLOAT8( sqrt( Q / W ) );
+}
/*
* ====================================
diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c
index dc35a93..36de6b6 100644
--- a/src/bin/pg_dump/pg_dump_sort.c
+++ b/src/bin/pg_dump/pg_dump_sort.c
@@ -848,14 +848,9 @@ repairTypeFuncLoop(DumpableObject *typeobj, DumpableObject *funcobj)
if (typeInfo->shellType)
{
addObjectDependency(funcobj, typeInfo->shellType->dobj.dumpId);
- /*
- * Mark shell type (always including the definition, as we need
- * the shell type defined to identify the function fully) as to be
- * dumped if any such function is
- */
+ /* Mark shell type as to be dumped if any such function is */
if (funcobj->dump)
- typeInfo->shellType->dobj.dump = funcobj->dump |
- DUMP_COMPONENT_DEFINITION;
+ typeInfo->shellType->dobj.dump = true;
}
}
diff --git a/src/include/catalog/pg_aggregate.h b/src/include/catalog/pg_aggregate.h
index e16aa48..8e2a3a3 100644
--- a/src/include/catalog/pg_aggregate.h
+++ b/src/include/catalog/pg_aggregate.h
@@ -145,6 +145,7 @@ DATA(insert ( 2103 n 0 numeric_avg_accum numeric_avg numeric_avg_combine numer
DATA(insert ( 2104 n 0 float4_accum float8_avg float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2105 n 0 float8_accum float8_avg float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2106 n 0 interval_accum interval_avg interval_combine - - interval_accum interval_accum_inv interval_avg f f 0 1187 0 0 1187 0 "{0 second,0 second}" "{0 second,0 second}" ));
+DATA(insert ( 3998 n 0 float8_weighted_accum float8_weighted_avg - - - - - - f f 0 1022 0 40 0 0 "{0,0,0,0,0}" _null_));
/* sum */
DATA(insert ( 2107 n 0 int8_avg_accum numeric_poly_sum int8_avg_combine int8_avg_serialize int8_avg_deserialize int8_avg_accum int8_avg_accum_inv numeric_poly_sum f f 0 2281 17 48 2281 48 _null_ _null_ ));
@@ -237,6 +238,7 @@ DATA(insert ( 2726 n 0 int2_accum numeric_poly_stddev_pop numeric_poly_combine
DATA(insert ( 2727 n 0 float4_accum float8_stddev_pop float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2728 n 0 float8_accum float8_stddev_pop float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2729 n 0 numeric_accum numeric_stddev_pop numeric_combine numeric_serialize numeric_deserialize numeric_accum numeric_accum_inv numeric_stddev_pop f f 0 2281 17 128 2281 128 _null_ _null_ ));
+DATA(insert ( 4032 n 0 float8_weighted_accum float8_weighted_stddev_pop - - - - - - f f 0 1022 0 40 0 0 "{0,0,0,0,0}" _null_));
/* stddev_samp */
DATA(insert ( 2712 n 0 int8_accum numeric_stddev_samp numeric_combine numeric_serialize numeric_deserialize int8_accum int8_accum_inv numeric_stddev_samp f f 0 2281 17 128 2281 128 _null_ _null_ ));
@@ -245,6 +247,7 @@ DATA(insert ( 2714 n 0 int2_accum numeric_poly_stddev_samp numeric_poly_combine
DATA(insert ( 2715 n 0 float4_accum float8_stddev_samp float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2716 n 0 float8_accum float8_stddev_samp float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2717 n 0 numeric_accum numeric_stddev_samp numeric_combine numeric_serialize numeric_deserialize numeric_accum numeric_accum_inv numeric_stddev_samp f f 0 2281 17 128 2281 128 _null_ _null_ ));
+DATA(insert ( 4101 n 0 float8_weighted_accum float8_weighted_stddev_samp - - - - - - f f 0 1022 0 40 0 0 "{0,0,0,0,0}" _null_));
/* stddev: historical Postgres syntax for stddev_samp */
DATA(insert ( 2154 n 0 int8_accum numeric_stddev_samp numeric_combine numeric_serialize numeric_deserialize int8_accum int8_accum_inv numeric_stddev_samp f f 0 2281 17 128 2281 128 _null_ _null_ ));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index bb539d4..7dff0d3 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2433,6 +2433,12 @@ DESCR("join selectivity of case-insensitive regex non-match");
/* Aggregate-related functions */
DATA(insert OID = 1830 ( float8_avg PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_avg _null_ _null_ _null_ ));
DESCR("aggregate final function");
+DATA(insert OID = 3997 ( float8_weighted_avg PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_avg _null_ _null_ _null_ ));
+DESCR("aggregate final function");
+DATA(insert OID = 4099 ( float8_weighted_stddev_pop PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_stddev_pop _null_ _null_ _null_ ));
+DESCR("aggregate final function");
+DATA(insert OID = 4100 ( float8_weighted_stddev_samp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_stddev_samp _null_ _null_ _null_ ));
+DESCR("aggregate final function");
DATA(insert OID = 2512 ( float8_var_pop PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_var_pop _null_ _null_ _null_ ));
DESCR("aggregate final function");
DATA(insert OID = 1831 ( float8_var_samp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_var_samp _null_ _null_ _null_ ));
@@ -2544,6 +2550,8 @@ DATA(insert OID = 2805 ( int8inc_float8_float8 PGNSP PGUID 12 1 0 0 0 f f f f
DESCR("aggregate transition function");
DATA(insert OID = 2806 ( float8_regr_accum PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 1022 "1022 701 701" _null_ _null_ _null_ _null_ _null_ float8_regr_accum _null_ _null_ _null_ ));
DESCR("aggregate transition function");
+DATA(insert OID = 3999 ( float8_weighted_accum PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 1022 "1022 701 701" _null_ _null_ _null_ _null_ _null_ float8_weighted_accum _null_ _null_ _null_ ));
+DESCR("aggregate transition function");
DATA(insert OID = 3342 ( float8_regr_combine PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 1022 "1022 1022" _null_ _null_ _null_ _null_ _null_ float8_regr_combine _null_ _null_ _null_ ));
DESCR("aggregate combine function");
DATA(insert OID = 2807 ( float8_regr_sxx PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_regr_sxx _null_ _null_ _null_ ));
@@ -3204,6 +3212,8 @@ DATA(insert OID = 2104 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 701
DESCR("the average (arithmetic mean) as float8 of all float4 values");
DATA(insert OID = 2105 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 701 "701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("the average (arithmetic mean) as float8 of all float8 values");
+DATA(insert OID = 3998 ( weighted_avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("the weighted average (arithmetic mean) as float8 of all float8 values");
DATA(insert OID = 2106 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1186 "1186" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("the average (arithmetic mean) as interval of all interval values");
@@ -3364,6 +3374,8 @@ DATA(insert OID = 2728 ( stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0
DESCR("population standard deviation of float8 input values");
DATA(insert OID = 2729 ( stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "1700" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("population standard deviation of numeric input values");
+DATA(insert OID = 4032 ( weighted_stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("population weighted standard deviation of float8 input values");
DATA(insert OID = 2712 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "20" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("sample standard deviation of bigint input values");
@@ -3377,6 +3389,8 @@ DATA(insert OID = 2716 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1
DESCR("sample standard deviation of float8 input values");
DATA(insert OID = 2717 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "1700" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("sample standard deviation of numeric input values");
+DATA(insert OID = 4101 ( weighted_stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("sample weighted standard deviation of float8 input values");
DATA(insert OID = 2154 ( stddev PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "20" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("historical alias for stddev_samp");
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 01976a1..1c1d0d4 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -429,8 +429,12 @@ extern Datum drandom(PG_FUNCTION_ARGS);
extern Datum setseed(PG_FUNCTION_ARGS);
extern Datum float8_combine(PG_FUNCTION_ARGS);
extern Datum float8_accum(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_accum(PG_FUNCTION_ARGS);
extern Datum float4_accum(PG_FUNCTION_ARGS);
extern Datum float8_avg(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_avg(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_stddev_pop(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_stddev_samp(PG_FUNCTION_ARGS);
extern Datum float8_var_pop(PG_FUNCTION_ARGS);
extern Datum float8_var_samp(PG_FUNCTION_ARGS);
extern Datum float8_stddev_pop(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 3ff6691..c12ea3b 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -247,6 +247,18 @@ SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
653.62895538751 | 871.505273850014
(1 row)
+SELECT weighted_avg(a, b) FROM aggtest;
+ weighted_avg
+------------------
+ 55.5553072763149
+(1 row)
+
+SELECT weighted_stddev_pop(a, b), weighted_stddev_samp(a, b) FROM aggtest;
+ weighted_stddev_pop | weighted_stddev_samp
+---------------------+----------------------
+ 24.3364627240769 | 28.1013266097382
+(1 row)
+
SELECT corr(b, a) FROM aggtest;
corr
-------------------
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 80ef14c..6f236a1 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -60,6 +60,8 @@ SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
SELECT regr_r2(b, a) FROM aggtest;
SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
+SELECT weighted_avg(a, b) FROM aggtest;
+SELECT weighted_stddev_pop(a, b), weighted_stddev_samp(a, b) FROM aggtest;
SELECT corr(b, a) FROM aggtest;
SELECT count(four) AS cnt_1000 FROM onek;
On Fri, Apr 08, 2016 at 01:47:56PM -0700, David Fetter wrote:
Sorry about the delay. This patch disallows negative weights,
although it still has that odd difference Jeff found.
Difference corrected. It turned out that my reference was mistaken on
the calculation.
Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
Attachments:
weighted_stats_004.difftext/plain; charset=us-asciiDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index cb7fe38..ba31d02 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -12750,6 +12750,29 @@ NULL baz</literallayout>(3 rows)</entry>
<row>
<entry>
<indexterm>
+ <primary>weighted_average</primary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_avg</primary>
+ </indexterm>
+ <function>weighted_avg(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, <type>numeric</type>, or <type>interval</type>
+ </entry>
+ <entry>
+ <type>numeric</type> for any integer-type argument,
+ <type>double precision</type> for a floating-point argument,
+ otherwise the same as the argument data type
+ </entry>
+ <entry>the average (arithmetic mean) of all input values, weighted by the input weights</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>bit_and</primary>
</indexterm>
<function>bit_and(<replaceable class="parameter">expression</replaceable>)</function>
@@ -13430,6 +13453,29 @@ SELECT xmlagg(x) FROM (SELECT x FROM test ORDER BY y DESC) AS tab;
<row>
<entry>
<indexterm>
+ <primary>weighted standard deviation</primary>
+ <secondary>population</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_stddev_pop</primary>
+ </indexterm>
+ <function>weighted_stddev_pop(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, or <type>numeric</type>
+ </entry>
+ <entry>
+ <type>double precision</type> for floating-point arguments,
+ otherwise <type>numeric</type>
+ </entry>
+ <entry>weighted population standard deviation of the input values</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>standard deviation</primary>
<secondary>sample</secondary>
</indexterm>
@@ -13454,6 +13500,29 @@ SELECT xmlagg(x) FROM (SELECT x FROM test ORDER BY y DESC) AS tab;
<row>
<entry>
<indexterm>
+ <primary>weighted standard deviation</primary>
+ <secondary>sample</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>weighted_stddev_samp</primary>
+ </indexterm>
+ <function>weighted_stddev_samp(<replaceable class="parameter">value expression</replaceable>, <replaceable class="parameter">weight expression</replaceable>)</function>
+ </entry>
+ <entry>
+ <type>smallint</type>, <type>int</type>,
+ <type>bigint</type>, <type>real</type>, <type>double
+ precision</type>, or <type>numeric</type>
+ </entry>
+ <entry>
+ <type>double precision</type> for floating-point arguments,
+ otherwise <type>numeric</type>
+ </entry>
+ <entry>weighted sample standard deviation of the input values</entry>
+ </row>
+
+ <row>
+ <entry>
+ <indexterm>
<primary>variance</primary>
</indexterm>
<function>variance</function>(<replaceable class="parameter">expression</replaceable>)
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index c7c0b58..fe4a5e4 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -2405,6 +2405,7 @@ setseed(PG_FUNCTION_ARGS)
* float8_accum - accumulate for AVG(), variance aggregates, etc.
* float4_accum - same, but input data is float4
* float8_avg - produce final result for float AVG()
+ * float8_weighted_avg - produce final result for float WEIGHTED_AVG()
* float8_var_samp - produce final result for float VAR_SAMP()
* float8_var_pop - produce final result for float VAR_POP()
* float8_stddev_samp - produce final result for float STDDEV_SAMP()
@@ -3205,6 +3206,163 @@ float8_regr_intercept(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(numeratorXXY / numeratorX);
}
+/*
+ * ===================
+ * WEIGHTED AGGREGATES
+ * ===================
+ *
+ * The transition datatype for these aggregates is a 5-element array
+ * of float8, holding the values N, sum(W), sum(W*X), and sum(W*X*X)
+ * in that order.
+ *
+ * First, an accumulator function for those we can't pirate from the
+ * other accumulators. This accumulator function takes out some of
+ * the rounding error inherent in the general one.
+ * https://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
+ *
+ * It consists of a five-element array which includes:
+ *
+ * N, the number of non-zero-weighted values seen thus far,
+ * W, the running sum of weights,
+ * WX, the running dot product of weights and values,
+ * A, an intermediate value used in the calculation, and
+ * Q, another intermediate value.
+ *
+ */
+
+Datum
+float8_weighted_accum(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 newvalX = PG_GETARG_FLOAT8(1);
+ float8 newvalW = PG_GETARG_FLOAT8(2);
+ float8 *transvalues;
+ float8 N, /* common */
+ W, /* common */
+ WX, /* Used in avg */
+ A, /* Used in stddev_* */
+ Q; /* Used in stddev_* */
+
+ transvalues = check_float8_array(transarray, "float8_weighted_accum", 5);
+
+ if (newvalW == 0.0) /* Discard zero weights */
+ PG_RETURN_NULL();
+
+ if (newvalW < 0.0) /* Negative weights are an error. */
+ ereport(ERROR,
+ (errmsg("negative weights are not allowed")));
+
+ N = transvalues[0];
+ W = transvalues[1];
+ WX = transvalues[2];
+ A = transvalues[3];
+ Q = transvalues[4];
+
+ N += 1.0;
+ CHECKFLOATVAL(N, isinf(transvalues[0]), true);
+ W += newvalW;
+ CHECKFLOATVAL(W, isinf(transvalues[1]) || isinf(newvalW), true);
+ WX += newvalW * newvalX;
+ CHECKFLOATVAL(WX, isinf(transvalues[1]) || isinf(newvalW), true);
+ A += newvalW * ( newvalX - transvalues[3] ) / W;
+ CHECKFLOATVAL(A, isinf(newvalW) || isinf(transvalues[3]) || isinf(1.0/W), true);
+ Q += newvalW * (newvalX - transvalues[3]) * (newvalX - A);
+ CHECKFLOATVAL(A, isinf(newvalX - transvalues[4]) || isinf(newvalX - A) || isinf(1.0/W), true);
+
+ if (AggCheckCallContext(fcinfo, NULL)) /* Update in place is safe in Agg context */
+ {
+ transvalues[0] = N;
+ transvalues[1] = W;
+ transvalues[2] = WX;
+ transvalues[3] = A;
+ transvalues[4] = Q;
+
+ PG_RETURN_ARRAYTYPE_P(transarray);
+ }
+ else /* You do not need to call this directly. */
+ ereport(ERROR,
+ (errmsg("float8_weighted_accum called outside agg context")));
+}
+
+/*
+ * This is the final function for the weighted mean. It uses the
+ * 5-element accumulator common to weighted aggregates.
+ *
+ * N, the number of elements with non-zero weights,
+ * sumW, the sum of the weights, and
+ * sumWX, the dot product of elements and weights.
+ *
+ * While it might be possible to optimize this further by making a
+ * more compact accumulator, the performance gain is likely marginal.
+ *
+ */
+Datum
+float8_weighted_avg(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ sumWX,
+ sumW;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_avg", 5);
+ N = transvalues[0];
+ sumW = transvalues[1];
+ sumWX = transvalues[2];
+
+ if (N < 1.0)
+ PG_RETURN_NULL();
+
+ CHECKFLOATVAL(N, isinf(1.0/sumW) || isinf(sumWX), true);
+
+ PG_RETURN_FLOAT8(sumWX/sumW);
+}
+
+Datum
+float8_weighted_stddev_samp(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ W,
+ /* Skip A. Not used in the calculation */
+ Q;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_stddev_samp", 5);
+ N = transvalues[0];
+ W = transvalues[1];
+ Q = transvalues[4];
+
+ if (N < 2.0) /* Must have at least two samples to get a stddev */
+ PG_RETURN_NULL();
+
+ PG_RETURN_FLOAT8(
+ sqrt(
+ Q / (W - 1)
+ )
+ );
+}
+
+Datum
+float8_weighted_stddev_pop(PG_FUNCTION_ARGS)
+{
+ ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0);
+ float8 *transvalues;
+ float8 N,
+ W,
+ /* Skip A. Not used in the calculation */
+ Q;
+
+ transvalues = check_float8_array(transarray, "float8_weighted_stddev_pop", 5);
+ N = transvalues[0];
+ W = transvalues[1];
+ Q = transvalues[4];
+
+ if (N < 2.0) /* Must have at least two samples to get a stddev */
+ PG_RETURN_NULL();
+
+ PG_RETURN_FLOAT8( sqrt( Q / W ) );
+}
/*
* ====================================
diff --git a/src/include/catalog/pg_aggregate.h b/src/include/catalog/pg_aggregate.h
index e16aa48..8e2a3a3 100644
--- a/src/include/catalog/pg_aggregate.h
+++ b/src/include/catalog/pg_aggregate.h
@@ -145,6 +145,7 @@ DATA(insert ( 2103 n 0 numeric_avg_accum numeric_avg numeric_avg_combine numer
DATA(insert ( 2104 n 0 float4_accum float8_avg float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2105 n 0 float8_accum float8_avg float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2106 n 0 interval_accum interval_avg interval_combine - - interval_accum interval_accum_inv interval_avg f f 0 1187 0 0 1187 0 "{0 second,0 second}" "{0 second,0 second}" ));
+DATA(insert ( 3998 n 0 float8_weighted_accum float8_weighted_avg - - - - - - f f 0 1022 0 40 0 0 "{0,0,0,0,0}" _null_));
/* sum */
DATA(insert ( 2107 n 0 int8_avg_accum numeric_poly_sum int8_avg_combine int8_avg_serialize int8_avg_deserialize int8_avg_accum int8_avg_accum_inv numeric_poly_sum f f 0 2281 17 48 2281 48 _null_ _null_ ));
@@ -237,6 +238,7 @@ DATA(insert ( 2726 n 0 int2_accum numeric_poly_stddev_pop numeric_poly_combine
DATA(insert ( 2727 n 0 float4_accum float8_stddev_pop float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2728 n 0 float8_accum float8_stddev_pop float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2729 n 0 numeric_accum numeric_stddev_pop numeric_combine numeric_serialize numeric_deserialize numeric_accum numeric_accum_inv numeric_stddev_pop f f 0 2281 17 128 2281 128 _null_ _null_ ));
+DATA(insert ( 4032 n 0 float8_weighted_accum float8_weighted_stddev_pop - - - - - - f f 0 1022 0 40 0 0 "{0,0,0,0,0}" _null_));
/* stddev_samp */
DATA(insert ( 2712 n 0 int8_accum numeric_stddev_samp numeric_combine numeric_serialize numeric_deserialize int8_accum int8_accum_inv numeric_stddev_samp f f 0 2281 17 128 2281 128 _null_ _null_ ));
@@ -245,6 +247,7 @@ DATA(insert ( 2714 n 0 int2_accum numeric_poly_stddev_samp numeric_poly_combine
DATA(insert ( 2715 n 0 float4_accum float8_stddev_samp float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2716 n 0 float8_accum float8_stddev_samp float8_combine - - - - - f f 0 1022 0 0 0 0 "{0,0,0}" _null_ ));
DATA(insert ( 2717 n 0 numeric_accum numeric_stddev_samp numeric_combine numeric_serialize numeric_deserialize numeric_accum numeric_accum_inv numeric_stddev_samp f f 0 2281 17 128 2281 128 _null_ _null_ ));
+DATA(insert ( 4101 n 0 float8_weighted_accum float8_weighted_stddev_samp - - - - - - f f 0 1022 0 40 0 0 "{0,0,0,0,0}" _null_));
/* stddev: historical Postgres syntax for stddev_samp */
DATA(insert ( 2154 n 0 int8_accum numeric_stddev_samp numeric_combine numeric_serialize numeric_deserialize int8_accum int8_accum_inv numeric_stddev_samp f f 0 2281 17 128 2281 128 _null_ _null_ ));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index bb539d4..7dff0d3 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2433,6 +2433,12 @@ DESCR("join selectivity of case-insensitive regex non-match");
/* Aggregate-related functions */
DATA(insert OID = 1830 ( float8_avg PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_avg _null_ _null_ _null_ ));
DESCR("aggregate final function");
+DATA(insert OID = 3997 ( float8_weighted_avg PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_avg _null_ _null_ _null_ ));
+DESCR("aggregate final function");
+DATA(insert OID = 4099 ( float8_weighted_stddev_pop PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_stddev_pop _null_ _null_ _null_ ));
+DESCR("aggregate final function");
+DATA(insert OID = 4100 ( float8_weighted_stddev_samp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_weighted_stddev_samp _null_ _null_ _null_ ));
+DESCR("aggregate final function");
DATA(insert OID = 2512 ( float8_var_pop PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_var_pop _null_ _null_ _null_ ));
DESCR("aggregate final function");
DATA(insert OID = 1831 ( float8_var_samp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_var_samp _null_ _null_ _null_ ));
@@ -2544,6 +2550,8 @@ DATA(insert OID = 2805 ( int8inc_float8_float8 PGNSP PGUID 12 1 0 0 0 f f f f
DESCR("aggregate transition function");
DATA(insert OID = 2806 ( float8_regr_accum PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 1022 "1022 701 701" _null_ _null_ _null_ _null_ _null_ float8_regr_accum _null_ _null_ _null_ ));
DESCR("aggregate transition function");
+DATA(insert OID = 3999 ( float8_weighted_accum PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 1022 "1022 701 701" _null_ _null_ _null_ _null_ _null_ float8_weighted_accum _null_ _null_ _null_ ));
+DESCR("aggregate transition function");
DATA(insert OID = 3342 ( float8_regr_combine PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 1022 "1022 1022" _null_ _null_ _null_ _null_ _null_ float8_regr_combine _null_ _null_ _null_ ));
DESCR("aggregate combine function");
DATA(insert OID = 2807 ( float8_regr_sxx PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_regr_sxx _null_ _null_ _null_ ));
@@ -3204,6 +3212,8 @@ DATA(insert OID = 2104 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 701
DESCR("the average (arithmetic mean) as float8 of all float4 values");
DATA(insert OID = 2105 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 701 "701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("the average (arithmetic mean) as float8 of all float8 values");
+DATA(insert OID = 3998 ( weighted_avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("the weighted average (arithmetic mean) as float8 of all float8 values");
DATA(insert OID = 2106 ( avg PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1186 "1186" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("the average (arithmetic mean) as interval of all interval values");
@@ -3364,6 +3374,8 @@ DATA(insert OID = 2728 ( stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0
DESCR("population standard deviation of float8 input values");
DATA(insert OID = 2729 ( stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "1700" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("population standard deviation of numeric input values");
+DATA(insert OID = 4032 ( weighted_stddev_pop PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("population weighted standard deviation of float8 input values");
DATA(insert OID = 2712 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "20" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("sample standard deviation of bigint input values");
@@ -3377,6 +3389,8 @@ DATA(insert OID = 2716 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1
DESCR("sample standard deviation of float8 input values");
DATA(insert OID = 2717 ( stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "1700" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("sample standard deviation of numeric input values");
+DATA(insert OID = 4101 ( weighted_stddev_samp PGNSP PGUID 12 1 0 0 0 t f f f f f i s 2 0 701 "701 701" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+DESCR("sample weighted standard deviation of float8 input values");
DATA(insert OID = 2154 ( stddev PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 1700 "20" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("historical alias for stddev_samp");
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 01976a1..1c1d0d4 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -429,8 +429,12 @@ extern Datum drandom(PG_FUNCTION_ARGS);
extern Datum setseed(PG_FUNCTION_ARGS);
extern Datum float8_combine(PG_FUNCTION_ARGS);
extern Datum float8_accum(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_accum(PG_FUNCTION_ARGS);
extern Datum float4_accum(PG_FUNCTION_ARGS);
extern Datum float8_avg(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_avg(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_stddev_pop(PG_FUNCTION_ARGS);
+extern Datum float8_weighted_stddev_samp(PG_FUNCTION_ARGS);
extern Datum float8_var_pop(PG_FUNCTION_ARGS);
extern Datum float8_var_samp(PG_FUNCTION_ARGS);
extern Datum float8_stddev_pop(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 3ff6691..c12ea3b 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -247,6 +247,18 @@ SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
653.62895538751 | 871.505273850014
(1 row)
+SELECT weighted_avg(a, b) FROM aggtest;
+ weighted_avg
+------------------
+ 55.5553072763149
+(1 row)
+
+SELECT weighted_stddev_pop(a, b), weighted_stddev_samp(a, b) FROM aggtest;
+ weighted_stddev_pop | weighted_stddev_samp
+---------------------+----------------------
+ 24.3364627240769 | 28.1013266097382
+(1 row)
+
SELECT corr(b, a) FROM aggtest;
corr
-------------------
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 80ef14c..6f236a1 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -60,6 +60,8 @@ SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
SELECT regr_r2(b, a) FROM aggtest;
SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
+SELECT weighted_avg(a, b) FROM aggtest;
+SELECT weighted_stddev_pop(a, b), weighted_stddev_samp(a, b) FROM aggtest;
SELECT corr(b, a) FROM aggtest;
SELECT count(four) AS cnt_1000 FROM onek;
On Fri, Mar 18, 2016 at 06:12:12PM -0700, Jeff Janes wrote:
On Tue, Mar 15, 2016 at 8:36 AM, David Fetter <david@fetter.org> wrote:
Please find attached a patch that uses the float8 version to cover the
numeric types.Is there a well-defined meaning for having a negative weight? If no,
should it be disallowed?
Done.
Shouldn't these then give the same result:
select stddev_samp(val) from foo;
stddev_samp
-------------------
2887.054977297105select weighted_stddev_samp(val,count) from foo2;
weighted_stddev_samp
----------------------
2887.19919651336The 5th digit seems too early to be seeing round-off error.
Fixed down-thread.
Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers