From 11a33fba2d02f3f76f1202735f87f4e27416253b Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Sun, 31 Aug 2025 15:23:18 +0300 Subject: [PATCH v1] Add xicorr(X, Y) --- doc/src/sgml/func/func-aggregate.sgml | 28 ++++ src/backend/utils/adt/float.c | 156 +++++++++++++++++++++++ src/include/catalog/pg_aggregate.dat | 2 + src/include/catalog/pg_proc.dat | 9 ++ src/test/regress/expected/aggregates.out | 10 ++ src/test/regress/sql/aggregates.sql | 5 + 6 files changed, 210 insertions(+) diff --git a/doc/src/sgml/func/func-aggregate.sgml b/doc/src/sgml/func/func-aggregate.sgml index f50b692516b..4de10a5cb30 100644 --- a/doc/src/sgml/func/func-aggregate.sgml +++ b/doc/src/sgml/func/func-aggregate.sgml @@ -1091,6 +1091,34 @@ SELECT count(*) FROM sometable; Yes + + + + + xi correlation + + + xicorr + + xicorr ( Y double precision, X double precision ) + double precision + + + Computes Chaterjee's Χi (ξ) correlation coefficient. + The xi correlation coefficient is a measure of association between two variables; + the value tends to be close to zero when the variables + are independent and close to 1 when there is a strong association. + Unlike other correlation coefficients, + the xi correlation is effective even when the association is not monotonic. + Note that this statistic is non-symmetric by design. + Tied values of x are ordered + deterministically by their y + values. This differs from some software which breaks ties arbitrarily. + With small samples, the sample coefficient can be slightly negative. + This implementation clamps such values to 0. + + Yes + diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c index 7b97d2be6ca..990430c4c53 100644 --- a/src/backend/utils/adt/float.c +++ b/src/backend/utils/adt/float.c @@ -3754,6 +3754,162 @@ float8_corr(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(Sxy / sqrt(Sxx * Syy)); } +/* xicorr(X, Y) infrastructure */ + +typedef struct XiState +{ + int n; /* number of rows */ + int cap; /* allocated capacity */ + double *xs; + double *ys; +} XiState; + +Datum +xicorr_transfn(PG_FUNCTION_ARGS) +{ + MemoryContext aggctx; + XiState *st; + int newcap; + double x = PG_GETARG_FLOAT8(1); + double y = PG_GETARG_FLOAT8(2); + + if (!AggCheckCallContext(fcinfo, &aggctx)) + elog(ERROR, "must be called as aggregate"); + + if (PG_ARGISNULL(0)) + { + /* first call */ + st = (XiState *) MemoryContextAllocZero(aggctx, sizeof(XiState)); + st->cap = 64; + st->xs = (double *) MemoryContextAlloc(aggctx, sizeof(double)*st->cap); + st->ys = (double *) MemoryContextAlloc(aggctx, sizeof(double)*st->cap); + st->n = 0; + } + else + { + st = (XiState *) PG_GETARG_POINTER(0); + } + + if (PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_POINTER(st); + + /* grow if needed */ + if (st->n >= st->cap) + { + newcap = st->cap * 2; + st->xs = (double *) repalloc(st->xs, sizeof(double)*newcap); + st->ys = (double *) repalloc(st->ys, sizeof(double)*newcap); + st->cap = newcap; + } + + st->xs[st->n] = x; + st->ys[st->n] = y; + st->n++; + + PG_RETURN_POINTER(st); +} + +/* Compare indices by Y */ +static int +cmp_y(const void *a, const void *b, void *arg) +{ + const XiState *st = (const XiState *) arg; + int ia = *(const int *)a; + int ib = *(const int *)b; + + if (st->ys[ia] < st->ys[ib]) return -1; + if (st->ys[ia] > st->ys[ib]) return 1; + return 0; +} + +/* Compare indices by X (break ties with Y) */ +static int +cmp_x_then_y(const void *a, const void *b, void *arg) +{ + const XiState *st = (const XiState *) arg; + int ia = *(const int *)a; + int ib = *(const int *)b; + + if (st->xs[ia] < st->xs[ib]) return -1; + if (st->xs[ia] > st->xs[ib]) return 1; + + if (st->ys[ia] < st->ys[ib]) return -1; + if (st->ys[ia] > st->ys[ib]) return 1; + + return 0; +} + +Datum +float8_xicorr(PG_FUNCTION_ARGS) +{ + XiState *st; + int n, j; + int *ix, *iy; + double *rankY; + double avg, S, denom, xi; + + if (PG_ARGISNULL(0)) + PG_RETURN_NULL(); + + st = (XiState *) PG_GETARG_POINTER(0); + n = st->n; + + if (n < 2) + PG_RETURN_NULL(); + + /* --- 1. rank Y --- */ + iy = palloc(sizeof(int) * n); + for (int i = 0; i < n; i++) iy[i] = i; + + qsort_arg(iy, n, sizeof(int), cmp_y, st); + + rankY = palloc(sizeof(double) * n); + + for (int i = 0; i < n; ) + { + j = i + 1; + while (j < n && st->ys[iy[j]] == st->ys[iy[i]]) + j++; + + /* average rank for ties */ + avg = ((double)(i + 1) + (double)j) / 2.0; + for (int k = i; k < j; k++) + rankY[iy[k]] = avg; + + i = j; + } + + /* --- 2. order indices by X (break ties by Y) --- */ + ix = palloc(sizeof(int) * n); + for (int i = 0; i < n; i++) ix[i] = i; + + qsort_arg(ix, n, sizeof(int), cmp_x_then_y, st); + + /* --- 3. sum successive rank differences --- */ + S = 0.0; + for (int t = 0; t < n - 1; t++) + { + int i0 = ix[t]; + int i1 = ix[t + 1]; + S += fabsl(rankY[i1] - rankY[i0]); + } + + /* --- 4. coefficient --- */ + denom = (double)n * (double)n - 1.0L; + if (denom <= 0.0L) + PG_RETURN_NULL(); + + xi = 1.0 - (3.0 * S) / denom; + + /* Clamp to [0,1] if desired */ + if (xi < 0.0) + xi = 0.0; + else if (xi > 1.0) + xi = 1.0; + + PG_RETURN_FLOAT8(xi); +} + Datum float8_regr_r2(PG_FUNCTION_ARGS) { diff --git a/src/include/catalog/pg_aggregate.dat b/src/include/catalog/pg_aggregate.dat index d6aa1f6ec47..4a69ec2486a 100644 --- a/src/include/catalog/pg_aggregate.dat +++ b/src/include/catalog/pg_aggregate.dat @@ -506,6 +506,8 @@ { aggfnoid => 'corr', aggtransfn => 'float8_regr_accum', aggfinalfn => 'float8_corr', aggcombinefn => 'float8_regr_combine', aggtranstype => '_float8', agginitval => '{0,0,0,0,0,0}' }, +{ aggfnoid => 'xicorr', aggtransfn => 'xicorr_transfn', + aggfinalfn => 'float8_xicorr', aggtranstype => 'internal'}, # boolean-and and boolean-or { aggfnoid => 'bool_and', aggtransfn => 'booland_statefunc', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 118d6da1ace..715f3ae04cb 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5165,6 +5165,12 @@ { oid => '2817', descr => 'aggregate final function', proname => 'float8_corr', prorettype => 'float8', proargtypes => '_float8', prosrc => 'float8_corr' }, +{ oid => '2775', descr => 'xicorr transition function', prokind => 'f', + proname => 'xicorr_transfn', prorettype => 'internal', proargtypes => 'internal float8 float8', + prosrc => 'xicorr_transfn', proisstrict => 'f', provolatile => 'i', proparallel => 's' }, +{ oid => '3814', descr => 'aggregate final function', prokind => 'f', + proname => 'float8_xicorr', prorettype => 'float8', proargtypes => 'internal', + prosrc => 'float8_xicorr', proisstrict => 't', provolatile => 'i', proparallel => 's' }, { oid => '3535', descr => 'aggregate transition function', proname => 'string_agg_transfn', proisstrict => 'f', prorettype => 'internal', @@ -7311,6 +7317,9 @@ { oid => '2829', descr => 'correlation coefficient', proname => 'corr', prokind => 'a', proisstrict => 'f', prorettype => 'float8', proargtypes => 'float8 float8', prosrc => 'aggregate_dummy' }, +{ oid => '3063', descr => 'xi correlation coefficient', + proname => 'xicorr', prokind => 'a', proisstrict => 'f', prorettype => 'float8', + proargtypes => 'float8 float8', prosrc => 'aggregate_dummy' }, { oid => '2160', proname => 'text_pattern_lt', proleakproof => 't', prorettype => 'bool', diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index c35288eecde..69969859ea9 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -496,6 +496,16 @@ SELECT corr(b, a) FROM aggtest; 0.139634516517873 (1 row) +CREATE TEMPORARY TABLE bigger_aggtest AS +SELECT g AS a, + g*g + 2 AS b +FROM generate_series(1,100) g; +SELECT xicorr(b, a) FROM bigger_aggtest; + xicorr +------------------ + 0.97029702970297 +(1 row) + -- check single-tuple behavior SELECT covar_pop(1::float8,2::float8), covar_samp(3::float8,4::float8); covar_pop | covar_samp diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 62540b1ffa4..7714fdff281 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -134,6 +134,11 @@ SELECT regr_r2(b, a) FROM aggtest; SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest; SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest; SELECT corr(b, a) FROM aggtest; +CREATE TEMPORARY TABLE bigger_aggtest AS +SELECT g AS a, + g*g + 2 AS b +FROM generate_series(1,100) g; +SELECT xicorr(b, a) FROM bigger_aggtest; -- check single-tuple behavior SELECT covar_pop(1::float8,2::float8), covar_samp(3::float8,4::float8); -- 2.39.5 (Apple Git-154)