Implement missing join selectivity estimation for range types
Hi,
Given a query:
SELECT * FROM t1, t2 WHERE t1.r << t2.r
where t1.r, t2.r are of range type,
currently PostgreSQL will estimate a constant selectivity for the << predicate,
which is equal to 0.005, not utilizing the statistics that the optimizer
collects for range attributes.
We have worked out a theory for inequality join selectivity estimation
(http://arxiv.org/abs/2206.07396), and implemented it for range
types it in this patch.
The algorithm in this patch re-uses the currently collected statistics for
range types, which is the bounds histogram. It works fairly accurate for the
operations <<, >>, &&, &<, &>, <=, >= with estimation error of about 0.5%.
The patch also implements selectivity estimation for the
operations @>, <@ (contains and is contained in), but their accuracy is not
stable, since the bounds histograms assume independence between the range
bounds. A point to discuss is whether or not to keep these last two operations.
The patch also includes the selectivity estimation for multirange types,
treating a multirange as a single range which is its bounding box.
The same algorithm in this patch is applicable to inequality joins of scalar
types. We, however, don't implement it for scalars, since more work is needed
to make use of the other statistics available for scalars, such as the MCV.
This is left as a future work.
--
Mahmoud SAKR - Univeristé Libre de Bruxelles
This work is done by Diogo Repas, Zhicheng Luo, Maxime Schoemans, and myself
Attachments:
v1-0001-Join-Selectivity-Estimation-for-Range-types.patchtext/x-patch; charset=US-ASCII; name=v1-0001-Join-Selectivity-Estimation-for-Range-types.patchDownload
diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c b/src/backend/utils/adt/multirangetypes_selfuncs.c
index 919c8889d4..7ba4aa8b04 100644
--- a/src/backend/utils/adt/multirangetypes_selfuncs.c
+++ b/src/backend/utils/adt/multirangetypes_selfuncs.c
@@ -1335,3 +1335,511 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache,
return sum_frac;
}
+
+/*
+ * This function is a copy of the function with the same name in
+ * rangetypes_selfuncs.c, with the only difference that the types are
+ * multiranges
+ *
+ */
+static double
+calc_hist_join_selectivity(TypeCacheEntry *typcache,
+ const RangeBound *hist1, int nhist1,
+ const RangeBound *hist2, int nhist2)
+{
+ int i,
+ j;
+ double selectivity,
+ cur_sel1,
+ cur_sel2,
+ prev_sel1,
+ prev_sel2;
+ RangeBound cur_sync;
+
+ /*
+ * Histograms will never be empty. In fact, a histogram will never have
+ * less than 2 values (1 bin)
+ */
+ Assert(nhist1 > 1);
+ Assert(nhist2 > 1);
+
+ /* Fast-forwards i and j to start of iteration */
+ for (i = 0; range_cmp_bound_values(typcache, &hist1[i], &hist2[0]) < 0; i++);
+ for (j = 0; range_cmp_bound_values(typcache, &hist2[j], &hist1[0]) < 0; j++);
+
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ prev_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ prev_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ /*
+ * Do the estimation on overlapping region
+ */
+ selectivity = 0.0;
+ while (i < nhist1 && j < nhist2)
+ {
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ cur_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ cur_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ selectivity += (prev_sel1 + cur_sel1) * (cur_sel2 - prev_sel2);
+
+ /* Prepare for the next iteration */
+ prev_sel1 = cur_sel1;
+ prev_sel2 = cur_sel2;
+ }
+
+ /* Include remainder of hist2 if any */
+ if (j < nhist2)
+ selectivity += 1 - prev_sel2;
+
+ return selectivity / 2;
+}
+
+/*
+ * multirangejoinsel -- join cardinality for multirange operators
+ */
+Datum
+multirangejoinsel(PG_FUNCTION_ARGS)
+{
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ Oid operator = PG_GETARG_OID(1);
+ List *args = (List *) PG_GETARG_POINTER(2);
+ SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
+ VariableStatData vardata1,
+ vardata2;
+ AttStatsSlot hist1,
+ hist2,
+ sslot;
+ bool reversed;
+ Selectivity selec;
+ TypeCacheEntry *typcache = NULL,
+ *rng_typcache = NULL;
+ Form_pg_statistic stats1,
+ stats2;
+ double empty_frac1,
+ empty_frac2,
+ null_frac1,
+ null_frac2;
+ int nhist1,
+ nhist2;
+ RangeBound *hist1_lower,
+ *hist1_upper,
+ *hist2_lower,
+ *hist2_upper;
+ bool empty;
+ int i;
+
+ get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &reversed);
+
+ selec = default_multirange_selectivity(operator);
+
+ /* get multirange type cache */
+ if (type_is_multirange(vardata1.vartype))
+ typcache = multirange_get_typcache(fcinfo, vardata1.vartype);
+ else if (type_is_multirange(vardata2.vartype))
+ typcache = multirange_get_typcache(fcinfo, vardata2.vartype);
+
+ if (HeapTupleIsValid(vardata1.statsTuple) &&
+ get_attstatsslot(&hist1, vardata1.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ HeapTupleIsValid(vardata2.statsTuple) &&
+ get_attstatsslot(&hist2, vardata2.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ typcache)
+ {
+
+ /* Initialize underlying range type cache */
+ rng_typcache = typcache->rngtype;
+
+ /*
+ * First look up the fraction of NULLs and empty ranges from
+ * pg_statistic.
+ */
+ stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
+ stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
+
+ null_frac1 = stats1->stanullfrac;
+ null_frac2 = stats2->stanullfrac;
+
+ /* Try to get fraction of empty ranges for the first variable */
+ if (get_attstatsslot(&sslot, vardata1.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac1 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac1 = 0.0;
+ }
+
+ /* Try to get fraction of empty ranges for the second variable */
+ if (get_attstatsslot(&sslot, vardata2.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac2 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac2 = 0.0;
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the first variable.
+ */
+ nhist1 = hist1.nvalues;
+ hist1_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ hist1_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ for (i = 0; i < nhist1; i++)
+ {
+ range_deserialize(rng_typcache, DatumGetRangeTypeP(hist1.values[i]),
+ &hist1_lower[i], &hist1_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the second variable.
+ */
+ nhist2 = hist2.nvalues;
+ hist2_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ hist2_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ for (i = 0; i < nhist2; i++)
+ {
+ range_deserialize(rng_typcache, DatumGetRangeTypeP(hist2.values[i]),
+ &hist2_lower[i], &hist2_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ switch (operator)
+ {
+ case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
+ case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
+
+ /*
+ * Selectivity of A && B = Selectivity of NOT( A << B || A >>
+ * B ) = 1 - Selectivity of (A.upper < B.lower) - Selectivity
+ * of (B.upper < A.lower)
+ */
+ selec = 1;
+ selec -= calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ selec -= calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LESS_EQUAL_OP:
+
+ /*
+ * A <= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_GREATER_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to subtract P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LESS_OP:
+
+ /*
+ * A < B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_EQUAL_OP:
+
+ /*
+ * A >= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_LESS_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_OP:
+
+ /*
+ * A > B == B < A
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_RANGE_OP:
+ case OID_RANGE_LEFT_MULTIRANGE_OP:
+ /* var1 << var2 when upper(var1) < lower(var2) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_RANGE_OP:
+ case OID_RANGE_RIGHT_MULTIRANGE_OP:
+ /* var1 >> var2 when upper(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ /* var1 &< var2 when upper(var1) < upper(var2) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ /* var1 &> var2 when lower(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
+ case OID_MULTIRANGE_RANGE_CONTAINED_OP:
+ case OID_RANGE_MULTIRANGE_CONTAINED_OP:
+
+ /*
+ * var1 <@ var2 is equivalent to lower(var2) <= lower(var1)
+ * and upper(var1) <= upper(var2)
+ *
+ * After negating both sides we get not( lower(var1) <
+ * lower(var2) ) and not( upper(var2) < upper(var1) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ selec *= 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_upper, nhist1);
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_CONTAINS_RANGE_OP:
+ case OID_RANGE_CONTAINS_MULTIRANGE_OP:
+
+ /*
+ * var1 @> var2 is equivalent to lower(var1) <= lower(var2)
+ * and upper(var2) <= upper(var1)
+ *
+ * After negating both sides we get not( lower(var2) <
+ * lower(var1) ) and not( upper(var1) < upper(var2) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ selec *= 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_RANGE_OP:
+ case OID_RANGE_ADJACENT_MULTIRANGE_OP:
+
+ /*
+ * just punt for now, estimation would require equality
+ * selectivity for bounds
+ */
+ case OID_MULTIRANGE_CONTAINS_ELEM_OP:
+ case OID_MULTIRANGE_ELEM_CONTAINED_OP:
+
+ /*
+ * just punt for now, estimation would require extraction of
+ * histograms for the anyelement
+ */
+ default:
+ break;
+ }
+
+
+ /* the calculated selectivity only applies to non-empty (multi)ranges */
+ selec *= (1 - empty_frac1) * (1 - empty_frac2);
+
+ /*
+ * Depending on the operator, empty (multi)ranges might match
+ * different fractions of the result.
+ */
+ switch (operator)
+ {
+ case OID_MULTIRANGE_LESS_OP:
+
+ /*
+ * empty (multi)range < non-empty (multi)range
+ */
+ selec += empty_frac1 * (1 - empty_frac2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_OP:
+
+ /*
+ * non-empty (multi)range > empty (multi)range
+ */
+ selec += (1 - empty_frac1) * empty_frac2;
+ break;
+
+ case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
+ case OID_MULTIRANGE_RANGE_CONTAINED_OP:
+ case OID_RANGE_MULTIRANGE_CONTAINED_OP:
+
+ /*
+ * empty (multi)range <@ any (multi)range
+ */
+ case OID_MULTIRANGE_LESS_EQUAL_OP:
+
+ /*
+ * empty (multi)range <= any (multi)range
+ */
+ selec += empty_frac1;
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_CONTAINS_RANGE_OP:
+ case OID_RANGE_CONTAINS_MULTIRANGE_OP:
+
+ /*
+ * any (multi)range @> empty (multi)range
+ */
+ case OID_MULTIRANGE_GREATER_EQUAL_OP:
+
+ /*
+ * any (multi)range >= empty (multi)range
+ */
+ selec += empty_frac2;
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_ELEM_OP:
+ case OID_MULTIRANGE_ELEM_CONTAINED_OP:
+ case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
+ case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_RANGE_OP:
+ case OID_RANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_RANGE_OP:
+ case OID_RANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_RANGE_OP:
+ case OID_RANGE_ADJACENT_MULTIRANGE_OP:
+ default:
+
+ /*
+ * these operators always return false when an empty
+ * (multi)range is involved
+ */
+ break;
+
+ }
+
+ /* all range operators are strict */
+ selec *= (1 - null_frac1) * (1 - null_frac2);
+
+ free_attstatsslot(&hist1);
+ free_attstatsslot(&hist2);
+ }
+
+ ReleaseVariableStats(vardata1);
+ ReleaseVariableStats(vardata2);
+
+ CLAMP_PROBABILITY(selec);
+
+ PG_RETURN_FLOAT8((float8) selec);
+
+}
diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c
index c2795f4593..007e14bcf6 100644
--- a/src/backend/utils/adt/rangetypes_selfuncs.c
+++ b/src/backend/utils/adt/rangetypes_selfuncs.c
@@ -1221,3 +1221,509 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache,
return sum_frac;
}
+
+/*
+ * This is a utility function used to estimate the join selectivity of
+ * range attributes using rangebound histogram statistics as described
+ * in this paper:
+ *
+ * Diogo Repas, Zhicheng Luo, Maxime Schoemans and Mahmoud Sakr, 2022
+ * Selectivity Estimation of Inequality Joins In Databases
+ * https://doi.org/10.48550/arXiv.2206.07396
+ *
+ * The attributes being joined will be treated as random variables
+ * that follow a distribution modeled by a Probability Density Function (PDF).
+ * Let the two attributes be denoted X, Y.
+ * This function finds the probability P(X < Y).
+ * Note that the PDFs of the two variables can easily be obtained
+ * from their bounds histogram, respectively hist1 and hist2 .
+ *
+ * Let the PDF of X, Y be denoted as f_X, f_Y.
+ * The probability P(X < Y) can be formalized as follows:
+ * P(X < Y)= integral_-inf^inf( integral_-inf^y ( f_X(x) * f_Y(y) dx dy ) )
+ * = integral_-inf^inf( F_X(y) * f_Y(y) dy )
+ * where F_X(y) denote the Cumulative Distribution Function of X at y.
+ * Note that F_X is the selectivity estimation (non-join),
+ * which is implemented using the function calc_hist_selectivity_scalar.
+ *
+ * Now given the histograms of the two attributes X, Y, we note the following:
+ * - The PDF of Y is a step function
+ * (constant piece-wise, where each piece is defined in a bin of Y's histogram)
+ * - The CDF of X is linear piece-wise
+ * (each piece is defined in a bin of X's histogram)
+ * This leads to the conclusion that their product
+ * (used to calculate the equation above) is also linear piece-wise.
+ * A new piece starts whenever either the bin of X or the bin of Y changes.
+ * By parallel scanning the two rangebound histograms of X and Y,
+ * we evaluate one piece of the result between every two consecutive rangebounds
+ * in the union of the two histograms.
+ *
+ * Given that the product F_X * f_y is linear in the interval
+ * between every two consecutive rangebounds, let them be denoted prev, cur,
+ * it can be shown that the above formula can be discretized into the following:
+ * P(X < Y) =
+ * 0.5 * sum_0^{n+m-1} ( ( F_X(prev) + F_X(cur) ) * ( F_Y(cur) - F_Y(prev) ) )
+ * where n, m are the lengths of the two histograms.
+ *
+ * As such, it is possible to fully compute the join selectivity
+ * as a summation of CDFs, iterating over the bounds of the two histograms.
+ * This maximizes the code reuse, since the CDF is computed using
+ * the calc_hist_selectivity_scalar function, which is the function used
+ * for selectivity estimation (non-joins).
+ *
+ */
+static double
+calc_hist_join_selectivity(TypeCacheEntry *typcache,
+ const RangeBound *hist1, int nhist1,
+ const RangeBound *hist2, int nhist2)
+{
+ int i,
+ j;
+ double selectivity,
+ cur_sel1,
+ cur_sel2,
+ prev_sel1,
+ prev_sel2;
+ RangeBound cur_sync;
+
+ /*
+ * Histograms will never be empty. In fact, a histogram will never have
+ * less than 2 values (1 bin)
+ */
+ Assert(nhist1 > 1);
+ Assert(nhist2 > 1);
+
+ /* Fast-forwards i and j to start of iteration */
+ for (i = 0; range_cmp_bound_values(typcache, &hist1[i], &hist2[0]) < 0; i++);
+ for (j = 0; range_cmp_bound_values(typcache, &hist2[j], &hist1[0]) < 0; j++);
+
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ prev_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ prev_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ /*
+ * Do the estimation on overlapping region
+ */
+ selectivity = 0.0;
+ while (i < nhist1 && j < nhist2)
+ {
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ cur_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ cur_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ selectivity += (prev_sel1 + cur_sel1) * (cur_sel2 - prev_sel2);
+
+ /* Prepare for the next iteration */
+ prev_sel1 = cur_sel1;
+ prev_sel2 = cur_sel2;
+ }
+
+ /* Include remainder of hist2 if any */
+ if (j < nhist2)
+ selectivity += 1 - prev_sel2;
+
+ return selectivity / 2;
+}
+
+/*
+ * rangejoinsel -- join cardinality for range operators
+ */
+Datum
+rangejoinsel(PG_FUNCTION_ARGS)
+{
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ Oid operator = PG_GETARG_OID(1);
+ List *args = (List *) PG_GETARG_POINTER(2);
+ SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
+ VariableStatData vardata1,
+ vardata2;
+ AttStatsSlot hist1,
+ hist2,
+ sslot;
+ bool reversed;
+ Selectivity selec;
+ TypeCacheEntry *typcache = NULL;
+ Form_pg_statistic stats1,
+ stats2;
+ double empty_frac1,
+ empty_frac2,
+ null_frac1,
+ null_frac2;
+ int nhist1,
+ nhist2;
+ RangeBound *hist1_lower,
+ *hist1_upper,
+ *hist2_lower,
+ *hist2_upper;
+ bool empty;
+ int i;
+
+ get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &reversed);
+
+ selec = default_range_selectivity(operator);
+
+ if (HeapTupleIsValid(vardata1.statsTuple) &&
+ get_attstatsslot(&hist1, vardata1.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ HeapTupleIsValid(vardata2.statsTuple) &&
+ get_attstatsslot(&hist2, vardata2.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ vardata1.vartype == vardata2.vartype)
+ {
+
+ /* Initialize type cache */
+ typcache = range_get_typcache(fcinfo, vardata1.vartype);
+
+ /*
+ * First look up the fraction of NULLs and empty ranges from
+ * pg_statistic.
+ */
+ stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
+ stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
+
+ null_frac1 = stats1->stanullfrac;
+ null_frac2 = stats2->stanullfrac;
+
+ /* Try to get fraction of empty ranges for the first variable */
+ if (get_attstatsslot(&sslot, vardata1.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac1 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac1 = 0.0;
+ }
+
+ /* Try to get fraction of empty ranges for the second variable */
+ if (get_attstatsslot(&sslot, vardata2.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac2 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac2 = 0.0;
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the first variable.
+ */
+ nhist1 = hist1.nvalues;
+ hist1_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ hist1_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ for (i = 0; i < nhist1; i++)
+ {
+ range_deserialize(typcache, DatumGetRangeTypeP(hist1.values[i]),
+ &hist1_lower[i], &hist1_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the second variable.
+ */
+ nhist2 = hist2.nvalues;
+ hist2_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ hist2_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ for (i = 0; i < nhist2; i++)
+ {
+ range_deserialize(typcache, DatumGetRangeTypeP(hist2.values[i]),
+ &hist2_lower[i], &hist2_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ switch (operator)
+ {
+ case OID_RANGE_OVERLAP_OP:
+
+ /*
+ * Selectivity of A && B = Selectivity of NOT( A << B || A >>
+ * B ) = 1 - Selectivity of (A.upper < B.lower) - Selectivity
+ * of (B.upper < A.lower)
+ */
+ selec = 1;
+ selec -= calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ selec -= calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LESS_EQUAL_OP:
+
+ /*
+ * A <= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_GREATER_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to subtract P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LESS_OP:
+
+ /*
+ * A < B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_GREATER_EQUAL_OP:
+
+ /*
+ * A >= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_LESS_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_GREATER_OP:
+
+ /*
+ * A > B == B < A
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LEFT_OP:
+ /* var1 << var2 when upper(var1) < lower(var2) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_RIGHT_OP:
+ /* var1 >> var2 when upper(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_OVERLAPS_LEFT_OP:
+ /* var1 &< var2 when upper(var1) < upper(var2) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_RANGE_OVERLAPS_RIGHT_OP:
+ /* var1 &> var2 when lower(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_CONTAINED_OP:
+
+ /*
+ * var1 <@ var2 is equivalent to lower(var2) <= lower(var1)
+ * and upper(var1) <= upper(var2)
+ *
+ * After negating both sides we get not( lower(var1) <
+ * lower(var2) ) and not( upper(var2) < upper(var1) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ selec *= 1 - calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_upper, nhist1);
+ break;
+
+ case OID_RANGE_CONTAINS_OP:
+
+ /*
+ * var1 @> var2 is equivalent to lower(var1) <= lower(var2)
+ * and upper(var2) <= upper(var1)
+ *
+ * After negating both sides we get not( lower(var2) <
+ * lower(var1) ) and not( upper(var1) < upper(var2) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ selec *= 1 - calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_RANGE_CONTAINS_ELEM_OP:
+ case OID_RANGE_ELEM_CONTAINED_OP:
+
+ /*
+ * just punt for now, estimation would require extraction of
+ * histograms for the anyelement
+ */
+ default:
+ break;
+ }
+
+
+ /* the calculated selectivity only applies to non-empty ranges */
+ selec *= (1 - empty_frac1) * (1 - empty_frac2);
+
+ /*
+ * Depending on the operator, empty ranges might match different
+ * fractions of the result.
+ */
+ switch (operator)
+ {
+ case OID_RANGE_LESS_OP:
+
+ /*
+ * empty range < non-empty range
+ */
+ selec += empty_frac1 * (1 - empty_frac2);
+ break;
+
+ case OID_RANGE_GREATER_OP:
+
+ /*
+ * non-empty range > empty range
+ */
+ selec += (1 - empty_frac1) * empty_frac2;
+ break;
+
+ case OID_RANGE_CONTAINED_OP:
+
+ /*
+ * empty range <@ any range
+ */
+ case OID_RANGE_LESS_EQUAL_OP:
+
+ /*
+ * empty range <= any range
+ */
+ selec += empty_frac1;
+ break;
+
+ case OID_RANGE_CONTAINS_OP:
+
+ /*
+ * any range @> empty range
+ */
+ case OID_RANGE_GREATER_EQUAL_OP:
+
+ /*
+ * any range >= empty range
+ */
+ selec += empty_frac2;
+ break;
+
+ case OID_RANGE_CONTAINS_ELEM_OP:
+ case OID_RANGE_ELEM_CONTAINED_OP:
+ case OID_RANGE_OVERLAP_OP:
+ case OID_RANGE_OVERLAPS_LEFT_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_OP:
+ case OID_RANGE_LEFT_OP:
+ case OID_RANGE_RIGHT_OP:
+ default:
+
+ /*
+ * these operators always return false when an empty range is
+ * involved
+ */
+ break;
+
+ }
+
+ /* all range operators are strict */
+ selec *= (1 - null_frac1) * (1 - null_frac2);
+
+ free_attstatsslot(&hist1);
+ free_attstatsslot(&hist2);
+ }
+
+ ReleaseVariableStats(vardata1);
+ ReleaseVariableStats(vardata2);
+
+ CLAMP_PROBABILITY(selec);
+
+ PG_RETURN_FLOAT8((float8) selec);
+
+}
diff --git a/src/include/catalog/pg_operator.dat b/src/include/catalog/pg_operator.dat
index bc5f8213f3..b63a7e15af 100644
--- a/src/include/catalog/pg_operator.dat
+++ b/src/include/catalog/pg_operator.dat
@@ -3071,78 +3071,78 @@
oprname => '<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>(anyrange,anyrange)',
oprnegate => '>=(anyrange,anyrange)', oprcode => 'range_lt',
- oprrest => 'rangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3885', oid_symbol => 'OID_RANGE_LESS_EQUAL_OP',
descr => 'less than or equal',
oprname => '<=', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>=(anyrange,anyrange)',
oprnegate => '>(anyrange,anyrange)', oprcode => 'range_le',
- oprrest => 'rangesel', oprjoin => 'scalarlejoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3886', oid_symbol => 'OID_RANGE_GREATER_EQUAL_OP',
descr => 'greater than or equal',
oprname => '>=', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<=(anyrange,anyrange)',
oprnegate => '<(anyrange,anyrange)', oprcode => 'range_ge',
- oprrest => 'rangesel', oprjoin => 'scalargejoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3887', oid_symbol => 'OID_RANGE_GREATER_OP',
descr => 'greater than',
oprname => '>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<(anyrange,anyrange)',
oprnegate => '<=(anyrange,anyrange)', oprcode => 'range_gt',
- oprrest => 'rangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3888', oid_symbol => 'OID_RANGE_OVERLAP_OP', descr => 'overlaps',
oprname => '&&', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '&&(anyrange,anyrange)',
oprcode => 'range_overlaps', oprrest => 'rangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3889', oid_symbol => 'OID_RANGE_CONTAINS_ELEM_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anyelement',
oprresult => 'bool', oprcom => '<@(anyelement,anyrange)',
oprcode => 'range_contains_elem', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3890', oid_symbol => 'OID_RANGE_CONTAINS_OP', descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<@(anyrange,anyrange)',
oprcode => 'range_contains', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3891', oid_symbol => 'OID_RANGE_ELEM_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyelement', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anyelement)',
oprcode => 'elem_contained_by_range', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3892', oid_symbol => 'OID_RANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anyrange)',
oprcode => 'range_contained_by', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3893', oid_symbol => 'OID_RANGE_LEFT_OP', descr => 'is left of',
oprname => '<<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>>(anyrange,anyrange)',
oprcode => 'range_before', oprrest => 'rangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3894', oid_symbol => 'OID_RANGE_RIGHT_OP', descr => 'is right of',
oprname => '>>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<<(anyrange,anyrange)',
oprcode => 'range_after', oprrest => 'rangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3895', oid_symbol => 'OID_RANGE_OVERLAPS_LEFT_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'range_overleft', oprrest => 'rangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3896', oid_symbol => 'OID_RANGE_OVERLAPS_RIGHT_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'range_overright', oprrest => 'rangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3897', descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '-|-(anyrange,anyrange)',
oprcode => 'range_adjacent', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3898', descr => 'range union',
oprname => '+', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'anyrange', oprcom => '+(anyrange,anyrange)',
@@ -3277,139 +3277,139 @@
oprname => '<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>(anymultirange,anymultirange)',
oprnegate => '>=(anymultirange,anymultirange)', oprcode => 'multirange_lt',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2863', oid_symbol => 'OID_MULTIRANGE_LESS_EQUAL_OP',
descr => 'less than or equal',
oprname => '<=', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>=(anymultirange,anymultirange)',
oprnegate => '>(anymultirange,anymultirange)', oprcode => 'multirange_le',
- oprrest => 'multirangesel', oprjoin => 'scalarlejoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2864', oid_symbol => 'OID_MULTIRANGE_GREATER_EQUAL_OP',
descr => 'greater than or equal',
oprname => '>=', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<=(anymultirange,anymultirange)',
oprnegate => '<(anymultirange,anymultirange)', oprcode => 'multirange_ge',
- oprrest => 'multirangesel', oprjoin => 'scalargejoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2865', oid_symbol => 'OID_MULTIRANGE_GREATER_OP',
descr => 'greater than',
oprname => '>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<(anymultirange,anymultirange)',
oprnegate => '<=(anymultirange,anymultirange)', oprcode => 'multirange_gt',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2866', oid_symbol => 'OID_RANGE_OVERLAPS_MULTIRANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '&&(anymultirange,anyrange)',
oprcode => 'range_overlaps_multirange', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2867', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '&&(anyrange,anymultirange)',
oprcode => 'multirange_overlaps_range', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2868', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '&&(anymultirange,anymultirange)',
oprcode => 'multirange_overlaps_multirange', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2869', oid_symbol => 'OID_MULTIRANGE_CONTAINS_ELEM_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anyelement',
oprresult => 'bool', oprcom => '<@(anyelement,anymultirange)',
oprcode => 'multirange_contains_elem', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2870', oid_symbol => 'OID_MULTIRANGE_CONTAINS_RANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<@(anyrange,anymultirange)',
oprcode => 'multirange_contains_range', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2871', oid_symbol => 'OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<@(anymultirange,anymultirange)',
oprcode => 'multirange_contains_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2872', oid_symbol => 'OID_MULTIRANGE_ELEM_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyelement', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anyelement)',
oprcode => 'elem_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2873', oid_symbol => 'OID_MULTIRANGE_RANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anyrange)',
oprcode => 'range_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2874', oid_symbol => 'OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anymultirange)',
oprcode => 'multirange_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4539', oid_symbol => 'OID_RANGE_CONTAINS_MULTIRANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<@(anymultirange,anyrange)',
oprcode => 'range_contains_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4540', oid_symbol => 'OID_RANGE_MULTIRANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anymultirange)',
oprcode => 'multirange_contained_by_range', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2875', oid_symbol => 'OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'range_overleft_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2876', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'multirange_overleft_range',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2877', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'multirange_overleft_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '3585', oid_symbol => 'OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'range_overright_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4035', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'multirange_overright_range',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4142', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'multirange_overright_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4179', oid_symbol => 'OID_RANGE_ADJACENT_MULTIRANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '-|-(anymultirange,anyrange)',
oprcode => 'range_adjacent_multirange', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4180', oid_symbol => 'OID_MULTIRANGE_ADJACENT_RANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '-|-(anyrange,anymultirange)',
oprcode => 'multirange_adjacent_range', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4198', oid_symbol => 'OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '-|-(anymultirange,anymultirange)',
oprcode => 'multirange_adjacent_multirange', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4392', descr => 'multirange union',
oprname => '+', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'anymultirange', oprcom => '+(anymultirange,anymultirange)',
@@ -3426,36 +3426,36 @@
oprname => '<<', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>>(anymultirange,anyrange)',
oprcode => 'range_before_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4396', oid_symbol => 'OID_MULTIRANGE_LEFT_RANGE_OP',
descr => 'is left of',
oprname => '<<', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>>(anyrange,anymultirange)',
oprcode => 'multirange_before_range', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4397', oid_symbol => 'OID_MULTIRANGE_LEFT_MULTIRANGE_OP',
descr => 'is left of',
oprname => '<<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>>(anymultirange,anymultirange)',
oprcode => 'multirange_before_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4398', oid_symbol => 'OID_RANGE_RIGHT_MULTIRANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<<(anymultirange,anyrange)',
oprcode => 'range_after_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4399', oid_symbol => 'OID_MULTIRANGE_RIGHT_RANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<<(anyrange,anymultirange)',
oprcode => 'multirange_after_range', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4400', oid_symbol => 'OID_MULTIRANGE_RIGHT_MULTIRANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<<(anymultirange,anymultirange)',
oprcode => 'multirange_after_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
]
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 87aa571a33..c1d4119684 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11885,4 +11885,12 @@
prorettype => 'bytea', proargtypes => 'pg_brin_minmax_multi_summary',
prosrc => 'brin_minmax_multi_summary_send' },
+{ oid => '8355', descr => 'join selectivity for range operators',
+ proname => 'rangejoinsel', provolatile => 's', prorettype => 'float8',
+ proargtypes => 'internal oid internal int2 internal',
+ prosrc => 'rangejoinsel' },
+{ oid => '8356', descr => 'join selectivity for multirange operators',
+ proname => 'multirangejoinsel', provolatile => 's', prorettype => 'float8',
+ proargtypes => 'internal oid internal int2 internal',
+ prosrc => 'multirangejoinsel' },
]
diff --git a/src/test/regress/expected/multirangetypes.out b/src/test/regress/expected/multirangetypes.out
index ac2eb84c3a..b0eeb672f0 100644
--- a/src/test/regress/expected/multirangetypes.out
+++ b/src/test/regress/expected/multirangetypes.out
@@ -3330,3 +3330,275 @@ create function mr_table_fail(i anyelement) returns table(i anyelement, r anymul
as $$ select $1, '[1,10]' $$ language sql;
ERROR: cannot determine result data type
DETAIL: A result of type anymultirange requires at least one input of type anyrange or anymultirange.
+-- test multirange join operators
+create table test_multirange_join_1(mr1 int4multirange);
+create table test_multirange_join_2(mr2 int4multirange);
+create table test_range_join(ir int4range);
+create table test_elem_join(elem int4);
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10),int4range(g+20, g+30),int4range(g+40, g+50)) from generate_series(1,200) g;
+insert into test_multirange_join_1 select '{}'::int4multirange from generate_series(1,50) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10000)) from generate_series(1,100) g;
+insert into test_multirange_join_1 select int4multirange(int4range(NULL, g*10, '(]'), int4range(g*10, g*20, '(]')) from generate_series(1,10) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g*10, g*20, '(]'), int4range(g*20, NULL, '[)')) from generate_series(1,10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10),int4range(g+20, g+30),int4range(g+40, g+50)) from generate_series(1,20) g;
+insert into test_multirange_join_2 select '{}'::int4multirange from generate_series(1,5) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10000)) from generate_series(1,10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(NULL, g*10, '(]'), int4range(g*10, g*20, '(]')) from generate_series(1,10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g*10, g*20, '(]'), int4range(g*20, NULL, '[)')) from generate_series(1,10) g;
+insert into test_range_join select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join select int4range(g, g+10000) from generate_series(1,10) g;
+insert into test_range_join select int4range(NULL,g*10,'(]') from generate_series(1,10) g;
+insert into test_range_join select int4range(g*10,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join select 'empty'::int4range from generate_series(1,20) g;
+insert into test_range_join select NULL from generate_series(1,5) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select g+10000 from generate_series(1,10) g;
+insert into test_elem_join select g*10 from generate_series(1,10) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select NULL from generate_series(1,5) g;
+analyze test_multirange_join_1;
+analyze test_multirange_join_2;
+analyze test_range_join;
+analyze test_elem_join;
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 = mr2');
+ estimated | actual
+-----------+--------
+ 55 | 300
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 < mr2');
+ estimated | actual
+-----------+--------
+ 4579 | 4598
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 <= mr2');
+ estimated | actual
+-----------+--------
+ 7309 | 4898
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 > mr2');
+ estimated | actual
+-----------+--------
+ 13041 | 15452
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 >= mr2');
+ estimated | actual
+-----------+--------
+ 15771 | 15752
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 && mr2');
+ estimated | actual
+-----------+--------
+ 11098 | 10932
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 && ir');
+ estimated | actual
+-----------+--------
+ 9611 | 9471
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir && mr2');
+ estimated | actual
+-----------+--------
+ 2924 | 2851
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 <@ mr2');
+ estimated | actual
+-----------+--------
+ 8491 | 7393
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 <@ ir');
+ estimated | actual
+-----------+--------
+ 9754 | 8621
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir <@ mr2');
+ estimated | actual
+-----------+--------
+ 2663 | 1987
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 @> mr2');
+ estimated | actual
+-----------+--------
+ 5022 | 2361
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 @> ir');
+ estimated | actual
+-----------+--------
+ 12473 | 8397
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir @> mr2');
+ estimated | actual
+-----------+--------
+ 1177 | 800
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 << mr2');
+ estimated | actual
+-----------+--------
+ 152 | 181
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 << ir');
+ estimated | actual
+-----------+--------
+ 145 | 170
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir << mr2');
+ estimated | actual
+-----------+--------
+ 478 | 519
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 >> mr2');
+ estimated | actual
+-----------+--------
+ 4750 | 4837
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 >> ir');
+ estimated | actual
+-----------+--------
+ 12644 | 12739
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir >> mr2');
+ estimated | actual
+-----------+--------
+ 98 | 110
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 &< mr2');
+ estimated | actual
+-----------+--------
+ 4868 | 6318
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 &< ir');
+ estimated | actual
+-----------+--------
+ 4120 | 5556
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir &< mr2');
+ estimated | actual
+-----------+--------
+ 1986 | 2627
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 &> mr2');
+ estimated | actual
+-----------+--------
+ 11441 | 13976
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 &> ir');
+ estimated | actual
+-----------+--------
+ 16184 | 19807
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir &> mr2');
+ estimated | actual
+-----------+--------
+ 1819 | 1895
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 -|- mr2');
+ estimated | actual
+-----------+--------
+ 160 | 71
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 -|- ir');
+ estimated | actual
+-----------+--------
+ 224 | 118
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir -|- mr2');
+ estimated | actual
+-----------+--------
+ 35 | 37
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_elem_join, test_multirange_join_1 where elem <@ mr1');
+ estimated | actual
+-----------+--------
+ 120 | 3110
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_elem_join where mr1 @> elem');
+ estimated | actual
+-----------+--------
+ 120 | 3110
+(1 row)
+
+drop function check_estimated_rows;
+drop table test_multirange_join_1;
+drop table test_multirange_join_2;
+drop table test_range_join;
+drop table test_elem_join;
diff --git a/src/test/regress/expected/rangetypes.out b/src/test/regress/expected/rangetypes.out
index 04ccd5d451..10a76dec7a 100644
--- a/src/test/regress/expected/rangetypes.out
+++ b/src/test/regress/expected/rangetypes.out
@@ -1767,3 +1767,157 @@ create function table_fail(i anyelement) returns table(i anyelement, r anyrange)
as $$ select $1, '[1,10]' $$ language sql;
ERROR: cannot determine result data type
DETAIL: A result of type anyrange requires at least one input of type anyrange or anymultirange.
+-- test range join operators
+create table test_range_join_1(ir1 int4range);
+create table test_range_join_2(ir2 int4range);
+create table test_elem_join(elem int4);
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1,200) g;
+insert into test_range_join_1 select int4range(g, g+10000) from generate_series(1,100) g;
+insert into test_range_join_1 select int4range(NULL,g*10,'(]') from generate_series(1,10) g;
+insert into test_range_join_1 select int4range(g*10,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1,200) g;
+insert into test_range_join_1 select 'empty'::int4range from generate_series(1,20) g;
+insert into test_range_join_1 select NULL from generate_series(1,50) g;
+insert into test_range_join_2 select int4range(g+10, g+20) from generate_series(1,20) g;
+insert into test_range_join_2 select int4range(g+5000, g+15000) from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(NULL,g*5,'(]') from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(g*5,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join_2 select 'empty'::int4range from generate_series(1,5) g;
+insert into test_range_join_2 select NULL from generate_series(1,5) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select g+10000 from generate_series(1,10) g;
+insert into test_elem_join select g*10 from generate_series(1,10) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select NULL from generate_series(1,5) g;
+analyze test_range_join_1;
+analyze test_range_join_2;
+analyze test_elem_join;
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 = ir2');
+ estimated | actual
+-----------+--------
+ 75 | 190
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 < ir2');
+ estimated | actual
+-----------+--------
+ 7256 | 9745
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 <= ir2');
+ estimated | actual
+-----------+--------
+ 9986 | 9935
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 > ir2');
+ estimated | actual
+-----------+--------
+ 30514 | 30565
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 >= ir2');
+ estimated | actual
+-----------+--------
+ 33244 | 30755
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 && ir2');
+ estimated | actual
+-----------+--------
+ 9966 | 9720
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 <@ ir2');
+ estimated | actual
+-----------+--------
+ 11868 | 6268
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 @> ir2');
+ estimated | actual
+-----------+--------
+ 8933 | 3973
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 << ir2');
+ estimated | actual
+-----------+--------
+ 5034 | 5050
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 >> ir2');
+ estimated | actual
+-----------+--------
+ 21400 | 21630
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 &< ir2');
+ estimated | actual
+-----------+--------
+ 9665 | 12023
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 &> ir2');
+ estimated | actual
+-----------+--------
+ 27914 | 28105
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 -|- ir2');
+ estimated | actual
+-----------+--------
+ 364 | 233
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_elem_join, test_range_join_1 where elem <@ ir1');
+ estimated | actual
+-----------+--------
+ 192 | 3349
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_elem_join where ir1 @> elem');
+ estimated | actual
+-----------+--------
+ 192 | 3349
+(1 row)
+
+drop function check_estimated_rows;
+drop table test_range_join_1;
+drop table test_range_join_2;
+drop table test_elem_join;
diff --git a/src/test/regress/sql/multirangetypes.sql b/src/test/regress/sql/multirangetypes.sql
index 1abcaeddb5..cb53f90aba 100644
--- a/src/test/regress/sql/multirangetypes.sql
+++ b/src/test/regress/sql/multirangetypes.sql
@@ -854,3 +854,160 @@ create function mr_inoutparam_fail(inout i anyelement, out r anymultirange)
--should fail
create function mr_table_fail(i anyelement) returns table(i anyelement, r anymultirange)
as $$ select $1, '[1,10]' $$ language sql;
+
+-- test multirange join operators
+create table test_multirange_join_1(mr1 int4multirange);
+create table test_multirange_join_2(mr2 int4multirange);
+create table test_range_join(ir int4range);
+create table test_elem_join(elem int4);
+
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10),int4range(g+20, g+30),int4range(g+40, g+50)) from generate_series(1,200) g;
+insert into test_multirange_join_1 select '{}'::int4multirange from generate_series(1,50) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10000)) from generate_series(1,100) g;
+insert into test_multirange_join_1 select int4multirange(int4range(NULL, g*10, '(]'), int4range(g*10, g*20, '(]')) from generate_series(1,10) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g*10, g*20, '(]'), int4range(g*20, NULL, '[)')) from generate_series(1,10) g;
+
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10),int4range(g+20, g+30),int4range(g+40, g+50)) from generate_series(1,20) g;
+insert into test_multirange_join_2 select '{}'::int4multirange from generate_series(1,5) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10000)) from generate_series(1,10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(NULL, g*10, '(]'), int4range(g*10, g*20, '(]')) from generate_series(1,10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g*10, g*20, '(]'), int4range(g*20, NULL, '[)')) from generate_series(1,10) g;
+
+insert into test_range_join select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join select int4range(g, g+10000) from generate_series(1,10) g;
+insert into test_range_join select int4range(NULL,g*10,'(]') from generate_series(1,10) g;
+insert into test_range_join select int4range(g*10,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join select 'empty'::int4range from generate_series(1,20) g;
+insert into test_range_join select NULL from generate_series(1,5) g;
+
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select g+10000 from generate_series(1,10) g;
+insert into test_elem_join select g*10 from generate_series(1,10) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select NULL from generate_series(1,5) g;
+
+analyze test_multirange_join_1;
+analyze test_multirange_join_2;
+analyze test_range_join;
+analyze test_elem_join;
+
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 = mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 < mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 <= mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 > mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 >= mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 && mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 && ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir && mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 <@ mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 <@ ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir <@ mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 @> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 @> ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir @> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 << mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 << ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir << mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 >> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 >> ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir >> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 &< mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 &< ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir &< mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 &> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 &> ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir &> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 -|- mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 -|- ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir -|- mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_elem_join, test_multirange_join_1 where elem <@ mr1');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_elem_join where mr1 @> elem');
+
+drop function check_estimated_rows;
+
+drop table test_multirange_join_1;
+drop table test_multirange_join_2;
+drop table test_range_join;
+drop table test_elem_join;
diff --git a/src/test/regress/sql/rangetypes.sql b/src/test/regress/sql/rangetypes.sql
index 1a10f67f19..6031cd695a 100644
--- a/src/test/regress/sql/rangetypes.sql
+++ b/src/test/regress/sql/rangetypes.sql
@@ -616,3 +616,105 @@ create function inoutparam_fail(inout i anyelement, out r anyrange)
--should fail
create function table_fail(i anyelement) returns table(i anyelement, r anyrange)
as $$ select $1, '[1,10]' $$ language sql;
+
+-- test range join operators
+create table test_range_join_1(ir1 int4range);
+create table test_range_join_2(ir2 int4range);
+create table test_elem_join(elem int4);
+
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1,200) g;
+insert into test_range_join_1 select int4range(g, g+10000) from generate_series(1,100) g;
+insert into test_range_join_1 select int4range(NULL,g*10,'(]') from generate_series(1,10) g;
+insert into test_range_join_1 select int4range(g*10,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1,200) g;
+insert into test_range_join_1 select 'empty'::int4range from generate_series(1,20) g;
+insert into test_range_join_1 select NULL from generate_series(1,50) g;
+
+insert into test_range_join_2 select int4range(g+10, g+20) from generate_series(1,20) g;
+insert into test_range_join_2 select int4range(g+5000, g+15000) from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(NULL,g*5,'(]') from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(g*5,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join_2 select 'empty'::int4range from generate_series(1,5) g;
+insert into test_range_join_2 select NULL from generate_series(1,5) g;
+
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select g+10000 from generate_series(1,10) g;
+insert into test_elem_join select g*10 from generate_series(1,10) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select NULL from generate_series(1,5) g;
+
+analyze test_range_join_1;
+analyze test_range_join_2;
+analyze test_elem_join;
+
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 = ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 < ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 <= ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 > ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 >= ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 && ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 <@ ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 @> ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 << ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 >> ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 &< ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 &> ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 -|- ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_elem_join, test_range_join_1 where elem <@ ir1');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_elem_join where ir1 @> elem');
+
+drop function check_estimated_rows;
+
+drop table test_range_join_1;
+drop table test_range_join_2;
+drop table test_elem_join;
Hello Mahmoud,
Thanks for the patch and sorry for not taking a look earlier.
On 6/30/22 16:31, Mahmoud Sakr wrote:
Hi,
Given a query:
SELECT * FROM t1, t2 WHERE t1.r << t2.r
where t1.r, t2.r are of range type,
currently PostgreSQL will estimate a constant selectivity for the << predicate,
which is equal to 0.005, not utilizing the statistics that the optimizer
collects for range attributes.We have worked out a theory for inequality join selectivity estimation
(http://arxiv.org/abs/2206.07396), and implemented it for range
types it in this patch.
Interesting. Are there any particular differences compared to how we
estimate for example range clauses on regular columns?
The algorithm in this patch re-uses the currently collected statistics for
range types, which is the bounds histogram. It works fairly accurate for the
operations <<, >>, &&, &<, &>, <=, >= with estimation error of about 0.5%.
Right. I think 0.5% is roughly expected for the default statistics
target, which creates 100 histogram bins, each representing ~1% of the
values. Which on average means ~0.5% error.
The patch also implements selectivity estimation for the
operations @>, <@ (contains and is contained in), but their accuracy is not
stable, since the bounds histograms assume independence between the range
bounds. A point to discuss is whether or not to keep these last two operations.
That's a good question. I think the independence assumption is rather
foolish in this case, so I wonder if we could "stabilize" this by making
some different - less optimistic - assumption. Essentially, we have an
estimates for lower/upper boundaries:
P1 = P(lower(var1) <= lower(var2))
P2 = P(upper(var2) <= upper(var1))
and independence means we take (P1*P2). But maybe we should be very
pessimistic and use e.g. Min(P1,P2)? Or maybe something in between?
Another option is to use the length histogram, right? I mean, we know
what the average length is, and it should be possible to use that to
calculate how "far" ranges in a histogram can overlap.
The patch also includes the selectivity estimation for multirange types,
treating a multirange as a single range which is its bounding box.
OK. But ideally we'd cross-check elements of the two multiranges, no?
The same algorithm in this patch is applicable to inequality joins of scalar
types. We, however, don't implement it for scalars, since more work is needed
to make use of the other statistics available for scalars, such as the MCV.
This is left as a future work.
So if the column(s) contain a couple very common (multi)ranges that make
it into an MCV, we'll ignore those? That's a bit unfortunate, because
those MCV elements are potentially the main contributors to selectivity.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Also, calc_hist_selectivity_contains in multirangetypes_selfuncs.c needs
a proper comment, not just "this is a copy from rangetypes".
However, it seems the two functions are exactly the same. Would the
functions diverge in the future? If not, maybe there should be just a
single shared function?
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Hi Tomas,
Thanks for picking up the patch and for the interesting discussions that
you bring !
Interesting. Are there any particular differences compared to how we
estimate for example range clauses on regular columns?
The theory is the same for scalar types. Yet, the statistics that are currently
collected for scalar types include other synopsis than the histogram, such
as MCV, which should be incorporated in the estimation. The theory for using
the additional statistics is ready in the paper, but we didn't yet implement it.
We thought of sharing the ready part, till the time allows us to implement the
rest, or other developers continue it.
Right. I think 0.5% is roughly expected for the default statistics
target, which creates 100 histogram bins, each representing ~1% of the
values. Which on average means ~0.5% error.
Since this patch deals with join selectivity, we are then crossing 100*100 bins.
The ~0.5% error estimation comes from our experiments, rather than a
mathematical analysis.
independence means we take (P1*P2). But maybe we should be very
pessimistic and use e.g. Min(P1,P2)? Or maybe something in between?Another option is to use the length histogram, right? I mean, we know
what the average length is, and it should be possible to use that to
calculate how "far" ranges in a histogram can overlap.
The independence assumption exists if we use the lower and upper
histograms. It equally exists if we use the lower and length histograms.
In both cases, the link between the two histograms is lost during their
construction.
You discussion brings an interesting trade-off of optimistic v.s. pessimistic
estimations. A typical way to deal with such a trade-off is to average the
two, for example is model validation in machine learning, Do you think we
should implement something like
average( (P1*P2), Min(P1,P2) )?
OK. But ideally we'd cross-check elements of the two multiranges, no?
So if the column(s) contain a couple very common (multi)ranges that make
it into an MCV, we'll ignore those? That's a bit unfortunate, because
those MCV elements are potentially the main contributors to selectivity.
Both ideas would require collecting more detailed statistics, for
example similar
to arrays. In this patch, we restricted ourselves to the existing statistics.
Also, calc_hist_selectivity_contains in multirangetypes_selfuncs.c needs
a proper comment, not just "this is a copy from rangetypes".
Right, the comment should elaborate more that the collected statistics are
currently that same as rangetypes but may potentially deviate.
However, it seems the two functions are exactly the same. Would the
functions diverge in the future? If not, maybe there should be just a
single shared function?
Indeed, it is possible that the two functions will deviate if that statistics
of multirange types will be refined.
--
Best regards
Mahmoud SAKR
On Wed, Jan 18, 2023 at 7:07 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
Show quoted text
Also, calc_hist_selectivity_contains in multirangetypes_selfuncs.c needs
a proper comment, not just "this is a copy from rangetypes".However, it seems the two functions are exactly the same. Would the
functions diverge in the future? If not, maybe there should be just a
single shared function?regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 1/18/23 20:23, Mahmoud Sakr wrote:
Hi Tomas,
Thanks for picking up the patch and for the interesting discussions that
you bring !Interesting. Are there any particular differences compared to how we
estimate for example range clauses on regular columns?The theory is the same for scalar types. Yet, the statistics that are currently
collected for scalar types include other synopsis than the histogram, such
as MCV, which should be incorporated in the estimation. The theory for using
the additional statistics is ready in the paper, but we didn't yet implement it.
We thought of sharing the ready part, till the time allows us to implement the
rest, or other developers continue it.
I see. We don't have MCV stats for range types, so the algorithms don't
include that. But we have that for scalars, so the code would need to be
modified to consider that too.
However, I wonder how much could that improve the estimates for range
queries on scalar types. I mean, we already get pretty good estimates
for those, so I guess we wouldn't get much.
Right. I think 0.5% is roughly expected for the default statistics
target, which creates 100 histogram bins, each representing ~1% of the
values. Which on average means ~0.5% error.Since this patch deals with join selectivity, we are then crossing 100*100 bins.
The ~0.5% error estimation comes from our experiments, rather than a
mathematical analysis.
Ah, understood. Even for joins there's probably a fairly close
relationship between the bin size and estimation error, but it's
certainly more complex.
BTW the experiments are those described in section 6 of the paper,
correct? I wonder how uniform (or skewed) the data was - in terms of
range length, etc. Or how it works for other operators, not just for
"<<" as in the paper.
independence means we take (P1*P2). But maybe we should be very
pessimistic and use e.g. Min(P1,P2)? Or maybe something in between?Another option is to use the length histogram, right? I mean, we know
what the average length is, and it should be possible to use that to
calculate how "far" ranges in a histogram can overlap.The independence assumption exists if we use the lower and upper
histograms. It equally exists if we use the lower and length histograms.
In both cases, the link between the two histograms is lost during their
construction.
You discussion brings an interesting trade-off of optimistic v.s. pessimistic
estimations. A typical way to deal with such a trade-off is to average the
two, for example is model validation in machine learning, Do you think we
should implement something like
average( (P1*P2), Min(P1,P2) )?
I don't know.
AFAICS the independence assumption is used not only because it's very
cheap/simple to implement, but also because it actually is a reasonable
assumption for a fair number of data sets (particularly in OLTP).
You're right it's an optimistic estimate, but for many data sets it's
actually quite reasonable.
I'm not sure that applies to range boundaries - the upper/lower bounds
seem pretty strongly correlated. So maybe using a more pessimistic
formula would be appropriate.
I was thinking the length histogram might allow an alternative,
approach, because it says what fraction of ranges has what length. So
for a "fixed" lower boundary, we may check each of those fractions. Of
course, this assumes consistent range length distribution (so if ranges
at one end are much longer, that won't work).
OK. But ideally we'd cross-check elements of the two multiranges, no?
So if the column(s) contain a couple very common (multi)ranges that make
it into an MCV, we'll ignore those? That's a bit unfortunate, because
those MCV elements are potentially the main contributors to selectivity.Both ideas would require collecting more detailed statistics, for
example similar
to arrays. In this patch, we restricted ourselves to the existing statistics.
Ah, I didn't realize we don't actually build MCV for range types. In
that case the current behavior makes perfect sense.
Also, calc_hist_selectivity_contains in multirangetypes_selfuncs.c needs
a proper comment, not just "this is a copy from rangetypes".Right, the comment should elaborate more that the collected statistics are
currently that same as rangetypes but may potentially deviate.However, it seems the two functions are exactly the same. Would the
functions diverge in the future? If not, maybe there should be just a
single shared function?Indeed, it is possible that the two functions will deviate if that statistics
of multirange types will be refined.
Right, but are there any such plans? Also, what's the likelihood we'll
add new statistics to only one of the places (e.g. for multiranges but
not plain ranges)?
I'd keep a single function until we actually need two. That's also
easier for maintenance - with two it's easy to fix a bug in one place
and forget about the other, etc.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Hi Mahmoud,
I finally had time to properly read the paper today - the general
approach mostly matches how I imagined the estimation would work for
inequalities, but it's definitely nice to see the algorithm properly
formalized and analyzed.
What seems a bit strange to me is that the patch only deals with range
types, leaving the scalar cases unchanged. I understand why (not having
a MCV simplifies it a lot), but I'd bet joins on range types are waaaay
less common than inequality joins on scalar types. I don't even remember
seeing inequality join on a range column, TBH.
That doesn't mean the patch is wrong, of course. But I'd expect users to
be surprised we handle range types better than "old" scalar types (which
range types build on, in some sense).
Did you have any plans to work on improving estimates for the scalar
case too? Or did you do the patch needed for the paper, and have no
plans to continue working on this?
I'm also wondering about not having MCV for ranges. I was a bit
surprised we don't build MCV in compute_range_stats(), and perhaps we
should start building those - if there are common ranges, this might
significantly improve some of the estimates (just like for scalar
columns). Which would mean the estimates for range types are just as
complex as for scalars. Of course, we don't do that now.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Hi Tomas,
I finally had time to properly read the paper today - the general
approach mostly matches how I imagined the estimation would work for
inequalities, but it's definitely nice to see the algorithm properly
formalized and analyzed.
Awesome, thanks for this interest!
What seems a bit strange to me is that the patch only deals with range
types, leaving the scalar cases unchanged. I understand why (not having
a MCV simplifies it a lot), but I'd bet joins on range types are waaaay
less common than inequality joins on scalar types. I don't even remember
seeing inequality join on a range column, TBH.That doesn't mean the patch is wrong, of course. But I'd expect users to
be surprised we handle range types better than "old" scalar types (which
range types build on, in some sense).Did you have any plans to work on improving estimates for the scalar
case too? Or did you do the patch needed for the paper, and have no
plans to continue working on this?
I fully agree. Scalars are way more important.
I join you in the call for Diogo and Zhicheng to continue this implementation,
specially after the interest you show towards this patch. The current patch
was a course project (taught by me and Maxime), which was specific to range
types. But indeed the solution generalizes well to scalars. Hopefully after the
current exam session (Feb), there will be time to continue the implementation.
Nevertheless, it makes sense to do two separate patches: this one, and
the scalars. The code of the two patches is located in different files, and
the estimation algorithms have slight differences.
I'm also wondering about not having MCV for ranges. I was a bit
surprised we don't build MCV in compute_range_stats(), and perhaps we
should start building those - if there are common ranges, this might
significantly improve some of the estimates (just like for scalar
columns). Which would mean the estimates for range types are just as
complex as for scalars. Of course, we don't do that now.
Good question. Our intuition is that MCV will not be useful for ranges.
Maxime has done an experiment and confirmed this intuition. Here is his
experiment and explanation:
Create a table with 126000 int4ranges.
All ranges have their middle between 0 and 1000 and a length between 90
and 110.
The ranges are created in the following way:
- 10 different ranges, each duplicated 1000 times
- 20 different ranges, each duplicated 500 times
- 40 different ranges, each duplicated 100 times
- 200 different ranges, each duplicated 10 times
- 100000 different ranges, not duplicated
Two such tables (t1 and t2) were created in the same way but with different
data. Then he ran the following query:
EXPLAIN ANALYZE SELECT count(*)
FROM t, t2 WHERE t && t2
The results (using our patch) where the following:
Plan rows: 2991415662
Actual rows: 2981335423
So the estimation accuracy is still fairly good for such data with a lot
of repeated values, even without having MCV statistics.
The only error that can happen in our algorithms comes from the last bin in
which we assume uniform distribution of values. Duplicate values
(end bound, not ranges) might make this assumption be wrong, which would
create inaccurate estimations. However, this is still only incorrect
for the last
bin and all the others are correct.
MCV's are mainly useful for equality, which is not an operation we cover, and
probably not an important predicate for ranges. What do you think?
Best regards,
Mahmoud
Hi Tomas,
As a quick update, the paper related to this work has finally been published in Mathematics (https://www.mdpi.com/2227-7390/11/6/1383).
During revision we also added a figure showing a comparison of our algorithm vs the existing algorithms in Oracle, SQL Server, MySQL and PostgreSQL, which can be found in the experiments section of the paper.
As can be seen, our algorithm outperforms even Oracle and SQL Server.
During this revision we also found a small bug, so we are working on a revision of the patch, which fixes this.
Also, calc_hist_selectivity_contains in multirangetypes_selfuncs.c needs
a proper comment, not just "this is a copy from rangetypes".
Right, the comment should elaborate more that the collected statistics are
currently that same as rangetypes but may potentially deviate.
However, it seems the two functions are exactly the same. Would the
functions diverge in the future? If not, maybe there should be just a
single shared function?
Indeed, it is possible that the two functions will deviate if that statistics
of multirange types will be refined.
Right, but are there any such plans? Also, what's the likelihood we'll
add new statistics to only one of the places (e.g. for multiranges but
not plain ranges)?
I'd keep a single function until we actually need two. That's also
easier for maintenance - with two it's easy to fix a bug in one place
and forget about the other, etc.
Regarding our previous discussion about the duplication of calc_hist_join_selectivity in rangetypes_selfuncs.c and multirangetypes_selfuncs.c, we can also remove this duplication in the revision if needed.
Note that currently, there are no external functions shared between rangetypes_selfuncs.c and multirangetypes_selfuncs.c.
Any function that was used in both files was duplicated as a static function.
The functions calc_hist_selectivity_scalar, calc_length_hist_frac, calc_hist_selectivity_contained and calc_hist_selectivity_contains are examples of this, where the function is identical but has been declared static in both files.
That said, we can remove the duplication of calc_hist_join_selectivity if it still needed.
We would, however, require some guidance as to where to put the external definition of this function, as there does not appear to be a rangetypes_selfuncs.h header.
Should it simply go into utils/selfuncs.h or should we create a new header file?
Best regards,
Maxime Schoemans
Hi,
In the selectivity algorithm, the division was applied after adding the remaining histogram buckets of histogram2 that don't overlap with histogram1.
This could lead to reducing selectivity by half, e.g., in the case that histogram2 is completely right of histogram1.
The correct calculation is to divide by two before adding the remainder.
This patch-fix does the needed.
Best regards,
Maxime Schoemans
On 20/03/2023 16:34, maxime wrote:
Hi Tomas,
As a quick update, the paper related to this work has finally been published in Mathematics (https://www.mdpi.com/2227-7390/11/6/1383).
During revision we also added a figure showing a comparison of our algorithm vs the existing algorithms in Oracle, SQL Server, MySQL and PostgreSQL, which can be found in the experiments section of the paper.
As can be seen, our algorithm outperforms even Oracle and SQL Server.
During this revision we also found a small bug, so we are working on a revision of the patch, which fixes this.
Also, calc_hist_selectivity_contains in multirangetypes_selfuncs.c needs
a proper comment, not just "this is a copy from rangetypes".
Right, the comment should elaborate more that the collected statistics are
currently that same as rangetypes but may potentially deviate.
However, it seems the two functions are exactly the same. Would the
functions diverge in the future? If not, maybe there should be just a
single shared function?
Indeed, it is possible that the two functions will deviate if that statistics
of multirange types will be refined.
Right, but are there any such plans? Also, what's the likelihood we'll
add new statistics to only one of the places (e.g. for multiranges but
not plain ranges)?
I'd keep a single function until we actually need two. That's also
easier for maintenance - with two it's easy to fix a bug in one place
and forget about the other, etc.
Regarding our previous discussion about the duplication of calc_hist_join_selectivity in rangetypes_selfuncs.c and multirangetypes_selfuncs.c, we can also remove this duplication in the revision if needed.
Note that currently, there are no external functions shared between rangetypes_selfuncs.c and multirangetypes_selfuncs.c.
Any function that was used in both files was duplicated as a static function.
The functions calc_hist_selectivity_scalar, calc_length_hist_frac, calc_hist_selectivity_contained and calc_hist_selectivity_contains are examples of this, where the function is identical but has been declared static in both files.
That said, we can remove the duplication of calc_hist_join_selectivity if it still needed.
We would, however, require some guidance as to where to put the external definition of this function, as there does not appear to be a rangetypes_selfuncs.h header.
Should it simply go into utils/selfuncs.h or should we create a new header file?
Best regards,
Maxime Schoemans
Attachments:
v1-0002-apply-division-before-adding-remainder.patchtext/x-patch; name=v1-0002-apply-division-before-adding-remainder.patchDownload
From 53291919f536f6e7b04ca87f408e5b95e730ddb0 Mon Sep 17 00:00:00 2001
From: Maxime Schoemans <maxime.schoemans@ulb.ac.be>
Date: Mon, 20 Mar 2023 11:48:05 -0400
Subject: [PATCH] Apply division before adding remainder
---
src/backend/utils/adt/multirangetypes_selfuncs.c | 5 ++++-
src/backend/utils/adt/rangetypes_selfuncs.c | 5 ++++-
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c b/src/backend/utils/adt/multirangetypes_selfuncs.c
index 7ba4aa8b04..ad14b789f4 100644
--- a/src/backend/utils/adt/multirangetypes_selfuncs.c
+++ b/src/backend/utils/adt/multirangetypes_selfuncs.c
@@ -1412,11 +1412,14 @@ calc_hist_join_selectivity(TypeCacheEntry *typcache,
prev_sel2 = cur_sel2;
}
+ /* P(X < Y) = 0.5 * Sum(...) */
+ selectivity /= 2;
+
/* Include remainder of hist2 if any */
if (j < nhist2)
selectivity += 1 - prev_sel2;
- return selectivity / 2;
+ return selectivity;
}
/*
diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c
index 007e14bcf6..129ef9648f 100644
--- a/src/backend/utils/adt/rangetypes_selfuncs.c
+++ b/src/backend/utils/adt/rangetypes_selfuncs.c
@@ -1342,11 +1342,14 @@ calc_hist_join_selectivity(TypeCacheEntry *typcache,
prev_sel2 = cur_sel2;
}
+ /* P(X < Y) = 0.5 * Sum(...) */
+ selectivity /= 2
+
/* Include remainder of hist2 if any */
if (j < nhist2)
selectivity += 1 - prev_sel2;
- return selectivity / 2;
+ return selectivity;
}
/*
--
2.17.1
This is a quick correction as the last patch contained a missing semicolon.
Regards,
Maxime Schoemans
Attachments:
v2-0002-apply-division-before-adding-remainder.patchtext/x-patch; name=v2-0002-apply-division-before-adding-remainder.patchDownload
From ebd62356210eff2f38772a9c46a0a8792c0e9ce3 Mon Sep 17 00:00:00 2001
From: Maxime Schoemans <maxime.schoemans@ulb.ac.be>
Date: Mon, 20 Mar 2023 11:48:05 -0400
Subject: [PATCH v2] Apply division before adding remainder
---
src/backend/utils/adt/multirangetypes_selfuncs.c | 5 ++++-
src/backend/utils/adt/rangetypes_selfuncs.c | 5 ++++-
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c b/src/backend/utils/adt/multirangetypes_selfuncs.c
index 7ba4aa8b04..ad14b789f4 100644
--- a/src/backend/utils/adt/multirangetypes_selfuncs.c
+++ b/src/backend/utils/adt/multirangetypes_selfuncs.c
@@ -1412,11 +1412,14 @@ calc_hist_join_selectivity(TypeCacheEntry *typcache,
prev_sel2 = cur_sel2;
}
+ /* P(X < Y) = 0.5 * Sum(...) */
+ selectivity /= 2;
+
/* Include remainder of hist2 if any */
if (j < nhist2)
selectivity += 1 - prev_sel2;
- return selectivity / 2;
+ return selectivity;
}
/*
diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c
index 007e14bcf6..90970943b3 100644
--- a/src/backend/utils/adt/rangetypes_selfuncs.c
+++ b/src/backend/utils/adt/rangetypes_selfuncs.c
@@ -1342,11 +1342,14 @@ calc_hist_join_selectivity(TypeCacheEntry *typcache,
prev_sel2 = cur_sel2;
}
+ /* P(X < Y) = 0.5 * Sum(...) */
+ selectivity /= 2;
+
/* Include remainder of hist2 if any */
if (j < nhist2)
selectivity += 1 - prev_sel2;
- return selectivity / 2;
+ return selectivity;
}
/*
--
2.17.1
Hello!
Thank you for the patch, very interesting article.
The patch doesn't apply to the current postgres version. Could you please
update it?
Regards,
Damir Belyalov,
Postgres Professional
Hi,
Thank you for picking up this patch.
The patch doesn't apply to the current postgres version. Could you
please update it?
Indeed, the code was initially written on pg15.
You can find attached a new version of the patch that can be applied on
the current master branch of postgres.
Please let us know if there is anything else we can do.
Best regards,
Maxime Schoemans
Attachments:
v2-0001-Join-Selectivity-Estimation-for-Range-types.patchtext/x-patch; name=v2-0001-Join-Selectivity-Estimation-for-Range-types.patchDownload
diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c b/src/backend/utils/adt/multirangetypes_selfuncs.c
index cefc4710fd..c670d225a0 100644
--- a/src/backend/utils/adt/multirangetypes_selfuncs.c
+++ b/src/backend/utils/adt/multirangetypes_selfuncs.c
@@ -1335,3 +1335,558 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache,
return sum_frac;
}
+
+/*
+ * This is a utility function used to estimate the join selectivity of
+ * range attributes using rangebound histogram statistics as described
+ * in this paper:
+ *
+ * Diogo Repas, Zhicheng Luo, Maxime Schoemans and Mahmoud Sakr, 2022
+ * Selectivity Estimation of Inequality Joins In Databases
+ * https://doi.org/10.48550/arXiv.2206.07396
+ *
+ * The attributes being joined will be treated as random variables
+ * that follow a distribution modeled by a Probability Density Function (PDF).
+ * Let the two attributes be denoted X, Y.
+ * This function finds the probability P(X < Y).
+ * Note that the PDFs of the two variables can easily be obtained
+ * from their bounds histogram, respectively hist1 and hist2 .
+ *
+ * Let the PDF of X, Y be denoted as f_X, f_Y.
+ * The probability P(X < Y) can be formalized as follows:
+ * P(X < Y)= integral_-inf^inf( integral_-inf^y ( f_X(x) * f_Y(y) dx dy ) )
+ * = integral_-inf^inf( F_X(y) * f_Y(y) dy )
+ * where F_X(y) denote the Cumulative Distribution Function of X at y.
+ * Note that F_X is the selectivity estimation (non-join),
+ * which is implemented using the function calc_hist_selectivity_scalar.
+ *
+ * Now given the histograms of the two attributes X, Y, we note the following:
+ * - The PDF of Y is a step function
+ * (constant piece-wise, where each piece is defined in a bin of Y's histogram)
+ * - The CDF of X is linear piece-wise
+ * (each piece is defined in a bin of X's histogram)
+ * This leads to the conclusion that their product
+ * (used to calculate the equation above) is also linear piece-wise.
+ * A new piece starts whenever either the bin of X or the bin of Y changes.
+ * By parallel scanning the two rangebound histograms of X and Y,
+ * we evaluate one piece of the result between every two consecutive rangebounds
+ * in the union of the two histograms.
+ *
+ * Given that the product F_X * f_y is linear in the interval
+ * between every two consecutive rangebounds, let them be denoted prev, cur,
+ * it can be shown that the above formula can be discretized into the following:
+ * P(X < Y) =
+ * 0.5 * sum_0^{n+m-1} ( ( F_X(prev) + F_X(cur) ) * ( F_Y(cur) - F_Y(prev) ) )
+ * where n, m are the lengths of the two histograms.
+ *
+ * As such, it is possible to fully compute the join selectivity
+ * as a summation of CDFs, iterating over the bounds of the two histograms.
+ * This maximizes the code reuse, since the CDF is computed using
+ * the calc_hist_selectivity_scalar function, which is the function used
+ * for selectivity estimation (non-joins).
+ *
+ */
+static double
+calc_hist_join_selectivity(TypeCacheEntry *typcache,
+ const RangeBound *hist1, int nhist1,
+ const RangeBound *hist2, int nhist2)
+{
+ int i,
+ j;
+ double selectivity,
+ cur_sel1,
+ cur_sel2,
+ prev_sel1,
+ prev_sel2;
+ RangeBound cur_sync;
+
+ /*
+ * Histograms will never be empty. In fact, a histogram will never have
+ * less than 2 values (1 bin)
+ */
+ Assert(nhist1 > 1);
+ Assert(nhist2 > 1);
+
+ /* Fast-forwards i and j to start of iteration */
+ for (i = 0; range_cmp_bound_values(typcache, &hist1[i], &hist2[0]) < 0; i++);
+ for (j = 0; range_cmp_bound_values(typcache, &hist2[j], &hist1[0]) < 0; j++);
+
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ prev_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ prev_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ /*
+ * Do the estimation on overlapping region
+ */
+ selectivity = 0.0;
+ while (i < nhist1 && j < nhist2)
+ {
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ cur_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ cur_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ selectivity += (prev_sel1 + cur_sel1) * (cur_sel2 - prev_sel2);
+
+ /* Prepare for the next iteration */
+ prev_sel1 = cur_sel1;
+ prev_sel2 = cur_sel2;
+ }
+
+ /* P(X < Y) = 0.5 * Sum(...) */
+ selectivity /= 2;
+
+ /* Include remainder of hist2 if any */
+ if (j < nhist2)
+ selectivity += 1 - prev_sel2;
+
+ return selectivity;
+}
+
+/*
+ * multirangejoinsel -- join cardinality for multirange operators
+ */
+Datum
+multirangejoinsel(PG_FUNCTION_ARGS)
+{
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ Oid operator = PG_GETARG_OID(1);
+ List *args = (List *) PG_GETARG_POINTER(2);
+ SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
+ VariableStatData vardata1,
+ vardata2;
+ AttStatsSlot hist1,
+ hist2,
+ sslot;
+ bool reversed;
+ Selectivity selec;
+ TypeCacheEntry *typcache = NULL,
+ *rng_typcache = NULL;
+ Form_pg_statistic stats1,
+ stats2;
+ double empty_frac1,
+ empty_frac2,
+ null_frac1,
+ null_frac2;
+ int nhist1,
+ nhist2;
+ RangeBound *hist1_lower,
+ *hist1_upper,
+ *hist2_lower,
+ *hist2_upper;
+ bool empty;
+ int i;
+
+ get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &reversed);
+
+ selec = default_multirange_selectivity(operator);
+
+ /* get multirange type cache */
+ if (type_is_multirange(vardata1.vartype))
+ typcache = multirange_get_typcache(fcinfo, vardata1.vartype);
+ else if (type_is_multirange(vardata2.vartype))
+ typcache = multirange_get_typcache(fcinfo, vardata2.vartype);
+
+ if (HeapTupleIsValid(vardata1.statsTuple) &&
+ get_attstatsslot(&hist1, vardata1.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ HeapTupleIsValid(vardata2.statsTuple) &&
+ get_attstatsslot(&hist2, vardata2.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ typcache)
+ {
+
+ /* Initialize underlying range type cache */
+ rng_typcache = typcache->rngtype;
+
+ /*
+ * First look up the fraction of NULLs and empty ranges from
+ * pg_statistic.
+ */
+ stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
+ stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
+
+ null_frac1 = stats1->stanullfrac;
+ null_frac2 = stats2->stanullfrac;
+
+ /* Try to get fraction of empty ranges for the first variable */
+ if (get_attstatsslot(&sslot, vardata1.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac1 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac1 = 0.0;
+ }
+
+ /* Try to get fraction of empty ranges for the second variable */
+ if (get_attstatsslot(&sslot, vardata2.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac2 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac2 = 0.0;
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the first variable.
+ */
+ nhist1 = hist1.nvalues;
+ hist1_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ hist1_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ for (i = 0; i < nhist1; i++)
+ {
+ range_deserialize(rng_typcache, DatumGetRangeTypeP(hist1.values[i]),
+ &hist1_lower[i], &hist1_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the second variable.
+ */
+ nhist2 = hist2.nvalues;
+ hist2_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ hist2_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ for (i = 0; i < nhist2; i++)
+ {
+ range_deserialize(rng_typcache, DatumGetRangeTypeP(hist2.values[i]),
+ &hist2_lower[i], &hist2_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ switch (operator)
+ {
+ case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
+ case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
+
+ /*
+ * Selectivity of A && B = Selectivity of NOT( A << B || A >>
+ * B ) = 1 - Selectivity of (A.upper < B.lower) - Selectivity
+ * of (B.upper < A.lower)
+ */
+ selec = 1;
+ selec -= calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ selec -= calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LESS_EQUAL_OP:
+
+ /*
+ * A <= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_GREATER_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to subtract P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LESS_OP:
+
+ /*
+ * A < B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_EQUAL_OP:
+
+ /*
+ * A >= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_LESS_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_OP:
+
+ /*
+ * A > B == B < A
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_RANGE_OP:
+ case OID_RANGE_LEFT_MULTIRANGE_OP:
+ /* var1 << var2 when upper(var1) < lower(var2) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_RANGE_OP:
+ case OID_RANGE_RIGHT_MULTIRANGE_OP:
+ /* var1 >> var2 when upper(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ /* var1 &< var2 when upper(var1) < upper(var2) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ /* var1 &> var2 when lower(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
+ case OID_MULTIRANGE_RANGE_CONTAINED_OP:
+ case OID_RANGE_MULTIRANGE_CONTAINED_OP:
+
+ /*
+ * var1 <@ var2 is equivalent to lower(var2) <= lower(var1)
+ * and upper(var1) <= upper(var2)
+ *
+ * After negating both sides we get not( lower(var1) <
+ * lower(var2) ) and not( upper(var2) < upper(var1) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ selec *= 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_upper, nhist1);
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_CONTAINS_RANGE_OP:
+ case OID_RANGE_CONTAINS_MULTIRANGE_OP:
+
+ /*
+ * var1 @> var2 is equivalent to lower(var1) <= lower(var2)
+ * and upper(var2) <= upper(var1)
+ *
+ * After negating both sides we get not( lower(var2) <
+ * lower(var1) ) and not( upper(var1) < upper(var2) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ selec *= 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_RANGE_OP:
+ case OID_RANGE_ADJACENT_MULTIRANGE_OP:
+
+ /*
+ * just punt for now, estimation would require equality
+ * selectivity for bounds
+ */
+ case OID_MULTIRANGE_CONTAINS_ELEM_OP:
+ case OID_MULTIRANGE_ELEM_CONTAINED_OP:
+
+ /*
+ * just punt for now, estimation would require extraction of
+ * histograms for the anyelement
+ */
+ default:
+ break;
+ }
+
+
+ /* the calculated selectivity only applies to non-empty (multi)ranges */
+ selec *= (1 - empty_frac1) * (1 - empty_frac2);
+
+ /*
+ * Depending on the operator, empty (multi)ranges might match
+ * different fractions of the result.
+ */
+ switch (operator)
+ {
+ case OID_MULTIRANGE_LESS_OP:
+
+ /*
+ * empty (multi)range < non-empty (multi)range
+ */
+ selec += empty_frac1 * (1 - empty_frac2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_OP:
+
+ /*
+ * non-empty (multi)range > empty (multi)range
+ */
+ selec += (1 - empty_frac1) * empty_frac2;
+ break;
+
+ case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
+ case OID_MULTIRANGE_RANGE_CONTAINED_OP:
+ case OID_RANGE_MULTIRANGE_CONTAINED_OP:
+
+ /*
+ * empty (multi)range <@ any (multi)range
+ */
+ case OID_MULTIRANGE_LESS_EQUAL_OP:
+
+ /*
+ * empty (multi)range <= any (multi)range
+ */
+ selec += empty_frac1;
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_CONTAINS_RANGE_OP:
+ case OID_RANGE_CONTAINS_MULTIRANGE_OP:
+
+ /*
+ * any (multi)range @> empty (multi)range
+ */
+ case OID_MULTIRANGE_GREATER_EQUAL_OP:
+
+ /*
+ * any (multi)range >= empty (multi)range
+ */
+ selec += empty_frac2;
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_ELEM_OP:
+ case OID_MULTIRANGE_ELEM_CONTAINED_OP:
+ case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
+ case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_RANGE_OP:
+ case OID_RANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_RANGE_OP:
+ case OID_RANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_RANGE_OP:
+ case OID_RANGE_ADJACENT_MULTIRANGE_OP:
+ default:
+
+ /*
+ * these operators always return false when an empty
+ * (multi)range is involved
+ */
+ break;
+
+ }
+
+ /* all range operators are strict */
+ selec *= (1 - null_frac1) * (1 - null_frac2);
+
+ free_attstatsslot(&hist1);
+ free_attstatsslot(&hist2);
+ }
+
+ ReleaseVariableStats(vardata1);
+ ReleaseVariableStats(vardata2);
+
+ CLAMP_PROBABILITY(selec);
+
+ PG_RETURN_FLOAT8((float8) selec);
+
+}
diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c
index fbabb3e18c..e2c0a7f0c3 100644
--- a/src/backend/utils/adt/rangetypes_selfuncs.c
+++ b/src/backend/utils/adt/rangetypes_selfuncs.c
@@ -1221,3 +1221,512 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache,
return sum_frac;
}
+
+/*
+ * This is a utility function used to estimate the join selectivity of
+ * range attributes using rangebound histogram statistics as described
+ * in this paper:
+ *
+ * Diogo Repas, Zhicheng Luo, Maxime Schoemans and Mahmoud Sakr, 2022
+ * Selectivity Estimation of Inequality Joins In Databases
+ * https://doi.org/10.48550/arXiv.2206.07396
+ *
+ * The attributes being joined will be treated as random variables
+ * that follow a distribution modeled by a Probability Density Function (PDF).
+ * Let the two attributes be denoted X, Y.
+ * This function finds the probability P(X < Y).
+ * Note that the PDFs of the two variables can easily be obtained
+ * from their bounds histogram, respectively hist1 and hist2 .
+ *
+ * Let the PDF of X, Y be denoted as f_X, f_Y.
+ * The probability P(X < Y) can be formalized as follows:
+ * P(X < Y)= integral_-inf^inf( integral_-inf^y ( f_X(x) * f_Y(y) dx dy ) )
+ * = integral_-inf^inf( F_X(y) * f_Y(y) dy )
+ * where F_X(y) denote the Cumulative Distribution Function of X at y.
+ * Note that F_X is the selectivity estimation (non-join),
+ * which is implemented using the function calc_hist_selectivity_scalar.
+ *
+ * Now given the histograms of the two attributes X, Y, we note the following:
+ * - The PDF of Y is a step function
+ * (constant piece-wise, where each piece is defined in a bin of Y's histogram)
+ * - The CDF of X is linear piece-wise
+ * (each piece is defined in a bin of X's histogram)
+ * This leads to the conclusion that their product
+ * (used to calculate the equation above) is also linear piece-wise.
+ * A new piece starts whenever either the bin of X or the bin of Y changes.
+ * By parallel scanning the two rangebound histograms of X and Y,
+ * we evaluate one piece of the result between every two consecutive rangebounds
+ * in the union of the two histograms.
+ *
+ * Given that the product F_X * f_y is linear in the interval
+ * between every two consecutive rangebounds, let them be denoted prev, cur,
+ * it can be shown that the above formula can be discretized into the following:
+ * P(X < Y) =
+ * 0.5 * sum_0^{n+m-1} ( ( F_X(prev) + F_X(cur) ) * ( F_Y(cur) - F_Y(prev) ) )
+ * where n, m are the lengths of the two histograms.
+ *
+ * As such, it is possible to fully compute the join selectivity
+ * as a summation of CDFs, iterating over the bounds of the two histograms.
+ * This maximizes the code reuse, since the CDF is computed using
+ * the calc_hist_selectivity_scalar function, which is the function used
+ * for selectivity estimation (non-joins).
+ *
+ */
+static double
+calc_hist_join_selectivity(TypeCacheEntry *typcache,
+ const RangeBound *hist1, int nhist1,
+ const RangeBound *hist2, int nhist2)
+{
+ int i,
+ j;
+ double selectivity,
+ cur_sel1,
+ cur_sel2,
+ prev_sel1,
+ prev_sel2;
+ RangeBound cur_sync;
+
+ /*
+ * Histograms will never be empty. In fact, a histogram will never have
+ * less than 2 values (1 bin)
+ */
+ Assert(nhist1 > 1);
+ Assert(nhist2 > 1);
+
+ /* Fast-forwards i and j to start of iteration */
+ for (i = 0; range_cmp_bound_values(typcache, &hist1[i], &hist2[0]) < 0; i++);
+ for (j = 0; range_cmp_bound_values(typcache, &hist2[j], &hist1[0]) < 0; j++);
+
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ prev_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ prev_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ /*
+ * Do the estimation on overlapping region
+ */
+ selectivity = 0.0;
+ while (i < nhist1 && j < nhist2)
+ {
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ cur_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ cur_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ selectivity += (prev_sel1 + cur_sel1) * (cur_sel2 - prev_sel2);
+
+ /* Prepare for the next iteration */
+ prev_sel1 = cur_sel1;
+ prev_sel2 = cur_sel2;
+ }
+
+ /* P(X < Y) = 0.5 * Sum(...) */
+ selectivity /= 2;
+
+ /* Include remainder of hist2 if any */
+ if (j < nhist2)
+ selectivity += 1 - prev_sel2;
+
+ return selectivity;
+}
+
+/*
+ * rangejoinsel -- join cardinality for range operators
+ */
+Datum
+rangejoinsel(PG_FUNCTION_ARGS)
+{
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ Oid operator = PG_GETARG_OID(1);
+ List *args = (List *) PG_GETARG_POINTER(2);
+ SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
+ VariableStatData vardata1,
+ vardata2;
+ AttStatsSlot hist1,
+ hist2,
+ sslot;
+ bool reversed;
+ Selectivity selec;
+ TypeCacheEntry *typcache = NULL;
+ Form_pg_statistic stats1,
+ stats2;
+ double empty_frac1,
+ empty_frac2,
+ null_frac1,
+ null_frac2;
+ int nhist1,
+ nhist2;
+ RangeBound *hist1_lower,
+ *hist1_upper,
+ *hist2_lower,
+ *hist2_upper;
+ bool empty;
+ int i;
+
+ get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &reversed);
+
+ selec = default_range_selectivity(operator);
+
+ if (HeapTupleIsValid(vardata1.statsTuple) &&
+ get_attstatsslot(&hist1, vardata1.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ HeapTupleIsValid(vardata2.statsTuple) &&
+ get_attstatsslot(&hist2, vardata2.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ vardata1.vartype == vardata2.vartype)
+ {
+
+ /* Initialize type cache */
+ typcache = range_get_typcache(fcinfo, vardata1.vartype);
+
+ /*
+ * First look up the fraction of NULLs and empty ranges from
+ * pg_statistic.
+ */
+ stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
+ stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
+
+ null_frac1 = stats1->stanullfrac;
+ null_frac2 = stats2->stanullfrac;
+
+ /* Try to get fraction of empty ranges for the first variable */
+ if (get_attstatsslot(&sslot, vardata1.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac1 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac1 = 0.0;
+ }
+
+ /* Try to get fraction of empty ranges for the second variable */
+ if (get_attstatsslot(&sslot, vardata2.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac2 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac2 = 0.0;
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the first variable.
+ */
+ nhist1 = hist1.nvalues;
+ hist1_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ hist1_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ for (i = 0; i < nhist1; i++)
+ {
+ range_deserialize(typcache, DatumGetRangeTypeP(hist1.values[i]),
+ &hist1_lower[i], &hist1_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the second variable.
+ */
+ nhist2 = hist2.nvalues;
+ hist2_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ hist2_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ for (i = 0; i < nhist2; i++)
+ {
+ range_deserialize(typcache, DatumGetRangeTypeP(hist2.values[i]),
+ &hist2_lower[i], &hist2_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ switch (operator)
+ {
+ case OID_RANGE_OVERLAP_OP:
+
+ /*
+ * Selectivity of A && B = Selectivity of NOT( A << B || A >>
+ * B ) = 1 - Selectivity of (A.upper < B.lower) - Selectivity
+ * of (B.upper < A.lower)
+ */
+ selec = 1;
+ selec -= calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ selec -= calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LESS_EQUAL_OP:
+
+ /*
+ * A <= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_GREATER_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to subtract P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LESS_OP:
+
+ /*
+ * A < B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_GREATER_EQUAL_OP:
+
+ /*
+ * A >= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_LESS_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_GREATER_OP:
+
+ /*
+ * A > B == B < A
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LEFT_OP:
+ /* var1 << var2 when upper(var1) < lower(var2) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_RIGHT_OP:
+ /* var1 >> var2 when upper(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_OVERLAPS_LEFT_OP:
+ /* var1 &< var2 when upper(var1) < upper(var2) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_RANGE_OVERLAPS_RIGHT_OP:
+ /* var1 &> var2 when lower(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_CONTAINED_OP:
+
+ /*
+ * var1 <@ var2 is equivalent to lower(var2) <= lower(var1)
+ * and upper(var1) <= upper(var2)
+ *
+ * After negating both sides we get not( lower(var1) <
+ * lower(var2) ) and not( upper(var2) < upper(var1) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ selec *= 1 - calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_upper, nhist1);
+ break;
+
+ case OID_RANGE_CONTAINS_OP:
+
+ /*
+ * var1 @> var2 is equivalent to lower(var1) <= lower(var2)
+ * and upper(var2) <= upper(var1)
+ *
+ * After negating both sides we get not( lower(var2) <
+ * lower(var1) ) and not( upper(var1) < upper(var2) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ selec *= 1 - calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_RANGE_CONTAINS_ELEM_OP:
+ case OID_RANGE_ELEM_CONTAINED_OP:
+
+ /*
+ * just punt for now, estimation would require extraction of
+ * histograms for the anyelement
+ */
+ default:
+ break;
+ }
+
+
+ /* the calculated selectivity only applies to non-empty ranges */
+ selec *= (1 - empty_frac1) * (1 - empty_frac2);
+
+ /*
+ * Depending on the operator, empty ranges might match different
+ * fractions of the result.
+ */
+ switch (operator)
+ {
+ case OID_RANGE_LESS_OP:
+
+ /*
+ * empty range < non-empty range
+ */
+ selec += empty_frac1 * (1 - empty_frac2);
+ break;
+
+ case OID_RANGE_GREATER_OP:
+
+ /*
+ * non-empty range > empty range
+ */
+ selec += (1 - empty_frac1) * empty_frac2;
+ break;
+
+ case OID_RANGE_CONTAINED_OP:
+
+ /*
+ * empty range <@ any range
+ */
+ case OID_RANGE_LESS_EQUAL_OP:
+
+ /*
+ * empty range <= any range
+ */
+ selec += empty_frac1;
+ break;
+
+ case OID_RANGE_CONTAINS_OP:
+
+ /*
+ * any range @> empty range
+ */
+ case OID_RANGE_GREATER_EQUAL_OP:
+
+ /*
+ * any range >= empty range
+ */
+ selec += empty_frac2;
+ break;
+
+ case OID_RANGE_CONTAINS_ELEM_OP:
+ case OID_RANGE_ELEM_CONTAINED_OP:
+ case OID_RANGE_OVERLAP_OP:
+ case OID_RANGE_OVERLAPS_LEFT_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_OP:
+ case OID_RANGE_LEFT_OP:
+ case OID_RANGE_RIGHT_OP:
+ default:
+
+ /*
+ * these operators always return false when an empty range is
+ * involved
+ */
+ break;
+
+ }
+
+ /* all range operators are strict */
+ selec *= (1 - null_frac1) * (1 - null_frac2);
+
+ free_attstatsslot(&hist1);
+ free_attstatsslot(&hist2);
+ }
+
+ ReleaseVariableStats(vardata1);
+ ReleaseVariableStats(vardata2);
+
+ CLAMP_PROBABILITY(selec);
+
+ PG_RETURN_FLOAT8((float8) selec);
+
+}
diff --git a/src/include/catalog/pg_operator.dat b/src/include/catalog/pg_operator.dat
index b2cdea66c4..d04fc692b5 100644
--- a/src/include/catalog/pg_operator.dat
+++ b/src/include/catalog/pg_operator.dat
@@ -3071,78 +3071,78 @@
oprname => '<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>(anyrange,anyrange)',
oprnegate => '>=(anyrange,anyrange)', oprcode => 'range_lt',
- oprrest => 'rangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3885', oid_symbol => 'OID_RANGE_LESS_EQUAL_OP',
descr => 'less than or equal',
oprname => '<=', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>=(anyrange,anyrange)',
oprnegate => '>(anyrange,anyrange)', oprcode => 'range_le',
- oprrest => 'rangesel', oprjoin => 'scalarlejoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3886', oid_symbol => 'OID_RANGE_GREATER_EQUAL_OP',
descr => 'greater than or equal',
oprname => '>=', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<=(anyrange,anyrange)',
oprnegate => '<(anyrange,anyrange)', oprcode => 'range_ge',
- oprrest => 'rangesel', oprjoin => 'scalargejoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3887', oid_symbol => 'OID_RANGE_GREATER_OP',
descr => 'greater than',
oprname => '>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<(anyrange,anyrange)',
oprnegate => '<=(anyrange,anyrange)', oprcode => 'range_gt',
- oprrest => 'rangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3888', oid_symbol => 'OID_RANGE_OVERLAP_OP', descr => 'overlaps',
oprname => '&&', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '&&(anyrange,anyrange)',
oprcode => 'range_overlaps', oprrest => 'rangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3889', oid_symbol => 'OID_RANGE_CONTAINS_ELEM_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anyelement',
oprresult => 'bool', oprcom => '<@(anyelement,anyrange)',
oprcode => 'range_contains_elem', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3890', oid_symbol => 'OID_RANGE_CONTAINS_OP', descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<@(anyrange,anyrange)',
oprcode => 'range_contains', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3891', oid_symbol => 'OID_RANGE_ELEM_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyelement', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anyelement)',
oprcode => 'elem_contained_by_range', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3892', oid_symbol => 'OID_RANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anyrange)',
oprcode => 'range_contained_by', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3893', oid_symbol => 'OID_RANGE_LEFT_OP', descr => 'is left of',
oprname => '<<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>>(anyrange,anyrange)',
oprcode => 'range_before', oprrest => 'rangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3894', oid_symbol => 'OID_RANGE_RIGHT_OP', descr => 'is right of',
oprname => '>>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<<(anyrange,anyrange)',
oprcode => 'range_after', oprrest => 'rangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3895', oid_symbol => 'OID_RANGE_OVERLAPS_LEFT_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'range_overleft', oprrest => 'rangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3896', oid_symbol => 'OID_RANGE_OVERLAPS_RIGHT_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'range_overright', oprrest => 'rangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3897', descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '-|-(anyrange,anyrange)',
oprcode => 'range_adjacent', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3898', descr => 'range union',
oprname => '+', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'anyrange', oprcom => '+(anyrange,anyrange)',
@@ -3277,139 +3277,139 @@
oprname => '<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>(anymultirange,anymultirange)',
oprnegate => '>=(anymultirange,anymultirange)', oprcode => 'multirange_lt',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2863', oid_symbol => 'OID_MULTIRANGE_LESS_EQUAL_OP',
descr => 'less than or equal',
oprname => '<=', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>=(anymultirange,anymultirange)',
oprnegate => '>(anymultirange,anymultirange)', oprcode => 'multirange_le',
- oprrest => 'multirangesel', oprjoin => 'scalarlejoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2864', oid_symbol => 'OID_MULTIRANGE_GREATER_EQUAL_OP',
descr => 'greater than or equal',
oprname => '>=', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<=(anymultirange,anymultirange)',
oprnegate => '<(anymultirange,anymultirange)', oprcode => 'multirange_ge',
- oprrest => 'multirangesel', oprjoin => 'scalargejoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2865', oid_symbol => 'OID_MULTIRANGE_GREATER_OP',
descr => 'greater than',
oprname => '>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<(anymultirange,anymultirange)',
oprnegate => '<=(anymultirange,anymultirange)', oprcode => 'multirange_gt',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2866', oid_symbol => 'OID_RANGE_OVERLAPS_MULTIRANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '&&(anymultirange,anyrange)',
oprcode => 'range_overlaps_multirange', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2867', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '&&(anyrange,anymultirange)',
oprcode => 'multirange_overlaps_range', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2868', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '&&(anymultirange,anymultirange)',
oprcode => 'multirange_overlaps_multirange', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2869', oid_symbol => 'OID_MULTIRANGE_CONTAINS_ELEM_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anyelement',
oprresult => 'bool', oprcom => '<@(anyelement,anymultirange)',
oprcode => 'multirange_contains_elem', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2870', oid_symbol => 'OID_MULTIRANGE_CONTAINS_RANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<@(anyrange,anymultirange)',
oprcode => 'multirange_contains_range', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2871', oid_symbol => 'OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<@(anymultirange,anymultirange)',
oprcode => 'multirange_contains_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2872', oid_symbol => 'OID_MULTIRANGE_ELEM_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyelement', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anyelement)',
oprcode => 'elem_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2873', oid_symbol => 'OID_MULTIRANGE_RANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anyrange)',
oprcode => 'range_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2874', oid_symbol => 'OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anymultirange)',
oprcode => 'multirange_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4539', oid_symbol => 'OID_RANGE_CONTAINS_MULTIRANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<@(anymultirange,anyrange)',
oprcode => 'range_contains_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4540', oid_symbol => 'OID_RANGE_MULTIRANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anymultirange)',
oprcode => 'multirange_contained_by_range', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2875', oid_symbol => 'OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'range_overleft_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2876', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'multirange_overleft_range',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2877', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'multirange_overleft_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '3585', oid_symbol => 'OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'range_overright_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4035', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'multirange_overright_range',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4142', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'multirange_overright_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4179', oid_symbol => 'OID_RANGE_ADJACENT_MULTIRANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '-|-(anymultirange,anyrange)',
oprcode => 'range_adjacent_multirange', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4180', oid_symbol => 'OID_MULTIRANGE_ADJACENT_RANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '-|-(anyrange,anymultirange)',
oprcode => 'multirange_adjacent_range', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4198', oid_symbol => 'OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '-|-(anymultirange,anymultirange)',
oprcode => 'multirange_adjacent_multirange', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4392', descr => 'multirange union',
oprname => '+', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'anymultirange', oprcom => '+(anymultirange,anymultirange)',
@@ -3426,36 +3426,36 @@
oprname => '<<', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>>(anymultirange,anyrange)',
oprcode => 'range_before_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4396', oid_symbol => 'OID_MULTIRANGE_LEFT_RANGE_OP',
descr => 'is left of',
oprname => '<<', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>>(anyrange,anymultirange)',
oprcode => 'multirange_before_range', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4397', oid_symbol => 'OID_MULTIRANGE_LEFT_MULTIRANGE_OP',
descr => 'is left of',
oprname => '<<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>>(anymultirange,anymultirange)',
oprcode => 'multirange_before_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4398', oid_symbol => 'OID_RANGE_RIGHT_MULTIRANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<<(anymultirange,anyrange)',
oprcode => 'range_after_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4399', oid_symbol => 'OID_MULTIRANGE_RIGHT_RANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<<(anyrange,anymultirange)',
oprcode => 'multirange_after_range', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4400', oid_symbol => 'OID_MULTIRANGE_RIGHT_MULTIRANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<<(anymultirange,anymultirange)',
oprcode => 'multirange_after_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
]
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 6996073989..f670319b3c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12043,4 +12043,13 @@
proname => 'any_value_transfn', prorettype => 'anyelement',
proargtypes => 'anyelement anyelement', prosrc => 'any_value_transfn' },
+{ oid => '8355', descr => 'join selectivity for range operators',
+ proname => 'rangejoinsel', provolatile => 's', prorettype => 'float8',
+ proargtypes => 'internal oid internal int2 internal',
+ prosrc => 'rangejoinsel' },
+{ oid => '8356', descr => 'join selectivity for multirange operators',
+ proname => 'multirangejoinsel', provolatile => 's', prorettype => 'float8',
+ proargtypes => 'internal oid internal int2 internal',
+ prosrc => 'multirangejoinsel' },
+
]
diff --git a/src/test/regress/expected/multirangetypes.out b/src/test/regress/expected/multirangetypes.out
index a0cb875492..9d3df137f9 100644
--- a/src/test/regress/expected/multirangetypes.out
+++ b/src/test/regress/expected/multirangetypes.out
@@ -3361,3 +3361,275 @@ create function mr_table_fail(i anyelement) returns table(i anyelement, r anymul
as $$ select $1, '[1,10]' $$ language sql;
ERROR: cannot determine result data type
DETAIL: A result of type anymultirange requires at least one input of type anyrange or anymultirange.
+-- test multirange join operators
+create table test_multirange_join_1(mr1 int4multirange);
+create table test_multirange_join_2(mr2 int4multirange);
+create table test_range_join(ir int4range);
+create table test_elem_join(elem int4);
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10),int4range(g+20, g+30),int4range(g+40, g+50)) from generate_series(1,200) g;
+insert into test_multirange_join_1 select '{}'::int4multirange from generate_series(1,50) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10000)) from generate_series(1,100) g;
+insert into test_multirange_join_1 select int4multirange(int4range(NULL, g*10, '(]'), int4range(g*10, g*20, '(]')) from generate_series(1,10) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g*10, g*20, '(]'), int4range(g*20, NULL, '[)')) from generate_series(1,10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10),int4range(g+20, g+30),int4range(g+40, g+50)) from generate_series(1,20) g;
+insert into test_multirange_join_2 select '{}'::int4multirange from generate_series(1,5) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10000)) from generate_series(1,10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(NULL, g*10, '(]'), int4range(g*10, g*20, '(]')) from generate_series(1,10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g*10, g*20, '(]'), int4range(g*20, NULL, '[)')) from generate_series(1,10) g;
+insert into test_range_join select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join select int4range(g, g+10000) from generate_series(1,10) g;
+insert into test_range_join select int4range(NULL,g*10,'(]') from generate_series(1,10) g;
+insert into test_range_join select int4range(g*10,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join select 'empty'::int4range from generate_series(1,20) g;
+insert into test_range_join select NULL from generate_series(1,5) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select g+10000 from generate_series(1,10) g;
+insert into test_elem_join select g*10 from generate_series(1,10) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select NULL from generate_series(1,5) g;
+analyze test_multirange_join_1;
+analyze test_multirange_join_2;
+analyze test_range_join;
+analyze test_elem_join;
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 = mr2');
+ estimated | actual
+-----------+--------
+ 55 | 300
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 < mr2');
+ estimated | actual
+-----------+--------
+ 4579 | 4598
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 <= mr2');
+ estimated | actual
+-----------+--------
+ 4829 | 4898
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 > mr2');
+ estimated | actual
+-----------+--------
+ 15521 | 15452
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 >= mr2');
+ estimated | actual
+-----------+--------
+ 15771 | 15752
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 && mr2');
+ estimated | actual
+-----------+--------
+ 11098 | 10932
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 && ir');
+ estimated | actual
+-----------+--------
+ 9611 | 9471
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir && mr2');
+ estimated | actual
+-----------+--------
+ 2924 | 2851
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 <@ mr2');
+ estimated | actual
+-----------+--------
+ 8491 | 7393
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 <@ ir');
+ estimated | actual
+-----------+--------
+ 9754 | 8621
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir <@ mr2');
+ estimated | actual
+-----------+--------
+ 2660 | 1987
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 @> mr2');
+ estimated | actual
+-----------+--------
+ 3105 | 2361
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 @> ir');
+ estimated | actual
+-----------+--------
+ 9418 | 8397
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir @> mr2');
+ estimated | actual
+-----------+--------
+ 1177 | 800
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 << mr2');
+ estimated | actual
+-----------+--------
+ 152 | 181
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 << ir');
+ estimated | actual
+-----------+--------
+ 145 | 170
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir << mr2');
+ estimated | actual
+-----------+--------
+ 478 | 519
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 >> mr2');
+ estimated | actual
+-----------+--------
+ 4750 | 4837
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 >> ir');
+ estimated | actual
+-----------+--------
+ 12644 | 12739
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir >> mr2');
+ estimated | actual
+-----------+--------
+ 98 | 110
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 &< mr2');
+ estimated | actual
+-----------+--------
+ 6337 | 6318
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 &< ir');
+ estimated | actual
+-----------+--------
+ 5581 | 5556
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir &< mr2');
+ estimated | actual
+-----------+--------
+ 1986 | 2627
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 &> mr2');
+ estimated | actual
+-----------+--------
+ 13921 | 13976
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 &> ir');
+ estimated | actual
+-----------+--------
+ 19712 | 19807
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir &> mr2');
+ estimated | actual
+-----------+--------
+ 1819 | 1895
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 -|- mr2');
+ estimated | actual
+-----------+--------
+ 160 | 71
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 -|- ir');
+ estimated | actual
+-----------+--------
+ 224 | 118
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir -|- mr2');
+ estimated | actual
+-----------+--------
+ 35 | 37
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_elem_join, test_multirange_join_1 where elem <@ mr1');
+ estimated | actual
+-----------+--------
+ 120 | 3110
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_elem_join where mr1 @> elem');
+ estimated | actual
+-----------+--------
+ 120 | 3110
+(1 row)
+
+drop function check_estimated_rows;
+drop table test_multirange_join_1;
+drop table test_multirange_join_2;
+drop table test_range_join;
+drop table test_elem_join;
diff --git a/src/test/regress/expected/rangetypes.out b/src/test/regress/expected/rangetypes.out
index ee02ff0163..9a34fe5e18 100644
--- a/src/test/regress/expected/rangetypes.out
+++ b/src/test/regress/expected/rangetypes.out
@@ -1834,3 +1834,157 @@ create function table_fail(i anyelement) returns table(i anyelement, r anyrange)
as $$ select $1, '[1,10]' $$ language sql;
ERROR: cannot determine result data type
DETAIL: A result of type anyrange requires at least one input of type anyrange or anymultirange.
+-- test range join operators
+create table test_range_join_1(ir1 int4range);
+create table test_range_join_2(ir2 int4range);
+create table test_elem_join(elem int4);
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1,200) g;
+insert into test_range_join_1 select int4range(g, g+10000) from generate_series(1,100) g;
+insert into test_range_join_1 select int4range(NULL,g*10,'(]') from generate_series(1,10) g;
+insert into test_range_join_1 select int4range(g*10,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1,200) g;
+insert into test_range_join_1 select 'empty'::int4range from generate_series(1,20) g;
+insert into test_range_join_1 select NULL from generate_series(1,50) g;
+insert into test_range_join_2 select int4range(g+10, g+20) from generate_series(1,20) g;
+insert into test_range_join_2 select int4range(g+5000, g+15000) from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(NULL,g*5,'(]') from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(g*5,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join_2 select 'empty'::int4range from generate_series(1,5) g;
+insert into test_range_join_2 select NULL from generate_series(1,5) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select g+10000 from generate_series(1,10) g;
+insert into test_elem_join select g*10 from generate_series(1,10) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select NULL from generate_series(1,5) g;
+analyze test_range_join_1;
+analyze test_range_join_2;
+analyze test_elem_join;
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 = ir2');
+ estimated | actual
+-----------+--------
+ 75 | 190
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 < ir2');
+ estimated | actual
+-----------+--------
+ 9886 | 9745
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 <= ir2');
+ estimated | actual
+-----------+--------
+ 9986 | 9935
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 > ir2');
+ estimated | actual
+-----------+--------
+ 30514 | 30565
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 >= ir2');
+ estimated | actual
+-----------+--------
+ 30614 | 30755
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 && ir2');
+ estimated | actual
+-----------+--------
+ 9966 | 9720
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 <@ ir2');
+ estimated | actual
+-----------+--------
+ 10975 | 6268
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 @> ir2');
+ estimated | actual
+-----------+--------
+ 8379 | 3973
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 << ir2');
+ estimated | actual
+-----------+--------
+ 5034 | 5050
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 >> ir2');
+ estimated | actual
+-----------+--------
+ 21400 | 21630
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 &< ir2');
+ estimated | actual
+-----------+--------
+ 12039 | 12023
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 &> ir2');
+ estimated | actual
+-----------+--------
+ 27914 | 28105
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 -|- ir2');
+ estimated | actual
+-----------+--------
+ 364 | 233
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_elem_join, test_range_join_1 where elem <@ ir1');
+ estimated | actual
+-----------+--------
+ 192 | 3349
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_elem_join where ir1 @> elem');
+ estimated | actual
+-----------+--------
+ 192 | 3349
+(1 row)
+
+drop function check_estimated_rows;
+drop table test_range_join_1;
+drop table test_range_join_2;
+drop table test_elem_join;
diff --git a/src/test/regress/sql/multirangetypes.sql b/src/test/regress/sql/multirangetypes.sql
index fefb4b4d42..c1c85c63b6 100644
--- a/src/test/regress/sql/multirangetypes.sql
+++ b/src/test/regress/sql/multirangetypes.sql
@@ -861,3 +861,160 @@ create function mr_inoutparam_fail(inout i anyelement, out r anymultirange)
--should fail
create function mr_table_fail(i anyelement) returns table(i anyelement, r anymultirange)
as $$ select $1, '[1,10]' $$ language sql;
+
+-- test multirange join operators
+create table test_multirange_join_1(mr1 int4multirange);
+create table test_multirange_join_2(mr2 int4multirange);
+create table test_range_join(ir int4range);
+create table test_elem_join(elem int4);
+
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10),int4range(g+20, g+30),int4range(g+40, g+50)) from generate_series(1,200) g;
+insert into test_multirange_join_1 select '{}'::int4multirange from generate_series(1,50) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10000)) from generate_series(1,100) g;
+insert into test_multirange_join_1 select int4multirange(int4range(NULL, g*10, '(]'), int4range(g*10, g*20, '(]')) from generate_series(1,10) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g*10, g*20, '(]'), int4range(g*20, NULL, '[)')) from generate_series(1,10) g;
+
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10),int4range(g+20, g+30),int4range(g+40, g+50)) from generate_series(1,20) g;
+insert into test_multirange_join_2 select '{}'::int4multirange from generate_series(1,5) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10000)) from generate_series(1,10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(NULL, g*10, '(]'), int4range(g*10, g*20, '(]')) from generate_series(1,10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g*10, g*20, '(]'), int4range(g*20, NULL, '[)')) from generate_series(1,10) g;
+
+insert into test_range_join select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join select int4range(g, g+10000) from generate_series(1,10) g;
+insert into test_range_join select int4range(NULL,g*10,'(]') from generate_series(1,10) g;
+insert into test_range_join select int4range(g*10,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join select 'empty'::int4range from generate_series(1,20) g;
+insert into test_range_join select NULL from generate_series(1,5) g;
+
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select g+10000 from generate_series(1,10) g;
+insert into test_elem_join select g*10 from generate_series(1,10) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select NULL from generate_series(1,5) g;
+
+analyze test_multirange_join_1;
+analyze test_multirange_join_2;
+analyze test_range_join;
+analyze test_elem_join;
+
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 = mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 < mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 <= mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 > mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 >= mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 && mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 && ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir && mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 <@ mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 <@ ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir <@ mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 @> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 @> ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir @> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 << mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 << ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir << mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 >> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 >> ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir >> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 &< mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 &< ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir &< mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 &> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 &> ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir &> mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_multirange_join_2 where mr1 -|- mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_range_join where mr1 -|- ir');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join, test_multirange_join_2 where ir -|- mr2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_elem_join, test_multirange_join_1 where elem <@ mr1');
+
+SELECT * FROM check_estimated_rows('
+select * from test_multirange_join_1, test_elem_join where mr1 @> elem');
+
+drop function check_estimated_rows;
+
+drop table test_multirange_join_1;
+drop table test_multirange_join_2;
+drop table test_range_join;
+drop table test_elem_join;
diff --git a/src/test/regress/sql/rangetypes.sql b/src/test/regress/sql/rangetypes.sql
index c23be928c3..c02ae7c79e 100644
--- a/src/test/regress/sql/rangetypes.sql
+++ b/src/test/regress/sql/rangetypes.sql
@@ -629,3 +629,105 @@ create function inoutparam_fail(inout i anyelement, out r anyrange)
--should fail
create function table_fail(i anyelement) returns table(i anyelement, r anyrange)
as $$ select $1, '[1,10]' $$ language sql;
+
+-- test range join operators
+create table test_range_join_1(ir1 int4range);
+create table test_range_join_2(ir2 int4range);
+create table test_elem_join(elem int4);
+
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1,200) g;
+insert into test_range_join_1 select int4range(g, g+10000) from generate_series(1,100) g;
+insert into test_range_join_1 select int4range(NULL,g*10,'(]') from generate_series(1,10) g;
+insert into test_range_join_1 select int4range(g*10,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1,200) g;
+insert into test_range_join_1 select 'empty'::int4range from generate_series(1,20) g;
+insert into test_range_join_1 select NULL from generate_series(1,50) g;
+
+insert into test_range_join_2 select int4range(g+10, g+20) from generate_series(1,20) g;
+insert into test_range_join_2 select int4range(g+5000, g+15000) from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(NULL,g*5,'(]') from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(g*5,NULL,'[)') from generate_series(1,10) g;
+insert into test_range_join_2 select int4range(g, g+10) from generate_series(1,20) g;
+insert into test_range_join_2 select 'empty'::int4range from generate_series(1,5) g;
+insert into test_range_join_2 select NULL from generate_series(1,5) g;
+
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select g+10000 from generate_series(1,10) g;
+insert into test_elem_join select g*10 from generate_series(1,10) g;
+insert into test_elem_join select g from generate_series(1,20) g;
+insert into test_elem_join select NULL from generate_series(1,5) g;
+
+analyze test_range_join_1;
+analyze test_range_join_2;
+analyze test_elem_join;
+
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 = ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 < ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 <= ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 > ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 >= ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 && ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 <@ ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 @> ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 << ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 >> ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 &< ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 &> ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_range_join_2 where ir1 -|- ir2');
+
+SELECT * FROM check_estimated_rows('
+select * from test_elem_join, test_range_join_1 where elem <@ ir1');
+
+SELECT * FROM check_estimated_rows('
+select * from test_range_join_1, test_elem_join where ir1 @> elem');
+
+drop function check_estimated_rows;
+
+drop table test_range_join_1;
+drop table test_range_join_2;
+drop table test_elem_join;
Schoemans Maxime <maxime.schoemans@ulb.be> writes:
You can find attached a new version of the patch that can be applied on
the current master branch of postgres.
I took a brief look through this very interesting work. I concur
with Tomas that it feels a little odd that range join selectivity
would become smarter than scalar inequality join selectivity, and
that we really ought to prioritize applying these methods to that
case. Still, that's a poor reason to not take the patch.
I also agree with the upthread criticism that having two identical
functions in different source files will be a maintenance nightmare.
Don't do it. When and if there's a reason for the behavior to
diverge between the range and multirange cases, it'd likely be
better to handle that by passing in a flag to say what to do.
But my real unhappiness with the patch as-submitted is the test cases,
which require rowcount estimates to be reproduced exactly. We know
very well that ANALYZE estimates are not perfectly stable and tend to
vary across platforms. As a quick check I tried the patch within a
32-bit VM, and it passed, which surprised me a bit ... but it would
surprise me a lot if we got these same numbers on every machine in
the buildfarm. We need a more forgiving test method. Usually the
approach is to set up a test case where the improved accuracy of
the estimate changes the planner's choice of plan compared to what
you got before, since that will normally not be too prone to change
from variations of a percent or two in the estimates. Another idea
could be something like
SELECT (estimate/actual BETWEEN 0.9 AND 1.1) AS ok FROM ...
which just gives a true/false output instead of an exact number.
regards, tom lane
On 14/11/2023 20:46, Tom Lane wrote:
I took a brief look through this very interesting work. I concur
with Tomas that it feels a little odd that range join selectivity
would become smarter than scalar inequality join selectivity, and
that we really ought to prioritize applying these methods to that
case. Still, that's a poor reason to not take the patch.
Indeed, we started with ranges as this was the simpler case (no MCV) and
was the topic of a course project.
The idea is to later write a second patch that applies these ideas to
scalar inequality while also handling MCV's correctly.
I also agree with the upthread criticism that having two identical
functions in different source files will be a maintenance nightmare.
Don't do it. When and if there's a reason for the behavior to
diverge between the range and multirange cases, it'd likely be
better to handle that by passing in a flag to say what to do.
The duplication is indeed not ideal. However, there are already 8 other
duplicate functions between the two files.
I would thus suggest to leave the duplication in this patch and create a
second one that removes all duplication from the two files, instead of
just removing the duplication for our new function.
What are your thoughts on this? If we do this, should the function
definitions go in rangetypes.h or should we create a new
rangetypes_selfuncs.h header?
But my real unhappiness with the patch as-submitted is the test cases,
which require rowcount estimates to be reproduced exactly.
We need a more forgiving test method. Usually the
approach is to set up a test case where the improved accuracy of
the estimate changes the planner's choice of plan compared to what
you got before, since that will normally not be too prone to change
from variations of a percent or two in the estimates.
I have changed the test method to produce query plans for a 3-way range
join.
The plans for the different operators differ due to the computed
selectivity estimation, which was not the case before this patch.
Regards,
Maxime Schoemans
Attachments:
v3-0001-Join-Selectivity-Estimation-for-Range-types.patchtext/x-patch; name=v3-0001-Join-Selectivity-Estimation-for-Range-types.patchDownload
diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c b/src/backend/utils/adt/multirangetypes_selfuncs.c
index cefc4710fd..c670d225a0 100644
--- a/src/backend/utils/adt/multirangetypes_selfuncs.c
+++ b/src/backend/utils/adt/multirangetypes_selfuncs.c
@@ -1335,3 +1335,558 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache,
return sum_frac;
}
+
+/*
+ * This is a utility function used to estimate the join selectivity of
+ * range attributes using rangebound histogram statistics as described
+ * in this paper:
+ *
+ * Diogo Repas, Zhicheng Luo, Maxime Schoemans and Mahmoud Sakr, 2022
+ * Selectivity Estimation of Inequality Joins In Databases
+ * https://doi.org/10.48550/arXiv.2206.07396
+ *
+ * The attributes being joined will be treated as random variables
+ * that follow a distribution modeled by a Probability Density Function (PDF).
+ * Let the two attributes be denoted X, Y.
+ * This function finds the probability P(X < Y).
+ * Note that the PDFs of the two variables can easily be obtained
+ * from their bounds histogram, respectively hist1 and hist2 .
+ *
+ * Let the PDF of X, Y be denoted as f_X, f_Y.
+ * The probability P(X < Y) can be formalized as follows:
+ * P(X < Y)= integral_-inf^inf( integral_-inf^y ( f_X(x) * f_Y(y) dx dy ) )
+ * = integral_-inf^inf( F_X(y) * f_Y(y) dy )
+ * where F_X(y) denote the Cumulative Distribution Function of X at y.
+ * Note that F_X is the selectivity estimation (non-join),
+ * which is implemented using the function calc_hist_selectivity_scalar.
+ *
+ * Now given the histograms of the two attributes X, Y, we note the following:
+ * - The PDF of Y is a step function
+ * (constant piece-wise, where each piece is defined in a bin of Y's histogram)
+ * - The CDF of X is linear piece-wise
+ * (each piece is defined in a bin of X's histogram)
+ * This leads to the conclusion that their product
+ * (used to calculate the equation above) is also linear piece-wise.
+ * A new piece starts whenever either the bin of X or the bin of Y changes.
+ * By parallel scanning the two rangebound histograms of X and Y,
+ * we evaluate one piece of the result between every two consecutive rangebounds
+ * in the union of the two histograms.
+ *
+ * Given that the product F_X * f_y is linear in the interval
+ * between every two consecutive rangebounds, let them be denoted prev, cur,
+ * it can be shown that the above formula can be discretized into the following:
+ * P(X < Y) =
+ * 0.5 * sum_0^{n+m-1} ( ( F_X(prev) + F_X(cur) ) * ( F_Y(cur) - F_Y(prev) ) )
+ * where n, m are the lengths of the two histograms.
+ *
+ * As such, it is possible to fully compute the join selectivity
+ * as a summation of CDFs, iterating over the bounds of the two histograms.
+ * This maximizes the code reuse, since the CDF is computed using
+ * the calc_hist_selectivity_scalar function, which is the function used
+ * for selectivity estimation (non-joins).
+ *
+ */
+static double
+calc_hist_join_selectivity(TypeCacheEntry *typcache,
+ const RangeBound *hist1, int nhist1,
+ const RangeBound *hist2, int nhist2)
+{
+ int i,
+ j;
+ double selectivity,
+ cur_sel1,
+ cur_sel2,
+ prev_sel1,
+ prev_sel2;
+ RangeBound cur_sync;
+
+ /*
+ * Histograms will never be empty. In fact, a histogram will never have
+ * less than 2 values (1 bin)
+ */
+ Assert(nhist1 > 1);
+ Assert(nhist2 > 1);
+
+ /* Fast-forwards i and j to start of iteration */
+ for (i = 0; range_cmp_bound_values(typcache, &hist1[i], &hist2[0]) < 0; i++);
+ for (j = 0; range_cmp_bound_values(typcache, &hist2[j], &hist1[0]) < 0; j++);
+
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ prev_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ prev_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ /*
+ * Do the estimation on overlapping region
+ */
+ selectivity = 0.0;
+ while (i < nhist1 && j < nhist2)
+ {
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ cur_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ cur_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ selectivity += (prev_sel1 + cur_sel1) * (cur_sel2 - prev_sel2);
+
+ /* Prepare for the next iteration */
+ prev_sel1 = cur_sel1;
+ prev_sel2 = cur_sel2;
+ }
+
+ /* P(X < Y) = 0.5 * Sum(...) */
+ selectivity /= 2;
+
+ /* Include remainder of hist2 if any */
+ if (j < nhist2)
+ selectivity += 1 - prev_sel2;
+
+ return selectivity;
+}
+
+/*
+ * multirangejoinsel -- join cardinality for multirange operators
+ */
+Datum
+multirangejoinsel(PG_FUNCTION_ARGS)
+{
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ Oid operator = PG_GETARG_OID(1);
+ List *args = (List *) PG_GETARG_POINTER(2);
+ SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
+ VariableStatData vardata1,
+ vardata2;
+ AttStatsSlot hist1,
+ hist2,
+ sslot;
+ bool reversed;
+ Selectivity selec;
+ TypeCacheEntry *typcache = NULL,
+ *rng_typcache = NULL;
+ Form_pg_statistic stats1,
+ stats2;
+ double empty_frac1,
+ empty_frac2,
+ null_frac1,
+ null_frac2;
+ int nhist1,
+ nhist2;
+ RangeBound *hist1_lower,
+ *hist1_upper,
+ *hist2_lower,
+ *hist2_upper;
+ bool empty;
+ int i;
+
+ get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &reversed);
+
+ selec = default_multirange_selectivity(operator);
+
+ /* get multirange type cache */
+ if (type_is_multirange(vardata1.vartype))
+ typcache = multirange_get_typcache(fcinfo, vardata1.vartype);
+ else if (type_is_multirange(vardata2.vartype))
+ typcache = multirange_get_typcache(fcinfo, vardata2.vartype);
+
+ if (HeapTupleIsValid(vardata1.statsTuple) &&
+ get_attstatsslot(&hist1, vardata1.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ HeapTupleIsValid(vardata2.statsTuple) &&
+ get_attstatsslot(&hist2, vardata2.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ typcache)
+ {
+
+ /* Initialize underlying range type cache */
+ rng_typcache = typcache->rngtype;
+
+ /*
+ * First look up the fraction of NULLs and empty ranges from
+ * pg_statistic.
+ */
+ stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
+ stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
+
+ null_frac1 = stats1->stanullfrac;
+ null_frac2 = stats2->stanullfrac;
+
+ /* Try to get fraction of empty ranges for the first variable */
+ if (get_attstatsslot(&sslot, vardata1.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac1 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac1 = 0.0;
+ }
+
+ /* Try to get fraction of empty ranges for the second variable */
+ if (get_attstatsslot(&sslot, vardata2.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac2 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac2 = 0.0;
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the first variable.
+ */
+ nhist1 = hist1.nvalues;
+ hist1_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ hist1_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ for (i = 0; i < nhist1; i++)
+ {
+ range_deserialize(rng_typcache, DatumGetRangeTypeP(hist1.values[i]),
+ &hist1_lower[i], &hist1_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the second variable.
+ */
+ nhist2 = hist2.nvalues;
+ hist2_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ hist2_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ for (i = 0; i < nhist2; i++)
+ {
+ range_deserialize(rng_typcache, DatumGetRangeTypeP(hist2.values[i]),
+ &hist2_lower[i], &hist2_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ switch (operator)
+ {
+ case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
+ case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
+
+ /*
+ * Selectivity of A && B = Selectivity of NOT( A << B || A >>
+ * B ) = 1 - Selectivity of (A.upper < B.lower) - Selectivity
+ * of (B.upper < A.lower)
+ */
+ selec = 1;
+ selec -= calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ selec -= calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LESS_EQUAL_OP:
+
+ /*
+ * A <= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_GREATER_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to subtract P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LESS_OP:
+
+ /*
+ * A < B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_EQUAL_OP:
+
+ /*
+ * A >= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_LESS_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_OP:
+
+ /*
+ * A > B == B < A
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_RANGE_OP:
+ case OID_RANGE_LEFT_MULTIRANGE_OP:
+ /* var1 << var2 when upper(var1) < lower(var2) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_RANGE_OP:
+ case OID_RANGE_RIGHT_MULTIRANGE_OP:
+ /* var1 >> var2 when upper(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ /* var1 &< var2 when upper(var1) < upper(var2) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ /* var1 &> var2 when lower(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
+ case OID_MULTIRANGE_RANGE_CONTAINED_OP:
+ case OID_RANGE_MULTIRANGE_CONTAINED_OP:
+
+ /*
+ * var1 <@ var2 is equivalent to lower(var2) <= lower(var1)
+ * and upper(var1) <= upper(var2)
+ *
+ * After negating both sides we get not( lower(var1) <
+ * lower(var2) ) and not( upper(var2) < upper(var1) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ selec *= 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_upper, nhist1);
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_CONTAINS_RANGE_OP:
+ case OID_RANGE_CONTAINS_MULTIRANGE_OP:
+
+ /*
+ * var1 @> var2 is equivalent to lower(var1) <= lower(var2)
+ * and upper(var2) <= upper(var1)
+ *
+ * After negating both sides we get not( lower(var2) <
+ * lower(var1) ) and not( upper(var1) < upper(var2) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ selec *= 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_RANGE_OP:
+ case OID_RANGE_ADJACENT_MULTIRANGE_OP:
+
+ /*
+ * just punt for now, estimation would require equality
+ * selectivity for bounds
+ */
+ case OID_MULTIRANGE_CONTAINS_ELEM_OP:
+ case OID_MULTIRANGE_ELEM_CONTAINED_OP:
+
+ /*
+ * just punt for now, estimation would require extraction of
+ * histograms for the anyelement
+ */
+ default:
+ break;
+ }
+
+
+ /* the calculated selectivity only applies to non-empty (multi)ranges */
+ selec *= (1 - empty_frac1) * (1 - empty_frac2);
+
+ /*
+ * Depending on the operator, empty (multi)ranges might match
+ * different fractions of the result.
+ */
+ switch (operator)
+ {
+ case OID_MULTIRANGE_LESS_OP:
+
+ /*
+ * empty (multi)range < non-empty (multi)range
+ */
+ selec += empty_frac1 * (1 - empty_frac2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_OP:
+
+ /*
+ * non-empty (multi)range > empty (multi)range
+ */
+ selec += (1 - empty_frac1) * empty_frac2;
+ break;
+
+ case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
+ case OID_MULTIRANGE_RANGE_CONTAINED_OP:
+ case OID_RANGE_MULTIRANGE_CONTAINED_OP:
+
+ /*
+ * empty (multi)range <@ any (multi)range
+ */
+ case OID_MULTIRANGE_LESS_EQUAL_OP:
+
+ /*
+ * empty (multi)range <= any (multi)range
+ */
+ selec += empty_frac1;
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_CONTAINS_RANGE_OP:
+ case OID_RANGE_CONTAINS_MULTIRANGE_OP:
+
+ /*
+ * any (multi)range @> empty (multi)range
+ */
+ case OID_MULTIRANGE_GREATER_EQUAL_OP:
+
+ /*
+ * any (multi)range >= empty (multi)range
+ */
+ selec += empty_frac2;
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_ELEM_OP:
+ case OID_MULTIRANGE_ELEM_CONTAINED_OP:
+ case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
+ case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_RANGE_OP:
+ case OID_RANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_RANGE_OP:
+ case OID_RANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_RANGE_OP:
+ case OID_RANGE_ADJACENT_MULTIRANGE_OP:
+ default:
+
+ /*
+ * these operators always return false when an empty
+ * (multi)range is involved
+ */
+ break;
+
+ }
+
+ /* all range operators are strict */
+ selec *= (1 - null_frac1) * (1 - null_frac2);
+
+ free_attstatsslot(&hist1);
+ free_attstatsslot(&hist2);
+ }
+
+ ReleaseVariableStats(vardata1);
+ ReleaseVariableStats(vardata2);
+
+ CLAMP_PROBABILITY(selec);
+
+ PG_RETURN_FLOAT8((float8) selec);
+
+}
diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c
index fbabb3e18c..e2c0a7f0c3 100644
--- a/src/backend/utils/adt/rangetypes_selfuncs.c
+++ b/src/backend/utils/adt/rangetypes_selfuncs.c
@@ -1221,3 +1221,512 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache,
return sum_frac;
}
+
+/*
+ * This is a utility function used to estimate the join selectivity of
+ * range attributes using rangebound histogram statistics as described
+ * in this paper:
+ *
+ * Diogo Repas, Zhicheng Luo, Maxime Schoemans and Mahmoud Sakr, 2022
+ * Selectivity Estimation of Inequality Joins In Databases
+ * https://doi.org/10.48550/arXiv.2206.07396
+ *
+ * The attributes being joined will be treated as random variables
+ * that follow a distribution modeled by a Probability Density Function (PDF).
+ * Let the two attributes be denoted X, Y.
+ * This function finds the probability P(X < Y).
+ * Note that the PDFs of the two variables can easily be obtained
+ * from their bounds histogram, respectively hist1 and hist2 .
+ *
+ * Let the PDF of X, Y be denoted as f_X, f_Y.
+ * The probability P(X < Y) can be formalized as follows:
+ * P(X < Y)= integral_-inf^inf( integral_-inf^y ( f_X(x) * f_Y(y) dx dy ) )
+ * = integral_-inf^inf( F_X(y) * f_Y(y) dy )
+ * where F_X(y) denote the Cumulative Distribution Function of X at y.
+ * Note that F_X is the selectivity estimation (non-join),
+ * which is implemented using the function calc_hist_selectivity_scalar.
+ *
+ * Now given the histograms of the two attributes X, Y, we note the following:
+ * - The PDF of Y is a step function
+ * (constant piece-wise, where each piece is defined in a bin of Y's histogram)
+ * - The CDF of X is linear piece-wise
+ * (each piece is defined in a bin of X's histogram)
+ * This leads to the conclusion that their product
+ * (used to calculate the equation above) is also linear piece-wise.
+ * A new piece starts whenever either the bin of X or the bin of Y changes.
+ * By parallel scanning the two rangebound histograms of X and Y,
+ * we evaluate one piece of the result between every two consecutive rangebounds
+ * in the union of the two histograms.
+ *
+ * Given that the product F_X * f_y is linear in the interval
+ * between every two consecutive rangebounds, let them be denoted prev, cur,
+ * it can be shown that the above formula can be discretized into the following:
+ * P(X < Y) =
+ * 0.5 * sum_0^{n+m-1} ( ( F_X(prev) + F_X(cur) ) * ( F_Y(cur) - F_Y(prev) ) )
+ * where n, m are the lengths of the two histograms.
+ *
+ * As such, it is possible to fully compute the join selectivity
+ * as a summation of CDFs, iterating over the bounds of the two histograms.
+ * This maximizes the code reuse, since the CDF is computed using
+ * the calc_hist_selectivity_scalar function, which is the function used
+ * for selectivity estimation (non-joins).
+ *
+ */
+static double
+calc_hist_join_selectivity(TypeCacheEntry *typcache,
+ const RangeBound *hist1, int nhist1,
+ const RangeBound *hist2, int nhist2)
+{
+ int i,
+ j;
+ double selectivity,
+ cur_sel1,
+ cur_sel2,
+ prev_sel1,
+ prev_sel2;
+ RangeBound cur_sync;
+
+ /*
+ * Histograms will never be empty. In fact, a histogram will never have
+ * less than 2 values (1 bin)
+ */
+ Assert(nhist1 > 1);
+ Assert(nhist2 > 1);
+
+ /* Fast-forwards i and j to start of iteration */
+ for (i = 0; range_cmp_bound_values(typcache, &hist1[i], &hist2[0]) < 0; i++);
+ for (j = 0; range_cmp_bound_values(typcache, &hist2[j], &hist1[0]) < 0; j++);
+
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ prev_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ prev_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ /*
+ * Do the estimation on overlapping region
+ */
+ selectivity = 0.0;
+ while (i < nhist1 && j < nhist2)
+ {
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ cur_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ cur_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ selectivity += (prev_sel1 + cur_sel1) * (cur_sel2 - prev_sel2);
+
+ /* Prepare for the next iteration */
+ prev_sel1 = cur_sel1;
+ prev_sel2 = cur_sel2;
+ }
+
+ /* P(X < Y) = 0.5 * Sum(...) */
+ selectivity /= 2;
+
+ /* Include remainder of hist2 if any */
+ if (j < nhist2)
+ selectivity += 1 - prev_sel2;
+
+ return selectivity;
+}
+
+/*
+ * rangejoinsel -- join cardinality for range operators
+ */
+Datum
+rangejoinsel(PG_FUNCTION_ARGS)
+{
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ Oid operator = PG_GETARG_OID(1);
+ List *args = (List *) PG_GETARG_POINTER(2);
+ SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
+ VariableStatData vardata1,
+ vardata2;
+ AttStatsSlot hist1,
+ hist2,
+ sslot;
+ bool reversed;
+ Selectivity selec;
+ TypeCacheEntry *typcache = NULL;
+ Form_pg_statistic stats1,
+ stats2;
+ double empty_frac1,
+ empty_frac2,
+ null_frac1,
+ null_frac2;
+ int nhist1,
+ nhist2;
+ RangeBound *hist1_lower,
+ *hist1_upper,
+ *hist2_lower,
+ *hist2_upper;
+ bool empty;
+ int i;
+
+ get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &reversed);
+
+ selec = default_range_selectivity(operator);
+
+ if (HeapTupleIsValid(vardata1.statsTuple) &&
+ get_attstatsslot(&hist1, vardata1.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ HeapTupleIsValid(vardata2.statsTuple) &&
+ get_attstatsslot(&hist2, vardata2.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ vardata1.vartype == vardata2.vartype)
+ {
+
+ /* Initialize type cache */
+ typcache = range_get_typcache(fcinfo, vardata1.vartype);
+
+ /*
+ * First look up the fraction of NULLs and empty ranges from
+ * pg_statistic.
+ */
+ stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
+ stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
+
+ null_frac1 = stats1->stanullfrac;
+ null_frac2 = stats2->stanullfrac;
+
+ /* Try to get fraction of empty ranges for the first variable */
+ if (get_attstatsslot(&sslot, vardata1.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac1 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac1 = 0.0;
+ }
+
+ /* Try to get fraction of empty ranges for the second variable */
+ if (get_attstatsslot(&sslot, vardata2.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac2 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac2 = 0.0;
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the first variable.
+ */
+ nhist1 = hist1.nvalues;
+ hist1_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ hist1_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ for (i = 0; i < nhist1; i++)
+ {
+ range_deserialize(typcache, DatumGetRangeTypeP(hist1.values[i]),
+ &hist1_lower[i], &hist1_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the second variable.
+ */
+ nhist2 = hist2.nvalues;
+ hist2_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ hist2_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ for (i = 0; i < nhist2; i++)
+ {
+ range_deserialize(typcache, DatumGetRangeTypeP(hist2.values[i]),
+ &hist2_lower[i], &hist2_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ switch (operator)
+ {
+ case OID_RANGE_OVERLAP_OP:
+
+ /*
+ * Selectivity of A && B = Selectivity of NOT( A << B || A >>
+ * B ) = 1 - Selectivity of (A.upper < B.lower) - Selectivity
+ * of (B.upper < A.lower)
+ */
+ selec = 1;
+ selec -= calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ selec -= calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LESS_EQUAL_OP:
+
+ /*
+ * A <= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_GREATER_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to subtract P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LESS_OP:
+
+ /*
+ * A < B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_GREATER_EQUAL_OP:
+
+ /*
+ * A >= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_LESS_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_GREATER_OP:
+
+ /*
+ * A > B == B < A
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LEFT_OP:
+ /* var1 << var2 when upper(var1) < lower(var2) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_RIGHT_OP:
+ /* var1 >> var2 when upper(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_OVERLAPS_LEFT_OP:
+ /* var1 &< var2 when upper(var1) < upper(var2) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_RANGE_OVERLAPS_RIGHT_OP:
+ /* var1 &> var2 when lower(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_CONTAINED_OP:
+
+ /*
+ * var1 <@ var2 is equivalent to lower(var2) <= lower(var1)
+ * and upper(var1) <= upper(var2)
+ *
+ * After negating both sides we get not( lower(var1) <
+ * lower(var2) ) and not( upper(var2) < upper(var1) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ selec *= 1 - calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_upper, nhist1);
+ break;
+
+ case OID_RANGE_CONTAINS_OP:
+
+ /*
+ * var1 @> var2 is equivalent to lower(var1) <= lower(var2)
+ * and upper(var2) <= upper(var1)
+ *
+ * After negating both sides we get not( lower(var2) <
+ * lower(var1) ) and not( upper(var1) < upper(var2) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ selec *= 1 - calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_RANGE_CONTAINS_ELEM_OP:
+ case OID_RANGE_ELEM_CONTAINED_OP:
+
+ /*
+ * just punt for now, estimation would require extraction of
+ * histograms for the anyelement
+ */
+ default:
+ break;
+ }
+
+
+ /* the calculated selectivity only applies to non-empty ranges */
+ selec *= (1 - empty_frac1) * (1 - empty_frac2);
+
+ /*
+ * Depending on the operator, empty ranges might match different
+ * fractions of the result.
+ */
+ switch (operator)
+ {
+ case OID_RANGE_LESS_OP:
+
+ /*
+ * empty range < non-empty range
+ */
+ selec += empty_frac1 * (1 - empty_frac2);
+ break;
+
+ case OID_RANGE_GREATER_OP:
+
+ /*
+ * non-empty range > empty range
+ */
+ selec += (1 - empty_frac1) * empty_frac2;
+ break;
+
+ case OID_RANGE_CONTAINED_OP:
+
+ /*
+ * empty range <@ any range
+ */
+ case OID_RANGE_LESS_EQUAL_OP:
+
+ /*
+ * empty range <= any range
+ */
+ selec += empty_frac1;
+ break;
+
+ case OID_RANGE_CONTAINS_OP:
+
+ /*
+ * any range @> empty range
+ */
+ case OID_RANGE_GREATER_EQUAL_OP:
+
+ /*
+ * any range >= empty range
+ */
+ selec += empty_frac2;
+ break;
+
+ case OID_RANGE_CONTAINS_ELEM_OP:
+ case OID_RANGE_ELEM_CONTAINED_OP:
+ case OID_RANGE_OVERLAP_OP:
+ case OID_RANGE_OVERLAPS_LEFT_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_OP:
+ case OID_RANGE_LEFT_OP:
+ case OID_RANGE_RIGHT_OP:
+ default:
+
+ /*
+ * these operators always return false when an empty range is
+ * involved
+ */
+ break;
+
+ }
+
+ /* all range operators are strict */
+ selec *= (1 - null_frac1) * (1 - null_frac2);
+
+ free_attstatsslot(&hist1);
+ free_attstatsslot(&hist2);
+ }
+
+ ReleaseVariableStats(vardata1);
+ ReleaseVariableStats(vardata2);
+
+ CLAMP_PROBABILITY(selec);
+
+ PG_RETURN_FLOAT8((float8) selec);
+
+}
diff --git a/src/include/catalog/pg_operator.dat b/src/include/catalog/pg_operator.dat
index b2cdea66c4..d04fc692b5 100644
--- a/src/include/catalog/pg_operator.dat
+++ b/src/include/catalog/pg_operator.dat
@@ -3071,78 +3071,78 @@
oprname => '<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>(anyrange,anyrange)',
oprnegate => '>=(anyrange,anyrange)', oprcode => 'range_lt',
- oprrest => 'rangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3885', oid_symbol => 'OID_RANGE_LESS_EQUAL_OP',
descr => 'less than or equal',
oprname => '<=', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>=(anyrange,anyrange)',
oprnegate => '>(anyrange,anyrange)', oprcode => 'range_le',
- oprrest => 'rangesel', oprjoin => 'scalarlejoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3886', oid_symbol => 'OID_RANGE_GREATER_EQUAL_OP',
descr => 'greater than or equal',
oprname => '>=', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<=(anyrange,anyrange)',
oprnegate => '<(anyrange,anyrange)', oprcode => 'range_ge',
- oprrest => 'rangesel', oprjoin => 'scalargejoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3887', oid_symbol => 'OID_RANGE_GREATER_OP',
descr => 'greater than',
oprname => '>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<(anyrange,anyrange)',
oprnegate => '<=(anyrange,anyrange)', oprcode => 'range_gt',
- oprrest => 'rangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3888', oid_symbol => 'OID_RANGE_OVERLAP_OP', descr => 'overlaps',
oprname => '&&', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '&&(anyrange,anyrange)',
oprcode => 'range_overlaps', oprrest => 'rangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3889', oid_symbol => 'OID_RANGE_CONTAINS_ELEM_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anyelement',
oprresult => 'bool', oprcom => '<@(anyelement,anyrange)',
oprcode => 'range_contains_elem', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3890', oid_symbol => 'OID_RANGE_CONTAINS_OP', descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<@(anyrange,anyrange)',
oprcode => 'range_contains', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3891', oid_symbol => 'OID_RANGE_ELEM_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyelement', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anyelement)',
oprcode => 'elem_contained_by_range', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3892', oid_symbol => 'OID_RANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anyrange)',
oprcode => 'range_contained_by', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3893', oid_symbol => 'OID_RANGE_LEFT_OP', descr => 'is left of',
oprname => '<<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>>(anyrange,anyrange)',
oprcode => 'range_before', oprrest => 'rangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3894', oid_symbol => 'OID_RANGE_RIGHT_OP', descr => 'is right of',
oprname => '>>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<<(anyrange,anyrange)',
oprcode => 'range_after', oprrest => 'rangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3895', oid_symbol => 'OID_RANGE_OVERLAPS_LEFT_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'range_overleft', oprrest => 'rangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3896', oid_symbol => 'OID_RANGE_OVERLAPS_RIGHT_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'range_overright', oprrest => 'rangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3897', descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '-|-(anyrange,anyrange)',
oprcode => 'range_adjacent', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3898', descr => 'range union',
oprname => '+', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'anyrange', oprcom => '+(anyrange,anyrange)',
@@ -3277,139 +3277,139 @@
oprname => '<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>(anymultirange,anymultirange)',
oprnegate => '>=(anymultirange,anymultirange)', oprcode => 'multirange_lt',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2863', oid_symbol => 'OID_MULTIRANGE_LESS_EQUAL_OP',
descr => 'less than or equal',
oprname => '<=', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>=(anymultirange,anymultirange)',
oprnegate => '>(anymultirange,anymultirange)', oprcode => 'multirange_le',
- oprrest => 'multirangesel', oprjoin => 'scalarlejoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2864', oid_symbol => 'OID_MULTIRANGE_GREATER_EQUAL_OP',
descr => 'greater than or equal',
oprname => '>=', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<=(anymultirange,anymultirange)',
oprnegate => '<(anymultirange,anymultirange)', oprcode => 'multirange_ge',
- oprrest => 'multirangesel', oprjoin => 'scalargejoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2865', oid_symbol => 'OID_MULTIRANGE_GREATER_OP',
descr => 'greater than',
oprname => '>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<(anymultirange,anymultirange)',
oprnegate => '<=(anymultirange,anymultirange)', oprcode => 'multirange_gt',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2866', oid_symbol => 'OID_RANGE_OVERLAPS_MULTIRANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '&&(anymultirange,anyrange)',
oprcode => 'range_overlaps_multirange', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2867', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '&&(anyrange,anymultirange)',
oprcode => 'multirange_overlaps_range', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2868', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '&&(anymultirange,anymultirange)',
oprcode => 'multirange_overlaps_multirange', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2869', oid_symbol => 'OID_MULTIRANGE_CONTAINS_ELEM_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anyelement',
oprresult => 'bool', oprcom => '<@(anyelement,anymultirange)',
oprcode => 'multirange_contains_elem', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2870', oid_symbol => 'OID_MULTIRANGE_CONTAINS_RANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<@(anyrange,anymultirange)',
oprcode => 'multirange_contains_range', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2871', oid_symbol => 'OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<@(anymultirange,anymultirange)',
oprcode => 'multirange_contains_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2872', oid_symbol => 'OID_MULTIRANGE_ELEM_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyelement', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anyelement)',
oprcode => 'elem_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2873', oid_symbol => 'OID_MULTIRANGE_RANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anyrange)',
oprcode => 'range_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2874', oid_symbol => 'OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anymultirange)',
oprcode => 'multirange_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4539', oid_symbol => 'OID_RANGE_CONTAINS_MULTIRANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<@(anymultirange,anyrange)',
oprcode => 'range_contains_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4540', oid_symbol => 'OID_RANGE_MULTIRANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anymultirange)',
oprcode => 'multirange_contained_by_range', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2875', oid_symbol => 'OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'range_overleft_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2876', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'multirange_overleft_range',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2877', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'multirange_overleft_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '3585', oid_symbol => 'OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'range_overright_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4035', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'multirange_overright_range',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4142', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'multirange_overright_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4179', oid_symbol => 'OID_RANGE_ADJACENT_MULTIRANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '-|-(anymultirange,anyrange)',
oprcode => 'range_adjacent_multirange', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4180', oid_symbol => 'OID_MULTIRANGE_ADJACENT_RANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '-|-(anyrange,anymultirange)',
oprcode => 'multirange_adjacent_range', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4198', oid_symbol => 'OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '-|-(anymultirange,anymultirange)',
oprcode => 'multirange_adjacent_multirange', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4392', descr => 'multirange union',
oprname => '+', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'anymultirange', oprcom => '+(anymultirange,anymultirange)',
@@ -3426,36 +3426,36 @@
oprname => '<<', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>>(anymultirange,anyrange)',
oprcode => 'range_before_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4396', oid_symbol => 'OID_MULTIRANGE_LEFT_RANGE_OP',
descr => 'is left of',
oprname => '<<', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>>(anyrange,anymultirange)',
oprcode => 'multirange_before_range', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4397', oid_symbol => 'OID_MULTIRANGE_LEFT_MULTIRANGE_OP',
descr => 'is left of',
oprname => '<<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>>(anymultirange,anymultirange)',
oprcode => 'multirange_before_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4398', oid_symbol => 'OID_RANGE_RIGHT_MULTIRANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<<(anymultirange,anyrange)',
oprcode => 'range_after_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4399', oid_symbol => 'OID_MULTIRANGE_RIGHT_RANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<<(anyrange,anymultirange)',
oprcode => 'multirange_after_range', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4400', oid_symbol => 'OID_MULTIRANGE_RIGHT_MULTIRANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<<(anymultirange,anymultirange)',
oprcode => 'multirange_after_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
]
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index fb58dee3bc..f0097d4ec0 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12100,4 +12100,13 @@
proname => 'any_value_transfn', prorettype => 'anyelement',
proargtypes => 'anyelement anyelement', prosrc => 'any_value_transfn' },
+{ oid => '8355', descr => 'join selectivity for range operators',
+ proname => 'rangejoinsel', provolatile => 's', prorettype => 'float8',
+ proargtypes => 'internal oid internal int2 internal',
+ prosrc => 'rangejoinsel' },
+{ oid => '8356', descr => 'join selectivity for multirange operators',
+ proname => 'multirangejoinsel', provolatile => 's', prorettype => 'float8',
+ proargtypes => 'internal oid internal int2 internal',
+ prosrc => 'multirangejoinsel' },
+
]
diff --git a/src/test/regress/expected/multirangetypes.out b/src/test/regress/expected/multirangetypes.out
index a0cb875492..21d63d9bda 100644
--- a/src/test/regress/expected/multirangetypes.out
+++ b/src/test/regress/expected/multirangetypes.out
@@ -3361,3 +3361,66 @@ create function mr_table_fail(i anyelement) returns table(i anyelement, r anymul
as $$ select $1, '[1,10]' $$ language sql;
ERROR: cannot determine result data type
DETAIL: A result of type anymultirange requires at least one input of type anyrange or anymultirange.
+--
+-- test selectivity of multirange join operators
+--
+create table test_multirange_join_1 (imr1 int4multirange);
+create table test_multirange_join_2 (imr2 int4multirange);
+create table test_multirange_join_3 (imr3 int4multirange);
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10)) from generate_series(1, 1000) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+100)) from generate_series(1, 1000, 10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10)) from generate_series(1, 500) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+100)) from generate_series(1, 500, 10) g;
+insert into test_multirange_join_3 select int4multirange(int4range(g, g+10)) from generate_series(501, 1000) g;
+insert into test_multirange_join_3 select int4multirange(int4range(g, g+100)) from generate_series(501, 1000, 10) g;
+analyze test_multirange_join_1;
+analyze test_multirange_join_2;
+analyze test_multirange_join_3;
+--reorder joins based on computed selectivity
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 && imr2 and imr2 && imr3;
+ QUERY PLAN
+-----------------------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_multirange_join_1.imr1 && test_multirange_join_2.imr2)
+ -> Nested Loop
+ Join Filter: (test_multirange_join_2.imr2 && test_multirange_join_3.imr3)
+ -> Seq Scan on test_multirange_join_2
+ -> Materialize
+ -> Seq Scan on test_multirange_join_3
+ -> Materialize
+ -> Seq Scan on test_multirange_join_1
+(10 rows)
+
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 << imr2 and imr2 << imr3;
+ QUERY PLAN
+-----------------------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_multirange_join_2.imr2 << test_multirange_join_3.imr3)
+ -> Nested Loop
+ Join Filter: (test_multirange_join_1.imr1 << test_multirange_join_2.imr2)
+ -> Seq Scan on test_multirange_join_1
+ -> Materialize
+ -> Seq Scan on test_multirange_join_2
+ -> Materialize
+ -> Seq Scan on test_multirange_join_3
+(10 rows)
+
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 >> imr2 and imr2 >> imr3;
+ QUERY PLAN
+-----------------------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_multirange_join_1.imr1 >> test_multirange_join_2.imr2)
+ -> Nested Loop
+ Join Filter: (test_multirange_join_2.imr2 >> test_multirange_join_3.imr3)
+ -> Seq Scan on test_multirange_join_2
+ -> Materialize
+ -> Seq Scan on test_multirange_join_3
+ -> Seq Scan on test_multirange_join_1
+(9 rows)
+
+drop table test_multirange_join_1;
+drop table test_multirange_join_2;
+drop table test_multirange_join_3;
diff --git a/src/test/regress/expected/rangetypes.out b/src/test/regress/expected/rangetypes.out
index ee02ff0163..357bb3154b 100644
--- a/src/test/regress/expected/rangetypes.out
+++ b/src/test/regress/expected/rangetypes.out
@@ -1834,3 +1834,66 @@ create function table_fail(i anyelement) returns table(i anyelement, r anyrange)
as $$ select $1, '[1,10]' $$ language sql;
ERROR: cannot determine result data type
DETAIL: A result of type anyrange requires at least one input of type anyrange or anymultirange.
+--
+-- test selectivity of range join operators
+--
+create table test_range_join_1 (ir1 int4range);
+create table test_range_join_2 (ir2 int4range);
+create table test_range_join_3 (ir3 int4range);
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1, 1000) g;
+insert into test_range_join_1 select int4range(g, g+100) from generate_series(1, 1000, 10) g;
+insert into test_range_join_2 select int4range(g, g+10) from generate_series(1, 500) g;
+insert into test_range_join_2 select int4range(g, g+100) from generate_series(1, 500, 10) g;
+insert into test_range_join_3 select int4range(g, g+10) from generate_series(501, 1000) g;
+insert into test_range_join_3 select int4range(g, g+100) from generate_series(501, 1000, 10) g;
+analyze test_range_join_1;
+analyze test_range_join_2;
+analyze test_range_join_3;
+--reorder joins based on computed selectivity
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 && ir2 and ir2 && ir3;
+ QUERY PLAN
+-----------------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_range_join_1.ir1 && test_range_join_2.ir2)
+ -> Seq Scan on test_range_join_1
+ -> Materialize
+ -> Nested Loop
+ Join Filter: (test_range_join_2.ir2 && test_range_join_3.ir3)
+ -> Seq Scan on test_range_join_2
+ -> Materialize
+ -> Seq Scan on test_range_join_3
+(10 rows)
+
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 << ir2 and ir2 << ir3;
+ QUERY PLAN
+-----------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_range_join_2.ir2 << test_range_join_3.ir3)
+ -> Nested Loop
+ Join Filter: (test_range_join_1.ir1 << test_range_join_2.ir2)
+ -> Seq Scan on test_range_join_1
+ -> Materialize
+ -> Seq Scan on test_range_join_2
+ -> Materialize
+ -> Seq Scan on test_range_join_3
+(10 rows)
+
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 >> ir2 and ir2 >> ir3;
+ QUERY PLAN
+-----------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_range_join_1.ir1 >> test_range_join_2.ir2)
+ -> Nested Loop
+ Join Filter: (test_range_join_2.ir2 >> test_range_join_3.ir3)
+ -> Seq Scan on test_range_join_2
+ -> Materialize
+ -> Seq Scan on test_range_join_3
+ -> Seq Scan on test_range_join_1
+(9 rows)
+
+drop table test_range_join_1;
+drop table test_range_join_2;
+drop table test_range_join_3;
diff --git a/src/test/regress/sql/multirangetypes.sql b/src/test/regress/sql/multirangetypes.sql
index fefb4b4d42..4c62c31166 100644
--- a/src/test/regress/sql/multirangetypes.sql
+++ b/src/test/regress/sql/multirangetypes.sql
@@ -861,3 +861,30 @@ create function mr_inoutparam_fail(inout i anyelement, out r anymultirange)
--should fail
create function mr_table_fail(i anyelement) returns table(i anyelement, r anymultirange)
as $$ select $1, '[1,10]' $$ language sql;
+
+--
+-- test selectivity of multirange join operators
+--
+create table test_multirange_join_1 (imr1 int4multirange);
+create table test_multirange_join_2 (imr2 int4multirange);
+create table test_multirange_join_3 (imr3 int4multirange);
+
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10)) from generate_series(1, 1000) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+100)) from generate_series(1, 1000, 10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10)) from generate_series(1, 500) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+100)) from generate_series(1, 500, 10) g;
+insert into test_multirange_join_3 select int4multirange(int4range(g, g+10)) from generate_series(501, 1000) g;
+insert into test_multirange_join_3 select int4multirange(int4range(g, g+100)) from generate_series(501, 1000, 10) g;
+
+analyze test_multirange_join_1;
+analyze test_multirange_join_2;
+analyze test_multirange_join_3;
+
+--reorder joins based on computed selectivity
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 && imr2 and imr2 && imr3;
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 << imr2 and imr2 << imr3;
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 >> imr2 and imr2 >> imr3;
+
+drop table test_multirange_join_1;
+drop table test_multirange_join_2;
+drop table test_multirange_join_3;
diff --git a/src/test/regress/sql/rangetypes.sql b/src/test/regress/sql/rangetypes.sql
index c23be928c3..1018a234a5 100644
--- a/src/test/regress/sql/rangetypes.sql
+++ b/src/test/regress/sql/rangetypes.sql
@@ -629,3 +629,30 @@ create function inoutparam_fail(inout i anyelement, out r anyrange)
--should fail
create function table_fail(i anyelement) returns table(i anyelement, r anyrange)
as $$ select $1, '[1,10]' $$ language sql;
+
+--
+-- test selectivity of range join operators
+--
+create table test_range_join_1 (ir1 int4range);
+create table test_range_join_2 (ir2 int4range);
+create table test_range_join_3 (ir3 int4range);
+
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1, 1000) g;
+insert into test_range_join_1 select int4range(g, g+100) from generate_series(1, 1000, 10) g;
+insert into test_range_join_2 select int4range(g, g+10) from generate_series(1, 500) g;
+insert into test_range_join_2 select int4range(g, g+100) from generate_series(1, 500, 10) g;
+insert into test_range_join_3 select int4range(g, g+10) from generate_series(501, 1000) g;
+insert into test_range_join_3 select int4range(g, g+100) from generate_series(501, 1000, 10) g;
+
+analyze test_range_join_1;
+analyze test_range_join_2;
+analyze test_range_join_3;
+
+--reorder joins based on computed selectivity
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 && ir2 and ir2 && ir3;
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 << ir2 and ir2 << ir3;
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 >> ir2 and ir2 >> ir3;
+
+drop table test_range_join_1;
+drop table test_range_join_2;
+drop table test_range_join_3;
Hi!
Thank you for your work on the subject, I think it's a really useful
feature and it allows optimizer to estimate more correctly clauses with
such type of operator.
I rewieved your patch and noticed that some comments are repeated into
multirangejoinsel functions, I suggest combining them.
The proposed changes are in the attached patch.
If this topic is about calculating selectivity, have you thought about
adding cardinality calculation test for queries with this type of operator?
For example, you could form queries similar to those that you use in
src/test/regress/sql/multirangetypes.sql and
src/test/regress/sql/rangetypes.sql.
I added a few in the attached patch.
--
Regards,
Alena Rybakina
Attachments:
changes.diff.txttext/plain; charset=UTF-8; name=changes.diff.txtDownload
diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c b/src/backend/utils/adt/multirangetypes_selfuncs.c
index c670d225a0c..7708768b89f 100644
--- a/src/backend/utils/adt/multirangetypes_selfuncs.c
+++ b/src/backend/utils/adt/multirangetypes_selfuncs.c
@@ -1620,14 +1620,15 @@ multirangejoinsel(PG_FUNCTION_ARGS)
hist1_lower, nhist1);
break;
+ /*
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds for comparison operators
+ */
case OID_MULTIRANGE_LESS_EQUAL_OP:
/*
* A <= B
*
- * Start by comparing lower bounds and if they are equal
- * compare upper bounds
- *
* Negation of OID_RANGE_GREATER_OP.
*
* Overestimate by comparing only the lower bounds. Higher
@@ -1644,9 +1645,6 @@ multirangejoinsel(PG_FUNCTION_ARGS)
/*
* A < B
*
- * Start by comparing lower bounds and if they are equal
- * compare upper bounds
- *
* Underestimate by comparing only the lower bounds. Higher
* accuracy would require us to add P(lower1 = lower2) *
* P(upper1 < upper2)
@@ -1661,9 +1659,6 @@ multirangejoinsel(PG_FUNCTION_ARGS)
/*
* A >= B
*
- * Start by comparing lower bounds and if they are equal
- * compare upper bounds
- *
* Negation of OID_RANGE_LESS_OP.
*
* Overestimate by comparing only the lower bounds. Higher
@@ -1680,9 +1675,6 @@ multirangejoinsel(PG_FUNCTION_ARGS)
/*
* A > B == B < A
*
- * Start by comparing lower bounds and if they are equal
- * compare upper bounds
- *
* Underestimate by comparing only the lower bounds. Higher
* accuracy would require us to add P(lower1 = lower2) *
* P(upper1 > upper2)
@@ -1773,18 +1765,16 @@ multirangejoinsel(PG_FUNCTION_ARGS)
case OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP:
case OID_MULTIRANGE_ADJACENT_RANGE_OP:
case OID_RANGE_ADJACENT_MULTIRANGE_OP:
-
- /*
- * just punt for now, estimation would require equality
- * selectivity for bounds
- */
case OID_MULTIRANGE_CONTAINS_ELEM_OP:
case OID_MULTIRANGE_ELEM_CONTAINED_OP:
- /*
- * just punt for now, estimation would require extraction of
- * histograms for the anyelement
- */
+ /*
+ * just punt for now:
+ * if it is a type of adjucent operation estimation
+ * it will require equality selectivity for bounds;
+ * if it is one of type of contain operation
+ * it will extraction of histograms for the any element.
+ */
default:
break;
}
diff --git a/src/test/regress/expected/multirangetypes.out b/src/test/regress/expected/multirangetypes.out
index 21d63d9bdac..72f15cf48e1 100644
--- a/src/test/regress/expected/multirangetypes.out
+++ b/src/test/regress/expected/multirangetypes.out
@@ -3364,6 +3364,25 @@ DETAIL: A result of type anymultirange requires at least one input of type anyr
--
-- test selectivity of multirange join operators
--
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
create table test_multirange_join_1 (imr1 int4multirange);
create table test_multirange_join_2 (imr2 int4multirange);
create table test_multirange_join_3 (imr3 int4multirange);
@@ -3421,6 +3440,40 @@ explain (costs off) select count(*) from test_multirange_join_1, test_multirange
-> Seq Scan on test_multirange_join_1
(9 rows)
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_multirange_join_1,
+ test_multirange_join_2
+ where imr1 && imr2
+');
+ estimated | actual
+-----------+--------
+ 1 | 1
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_multirange_join_1,
+ test_multirange_join_2
+ where imr1 << imr2
+');
+ estimated | actual
+-----------+--------
+ 1 | 1
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_multirange_join_1,
+ test_multirange_join_2
+ where imr1 >> imr2
+');
+ estimated | actual
+-----------+--------
+ 1 | 1
+(1 row)
+
drop table test_multirange_join_1;
drop table test_multirange_join_2;
drop table test_multirange_join_3;
+drop function check_estimated_rows;
diff --git a/src/test/regress/expected/rangetypes.out b/src/test/regress/expected/rangetypes.out
index 357bb3154b2..3168c12b2dc 100644
--- a/src/test/regress/expected/rangetypes.out
+++ b/src/test/regress/expected/rangetypes.out
@@ -1837,6 +1837,25 @@ DETAIL: A result of type anyrange requires at least one input of type anyrange
--
-- test selectivity of range join operators
--
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
create table test_range_join_1 (ir1 int4range);
create table test_range_join_2 (ir2 int4range);
create table test_range_join_3 (ir3 int4range);
@@ -1894,6 +1913,40 @@ explain (costs off) select count(*) from test_range_join_1, test_range_join_2, t
-> Seq Scan on test_range_join_1
(9 rows)
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_range_join_1,
+ test_range_join_2
+ where ir1 && ir2
+');
+ estimated | actual
+-----------+--------
+ 1 | 1
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_range_join_1,
+ test_range_join_2
+ where ir1 << ir2
+');
+ estimated | actual
+-----------+--------
+ 1 | 1
+(1 row)
+
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_range_join_1,
+ test_range_join_2
+ where ir1 >> ir2
+');
+ estimated | actual
+-----------+--------
+ 1 | 1
+(1 row)
+
drop table test_range_join_1;
drop table test_range_join_2;
drop table test_range_join_3;
+drop function check_estimated_rows;
diff --git a/src/test/regress/sql/multirangetypes.sql b/src/test/regress/sql/multirangetypes.sql
index 4c62c31166a..cd828fc42c1 100644
--- a/src/test/regress/sql/multirangetypes.sql
+++ b/src/test/regress/sql/multirangetypes.sql
@@ -865,6 +865,27 @@ create function mr_table_fail(i anyelement) returns table(i anyelement, r anymul
--
-- test selectivity of multirange join operators
--
+
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
+
create table test_multirange_join_1 (imr1 int4multirange);
create table test_multirange_join_2 (imr2 int4multirange);
create table test_multirange_join_3 (imr3 int4multirange);
@@ -885,6 +906,28 @@ explain (costs off) select count(*) from test_multirange_join_1, test_multirange
explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 << imr2 and imr2 << imr3;
explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 >> imr2 and imr2 >> imr3;
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_multirange_join_1,
+ test_multirange_join_2
+ where imr1 && imr2
+');
+
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_multirange_join_1,
+ test_multirange_join_2
+ where imr1 << imr2
+');
+
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_multirange_join_1,
+ test_multirange_join_2
+ where imr1 >> imr2
+');
+
drop table test_multirange_join_1;
drop table test_multirange_join_2;
drop table test_multirange_join_3;
+drop function check_estimated_rows;
diff --git a/src/test/regress/sql/rangetypes.sql b/src/test/regress/sql/rangetypes.sql
index 1018a234a59..50ab6c4552b 100644
--- a/src/test/regress/sql/rangetypes.sql
+++ b/src/test/regress/sql/rangetypes.sql
@@ -633,6 +633,27 @@ create function table_fail(i anyelement) returns table(i anyelement, r anyrange)
--
-- test selectivity of range join operators
--
+
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+ ln text;
+ tmp text[];
+ first_row bool := true;
+begin
+ for ln in
+ execute format('explain analyze %s', $1)
+ loop
+ if first_row then
+ first_row := false;
+ tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+ return query select tmp[1]::int, tmp[2]::int;
+ end if;
+ end loop;
+end;
+$$;
+
create table test_range_join_1 (ir1 int4range);
create table test_range_join_2 (ir2 int4range);
create table test_range_join_3 (ir3 int4range);
@@ -653,6 +674,28 @@ explain (costs off) select count(*) from test_range_join_1, test_range_join_2, t
explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 << ir2 and ir2 << ir3;
explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 >> ir2 and ir2 >> ir3;
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_range_join_1,
+ test_range_join_2
+ where ir1 && ir2
+');
+
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_range_join_1,
+ test_range_join_2
+ where ir1 << ir2
+');
+
+SELECT * FROM check_estimated_rows('
+ select count(*)
+ from test_range_join_1,
+ test_range_join_2
+ where ir1 >> ir2
+');
+
drop table test_range_join_1;
drop table test_range_join_2;
drop table test_range_join_3;
+drop function check_estimated_rows;
On Tue, 21 Nov 2023 at 01:47, Schoemans Maxime <maxime.schoemans@ulb.be> wrote:
On 14/11/2023 20:46, Tom Lane wrote:
I took a brief look through this very interesting work. I concur
with Tomas that it feels a little odd that range join selectivity
would become smarter than scalar inequality join selectivity, and
that we really ought to prioritize applying these methods to that
case. Still, that's a poor reason to not take the patch.Indeed, we started with ranges as this was the simpler case (no MCV) and
was the topic of a course project.
The idea is to later write a second patch that applies these ideas to
scalar inequality while also handling MCV's correctly.I also agree with the upthread criticism that having two identical
functions in different source files will be a maintenance nightmare.
Don't do it. When and if there's a reason for the behavior to
diverge between the range and multirange cases, it'd likely be
better to handle that by passing in a flag to say what to do.The duplication is indeed not ideal. However, there are already 8 other
duplicate functions between the two files.
I would thus suggest to leave the duplication in this patch and create a
second one that removes all duplication from the two files, instead of
just removing the duplication for our new function.
What are your thoughts on this? If we do this, should the function
definitions go in rangetypes.h or should we create a new
rangetypes_selfuncs.h header?But my real unhappiness with the patch as-submitted is the test cases,
which require rowcount estimates to be reproduced exactly.We need a more forgiving test method. Usually the
approach is to set up a test case where the improved accuracy of
the estimate changes the planner's choice of plan compared to what
you got before, since that will normally not be too prone to change
from variations of a percent or two in the estimates.I have changed the test method to produce query plans for a 3-way range
join.
The plans for the different operators differ due to the computed
selectivity estimation, which was not the case before this patch.
One of the tests was aborted at [1]https://cirrus-ci.com/task/5507789477380096, kindly post an updated patch for the same:
[04:55:42.797] src/tools/ci/cores_backtrace.sh linux /tmp/cores
[04:56:03.640] dumping /tmp/cores/postgres-6-24094.core for
/tmp/cirrus-ci-build/tmp_install/usr/local/pgsql/bin/postgres
[04:57:24.199] Core was generated by `postgres: old_node: postgres
regression [local] EXPLAIN '.
[04:57:24.199] Program terminated with signal SIGABRT, Aborted.
[04:57:24.199] #0 __GI_raise (sig=sig@entry=6) at
../sysdeps/unix/sysv/linux/raise.c:50
[04:57:24.199] Download failed: Invalid argument. Continuing without
source file ./signal/../sysdeps/unix/sysv/linux/raise.c.
[04:57:26.803]
[04:57:26.803] Thread 1 (Thread 0x7f121d9ec380 (LWP 24094)):
[04:57:26.803] #0 __GI_raise (sig=sig@entry=6) at
../sysdeps/unix/sysv/linux/raise.c:50
[04:57:26.803] set = {__val = {4194304, 0, 4636737291354636288,
4636737291354636288, 0, 0, 64, 64, 128, 128, 192, 192, 256, 256, 0,
0}}
[04:57:26.803] pid = <optimized out>
[04:57:26.803] tid = <optimized out>
[04:57:26.803] ret = <optimized out>
[04:57:26.803] #1 0x00007f122003d537 in __GI_abort () at abort.c:79
...
...
[04:57:38.774] #6 0x00007f357ad95788 in __asan::__asan_report_load1
(addr=addr@entry=107477261711120) at
../../../../src/libsanitizer/asan/asan_rtl.cpp:117
[04:57:38.774] bp = 140731433585840
[04:57:38.774] pc = <optimized out>
[04:57:38.774] local_stack = 139867680821632
[04:57:38.774] sp = 140731433585832
[04:57:38.774] #7 0x000055d5b155c37c in range_cmp_bound_values
(typcache=typcache@entry=0x629000030b60, b1=b1@entry=0x61c000017708,
b2=b2@entry=0x61c0000188b8) at rangetypes.c:2090
[04:57:38.774] No locals.
[04:57:38.774] #8 0x000055d5b1567bb2 in calc_hist_join_selectivity
(typcache=typcache@entry=0x629000030b60,
hist1=hist1@entry=0x61c0000188b8, nhist1=nhist1@entry=101,
hist2=hist2@entry=0x61c0000170b8, nhist2=nhist2@entry=101) at
rangetypes_selfuncs.c:1298
[04:57:38.774] i = 0
[04:57:38.774] j = 101
[04:57:38.774] selectivity = <optimized out>
[04:57:38.774] cur_sel1 = <optimized out>
[04:57:38.774] cur_sel2 = <optimized out>
[04:57:38.774] prev_sel1 = <optimized out>
[04:57:38.774] prev_sel2 = <optimized out>
[04:57:38.774] cur_sync = {val = <optimized out>, infinite =
<optimized out>, inclusive = <optimized out>, lower = <optimized out>}
[04:57:38.774] #9 0x000055d5b1569190 in rangejoinsel
(fcinfo=<optimized out>) at rangetypes_selfuncs.c:1495
[1]: https://cirrus-ci.com/task/5507789477380096
Regards,
Vignesh
On 05/01/2024 11:37, vignesh C wrote:
One of the tests was aborted at [1], kindly post an updated patch for
the same:
Thank you for notifying us.
I believe I fixed the issue but it is hard to be certain as the issue
did not arise when running the regression tests locally.
Regards,
Maxime
Attachments:
v4-0001-Join-Selectivity-Estimation-for-Range-types.patchtext/x-patch; name=v4-0001-Join-Selectivity-Estimation-for-Range-types.patchDownload
diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c b/src/backend/utils/adt/multirangetypes_selfuncs.c
index 981c1fd298..6abc43f149 100644
--- a/src/backend/utils/adt/multirangetypes_selfuncs.c
+++ b/src/backend/utils/adt/multirangetypes_selfuncs.c
@@ -1335,3 +1335,542 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache,
return sum_frac;
}
+
+/*
+ * This is a utility function used to estimate the join selectivity of
+ * range attributes using rangebound histogram statistics as described
+ * in this paper:
+ *
+ * Diogo Repas, Zhicheng Luo, Maxime Schoemans and Mahmoud Sakr, 2022
+ * Selectivity Estimation of Inequality Joins In Databases
+ * https://doi.org/10.48550/arXiv.2206.07396
+ *
+ * The attributes being joined will be treated as random variables
+ * that follow a distribution modeled by a Probability Density Function (PDF).
+ * Let the two attributes be denoted X, Y.
+ * This function finds the probability P(X < Y).
+ * Note that the PDFs of the two variables can easily be obtained
+ * from their bounds histogram, respectively hist1 and hist2 .
+ *
+ * Let the PDF of X, Y be denoted as f_X, f_Y.
+ * The probability P(X < Y) can be formalized as follows:
+ * P(X < Y)= integral_-inf^inf( integral_-inf^y ( f_X(x) * f_Y(y) dx dy ) )
+ * = integral_-inf^inf( F_X(y) * f_Y(y) dy )
+ * where F_X(y) denote the Cumulative Distribution Function of X at y.
+ * Note that F_X is the selectivity estimation (non-join),
+ * which is implemented using the function calc_hist_selectivity_scalar.
+ *
+ * Now given the histograms of the two attributes X, Y, we note the following:
+ * - The PDF of Y is a step function
+ * (constant piece-wise, where each piece is defined in a bin of Y's histogram)
+ * - The CDF of X is linear piece-wise
+ * (each piece is defined in a bin of X's histogram)
+ * This leads to the conclusion that their product
+ * (used to calculate the equation above) is also linear piece-wise.
+ * A new piece starts whenever either the bin of X or the bin of Y changes.
+ * By parallel scanning the two rangebound histograms of X and Y,
+ * we evaluate one piece of the result between every two consecutive rangebounds
+ * in the union of the two histograms.
+ *
+ * Given that the product F_X * f_y is linear in the interval
+ * between every two consecutive rangebounds, let them be denoted prev, cur,
+ * it can be shown that the above formula can be discretized into the following:
+ * P(X < Y) =
+ * 0.5 * sum_0^{n+m-1} ( ( F_X(prev) + F_X(cur) ) * ( F_Y(cur) - F_Y(prev) ) )
+ * where n, m are the lengths of the two histograms.
+ *
+ * As such, it is possible to fully compute the join selectivity
+ * as a summation of CDFs, iterating over the bounds of the two histograms.
+ * This maximizes the code reuse, since the CDF is computed using
+ * the calc_hist_selectivity_scalar function, which is the function used
+ * for selectivity estimation (non-joins).
+ *
+ */
+static double
+calc_hist_join_selectivity(TypeCacheEntry *typcache,
+ const RangeBound *hist1, int nhist1,
+ const RangeBound *hist2, int nhist2)
+{
+ int i,
+ j;
+ double selectivity = 0.0, /* initialization */
+ prev_sel1 = -1.0, /* to skip the first iteration */
+ prev_sel2 = 0.0; /* initialization */
+
+ /*
+ * Histograms will never be empty. In fact, a histogram will never have
+ * less than 2 values (1 bin)
+ */
+ Assert(nhist1 > 1);
+ Assert(nhist2 > 1);
+
+ /* Fast-forwards i and j to start of iteration */
+ for (i = 0; range_cmp_bound_values(typcache, &hist1[i], &hist2[0]) < 0; i++);
+ for (j = 0; range_cmp_bound_values(typcache, &hist2[j], &hist1[0]) < 0; j++);
+
+ /* Do the estimation on overlapping regions */
+ while (i < nhist1 && j < nhist2)
+ {
+ double cur_sel1,
+ cur_sel2;
+ RangeBound cur_sync;
+
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ cur_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ cur_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ /* Skip the first iteration */
+ if (prev_sel1 >= 0)
+ selectivity += (prev_sel1 + cur_sel1) * (cur_sel2 - prev_sel2);
+
+ /* Prepare for the next iteration */
+ prev_sel1 = cur_sel1;
+ prev_sel2 = cur_sel2;
+ }
+
+ /* P(X < Y) = 0.5 * Sum(...) */
+ selectivity /= 2;
+
+ /* Include remainder of hist2 if any */
+ if (j < nhist2)
+ selectivity += 1 - prev_sel2;
+
+ return selectivity;
+}
+
+/*
+ * multirangejoinsel -- join cardinality for multirange operators
+ */
+Datum
+multirangejoinsel(PG_FUNCTION_ARGS)
+{
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ Oid operator = PG_GETARG_OID(1);
+ List *args = (List *) PG_GETARG_POINTER(2);
+ SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
+ VariableStatData vardata1,
+ vardata2;
+ AttStatsSlot hist1,
+ hist2,
+ sslot;
+ bool reversed;
+ Selectivity selec;
+ TypeCacheEntry *typcache = NULL,
+ *rng_typcache = NULL;
+ Form_pg_statistic stats1,
+ stats2;
+ double empty_frac1,
+ empty_frac2,
+ null_frac1,
+ null_frac2;
+ int nhist1,
+ nhist2;
+ RangeBound *hist1_lower,
+ *hist1_upper,
+ *hist2_lower,
+ *hist2_upper;
+ bool empty;
+ int i;
+
+ get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &reversed);
+
+ selec = default_multirange_selectivity(operator);
+
+ /* get multirange type cache */
+ if (type_is_multirange(vardata1.vartype))
+ typcache = multirange_get_typcache(fcinfo, vardata1.vartype);
+ else if (type_is_multirange(vardata2.vartype))
+ typcache = multirange_get_typcache(fcinfo, vardata2.vartype);
+
+ if (HeapTupleIsValid(vardata1.statsTuple) &&
+ get_attstatsslot(&hist1, vardata1.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ HeapTupleIsValid(vardata2.statsTuple) &&
+ get_attstatsslot(&hist2, vardata2.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ typcache)
+ {
+
+ /* Initialize underlying range type cache */
+ rng_typcache = typcache->rngtype;
+
+ /*
+ * First look up the fraction of NULLs and empty ranges from
+ * pg_statistic.
+ */
+ stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
+ stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
+
+ null_frac1 = stats1->stanullfrac;
+ null_frac2 = stats2->stanullfrac;
+
+ /* Try to get fraction of empty ranges for the first variable */
+ if (get_attstatsslot(&sslot, vardata1.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac1 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac1 = 0.0;
+ }
+
+ /* Try to get fraction of empty ranges for the second variable */
+ if (get_attstatsslot(&sslot, vardata2.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac2 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac2 = 0.0;
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the first variable.
+ */
+ nhist1 = hist1.nvalues;
+ hist1_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ hist1_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ for (i = 0; i < nhist1; i++)
+ {
+ range_deserialize(rng_typcache, DatumGetRangeTypeP(hist1.values[i]),
+ &hist1_lower[i], &hist1_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the second variable.
+ */
+ nhist2 = hist2.nvalues;
+ hist2_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ hist2_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ for (i = 0; i < nhist2; i++)
+ {
+ range_deserialize(rng_typcache, DatumGetRangeTypeP(hist2.values[i]),
+ &hist2_lower[i], &hist2_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ switch (operator)
+ {
+ case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
+ case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
+
+ /*
+ * Selectivity of A && B = Selectivity of NOT( A << B || A >>
+ * B ) = 1 - Selectivity of (A.upper < B.lower) - Selectivity
+ * of (B.upper < A.lower)
+ */
+ selec = 1;
+ selec -= calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ selec -= calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LESS_EQUAL_OP:
+
+ /*
+ * A <= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_GREATER_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to subtract P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LESS_OP:
+
+ /*
+ * A < B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_EQUAL_OP:
+
+ /*
+ * A >= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_LESS_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_OP:
+
+ /*
+ * A > B == B < A
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_RANGE_OP:
+ case OID_RANGE_LEFT_MULTIRANGE_OP:
+ /* var1 << var2 when upper(var1) < lower(var2) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_RANGE_OP:
+ case OID_RANGE_RIGHT_MULTIRANGE_OP:
+ /* var1 >> var2 when upper(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ /* var1 &< var2 when upper(var1) < upper(var2) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ /* var1 &> var2 when lower(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
+ case OID_MULTIRANGE_RANGE_CONTAINED_OP:
+ case OID_RANGE_MULTIRANGE_CONTAINED_OP:
+
+ /*
+ * var1 <@ var2 is equivalent to lower(var2) <= lower(var1)
+ * and upper(var1) <= upper(var2)
+ *
+ * After negating both sides we get not( lower(var1) <
+ * lower(var2) ) and not( upper(var2) < upper(var1) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ selec *= 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_upper, nhist2,
+ hist1_upper, nhist1);
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_CONTAINS_RANGE_OP:
+ case OID_RANGE_CONTAINS_MULTIRANGE_OP:
+
+ /*
+ * var1 @> var2 is equivalent to lower(var1) <= lower(var2)
+ * and upper(var2) <= upper(var1)
+ *
+ * After negating both sides we get not( lower(var2) <
+ * lower(var1) ) and not( upper(var1) < upper(var2) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(rng_typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ selec *= 1 - calc_hist_join_selectivity(rng_typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_RANGE_OP:
+ case OID_RANGE_ADJACENT_MULTIRANGE_OP:
+
+ /*
+ * just punt for now, estimation would require equality
+ * selectivity for bounds
+ */
+ case OID_MULTIRANGE_CONTAINS_ELEM_OP:
+ case OID_MULTIRANGE_ELEM_CONTAINED_OP:
+
+ /*
+ * just punt for now, estimation would require extraction of
+ * histograms for the anyelement
+ */
+ default:
+ break;
+ }
+
+
+ /* the calculated selectivity only applies to non-empty (multi)ranges */
+ selec *= (1 - empty_frac1) * (1 - empty_frac2);
+
+ /*
+ * Depending on the operator, empty (multi)ranges might match
+ * different fractions of the result.
+ */
+ switch (operator)
+ {
+ case OID_MULTIRANGE_LESS_OP:
+
+ /*
+ * empty (multi)range < non-empty (multi)range
+ */
+ selec += empty_frac1 * (1 - empty_frac2);
+ break;
+
+ case OID_MULTIRANGE_GREATER_OP:
+
+ /*
+ * non-empty (multi)range > empty (multi)range
+ */
+ selec += (1 - empty_frac1) * empty_frac2;
+ break;
+
+ case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
+ case OID_MULTIRANGE_RANGE_CONTAINED_OP:
+ case OID_RANGE_MULTIRANGE_CONTAINED_OP:
+
+ /*
+ * empty (multi)range <@ any (multi)range
+ */
+ case OID_MULTIRANGE_LESS_EQUAL_OP:
+
+ /*
+ * empty (multi)range <= any (multi)range
+ */
+ selec += empty_frac1;
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_CONTAINS_RANGE_OP:
+ case OID_RANGE_CONTAINS_MULTIRANGE_OP:
+
+ /*
+ * any (multi)range @> empty (multi)range
+ */
+ case OID_MULTIRANGE_GREATER_EQUAL_OP:
+
+ /*
+ * any (multi)range >= empty (multi)range
+ */
+ selec += empty_frac2;
+ break;
+
+ case OID_MULTIRANGE_CONTAINS_ELEM_OP:
+ case OID_MULTIRANGE_ELEM_CONTAINED_OP:
+ case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
+ case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_LEFT_RANGE_OP:
+ case OID_RANGE_LEFT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_RIGHT_RANGE_OP:
+ case OID_RANGE_RIGHT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP:
+ case OID_MULTIRANGE_ADJACENT_RANGE_OP:
+ case OID_RANGE_ADJACENT_MULTIRANGE_OP:
+ default:
+
+ /*
+ * these operators always return false when an empty
+ * (multi)range is involved
+ */
+ break;
+
+ }
+
+ /* all range operators are strict */
+ selec *= (1 - null_frac1) * (1 - null_frac2);
+
+ free_attstatsslot(&hist1);
+ free_attstatsslot(&hist2);
+ }
+
+ ReleaseVariableStats(vardata1);
+ ReleaseVariableStats(vardata2);
+
+ CLAMP_PROBABILITY(selec);
+
+ PG_RETURN_FLOAT8((float8) selec);
+
+}
diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c
index c260012bd0..be3479cb0b 100644
--- a/src/backend/utils/adt/rangetypes_selfuncs.c
+++ b/src/backend/utils/adt/rangetypes_selfuncs.c
@@ -1221,3 +1221,496 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache,
return sum_frac;
}
+
+/*
+ * This is a utility function used to estimate the join selectivity of
+ * range attributes using rangebound histogram statistics as described
+ * in this paper:
+ *
+ * Diogo Repas, Zhicheng Luo, Maxime Schoemans and Mahmoud Sakr, 2022
+ * Selectivity Estimation of Inequality Joins In Databases
+ * https://doi.org/10.48550/arXiv.2206.07396
+ *
+ * The attributes being joined will be treated as random variables
+ * that follow a distribution modeled by a Probability Density Function (PDF).
+ * Let the two attributes be denoted X, Y.
+ * This function finds the probability P(X < Y).
+ * Note that the PDFs of the two variables can easily be obtained
+ * from their bounds histogram, respectively hist1 and hist2 .
+ *
+ * Let the PDF of X, Y be denoted as f_X, f_Y.
+ * The probability P(X < Y) can be formalized as follows:
+ * P(X < Y)= integral_-inf^inf( integral_-inf^y ( f_X(x) * f_Y(y) dx dy ) )
+ * = integral_-inf^inf( F_X(y) * f_Y(y) dy )
+ * where F_X(y) denote the Cumulative Distribution Function of X at y.
+ * Note that F_X is the selectivity estimation (non-join),
+ * which is implemented using the function calc_hist_selectivity_scalar.
+ *
+ * Now given the histograms of the two attributes X, Y, we note the following:
+ * - The PDF of Y is a step function
+ * (constant piece-wise, where each piece is defined in a bin of Y's histogram)
+ * - The CDF of X is linear piece-wise
+ * (each piece is defined in a bin of X's histogram)
+ * This leads to the conclusion that their product
+ * (used to calculate the equation above) is also linear piece-wise.
+ * A new piece starts whenever either the bin of X or the bin of Y changes.
+ * By parallel scanning the two rangebound histograms of X and Y,
+ * we evaluate one piece of the result between every two consecutive rangebounds
+ * in the union of the two histograms.
+ *
+ * Given that the product F_X * f_y is linear in the interval
+ * between every two consecutive rangebounds, let them be denoted prev, cur,
+ * it can be shown that the above formula can be discretized into the following:
+ * P(X < Y) =
+ * 0.5 * sum_0^{n+m-1} ( ( F_X(prev) + F_X(cur) ) * ( F_Y(cur) - F_Y(prev) ) )
+ * where n, m are the lengths of the two histograms.
+ *
+ * As such, it is possible to fully compute the join selectivity
+ * as a summation of CDFs, iterating over the bounds of the two histograms.
+ * This maximizes the code reuse, since the CDF is computed using
+ * the calc_hist_selectivity_scalar function, which is the function used
+ * for selectivity estimation (non-joins).
+ *
+ */
+static double
+calc_hist_join_selectivity(TypeCacheEntry *typcache,
+ const RangeBound *hist1, int nhist1,
+ const RangeBound *hist2, int nhist2)
+{
+ int i,
+ j;
+ double selectivity = 0.0, /* initialization */
+ prev_sel1 = -1.0, /* to skip the first iteration */
+ prev_sel2 = 0.0; /* initialization */
+
+ /*
+ * Histograms will never be empty. In fact, a histogram will never have
+ * less than 2 values (1 bin)
+ */
+ Assert(nhist1 > 1);
+ Assert(nhist2 > 1);
+
+ /* Fast-forwards i and j to start of iteration */
+ for (i = 0; range_cmp_bound_values(typcache, &hist1[i], &hist2[0]) < 0; i++);
+ for (j = 0; range_cmp_bound_values(typcache, &hist2[j], &hist1[0]) < 0; j++);
+
+ /* Do the estimation on overlapping regions */
+ while (i < nhist1 && j < nhist2)
+ {
+ double cur_sel1,
+ cur_sel2;
+ RangeBound cur_sync;
+
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+ i++;
+ j++;
+ }
+ cur_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist1, nhist1, false);
+ cur_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync,
+ hist2, nhist2, false);
+
+ /* Skip the first iteration */
+ if (prev_sel1 >= 0)
+ selectivity += (prev_sel1 + cur_sel1) * (cur_sel2 - prev_sel2);
+
+ /* Prepare for the next iteration */
+ prev_sel1 = cur_sel1;
+ prev_sel2 = cur_sel2;
+ }
+
+ /* P(X < Y) = 0.5 * Sum(...) */
+ selectivity /= 2;
+
+ /* Include remainder of hist2 if any */
+ if (j < nhist2)
+ selectivity += 1 - prev_sel2;
+
+ return selectivity;
+}
+
+/*
+ * rangejoinsel -- join cardinality for range operators
+ */
+Datum
+rangejoinsel(PG_FUNCTION_ARGS)
+{
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ Oid operator = PG_GETARG_OID(1);
+ List *args = (List *) PG_GETARG_POINTER(2);
+ SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
+ VariableStatData vardata1,
+ vardata2;
+ AttStatsSlot hist1,
+ hist2,
+ sslot;
+ bool reversed;
+ Selectivity selec;
+ TypeCacheEntry *typcache = NULL;
+ Form_pg_statistic stats1,
+ stats2;
+ double empty_frac1,
+ empty_frac2,
+ null_frac1,
+ null_frac2;
+ int nhist1,
+ nhist2;
+ RangeBound *hist1_lower,
+ *hist1_upper,
+ *hist2_lower,
+ *hist2_upper;
+ bool empty;
+ int i;
+
+ get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &reversed);
+
+ selec = default_range_selectivity(operator);
+
+ if (HeapTupleIsValid(vardata1.statsTuple) &&
+ get_attstatsslot(&hist1, vardata1.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ HeapTupleIsValid(vardata2.statsTuple) &&
+ get_attstatsslot(&hist2, vardata2.statsTuple,
+ STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+ ATTSTATSSLOT_VALUES) &&
+ vardata1.vartype == vardata2.vartype)
+ {
+
+ /* Initialize type cache */
+ typcache = range_get_typcache(fcinfo, vardata1.vartype);
+
+ /*
+ * First look up the fraction of NULLs and empty ranges from
+ * pg_statistic.
+ */
+ stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
+ stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
+
+ null_frac1 = stats1->stanullfrac;
+ null_frac2 = stats2->stanullfrac;
+
+ /* Try to get fraction of empty ranges for the first variable */
+ if (get_attstatsslot(&sslot, vardata1.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac1 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac1 = 0.0;
+ }
+
+ /* Try to get fraction of empty ranges for the second variable */
+ if (get_attstatsslot(&sslot, vardata2.statsTuple,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
+ InvalidOid,
+ ATTSTATSSLOT_NUMBERS))
+ {
+ if (sslot.nnumbers != 1) /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic");
+ empty_frac2 = sslot.numbers[0];
+ free_attstatsslot(&sslot);
+ }
+ else
+ {
+ /* No empty fraction statistic. Assume no empty ranges. */
+ empty_frac2 = 0.0;
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the first variable.
+ */
+ nhist1 = hist1.nvalues;
+ hist1_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ hist1_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist1);
+ for (i = 0; i < nhist1; i++)
+ {
+ range_deserialize(typcache, DatumGetRangeTypeP(hist1.values[i]),
+ &hist1_lower[i], &hist1_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ /*
+ * Convert histograms of ranges into histograms of their lower and
+ * upper bounds for the second variable.
+ */
+ nhist2 = hist2.nvalues;
+ hist2_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ hist2_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist2);
+ for (i = 0; i < nhist2; i++)
+ {
+ range_deserialize(typcache, DatumGetRangeTypeP(hist2.values[i]),
+ &hist2_lower[i], &hist2_upper[i], &empty);
+ /* The histogram should not contain any empty ranges */
+ if (empty)
+ elog(ERROR, "bounds histogram contains an empty range");
+ }
+
+ switch (operator)
+ {
+ case OID_RANGE_OVERLAP_OP:
+
+ /*
+ * Selectivity of A && B = Selectivity of NOT( A << B || A >>
+ * B ) = 1 - Selectivity of (A.upper < B.lower) - Selectivity
+ * of (B.upper < A.lower)
+ */
+ selec = 1;
+ selec -= calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ selec -= calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LESS_EQUAL_OP:
+
+ /*
+ * A <= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_GREATER_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to subtract P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LESS_OP:
+
+ /*
+ * A < B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_GREATER_EQUAL_OP:
+
+ /*
+ * A >= B
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Negation of OID_RANGE_LESS_OP.
+ *
+ * Overestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 < upper2)
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_GREATER_OP:
+
+ /*
+ * A > B == B < A
+ *
+ * Start by comparing lower bounds and if they are equal
+ * compare upper bounds
+ *
+ * Underestimate by comparing only the lower bounds. Higher
+ * accuracy would require us to add P(lower1 = lower2) *
+ * P(upper1 > upper2)
+ */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_LEFT_OP:
+ /* var1 << var2 when upper(var1) < lower(var2) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_lower, nhist2);
+ break;
+
+ case OID_RANGE_RIGHT_OP:
+ /* var1 >> var2 when upper(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_OVERLAPS_LEFT_OP:
+ /* var1 &< var2 when upper(var1) < upper(var2) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_RANGE_OVERLAPS_RIGHT_OP:
+ /* var1 &> var2 when lower(var2) < lower(var1) */
+ selec = calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ break;
+
+ case OID_RANGE_CONTAINED_OP:
+
+ /*
+ * var1 <@ var2 is equivalent to lower(var2) <= lower(var1)
+ * and upper(var1) <= upper(var2)
+ *
+ * After negating both sides we get not( lower(var1) <
+ * lower(var2) ) and not( upper(var2) < upper(var1) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist1_lower, nhist1,
+ hist2_lower, nhist2);
+ selec *= 1 - calc_hist_join_selectivity(typcache,
+ hist2_upper, nhist2,
+ hist1_upper, nhist1);
+ break;
+
+ case OID_RANGE_CONTAINS_OP:
+
+ /*
+ * var1 @> var2 is equivalent to lower(var1) <= lower(var2)
+ * and upper(var2) <= upper(var1)
+ *
+ * After negating both sides we get not( lower(var2) <
+ * lower(var1) ) and not( upper(var1) < upper(var2) ),
+ * respectively. Assuming independence, multiply both
+ * selectivities.
+ */
+ selec = 1 - calc_hist_join_selectivity(typcache,
+ hist2_lower, nhist2,
+ hist1_lower, nhist1);
+ selec *= 1 - calc_hist_join_selectivity(typcache,
+ hist1_upper, nhist1,
+ hist2_upper, nhist2);
+ break;
+
+ case OID_RANGE_CONTAINS_ELEM_OP:
+ case OID_RANGE_ELEM_CONTAINED_OP:
+
+ /*
+ * just punt for now, estimation would require extraction of
+ * histograms for the anyelement
+ */
+ default:
+ break;
+ }
+
+
+ /* the calculated selectivity only applies to non-empty ranges */
+ selec *= (1 - empty_frac1) * (1 - empty_frac2);
+
+ /*
+ * Depending on the operator, empty ranges might match different
+ * fractions of the result.
+ */
+ switch (operator)
+ {
+ case OID_RANGE_LESS_OP:
+
+ /*
+ * empty range < non-empty range
+ */
+ selec += empty_frac1 * (1 - empty_frac2);
+ break;
+
+ case OID_RANGE_GREATER_OP:
+
+ /*
+ * non-empty range > empty range
+ */
+ selec += (1 - empty_frac1) * empty_frac2;
+ break;
+
+ case OID_RANGE_CONTAINED_OP:
+
+ /*
+ * empty range <@ any range
+ */
+ case OID_RANGE_LESS_EQUAL_OP:
+
+ /*
+ * empty range <= any range
+ */
+ selec += empty_frac1;
+ break;
+
+ case OID_RANGE_CONTAINS_OP:
+
+ /*
+ * any range @> empty range
+ */
+ case OID_RANGE_GREATER_EQUAL_OP:
+
+ /*
+ * any range >= empty range
+ */
+ selec += empty_frac2;
+ break;
+
+ case OID_RANGE_CONTAINS_ELEM_OP:
+ case OID_RANGE_ELEM_CONTAINED_OP:
+ case OID_RANGE_OVERLAP_OP:
+ case OID_RANGE_OVERLAPS_LEFT_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_OP:
+ case OID_RANGE_LEFT_OP:
+ case OID_RANGE_RIGHT_OP:
+ default:
+
+ /*
+ * these operators always return false when an empty range is
+ * involved
+ */
+ break;
+
+ }
+
+ /* all range operators are strict */
+ selec *= (1 - null_frac1) * (1 - null_frac2);
+
+ free_attstatsslot(&hist1);
+ free_attstatsslot(&hist2);
+ }
+
+ ReleaseVariableStats(vardata1);
+ ReleaseVariableStats(vardata2);
+
+ CLAMP_PROBABILITY(selec);
+
+ PG_RETURN_FLOAT8((float8) selec);
+
+}
diff --git a/src/include/catalog/pg_operator.dat b/src/include/catalog/pg_operator.dat
index 0e7511dde1..cbee3c2293 100644
--- a/src/include/catalog/pg_operator.dat
+++ b/src/include/catalog/pg_operator.dat
@@ -3071,78 +3071,78 @@
oprname => '<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>(anyrange,anyrange)',
oprnegate => '>=(anyrange,anyrange)', oprcode => 'range_lt',
- oprrest => 'rangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3885', oid_symbol => 'OID_RANGE_LESS_EQUAL_OP',
descr => 'less than or equal',
oprname => '<=', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>=(anyrange,anyrange)',
oprnegate => '>(anyrange,anyrange)', oprcode => 'range_le',
- oprrest => 'rangesel', oprjoin => 'scalarlejoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3886', oid_symbol => 'OID_RANGE_GREATER_EQUAL_OP',
descr => 'greater than or equal',
oprname => '>=', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<=(anyrange,anyrange)',
oprnegate => '<(anyrange,anyrange)', oprcode => 'range_ge',
- oprrest => 'rangesel', oprjoin => 'scalargejoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3887', oid_symbol => 'OID_RANGE_GREATER_OP',
descr => 'greater than',
oprname => '>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<(anyrange,anyrange)',
oprnegate => '<=(anyrange,anyrange)', oprcode => 'range_gt',
- oprrest => 'rangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'rangesel', oprjoin => 'rangejoinsel' },
{ oid => '3888', oid_symbol => 'OID_RANGE_OVERLAP_OP', descr => 'overlaps',
oprname => '&&', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '&&(anyrange,anyrange)',
oprcode => 'range_overlaps', oprrest => 'rangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3889', oid_symbol => 'OID_RANGE_CONTAINS_ELEM_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anyelement',
oprresult => 'bool', oprcom => '<@(anyelement,anyrange)',
oprcode => 'range_contains_elem', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3890', oid_symbol => 'OID_RANGE_CONTAINS_OP', descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<@(anyrange,anyrange)',
oprcode => 'range_contains', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3891', oid_symbol => 'OID_RANGE_ELEM_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyelement', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anyelement)',
oprcode => 'elem_contained_by_range', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3892', oid_symbol => 'OID_RANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anyrange)',
oprcode => 'range_contained_by', oprrest => 'rangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3893', oid_symbol => 'OID_RANGE_LEFT_OP', descr => 'is left of',
oprname => '<<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>>(anyrange,anyrange)',
oprcode => 'range_before', oprrest => 'rangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3894', oid_symbol => 'OID_RANGE_RIGHT_OP', descr => 'is right of',
oprname => '>>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<<(anyrange,anyrange)',
oprcode => 'range_after', oprrest => 'rangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3895', oid_symbol => 'OID_RANGE_OVERLAPS_LEFT_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'range_overleft', oprrest => 'rangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3896', oid_symbol => 'OID_RANGE_OVERLAPS_RIGHT_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'range_overright', oprrest => 'rangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3897', descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '-|-(anyrange,anyrange)',
oprcode => 'range_adjacent', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'rangejoinsel' },
{ oid => '3898', descr => 'range union',
oprname => '+', oprleft => 'anyrange', oprright => 'anyrange',
oprresult => 'anyrange', oprcom => '+(anyrange,anyrange)',
@@ -3277,139 +3277,139 @@
oprname => '<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>(anymultirange,anymultirange)',
oprnegate => '>=(anymultirange,anymultirange)', oprcode => 'multirange_lt',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2863', oid_symbol => 'OID_MULTIRANGE_LESS_EQUAL_OP',
descr => 'less than or equal',
oprname => '<=', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>=(anymultirange,anymultirange)',
oprnegate => '>(anymultirange,anymultirange)', oprcode => 'multirange_le',
- oprrest => 'multirangesel', oprjoin => 'scalarlejoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2864', oid_symbol => 'OID_MULTIRANGE_GREATER_EQUAL_OP',
descr => 'greater than or equal',
oprname => '>=', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<=(anymultirange,anymultirange)',
oprnegate => '<(anymultirange,anymultirange)', oprcode => 'multirange_ge',
- oprrest => 'multirangesel', oprjoin => 'scalargejoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2865', oid_symbol => 'OID_MULTIRANGE_GREATER_OP',
descr => 'greater than',
oprname => '>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<(anymultirange,anymultirange)',
oprnegate => '<=(anymultirange,anymultirange)', oprcode => 'multirange_gt',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2866', oid_symbol => 'OID_RANGE_OVERLAPS_MULTIRANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '&&(anymultirange,anyrange)',
oprcode => 'range_overlaps_multirange', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2867', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '&&(anyrange,anymultirange)',
oprcode => 'multirange_overlaps_range', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2868', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP',
descr => 'overlaps',
oprname => '&&', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '&&(anymultirange,anymultirange)',
oprcode => 'multirange_overlaps_multirange', oprrest => 'multirangesel',
- oprjoin => 'areajoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2869', oid_symbol => 'OID_MULTIRANGE_CONTAINS_ELEM_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anyelement',
oprresult => 'bool', oprcom => '<@(anyelement,anymultirange)',
oprcode => 'multirange_contains_elem', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2870', oid_symbol => 'OID_MULTIRANGE_CONTAINS_RANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<@(anyrange,anymultirange)',
oprcode => 'multirange_contains_range', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2871', oid_symbol => 'OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<@(anymultirange,anymultirange)',
oprcode => 'multirange_contains_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2872', oid_symbol => 'OID_MULTIRANGE_ELEM_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyelement', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anyelement)',
oprcode => 'elem_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2873', oid_symbol => 'OID_MULTIRANGE_RANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anyrange)',
oprcode => 'range_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2874', oid_symbol => 'OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '@>(anymultirange,anymultirange)',
oprcode => 'multirange_contained_by_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4539', oid_symbol => 'OID_RANGE_CONTAINS_MULTIRANGE_OP',
descr => 'contains',
oprname => '@>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<@(anymultirange,anyrange)',
oprcode => 'range_contains_multirange', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4540', oid_symbol => 'OID_RANGE_MULTIRANGE_CONTAINED_OP',
descr => 'is contained by',
oprname => '<@', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '@>(anyrange,anymultirange)',
oprcode => 'multirange_contained_by_range', oprrest => 'multirangesel',
- oprjoin => 'contjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '2875', oid_symbol => 'OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'range_overleft_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2876', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'multirange_overleft_range',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '2877', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP',
descr => 'overlaps or is left of',
oprname => '&<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'multirange_overleft_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalarltjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '3585', oid_symbol => 'OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'range_overright_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4035', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcode => 'multirange_overright_range',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4142', oid_symbol => 'OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP',
descr => 'overlaps or is right of',
oprname => '&>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcode => 'multirange_overright_multirange',
- oprrest => 'multirangesel', oprjoin => 'scalargtjoinsel' },
+ oprrest => 'multirangesel', oprjoin => 'multirangejoinsel' },
{ oid => '4179', oid_symbol => 'OID_RANGE_ADJACENT_MULTIRANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '-|-(anymultirange,anyrange)',
oprcode => 'range_adjacent_multirange', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4180', oid_symbol => 'OID_MULTIRANGE_ADJACENT_RANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '-|-(anyrange,anymultirange)',
oprcode => 'multirange_adjacent_range', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4198', oid_symbol => 'OID_MULTIRANGE_ADJACENT_MULTIRANGE_OP',
descr => 'is adjacent to',
oprname => '-|-', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '-|-(anymultirange,anymultirange)',
oprcode => 'multirange_adjacent_multirange', oprrest => 'matchingsel',
- oprjoin => 'matchingjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4392', descr => 'multirange union',
oprname => '+', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'anymultirange', oprcom => '+(anymultirange,anymultirange)',
@@ -3426,36 +3426,36 @@
oprname => '<<', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>>(anymultirange,anyrange)',
oprcode => 'range_before_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4396', oid_symbol => 'OID_MULTIRANGE_LEFT_RANGE_OP',
descr => 'is left of',
oprname => '<<', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '>>(anyrange,anymultirange)',
oprcode => 'multirange_before_range', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4397', oid_symbol => 'OID_MULTIRANGE_LEFT_MULTIRANGE_OP',
descr => 'is left of',
oprname => '<<', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '>>(anymultirange,anymultirange)',
oprcode => 'multirange_before_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalarltjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4398', oid_symbol => 'OID_RANGE_RIGHT_MULTIRANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anyrange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<<(anymultirange,anyrange)',
oprcode => 'range_after_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4399', oid_symbol => 'OID_MULTIRANGE_RIGHT_RANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anymultirange', oprright => 'anyrange',
oprresult => 'bool', oprcom => '<<(anyrange,anymultirange)',
oprcode => 'multirange_after_range', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
{ oid => '4400', oid_symbol => 'OID_MULTIRANGE_RIGHT_MULTIRANGE_OP',
descr => 'is right of',
oprname => '>>', oprleft => 'anymultirange', oprright => 'anymultirange',
oprresult => 'bool', oprcom => '<<(anymultirange,anymultirange)',
oprcode => 'multirange_after_multirange', oprrest => 'multirangesel',
- oprjoin => 'scalargtjoinsel' },
+ oprjoin => 'multirangejoinsel' },
]
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 7979392776..4eaaad9fe3 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12124,6 +12124,15 @@
proname => 'any_value_transfn', prorettype => 'anyelement',
proargtypes => 'anyelement anyelement', prosrc => 'any_value_transfn' },
+{ oid => '8355', descr => 'join selectivity for range operators',
+ proname => 'rangejoinsel', provolatile => 's', prorettype => 'float8',
+ proargtypes => 'internal oid internal int2 internal',
+ prosrc => 'rangejoinsel' },
+{ oid => '8356', descr => 'join selectivity for multirange operators',
+ proname => 'multirangejoinsel', provolatile => 's', prorettype => 'float8',
+ proargtypes => 'internal oid internal int2 internal',
+ prosrc => 'multirangejoinsel' },
+
{ oid => '8436',
descr => 'list of available WAL summary files',
proname => 'pg_available_wal_summaries', prorows => '100',
diff --git a/src/test/regress/expected/multirangetypes.out b/src/test/regress/expected/multirangetypes.out
index a0cb875492..21d63d9bda 100644
--- a/src/test/regress/expected/multirangetypes.out
+++ b/src/test/regress/expected/multirangetypes.out
@@ -3361,3 +3361,66 @@ create function mr_table_fail(i anyelement) returns table(i anyelement, r anymul
as $$ select $1, '[1,10]' $$ language sql;
ERROR: cannot determine result data type
DETAIL: A result of type anymultirange requires at least one input of type anyrange or anymultirange.
+--
+-- test selectivity of multirange join operators
+--
+create table test_multirange_join_1 (imr1 int4multirange);
+create table test_multirange_join_2 (imr2 int4multirange);
+create table test_multirange_join_3 (imr3 int4multirange);
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10)) from generate_series(1, 1000) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+100)) from generate_series(1, 1000, 10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10)) from generate_series(1, 500) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+100)) from generate_series(1, 500, 10) g;
+insert into test_multirange_join_3 select int4multirange(int4range(g, g+10)) from generate_series(501, 1000) g;
+insert into test_multirange_join_3 select int4multirange(int4range(g, g+100)) from generate_series(501, 1000, 10) g;
+analyze test_multirange_join_1;
+analyze test_multirange_join_2;
+analyze test_multirange_join_3;
+--reorder joins based on computed selectivity
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 && imr2 and imr2 && imr3;
+ QUERY PLAN
+-----------------------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_multirange_join_1.imr1 && test_multirange_join_2.imr2)
+ -> Nested Loop
+ Join Filter: (test_multirange_join_2.imr2 && test_multirange_join_3.imr3)
+ -> Seq Scan on test_multirange_join_2
+ -> Materialize
+ -> Seq Scan on test_multirange_join_3
+ -> Materialize
+ -> Seq Scan on test_multirange_join_1
+(10 rows)
+
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 << imr2 and imr2 << imr3;
+ QUERY PLAN
+-----------------------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_multirange_join_2.imr2 << test_multirange_join_3.imr3)
+ -> Nested Loop
+ Join Filter: (test_multirange_join_1.imr1 << test_multirange_join_2.imr2)
+ -> Seq Scan on test_multirange_join_1
+ -> Materialize
+ -> Seq Scan on test_multirange_join_2
+ -> Materialize
+ -> Seq Scan on test_multirange_join_3
+(10 rows)
+
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 >> imr2 and imr2 >> imr3;
+ QUERY PLAN
+-----------------------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_multirange_join_1.imr1 >> test_multirange_join_2.imr2)
+ -> Nested Loop
+ Join Filter: (test_multirange_join_2.imr2 >> test_multirange_join_3.imr3)
+ -> Seq Scan on test_multirange_join_2
+ -> Materialize
+ -> Seq Scan on test_multirange_join_3
+ -> Seq Scan on test_multirange_join_1
+(9 rows)
+
+drop table test_multirange_join_1;
+drop table test_multirange_join_2;
+drop table test_multirange_join_3;
diff --git a/src/test/regress/expected/rangetypes.out b/src/test/regress/expected/rangetypes.out
index ee02ff0163..357bb3154b 100644
--- a/src/test/regress/expected/rangetypes.out
+++ b/src/test/regress/expected/rangetypes.out
@@ -1834,3 +1834,66 @@ create function table_fail(i anyelement) returns table(i anyelement, r anyrange)
as $$ select $1, '[1,10]' $$ language sql;
ERROR: cannot determine result data type
DETAIL: A result of type anyrange requires at least one input of type anyrange or anymultirange.
+--
+-- test selectivity of range join operators
+--
+create table test_range_join_1 (ir1 int4range);
+create table test_range_join_2 (ir2 int4range);
+create table test_range_join_3 (ir3 int4range);
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1, 1000) g;
+insert into test_range_join_1 select int4range(g, g+100) from generate_series(1, 1000, 10) g;
+insert into test_range_join_2 select int4range(g, g+10) from generate_series(1, 500) g;
+insert into test_range_join_2 select int4range(g, g+100) from generate_series(1, 500, 10) g;
+insert into test_range_join_3 select int4range(g, g+10) from generate_series(501, 1000) g;
+insert into test_range_join_3 select int4range(g, g+100) from generate_series(501, 1000, 10) g;
+analyze test_range_join_1;
+analyze test_range_join_2;
+analyze test_range_join_3;
+--reorder joins based on computed selectivity
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 && ir2 and ir2 && ir3;
+ QUERY PLAN
+-----------------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_range_join_1.ir1 && test_range_join_2.ir2)
+ -> Seq Scan on test_range_join_1
+ -> Materialize
+ -> Nested Loop
+ Join Filter: (test_range_join_2.ir2 && test_range_join_3.ir3)
+ -> Seq Scan on test_range_join_2
+ -> Materialize
+ -> Seq Scan on test_range_join_3
+(10 rows)
+
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 << ir2 and ir2 << ir3;
+ QUERY PLAN
+-----------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_range_join_2.ir2 << test_range_join_3.ir3)
+ -> Nested Loop
+ Join Filter: (test_range_join_1.ir1 << test_range_join_2.ir2)
+ -> Seq Scan on test_range_join_1
+ -> Materialize
+ -> Seq Scan on test_range_join_2
+ -> Materialize
+ -> Seq Scan on test_range_join_3
+(10 rows)
+
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 >> ir2 and ir2 >> ir3;
+ QUERY PLAN
+-----------------------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ Join Filter: (test_range_join_1.ir1 >> test_range_join_2.ir2)
+ -> Nested Loop
+ Join Filter: (test_range_join_2.ir2 >> test_range_join_3.ir3)
+ -> Seq Scan on test_range_join_2
+ -> Materialize
+ -> Seq Scan on test_range_join_3
+ -> Seq Scan on test_range_join_1
+(9 rows)
+
+drop table test_range_join_1;
+drop table test_range_join_2;
+drop table test_range_join_3;
diff --git a/src/test/regress/sql/multirangetypes.sql b/src/test/regress/sql/multirangetypes.sql
index fefb4b4d42..4c62c31166 100644
--- a/src/test/regress/sql/multirangetypes.sql
+++ b/src/test/regress/sql/multirangetypes.sql
@@ -861,3 +861,30 @@ create function mr_inoutparam_fail(inout i anyelement, out r anymultirange)
--should fail
create function mr_table_fail(i anyelement) returns table(i anyelement, r anymultirange)
as $$ select $1, '[1,10]' $$ language sql;
+
+--
+-- test selectivity of multirange join operators
+--
+create table test_multirange_join_1 (imr1 int4multirange);
+create table test_multirange_join_2 (imr2 int4multirange);
+create table test_multirange_join_3 (imr3 int4multirange);
+
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+10)) from generate_series(1, 1000) g;
+insert into test_multirange_join_1 select int4multirange(int4range(g, g+100)) from generate_series(1, 1000, 10) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+10)) from generate_series(1, 500) g;
+insert into test_multirange_join_2 select int4multirange(int4range(g, g+100)) from generate_series(1, 500, 10) g;
+insert into test_multirange_join_3 select int4multirange(int4range(g, g+10)) from generate_series(501, 1000) g;
+insert into test_multirange_join_3 select int4multirange(int4range(g, g+100)) from generate_series(501, 1000, 10) g;
+
+analyze test_multirange_join_1;
+analyze test_multirange_join_2;
+analyze test_multirange_join_3;
+
+--reorder joins based on computed selectivity
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 && imr2 and imr2 && imr3;
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 << imr2 and imr2 << imr3;
+explain (costs off) select count(*) from test_multirange_join_1, test_multirange_join_2, test_multirange_join_3 where imr1 >> imr2 and imr2 >> imr3;
+
+drop table test_multirange_join_1;
+drop table test_multirange_join_2;
+drop table test_multirange_join_3;
diff --git a/src/test/regress/sql/rangetypes.sql b/src/test/regress/sql/rangetypes.sql
index c23be928c3..1018a234a5 100644
--- a/src/test/regress/sql/rangetypes.sql
+++ b/src/test/regress/sql/rangetypes.sql
@@ -629,3 +629,30 @@ create function inoutparam_fail(inout i anyelement, out r anyrange)
--should fail
create function table_fail(i anyelement) returns table(i anyelement, r anyrange)
as $$ select $1, '[1,10]' $$ language sql;
+
+--
+-- test selectivity of range join operators
+--
+create table test_range_join_1 (ir1 int4range);
+create table test_range_join_2 (ir2 int4range);
+create table test_range_join_3 (ir3 int4range);
+
+insert into test_range_join_1 select int4range(g, g+10) from generate_series(1, 1000) g;
+insert into test_range_join_1 select int4range(g, g+100) from generate_series(1, 1000, 10) g;
+insert into test_range_join_2 select int4range(g, g+10) from generate_series(1, 500) g;
+insert into test_range_join_2 select int4range(g, g+100) from generate_series(1, 500, 10) g;
+insert into test_range_join_3 select int4range(g, g+10) from generate_series(501, 1000) g;
+insert into test_range_join_3 select int4range(g, g+100) from generate_series(501, 1000, 10) g;
+
+analyze test_range_join_1;
+analyze test_range_join_2;
+analyze test_range_join_3;
+
+--reorder joins based on computed selectivity
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 && ir2 and ir2 && ir3;
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 << ir2 and ir2 << ir3;
+explain (costs off) select count(*) from test_range_join_1, test_range_join_2, test_range_join_3 where ir1 >> ir2 and ir2 >> ir3;
+
+drop table test_range_join_1;
+drop table test_range_join_2;
+drop table test_range_join_3;
On Fri, 5 Jan 2024 at 23:09, Schoemans Maxime <maxime.schoemans@ulb.be> wrote:
On 05/01/2024 11:37, vignesh C wrote:
One of the tests was aborted at [1], kindly post an updated patch for
the same:
Thank you for notifying us.
I believe I fixed the issue but it is hard to be certain as the issue
did not arise when running the regression tests locally.
I'm noticing this issue is not yet resolved, the CFBot is still
failing at [1]https://cirrus-ci.com/task/5698117824151552 with:
#7 0x000055cddc25cd93 in range_cmp_bound_values
(typcache=typcache@entry=0x629000030b60, b1=b1@entry=0x61c000016f08,
b2=b2@entry=0x61c0000180b8) at rangetypes.c:2090
[19:55:02.591] No locals.
[19:55:02.591] #8 0x000055cddc2685c1 in calc_hist_join_selectivity
(typcache=typcache@entry=0x629000030b60,
hist1=hist1@entry=0x61c0000180b8, nhist1=nhist1@entry=101,
hist2=hist2@entry=0x61c0000168b8, nhist2=nhist2@entry=101) at
rangetypes_selfuncs.c:1295
[19:55:02.591] i = 0
[19:55:02.591] j = 101
[19:55:02.591] selectivity = 0
[19:55:02.591] prev_sel1 = -1
[19:55:02.591] prev_sel2 = 0
[19:55:02.591] #9 0x000055cddc269aaa in rangejoinsel
(fcinfo=<optimized out>) at rangetypes_selfuncs.c:1479
[19:55:02.591] root = <optimized out>
[19:55:02.591] operator = <optimized out>
[19:55:02.591] args = <optimized out>
[19:55:02.591] sjinfo = <optimized out>
[19:55:02.591] vardata1 = {var = <optimized out>, rel = <optimized
out>, statsTuple = <optimized out>, freefunc = <optimized out>,
vartype = <optimized out>, atttype = <optimized out>, atttypmod =
<optimized out>, isunique = <optimized out>, acl_ok = <optimized out>}
[19:55:02.591] vardata2 = {var = <optimized out>, rel = <optimized
out>, statsTuple = <optimized out>, freefunc = <optimized out>,
vartype = <optimized out>, atttype = <optimized out>, atttypmod =
<optimized out>, isunique = <optimized out>, acl_ok = <optimized out>}
[19:55:02.591] hist1 = {staop = <optimized out>, stacoll = <optimized
out>, valuetype = <optimized out>, values = <optimized out>, nvalues =
<optimized out>, numbers = <optimized out>, nnumbers = <optimized
out>, values_arr = <optimized out>, numbers_arr = <optimized out>}
[19:55:02.591] hist2 = {staop = <optimized out>, stacoll = <optimized
out>, valuetype = <optimized out>, values = <optimized out>, nvalues =
<optimized out>, numbers = <optimized out>, nnumbers = <optimized
out>, values_arr = <optimized out>, numbers_arr = <optimized out>}
[19:55:02.591] sslot = {staop = <optimized out>, stacoll = <optimized
out>, valuetype = <optimized out>, values = <optimized out>, nvalues =
<optimized out>, numbers = <optimized out>, nnumbers = <optimized
out>, values_arr = <optimized out>, numbers_arr = <optimized out>}
[19:55:02.591] reversed = <optimized out>
[19:55:02.591] selec = 0.001709375000000013
[19:55:02.591] typcache = 0x629000030b60
[19:55:02.591] stats1 = <optimized out>
[19:55:02.591] stats2 = <optimized out>
[19:55:02.591] empty_frac1 = 0
[19:55:02.591] empty_frac2 = 0
[19:55:02.591] null_frac1 = 0
[19:55:02.591] null_frac2 = 0
[19:55:02.591] nhist1 = 101
[19:55:02.591] nhist2 = 101
[19:55:02.591] hist1_lower = 0x61c0000168b8
[19:55:02.591] hist1_upper = 0x61c0000170b8
[19:55:02.591] hist2_lower = 0x61c0000178b8
[19:55:02.591] hist2_upper = 0x61c0000180b8
[19:55:02.591] empty = <optimized out>
[19:55:02.591] i = <optimized out>
[19:55:02.591] __func__ = "rangejoinsel"
[19:55:02.591] #10 0x000055cddc3b761f in FunctionCall5Coll
(flinfo=flinfo@entry=0x7ffc1628d710, collation=collation@entry=0,
arg1=arg1@entry=107545982648856, arg2=arg2@entry=3888,
arg3=arg3@entry=107820862916056, arg4=arg4@entry=0, arg5=<optimized
out>) at fmgr.c:1242
[19:55:02.591] fcinfodata = {fcinfo = {flinfo = <optimized out>,
context = <optimized out>, resultinfo = <optimized out>, fncollation =
<optimized out>, isnull = <optimized out>, nargs = <optimized out>,
args = 0x0}, fcinfo_data = {<optimized out> <repeats 112 times>}}
[19:55:02.591] fcinfo = 0x7ffc1628d5e0
[19:55:02.591] result = <optimized out>
[19:55:02.591] __func__ = "FunctionCall5Coll"
[19:55:02.591] #11 0x000055cddc3b92ee in OidFunctionCall5Coll
(functionId=8355, collation=collation@entry=0,
arg1=arg1@entry=107545982648856, arg2=arg2@entry=3888,
arg3=arg3@entry=107820862916056, arg4=arg4@entry=0, arg5=<optimized
out>) at fmgr.c:1460
[19:55:02.591] flinfo = {fn_addr = <optimized out>, fn_oid =
<optimized out>, fn_nargs = <optimized out>, fn_strict = <optimized
out>, fn_retset = <optimized out>, fn_stats = <optimized out>,
fn_extra = <optimized out>, fn_mcxt = <optimized out>, fn_expr =
<optimized out>}
[19:55:02.591] #12 0x000055cddbe834ae in join_selectivity
(root=root@entry=0x61d00017c218, operatorid=operatorid@entry=3888,
args=0x6210003bc5d8, inputcollid=0,
jointype=jointype@entry=JOIN_INNER,
sjinfo=sjinfo@entry=0x7ffc1628db30) at
../../../../src/include/postgres.h:324
[19:55:02.591] oprjoin = <optimized out>
[19:55:02.591] result = <optimized out>
[19:55:02.591] __func__ = "join_selectivity"
[19:55:02.591] #13 0x000055cddbd8c18c in clause_selectivity_ext
(root=root@entry=0x61d00017c218, clause=0x6210003bc678,
varRelid=varRelid@entry=0, jointype=jointype@entry=JOIN_INNER,
sjinfo=sjinfo@entry=0x7ffc1628db30,
use_extended_stats=use_extended_stats@entry=true) at clausesel.c:841
I have changed the status to "Waiting on Author", feel free to post an
updated version, check CFBot and update the Commitfest entry
accordingly.
[1]: https://cirrus-ci.com/task/5698117824151552
Regards,
Vignesh
I cannot figure out why it aborts.
as Tom mentioned in upthread about the test cases.
similar to src/test/regress/sql/stats_ext.sql check_estimated_rows function.
we can test it by something:
create or replace function check_estimated_rows(text) returns table (ok bool)
language plpgsql as
$$
declare
ln text;
tmp text[];
first_row bool := true;
begin
for ln in
execute format('explain analyze %s', $1)
loop
if first_row then
first_row := false;
tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
return query select 0.2 < tmp[1]::float8 / tmp[2]::float8
and tmp[1]::float8 / tmp[2]::float8 < 5;
end if;
end loop;
end;
$$;
select * from check_estimated_rows($$select * from test_range_join_1,
test_range_join_2 where ir1 && ir2$$);
select * from check_estimated_rows($$select * from test_range_join_1,
test_range_join_2 where ir1 << ir2$$);
select * from check_estimated_rows($$select * from test_range_join_1,
test_range_join_2 where ir1 >> ir2$$);
Do you need 3 tables to do the test? because we need to actually run
the query then compare the estimated row
and actually returned rows.
If you really execute the query with 3 table joins, it will take a lot of time.
So two tables join with where quql should be fine?
/* Fast-forwards i and j to start of iteration */
+ for (i = 0; range_cmp_bound_values(typcache, &hist1[i], &hist2[0]) < 0; i++);
+ for (j = 0; range_cmp_bound_values(typcache, &hist2[j], &hist1[0]) < 0; j++);
+
+ /* Do the estimation on overlapping regions */
+ while (i < nhist1 && j < nhist2)
+ {
+ double cur_sel1,
+ cur_sel2;
+ RangeBound cur_sync;
+
+ if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) < 0)
+ cur_sync = hist1[i++];
+ else if (range_cmp_bound_values(typcache, &hist1[i], &hist2[j]) > 0)
+ cur_sync = hist2[j++];
+ else
+ {
+ /* If equal, skip one */
+ cur_sync = hist1[i];
+
this part range_cmp_bound_values "if else if" part computed twice, you
can just do
`
int cmp;
cmp = range_cmp_bound_values(typcache, &hist1[i], &hist2[j]);
if cmp <0 then
else if cmp > 0 then
else then
`
also. I think you can put the following into main while loop.
+ for (i = 0; range_cmp_bound_values(typcache, &hist1[i], &hist2[0]) < 0; i++);
+ for (j = 0; range_cmp_bound_values(typcache, &hist2[j], &hist1[0]) < 0; j++);
split range and multirange into 2 patches might be a good idea.
seems: same function (calc_hist_join_selectivity) with same function
signature in src/backend/utils/adt/multirangetypes_selfuncs.c
and src/backend/utils/adt/rangetypes_selfuncs.c,
previously mail complaints not resolved.