Re: Functional dependencies and GROUP BY - for subqueries

Started by Ashutosh Bapatover 12 years ago7 messages
#1Ashutosh Bapat
ashutosh.bapat@enterprisedb.com
1 attachment(s)

Hi All,
If group by clause has primary key, the targetlist may have columns which
are not part of the aggregate and not part of group by clause. The relevant
commit is e49ae8d3bc588294d07ce1a1272b31718cfca5ef and relevant mail thread
has subject Functional dependencies and GROUP BY.

As a result, for following set of commands, the last SELECT statement does
not throw error.
CREATE TEMP TABLE products (product_id int, name text, price numeric);
CREATE TEMP TABLE sales (product_id int, units int);
ALTER TABLE products ADD PRIMARY KEY (product_id);
SELECT product_id, p.name, (sum(s.units) * p.price) AS sales FROM products
p LEFT JOIN sales s USING (product_id) GROUP BY product_id;

But, if I rewrite the query using views as follows
create view sel_product as SELECT p.product_id, p.name, p.price FROM
products p;
create view sel_sales as SELECT s.units, s.product_id FROM ONLY sales s;
SELECT p.product_id, p.name, (sum(s.units) * p.price) FROM sel_product p
LEFT JOIN sel_sales s using(product_id) GROUP BY p.product_id;

The last SELECT statement gives error
ERROR: column "p.name" must appear in the GROUP BY clause or be used in an
aggregate function
LINE 1: SELECT p.product_id, p.name, (sum(s.units) * p.price) FROM s...

The reason being, it doesn't look into the subqueries (in FROM clause) to
infer that p.product_id is essentially product.product_id which is a
primary key.

Attached find a crude patch to infer the same by traversing subqueries.

As I said the patch is crude and needs a better shape. If community is
willing to accept the extension, I can work on it further.

--
Best Wishes,
Ashutosh Bapat
EntepriseDB Corporation
The Postgres Database Company

Attachments:

gb_subquery_pk.patchapplication/octet-stream; name=gb_subquery_pk.patchDownload
diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c
index a944a4d..6465d28 100644
--- a/src/backend/parser/parse_agg.c
+++ b/src/backend/parser/parse_agg.c
@@ -45,20 +45,22 @@ typedef struct
 } check_ungrouped_columns_context;
 
 static int	check_agg_arguments(ParseState *pstate, List *args);
 static bool check_agg_arguments_walker(Node *node,
 						   check_agg_arguments_context *context);
 static void check_ungrouped_columns(Node *node, ParseState *pstate, Query *qry,
 						List *groupClauses, bool have_non_var_grouping,
 						List **func_grouped_rels);
 static bool check_ungrouped_columns_walker(Node *node,
 							   check_ungrouped_columns_context *context);
+static bool check_functional_grouping_recurse(RangeTblEntry *rte, Var *var,
+									List *grouping_columns);
 
 
 /*
  * transformAggregateCall -
  *		Finish initial transformation of an aggregate call
  *
  * parse_func.c has recognized the function as an aggregate, and has set up
  * all the fields of the Aggref except args, aggorder, aggdistinct and
  * agglevelsup.  The passed-in args list has been through standard expression
  * transformation, while the passed-in aggorder list hasn't been transformed
@@ -786,20 +788,71 @@ check_ungrouped_columns(Node *node, ParseState *pstate, Query *qry,
 
 	context.pstate = pstate;
 	context.qry = qry;
 	context.groupClauses = groupClauses;
 	context.have_non_var_grouping = have_non_var_grouping;
 	context.func_grouped_rels = func_grouped_rels;
 	context.sublevels_up = 0;
 	check_ungrouped_columns_walker(node, &context);
 }
 
+
+static bool
+check_functional_grouping_recurse(RangeTblEntry *rte, Var *var,
+									List *grouping_columns)
+{
+	TargetEntry *tle;
+	ListCell	*lcell;
+	List		*sq_grouping_columns = NIL;
+	ParseState	*pstate;
+	bool		have_non_var_grouping;
+	List		*func_grouped_rels = NIL;
+
+	/* Assumed that the rte corresponds to the varno in passed in var node */
+	Assert(rte->rtekind == RTE_SUBQUERY);
+	/*
+	 * Fetch those grouping columns which entirely belong to this subquery, i.e
+	 * Var nodes which have the same varno as the given Var node.
+	 */
+	have_non_var_grouping = false;
+	foreach(lcell, grouping_columns)
+	{
+		Var *grouping_var = lfirst(lcell); 
+		if (IsA(grouping_var, Var) &&
+			grouping_var->varno == var->varno)
+		{
+			tle = get_tle_by_resno(rte->subquery->targetList, grouping_var->varattno);
+			Assert(tle && tle->expr);
+			sq_grouping_columns = lappend(sq_grouping_columns, tle->expr);
+			if (!IsA(tle->expr, Var))
+				have_non_var_grouping = true;
+		}
+	}
+
+	/* Obtain the expression corresponding var node */
+	tle = get_tle_by_resno(rte->subquery->targetList, var->varattno);
+	Assert(tle && tle->expr);
+
+	
+	pstate = make_parsestate(NULL);
+	/* Is this correct way to set the p_rtable from ParseState? */
+	pstate->p_rtable = rte->subquery->rtable;
+	/*
+	 * The function would throw error, in case the expression is found to be
+	 * ungrouped, otherwise we are good to go
+	 */
+	check_ungrouped_columns((Node *)tle->expr, pstate, rte->subquery,
+						sq_grouping_columns, have_non_var_grouping, &func_grouped_rels);
+	return true;
+
+}
+
 static bool
 check_ungrouped_columns_walker(Node *node,
 							   check_ungrouped_columns_context *context)
 {
 	ListCell   *gl;
 
 	if (node == NULL)
 		return false;
 	if (IsA(node, Const) ||
 		IsA(node, Param))
@@ -893,20 +946,30 @@ check_ungrouped_columns_walker(Node *node,
 										  var->varno,
 										  0,
 										  context->groupClauses,
 										  &context->qry->constraintDeps))
 			{
 				*context->func_grouped_rels =
 					lappend_int(*context->func_grouped_rels, var->varno);
 				return false;	/* acceptable */
 			}
 		}
+		else if (rte->rtekind == RTE_SUBQUERY)
+		{
+			if (check_functional_grouping_recurse(rte, var,
+													context->groupClauses))
+			{
+				*context->func_grouped_rels =
+					lappend_int(*context->func_grouped_rels, var->varno);
+				return false;
+			}
+		}
 
 		/* Found an ungrouped local variable; generate error message */
 		attname = get_rte_attribute_name(rte, var->varattno);
 		if (context->sublevels_up == 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_GROUPING_ERROR),
 					 errmsg("column \"%s.%s\" must appear in the GROUP BY clause or be used in an aggregate function",
 							rte->eref->aliasname, attname),
 					 parser_errposition(context->pstate, var->location)));
 		else
#2Tom Lane
tgl@sss.pgh.pa.us
In reply to: Ashutosh Bapat (#1)

Ashutosh Bapat <ashutosh.bapat@enterprisedb.com> writes:

The reason being, it doesn't look into the subqueries (in FROM clause) to
infer that p.product_id is essentially product.product_id which is a
primary key.

Right.

Attached find a crude patch to infer the same by traversing subqueries.

I think this is probably a bad idea. We could spend an infinite amount
of code this way, with ever-increasing runtime cost and ever-decreasing
usefulness, and where would we stop? I'm also a bit concerned about
allowing illegal queries due to bugs of omission in the
ever-more-complex checking code, which could be quite hard to find, and
when we did find them we'd be faced with a backwards compatibility break
if we fix them.

A larger point is that the patch as proposed doesn't fix the stated
problem, because it only descends into written-out subqueries. It
would only succeed at looking into views if we applied it after
rewriting, rather than in the parser. That's really not going to work.
It would be a complete disaster if the dependencies of a query that
references a view depend on the view's contents.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#3Ashutosh Bapat
ashutosh.bapat@enterprisedb.com
In reply to: Tom Lane (#2)

On Fri, Apr 26, 2013 at 7:54 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Ashutosh Bapat <ashutosh.bapat@enterprisedb.com> writes:

The reason being, it doesn't look into the subqueries (in FROM clause) to
infer that p.product_id is essentially product.product_id which is a
primary key.

Right.

Attached find a crude patch to infer the same by traversing subqueries.

I think this is probably a bad idea. We could spend an infinite amount
of code this way, with ever-increasing runtime cost and ever-decreasing
usefulness, and where would we stop? I'm also a bit concerned about
allowing illegal queries due to bugs of omission in the
ever-more-complex checking code, which could be quite hard to find, and
when we did find them we'd be faced with a backwards compatibility break
if we fix them.

A larger point is that the patch as proposed doesn't fix the stated
problem, because it only descends into written-out subqueries.

That's correct. I tested it only with the written-out subqueries, to see if
the idea works. But it started with the case involving views.

It
would only succeed at looking into views if we applied it after
rewriting, rather than in the parser. That's really not going to work.
It would be a complete disaster if the dependencies of a query that
references a view depend on the view's contents.

Can you please elaborate, why would it be a disaster?

Will we extend this functionality for written-out subqueries queries and/or
views or none?

I am not touchy about the approach, I have taken. I am interested in the
feature extension, whichever way it gets implemented.

regards, tom lane

--
Best Wishes,
Ashutosh Bapat
EntepriseDB Corporation
The Postgres Database Company

#4Tom Lane
tgl@sss.pgh.pa.us
In reply to: Ashutosh Bapat (#3)

Ashutosh Bapat <ashutosh.bapat@enterprisedb.com> writes:

On Fri, Apr 26, 2013 at 7:54 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

A larger point is that the patch as proposed doesn't fix the stated
problem, because it only descends into written-out subqueries. It
would only succeed at looking into views if we applied it after
rewriting, rather than in the parser. That's really not going to work.
It would be a complete disaster if the dependencies of a query that
references a view depend on the view's contents.

Can you please elaborate, why would it be a disaster?

Consider that we've done

create table t1 (id int primary key, ... other stuff ...);
create view v1 as select * from t1;
create view v2 as select * from v1 group by id;

Currently, v2 would be rejected but you would like to make it legal.
Now consider

alter table t1 drop primary key;

This ALTER would have to be rejected, or else (with CASCADE) lead to
dropping v2 but not v1. That's pretty ugly action-at-a-distance
if you ask me. But worse, consider

create or replace view v1 as select * from t2;

where t2 exposes the same columns as t1 but lacks a primary-key
constraint on id. This likewise would need to invalidate v2. We lack
any dependency mechanism that could enforce that, and it seems seriously
ugly that such a view redefinition could fail at all. (Note for
instance that there's no place to put a CASCADE/RESTRICT option in
CREATE OR REPLACE VIEW.)

So quite aside from the implementation difficulties of looking into
views for such constraints, I don't think the behavior would be pleasant
if we did do it. Views are not supposed to expose properties of the
underlying tables.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#5Ashutosh Bapat
ashutosh.bapat@enterprisedb.com
In reply to: Tom Lane (#4)

Can you please elaborate, why would it be a disaster?

Consider that we've done

create table t1 (id int primary key, ... other stuff ...);
create view v1 as select * from t1;
create view v2 as select * from v1 group by id;

Currently, v2 would be rejected but you would like to make it legal.
Now consider

alter table t1 drop primary key;

This ALTER would have to be rejected, or else (with CASCADE) lead to
dropping v2 but not v1. That's pretty ugly action-at-a-distance
if you ask me. But worse, consider

create or replace view v1 as select * from t2;

where t2 exposes the same columns as t1 but lacks a primary-key
constraint on id. This likewise would need to invalidate v2. We lack
any dependency mechanism that could enforce that, and it seems seriously
ugly that such a view redefinition could fail at all. (Note for
instance that there's no place to put a CASCADE/RESTRICT option in
CREATE OR REPLACE VIEW.)

So quite aside from the implementation difficulties of looking into
views for such constraints, I don't think the behavior would be pleasant
if we did do it. Views are not supposed to expose properties of the
underlying tables.

Thanks for the explanation.

Is there any reason why do we want to check the functional dependencies at
the time of parsing and not after rewrite? Obviously, by doing so, we will
allow creation of certain views which will start throwing errors after the
underlying table changes the primary key. Is it mandatory that we throw
"functional dependency" related errors at the time of creation of views?

regards, tom lane

--
Best Wishes,
Ashutosh Bapat
EntepriseDB Corporation
The Postgres Database Company

#6Tom Lane
tgl@sss.pgh.pa.us
In reply to: Ashutosh Bapat (#5)

Ashutosh Bapat <ashutosh.bapat@enterprisedb.com> writes:

Is there any reason why do we want to check the functional dependencies at
the time of parsing and not after rewrite? Obviously, by doing so, we will
allow creation of certain views which will start throwing errors after the
underlying table changes the primary key. Is it mandatory that we throw
"functional dependency" related errors at the time of creation of views?

From a usability standpoint, I would think so. And really the only
excuse for the functional-dependency feature to exist at all is
usability; it adds nothing you couldn't do without it.

If we wanted to do something like this, I think the clean way to do it
would be to invent a notion of unique/not-null/pkey constraints on
views, so that the creator of a view could declaratively say that he
wants such a property exposed. That is, the example would become
something like

create table t1 (id int primary key, ... other stuff ...);
create view v1 as select * from t1;
alter view v1 add primary key(id);
create view v2 as select * from v1 group by id;

The pkey constraint on v1 is just a catalog entry with a dependency on
t1's pkey constraint; there's no actual index there. But now, v2 can
be built with a dependency on v1's pkey, not t1's, and the action-at-
a-distance problem goes away. For example, a "CREATE OR REPLACE v1"
command could check that the new view definition still provides
something for v1's pkey to depend on, and throw error or not without any
need to examine the contents of other views. Dropping various elements
of this schema would work unsurprisingly, too.

This would, of course, require a significant chunk of new code, and
personally I do not think the feature would be worth it. But it
would be a clean and usable design.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#7Ashutosh Bapat
ashutosh.bapat@enterprisedb.com
In reply to: Tom Lane (#6)

On Mon, Apr 29, 2013 at 7:31 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Ashutosh Bapat <ashutosh.bapat@enterprisedb.com> writes:

Is there any reason why do we want to check the functional dependencies

at

the time of parsing and not after rewrite? Obviously, by doing so, we

will

allow creation of certain views which will start throwing errors after

the

underlying table changes the primary key. Is it mandatory that we throw
"functional dependency" related errors at the time of creation of views?

From a usability standpoint, I would think so. And really the only
excuse for the functional-dependency feature to exist at all is
usability; it adds nothing you couldn't do without it.

If we wanted to do something like this, I think the clean way to do it
would be to invent a notion of unique/not-null/pkey constraints on
views, so that the creator of a view could declaratively say that he
wants such a property exposed. That is, the example would become
something like

create table t1 (id int primary key, ... other stuff ...);
create view v1 as select * from t1;
alter view v1 add primary key(id);
create view v2 as select * from v1 group by id;

The pkey constraint on v1 is just a catalog entry with a dependency on
t1's pkey constraint; there's no actual index there. But now, v2 can
be built with a dependency on v1's pkey, not t1's, and the action-at-
a-distance problem goes away. For example, a "CREATE OR REPLACE v1"
command could check that the new view definition still provides
something for v1's pkey to depend on, and throw error or not without any
need to examine the contents of other views. Dropping various elements
of this schema would work unsurprisingly, too.

This would, of course, require a significant chunk of new code, and
personally I do not think the feature would be worth it. But it
would be a clean and usable design.

Yes, this looks better design. But I do not see any interest as such. So,
if I have to spend time here, there is higher chance it would go waste.

Will it be useful to have primary key grouping functionality extended to
the subqueries? For example,

CREATE TEMP TABLE products (product_id int, name text, price numeric);
CREATE TEMP TABLE sales (product_id int, units int);
ALTER TABLE products ADD PRIMARY KEY (product_id);

SELECT product_id, p.name, (sum(s.units) * p.price) AS sales FROM (select *
from products) p LEFT JOIN (select * from sales) s USING (product_id) GROUP
BY product_id;

This subquery gives error (p.name should be part of group by clause), but
functionally it's grouping based on primary key. Is there a better way to
use the functional dependency for grouping?

regards, tom lane

--
Best Wishes,
Ashutosh Bapat
EntepriseDB Corporation
The Postgres Database Company