Optimization of NestLoop join in the case of guaranteed empty inner subtree

Started by Andrey Lepikhovabout 6 years ago3 messages
#1Andrey Lepikhov
a.lepikhov@postgrespro.ru
4 attachment(s)

During NestLoop execution we have bad corner case: if outer subtree
contains tuples the join node will scan inner subtree even if it does
not return any tuples.

To reproduce the problem see 'problem.sql' in attachment:
Out of explain analyze see in 'problem_explain.txt'

As you can see, executor scan each of 1e5 outer tuples despite the fact
that inner can't return any tuples.

Teodor Sigaev and I developed a patch to solve this problem. Result of
explain analyze procedure can be found in the 'optimized_execution.txt'.

--
Andrey Lepikhov
Postgres Professional
https://postgrespro.com
The Russian Postgres Company

Attachments:

problem.sqlapplication/sql; name=problem.sqlDownload
problem_explain.txttext/plain; charset=UTF-8; name=problem_explain.txtDownload
0001-Skip-scan-of-outer-subtree-if-inner-of-the-NestedLoo.patchtext/x-patch; charset=UTF-8; name=0001-Skip-scan-of-outer-subtree-if-inner-of-the-NestedLoo.patchDownload
From 40ffbd2d9ee5954e5ae09f1e5a929ae2f2b17b8c Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepikhov@postgrespro.ru>
Date: Mon, 9 Dec 2019 18:25:04 +0500
Subject: [PATCH] Skip scan of outer subtree if inner of the NestedLoop node is
 guaranteed empty.

---
 src/backend/executor/nodeMaterial.c           | 11 +++++++++++
 src/backend/executor/nodeNestloop.c           |  5 +++++
 src/include/nodes/execnodes.h                 |  4 ++++
 src/test/regress/expected/partition_prune.out |  8 ++++----
 4 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/backend/executor/nodeMaterial.c b/src/backend/executor/nodeMaterial.c
index cc93bbe45b..98771647f6 100644
--- a/src/backend/executor/nodeMaterial.c
+++ b/src/backend/executor/nodeMaterial.c
@@ -135,6 +135,8 @@ ExecMaterial(PlanState *pstate)
 		if (TupIsNull(outerslot))
 		{
 			node->eof_underlying = true;
+			if (tuplestorestate && tuplestore_tuple_count(tuplestorestate) == 0)
+				node->ss.ps.guaranteed_empty = true;
 			return NULL;
 		}
 
@@ -348,6 +350,9 @@ ExecReScanMaterial(MaterialState *node)
 			node->tuplestorestate = NULL;
 			if (outerPlan->chgParam == NULL)
 				ExecReScan(outerPlan);
+			else
+				/* At first look no cases can lead to this code. But still. */
+				node->ss.ps.guaranteed_empty = false;
 			node->eof_underlying = false;
 		}
 		else
@@ -363,6 +368,12 @@ ExecReScanMaterial(MaterialState *node)
 		 */
 		if (outerPlan->chgParam == NULL)
 			ExecReScan(outerPlan);
+
+		/*
+		 * The node suppress tuplestore and guaranteed_empty trick isn't work.
+		 */
+		Assert(node->ss.ps.guaranteed_empty == false);
+
 		node->eof_underlying = false;
 	}
 }
diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c
index fc6667ef82..ea26a7d9d1 100644
--- a/src/backend/executor/nodeNestloop.c
+++ b/src/backend/executor/nodeNestloop.c
@@ -164,6 +164,11 @@ ExecNestLoop(PlanState *pstate)
 		{
 			ENL1_printf("no inner tuple, need new outer tuple");
 
+			if (innerPlan->guaranteed_empty &&
+				(node->js.jointype == JOIN_INNER ||
+				 node->js.jointype == JOIN_SEMI))
+				return NULL;
+
 			node->nl_NeedNewOuter = true;
 
 			if (!node->nl_MatchedOuter &&
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 692438d6df..c7f1a14526 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1007,6 +1007,9 @@ typedef struct PlanState
 	 * ExecConditionalAssignProjectionInfo()).  If no projection is necessary
 	 * ExecConditionalAssignProjectionInfo() defaults those fields to the scan
 	 * operations.
+	 *
+	 * If guaranteed_empty field is true, this means that no tuple will be
+	 * returned by the node.
 	 */
 	const TupleTableSlotOps *scanops;
 	const TupleTableSlotOps *outerops;
@@ -1020,6 +1023,7 @@ typedef struct PlanState
 	bool		outeropsset;
 	bool		inneropsset;
 	bool		resultopsset;
+	bool		guaranteed_empty;
 } PlanState;
 
 /* ----------------
diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out
index f9eeda60e6..04cfe0944e 100644
--- a/src/test/regress/expected/partition_prune.out
+++ b/src/test/regress/expected/partition_prune.out
@@ -2455,9 +2455,9 @@ update ab_a1 set b = 3 from ab where ab.a = 1 and ab.a = ab_a1.a;
                      Heap Blocks: exact=1
                      ->  Bitmap Index Scan on ab_a1_b2_a_idx (actual rows=1 loops=1)
                            Index Cond: (a = 1)
-               ->  Bitmap Heap Scan on ab_a1_b3 ab_2 (actual rows=0 loops=1)
+               ->  Bitmap Heap Scan on ab_a1_b3 ab_2 (never executed)
                      Recheck Cond: (a = 1)
-                     ->  Bitmap Index Scan on ab_a1_b3_a_idx (actual rows=0 loops=1)
+                     ->  Bitmap Index Scan on ab_a1_b3_a_idx (never executed)
                            Index Cond: (a = 1)
          ->  Materialize (actual rows=0 loops=1)
                ->  Bitmap Heap Scan on ab_a1_b1 ab_a1_1 (actual rows=0 loops=1)
@@ -2496,9 +2496,9 @@ update ab_a1 set b = 3 from ab where ab.a = 1 and ab.a = ab_a1.a;
                      Heap Blocks: exact=1
                      ->  Bitmap Index Scan on ab_a1_b2_a_idx (actual rows=1 loops=1)
                            Index Cond: (a = 1)
-               ->  Bitmap Heap Scan on ab_a1_b3 ab_2 (actual rows=0 loops=1)
+               ->  Bitmap Heap Scan on ab_a1_b3 ab_2 (never executed)
                      Recheck Cond: (a = 1)
-                     ->  Bitmap Index Scan on ab_a1_b3_a_idx (actual rows=1 loops=1)
+                     ->  Bitmap Index Scan on ab_a1_b3_a_idx (never executed)
                            Index Cond: (a = 1)
          ->  Materialize (actual rows=0 loops=1)
                ->  Bitmap Heap Scan on ab_a1_b3 ab_a1_3 (actual rows=0 loops=1)
-- 
2.17.1

optimized_execution.txttext/plain; charset=UTF-8; name=optimized_execution.txtDownload
#2Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andrey Lepikhov (#1)
Re: Optimization of NestLoop join in the case of guaranteed empty inner subtree

Andrey Lepikhov <a.lepikhov@postgrespro.ru> writes:

During NestLoop execution we have bad corner case: if outer subtree
contains tuples the join node will scan inner subtree even if it does
not return any tuples.

So the first question about corner-case optimizations like this is always
"how much overhead does it add in the normal case where it fails to gain
anything?". I see no performance numbers in your proposal.

I do not much like anything about the code, either: as written it's
only helpful for an especially narrow corner case (so narrow that
I wonder if it really ever helps at all: surely calling a nodeMaterial
whose tuplestore is empty doesn't cost much). But that doesn't stop it
from adding a bool to the generic PlanState struct, with global
implications. What I'd expected from your text description is that
nodeNestLoop would remember whether its inner child had returned zero rows
the first time, and assume that subsequent executions could be skipped
unless the inner child's parameters change.

regards, tom lane

#3Andrey Lepikhov
a.lepikhov@postgrespro.ru
In reply to: Tom Lane (#2)
1 attachment(s)
Re: Optimization of NestLoop join in the case of guaranteed empty inner subtree

On 12/11/19 8:49 PM, Tom Lane wrote:

Andrey Lepikhov <a.lepikhov@postgrespro.ru> writes:

During NestLoop execution we have bad corner case: if outer subtree
contains tuples the join node will scan inner subtree even if it does
not return any tuples.

So the first question about corner-case optimizations like this is always
"how much overhead does it add in the normal case where it fails to gain
anything?". I see no performance numbers in your proposal.

I thought it is trivial. But quick study shows no differences that can
be seen.

I do not much like anything about the code, either: as written it's
only helpful for an especially narrow corner case (so narrow that
I wonder if it really ever helps at all: surely calling a nodeMaterial
whose tuplestore is empty doesn't cost much).

Scanning of large outer can be very costly. If you will try to play with
analytical queries you can find cases, where nested loops uses
materialization of zero tuples. At least one of the cases for this is
finding data gaps.
Also, this optimization exists in logic of hash join.

But that doesn't stop it
from adding a bool to the generic PlanState struct, with global
implications. What I'd expected from your text description is that
nodeNestLoop would remember whether its inner child had returned zero rows
the first time, and assume that subsequent executions could be skipped
unless the inner child's parameters change.

This note I was waiting for. I agree with you that adding a bool
variable to PlanState is excessful. See in attachment another version of
the optimization.

--
Andrey Lepikhov
Postgres Professional
https://postgrespro.com
The Russian Postgres Company

Attachments:

v2-0001-Skip-scan-of-outer-subtree-if-inner-of-the-NestedLoo.patchtext/x-patch; charset=UTF-8; name=v2-0001-Skip-scan-of-outer-subtree-if-inner-of-the-NestedLoo.patchDownload
From a92617b82d922d5ebac7342bd8c212e0eb5b4553 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepikhov@postgrespro.ru>
Date: Mon, 9 Dec 2019 18:25:04 +0500
Subject: [PATCH] Skip scan of outer subtree if inner of the NestedLoop node is
 guaranteed empty.

---
 src/backend/executor/nodeNestloop.c           | 8 ++++++++
 src/include/nodes/execnodes.h                 | 1 +
 src/test/regress/expected/partition_prune.out | 8 ++++----
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c
index fc6667ef82..4a7da5406d 100644
--- a/src/backend/executor/nodeNestloop.c
+++ b/src/backend/executor/nodeNestloop.c
@@ -164,6 +164,11 @@ ExecNestLoop(PlanState *pstate)
 		{
 			ENL1_printf("no inner tuple, need new outer tuple");
 
+			if (node->nl_InnerEmpty && list_length(nl->nestParams) == 0 &&
+				(node->js.jointype == JOIN_INNER ||
+				 node->js.jointype == JOIN_SEMI))
+				return NULL;
+
 			node->nl_NeedNewOuter = true;
 
 			if (!node->nl_MatchedOuter &&
@@ -200,6 +205,8 @@ ExecNestLoop(PlanState *pstate)
 			 */
 			continue;
 		}
+		else
+			node->nl_InnerEmpty = false;
 
 		/*
 		 * at this point we have a new pair of inner and outer tuples so we
@@ -327,6 +334,7 @@ ExecInitNestLoop(NestLoop *node, EState *estate, int eflags)
 	{
 		case JOIN_INNER:
 		case JOIN_SEMI:
+			nlstate->nl_InnerEmpty = true;
 			break;
 		case JOIN_LEFT:
 		case JOIN_ANTI:
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 692438d6df..8829433347 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1847,6 +1847,7 @@ typedef struct NestLoopState
 	JoinState	js;				/* its first field is NodeTag */
 	bool		nl_NeedNewOuter;
 	bool		nl_MatchedOuter;
+	bool		nl_InnerEmpty;
 	TupleTableSlot *nl_NullInnerTupleSlot;
 } NestLoopState;
 
diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out
index f9eeda60e6..04cfe0944e 100644
--- a/src/test/regress/expected/partition_prune.out
+++ b/src/test/regress/expected/partition_prune.out
@@ -2455,9 +2455,9 @@ update ab_a1 set b = 3 from ab where ab.a = 1 and ab.a = ab_a1.a;
                      Heap Blocks: exact=1
                      ->  Bitmap Index Scan on ab_a1_b2_a_idx (actual rows=1 loops=1)
                            Index Cond: (a = 1)
-               ->  Bitmap Heap Scan on ab_a1_b3 ab_2 (actual rows=0 loops=1)
+               ->  Bitmap Heap Scan on ab_a1_b3 ab_2 (never executed)
                      Recheck Cond: (a = 1)
-                     ->  Bitmap Index Scan on ab_a1_b3_a_idx (actual rows=0 loops=1)
+                     ->  Bitmap Index Scan on ab_a1_b3_a_idx (never executed)
                            Index Cond: (a = 1)
          ->  Materialize (actual rows=0 loops=1)
                ->  Bitmap Heap Scan on ab_a1_b1 ab_a1_1 (actual rows=0 loops=1)
@@ -2496,9 +2496,9 @@ update ab_a1 set b = 3 from ab where ab.a = 1 and ab.a = ab_a1.a;
                      Heap Blocks: exact=1
                      ->  Bitmap Index Scan on ab_a1_b2_a_idx (actual rows=1 loops=1)
                            Index Cond: (a = 1)
-               ->  Bitmap Heap Scan on ab_a1_b3 ab_2 (actual rows=0 loops=1)
+               ->  Bitmap Heap Scan on ab_a1_b3 ab_2 (never executed)
                      Recheck Cond: (a = 1)
-                     ->  Bitmap Index Scan on ab_a1_b3_a_idx (actual rows=1 loops=1)
+                     ->  Bitmap Index Scan on ab_a1_b3_a_idx (never executed)
                            Index Cond: (a = 1)
          ->  Materialize (actual rows=0 loops=1)
                ->  Bitmap Heap Scan on ab_a1_b3 ab_a1_3 (actual rows=0 loops=1)
-- 
2.17.1