From 372629e30dfb40f357b46f3644e090de945e0cb0 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@ispras.ru>
Date: Fri, 10 Mar 2017 17:26:26 +0300
Subject: [PATCH 3/8] Base for reversed executor.

Framework for implementing the reversed executor. Substitutes the ExecutePlan
call with RunNode, which invokes ExecLeaf on the leaf nodes in the proper order.

See the README and the beginning of execProcnode.c for details.
---
 src/backend/executor/README         |  73 ++++++++++
 src/backend/executor/execMain.c     | 285 ++++++++++++++++++-------------------
 src/backend/executor/execProcnode.c | 153 +++++++++++++++++----
 src/include/executor/executor.h     |   4 +
 src/include/nodes/execnodes.h       |  11 ++
 5 files changed, 354 insertions(+), 172 deletions(-)

diff --git a/src/backend/executor/README b/src/backend/executor/README
index f1d1e4c76c..08f139d0c5 100644
--- a/src/backend/executor/README
+++ b/src/backend/executor/README
@@ -3,6 +3,79 @@ src/backend/executor/README
 The Postgres Executor
 =====================
 
+This is an attempt to implement a proof of concept of an executor with a
+push-based architecture like the one in [1]; we call it the 'reversed'
+executor. For now we do not support both executors side by side, because
+that would involve a lot of code copy-pasting (or time to avoid it), while
+our current goal is just a working proof of concept to estimate the benefits.
+
+Since this is a huge change, we need to outline the general strategy: what we
+will start with, and how we will deal with the old code, remembering that we
+will reuse a great deal of it.
+
+Key points:
+* ExecProcNode is now a stub. All per-node code (ExecSomeNode, etc.) is
+  unreachable. However, we leave it in place to avoid a 19k-line removal
+  commit and to produce more useful diffs later; a lot of it will be reused.
+* The base for implementing the push model, common to all nodes, is in
+  execMain.c and execProcnode.c. We substitute ExecProcNode with the functions
+  ExecLeaf, ExecPushTuple and ExecPushNull, whose interface is described above
+  their definitions and at the beginning of execProcnode.c; this is the only
+  change to the node interface (see the sketch below). We also change
+  execMain.c, namely ExecutorRun, to run the nodes in proper bottom-up order.
+* Then we are ready to implement the nodes one by one.
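+
+To make the interface concrete, here is a rough sketch of how a typical leaf
+node is expected to drive execution. This is an illustration only, not code
+from this patch; ExecFetchSomeTuple stands in for a node's internal way of
+obtaining the next tuple:
+
+	void
+	ExecSomeScan(SomeScanState *node)
+	{
+		TupleTableSlot *slot;
+
+		for (;;)
+		{
+			/* fetch the next tuple from the node's own source */
+			slot = ExecFetchSomeTuple(node);
+			if (TupIsNull(slot))
+			{
+				/* no more tuples: tell the parent we are done */
+				ExecPushNull(slot, (PlanState *) node);
+				return;
+			}
+			/* the parent returns false once it needs no more tuples */
+			if (!ExecPushTuple(slot, (PlanState *) node))
+				return;		/* note: no ExecPushNull after 'false' */
+		}
+	}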
+
+At first,
+* parallel execution will not be supported;
+* subplans will not be supported;
+* ExecReScan will not be supported for now;
+* only the CMD_SELECT operation will be supported;
+* only the forward scan direction will be supported;
+* set-returning functions will not be supported either.
+
+In general, we try to treat the old code as follows:
+* As said above, if the old code is dead and not yet rewritten, leave it.
+  Compile with -Wno-unused-function if the warnings are annoying.
+* If it is not dead, but not yet updated for the reversed executor, remove
+  it. An example is the contents of ExecInitNode.
+* Sometimes we need to make minimal changes to an existing function, but
+  those changes would make it incompatible with existing code that is not
+  yet reworked. In that case, to avoid deleting a lot of code, we simply
+  copy-paste the function until a more generic solution is provided. An
+  example is heapgettup_pagemode and its 'reversed' analogue added for seqscan.
+
+
+[1] Efficiently Compiling Efficient Query Plans for Modern Hardware,
+    http://www.vldb.org/pvldb/vol4/p539-neumann.pdf
+
+The original README text follows below.
+
 The executor processes a tree of "plan nodes".  The plan tree is essentially
 a demand-pull pipeline of tuple processing operations.  Each node, when
 called, will produce the next tuple in its output sequence, or NULL if no
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index a465d74eab..ec8aba660d 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -63,6 +63,7 @@
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
 #include "utils/tqual.h"
+#include "executor/executor.h"
 
 
 /* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */
@@ -79,14 +80,7 @@ static void InitPlan(QueryDesc *queryDesc, int eflags);
 static void CheckValidRowMarkRel(Relation rel, RowMarkType markType);
 static void ExecPostprocessPlan(EState *estate);
 static void ExecEndPlan(PlanState *planstate, EState *estate);
-static void ExecutePlan(EState *estate, PlanState *planstate,
-			bool use_parallel_mode,
-			CmdType operation,
-			bool sendTuples,
-			uint64 numberTuples,
-			ScanDirection direction,
-			DestReceiver *dest,
-			bool execute_once);
+static void RunNode(PlanState *planstate);
 static bool ExecCheckRTEPerms(RangeTblEntry *rte);
 static bool ExecCheckRTEPermsModified(Oid relOid, Oid userid,
 						  Bitmapset *modifiedCols,
@@ -329,6 +323,11 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 	 * extract information from the query descriptor and the query feature.
 	 */
 	operation = queryDesc->operation;
+	if (operation != CMD_SELECT)
+	{
+		elog(ERROR, "only CMD_SELECT operations are implemented");
+		return;					/* unreachable; elog(ERROR) does not return */
+	}
 	dest = queryDesc->dest;
 
 	/*
@@ -343,25 +342,24 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 	if (sendTuples)
 		(*dest->rStartup) (dest, operation, queryDesc->tupDesc);
 
+	/* set up state needed for sending tuples to the dest */
+	estate->es_current_tuple_count = 0;
+	estate->es_sendTuples = sendTuples;
+	estate->es_numberTuplesRequested = count;
+	estate->es_operation = operation;
+	estate->es_dest = dest;
+
+	/*
+	 * Set the direction.
+	 */
+	estate->es_direction = direction;
+
 	/*
 	 * run plan
 	 */
 	if (!ScanDirectionIsNoMovement(direction))
-	{
-		if (execute_once && queryDesc->already_executed)
-			elog(ERROR, "can't re-execute query flagged for single execution");
-		queryDesc->already_executed = true;
-
-		ExecutePlan(estate,
-					queryDesc->planstate,
-					queryDesc->plannedstmt->parallelModeNeeded,
-					operation,
-					sendTuples,
-					count,
-					direction,
-					dest,
-					execute_once);
-	}
+		/* Run each leaf in the right order */
+		RunNode(queryDesc->planstate);
 
 	/*
 	 * shutdown tuple receiver, if we started it
@@ -1562,131 +1560,6 @@ ExecEndPlan(PlanState *planstate, EState *estate)
 	}
 }
 
-/* ----------------------------------------------------------------
- *		ExecutePlan
- *
- *		Processes the query plan until we have retrieved 'numberTuples' tuples,
- *		moving in the specified direction.
- *
- *		Runs to completion if numberTuples is 0
- *
- * Note: the ctid attribute is a 'junk' attribute that is removed before the
- * user can see it
- * ----------------------------------------------------------------
- */
-static void
-ExecutePlan(EState *estate,
-			PlanState *planstate,
-			bool use_parallel_mode,
-			CmdType operation,
-			bool sendTuples,
-			uint64 numberTuples,
-			ScanDirection direction,
-			DestReceiver *dest,
-			bool execute_once)
-{
-	TupleTableSlot *slot;
-	uint64		current_tuple_count;
-
-	/*
-	 * initialize local variables
-	 */
-	current_tuple_count = 0;
-
-	/*
-	 * Set the direction.
-	 */
-	estate->es_direction = direction;
-
-	/*
-	 * If the plan might potentially be executed multiple times, we must force
-	 * it to run without parallelism, because we might exit early.  Also
-	 * disable parallelism when writing into a relation, because no database
-	 * changes are allowed in parallel mode.
-	 */
-	if (!execute_once || dest->mydest == DestIntoRel)
-		use_parallel_mode = false;
-
-	if (use_parallel_mode)
-		EnterParallelMode();
-
-	/*
-	 * Loop until we've processed the proper number of tuples from the plan.
-	 */
-	for (;;)
-	{
-		/* Reset the per-output-tuple exprcontext */
-		ResetPerTupleExprContext(estate);
-
-		/*
-		 * Execute the plan and obtain a tuple
-		 */
-		slot = ExecProcNode(planstate);
-
-		/*
-		 * if the tuple is null, then we assume there is nothing more to
-		 * process so we just end the loop...
-		 */
-		if (TupIsNull(slot))
-		{
-			/* Allow nodes to release or shut down resources. */
-			(void) ExecShutdownNode(planstate);
-			break;
-		}
-
-		/*
-		 * If we have a junk filter, then project a new tuple with the junk
-		 * removed.
-		 *
-		 * Store this new "clean" tuple in the junkfilter's resultSlot.
-		 * (Formerly, we stored it back over the "dirty" tuple, which is WRONG
-		 * because that tuple slot has the wrong descriptor.)
-		 */
-		if (estate->es_junkFilter != NULL)
-			slot = ExecFilterJunk(estate->es_junkFilter, slot);
-
-		/*
-		 * If we are supposed to send the tuple somewhere, do so. (In
-		 * practice, this is probably always the case at this point.)
-		 */
-		if (sendTuples)
-		{
-			/*
-			 * If we are not able to send the tuple, we assume the destination
-			 * has closed and no more tuples can be sent. If that's the case,
-			 * end the loop.
-			 */
-			if (!((*dest->receiveSlot) (slot, dest)))
-				break;
-		}
-
-		/*
-		 * Count tuples processed, if this is a SELECT.  (For other operation
-		 * types, the ModifyTable plan node must count the appropriate
-		 * events.)
-		 */
-		if (operation == CMD_SELECT)
-			(estate->es_processed)++;
-
-		/*
-		 * check our tuple count.. if we've processed the proper number then
-		 * quit, else loop again and process more tuples.  Zero numberTuples
-		 * means no limit.
-		 */
-		current_tuple_count++;
-		if (numberTuples && numberTuples == current_tuple_count)
-		{
-			/* Allow nodes to release or shut down resources. */
-			(void) ExecShutdownNode(planstate);
-			break;
-		}
-	}
-
-	if (use_parallel_mode)
-		ExitParallelMode();
-}
-
-
 /*
  * ExecRelCheck --- check that tuple meets constraints for result relation
  *
@@ -3325,3 +3198,119 @@ ExecBuildSlotPartitionKeyDescription(Relation rel,
 
 	return buf.data;
 }
+
+/*
+ * This function pushes a ready tuple to its destination. It should be
+ * called by the top-level PlanState.
+ * For now, the state needed for this lives in estate, specifically
+ * es_current_tuple_count, es_sendTuples, es_numberTuplesRequested (the old
+ * numberTuples), es_operation and es_dest.
+ *
+ * slot is the tuple to push
+ * planstate is the top-level node
+ * returns true if we are ready to accept more tuples, false otherwise
+ */
+bool
+SendReadyTuple(TupleTableSlot *slot, PlanState *planstate)
+{
+	EState *estate;
+	bool sendTuples;
+	CmdType operation;
+	DestReceiver *dest;
+
+	estate = planstate->state;
+	sendTuples = estate->es_sendTuples;
+	operation = estate->es_operation;
+	dest = estate->es_dest;
+
+	if (TupIsNull(slot))
+	{
+		/* Allow nodes to release or shut down resources. */
+		(void) ExecShutdownNode(planstate);
+		return false;
+	}
+
+	/*
+	 * If we have a junk filter, then project a new tuple with the junk
+	 * removed.
+	 *
+	 * Store this new "clean" tuple in the junkfilter's resultSlot.
+	 * (Formerly, we stored it back over the "dirty" tuple, which is WRONG
+	 * because that tuple slot has the wrong descriptor.)
+	 */
+	if (estate->es_junkFilter != NULL)
+		slot = ExecFilterJunk(estate->es_junkFilter, slot);
+
+	/*
+	 * If we are supposed to send the tuple somewhere, do so. (In
+	 * practice, this is probably always the case at this point.)
+	 */
+	if (sendTuples)
+	{
+		/*
+		 * If we are not able to send the tuple, we assume the destination
+		 * has closed and no more tuples can be sent.
+		 */
+		if (!((*dest->receiveSlot) (slot, dest)))
+			return false;
+	}
+
+	/*
+	 * Count tuples processed, if this is a SELECT.  (For other operation
+	 * types, the ModifyTable plan node must count the appropriate
+	 * events.)
+	 */
+	if (operation == CMD_SELECT)
+		(estate->es_processed)++;
+
+	/*
+	 * Check our tuple count: if we've processed the proper number then
+	 * quit, else process more tuples.  Zero es_numberTuplesRequested
+	 * means no limit.
+	 */
+	estate->es_current_tuple_count++;
+	if (estate->es_numberTuplesRequested &&
+		estate->es_numberTuplesRequested == estate->es_current_tuple_count)
+		return false;
+
+	ResetPerTupleExprContext(estate);
+	return true;
+}
+
+/*
+ * When pushing, we have to run ExecLeaf on each leaf of the tree in the
+ * correct order: inner sides first, then outer. This function does that.
+ */
+void
+RunNode(PlanState *planstate)
+{
+	Assert(planstate != NULL);
+
+	if (innerPlanState(planstate) != NULL)
+	{
+		RunNode(innerPlanState(planstate));
+		/* We assume that if an inner node exists, the outer one exists too */
+		RunNode(outerPlanState(planstate));
+		return;
+	}
+	if (outerPlanState(planstate) != NULL)
+	{
+		RunNode(outerPlanState(planstate));
+		return;
+	}
+
+	/* node has no children, so it is a leaf */
+	ExecLeaf(planstate);
+}
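+
+/*
+ * For illustration: given the plan
+ *
+ *		NestLoop
+ *			inner: SeqScan on DEPT
+ *			outer: SeqScan on EMP
+ *
+ * RunNode recurses into the inner subtree first, so ExecLeaf is called on
+ * the DEPT scan before the EMP scan; the NestLoop node itself is never a
+ * leaf and is driven entirely by pushes from its children.
+ */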
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index b013a17023..5955f84d86 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -2,10 +2,10 @@
  *
  * execProcnode.c
  *	 contains dispatch functions which call the appropriate "initialize",
- *	 "get a tuple", and "cleanup" routines for the given node type.
- *	 If the node has children, then it will presumably call ExecInitNode,
- *	 ExecProcNode, or ExecEndNode on its subnodes and do the appropriate
- *	 processing.
+ *	 "push a tuple", and "cleanup" routines for the given node type.
+ *	 If the node has children, then it will presumably call ExecInitNode
+ *	 and ExecEndNode on its subnodes and ExecPushTuple to push processed tuple
+ *	 to its parent.
  *
  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -19,16 +19,18 @@
 /*
  *	 INTERFACE ROUTINES
  *		ExecInitNode	-		initialize a plan node and its subplans
- *		ExecProcNode	-		get a tuple by executing the plan node
+ *		ExecLeaf		-		start execution of the leaf
+ *		ExecPushTuple	-		push tuple to the parent node
+ *		ExecPushNull	-		let parent know that we are done
  *		ExecEndNode		-		shut down a plan node and its subplans
  *
  *	 NOTES
- *		This used to be three files.  It is now all combined into
- *		one file so that it is easier to keep ExecInitNode, ExecProcNode,
- *		and ExecEndNode in sync when new nodes are added.
+ *		This used to be three files. It is now all combined into
+ *		one file so that it is easier to keep ExecInitNode, ExecLeaf,
+ *		ExecPushTuple, ExecPushNull and ExecEndNode in sync when new nodes
+ *		are added.
  *
- *	 EXAMPLE
- *		Suppose we want the age of the manager of the shoe department and
+ *	 EXAMPLE: Suppose we want the age of the manager of the shoe department and
  *		the number of employees in that department.  So we have the query:
  *
  *				select DEPT.no_emps, EMP.age
@@ -56,24 +58,47 @@
  *		of ExecInitNode() is a plan state tree built with the same structure
  *		as the underlying plan tree.
  *
- *	  * Then when ExecutorRun() is called, it calls ExecutePlan() which calls
- *		ExecProcNode() repeatedly on the top node of the plan state tree.
- *		Each time this happens, ExecProcNode() will end up calling
- *		ExecNestLoop(), which calls ExecProcNode() on its subplans.
- *		Each of these subplans is a sequential scan so ExecSeqScan() is
- *		called.  The slots returned by ExecSeqScan() may contain
- *		tuples which contain the attributes ExecNestLoop() uses to
- *		form the tuples it returns.
+ *	  * Then when ExecutorRun() is called, it calls ExecLeaf on each leaf of
+ *		the plan state tree, inner leaves first, outer second. So, in this
+ *		case it calls it on the DEPT SeqScan, and then on the EMP SeqScan.
+ *		ExecLeaf chooses the corresponding implementation -- here it is
+ *		ExecSeqScan, a sequential scan. ExecSeqScan retrieves tuples and for
+ *		each of them calls ExecPushTuple to pass the tuple to nodeSeqScan's
+ *		parent, the Nest Loop in this case. ExecPushTuple resolves the call,
+ *		i.e. it finds something like ExecPushTupleToNestLoopFromOuter and
+ *		calls it. Then the process repeats, so ExecPushTuple is called
+ *		recursively. We have two corner cases:
  *
- *	  * Eventually ExecSeqScan() stops returning tuples and the nest
- *		loop join ends.  Lastly, ExecutorEnd() calls ExecEndNode() which
+ *		1) When a node has nothing more to push, e.g. nodeSeqScan has
+ *		   scanned all the tuples. Then it calls ExecPushNull once to let its
+ *		   parent know that it has finished its work.
+ *		2) When a node has no parent (the top-level node). In this case
+ *		   ExecPushTuple calls SendReadyTuple, which sends the tuple to its
+ *		   final destination.
+ *
+ *		So, in our example the DEPT ExecSeqScan will eventually call
+ *		ExecPushNull, so the Nest Loop node learns that its inner side is
+ *		done. Then the EMP SeqScan starts pushing; inside the EMP SeqScan's
+ *		ExecPushTuple, the Nest Loop matches the tuples and pushes them to the
+ *		final destination. Eventually the EMP SeqScan calls ExecPushNull,
+ *		inside which the Nest Loop calls ExecPushNull too, and the join ends.
+ *
+ *		ExecPushTuple returns a bool which tells whether the parent still
+ *		accepts tuples. This allows execution to stop in the middle; e.g. if
+ *		we have a Limit node above a SeqScan node, the latter needs to scan
+ *		only LIMIT tuples. We don't push anything after receiving false from
+ *		ExecPushTuple; in particular, even if we have pushed all the tuples
+ *		and the last ExecPushTuple call returned false, we don't call
+ *		ExecPushNull.
+ *
+ *		Lastly, ExecutorEnd() calls ExecEndNode() which
  *		calls ExecEndNestLoop() which in turn calls ExecEndNode() on
  *		its subplans which result in ExecEndSeqScan().
  *
- *		This should show how the executor works by having
- *		ExecInitNode(), ExecProcNode() and ExecEndNode() dispatch
- *		their work to the appropriate node support routines which may
- *		in turn call these routines themselves on their subplans.
+ *		This should show how the executor works by having ExecInitNode(),
+ *		ExecLeaf, ExecPushTuple, ExecPushNull and ExecEndNode() dispatch their
+ *		work to the appropriate node support routines which may in turn call
+ *		these routines themselves on their subplans.
  */
 #include "postgres.h"
 
@@ -173,6 +198,86 @@ ExecProcNode(PlanState *node)
 	return NULL;
 }
 
+/*
+ * Tell the 'node' leaf to start execution
+ */
+void
+ExecLeaf(PlanState *node)
+{
+	CHECK_FOR_INTERRUPTS();
+
+	switch (nodeTag(node))
+	{
+		default:
+			elog(ERROR, "bottom node type not supported: %d",
+				 (int) nodeTag(node));
+	}
+}
+
+/*
+ * Instead of ExecProcNode, here we have the function ExecPushTuple, which
+ * pushes one tuple.
+ * 'slot' is the tuple to push; it must not be null. When a node has finished
+ * its work, it must call ExecPushNull instead.
+ * 'pusher' is the sender of the tuple; its parent is the receiver. We take
+ * the pusher as a param instead of its parent directly because we need it to
+ * distinguish inner and outer pushes.
+ *
+ * Returns true if the node is still accepting tuples, false if not.
+ *
+ * If a tuple is pushed into a node that has returned 'false' before, the
+ * behaviour is undefined, i.e. this is not allowed; we try to catch such
+ * situations with asserts.
+ */
+bool
+ExecPushTuple(TupleTableSlot *slot, PlanState *pusher)
+{
+	PlanState *receiver = pusher->parent;
+
+	Assert(!TupIsNull(slot));
+
+	CHECK_FOR_INTERRUPTS();
+
+	/*
+	 * Receiver is NULL: pusher is the top-level node; send tuple to the dest.
+	 */
+	if (receiver == NULL)
+	{
+		return SendReadyTuple(slot, pusher);
+	}
+
+	elog(ERROR, "node type not supported: %d", (int) nodeTag(receiver));
+}
+
+/*
+ * Signal the parent that we are done. As in ExecPushTuple, the sender is the
+ * param here because we need to distinguish inner and outer pushes.
+ *
+ * 'slot' must be a null tuple. It exists so that the correct tupleDesc can
+ * be passed along.
+ */
+void
+ExecPushNull(TupleTableSlot *slot, PlanState *pusher)
+{
+	PlanState *receiver = pusher->parent;
+
+	Assert(TupIsNull(slot));
+
+	CHECK_FOR_INTERRUPTS();
+
+	/*
+	 * If the receiver is NULL, then the pusher is the top-level node: end of
+	 * execution.
+	 */
+	if (receiver == NULL)
+	{
+		SendReadyTuple(NULL, pusher);
+		return;
+	}
+
+	elog(ERROR, "node type not supported: %d", (int) nodeTag(receiver));
+}
+
 /* ----------------------------------------------------------------
  * Unsupported too; we don't need it in push model
  * ----------------------------------------------------------------
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 716fa9dc27..386fcb4c8b 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -180,6 +180,7 @@ extern void ExecutorRun(QueryDesc *queryDesc,
 			ScanDirection direction, uint64 count, bool execute_once);
 extern void standard_ExecutorRun(QueryDesc *queryDesc,
 					 ScanDirection direction, uint64 count, bool execute_once);
+extern bool SendReadyTuple(TupleTableSlot *slot, PlanState *planstate);
 extern void ExecutorFinish(QueryDesc *queryDesc);
 extern void standard_ExecutorFinish(QueryDesc *queryDesc);
 extern void ExecutorEnd(QueryDesc *queryDesc);
@@ -241,6 +242,9 @@ extern TupleTableSlot *ExecProcNode(PlanState *node);
 extern Node *MultiExecProcNode(PlanState *node);
 extern void ExecEndNode(PlanState *node);
 extern bool ExecShutdownNode(PlanState *node);
+extern void ExecLeaf(PlanState *node);
+extern bool ExecPushTuple(TupleTableSlot *slot, PlanState *pusher);
+extern void ExecPushNull(TupleTableSlot *slot, PlanState *pusher);
 
 /*
  * prototypes from functions in execQual.c
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 738f098b00..da7fd9c7ac 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -28,6 +28,7 @@
 #include "utils/tuplesort.h"
 #include "nodes/tidbitmap.h"
 #include "storage/condition_variable.h"
+#include "tcop/dest.h" /* for DestReceiver type in EState */
 
 
 /* ----------------
@@ -416,6 +417,16 @@ typedef struct EState
 	List	   *es_auxmodifytables;		/* List of secondary ModifyTableStates */
 
 	/*
+	 * State needed to push tuples to the dest in the push model; these were
+	 * formerly local variables of the old ExecutePlan.
+	 */
+	uint64		es_current_tuple_count;
+	bool		es_sendTuples;
+	uint64		es_numberTuplesRequested;
+	CmdType		es_operation;
+	DestReceiver *es_dest;
+
+	/*
 	 * this ExprContext is for per-output-tuple operations, such as constraint
 	 * checks and index-value computations.  It will be reset for each output
 	 * tuple.  Note that it will be created only if needed.
-- 
2.11.0

