parallel distinct union and aggregate support patch

Started by bucoo@sohu.comabout 5 years ago27 messages
#1bucoo@sohu.com
bucoo@sohu.com
2 attachment(s)

Hi hackers,
I write a path for soupport parallel distinct, union and aggregate using batch sort.
steps:
1. generate hash value for group clauses values, and using mod hash value save to batch
2. end of outer plan, wait all other workers finish write to batch
3. echo worker get a unique batch number, call tuplesort_performsort() function finish this batch sort
4. return row for this batch
5. if not end of all batchs, got step 3

BatchSort paln make sure same tuple(group clause) return in same range, so Unique(or GroupAggregate) plan can work.

path 2 for parallel aggregate, this is a simple use
but regress failed for partitionwise aggregation difference plan
from GatherMerge->Sort->Append->...
to Sort->Gahter->Append->...
I have no idea how to modify it.

Same idea I writed a batch shared tuple store for HashAgg in our PG version, I will send patch for PG14 when I finish it.

The following is a description in Chinese
英语不好,所以这里写点中文,希望上面写的不对的地方请大家帮忙纠正一下。
BatchSort的工作原理
1. 先按group clause计算出hash值,并按取模的值放入不同的批次
2. 当下层plan返回所有的行后,等待所有其它的工作进程结束
3. 每一个工作进程索取一个唯一的一个批次, 并调用tuplesort_performsort()函数完成最终排序
4. 返回本批次的所有行
5. 如果所有的批次没有读完,则返回第3步
BatchSort plan能保证相同的数据(按分给表达式)在同一个周期内返回,所以几个去重和分组相关的plan可以正常工作。
第2个补丁是支持并行分组的,只做一次分组,而不是并行进程做每一次分组后,主进程再进行二次分组。
这个补丁导致了regress测试中的partitionwise aggregation失败,原来的执行计划有所变更。
补丁只写了一个简单的使用BatchSort plan的方法,可能还需要添加其它用法。

用同样的思想我写了一个使用shared tuple store的HashAgg在我们的AntDB版本中(最新版本暂未开源),适配完PG14版本后我会发出来。
打个广告:欢迎关注我们亚信公司基于PG的分布式数据库产品AntDB,开源地址 https://github.com/ADBSQL/AntDB

bucoo@sohu.com

Attachments:

0001-Parallel-distinct-and-union-support.patchapplication/octet-stream; name=0001-Parallel-distinct-and-union-support.patchDownload
From d91215c25023ec839c4bc8d3116cc3e32f48a3c3 Mon Sep 17 00:00:00 2001
From: bucoo <bucoo@sohu.com>
Date: Mon, 19 Oct 2020 17:54:11 +0800
Subject: [PATCH 1/2] Parallel distinct and union support

---
 src/backend/commands/explain.c                |  15 +
 src/backend/executor/Makefile                 |   1 +
 src/backend/executor/execAmi.c                |   5 +
 src/backend/executor/execParallel.c           |  17 ++
 src/backend/executor/execProcnode.c           |  10 +
 src/backend/executor/nodeBatchSort.c          | 377 ++++++++++++++++++++++++++
 src/backend/nodes/copyfuncs.c                 |  19 ++
 src/backend/nodes/outfuncs.c                  |  15 +
 src/backend/nodes/readfuncs.c                 |  16 ++
 src/backend/optimizer/path/costsize.c         |  82 ++++++
 src/backend/optimizer/plan/createplan.c       |  47 +++-
 src/backend/optimizer/plan/planner.c          |  43 +++
 src/backend/optimizer/plan/setrefs.c          |   1 +
 src/backend/optimizer/plan/subselect.c        |   1 +
 src/backend/optimizer/prep/prepunion.c        |  52 +++-
 src/backend/optimizer/util/pathnode.c         |  38 +++
 src/backend/optimizer/util/tlist.c            |  16 ++
 src/backend/postmaster/pgstat.c               |   3 +
 src/backend/utils/misc/guc.c                  |   9 +
 src/include/executor/nodeBatchSort.h          |  19 ++
 src/include/nodes/execnodes.h                 |  15 +
 src/include/nodes/nodes.h                     |   3 +
 src/include/nodes/pathnodes.h                 |  10 +
 src/include/nodes/plannodes.h                 |  12 +
 src/include/optimizer/cost.h                  |   6 +
 src/include/optimizer/pathnode.h              |   9 +
 src/include/optimizer/tlist.h                 |   1 +
 src/include/pgstat.h                          |   3 +-
 src/test/regress/expected/select_distinct.out |  42 +++
 src/test/regress/expected/sysviews.out        |   3 +-
 src/test/regress/expected/union.out           |  55 ++++
 src/test/regress/sql/select_distinct.sql      |  14 +
 src/test/regress/sql/union.sql                |  15 +
 33 files changed, 960 insertions(+), 14 deletions(-)
 create mode 100644 src/backend/executor/nodeBatchSort.c
 create mode 100644 src/include/executor/nodeBatchSort.h

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index c98c9b5547..16a1fb035d 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1270,6 +1270,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_Sort:
 			pname = sname = "Sort";
 			break;
+		case T_BatchSort:
+			pname = sname = "BatchSort";
+			break;
 		case T_IncrementalSort:
 			pname = sname = "Incremental Sort";
 			break;
@@ -1933,6 +1936,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			show_sort_keys(castNode(SortState, planstate), ancestors, es);
 			show_sort_info(castNode(SortState, planstate), es);
 			break;
+		case T_BatchSort:
+			{
+				BatchSort *bsort = (BatchSort*)plan;
+				show_sort_group_keys(planstate, "Sort Key",
+									 bsort->sort.numCols, 0, bsort->sort.sortColIdx,
+									 bsort->sort.sortOperators, bsort->sort.collations,
+									 bsort->sort.nullsFirst,
+									 ancestors, es);
+				if (es->verbose)
+					ExplainPropertyInteger("batches", NULL, bsort->numBatches, es);
+			}
+			break;
 		case T_IncrementalSort:
 			show_incremental_sort_keys(castNode(IncrementalSortState, planstate),
 									   ancestors, es);
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile
index f990c6473a..a4855a8881 100644
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@@ -33,6 +33,7 @@ OBJS = \
 	instrument.o \
 	nodeAgg.o \
 	nodeAppend.o \
+	nodeBatchSort.o \
 	nodeBitmapAnd.o \
 	nodeBitmapHeapscan.o \
 	nodeBitmapIndexscan.o \
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index e2154ba86a..6eb1fe2424 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -17,6 +17,7 @@
 #include "executor/execdebug.h"
 #include "executor/nodeAgg.h"
 #include "executor/nodeAppend.h"
+#include "executor/nodeBatchSort.h"
 #include "executor/nodeBitmapAnd.h"
 #include "executor/nodeBitmapHeapscan.h"
 #include "executor/nodeBitmapIndexscan.h"
@@ -253,6 +254,10 @@ ExecReScan(PlanState *node)
 			ExecReScanSort((SortState *) node);
 			break;
 
+		case T_BatchSortState:
+			ExecReScanBatchSort((BatchSortState *)node);
+			break;
+
 		case T_IncrementalSortState:
 			ExecReScanIncrementalSort((IncrementalSortState *) node);
 			break;
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 382e78fb7f..a5abd48507 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -27,6 +27,7 @@
 #include "executor/executor.h"
 #include "executor/nodeAgg.h"
 #include "executor/nodeAppend.h"
+#include "executor/nodeBatchSort.h"
 #include "executor/nodeBitmapHeapscan.h"
 #include "executor/nodeCustom.h"
 #include "executor/nodeForeignscan.h"
@@ -285,6 +286,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortEstimate((SortState *) planstate, e->pcxt);
 			break;
+		case T_BatchSortState:
+			if (planstate->plan->parallel_aware)
+				ExecBatchSortEstimate((BatchSortState*)planstate, e->pcxt);
+			break;
 		case T_IncrementalSortState:
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt);
@@ -505,6 +510,10 @@ ExecParallelInitializeDSM(PlanState *planstate,
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortInitializeDSM((SortState *) planstate, d->pcxt);
 			break;
+		case T_BatchSortState:
+			if (planstate->plan->parallel_aware)
+				ExecBatchSortInitializeDSM((BatchSortState*)planstate, d->pcxt);
+			break;
 		case T_IncrementalSortState:
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt);
@@ -991,6 +1000,10 @@ ExecParallelReInitializeDSM(PlanState *planstate,
 		case T_IncrementalSortState:
 			/* these nodes have DSM state, but no reinitialization is required */
 			break;
+		case T_BatchSortState:
+			if (planstate->plan->parallel_aware)
+				ExecBatchSortReInitializeDSM((BatchSortState*)planstate, pcxt);
+			break;
 
 		default:
 			break;
@@ -1341,6 +1354,10 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortInitializeWorker((SortState *) planstate, pwcxt);
 			break;
+		case T_BatchSortState:
+			if (planstate->plan->parallel_aware)
+				ExecBatchSortInitializeWorker((BatchSortState*)planstate, pwcxt);
+			break;
 		case T_IncrementalSortState:
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate,
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 01b7b926bf..c13835ddda 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -75,6 +75,7 @@
 #include "executor/executor.h"
 #include "executor/nodeAgg.h"
 #include "executor/nodeAppend.h"
+#include "executor/nodeBatchSort.h"
 #include "executor/nodeBitmapAnd.h"
 #include "executor/nodeBitmapHeapscan.h"
 #include "executor/nodeBitmapIndexscan.h"
@@ -314,6 +315,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 												estate, eflags);
 			break;
 
+		case T_BatchSort:
+			result = (PlanState *) ExecInitBatchSort((BatchSort *) node,
+													 estate, eflags);
+			break;
+
 		case T_IncrementalSort:
 			result = (PlanState *) ExecInitIncrementalSort((IncrementalSort *) node,
 														   estate, eflags);
@@ -699,6 +705,10 @@ ExecEndNode(PlanState *node)
 			ExecEndSort((SortState *) node);
 			break;
 
+		case T_BatchSortState:
+			ExecEndBatchSort((BatchSortState *) node);
+			break;
+
 		case T_IncrementalSortState:
 			ExecEndIncrementalSort((IncrementalSortState *) node);
 			break;
diff --git a/src/backend/executor/nodeBatchSort.c b/src/backend/executor/nodeBatchSort.c
new file mode 100644
index 0000000000..b090fdaf43
--- /dev/null
+++ b/src/backend/executor/nodeBatchSort.c
@@ -0,0 +1,377 @@
+#include "postgres.h"
+
+#include "common/hashfn.h"
+#include "executor/executor.h"
+#include "executor/nodeBatchSort.h"
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/atomics.h"
+#include "storage/barrier.h"
+#include "utils/builtins.h"
+#include "utils/tuplesort.h"
+#include "utils/typcache.h"
+
+typedef struct ParallelBatchSort
+{
+	Barrier				barrier;
+	pg_atomic_uint32	attached;
+	pg_atomic_uint32	cur_batch;
+	Size				tuplesort_size;	/* MAXIMUM_ALIGNOF*n */
+}ParallelBatchSort;
+
+#define PARALLEL_BATCH_SORT_SIZE		MAXALIGN(sizeof(ParallelBatchSort))
+#define PARALLEL_BATCH_SORT_SHARED(p,n)	\
+	(Sharedsort*)(((char*)p) + PARALLEL_BATCH_SORT_SIZE + (p)->tuplesort_size * n)
+
+#define BUILD_BATCH_DONE	1
+
+static bool ExecNextParallelBatchSort(BatchSortState *state)
+{
+	ParallelBatchSort  *parallel = state->parallel;
+	BatchSort		   *plan = castNode(BatchSort, state->ps.plan);
+	SortCoordinateData	coord;
+	uint32				cur_batch;
+	Assert(parallel != NULL);
+
+	if (state->curBatch >= 0 &&
+		state->curBatch < plan->numBatches &&
+		state->batches[state->curBatch] != NULL)
+	{
+		tuplesort_end(state->batches[state->curBatch]);
+		state->batches[state->curBatch] = NULL;
+	}
+
+	cur_batch = pg_atomic_fetch_add_u32(&parallel->cur_batch, 1);
+	if (cur_batch >= plan->numBatches)
+	{
+		state->curBatch = plan->numBatches;
+		return false;
+	}
+
+	Assert(state->batches[cur_batch] == NULL);
+	state->curBatch = cur_batch;
+	coord.isWorker = false;
+	coord.nParticipants = pg_atomic_read_u32(&parallel->attached);
+	coord.sharedsort = PARALLEL_BATCH_SORT_SHARED(parallel, cur_batch);
+	state->batches[cur_batch] = tuplesort_begin_heap(ExecGetResultType(outerPlanState(state)),
+													 plan->sort.numCols,
+													 plan->sort.sortColIdx,
+													 plan->sort.sortOperators,
+													 plan->sort.collations,
+													 plan->sort.nullsFirst,
+													 work_mem,
+													 &coord,
+													 false);
+	tuplesort_performsort(state->batches[cur_batch]);
+	return true;
+}
+
+static TupleTableSlot *ExecEmptyBatchSort(PlanState *pstate)
+{
+	return ExecClearTuple(pstate->ps_ResultTupleSlot);
+}
+
+static TupleTableSlot *ExecBatchSort(PlanState *pstate)
+{
+	TupleTableSlot *slot = pstate->ps_ResultTupleSlot;
+	BatchSortState *state = castNode(BatchSortState, pstate);
+	Assert(state->sort_Done);
+
+re_get_:
+	if (tuplesort_gettupleslot(state->batches[state->curBatch],
+							   true,
+							   false,
+							   slot,
+							   NULL) == false &&
+		state->curBatch < castNode(BatchSort, pstate->plan)->numBatches-1)
+	{
+		if (state->parallel)
+		{
+			if (ExecNextParallelBatchSort(state) == false)
+			{
+				ExecSetExecProcNode(pstate, ExecEmptyBatchSort);
+				return ExecClearTuple(slot);
+			}
+		}else
+		{
+			state->curBatch++;
+		}
+		goto re_get_;
+	}
+
+	return slot;
+}
+
+static TupleTableSlot *ExecBatchSortPrepare(PlanState *pstate)
+{
+	BatchSort		   *node = castNode(BatchSort, pstate->plan);
+	BatchSortState	   *state = castNode(BatchSortState, pstate);
+	PlanState		   *outerNode = outerPlanState(pstate);
+	TupleTableSlot	   *slot;
+	ListCell		   *lc;
+	ParallelBatchSort  *parallel = state->parallel;
+	SortCoordinateData	coord;
+	FunctionCallInfo	fcinfo;
+	uint32				hash;
+	int					i;
+	AttrNumber			maxAttr;
+	Assert(state->sort_Done == false);
+	Assert(list_length(state->groupFuns) == node->numGroupCols);
+
+	if (parallel)
+	{
+		if (BarrierAttach(&parallel->barrier) >= BUILD_BATCH_DONE)
+			goto build_already_done_;
+		pg_atomic_add_fetch_u32(&parallel->attached, 1);
+	}
+
+	for (i=node->numBatches;i>0;)
+	{
+		--i;
+		if (parallel)
+		{
+			coord.isWorker = true;
+			coord.nParticipants = -1;
+			coord.sharedsort = PARALLEL_BATCH_SORT_SHARED(parallel, i);
+		}
+		state->batches[i] = tuplesort_begin_heap(ExecGetResultType(outerNode),
+												 node->sort.numCols,
+												 node->sort.sortColIdx,
+												 node->sort.sortOperators,
+												 node->sort.collations,
+												 node->sort.nullsFirst,
+												 work_mem / node->numBatches,
+												 parallel ? &coord : NULL,
+												 false);
+	}
+
+	maxAttr = 0;
+	for (i=node->numGroupCols;i>0;)
+	{
+		if (maxAttr < node->grpColIdx[--i])
+			maxAttr = node->grpColIdx[i];
+	}
+	for (i=node->sort.numCols;i>0;)
+	{
+		if (maxAttr < node->sort.sortColIdx[--i])
+			maxAttr = node->sort.sortColIdx[i];
+	}
+	Assert(maxAttr > 0);
+
+	for (;;)
+	{
+		CHECK_FOR_INTERRUPTS();
+		slot = ExecProcNode(outerNode);
+		if (TupIsNull(slot))
+			break;
+		slot_getsomeattrs(slot, maxAttr);
+
+		hash = 0;
+		i = 0;
+		foreach(lc, state->groupFuns)
+		{
+			AttrNumber att = node->grpColIdx[i++]-1;
+			if (slot->tts_isnull[att] == false)
+			{
+				fcinfo = lfirst(lc);
+				fcinfo->args[0].value = slot->tts_values[att];
+				hash = hash_combine(hash, DatumGetUInt32(FunctionCallInvoke(fcinfo)));
+				Assert(fcinfo->isnull == false);
+			}
+		}
+
+		tuplesort_puttupleslot(state->batches[hash%node->numBatches], slot);
+	}
+
+	for (i=node->numBatches;i>0;)
+		tuplesort_performsort(state->batches[--i]);
+build_already_done_:
+	if (parallel)
+	{
+		for (i=node->numBatches;i>0;)
+		{
+			--i;
+			if (state->batches[i])
+			{
+				tuplesort_end(state->batches[i]);
+				state->batches[i] = NULL;
+			}
+		}
+		if (BarrierPhase(&parallel->barrier) < BUILD_BATCH_DONE)
+			BarrierArriveAndWait(&parallel->barrier, WAIT_EVENT_BATCH_SORT_BUILD);
+		BarrierDetach(&parallel->barrier);
+
+		if (ExecNextParallelBatchSort(state))
+			ExecSetExecProcNode(pstate, ExecBatchSort);
+		else
+			ExecSetExecProcNode(pstate, ExecEmptyBatchSort);
+	}else
+	{
+		state->curBatch = 0;
+		ExecSetExecProcNode(pstate, ExecBatchSort);
+	}
+	state->sort_Done = true;
+
+	return (*pstate->ExecProcNodeReal)(pstate);
+}
+
+BatchSortState* ExecInitBatchSort(BatchSort *node, EState *estate, int eflags)
+{
+	BatchSortState *state;
+	TypeCacheEntry *typentry;
+	TupleDesc		desc;
+	int				i;
+
+	if (eflags & (EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))
+	{
+		/* for now, we only using in group aggregate */
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("not support execute flag(s) %d for group sort", eflags)));
+	}
+
+	state = makeNode(BatchSortState);
+	state->ps.plan = (Plan*) node;
+	state->ps.state = estate;
+	state->ps.ExecProcNode = ExecBatchSortPrepare;
+
+	state->sort_Done = false;
+	state->batches = palloc0(node->numBatches * sizeof(Tuplesortstate*));
+
+	outerPlanState(state) = ExecInitNode(outerPlan(node), estate, eflags);
+
+	/*
+	 * Initialize return slot and type. No need to initialize projection info
+	 * because this node doesn't do projections.
+	 */
+	ExecInitResultTupleSlotTL(&state->ps, &TTSOpsMinimalTuple);
+	state->ps.ps_ProjInfo = NULL;
+
+	Assert(node->numGroupCols > 0);
+	desc = ExecGetResultType(outerPlanState(state));
+	for (i=0;i<node->numGroupCols;++i)
+	{
+		FmgrInfo			   *flinfo;
+		FunctionCallInfo		fcinfo;
+		Form_pg_attribute		attr = TupleDescAttr(desc, node->grpColIdx[i]-1);
+		typentry = lookup_type_cache(attr->atttypid, TYPECACHE_HASH_PROC);
+		if (!OidIsValid(typentry->hash_proc))
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_FUNCTION),
+					 errmsg("could not identify an extended hash function for type %s",
+							format_type_be(attr->atttypid))));
+		flinfo = palloc0(sizeof(*flinfo));
+		fcinfo = palloc0(SizeForFunctionCallInfo(1));
+		fmgr_info(typentry->hash_proc, flinfo);
+		InitFunctionCallInfoData(*fcinfo, flinfo, 1, attr->attcollation, NULL, NULL);
+		fcinfo->args[0].isnull = false;
+		state->groupFuns = lappend(state->groupFuns, fcinfo);
+	}
+
+	return state;
+}
+
+static void CleanBatchSort(BatchSortState *node)
+{
+	int i;
+
+	ExecClearTuple(node->ps.ps_ResultTupleSlot);
+	if (node->sort_Done)
+	{
+		for (i=castNode(BatchSort, node->ps.plan)->numBatches;i>0;)
+		{
+			if (node->batches[--i] != NULL)
+			{
+				tuplesort_end(node->batches[i]);
+				node->batches[i] = NULL;
+			}
+		}
+		node->sort_Done = false;
+	}
+}
+
+void ExecEndBatchSort(BatchSortState *node)
+{
+	ExecClearTuple(node->ps.ps_ResultTupleSlot);
+	CleanBatchSort(node);
+	ExecEndNode(outerPlanState(node));
+}
+
+void ExecReScanBatchSort(BatchSortState *node)
+{
+	CleanBatchSort(node);
+	if (outerPlanState(node)->chgParam != NULL)
+		ExecReScan(outerPlanState(node));
+	ExecSetExecProcNode(&node->ps, ExecBatchSortPrepare);
+}
+
+void ExecShutdownBatchSort(BatchSortState *node)
+{
+	CleanBatchSort(node);
+}
+
+void ExecBatchSortEstimate(BatchSortState *node, ParallelContext *pcxt)
+{
+	Size size = mul_size(MAXALIGN(tuplesort_estimate_shared(pcxt->nworkers+1)),
+						 castNode(BatchSort, node->ps.plan)->numBatches);
+	size = add_size(size, PARALLEL_BATCH_SORT_SIZE);
+
+	shm_toc_estimate_chunk(&pcxt->estimator, size);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+static void InitializeBatchSortParallel(ParallelBatchSort *parallel,
+										int num_batches,
+										int num_workers,
+										dsm_segment *seg)
+{
+	int i;
+	BarrierInit(&parallel->barrier, 0);
+	pg_atomic_init_u32(&parallel->attached, 0);
+	pg_atomic_init_u32(&parallel->cur_batch, 0);
+	for (i=0;i<num_batches;++i)
+	{
+		tuplesort_initialize_shared(PARALLEL_BATCH_SORT_SHARED(parallel, i),
+									num_workers,
+									seg);
+	}
+}
+
+void ExecBatchSortInitializeDSM(BatchSortState *node, ParallelContext *pcxt)
+{
+	ParallelBatchSort *parallel;
+	BatchSort *plan = castNode(BatchSort, node->ps.plan);
+	Size tuplesort_size = MAXALIGN(tuplesort_estimate_shared(pcxt->nworkers+1));
+	Size size = mul_size(tuplesort_size, plan->numBatches);
+	size = add_size(PARALLEL_BATCH_SORT_SIZE, size);
+
+	node->parallel = parallel = shm_toc_allocate(pcxt->toc, size);
+	parallel->tuplesort_size = tuplesort_size;
+	InitializeBatchSortParallel(parallel, plan->numBatches, pcxt->nworkers+1, pcxt->seg);
+	shm_toc_insert(pcxt->toc, plan->sort.plan.plan_node_id, parallel);
+}
+
+void ExecBatchSortReInitializeDSM(BatchSortState *node, ParallelContext *pcxt)
+{
+	InitializeBatchSortParallel(node->parallel,
+								castNode(BatchSort, node->ps.plan)->numBatches,
+								pcxt->nworkers+1,
+								pcxt->seg);
+	ExecSetExecProcNode(&node->ps, ExecBatchSortPrepare);
+}
+
+void ExecBatchSortInitializeWorker(BatchSortState *node, ParallelWorkerContext *pwcxt)
+{
+	uint32 i;
+	BatchSort *plan = castNode(BatchSort, node->ps.plan);
+	ParallelBatchSort *parallel = shm_toc_lookup(pwcxt->toc,
+												 plan->sort.plan.plan_node_id,
+												 false);
+	node->parallel = parallel;
+	for (i=0;i<plan->numBatches;++i)
+	{
+		tuplesort_attach_shared(PARALLEL_BATCH_SORT_SHARED(parallel, i),
+								pwcxt->seg);
+	}
+}
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 0409a40b82..958964f1fa 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -961,6 +961,22 @@ _copySort(const Sort *from)
 	return newnode;
 }
 
+/*
+ * _copyBatchSort
+ */
+static BatchSort *
+_copyBatchSort(const BatchSort *from)
+{
+	BatchSort	   *newnode = makeNode(BatchSort);
+
+	CopySortFields(&from->sort, &newnode->sort);
+
+	COPY_SCALAR_FIELD(numGroupCols);
+	COPY_SCALAR_FIELD(numBatches);
+	COPY_POINTER_FIELD(grpColIdx, from->numGroupCols * sizeof(AttrNumber));
+
+	return newnode;
+}
 
 /*
  * _copyIncrementalSort
@@ -4939,6 +4955,9 @@ copyObjectImpl(const void *from)
 		case T_Sort:
 			retval = _copySort(from);
 			break;
+		case T_BatchSort:
+			retval = _copyBatchSort(from);
+			break;
 		case T_IncrementalSort:
 			retval = _copyIncrementalSort(from);
 			break;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index f0386480ab..a8dd7ef23f 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -856,6 +856,18 @@ _outSort(StringInfo str, const Sort *node)
 	_outSortInfo(str, node);
 }
 
+static void
+_outBatchSort(StringInfo str, const BatchSort *node)
+{
+	WRITE_NODE_TYPE("BATCHSORT");
+
+	_outSortInfo(str, &node->sort);
+
+	WRITE_INT_FIELD(numGroupCols);
+	WRITE_INT_FIELD(numBatches);
+	WRITE_ATTRNUMBER_ARRAY(grpColIdx, node->numGroupCols);
+}
+
 static void
 _outIncrementalSort(StringInfo str, const IncrementalSort *node)
 {
@@ -3813,6 +3825,9 @@ outNode(StringInfo str, const void *obj)
 			case T_Sort:
 				_outSort(str, obj);
 				break;
+			case T_BatchSort:
+				_outBatchSort(str, obj);
+				break;
 			case T_IncrementalSort:
 				_outIncrementalSort(str, obj);
 				break;
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 42050ab719..2c6eb4362c 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -2181,6 +2181,20 @@ _readSort(void)
 	READ_DONE();
 }
 
+static BatchSort *
+_readBatchSort(void)
+{
+	READ_LOCALS(BatchSort);
+
+	ReadCommonSort(&local_node->sort);
+
+	READ_INT_FIELD(numGroupCols);
+	READ_INT_FIELD(numBatches);
+	READ_ATTRNUMBER_ARRAY(grpColIdx, local_node->numGroupCols);
+
+	READ_DONE();
+}
+
 /*
  * _readIncrementalSort
  */
@@ -2834,6 +2848,8 @@ parseNodeString(void)
 		return_value = _readMaterial();
 	else if (MATCH("SORT", 4))
 		return_value = _readSort();
+	else if (MATCH("BATCHSORT", 9))
+		return_value = _readBatchSort();
 	else if (MATCH("INCREMENTALSORT", 15))
 		return_value = _readIncrementalSort();
 	else if (MATCH("GROUP", 5))
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index cd3716d494..32d0dc8ce5 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -140,6 +140,7 @@ bool		enable_partitionwise_aggregate = false;
 bool		enable_parallel_append = true;
 bool		enable_parallel_hash = true;
 bool		enable_partition_pruning = true;
+bool		enable_batch_sort = true;
 
 typedef struct
 {
@@ -1948,6 +1949,87 @@ cost_sort(Path *path, PlannerInfo *root,
 	path->total_cost = startup_cost + run_cost;
 }
 
+void cost_batchsort(Path *path, PlannerInfo *root,
+					List *batchkeys, Cost input_cost,
+					double tuples, int width,
+					Cost comparison_cost, int sort_mem,
+					uint32 numGroupCols, uint32 numBatches)
+{
+	Cost		startup_cost = input_cost;
+	Cost		run_cost = 0;
+	double		input_bytes = relation_byte_size(tuples, width);
+	double		batch_bytes = input_bytes / numBatches;
+	double		batch_tuples = tuples / numBatches;
+	long		sort_mem_bytes = sort_mem * 1024L;
+
+	if (sort_mem_bytes < (64*1024))
+		sort_mem_bytes = (64*1024);
+
+	if (!enable_batch_sort)
+		startup_cost += disable_cost;
+
+	/* hash cost */
+	startup_cost += cpu_operator_cost * numGroupCols * tuples;
+
+	path->rows = tuples;
+
+	/*
+	 * We want to be sure the cost of a sort is never estimated as zero, even
+	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
+	 */
+	if (tuples < 2.0)
+		tuples = 2.0;
+
+	if (batch_bytes > sort_mem_bytes)
+	{
+		/*
+		 * We'll have to use a disk-based sort of all the tuples
+		 */
+		double		npages = ceil(batch_bytes / BLCKSZ);
+		double		nruns = batch_bytes / sort_mem_bytes;
+		double		mergeorder = tuplesort_merge_order(sort_mem_bytes);
+		double		log_runs;
+		double		npageaccesses;
+
+		/*
+		 * CPU costs
+		 *
+		 * Assume about N log2 N comparisons
+		 */
+		startup_cost += comparison_cost * batch_tuples * LOG2(batch_tuples) * numBatches;
+
+		/* Disk costs */
+
+		/* Compute logM(r) as log(r) / log(M) */
+		if (nruns > mergeorder)
+			log_runs = ceil(log(nruns) / log(mergeorder));
+		else
+			log_runs = 1.0;
+		npageaccesses = 2.0 * npages * log_runs;
+		/* Assume 3/4ths of accesses are sequential, 1/4th are not */
+		startup_cost += npageaccesses * numBatches *
+			(seq_page_cost * 0.75 + random_page_cost * 0.25);
+
+	}else
+	{
+		/* We'll use plain quicksort on all the input tuples */
+		startup_cost += comparison_cost * tuples * LOG2(tuples);
+	}
+
+	/*
+	 * Also charge a small amount (arbitrarily set equal to operator cost) per
+	 * extracted tuple.  We don't charge cpu_tuple_cost because a Sort node
+	 * doesn't do qual-checking or projection, so it has less overhead than
+	 * most plan nodes.  Note it's correct to use tuples not output_tuples
+	 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
+	 * counting the LIMIT otherwise.
+	 */
+	run_cost += cpu_operator_cost * tuples;
+
+	path->startup_cost = startup_cost;
+	path->total_cost = startup_cost + run_cost;
+}
+
 /*
  * append_nonpartial_cost
  *	  Estimate the cost of the non-partial paths in a Parallel Append.
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 3d7a4e373f..85969388c2 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -98,6 +98,7 @@ static Plan *create_projection_plan(PlannerInfo *root,
 									int flags);
 static Plan *inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe);
 static Sort *create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags);
+static BatchSort *create_batchsort_plan(PlannerInfo *root, BatchSortPath *best_path, int flags);
 static IncrementalSort *create_incrementalsort_plan(PlannerInfo *root,
 													IncrementalSortPath *best_path, int flags);
 static Group *create_group_plan(PlannerInfo *root, GroupPath *best_path);
@@ -468,6 +469,11 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags)
 											 (SortPath *) best_path,
 											 flags);
 			break;
+		case T_BatchSort:
+			plan = (Plan *) create_batchsort_plan(root,
+												  (BatchSortPath*) best_path,
+												  flags);
+			break;
 		case T_IncrementalSort:
 			plan = (Plan *) create_incrementalsort_plan(root,
 														(IncrementalSortPath *) best_path,
@@ -2009,6 +2015,39 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags)
 	return plan;
 }
 
+static BatchSort *create_batchsort_plan(PlannerInfo *root, BatchSortPath *best_path, int flags)
+{
+	BatchSort	   *plan;
+	Plan		   *subplan;
+
+	subplan = create_plan_recurse(root, best_path->subpath,
+								  flags | CP_SMALL_TLIST);
+
+	plan = makeNode(BatchSort);
+	subplan = prepare_sort_from_pathkeys(subplan,
+										 best_path->batchkeys,
+										 IS_OTHER_REL(best_path->subpath->parent) ?
+										     best_path->path.parent->relids : NULL,
+										 NULL,
+										 false,
+										 &plan->sort.numCols,
+										 &plan->sort.sortColIdx,
+										 &plan->sort.sortOperators,
+										 &plan->sort.collations,
+										 &plan->sort.nullsFirst);
+	plan->sort.plan.targetlist = subplan->targetlist;
+	plan->sort.plan.qual = NIL;
+	outerPlan(plan) = subplan;
+	innerPlan(plan) = NULL;
+	plan->numBatches = best_path->numBatches;
+	plan->numGroupCols = list_length(best_path->batchgroup);
+	plan->grpColIdx = extract_grouping_cols(best_path->batchgroup,
+											subplan->targetlist);
+
+	copy_generic_path_info(&plan->sort.plan, &best_path->path);
+	return plan;
+}
+
 /*
  * create_incrementalsort_plan
  *
@@ -2085,6 +2124,12 @@ create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flag
 {
 	Unique	   *plan;
 	Plan	   *subplan;
+	List	   *pathkeys;
+
+	if (IsA(best_path->subpath, BatchSortPath))
+		pathkeys = ((BatchSortPath*)best_path->subpath)->batchkeys;
+	else
+		pathkeys = best_path->path.pathkeys;
 
 	/*
 	 * Unique doesn't project, so tlist requirements pass through; moreover we
@@ -2094,7 +2139,7 @@ create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flag
 								  flags | CP_LABEL_TLIST);
 
 	plan = make_unique_from_pathkeys(subplan,
-									 best_path->path.pathkeys,
+									 pathkeys,
 									 best_path->numkeys);
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index f331f82a6c..ac7c2a52be 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4806,6 +4806,8 @@ create_distinct_paths(PlannerInfo *root,
 											  cheapest_input_path->rows,
 											  NULL);
 	}
+	distinct_rel->rows = numDistinctRows;
+	distinct_rel->reltarget = root->upper_targets[UPPERREL_DISTINCT];
 
 	/*
 	 * Consider sort-based implementations of DISTINCT, if possible.
@@ -4825,6 +4827,7 @@ create_distinct_paths(PlannerInfo *root,
 		 * the other.)
 		 */
 		List	   *needed_pathkeys;
+		List	   *hashable_clause;
 
 		if (parse->hasDistinctOn &&
 			list_length(root->distinct_pathkeys) <
@@ -4871,6 +4874,44 @@ create_distinct_paths(PlannerInfo *root,
 										  path,
 										  list_length(root->distinct_pathkeys),
 										  numDistinctRows));
+
+		/* add parallel unique */
+		if (distinct_rel->consider_parallel &&
+			input_rel->partial_pathlist != NIL &&
+			numDistinctRows >= BATCH_SORT_MIN_BATCHES &&
+			(hashable_clause = grouping_get_hashable(parse->distinctClause)) != NIL)
+		{
+			double	numPartialDistinctRows;
+			uint32	num_batchs = (uint32)numDistinctRows;
+			if (num_batchs > BATCH_SORT_MAX_BATCHES)
+			{
+				/*
+				 * too many batchs(files) it is not a good idea,
+				 * limit to BATCH_SORT_MAX_BATCHES
+				 */
+				num_batchs = BATCH_SORT_MAX_BATCHES;
+			}
+
+			foreach (lc, input_rel->partial_pathlist)
+			{
+				Path *path = (Path*)create_batchsort_path(root,
+														  distinct_rel,
+														  lfirst(lc),
+														  needed_pathkeys,
+														  hashable_clause,
+														  num_batchs,
+														  true);
+				numPartialDistinctRows = numDistinctRows / path->parallel_workers;
+				if (numPartialDistinctRows < 1.0)
+					numPartialDistinctRows = 1.0;
+				path = (Path*)create_upper_unique_path(root,
+													   distinct_rel,
+													   path,
+													   list_length(root->distinct_pathkeys),
+													   numPartialDistinctRows);
+				add_partial_path(distinct_rel, path);
+			}
+		}
 	}
 
 	/*
@@ -4908,6 +4949,8 @@ create_distinct_paths(PlannerInfo *root,
 								 numDistinctRows));
 	}
 
+	generate_useful_gather_paths(root, distinct_rel, false);
+
 	/* Give a helpful error if we failed to find any implementation */
 	if (distinct_rel->pathlist == NIL)
 		ereport(ERROR,
diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c
index dd8e2e966d..c39eed6b44 100644
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -737,6 +737,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
 
 		case T_Material:
 		case T_Sort:
+		case T_BatchSort:
 		case T_IncrementalSort:
 		case T_Unique:
 		case T_SetOp:
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index fcce81926b..cfe2557988 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -2752,6 +2752,7 @@ finalize_plan(PlannerInfo *root, Plan *plan,
 		case T_Hash:
 		case T_Material:
 		case T_Sort:
+		case T_BatchSort:
 		case T_IncrementalSort:
 		case T_Unique:
 		case T_SetOp:
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 745f443e5c..fa1053f077 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -67,7 +67,7 @@ static List *plan_union_children(PlannerInfo *root,
 								 List *refnames_tlist,
 								 List **tlist_list);
 static Path *make_union_unique(SetOperationStmt *op, Path *path, List *tlist,
-							   PlannerInfo *root);
+							   PlannerInfo *root, List *groupList, List *sortKeys);
 static void postprocess_setop_rel(PlannerInfo *root, RelOptInfo *rel);
 static bool choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 								Path *input_path,
@@ -354,6 +354,7 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
 			rel = generate_nonunion_paths(op, root,
 										  refnames_tlist,
 										  pTargetList);
+		generate_useful_gather_paths(root, rel, false);
 		if (pNumGroups)
 			*pNumGroups = rel->rows;
 
@@ -552,6 +553,8 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	List	   *tlist_list;
 	List	   *tlist;
 	Path	   *path;
+	List	   *groupList = NIL;
+	List	   *sortKeys = NIL;
 
 	/*
 	 * If plain UNION, tell children to fetch all tuples.
@@ -587,6 +590,14 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 
 	*pTargetList = tlist;
 
+	if (!op->all)
+	{
+		/* Identify the grouping semantics */
+		groupList = generate_setop_grouplist(op, tlist);
+		if (grouping_is_sortable(groupList))
+			sortKeys = make_pathkeys_for_sortclauses(root, groupList, tlist);
+	}
+
 	/* Build path lists and relid set. */
 	foreach(lc, rellist)
 	{
@@ -627,7 +638,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	 * node(s) to remove duplicates.
 	 */
 	if (!op->all)
-		path = make_union_unique(op, path, tlist, root);
+		path = make_union_unique(op, path, tlist, root, groupList, sortKeys);
 
 	add_path(result_rel, path);
 
@@ -646,6 +657,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	{
 		Path	   *ppath;
 		ListCell   *lc;
+		List	   *hashable_list;
 		int			parallel_workers = 0;
 
 		/* Find the highest number of workers requested for any subpath. */
@@ -678,11 +690,35 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 							   NIL, NULL,
 							   parallel_workers, enable_parallel_append,
 							   NIL, -1);
+		if (!op->all &&
+			sortKeys != NIL &&
+			ppath->rows >= BATCH_SORT_MIN_BATCHES &&
+			(hashable_list = grouping_get_hashable(groupList)) != NIL)
+		{
+			Path   *partial_path;
+			uint32	numBatches = ppath->rows;
+			if (numBatches > BATCH_SORT_MAX_BATCHES)
+				numBatches = BATCH_SORT_MAX_BATCHES;
+			Assert(list_length(sortKeys) >= list_length(hashable_list));
+			partial_path = (Path*)create_batchsort_path(root,
+														result_rel,
+														ppath,
+														sortKeys,
+														hashable_list,
+														numBatches,
+														true);
+			partial_path = (Path*) create_upper_unique_path(root,
+															result_rel,
+															partial_path,
+															list_length(sortKeys),
+															partial_path->rows);
+			add_partial_path(result_rel, partial_path);
+		}
 		ppath = (Path *)
 			create_gather_path(root, result_rel, ppath,
 							   result_rel->reltarget, NULL, NULL);
 		if (!op->all)
-			ppath = make_union_unique(op, ppath, tlist, root);
+			ppath = make_union_unique(op, ppath, tlist, root, groupList, sortKeys);
 		add_path(result_rel, ppath);
 	}
 
@@ -933,15 +969,11 @@ plan_union_children(PlannerInfo *root,
  */
 static Path *
 make_union_unique(SetOperationStmt *op, Path *path, List *tlist,
-				  PlannerInfo *root)
+				  PlannerInfo *root, List *groupList, List *sortKeys)
 {
 	RelOptInfo *result_rel = fetch_upper_rel(root, UPPERREL_SETOP, NULL);
-	List	   *groupList;
 	double		dNumGroups;
 
-	/* Identify the grouping semantics */
-	groupList = generate_setop_grouplist(op, tlist);
-
 	/*
 	 * XXX for the moment, take the number of distinct groups as equal to the
 	 * total input size, ie, the worst case.  This is too conservative, but
@@ -976,9 +1008,7 @@ make_union_unique(SetOperationStmt *op, Path *path, List *tlist,
 				create_sort_path(root,
 								 result_rel,
 								 path,
-								 make_pathkeys_for_sortclauses(root,
-															   groupList,
-															   tlist),
+								 sortKeys,
 								 -1.0);
 		path = (Path *) create_upper_unique_path(root,
 												 result_rel,
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index c1fc866cbf..460d2e5faa 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -2880,6 +2880,44 @@ create_sort_path(PlannerInfo *root,
 	return pathnode;
 }
 
+BatchSortPath *
+create_batchsort_path(PlannerInfo *root,
+					  RelOptInfo *rel,
+					  Path *subpath,
+					  List *pathkeys,
+					  List *groupClause,
+					  uint32 numBatches,
+					  bool parallel_sort)
+{
+	BatchSortPath   *pathnode = makeNode(BatchSortPath);
+
+	pathnode->path.pathtype = T_BatchSort;
+	pathnode->path.parent = rel;
+	/* Sort doesn't project, so use source path's pathtarget */
+	pathnode->path.pathtarget = subpath->pathtarget;
+	/* For now, assume we are above any joins, so no parameterization */
+	pathnode->path.param_info = NULL;
+	pathnode->path.parallel_aware = parallel_sort;
+	pathnode->path.parallel_safe = rel->consider_parallel &&
+		subpath->parallel_safe;
+	pathnode->path.parallel_workers = subpath->parallel_workers;
+	pathnode->batchkeys = pathkeys;
+	pathnode->batchgroup = groupClause;
+	pathnode->numBatches = numBatches;
+
+	pathnode->subpath = subpath;
+
+	cost_batchsort(&pathnode->path, root, pathkeys,
+				   subpath->total_cost, subpath->rows,
+				   subpath->pathtarget->width,
+				   0.0,				/* XXX comparison_cost shouldn't be 0? */
+				   work_mem/numBatches,
+				   list_length(groupClause),
+				   numBatches);
+
+	return pathnode;
+}
+
 /*
  * create_group_path
  *	  Creates a pathnode that represents performing grouping of presorted input
diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c
index 02a3c6b165..949568c672 100644
--- a/src/backend/optimizer/util/tlist.c
+++ b/src/backend/optimizer/util/tlist.c
@@ -593,6 +593,22 @@ grouping_is_hashable(List *groupClause)
 	return true;
 }
 
+List *
+grouping_get_hashable(List *groupClause)
+{
+	ListCell   *lc;
+	List	   *result = NIL;
+
+	foreach (lc, groupClause)
+	{
+		SortGroupClause *groupcl = lfirst_node(SortGroupClause, lc);
+
+		if (groupcl->hashable)
+			result = lappend(result, groupcl);
+	}
+
+	return result;
+}
 
 /*****************************************************************************
  *		PathTarget manipulation functions
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 822f0ebc62..cacb7d13e6 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4021,6 +4021,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
 		case WAIT_EVENT_XACT_GROUP_UPDATE:
 			event_name = "XactGroupUpdate";
 			break;
+		case WAIT_EVENT_BATCH_SORT_BUILD:
+			event_name = "Batch/Sort/Building";
+			break;
 			/* no default case, so that compiler will warn */
 	}
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 596bcb7b84..43a4e36d78 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -987,6 +987,15 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_batch_sort", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("enable batch sort method"),
+			NULL
+		},
+		&enable_batch_sort,
+		false,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_incremental_sort", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of incremental sort steps."),
diff --git a/src/include/executor/nodeBatchSort.h b/src/include/executor/nodeBatchSort.h
new file mode 100644
index 0000000000..66c68e0125
--- /dev/null
+++ b/src/include/executor/nodeBatchSort.h
@@ -0,0 +1,19 @@
+
+#ifndef NODE_BATCH_SORT_H
+#define NODE_BATCH_SORT_H
+
+#include "access/parallel.h"
+#include "nodes/execnodes.h"
+
+extern BatchSortState *ExecInitBatchSort(BatchSort *node, EState *estate, int eflags);
+extern void ExecEndBatchSort(BatchSortState *node);
+extern void ExecReScanBatchSort(BatchSortState *node);
+
+/* parallel scan support */
+extern void ExecBatchSortEstimate(BatchSortState *node, ParallelContext *pcxt);
+extern void ExecBatchSortInitializeDSM(BatchSortState *node, ParallelContext *pcxt);
+extern void ExecBatchSortReInitializeDSM(BatchSortState *node, ParallelContext *pcxt);
+extern void ExecBatchSortInitializeWorker(BatchSortState *node, ParallelWorkerContext *pwcxt);
+extern void ExecShutdownBatchSort(BatchSortState *node);
+
+#endif							/* NODE_BATCH_SORT_H */
\ No newline at end of file
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index ef448d67c7..14dde9fca3 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2012,6 +2012,21 @@ typedef struct SortState
 	SharedSortInfo *shared_info;	/* one entry per worker */
 } SortState;
 
+/* ----------------
+ *	 BatchSortState information
+ * ----------------
+ */
+typedef struct BatchSortState
+{
+	PlanState	ps;				/* its first field is NodeTag */
+	void	  **batches;		/* private state of tuplesort.c */
+	List	   *groupFuns;		/* hash function call info for each group-key */
+	struct ParallelBatchSort
+			   *parallel;		/* parallel info, private in nodeBatchSort.c */
+	int			curBatch;		/* current batch index */
+	bool		sort_Done;		/* sort completed yet? */
+}BatchSortState;
+
 /* ----------------
  *	 Instrumentation information for IncrementalSort
  * ----------------
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 7ddd8c011b..ace4c98939 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -74,6 +74,7 @@ typedef enum NodeTag
 	T_HashJoin,
 	T_Material,
 	T_Sort,
+	T_BatchSort,
 	T_IncrementalSort,
 	T_Group,
 	T_Agg,
@@ -131,6 +132,7 @@ typedef enum NodeTag
 	T_HashJoinState,
 	T_MaterialState,
 	T_SortState,
+	T_BatchSortState,
 	T_IncrementalSortState,
 	T_GroupState,
 	T_AggState,
@@ -246,6 +248,7 @@ typedef enum NodeTag
 	T_ProjectionPath,
 	T_ProjectSetPath,
 	T_SortPath,
+	T_BatchSortPath,
 	T_IncrementalSortPath,
 	T_GroupPath,
 	T_UpperUniquePath,
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index dbe86e7af6..273bdda452 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1649,6 +1649,16 @@ typedef struct SortPath
 	Path	   *subpath;		/* path representing input source */
 } SortPath;
 
+typedef struct BatchSortPath
+{
+	Path		path;
+	Path	   *subpath;		/* path representing input source */
+	List	   *batchkeys;		/* our result is not all ordered, only for each batch,
+								 * so we can not use Path::pathkeys */
+	List	   *batchgroup;		/* a list of SortGroupClause for hash */
+	uint32		numBatches;
+}BatchSortPath;
+
 /*
  * IncrementalSortPath
  */
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 83e01074ed..f7ad7881dc 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -774,6 +774,18 @@ typedef struct Sort
 	bool	   *nullsFirst;		/* NULLS FIRST/LAST directions */
 } Sort;
 
+/* ----------------
+ *		batch sort node
+ * ----------------
+ */
+typedef struct BatchSort
+{
+	Sort		sort;
+	int			numGroupCols;	/* number of group-key columns */
+	int			numBatches;		/* number of group */
+	AttrNumber *grpColIdx;		/* their indexes in the target list */
+}BatchSort;
+
 /* ----------------
  *		incremental sort node
  * ----------------
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 6141654e47..37e6a12a6f 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -53,6 +53,7 @@ extern PGDLLIMPORT bool enable_indexonlyscan;
 extern PGDLLIMPORT bool enable_bitmapscan;
 extern PGDLLIMPORT bool enable_tidscan;
 extern PGDLLIMPORT bool enable_sort;
+extern PGDLLIMPORT bool enable_batch_sort;
 extern PGDLLIMPORT bool enable_incremental_sort;
 extern PGDLLIMPORT bool enable_hashagg;
 extern PGDLLIMPORT bool enable_nestloop;
@@ -102,6 +103,11 @@ extern void cost_sort(Path *path, PlannerInfo *root,
 					  List *pathkeys, Cost input_cost, double tuples, int width,
 					  Cost comparison_cost, int sort_mem,
 					  double limit_tuples);
+extern void cost_batchsort(Path *path, PlannerInfo *root,
+						   List *batchkeys, Cost input_cost,
+						   double tuples, int width,
+						   Cost comparison_cost, int sort_mem,
+						   uint32 numGroupCols, uint32 numBatchs);
 extern void cost_incremental_sort(Path *path,
 								  PlannerInfo *root, List *pathkeys, int presorted_keys,
 								  Cost input_startup_cost, Cost input_total_cost,
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 715a24ad29..816fc37739 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -17,6 +17,8 @@
 #include "nodes/bitmapset.h"
 #include "nodes/pathnodes.h"
 
+#define BATCH_SORT_MIN_BATCHES		2
+#define BATCH_SORT_MAX_BATCHES		512
 
 /*
  * prototypes for pathnode.c
@@ -195,6 +197,13 @@ extern SortPath *create_sort_path(PlannerInfo *root,
 								  Path *subpath,
 								  List *pathkeys,
 								  double limit_tuples);
+extern BatchSortPath *create_batchsort_path(PlannerInfo *root,
+											RelOptInfo *rel,
+											Path *subpath,
+											List *pathkeys,
+											List *groupClause,
+											uint32 numBatches,
+											bool parallel_sort);
 extern GroupPath *create_group_path(PlannerInfo *root,
 									RelOptInfo *rel,
 									Path *subpath,
diff --git a/src/include/optimizer/tlist.h b/src/include/optimizer/tlist.h
index 1d4c7da545..9372cebeba 100644
--- a/src/include/optimizer/tlist.h
+++ b/src/include/optimizer/tlist.h
@@ -36,6 +36,7 @@ extern Oid *extract_grouping_collations(List *groupClause, List *tlist);
 extern AttrNumber *extract_grouping_cols(List *groupClause, List *tlist);
 extern bool grouping_is_sortable(List *groupClause);
 extern bool grouping_is_hashable(List *groupClause);
+extern List *grouping_get_hashable(List *groupClause);
 
 extern PathTarget *make_pathtarget_from_tlist(List *tlist);
 extern List *make_tlist_from_pathtarget(PathTarget *target);
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index a821ff4f15..f0b6dae97b 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -952,7 +952,8 @@ typedef enum
 	WAIT_EVENT_REPLICATION_SLOT_DROP,
 	WAIT_EVENT_SAFE_SNAPSHOT,
 	WAIT_EVENT_SYNC_REP,
-	WAIT_EVENT_XACT_GROUP_UPDATE
+	WAIT_EVENT_XACT_GROUP_UPDATE,
+	WAIT_EVENT_BATCH_SORT_BUILD
 } WaitEventIPC;
 
 /* ----------
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index 11c6f50fbf..c200e38d12 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -306,3 +306,45 @@ SELECT null IS NOT DISTINCT FROM null as "yes";
  t
 (1 row)
 
+-- parallel distinct
+BEGIN;
+SET enable_batch_sort = ON;
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+EXPLAIN (costs off)
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Unique
+               ->  Parallel BatchSort
+                     Sort Key: tenk1.unique2
+                     ->  Parallel Seq Scan on tenk1
+(7 rows)
+
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+explain (costs off)
+SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Gather
+   Workers Planned: 2
+   ->  Unique
+         ->  Parallel BatchSort
+               Sort Key: tenk1.unique2
+               ->  Unique
+                     ->  Parallel BatchSort
+                           Sort Key: tenk1.unique2
+                           ->  Parallel Seq Scan on tenk1
+(9 rows)
+
+ABORT;
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 81bdacf59d..8ed047e520 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -88,6 +88,7 @@ select count(*) = 1 as ok from pg_stat_wal;
 select name, setting from pg_settings where name like 'enable%';
               name              | setting 
 --------------------------------+---------
+ enable_batch_sort              | off
  enable_bitmapscan              | on
  enable_gathermerge             | on
  enable_hashagg                 | on
@@ -106,7 +107,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(18 rows)
+(19 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index 6e72e92d80..5a2be9aec9 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -1052,3 +1052,58 @@ where (x = 0) or (q1 >= q2 and q1 <= q2);
  4567890123456789 |  4567890123456789 | 1
 (6 rows)
 
+-- parallel union
+BEGIN;
+SET enable_batch_sort = ON;
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Unique
+               ->  Parallel BatchSort
+                     Sort Key: tenk1.unique2
+                     ->  Parallel Append
+                           ->  Parallel Seq Scan on tenk1
+                           ->  Parallel Seq Scan on tenk1 tenk1_1
+(9 rows)
+
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  Parallel Hash Join
+                     Hash Cond: (tenk1_1.unique2 = tenk1.unique2)
+                     ->  Unique
+                           ->  Parallel BatchSort
+                                 Sort Key: tenk1_1.unique2
+                                 ->  Parallel Append
+                                       ->  Parallel Seq Scan on tenk1 tenk1_1
+                                       ->  Parallel Seq Scan on tenk1 tenk1_2
+                     ->  Parallel Hash
+                           ->  Parallel Seq Scan on tenk1
+(14 rows)
+
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+ count 
+-------
+ 10000
+(1 row)
+
+ABORT;
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index 33102744eb..3ff7acf64d 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -135,3 +135,17 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no";
 SELECT 2 IS NOT DISTINCT FROM 2 as "yes";
 SELECT 2 IS NOT DISTINCT FROM null as "no";
 SELECT null IS NOT DISTINCT FROM null as "yes";
+
+-- parallel distinct
+BEGIN;
+SET enable_batch_sort = ON;
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+EXPLAIN (costs off)
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+explain (costs off)
+SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+ABORT;
\ No newline at end of file
diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql
index 5f4881d594..a1cb1bb7ac 100644
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -440,3 +440,18 @@ select * from
    union all
    select *, 1 as x from int8_tbl b) ss
 where (x = 0) or (q1 >= q2 and q1 <= q2);
+
+-- parallel union
+BEGIN;
+SET enable_batch_sort = ON;
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+ABORT;
\ No newline at end of file
-- 
2.16.3

0002-Parallel-aggregate-support-using-batch-sort.patchapplication/octet-stream; name=0002-Parallel-aggregate-support-using-batch-sort.patchDownload
From ef005279f8b008e0593fb290cc4a30fbc275ab76 Mon Sep 17 00:00:00 2001
From: bucoo <bucoo@sohu.com>
Date: Mon, 19 Oct 2020 21:31:16 +0800
Subject: [PATCH 2/2] Parallel aggregate support using batch sort

---
 src/backend/optimizer/plan/planner.c              | 53 ++++++++++++++++
 src/include/nodes/pathnodes.h                     |  1 +
 src/test/regress/expected/partition_aggregate.out | 73 +++++++++++++++++++++++
 src/test/regress/sql/partition_aggregate.sql      | 25 ++++++++
 4 files changed, 152 insertions(+)

diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index ac7c2a52be..27680dbeb3 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -3880,6 +3880,11 @@ create_grouping_paths(PlannerInfo *root,
 		extra.havingQual = parse->havingQual;
 		extra.targetList = parse->targetList;
 		extra.partial_costs_set = false;
+		if (parse->groupClause != NIL &&
+			(gd == NULL || gd->rollups == NIL))
+			extra.hashable_groups = grouping_get_hashable(parse->groupClause);
+		else
+			extra.hashable_groups = NIL;
 
 		/*
 		 * Determine whether partitionwise aggregation is in theory possible.
@@ -6705,6 +6710,54 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			}
 		}
 
+		/*
+		 * create simple agg using parallel
+		 */
+		if (grouped_rel->consider_parallel &&
+			extra->hashable_groups != NIL &&
+			input_rel->partial_pathlist != NIL &&
+			dNumGroups >= BATCH_SORT_MIN_BATCHES)
+		{
+			Path	   *path;
+			uint32		numBatches = (uint32)dNumGroups;
+			if (numBatches > BATCH_SORT_MAX_BATCHES)
+				numBatches = BATCH_SORT_MAX_BATCHES;
+			Assert(parse->groupingSets == NIL);
+			Assert(parse->groupClause != NIL);
+			foreach (lc, input_rel->partial_pathlist)
+			{
+				double numGroups = dNumGroups / numBatches;
+				if (numGroups < 1.0)
+					numGroups = 1.0;
+				path = (Path*)create_batchsort_path(root,
+													grouped_rel,
+													lfirst(lc),
+													root->group_pathkeys,
+													extra->hashable_groups,
+													numBatches,
+													true);
+				if (parse->hasAggs)
+					path = (Path*)create_agg_path(root,
+												  grouped_rel,
+												  path,
+												  grouped_rel->reltarget,
+												  AGG_SORTED,
+												  AGGSPLIT_SIMPLE,
+												  parse->groupClause,
+												  havingQual,
+												  agg_costs,
+												  numGroups);
+				else
+					path = (Path*)create_group_path(root,
+													grouped_rel,
+													path,
+													parse->groupClause,
+													havingQual,
+													numGroups);
+				add_partial_path(grouped_rel, path);
+			}
+		}
+
 		/*
 		 * Instead of operating directly on the input relation, we can
 		 * consider finalizing a partially aggregated path.
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 273bdda452..80ba8fccdc 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -2502,6 +2502,7 @@ typedef struct
 	bool		target_parallel_safe;
 	Node	   *havingQual;
 	List	   *targetList;
+	List	   *hashable_groups;
 	PartitionwiseAggregateType patype;
 } GroupPathExtraData;
 
diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out
index 45c698daf4..b187c1080b 100644
--- a/src/test/regress/expected/partition_aggregate.out
+++ b/src/test/regress/expected/partition_aggregate.out
@@ -1516,3 +1516,76 @@ SELECT x, sum(y), avg(y), count(*) FROM pagg_tab_para GROUP BY x HAVING avg(y) <
  21 | 6000 | 6.0000000000000000 |  1000
 (6 rows)
 
+-- simple agg in parallel
+BEGIN;
+SET enable_batch_sort = ON;
+SET min_parallel_table_scan_size = 0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+EXPLAIN (COSTS OFF)
+SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
+                  QUERY PLAN                  
+----------------------------------------------
+ Gather
+   Workers Planned: 2
+   ->  GroupAggregate
+         Group Key: unique2
+         ->  Parallel BatchSort
+               Sort Key: unique2
+               ->  Parallel Seq Scan on tenk1
+(7 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  GroupAggregate
+               Group Key: tenk1.unique2
+               ->  Parallel BatchSort
+                     Sort Key: tenk1.unique2
+                     ->  Parallel Seq Scan on tenk1
+(8 rows)
+
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  Nested Loop
+                     ->  GroupAggregate
+                           Group Key: tenk1_1.unique2
+                           ->  Parallel BatchSort
+                                 Sort Key: tenk1_1.unique2
+                                 ->  Parallel Seq Scan on tenk1 tenk1_1
+                     ->  Index Scan using tenk1_unique2 on tenk1
+                           Index Cond: (unique2 = (count(*)))
+(12 rows)
+
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+ count 
+-------
+ 10000
+(1 row)
+
+ABORT;
diff --git a/src/test/regress/sql/partition_aggregate.sql b/src/test/regress/sql/partition_aggregate.sql
index 117f65ecb4..3e50a48d37 100644
--- a/src/test/regress/sql/partition_aggregate.sql
+++ b/src/test/regress/sql/partition_aggregate.sql
@@ -330,3 +330,28 @@ RESET parallel_setup_cost;
 EXPLAIN (COSTS OFF)
 SELECT x, sum(y), avg(y), count(*) FROM pagg_tab_para GROUP BY x HAVING avg(y) < 7 ORDER BY 1, 2, 3;
 SELECT x, sum(y), avg(y), count(*) FROM pagg_tab_para GROUP BY x HAVING avg(y) < 7 ORDER BY 1, 2, 3;
+
+-- simple agg in parallel
+BEGIN;
+SET enable_batch_sort = ON;
+SET min_parallel_table_scan_size = 0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+EXPLAIN (COSTS OFF)
+SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+ABORT;
-- 
2.16.3

#2Thomas Munro
thomas.munro@gmail.com
In reply to: bucoo@sohu.com (#1)
Re: parallel distinct union and aggregate support patch

On Tue, Oct 20, 2020 at 3:49 AM bucoo@sohu.com <bucoo@sohu.com> wrote:

I write a path for soupport parallel distinct, union and aggregate using batch sort.
steps:
1. generate hash value for group clauses values, and using mod hash value save to batch
2. end of outer plan, wait all other workers finish write to batch
3. echo worker get a unique batch number, call tuplesort_performsort() function finish this batch sort
4. return row for this batch
5. if not end of all batchs, got step 3

BatchSort paln make sure same tuple(group clause) return in same range, so Unique(or GroupAggregate) plan can work.

Hi!

Interesting work! In the past a few people have speculated about a
Parallel Repartition operator that could partition tuples a bit like
this, so that each process gets a different set of partitions. Here
you combine that with a sort. By doing both things in one node, you
avoid a lot of overheads (writing into a tuplestore once in the
repartitioning node, and then once again in the sort node, with tuples
being copied one-by-one between the two nodes).

If I understood correctly, the tuples emitted by Parallel Batch Sort
in each process are ordered by (hash(key, ...) % npartitions, key,
...), but the path is claiming to be ordered by (key, ...), no?
That's enough for Unique and Aggregate to give the correct answer,
because they really only require equal keys to be consecutive (and in
the same process), but maybe some other plan could break?

#3Dilip Kumar
dilipbalaut@gmail.com
In reply to: bucoo@sohu.com (#1)
Re: parallel distinct union and aggregate support patch

On Mon, Oct 19, 2020 at 8:19 PM bucoo@sohu.com <bucoo@sohu.com> wrote:

Hi hackers,
I write a path for soupport parallel distinct, union and aggregate using batch sort.
steps:
1. generate hash value for group clauses values, and using mod hash value save to batch
2. end of outer plan, wait all other workers finish write to batch
3. echo worker get a unique batch number, call tuplesort_performsort() function finish this batch sort
4. return row for this batch
5. if not end of all batchs, got step 3

BatchSort paln make sure same tuple(group clause) return in same range, so Unique(or GroupAggregate) plan can work.

Interesting idea. So IIUC, whenever a worker is scanning the tuple it
will directly put it into the respective batch(shared tuple store),
based on the hash on grouping column and once all the workers are
doing preparing the batch then each worker will pick those baches one
by one, perform sort and finish the aggregation. I think there is a
scope of improvement that instead of directly putting the tuple to the
batch what if the worker does the partial aggregations and then it
places the partially aggregated rows in the shared tuple store based
on the hash value and then the worker can pick the batch by batch. By
doing this way, we can avoid doing large sorts. And then this
approach can also be used with the hash aggregate, I mean the
partially aggregated data by the hash aggregate can be put into the
respective batch.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#4bucoo@sohu.com
bucoo@sohu.com
In reply to: bucoo@sohu.com (#1)
Re: Re: parallel distinct union and aggregate support patch

If I understood correctly, the tuples emitted by Parallel Batch Sort
in each process are ordered by (hash(key, ...) % npartitions, key,
...), but the path is claiming to be ordered by (key, ...), no?
That's enough for Unique and Aggregate to give the correct answer,
because they really only require equal keys to be consecutive (and in
the same process), but maybe some other plan could break?

The path not claiming to be ordered by (key, ...), the path save PathKey(s) in BatchSortPath::batchkeys, not Path::pathkeys.
I don't understand "but maybe some other plan could break", mean some on path using this path? no, BathSortPath on for some special path(Unique, GroupAgg ...).

bucoo@sohu.com

From: Thomas Munro
Date: 2020-10-21 12:27
To: bucoo@sohu.com
CC: pgsql-hackers
Subject: Re: parallel distinct union and aggregate support patch
On Tue, Oct 20, 2020 at 3:49 AM bucoo@sohu.com <bucoo@sohu.com> wrote:

I write a path for soupport parallel distinct, union and aggregate using batch sort.
steps:
1. generate hash value for group clauses values, and using mod hash value save to batch
2. end of outer plan, wait all other workers finish write to batch
3. echo worker get a unique batch number, call tuplesort_performsort() function finish this batch sort
4. return row for this batch
5. if not end of all batchs, got step 3

BatchSort paln make sure same tuple(group clause) return in same range, so Unique(or GroupAggregate) plan can work.

Hi!

Interesting work! In the past a few people have speculated about a
Parallel Repartition operator that could partition tuples a bit like
this, so that each process gets a different set of partitions. Here
you combine that with a sort. By doing both things in one node, you
avoid a lot of overheads (writing into a tuplestore once in the
repartitioning node, and then once again in the sort node, with tuples
being copied one-by-one between the two nodes).

If I understood correctly, the tuples emitted by Parallel Batch Sort
in each process are ordered by (hash(key, ...) % npartitions, key,
...), but the path is claiming to be ordered by (key, ...), no?
That's enough for Unique and Aggregate to give the correct answer,
because they really only require equal keys to be consecutive (and in
the same process), but maybe some other plan could break?

#5bucoo@sohu.com
bucoo@sohu.com
In reply to: bucoo@sohu.com (#1)
Re: Re: parallel distinct union and aggregate support patch

Interesting idea. So IIUC, whenever a worker is scanning the tuple it
will directly put it into the respective batch(shared tuple store),
based on the hash on grouping column and once all the workers are
doing preparing the batch then each worker will pick those baches one
by one, perform sort and finish the aggregation. I think there is a
scope of improvement that instead of directly putting the tuple to the
batch what if the worker does the partial aggregations and then it
places the partially aggregated rows in the shared tuple store based
on the hash value and then the worker can pick the batch by batch. By
doing this way, we can avoid doing large sorts. And then this
approach can also be used with the hash aggregate, I mean the
partially aggregated data by the hash aggregate can be put into the
respective batch.

Good idea. Batch sort suitable for large aggregate result rows,
in large aggregate result using partial aggregation maybe out of memory,
and all aggregate functions must support partial(using batch sort this is unnecessary).

Actually i written a batch hash store for hash aggregate(for pg11) like this idea,
but not write partial aggregations to shared tuple store, it's write origin tuple and hash value
to shared tuple store, But it's not support parallel grouping sets.
I'am trying to write parallel hash aggregate support using batch shared tuple store for PG14,
and need support parallel grouping sets hash aggregate.

#6Dilip Kumar
dilipbalaut@gmail.com
In reply to: bucoo@sohu.com (#5)
Re: Re: parallel distinct union and aggregate support patch

On Fri, Oct 23, 2020 at 11:58 AM bucoo@sohu.com <bucoo@sohu.com> wrote:

Interesting idea. So IIUC, whenever a worker is scanning the tuple it
will directly put it into the respective batch(shared tuple store),
based on the hash on grouping column and once all the workers are
doing preparing the batch then each worker will pick those baches one
by one, perform sort and finish the aggregation. I think there is a
scope of improvement that instead of directly putting the tuple to the
batch what if the worker does the partial aggregations and then it
places the partially aggregated rows in the shared tuple store based
on the hash value and then the worker can pick the batch by batch. By
doing this way, we can avoid doing large sorts. And then this
approach can also be used with the hash aggregate, I mean the
partially aggregated data by the hash aggregate can be put into the
respective batch.

Good idea. Batch sort suitable for large aggregate result rows,
in large aggregate result using partial aggregation maybe out of memory,
and all aggregate functions must support partial(using batch sort this is unnecessary).

Actually i written a batch hash store for hash aggregate(for pg11) like this idea,
but not write partial aggregations to shared tuple store, it's write origin tuple and hash value
to shared tuple store, But it's not support parallel grouping sets.
I'am trying to write parallel hash aggregate support using batch shared tuple store for PG14,
and need support parallel grouping sets hash aggregate.

I was trying to look into this patch to understand the logic in more
detail. Actually, there are no comments at all so it's really hard to
understand what the code is trying to do.

I was reading the below functions, which is the main entry point for
the batch sort.

+static TupleTableSlot *ExecBatchSortPrepare(PlanState *pstate)
+{
...
+ for (;;)
+ {
...
+ tuplesort_puttupleslot(state->batches[hash%node->numBatches], slot);
+ }
+
+ for (i=node->numBatches;i>0;)
+ tuplesort_performsort(state->batches[--i]);
+build_already_done_:
+ if (parallel)
+ {
+ for (i=node->numBatches;i>0;)
+ {
+ --i;
+ if (state->batches[i])
+ {
+ tuplesort_end(state->batches[i]);
+ state->batches[i] = NULL;
+ }
+ }

I did not understand this part, that once each worker has performed
their local batch-wise sort why we are clearing the baches? I mean
individual workers have their on batches so eventually they supposed
to get merged. Can you explain this part and also it will be better
if you can add the comments.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#7Robert Haas
robertmhaas@gmail.com
In reply to: Dilip Kumar (#3)
Re: parallel distinct union and aggregate support patch

On Thu, Oct 22, 2020 at 5:08 AM Dilip Kumar <dilipbalaut@gmail.com> wrote:

Interesting idea. So IIUC, whenever a worker is scanning the tuple it
will directly put it into the respective batch(shared tuple store),
based on the hash on grouping column and once all the workers are
doing preparing the batch then each worker will pick those baches one
by one, perform sort and finish the aggregation. I think there is a
scope of improvement that instead of directly putting the tuple to the
batch what if the worker does the partial aggregations and then it
places the partially aggregated rows in the shared tuple store based
on the hash value and then the worker can pick the batch by batch. By
doing this way, we can avoid doing large sorts. And then this
approach can also be used with the hash aggregate, I mean the
partially aggregated data by the hash aggregate can be put into the
respective batch.

I am not sure if this would be a win if the typical group size is
small and the transition state has to be serialized/deserialized.
Possibly we need multiple strategies, but I guess we'd have to test
performance to be sure.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#8Dilip Kumar
dilipbalaut@gmail.com
In reply to: Dilip Kumar (#6)
Re: Re: parallel distinct union and aggregate support patch

On Tue, Oct 27, 2020 at 3:27 PM Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Fri, Oct 23, 2020 at 11:58 AM bucoo@sohu.com <bucoo@sohu.com> wrote:

Interesting idea. So IIUC, whenever a worker is scanning the tuple it
will directly put it into the respective batch(shared tuple store),
based on the hash on grouping column and once all the workers are
doing preparing the batch then each worker will pick those baches one
by one, perform sort and finish the aggregation. I think there is a
scope of improvement that instead of directly putting the tuple to the
batch what if the worker does the partial aggregations and then it
places the partially aggregated rows in the shared tuple store based
on the hash value and then the worker can pick the batch by batch. By
doing this way, we can avoid doing large sorts. And then this
approach can also be used with the hash aggregate, I mean the
partially aggregated data by the hash aggregate can be put into the
respective batch.

Good idea. Batch sort suitable for large aggregate result rows,
in large aggregate result using partial aggregation maybe out of memory,
and all aggregate functions must support partial(using batch sort this is unnecessary).

Actually i written a batch hash store for hash aggregate(for pg11) like this idea,
but not write partial aggregations to shared tuple store, it's write origin tuple and hash value
to shared tuple store, But it's not support parallel grouping sets.
I'am trying to write parallel hash aggregate support using batch shared tuple store for PG14,
and need support parallel grouping sets hash aggregate.

I was trying to look into this patch to understand the logic in more
detail. Actually, there are no comments at all so it's really hard to
understand what the code is trying to do.

I was reading the below functions, which is the main entry point for
the batch sort.

+static TupleTableSlot *ExecBatchSortPrepare(PlanState *pstate)
+{
...
+ for (;;)
+ {
...
+ tuplesort_puttupleslot(state->batches[hash%node->numBatches], slot);
+ }
+
+ for (i=node->numBatches;i>0;)
+ tuplesort_performsort(state->batches[--i]);
+build_already_done_:
+ if (parallel)
+ {
+ for (i=node->numBatches;i>0;)
+ {
+ --i;
+ if (state->batches[i])
+ {
+ tuplesort_end(state->batches[i]);
+ state->batches[i] = NULL;
+ }
+ }

I did not understand this part, that once each worker has performed
their local batch-wise sort why we are clearing the baches? I mean
individual workers have their on batches so eventually they supposed
to get merged. Can you explain this part and also it will be better
if you can add the comments.

I think I got this, IIUC, each worker is initializing the shared
short and performing the batch-wise sorting and we will wait on a
barrier so that all the workers can finish with their sorting. Once
that is done the workers will coordinate and pick the batch by batch
and perform the final merge for the batch.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#9Dilip Kumar
dilipbalaut@gmail.com
In reply to: Robert Haas (#7)
Re: parallel distinct union and aggregate support patch

On Tue, Oct 27, 2020 at 5:43 PM Robert Haas <robertmhaas@gmail.com> wrote:

On Thu, Oct 22, 2020 at 5:08 AM Dilip Kumar <dilipbalaut@gmail.com> wrote:

Interesting idea. So IIUC, whenever a worker is scanning the tuple it
will directly put it into the respective batch(shared tuple store),
based on the hash on grouping column and once all the workers are
doing preparing the batch then each worker will pick those baches one
by one, perform sort and finish the aggregation. I think there is a
scope of improvement that instead of directly putting the tuple to the
batch what if the worker does the partial aggregations and then it
places the partially aggregated rows in the shared tuple store based
on the hash value and then the worker can pick the batch by batch. By
doing this way, we can avoid doing large sorts. And then this
approach can also be used with the hash aggregate, I mean the
partially aggregated data by the hash aggregate can be put into the
respective batch.

I am not sure if this would be a win if the typical group size is
small and the transition state has to be serialized/deserialized.
Possibly we need multiple strategies, but I guess we'd have to test
performance to be sure.

+1

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#10bucoo@sohu.com
bucoo@sohu.com
In reply to: bucoo@sohu.com (#1)
Re: Re: parallel distinct union and aggregate support patch

On Tue, Oct 27, 2020 at 3:27 PM Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Fri, Oct 23, 2020 at 11:58 AM bucoo@sohu.com <bucoo@sohu.com> wrote:

Interesting idea. So IIUC, whenever a worker is scanning the tuple it
will directly put it into the respective batch(shared tuple store),
based on the hash on grouping column and once all the workers are
doing preparing the batch then each worker will pick those baches one
by one, perform sort and finish the aggregation. I think there is a
scope of improvement that instead of directly putting the tuple to the
batch what if the worker does the partial aggregations and then it
places the partially aggregated rows in the shared tuple store based
on the hash value and then the worker can pick the batch by batch. By
doing this way, we can avoid doing large sorts. And then this
approach can also be used with the hash aggregate, I mean the
partially aggregated data by the hash aggregate can be put into the
respective batch.

Good idea. Batch sort suitable for large aggregate result rows,
in large aggregate result using partial aggregation maybe out of memory,
and all aggregate functions must support partial(using batch sort this is unnecessary).

Actually i written a batch hash store for hash aggregate(for pg11) like this idea,
but not write partial aggregations to shared tuple store, it's write origin tuple and hash value
to shared tuple store, But it's not support parallel grouping sets.
I'am trying to write parallel hash aggregate support using batch shared tuple store for PG14,
and need support parallel grouping sets hash aggregate.

I was trying to look into this patch to understand the logic in more
detail. Actually, there are no comments at all so it's really hard to
understand what the code is trying to do.

I was reading the below functions, which is the main entry point for
the batch sort.

+static TupleTableSlot *ExecBatchSortPrepare(PlanState *pstate)
+{
...
+ for (;;)
+ {
...
+ tuplesort_puttupleslot(state->batches[hash%node->numBatches], slot);
+ }
+
+ for (i=node->numBatches;i>0;)
+ tuplesort_performsort(state->batches[--i]);
+build_already_done_:
+ if (parallel)
+ {
+ for (i=node->numBatches;i>0;)
+ {
+ --i;
+ if (state->batches[i])
+ {
+ tuplesort_end(state->batches[i]);
+ state->batches[i] = NULL;
+ }
+ }

I did not understand this part, that once each worker has performed
their local batch-wise sort why we are clearing the baches? I mean
individual workers have their on batches so eventually they supposed
to get merged. Can you explain this part and also it will be better
if you can add the comments.

I think I got this, IIUC, each worker is initializing the shared
short and performing the batch-wise sorting and we will wait on a
barrier so that all the workers can finish with their sorting. Once
that is done the workers will coordinate and pick the batch by batch
and perform the final merge for the batch.

Yes, it is. Each worker open the shared sort as "worker" (nodeBatchSort.c:134),
end of all worker performing, pick one batch and open it as "leader"(nodeBatchSort.c:54).

#11bucoo@sohu.com
bucoo@sohu.com
In reply to: bucoo@sohu.com (#1)
2 attachment(s)
Re: parallel distinct union and aggregate support patch

Hi
Here is patch for parallel distinct union aggregate and grouping sets support using batch hash agg.
Please review.

how to use:
set enable_batch_hashagg = on

how to work:
like batch sort, but not sort each batch, just save hash value in each rows

unfinished work:
not support rescan yet. welcome to add. Actually I don't really understand how rescan works in parallel mode.

other:
patch 1 base on branch master(80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f)
patch 1 and 2 see /messages/by-id/2020101922424962544053@sohu.com
patch 3:
extpand shared tuple store and add batch store module.
By the way, use atomic operations instead LWLock for shared tuple store get next read page.
patch 4:
using batch hash agg support parallels

发件人: bucoo@sohu.com
发送时间: 2020-10-19 22:42
收件人: pgsql-hackers
主题: parallel distinct union and aggregate support patch
Hi hackers,
I write a path for soupport parallel distinct, union and aggregate using batch sort.
steps:
1. generate hash value for group clauses values, and using mod hash value save to batch
2. end of outer plan, wait all other workers finish write to batch
3. echo worker get a unique batch number, call tuplesort_performsort() function finish this batch sort
4. return row for this batch
5. if not end of all batchs, got step 3

BatchSort paln make sure same tuple(group clause) return in same range, so Unique(or GroupAggregate) plan can work.

path 2 for parallel aggregate, this is a simple use
but regress failed for partitionwise aggregation difference plan
from GatherMerge->Sort->Append->...
to Sort->Gahter->Append->...
I have no idea how to modify it.

Same idea I writed a batch shared tuple store for HashAgg in our PG version, I will send patch for PG14 when I finish it.

The following is a description in Chinese
英语不好,所以这里写点中文,希望上面写的不对的地方请大家帮忙纠正一下。
BatchSort的工作原理
1. 先按group clause计算出hash值,并按取模的值放入不同的批次
2. 当下层plan返回所有的行后,等待所有其它的工作进程结束
3. 每一个工作进程索取一个唯一的一个批次, 并调用tuplesort_performsort()函数完成最终排序
4. 返回本批次的所有行
5. 如果所有的批次没有读完,则返回第3步
BatchSort plan能保证相同的数据(按分给表达式)在同一个周期内返回,所以几个去重和分组相关的plan可以正常工作。
第2个补丁是支持并行分组的,只做一次分组,而不是并行进程做每一次分组后,主进程再进行二次分组。
这个补丁导致了regress测试中的partitionwise aggregation失败,原来的执行计划有所变更。
补丁只写了一个简单的使用BatchSort plan的方法,可能还需要添加其它用法。

用同样的思想我写了一个使用shared tuple store的HashAgg在我们的AntDB版本中(最新版本暂未开源),适配完PG14版本后我会发出来。
打个广告:欢迎关注我们亚信公司基于PG的分布式数据库产品AntDB,开源地址 https://github.com/ADBSQL/AntDB

bucoo@sohu.com

Attachments:

0003-extpand-shared-tuple-store-and-add-batch-store-modul.patchapplication/octet-stream; name=0003-extpand-shared-tuple-store-and-add-batch-store-modul.patchDownload
From 027b572d1134842d81b7a4b4310daabdbd75a2f2 Mon Sep 17 00:00:00 2001
From: bucoo <bucoo@sohu.com>
Date: Wed, 28 Oct 2020 16:27:43 +0800
Subject: [PATCH 3/4] extpand shared tuple store and add batch store module. By
 the way, use atomic operations instead LWLock for shared tuple store get next
 read page.

---
 src/backend/storage/lmgr/lwlock.c         |   2 -
 src/backend/utils/sort/Makefile           |   3 +-
 src/backend/utils/sort/batchstore.c       | 381 ++++++++++++++++++++++++++++++
 src/backend/utils/sort/sharedtuplestore.c | 112 +++++++--
 src/include/storage/lwlock.h              |   1 -
 src/include/utils/batchstore.h            |  38 +++
 src/include/utils/sharedtuplestore.h      |  12 +
 7 files changed, 526 insertions(+), 23 deletions(-)
 create mode 100644 src/backend/utils/sort/batchstore.c
 create mode 100644 src/include/utils/batchstore.h

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 2fa90cc095..b11a91e94c 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -170,8 +170,6 @@ static const char *const BuiltinTrancheNames[] = {
 	"PerSessionRecordType",
 	/* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */
 	"PerSessionRecordTypmod",
-	/* LWTRANCHE_SHARED_TUPLESTORE: */
-	"SharedTupleStore",
 	/* LWTRANCHE_SHARED_TIDBITMAP: */
 	"SharedTidBitmap",
 	/* LWTRANCHE_PARALLEL_APPEND: */
diff --git a/src/backend/utils/sort/Makefile b/src/backend/utils/sort/Makefile
index 7ac3659261..f82f5aa8cd 100644
--- a/src/backend/utils/sort/Makefile
+++ b/src/backend/utils/sort/Makefile
@@ -19,7 +19,8 @@ OBJS = \
 	sharedtuplestore.o \
 	sortsupport.o \
 	tuplesort.o \
-	tuplestore.o
+	tuplestore.o \
+	batchstore.o
 
 tuplesort.o: qsort_tuple.c
 
diff --git a/src/backend/utils/sort/batchstore.c b/src/backend/utils/sort/batchstore.c
new file mode 100644
index 0000000000..3cca7769c9
--- /dev/null
+++ b/src/backend/utils/sort/batchstore.c
@@ -0,0 +1,381 @@
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/parallel.h"
+#include "commands/tablespace.h"
+#include "executor/nodeHash.h"
+#include "port/atomics.h"
+#include "storage/buffile.h"
+#include "utils/batchstore.h"
+#include "utils/memutils.h"
+#include "utils/sharedtuplestore.h"
+
+#define InvalidBatch UINT32_MAX
+
+typedef enum BatchMethod
+{
+	BSM_HASH = 1,
+	BSM_PARALLEL_HASH
+}BatchMethod;
+
+typedef struct BatchStoreParallelHashData
+{
+	pg_atomic_uint32	cur_batches;
+	uint32				num_batches;
+	uint32				num_participants;
+}BatchStoreParallelHashData;
+
+typedef struct BatchStoreData
+{
+	BatchStoreFuncs	func;
+	BatchMethod	method;
+	uint32		num_batches;
+	void	  **all_batches;
+	void	   *cur_batch_ptr;
+	uint32		cur_batch_num;
+	union
+	{
+		/* for hash */
+		struct
+		{
+			StringInfoData	hash_read_buf;
+		};
+		/* for parallel hash */
+		struct
+		{
+			dsm_segment		   *dsm_seg;
+			MemoryContext		accessor_mcontext;
+			Bitmapset		   *our_batches;		/* we got batches(for rescan) */
+			pg_atomic_uint32   *shm_ph_batch_num;	/* in shared memory, parallel hash batch number */
+			bool				ended_parallel;		/* parallel batches loop end? for rescan. */
+			bool				parallel_batch;		/* each worker read part of all batches? */
+		};
+	};
+}BatchStoreData;
+
+static void bs_write_normal_hash(BatchStore bs, MinimalTuple mtup, uint32 hash);
+static MinimalTuple bs_read_normal_hash(BatchStore bs, uint32 *hash);
+
+static void bs_write_parallel_hash(BatchStore bs, MinimalTuple mtup, uint32 hash);
+static void bs_write_parallel_one_batch_hash(BatchStore bs, MinimalTuple mtup, uint32 hash);
+static MinimalTuple bs_read_parallel_hash(BatchStore bs, uint32 *hash);
+
+static inline BatchStore make_empty_batch_store(uint32 num_batches)
+{
+	BatchStore bs;
+
+	bs = palloc0(MAXALIGN(sizeof(BatchStoreData)) +
+					sizeof(void*) * num_batches);
+	bs->all_batches = (void**)(((char*)bs) + MAXALIGN(sizeof(*bs)));
+	bs->num_batches = num_batches;
+	bs->cur_batch_num = InvalidBatch;
+
+	return bs;
+}
+
+BatchStore bs_begin_hash(uint32 num_batches)
+{
+	BatchStore bs = make_empty_batch_store(num_batches);
+	bs->method = BSM_HASH;
+
+	initStringInfo(&bs->hash_read_buf);
+	enlargeStringInfo(&bs->hash_read_buf, MINIMAL_TUPLE_DATA_OFFSET);
+	MemSet(bs->hash_read_buf.data, 0, MINIMAL_TUPLE_DATA_OFFSET);
+
+	PrepareTempTablespaces();
+
+	bs->func.hash_write = bs_write_normal_hash;
+	bs->func.hash_read = bs_read_normal_hash;
+	return bs;
+}
+
+size_t bs_parallel_hash_estimate(uint32 num_batches, uint32 nparticipants)
+{
+	return MAXALIGN(sizeof(struct BatchStoreParallelHashData)) + 
+				MAXALIGN(sts_estimate(nparticipants)) * num_batches;
+}
+
+static BatchStore bs_begin_parallel_hash(BatchStoreParallelHash bsph,
+										 uint32 my_participant_num, bool init,
+										 SharedFileSet *fileset, const char *name,
+										 dsm_segment *dsm_seg)
+{
+	uint32			i;
+	MemoryContext	oldcontext;
+	char		   *addr;
+	char			buffer[24];
+	Size			sts_size = MAXALIGN(sts_estimate(bsph->num_participants));
+	BatchStore		bs = make_empty_batch_store(bsph->num_batches);
+
+	bs->method = BSM_PARALLEL_HASH;
+	bs->shm_ph_batch_num = &bsph->cur_batches;
+
+	bs->accessor_mcontext = AllocSetContextCreate(CurrentMemoryContext,
+												  "batch parallel hash",
+												  ALLOCSET_DEFAULT_SIZES);
+	oldcontext = MemoryContextSwitchTo(bs->accessor_mcontext);
+	addr = ((char*)bsph) + MAXALIGN(sizeof(*bsph));
+	for (i=bsph->num_batches;i>0;)
+	{
+		--i;
+		if (init)
+		{
+			sprintf(buffer, "%s_%u", name, i);
+			bs->all_batches[i] = sts_initialize((SharedTuplestore*)addr,
+												bsph->num_participants,
+												my_participant_num,
+												sizeof(uint32),
+												0,
+												fileset,
+												buffer);
+		}else
+		{
+			bs->all_batches[i] = sts_attach((SharedTuplestore*)addr,
+											my_participant_num,
+											fileset);
+		}
+		addr += sts_size;
+	}
+	MemoryContextSwitchTo(oldcontext);
+
+	bs->dsm_seg = dsm_seg;
+	bs->func.hash_read = bs_read_parallel_hash;
+	if (bs->num_batches == 1)
+		bs->func.hash_write = bs_write_parallel_one_batch_hash;
+	else
+		bs->func.hash_write = bs_write_parallel_hash;
+
+	return bs;
+}
+
+BatchStore bs_init_parallel_hash(uint32 num_batches,
+								 uint32 nparticipants, uint32 my_participant_num,
+								 BatchStoreParallelHash bsph, dsm_segment *dsm_seg,
+								 SharedFileSet *fileset, const char *name)
+{
+	Assert(name != NULL && fileset != NULL);
+	bsph->num_batches = num_batches;
+	bsph->num_participants = nparticipants;
+	pg_atomic_init_u32(&bsph->cur_batches, InvalidBatch);
+
+	return bs_begin_parallel_hash(bsph, my_participant_num, true, fileset, name, dsm_seg);
+}
+
+BatchStore bs_attach_parallel_hash(BatchStoreParallelHash bsph, dsm_segment *dsm_seg,
+								   SharedFileSet *fileset, uint32 my_participant_num)
+{
+	return bs_begin_parallel_hash(bsph, my_participant_num, false, fileset, NULL, dsm_seg);
+}
+
+void bs_destory(BatchStore bs)
+{
+	uint32	i;
+	if (bs == NULL)
+		return;
+
+	switch(bs->method)
+	{
+	case BSM_HASH:
+		for(i=0;i<bs->num_batches;++i)
+		{
+			if (bs->all_batches[i])
+				BufFileClose(bs->all_batches[i]);
+		}
+		pfree(bs->hash_read_buf.data);
+		break;
+	case BSM_PARALLEL_HASH:
+		{
+			BatchStoreParallelHash bsph = (BatchStoreParallelHash)(((char*)bs->shm_ph_batch_num) -
+											offsetof(BatchStoreParallelHashData, cur_batches));
+			uint32 count = bsph->num_batches;
+			while (count > 0)
+				sts_detach(bs->all_batches[--count]);
+			MemoryContextDelete(bs->accessor_mcontext);
+		}
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+
+	pfree(bs);
+}
+
+static void bs_write_normal_hash(BatchStore bs, MinimalTuple mtup, uint32 hash)
+{
+	uint32 batch = hash % bs->num_batches;
+	uint32 data_len = mtup->t_len - MINIMAL_TUPLE_DATA_OFFSET;
+	BufFile *buffile = bs->all_batches[batch];
+
+	if (unlikely(buffile == NULL))
+	{
+		MemoryContext oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(bs));
+		buffile = BufFileCreateTemp(false);
+		bs->all_batches[batch] = buffile;
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	BufFileWrite(buffile, &hash, sizeof(hash));
+	BufFileWrite(buffile, &mtup->t_len, sizeof(mtup->t_len));
+	BufFileWrite(buffile, ((char*)mtup) + MINIMAL_TUPLE_DATA_OFFSET, data_len);
+}
+
+static MinimalTuple bs_read_normal_hash(BatchStore bs, uint32 *hash)
+{
+	MinimalTuple	mtup;
+	size_t			nread;
+	uint32			head[2];
+	uint32			data_len;
+
+	nread = BufFileRead(bs->cur_batch_ptr, head, sizeof(head));
+	if (nread == 0)
+		return NULL;
+
+	if (nread != sizeof(head))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from batch store temporary file: %m")));
+	*hash = head[0];
+	enlargeStringInfo(&bs->hash_read_buf, head[1]);
+	mtup = (MinimalTuple)bs->hash_read_buf.data;
+	mtup->t_len = head[1];
+	data_len = head[1] - MINIMAL_TUPLE_DATA_OFFSET;
+	if (BufFileRead(bs->cur_batch_ptr, ((char*)mtup) + MINIMAL_TUPLE_DATA_OFFSET, data_len) != data_len)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from batch store temporary file: %m")));
+
+	return mtup;
+}
+
+void bs_end_write(BatchStore bs)
+{
+	uint32 i;
+	switch(bs->method)
+	{
+	case BSM_HASH:
+		/* nothing to do */
+		break;
+	case BSM_PARALLEL_HASH:
+		for (i=bs->num_batches;i>0;)
+			sts_end_write(bs->all_batches[--i]);
+		bs->cur_batch_ptr = NULL;
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+}
+
+static void bs_write_parallel_hash(BatchStore bs, MinimalTuple mtup, uint32 hash)
+{
+	sts_puttuple(bs->all_batches[hash%bs->num_batches],
+				 &hash,
+				 mtup);
+}
+
+static void bs_write_parallel_one_batch_hash(BatchStore bs, MinimalTuple mtup, uint32 hash)
+{
+	Assert(bs->num_batches == 1);
+	sts_puttuple(bs->all_batches[0],
+				 &hash,
+				 mtup);
+}
+
+static MinimalTuple bs_read_parallel_hash(BatchStore bs, uint32 *hash)
+{
+	return sts_scan_next(bs->cur_batch_ptr, hash);
+}
+
+bool bs_next_batch(BatchStore bs, bool no_parallel)
+{
+	uint32 batch;
+	switch(bs->method)
+	{
+	case BSM_HASH:
+
+		batch = bs->cur_batch_num;
+		++batch;
+
+		for (;batch < bs->num_batches;++batch)
+		{
+			if (bs->all_batches[batch])
+			{
+				bs->cur_batch_ptr = bs->all_batches[batch];
+				bs->cur_batch_num = batch;
+				if (BufFileSeek(bs->cur_batch_ptr, 0, 0, SEEK_SET) != 0)
+				{
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("can not seek batch store file to head")));
+				}
+				return true;
+			}
+		}
+		break;
+	case BSM_PARALLEL_HASH:
+		if (no_parallel)
+		{
+			batch = bs->cur_batch_num;
+			++batch;
+		}else
+		{
+			batch = pg_atomic_add_fetch_u32(bs->shm_ph_batch_num, 1);
+		}
+
+		if (batch < bs->num_batches)
+		{
+			bs->cur_batch_num = batch;
+			bs->cur_batch_ptr = bs->all_batches[batch];
+			sts_begin_scan(bs->cur_batch_ptr);
+			return true;
+		}
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+
+	return false;
+}
+
+void bs_rescan(BatchStore bs)
+{
+	switch(bs->method)
+	{
+	case BSM_HASH:
+		bs->cur_batch_ptr = NULL;
+		bs->cur_batch_num = InvalidBatch;
+		break;
+	case BSM_PARALLEL_HASH:
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("parallel batch store not support rescan yet")));
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+}
+
+void bs_end_cur_batch(BatchStore bs)
+{
+	switch(bs->method)
+	{
+	case BSM_HASH:
+		bs->cur_batch_ptr = NULL;
+		break;
+	case BSM_PARALLEL_HASH:
+		sts_end_scan(bs->cur_batch_ptr);
+		bs->cur_batch_ptr = NULL;
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+}
diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c
index fe298ce92e..fbad3098e6 100644
--- a/src/backend/utils/sort/sharedtuplestore.c
+++ b/src/backend/utils/sort/sharedtuplestore.c
@@ -24,9 +24,10 @@
 #include "access/htup.h"
 #include "access/htup_details.h"
 #include "miscadmin.h"
+#include "port/atomics.h"
 #include "storage/buffile.h"
-#include "storage/lwlock.h"
 #include "storage/sharedfileset.h"
+#include "utils/memutils.h"
 #include "utils/sharedtuplestore.h"
 
 /*
@@ -50,8 +51,7 @@ typedef struct SharedTuplestoreChunk
 /* Per-participant shared state. */
 typedef struct SharedTuplestoreParticipant
 {
-	LWLock		lock;
-	BlockNumber read_page;		/* Page number for next read. */
+	pg_atomic_uint32 read_page;	/* Page number for next read. */
 	BlockNumber npages;			/* Number of pages written. */
 	bool		writing;		/* Used only for assertions. */
 } SharedTuplestoreParticipant;
@@ -72,6 +72,8 @@ struct SharedTuplestore
 struct SharedTuplestoreAccessor
 {
 	int			participant;	/* My participant number. */
+	bool		is_read_only;	/* is read only attach? */
+	bool		is_normal_scan;	/* is not parallel scan? */
 	SharedTuplestore *sts;		/* The shared state. */
 	SharedFileSet *fileset;		/* The SharedFileSet holding files. */
 	MemoryContext context;		/* Memory context for buffers. */
@@ -155,9 +157,8 @@ sts_initialize(SharedTuplestore *sts, int participants,
 
 	for (i = 0; i < participants; ++i)
 	{
-		LWLockInitialize(&sts->participants[i].lock,
-						 LWTRANCHE_SHARED_TUPLESTORE);
-		sts->participants[i].read_page = 0;
+		pg_atomic_init_u32(&sts->participants[i].read_page, 0);
+		sts->participants[i].npages = 0;
 		sts->participants[i].writing = false;
 	}
 
@@ -192,6 +193,24 @@ sts_attach(SharedTuplestore *sts,
 	return accessor;
 }
 
+SharedTuplestoreAccessor *
+sts_attach_read_only(SharedTuplestore *sts,
+					 SharedFileSet *fileset)
+{
+	SharedTuplestoreAccessor *accessor;
+
+	Assert(sts->nparticipants > 0);
+
+	accessor = palloc0(sizeof(SharedTuplestoreAccessor));
+	accessor->is_read_only = true;
+	accessor->participant = 0;
+	accessor->sts = sts;
+	accessor->fileset = fileset;
+	accessor->context = CurrentMemoryContext;
+
+	return accessor;
+}
+
 static void
 sts_flush_chunk(SharedTuplestoreAccessor *accessor)
 {
@@ -242,7 +261,7 @@ sts_reinitialize(SharedTuplestoreAccessor *accessor)
 	 */
 	for (i = 0; i < accessor->sts->nparticipants; ++i)
 	{
-		accessor->sts->participants[i].read_page = 0;
+		pg_atomic_init_u32(&accessor->sts->participants[i].read_page, 0);
 	}
 }
 
@@ -272,6 +291,8 @@ sts_begin_parallel_scan(SharedTuplestoreAccessor *accessor)
 	accessor->read_participant = accessor->participant;
 	accessor->read_file = NULL;
 	accessor->read_next_page = 0;
+	accessor->read_ntuples = 0;
+	accessor->read_ntuples_available = 0;
 }
 
 /*
@@ -302,15 +323,23 @@ sts_puttuple(SharedTuplestoreAccessor *accessor, void *meta_data,
 {
 	size_t		size;
 
+	if (unlikely(accessor->is_read_only))
+	{
+		ereport(ERROR,
+				(errmsg("shard tuplestore is attached read only")));
+	}
+
 	/* Do we have our own file yet? */
 	if (accessor->write_file == NULL)
 	{
 		SharedTuplestoreParticipant *participant;
-		char		name[MAXPGPATH];
+		char			name[MAXPGPATH];
+		MemoryContext	oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(accessor));
 
 		/* Create one.  Only this backend will write into it. */
 		sts_filename(name, accessor, accessor->participant);
 		accessor->write_file = BufFileCreateShared(accessor->fileset, name);
+		MemoryContextSwitchTo(oldcontext);
 
 		/* Set up the shared state for this backend's file. */
 		participant = &accessor->sts->participants[accessor->participant];
@@ -532,20 +561,36 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data)
 		/* Find the location of a new chunk to read. */
 		p = &accessor->sts->participants[accessor->read_participant];
 
-		LWLockAcquire(&p->lock, LW_EXCLUSIVE);
-		/* We can skip directly past overflow pages we know about. */
-		if (p->read_page < accessor->read_next_page)
-			p->read_page = accessor->read_next_page;
-		eof = p->read_page >= p->npages;
-		if (!eof)
+		if (accessor->is_normal_scan)
+		{
+			eof = accessor->read_next_page >= p->npages;
+			if (!eof)
+			{
+				read_page = accessor->read_next_page;
+				accessor->read_next_page += STS_CHUNK_PAGES;
+			}
+		}else
 		{
 			/* Claim the next chunk. */
-			read_page = p->read_page;
-			/* Advance the read head for the next reader. */
-			p->read_page += STS_CHUNK_PAGES;
-			accessor->read_next_page = p->read_page;
+			read_page = pg_atomic_read_u32(&p->read_page);
+			/* We can skip directly past overflow pages we know about. */
+			while (read_page < accessor->read_next_page)
+			{
+				if (pg_atomic_compare_exchange_u32(&p->read_page,
+												   &read_page,
+												   accessor->read_next_page))
+					break;
+			}
+			while ((eof = read_page >= p->npages) == false)
+			{
+				/* Advance the read head for the next reader. */
+				accessor->read_next_page = read_page + STS_CHUNK_PAGES;
+				if (pg_atomic_compare_exchange_u32(&p->read_page,
+												   &read_page,
+												   accessor->read_next_page))
+					break;
+			}
 		}
-		LWLockRelease(&p->lock);
 
 		if (!eof)
 		{
@@ -556,10 +601,12 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data)
 			if (accessor->read_file == NULL)
 			{
 				char		name[MAXPGPATH];
+				MemoryContext oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(accessor));
 
 				sts_filename(name, accessor, accessor->read_participant);
 				accessor->read_file =
 					BufFileOpenShared(accessor->fileset, name, O_RDONLY);
+				MemoryContextSwitchTo(oldcontext);
 			}
 
 			/* Seek and load the chunk header. */
@@ -626,3 +673,30 @@ sts_filename(char *name, SharedTuplestoreAccessor *accessor, int participant)
 {
 	snprintf(name, MAXPGPATH, "%s.p%d", accessor->sts->name, participant);
 }
+
+void sts_begin_scan(SharedTuplestoreAccessor *accessor)
+{
+	sts_begin_parallel_scan(accessor);
+	accessor->is_normal_scan = true;
+}
+
+void sts_end_scan(SharedTuplestoreAccessor *accessor)
+{
+	Assert(accessor->is_normal_scan);
+	sts_end_parallel_scan(accessor);
+	accessor->is_normal_scan = false;
+}
+
+MinimalTuple sts_scan_next(SharedTuplestoreAccessor *accessor,
+					   void *meta_data)
+{
+	Assert(accessor->is_normal_scan);
+	return sts_parallel_scan_next(accessor, meta_data);
+}
+
+void sts_detach(SharedTuplestoreAccessor *accessor)
+{
+	sts_end_write(accessor);
+	sts_end_parallel_scan(accessor);
+	pfree(accessor);
+}
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index af9b41795d..8a13397379 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -214,7 +214,6 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_PER_SESSION_DSA,
 	LWTRANCHE_PER_SESSION_RECORD_TYPE,
 	LWTRANCHE_PER_SESSION_RECORD_TYPMOD,
-	LWTRANCHE_SHARED_TUPLESTORE,
 	LWTRANCHE_SHARED_TIDBITMAP,
 	LWTRANCHE_PARALLEL_APPEND,
 	LWTRANCHE_PER_XACT_PREDICATE_LIST,
diff --git a/src/include/utils/batchstore.h b/src/include/utils/batchstore.h
new file mode 100644
index 0000000000..1afbaea570
--- /dev/null
+++ b/src/include/utils/batchstore.h
@@ -0,0 +1,38 @@
+#ifndef BATCH_STORE_H
+#define BATCH_STORE_H
+
+#include "access/htup.h"
+#include "storage/dsm.h"
+#include "storage/sharedfileset.h"
+
+typedef struct BatchStoreData* BatchStore;
+typedef struct BatchStoreParallelHashData* BatchStoreParallelHash;
+
+typedef struct BatchStoreFuncs
+{
+	void (*hash_write)(BatchStore bs, MinimalTuple mtup, uint32 hash);
+	MinimalTuple (*hash_read)(BatchStore bs, uint32 *hash);
+}BatchStoreFuncs;
+
+#define bs_write_hash(bs, mtup, hash) (*((BatchStoreFuncs*)bs)->hash_write)(bs, mtup, hash)
+#define bs_read_hash(bs, phash) (*((BatchStoreFuncs*)bs)->hash_read)(bs, phash)
+
+extern BatchStore bs_begin_hash(uint32 num_batches);
+
+extern size_t bs_parallel_hash_estimate(uint32 num_batches, uint32 nparticipants);
+extern BatchStore bs_init_parallel_hash(uint32 num_batches,
+										uint32 nparticipants, uint32 my_participant_num,
+										BatchStoreParallelHash bsph, dsm_segment *dsm_seg,
+										SharedFileSet *fileset, const char *name);
+extern BatchStore bs_attach_parallel_hash(BatchStoreParallelHash bsph, dsm_segment *dsm_seg,
+										  SharedFileSet *fileset, uint32 my_participant_num);
+
+extern void bs_destory(BatchStore bs);
+
+
+extern void bs_end_write(BatchStore bs);
+
+extern bool bs_next_batch(BatchStore bs, bool no_parallel);
+extern void bs_rescan(BatchStore bs);
+extern void bs_end_cur_batch(BatchStore bs);
+#endif /* BATCH_STORE_H */
\ No newline at end of file
diff --git a/src/include/utils/sharedtuplestore.h b/src/include/utils/sharedtuplestore.h
index 9754504cc5..e8121fbe78 100644
--- a/src/include/utils/sharedtuplestore.h
+++ b/src/include/utils/sharedtuplestore.h
@@ -43,6 +43,9 @@ extern SharedTuplestoreAccessor *sts_attach(SharedTuplestore *sts,
 											int my_participant_number,
 											SharedFileSet *fileset);
 
+extern SharedTuplestoreAccessor *sts_attach_read_only(SharedTuplestore *sts,
+													  SharedFileSet *fileset);
+
 extern void sts_end_write(SharedTuplestoreAccessor *accessor);
 
 extern void sts_reinitialize(SharedTuplestoreAccessor *accessor);
@@ -58,4 +61,13 @@ extern void sts_puttuple(SharedTuplestoreAccessor *accessor,
 extern MinimalTuple sts_parallel_scan_next(SharedTuplestoreAccessor *accessor,
 										   void *meta_data);
 
+extern void sts_begin_scan(SharedTuplestoreAccessor *accessor);
+
+extern void sts_end_scan(SharedTuplestoreAccessor *accessor);
+
+extern MinimalTuple sts_scan_next(SharedTuplestoreAccessor *accessor,
+								  void *meta_data);
+
+extern void sts_detach(SharedTuplestoreAccessor *accessor);
+
 #endif							/* SHAREDTUPLESTORE_H */
-- 
2.16.3

0004-Parallel-distinct-union-aggregate-and-grouping-sets-.patchapplication/octet-stream; name=0004-Parallel-distinct-union-aggregate-and-grouping-sets-.patchDownload
From 54ba12269794d7c714c7396da6a41fb47a442b06 Mon Sep 17 00:00:00 2001
From: bucoo <bucoo@sohu.com>
Date: Wed, 28 Oct 2020 16:29:06 +0800
Subject: [PATCH 4/4] Parallel distinct union aggregate and grouping sets
 support using batch hash aggregate

---
 src/backend/commands/explain.c                    |   4 +
 src/backend/executor/nodeAgg.c                    | 543 ++++++++++++++++++----
 src/backend/nodes/copyfuncs.c                     |   1 +
 src/backend/nodes/outfuncs.c                      |   1 +
 src/backend/nodes/readfuncs.c                     |   1 +
 src/backend/optimizer/path/costsize.c             |  31 +-
 src/backend/optimizer/plan/createplan.c           |  12 +-
 src/backend/optimizer/plan/planner.c              | 131 ++++++
 src/backend/optimizer/prep/prepunion.c            |  27 ++
 src/backend/postmaster/pgstat.c                   |   3 +
 src/backend/utils/misc/guc.c                      |   9 +
 src/include/executor/nodeAgg.h                    |   2 +
 src/include/nodes/execnodes.h                     |   3 +
 src/include/nodes/nodes.h                         |   3 +-
 src/include/nodes/plannodes.h                     |   1 +
 src/include/optimizer/cost.h                      |   1 +
 src/include/optimizer/pathnode.h                  |   2 +
 src/include/pgstat.h                              |   3 +-
 src/test/regress/expected/groupingsets.out        |  65 +++
 src/test/regress/expected/partition_aggregate.out |  64 +++
 src/test/regress/expected/select_distinct.out     |  35 ++
 src/test/regress/expected/sysviews.out            |   3 +-
 src/test/regress/expected/union.out               |  30 ++
 src/test/regress/sql/groupingsets.sql             |  10 +
 src/test/regress/sql/partition_aggregate.sql      |  20 +
 src/test/regress/sql/select_distinct.sql          |   9 +
 src/test/regress/sql/union.sql                    |  13 +
 27 files changed, 941 insertions(+), 86 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 16a1fb035d..d4b336aa82 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1302,6 +1302,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 						pname = "MixedAggregate";
 						strategy = "Mixed";
 						break;
+					case AGG_BATCH_HASH:
+						pname = "BatchHashAggregate";
+						strategy = "BatchHashed";
+						break;
 					default:
 						pname = "Aggregate ???";
 						strategy = "???";
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 75e5bbf209..08024532a3 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -256,7 +256,10 @@
 #include "optimizer/optimizer.h"
 #include "parser/parse_agg.h"
 #include "parser/parse_coerce.h"
+#include "pgstat.h"
+#include "storage/barrier.h"
 #include "utils/acl.h"
+#include "utils/batchstore.h"
 #include "utils/builtins.h"
 #include "utils/datum.h"
 #include "utils/dynahash.h"
@@ -311,6 +314,11 @@
  */
 #define CHUNKHDRSZ 16
 
+#define SHARED_AGG_MAGIC		UINT64CONST(0x4141bbcd61518e52)
+#define SHARED_AGG_KEY_INFO		UINT64CONST(0xD000000000000001)
+#define SHARED_AGG_KEY_BARRIER	UINT64CONST(0xD000000000000002)
+#define SHARED_AGG_KEY_FILE_SET	UINT64CONST(0xD000000000000003)
+
 /*
  * Track all tapes needed for a HashAgg that spills. We don't know the maximum
  * number of tapes needed at the start of the algorithm (because it can
@@ -446,7 +454,7 @@ static HashAggBatch *hashagg_batch_new(LogicalTapeSet *tapeset,
 									   int input_tapenum, int setno,
 									   int64 input_tuples, double input_card,
 									   int used_bits);
-static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp);
+static bool hashagg_batch_read(void *userdata, TupleTableSlot *slot, uint32 *hashp);
 static void hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo,
 							   int used_bits, double input_groups,
 							   double hashentrysize);
@@ -473,6 +481,13 @@ static int	find_compatible_pertrans(AggState *aggstate, Aggref *newagg,
 									 Oid aggserialfn, Oid aggdeserialfn,
 									 Datum initValue, bool initValueIsNull,
 									 List *transnos);
+static TupleTableSlot *ExecBatchHashAggPrepare(PlanState *pstate);
+static TupleTableSlot *ExecBatchHashAgg(PlanState *pstate);
+static bool ExecBatchHashAggNextBatch(AggState *node);
+#if 0
+static void agg_batch_fill_hash_table(AggState *aggstate);
+static void ClearBatchAgg(AggState *node);
+#endif
 
 
 /*
@@ -1338,7 +1353,8 @@ finalize_aggregates(AggState *aggstate,
 		if (pertrans->numSortCols > 0)
 		{
 			Assert(aggstate->aggstrategy != AGG_HASHED &&
-				   aggstate->aggstrategy != AGG_MIXED);
+				   aggstate->aggstrategy != AGG_MIXED &&
+				   aggstate->aggstrategy != AGG_BATCH_HASH);
 
 			if (pertrans->numInputs == 1)
 				process_ordered_aggregate_single(aggstate,
@@ -1510,8 +1526,10 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 	MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
 	MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
 	Size		additionalsize;
+	bool		use_hash_iv;
 
 	Assert(aggstate->aggstrategy == AGG_HASHED ||
+		   aggstate->aggstrategy == AGG_BATCH_HASH ||
 		   aggstate->aggstrategy == AGG_MIXED);
 
 	/*
@@ -1521,6 +1539,10 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 	 * tuple of each group.
 	 */
 	additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData);
+	if (aggstate->aggstrategy == AGG_BATCH_HASH)
+		use_hash_iv = false;
+	else
+		use_hash_iv = DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit);
 
 	perhash->hashtable = BuildTupleHashTableExt(&aggstate->ss.ps,
 												perhash->hashslot->tts_tupleDescriptor,
@@ -1534,7 +1556,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 												metacxt,
 												hashcxt,
 												tmpcxt,
-												DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
+												use_hash_iv);
 }
 
 /*
@@ -1633,10 +1655,15 @@ find_hash_columns(AggState *aggstate)
 			palloc(maxCols * sizeof(AttrNumber));
 		perhash->hashGrpColIdxHash =
 			palloc(perhash->numCols * sizeof(AttrNumber));
+		perhash->colnos_needed = bms_copy(aggregated_colnos);
 
 		/* Add all the grouping columns to colnos */
 		for (i = 0; i < perhash->numCols; i++)
+		{
 			colnos = bms_add_member(colnos, grpColIdx[i]);
+			perhash->colnos_needed = bms_add_member(perhash->colnos_needed,
+													grpColIdx[i]);
+		}
 
 		/*
 		 * First build mapping for columns directly hashed. These are the
@@ -1746,9 +1773,11 @@ hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
 	int			j = nullcheck ? 1 : 0;
 
 	Assert(aggstate->aggstrategy == AGG_HASHED ||
+		   aggstate->aggstrategy == AGG_BATCH_HASH ||
 		   aggstate->aggstrategy == AGG_MIXED);
 
-	if (aggstate->aggstrategy == AGG_HASHED)
+	if (aggstate->aggstrategy == AGG_HASHED ||
+		aggstate->aggstrategy == AGG_BATCH_HASH)
 		phase = &aggstate->phases[0];
 	else						/* AGG_MIXED */
 		phase = &aggstate->phases[1];
@@ -2170,6 +2199,9 @@ ExecAgg(PlanState *pstate)
 			case AGG_SORTED:
 				result = agg_retrieve_direct(node);
 				break;
+			case AGG_BATCH_HASH:
+				elog(ERROR, "batch hash should not run in function ExecAgg");
+				break;
 		}
 
 		if (!TupIsNull(result))
@@ -2570,35 +2602,20 @@ agg_fill_hash_table(AggState *aggstate)
 						   &aggstate->perhash[0].hashiter);
 }
 
-/*
- * If any data was spilled during hash aggregation, reset the hash table and
- * reprocess one batch of spilled data. After reprocessing a batch, the hash
- * table will again contain data, ready to be consumed by
- * agg_retrieve_hash_table_in_memory().
- *
- * Should only be called after all in memory hash table entries have been
- * finalized and emitted.
- *
- * Return false when input is exhausted and there's no more work to be done;
- * otherwise return true.
- */
-static bool
-agg_refill_hash_table(AggState *aggstate)
+static void
+agg_refill_hash_table_ex(AggState *aggstate,
+						 bool (*read_tup)(void *userdata, TupleTableSlot *slot, uint32 *hash),
+						 void *userdata,
+						 int used_bits,
+						 double input_groups,
+						 int setno)
 {
-	HashAggBatch *batch;
 	AggStatePerHash perhash;
 	HashAggSpill spill;
-	HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo;
 	bool		spill_initialized = false;
 
-	if (aggstate->hash_batches == NIL)
-		return false;
-
-	batch = linitial(aggstate->hash_batches);
-	aggstate->hash_batches = list_delete_first(aggstate->hash_batches);
-
-	hash_agg_set_limits(aggstate->hashentrysize, batch->input_card,
-						batch->used_bits, &aggstate->hash_mem_limit,
+	hash_agg_set_limits(aggstate->hashentrysize, input_groups,
+						used_bits, &aggstate->hash_mem_limit,
 						&aggstate->hash_ngroups_limit, NULL);
 
 	/* there could be residual pergroup pointers; clear them */
@@ -2626,7 +2643,7 @@ agg_refill_hash_table(AggState *aggstate)
 		aggstate->phase = &aggstate->phases[aggstate->current_phase];
 	}
 
-	select_current_set(aggstate, batch->setno, true);
+	select_current_set(aggstate, setno, true);
 
 	perhash = &aggstate->perhash[aggstate->current_set];
 
@@ -2644,31 +2661,27 @@ agg_refill_hash_table(AggState *aggstate)
 		TupleTableSlot *spillslot = aggstate->hash_spill_rslot;
 		TupleTableSlot *hashslot = perhash->hashslot;
 		TupleHashEntry entry;
-		MinimalTuple tuple;
 		uint32		hash;
 		bool		isnew = false;
 		bool	   *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
 
 		CHECK_FOR_INTERRUPTS();
 
-		tuple = hashagg_batch_read(batch, &hash);
-		if (tuple == NULL)
+		if ((*read_tup)(userdata, spillslot, &hash) == false)
 			break;
 
-		ExecStoreMinimalTuple(tuple, spillslot, true);
 		aggstate->tmpcontext->ecxt_outertuple = spillslot;
 
 		prepare_hash_slot(perhash,
-						  aggstate->tmpcontext->ecxt_outertuple,
+						  spillslot,
 						  hashslot);
-		entry = LookupTupleHashEntryHash(
-										 perhash->hashtable, hashslot, p_isnew, hash);
+		entry = LookupTupleHashEntryHash(perhash->hashtable, hashslot, p_isnew, hash);
 
 		if (entry != NULL)
 		{
 			if (isnew)
 				initialize_hash_entry(aggstate, perhash->hashtable, entry);
-			aggstate->hash_pergroup[batch->setno] = entry->additional;
+			aggstate->hash_pergroup[setno] = entry->additional;
 			advance_aggregates(aggstate);
 		}
 		else
@@ -2680,13 +2693,13 @@ agg_refill_hash_table(AggState *aggstate)
 				 * that we don't assign tapes that will never be used.
 				 */
 				spill_initialized = true;
-				hashagg_spill_init(&spill, tapeinfo, batch->used_bits,
-								   batch->input_card, aggstate->hashentrysize);
+				hashagg_spill_init(&spill, aggstate->hash_tapeinfo, used_bits,
+								   input_groups, aggstate->hashentrysize);
 			}
 			/* no memory for a new group, spill */
 			hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
 
-			aggstate->hash_pergroup[batch->setno] = NULL;
+			aggstate->hash_pergroup[setno] = NULL;
 		}
 
 		/*
@@ -2696,15 +2709,13 @@ agg_refill_hash_table(AggState *aggstate)
 		ResetExprContext(aggstate->tmpcontext);
 	}
 
-	hashagg_tapeinfo_release(tapeinfo, batch->input_tapenum);
-
 	/* change back to phase 0 */
 	aggstate->current_phase = 0;
 	aggstate->phase = &aggstate->phases[aggstate->current_phase];
 
 	if (spill_initialized)
 	{
-		hashagg_spill_finish(aggstate, &spill, batch->setno);
+		hashagg_spill_finish(aggstate, &spill, setno);
 		hash_agg_update_metrics(aggstate, true, spill.npartitions);
 	}
 	else
@@ -2713,9 +2724,43 @@ agg_refill_hash_table(AggState *aggstate)
 	aggstate->hash_spill_mode = false;
 
 	/* prepare to walk the first hash table */
-	select_current_set(aggstate, batch->setno, true);
-	ResetTupleHashIterator(aggstate->perhash[batch->setno].hashtable,
-						   &aggstate->perhash[batch->setno].hashiter);
+	select_current_set(aggstate, setno, true);
+	ResetTupleHashIterator(aggstate->perhash[setno].hashtable,
+						   &aggstate->perhash[setno].hashiter);
+}
+
+/*
+ * If any data was spilled during hash aggregation, reset the hash table and
+ * reprocess one batch of spilled data. After reprocessing a batch, the hash
+ * table will again contain data, ready to be consumed by
+ * agg_retrieve_hash_table_in_memory().
+ *
+ * Should only be called after all in memory hash table entries have been
+ * finalized and emitted.
+ *
+ * Return false when input is exhausted and there's no more work to be done;
+ * otherwise return true.
+ */
+static bool
+agg_refill_hash_table(AggState *aggstate)
+{
+	HashAggBatch *batch;
+
+	if (aggstate->hash_batches == NIL)
+		return false;
+
+	batch = linitial(aggstate->hash_batches);
+	aggstate->hash_batches = list_delete_first(aggstate->hash_batches);
+
+	agg_refill_hash_table_ex(aggstate,
+							 hashagg_batch_read,
+							 batch,
+							 batch->used_bits,
+							 batch->input_card,
+							 batch->setno);
+
+	hashagg_tapeinfo_release(aggstate->hash_tapeinfo,
+							 batch->input_tapenum);
 
 	pfree(batch);
 
@@ -3056,9 +3101,10 @@ hashagg_batch_new(LogicalTapeSet *tapeset, int tapenum, int setno,
  * read_spilled_tuple
  * 		read the next tuple from a batch's tape.  Return NULL if no more.
  */
-static MinimalTuple
-hashagg_batch_read(HashAggBatch *batch, uint32 *hashp)
+static bool
+hashagg_batch_read(void *userdata, TupleTableSlot *slot, uint32 *hashp)
 {
+	HashAggBatch *batch = userdata;
 	LogicalTapeSet *tapeset = batch->tapeset;
 	int			tapenum = batch->input_tapenum;
 	MinimalTuple tuple;
@@ -3068,7 +3114,7 @@ hashagg_batch_read(HashAggBatch *batch, uint32 *hashp)
 
 	nread = LogicalTapeRead(tapeset, tapenum, &hash, sizeof(uint32));
 	if (nread == 0)
-		return NULL;
+		return false;
 	if (nread != sizeof(uint32))
 		ereport(ERROR,
 				(errcode_for_file_access(),
@@ -3096,7 +3142,8 @@ hashagg_batch_read(HashAggBatch *batch, uint32 *hashp)
 				 errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes",
 						tapenum, t_len - sizeof(uint32), nread)));
 
-	return tuple;
+	ExecStoreMinimalTuple(tuple, slot, true);
+	return true;
 }
 
 /*
@@ -3257,6 +3304,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	int			i = 0;
 	int			j = 0;
 	bool		use_hashing = (node->aggstrategy == AGG_HASHED ||
+							   node->aggstrategy == AGG_BATCH_HASH ||
 							   node->aggstrategy == AGG_MIXED);
 
 	/* check for unsupported flags */
@@ -3268,7 +3316,10 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	aggstate = makeNode(AggState);
 	aggstate->ss.ps.plan = (Plan *) node;
 	aggstate->ss.ps.state = estate;
-	aggstate->ss.ps.ExecProcNode = ExecAgg;
+	if (node->aggstrategy == AGG_BATCH_HASH)
+		aggstate->ss.ps.ExecProcNode = ExecBatchHashAggPrepare;
+	else
+		aggstate->ss.ps.ExecProcNode = ExecAgg;
 
 	aggstate->aggs = NIL;
 	aggstate->numaggs = 0;
@@ -3315,7 +3366,8 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 			 * additional AGG_HASHED aggs become part of phase 0, but all
 			 * others add an extra phase.
 			 */
-			if (agg->aggstrategy != AGG_HASHED)
+			if (agg->aggstrategy != AGG_HASHED &&
+				agg->aggstrategy != AGG_BATCH_HASH)
 				++numPhases;
 			else
 				++numHashes;
@@ -3362,7 +3414,8 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	 * If we are doing a hashed aggregation then the child plan does not need
 	 * to handle REWIND efficiently; see ExecReScanAgg.
 	 */
-	if (node->aggstrategy == AGG_HASHED)
+	if (node->aggstrategy == AGG_HASHED ||
+		node->aggstrategy == AGG_BATCH_HASH)
 		eflags &= ~EXEC_FLAG_REWIND;
 	outerPlan = outerPlan(node);
 	outerPlanState(aggstate) = ExecInitNode(outerPlan, estate, eflags);
@@ -3370,9 +3423,16 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	/*
 	 * initialize source tuple type.
 	 */
-	aggstate->ss.ps.outerops =
-		ExecGetResultSlotOps(outerPlanState(&aggstate->ss),
-							 &aggstate->ss.ps.outeropsfixed);
+	if (node->aggstrategy == AGG_BATCH_HASH)
+	{
+		aggstate->ss.ps.outerops = &TTSOpsMinimalTuple;
+		aggstate->ss.ps.outeropsfixed = true;
+	}else
+	{
+		aggstate->ss.ps.outerops =
+			ExecGetResultSlotOps(outerPlanState(&aggstate->ss),
+								 &aggstate->ss.ps.outeropsfixed);
+	}
 	aggstate->ss.ps.outeropsset = true;
 
 	ExecCreateScanSlotFromOuterPlan(estate, &aggstate->ss,
@@ -3470,6 +3530,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 		Assert(phase <= 1 || sortnode);
 
 		if (aggnode->aggstrategy == AGG_HASHED
+			|| aggnode->aggstrategy == AGG_BATCH_HASH
 			|| aggnode->aggstrategy == AGG_MIXED)
 		{
 			AggStatePerPhase phasedata = &aggstate->phases[0];
@@ -3678,7 +3739,8 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	 * hashing is being done too, then phase 0 is processed last); but if only
 	 * hashing is being done, then phase 0 is all there is.
 	 */
-	if (node->aggstrategy == AGG_HASHED)
+	if (node->aggstrategy == AGG_HASHED ||
+		node->aggstrategy == AGG_BATCH_HASH)
 	{
 		aggstate->current_phase = 0;
 		initialize_phase(aggstate, 0);
@@ -4066,7 +4128,8 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 			dohash = false;
 			dosort = true;
 		}
-		else if (phase->aggstrategy == AGG_HASHED)
+		else if (phase->aggstrategy == AGG_HASHED ||
+				 phase->aggstrategy == AGG_BATCH_HASH)
 		{
 			dohash = true;
 			dosort = false;
@@ -4666,6 +4729,14 @@ ExecReScanAgg(AggState *node)
 			return;
 		}
 	}
+	else if (node->aggstrategy == AGG_BATCH_HASH)
+	{
+		if (!node->batch_filled)
+			return;
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("batch hash not support rescan yet!")));
+	}
 
 	/* Make sure we have closed any open tuplesorts */
 	for (transno = 0; transno < node->numtrans; transno++)
@@ -4958,6 +5029,54 @@ aggregate_dummy(PG_FUNCTION_ARGS)
  * ----------------------------------------------------------------
  */
 
+static Size ExecAggEstimateToc(AggState *node, ParallelContext *pcxt)
+{
+	Size				size;
+	shm_toc_estimator	estimator;
+	ListCell		   *lc;
+
+	/* don't need this if no workers */
+	if (pcxt->nworkers == 0)
+		return 0;
+	/* don't need this if not instrumenting and not batch hash agg */
+	if (!node->ss.ps.instrument &&
+		node->aggstrategy != AGG_BATCH_HASH)
+		return 0;
+
+	shm_toc_initialize_estimator(&estimator);
+	if (node->ss.ps.instrument)
+	{
+		size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation));
+		size = add_size(size, offsetof(SharedAggInfo, sinstrument));
+		shm_toc_estimate_chunk(&estimator, size);
+		shm_toc_estimate_keys(&estimator, 1);
+	}
+
+	if (node->aggstrategy == AGG_BATCH_HASH)
+	{
+		int nparticipants = pcxt->nworkers + 1;
+		shm_toc_estimate_chunk(&estimator, sizeof(Barrier));
+		shm_toc_estimate_chunk(&estimator, sizeof(SharedFileSet));
+		shm_toc_estimate_keys(&estimator, 2);
+
+		size = bs_parallel_hash_estimate(castNode(Agg, node->ss.ps.plan)->numBatches,
+										 nparticipants);
+		shm_toc_estimate_chunk(&estimator, size);
+		shm_toc_estimate_keys(&estimator, 1);
+
+		foreach (lc, castNode(Agg, node->ss.ps.plan)->chain)
+		{
+			Agg *agg = lfirst_node(Agg, lc);
+			Assert(agg->aggstrategy == AGG_BATCH_HASH);
+			size = bs_parallel_hash_estimate(agg->numBatches, nparticipants);
+			shm_toc_estimate_chunk(&estimator, size);
+			shm_toc_estimate_keys(&estimator, 1);
+		}
+	}
+
+	return shm_toc_estimate(&estimator);
+}
+
  /* ----------------------------------------------------------------
   *		ExecAggEstimate
   *
@@ -4967,14 +5086,7 @@ aggregate_dummy(PG_FUNCTION_ARGS)
 void
 ExecAggEstimate(AggState *node, ParallelContext *pcxt)
 {
-	Size		size;
-
-	/* don't need this if not instrumenting or no workers */
-	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
-		return;
-
-	size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation));
-	size = add_size(size, offsetof(SharedAggInfo, sinstrument));
+	Size size = ExecAggEstimateToc(node, pcxt);
 	shm_toc_estimate_chunk(&pcxt->estimator, size);
 	shm_toc_estimate_keys(&pcxt->estimator, 1);
 }
@@ -4989,19 +5101,77 @@ void
 ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt)
 {
 	Size		size;
+	shm_toc	   *toc;
+	void	   *addr;
 
-	/* don't need this if not instrumenting or no workers */
-	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+	size = ExecAggEstimateToc(node, pcxt);
+	if (size == 0)
 		return;
 
-	size = offsetof(SharedAggInfo, sinstrument)
-		+ pcxt->nworkers * sizeof(AggregateInstrumentation);
-	node->shared_info = shm_toc_allocate(pcxt->toc, size);
-	/* ensure any unfilled slots will contain zeroes */
-	memset(node->shared_info, 0, size);
-	node->shared_info->num_workers = pcxt->nworkers;
-	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
-				   node->shared_info);
+	addr = shm_toc_allocate(pcxt->toc, size);
+	toc = shm_toc_create(SHARED_AGG_MAGIC, addr, size);
+	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, addr);
+
+	if (node->ss.ps.instrument)
+	{
+		size = offsetof(SharedAggInfo, sinstrument)
+			+ pcxt->nworkers * sizeof(AggregateInstrumentation);
+		node->shared_info = shm_toc_allocate(toc, size);
+		/* ensure any unfilled slots will contain zeroes */
+		memset(node->shared_info, 0, size);
+		node->shared_info->num_workers = pcxt->nworkers;
+		shm_toc_insert(toc, SHARED_AGG_KEY_INFO, node->shared_info);
+	}
+
+	if (node->aggstrategy == AGG_BATCH_HASH)
+	{
+		int				nparticipants = pcxt->nworkers + 1;
+		int				i = 0;
+		ListCell	   *lc;
+		Agg			   *agg;
+		SharedFileSet  *fset = shm_toc_allocate(toc, sizeof(SharedFileSet));
+		SharedFileSetInit(fset, pcxt->seg);
+		shm_toc_insert(toc, SHARED_AGG_KEY_FILE_SET, fset);
+
+		node->batch_barrier = shm_toc_allocate(toc, sizeof(Barrier));
+		BarrierInit(node->batch_barrier, 0);
+		shm_toc_insert(toc, SHARED_AGG_KEY_BARRIER, node->batch_barrier);
+
+		agg = castNode(Agg, node->ss.ps.plan);
+		Assert(agg->numBatches > 0);
+		size = bs_parallel_hash_estimate(agg->numBatches, nparticipants);
+		addr = shm_toc_allocate(toc, size);
+		shm_toc_insert(toc, 0, addr);
+		node->perhash[0].batch_store = bs_init_parallel_hash(agg->numBatches,
+															 nparticipants,
+															 0,
+															 addr,
+															 pcxt->seg,
+															 fset,
+															 "BatchHashAgg");
+
+		i = 1;
+		foreach (lc, agg->chain)
+		{
+			Agg	   *subagg = lfirst_node(Agg, lc);
+			char	name[30];
+			Assert(subagg->aggstrategy == AGG_BATCH_HASH &&
+				   subagg->numBatches > 0);
+			Assert(i < node->num_hashes);
+			size = bs_parallel_hash_estimate(subagg->numBatches, nparticipants);
+			addr = shm_toc_allocate(toc, size);
+			shm_toc_insert(toc, i, addr);
+			sprintf(name, "BatchHashAgg%d", i);
+			node->perhash[i].batch_store = bs_init_parallel_hash(subagg->numBatches,
+																 nparticipants,
+																 0,
+																 addr,
+																 pcxt->seg,
+																 fset,
+																 name);
+			++i;
+		}
+	}
 }
 
 /* ----------------------------------------------------------------
@@ -5013,8 +5183,43 @@ ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt)
 void
 ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt)
 {
-	node->shared_info =
-		shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+	shm_toc	   *toc;
+	void	   *addr = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+	if (addr == NULL)
+	{
+		Assert(node->aggstrategy != AGG_BATCH_HASH);
+		return;
+	}
+	toc = shm_toc_attach(SHARED_AGG_MAGIC, addr);
+	node->shared_info = shm_toc_lookup(toc, SHARED_AGG_KEY_INFO, true);
+
+	if (node->aggstrategy == AGG_BATCH_HASH)
+	{
+		int				i;
+		ListCell	   *lc;
+		Agg			   *agg = castNode(Agg, node->ss.ps.plan);
+		SharedFileSet  *fset = shm_toc_lookup(toc, SHARED_AGG_KEY_FILE_SET, false);
+
+		node->batch_barrier = shm_toc_lookup(toc, SHARED_AGG_KEY_BARRIER, false);
+		node->perhash[0].batch_store =
+			bs_attach_parallel_hash(shm_toc_lookup(toc, 0, false),
+									pwcxt->seg,
+									fset,
+									ParallelWorkerNumber+1);
+
+		i = 1;
+		foreach (lc, agg->chain)
+		{
+			Assert(lfirst_node(Agg, lc)->aggstrategy == AGG_BATCH_HASH);
+			Assert (i<node->num_hashes);
+			node->perhash[i].batch_store =
+				bs_attach_parallel_hash(shm_toc_lookup(toc, i, false),
+										pwcxt->seg,
+										fset,
+										ParallelWorkerNumber+1);
+			++i;
+		}
+	}
 }
 
 /* ----------------------------------------------------------------
@@ -5038,3 +5243,185 @@ ExecAggRetrieveInstrumentation(AggState *node)
 	memcpy(si, node->shared_info, size);
 	node->shared_info = si;
 }
+
+static TupleTableSlot *ExecBatchHashAggPrepare(PlanState *pstate)
+{
+	int				i,x,max_colno_needed;
+	MinimalTuple	mtup;
+	TupleTableSlot *inputslot;
+	PlanState	   *outer = outerPlanState(pstate);
+	AggState	   *node = castNode(AggState, pstate);
+	ExprContext	   *tmpcontext = node->tmpcontext;
+	bool		   *isnull;
+	Bitmapset	   *colnos_needed;
+	Bitmapset	  **colnos_neededs;
+	Assert(node->aggstrategy == AGG_BATCH_HASH);
+	Assert(node->perhash[0].batch_store == NULL ||
+		   node->batch_barrier != NULL);
+
+	if (node->agg_done)
+		return NULL;
+
+	/* create batch store if not parallel */
+	if (node->perhash[0].batch_store == NULL)
+	{
+		MemoryContext	oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(pstate));
+		Agg			   *agg = castNode(Agg, pstate->plan);
+		ListCell	   *lc;
+
+		node->perhash[0].batch_store = bs_begin_hash(agg->numBatches);
+
+		i = 1;
+		foreach (lc, agg->chain)
+		{
+			Agg *subagg = lfirst_node(Agg, lc);
+			Assert(subagg->aggstrategy == AGG_BATCH_HASH);
+			Assert(i < node->num_hashes);
+			node->perhash[i].batch_store = bs_begin_hash(subagg->numBatches);
+			++i;
+		}
+
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	if (node->batch_barrier &&
+		BarrierAttach(node->batch_barrier) > 0)
+	{
+		BarrierDetach(node->batch_barrier);
+		goto batches_already_done_;
+	}
+
+	/* read for make minimal tuple */
+	isnull = palloc(sizeof(isnull[0]) * node->hash_spill_wslot->tts_tupleDescriptor->natts);
+	memset(isnull, true, sizeof(isnull[0]) * node->hash_spill_wslot->tts_tupleDescriptor->natts);
+	max_colno_needed = node->max_colno_needed;
+
+	/* convert Attribute numbers to index(start with 0) */
+	colnos_neededs = palloc(sizeof(colnos_neededs[0]) * node->num_hashes);
+	for (i=0;i<node->num_hashes;++i)
+	{
+		AggStatePerHash	perhash = &node->perhash[i];
+		colnos_needed = NULL;
+		x = -1;
+		while ((x=bms_next_member(perhash->colnos_needed, x)) >= 0)
+		{
+			Assert(x > 0);
+			colnos_needed = bms_add_member(colnos_needed, x-1);
+		}
+		colnos_neededs[i] = colnos_needed;
+	}
+
+	for (;;)
+	{
+		CHECK_FOR_INTERRUPTS();
+		inputslot = ExecProcNode(outer);
+		if (TupIsNull(inputslot))
+			break;
+
+		tmpcontext->ecxt_outertuple = inputslot;
+		slot_getsomeattrs(inputslot, max_colno_needed);
+
+		for (i=0;i<node->num_hashes;++i)
+		{
+			AggStatePerHash	perhash = &node->perhash[i];
+			TupleTableSlot *hashslot = perhash->hashslot;
+
+			CHECK_FOR_INTERRUPTS();
+
+			/* mark unneeded columns as null */
+			memset(isnull, true, sizeof(isnull[0]) * max_colno_needed);
+			colnos_needed = colnos_neededs[i];
+			x = -1;
+			while ((x = bms_next_member(colnos_needed, x)) >= 0)
+				isnull[x] = inputslot->tts_isnull[x];
+			/* make minimal tuple from we needed columns for this set */
+			mtup = heap_form_minimal_tuple(inputslot->tts_tupleDescriptor,
+										   inputslot->tts_values,
+										   isnull);
+
+			prepare_hash_slot(perhash, inputslot, hashslot);
+
+			bs_write_hash(perhash->batch_store,
+						  mtup,
+						  TupleHashTableHash(perhash->hashtable, hashslot));
+			pfree(mtup);
+			ResetExprContext(tmpcontext);
+		}
+	}
+
+	for (i=0;i<node->num_hashes;++i)
+		bs_end_write(node->perhash[i].batch_store);
+	if (node->batch_barrier)
+	{
+		BarrierArriveAndWait(node->batch_barrier, WAIT_EVENT_BATCH_HASH_BUILD);
+		BarrierDetach(node->batch_barrier);
+	}
+
+	/* clear temp memory */
+	for (i=0;i<node->num_hashes;++i)
+		bms_free(colnos_neededs[i]);
+	pfree(colnos_neededs);
+	pfree(isnull);
+
+batches_already_done_:
+	node->batch_filled = true;
+	node->current_batch = 0;
+	if (ExecBatchHashAggNextBatch(node) == false)
+		return NULL;
+
+	ExecSetExecProcNode(pstate, ExecBatchHashAgg);
+	return ExecBatchHashAgg(pstate);
+}
+
+static TupleTableSlot *ExecBatchHashAgg(PlanState *pstate)
+{
+	AggState	   *node = castNode(AggState, pstate);
+	TupleTableSlot *result;
+
+reloop:
+	result = agg_retrieve_hash_table_in_memory(node);
+	if (unlikely(result == NULL))
+	{
+		if (agg_refill_hash_table(node) == false &&
+			ExecBatchHashAggNextBatch(node) == false)
+		{
+			return NULL;
+		}else
+		{
+			goto reloop;
+		}
+	}
+
+	return result;
+}
+
+static bool
+batchstore_read(void *userdata, TupleTableSlot *slot, uint32 *hashp)
+{
+	MinimalTuple mtup = bs_read_hash(userdata, hashp);
+	if (unlikely(mtup == NULL))
+		return false;
+	ExecStoreMinimalTuple(mtup, slot, false);
+	return true;
+}
+
+static bool ExecBatchHashAggNextBatch(AggState *node)
+{
+	while (bs_next_batch(node->perhash[node->current_batch].batch_store, false) == false)
+	{
+		++node->current_batch;
+		if (node->current_batch >= node->num_hashes)
+		{
+			node->agg_done = true;
+			return false;
+		}
+	}
+
+	agg_refill_hash_table_ex(node,
+							 batchstore_read,
+							 node->perhash[node->current_batch].batch_store,
+							 0,
+							 node->perhash[node->current_batch].aggnode->numGroups,
+							 node->current_batch);
+	return true;
+}
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 958964f1fa..8649e7d610 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -1042,6 +1042,7 @@ _copyAgg(const Agg *from)
 	COPY_BITMAPSET_FIELD(aggParams);
 	COPY_NODE_FIELD(groupingSets);
 	COPY_NODE_FIELD(chain);
+	COPY_SCALAR_FIELD(numBatches);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index a8dd7ef23f..8893bfab29 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -787,6 +787,7 @@ _outAgg(StringInfo str, const Agg *node)
 	WRITE_BITMAPSET_FIELD(aggParams);
 	WRITE_NODE_FIELD(groupingSets);
 	WRITE_NODE_FIELD(chain);
+	WRITE_UINT_FIELD(numBatches);
 }
 
 static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 2c6eb4362c..03ef6bc5da 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -2249,6 +2249,7 @@ _readAgg(void)
 	READ_BITMAPSET_FIELD(aggParams);
 	READ_NODE_FIELD(groupingSets);
 	READ_NODE_FIELD(chain);
+	READ_UINT_FIELD(numBatches);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 32d0dc8ce5..4143b69178 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -141,6 +141,7 @@ bool		enable_parallel_append = true;
 bool		enable_parallel_hash = true;
 bool		enable_partition_pruning = true;
 bool		enable_batch_sort = true;
+bool		enable_batch_hashagg = false;
 
 typedef struct
 {
@@ -2404,7 +2405,7 @@ cost_agg(Path *path, PlannerInfo *root,
 	/* Use all-zero per-aggregate costs if NULL is passed */
 	if (aggcosts == NULL)
 	{
-		Assert(aggstrategy == AGG_HASHED);
+		Assert(aggstrategy == AGG_HASHED || aggstrategy == AGG_BATCH_HASH);
 		MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
 		aggcosts = &dummy_aggcosts;
 	}
@@ -2463,10 +2464,13 @@ cost_agg(Path *path, PlannerInfo *root,
 	}
 	else
 	{
-		/* must be AGG_HASHED */
+		/* must be AGG_HASHED or AGG_BATCH_HASH */
 		startup_cost = input_total_cost;
-		if (!enable_hashagg)
+		if ((aggstrategy == AGG_HASHED && !enable_hashagg) ||
+			(aggstrategy == AGG_BATCH_HASH && !enable_batch_hashagg))
+		{
 			startup_cost += disable_cost;
+		}
 		startup_cost += aggcosts->transCost.startup;
 		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
 		/* cost of computing hash value */
@@ -2478,6 +2482,15 @@ cost_agg(Path *path, PlannerInfo *root,
 		/* cost of retrieving from hash table */
 		total_cost += cpu_tuple_cost * numGroups;
 		output_tuples = numGroups;
+
+		if (aggstrategy == AGG_BATCH_HASH)
+		{
+			double	nbytes = relation_byte_size(input_tuples, input_width);
+			double	npages = ceil(nbytes / BLCKSZ);
+			double	material_cost = (seq_page_cost * npages);
+			startup_cost += material_cost;
+			total_cost += material_cost;
+		}
 	}
 
 	/*
@@ -2493,7 +2506,9 @@ cost_agg(Path *path, PlannerInfo *root,
 	 * Accrue writes (spilled tuples) to startup_cost and to total_cost;
 	 * accrue reads only to total_cost.
 	 */
-	if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED)
+	if (aggstrategy == AGG_HASHED ||
+		aggstrategy == AGG_BATCH_HASH ||
+		aggstrategy == AGG_MIXED)
 	{
 		double		pages;
 		double		pages_written = 0.0;
@@ -2506,6 +2521,14 @@ cost_agg(Path *path, PlannerInfo *root,
 		int			num_partitions;
 		int			depth;
 
+		if (aggstrategy == AGG_BATCH_HASH &&
+			numGroups > BATCH_STORE_MAX_BATCH)
+		{
+			numGroups /= BATCH_STORE_MAX_BATCH;
+			if (numGroups < 1.0)
+				numGroups = 1.0;
+		}
+
 		/*
 		 * Estimate number of batches based on the computed limits. If less
 		 * than or equal to one, all groups are expected to fit in memory;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 85969388c2..a87dd633dc 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -30,6 +30,7 @@
 #include "optimizer/cost.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paramassign.h"
+#include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "optimizer/placeholder.h"
 #include "optimizer/plancat.h"
@@ -2327,7 +2328,9 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 			if (!rollup->is_hashed)
 				is_first_sort = false;
 
-			if (rollup->is_hashed)
+			if (best_path->aggstrategy == AGG_BATCH_HASH)
+				strat = AGG_BATCH_HASH;
+			else if (rollup->is_hashed)
 				strat = AGG_HASHED;
 			else if (list_length(linitial(rollup->gsets)) == 0)
 				strat = AGG_PLAIN;
@@ -6417,6 +6420,13 @@ make_agg(List *tlist, List *qual,
 	plan->lefttree = lefttree;
 	plan->righttree = NULL;
 
+	if (aggstrategy == AGG_BATCH_HASH)
+	{
+		node->numBatches = (int32)numGroups;
+		if (node->numBatches > BATCH_STORE_MAX_BATCH)
+			node->numBatches = BATCH_STORE_MAX_BATCH;
+	}
+
 	return node;
 }
 
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 27680dbeb3..5d854b26b5 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -176,6 +176,12 @@ static void consider_groupingsets_paths(PlannerInfo *root,
 										grouping_sets_data *gd,
 										const AggClauseCosts *agg_costs,
 										double dNumGroups);
+static void consider_parallel_hash_groupingsets_paths(PlannerInfo *root,
+													  RelOptInfo *grouped_rel,
+													  Path *path,
+													  grouping_sets_data *gd,
+													  const AggClauseCosts *agg_costs,
+													  double dNumGroups);
 static RelOptInfo *create_window_paths(PlannerInfo *root,
 									   RelOptInfo *input_rel,
 									   PathTarget *input_target,
@@ -4538,6 +4544,77 @@ consider_groupingsets_paths(PlannerInfo *root,
 										  dNumGroups));
 }
 
+static void
+consider_parallel_hash_groupingsets_paths(PlannerInfo *root,
+										  RelOptInfo *grouped_rel,
+										  Path *path,
+										  grouping_sets_data *gd,
+										  const AggClauseCosts *agg_costs,
+										  double dNumGroups)
+{
+	int			hash_mem = get_hash_mem();
+	List	   *new_rollups = NIL;
+	List	   *sets_data;
+	ListCell   *lc;
+	RollupData *rollup;
+	GroupingSetData *gs;
+	double		hashsize;
+	double		numGroups;
+
+	sets_data = list_copy(gd->unsortable_sets);
+	foreach (lc, gd->rollups)
+	{
+		rollup = lfirst_node(RollupData, lc);
+		if (rollup->hashable == false)
+		{
+			list_free(sets_data);
+			return;
+		}
+		sets_data = list_concat(sets_data, rollup->gsets_data);
+	}
+	foreach (lc, sets_data)
+	{
+		gs = lfirst_node(GroupingSetData, lc);
+		numGroups = gs->numGroups / BATCH_STORE_MAX_BATCH;
+		if (numGroups < 1.0)
+			numGroups = 1.0;
+		hashsize = estimate_hashagg_tablesize(path,
+											  agg_costs,
+											  numGroups);
+		if (hashsize > hash_mem * 1024L)
+		{
+			list_free(sets_data);
+			list_free_deep(new_rollups);
+			return;
+		}
+
+		rollup = makeNode(RollupData);
+		rollup->groupClause = preprocess_groupclause(root, gs->set);
+		rollup->gsets_data = list_make1(gs);
+		rollup->gsets = remap_to_groupclause_idx(rollup->groupClause,
+												 rollup->gsets_data,
+												 gd->tleref_to_colnum_map);
+		rollup->numGroups = gs->numGroups;
+		rollup->hashable = true;
+		rollup->is_hashed = true;
+		new_rollups = lappend(new_rollups, rollup);
+	}
+
+	numGroups = dNumGroups / path->parallel_workers;
+	if (numGroups < list_length(new_rollups))
+		numGroups = list_length(new_rollups);
+	path = (Path*)create_groupingsets_path(root,
+										   grouped_rel,
+										   path,
+										   (List*) root->parse->havingQual,
+										   AGG_BATCH_HASH,
+										   new_rollups,
+										   agg_costs,
+										   numGroups);
+	path->parallel_aware = true;
+	add_partial_path(grouped_rel, path);
+}
+
 /*
  * create_window_paths
  *
@@ -4952,6 +5029,30 @@ create_distinct_paths(PlannerInfo *root,
 								 NIL,
 								 NULL,
 								 numDistinctRows));
+#if 1
+		/* Generate parallel batch hashed aggregate path */
+		if (distinct_rel->consider_parallel &&
+			input_rel->partial_pathlist != NIL &&
+			numDistinctRows > 1.0)
+		{
+			Path *path = linitial(input_rel->partial_pathlist);
+			double numRows = numDistinctRows / path->parallel_workers;
+			if (numRows < 1.0)
+				numRows = 1.0;
+			path = (Path *)create_agg_path(root,
+										   distinct_rel,
+										   path,
+										   path->pathtarget,
+										   AGG_BATCH_HASH,
+										   AGGSPLIT_SIMPLE,
+										   parse->distinctClause,
+										   NIL,
+										   NULL,
+										   numRows);
+			path->parallel_aware = true;
+			add_partial_path(distinct_rel, path);
+		}
+#endif
 	}
 
 	generate_useful_gather_paths(root, distinct_rel, false);
@@ -6874,6 +6975,14 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			consider_groupingsets_paths(root, grouped_rel,
 										cheapest_path, false, true,
 										gd, agg_costs, dNumGroups);
+			if (grouped_rel->consider_parallel &&
+				input_rel->partial_pathlist != NIL)
+				consider_parallel_hash_groupingsets_paths(root,
+														  grouped_rel,
+														  linitial(input_rel->partial_pathlist),
+														  gd,
+														  agg_costs,
+														  dNumGroups);
 		}
 		else
 		{
@@ -6891,6 +7000,28 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 									 havingQual,
 									 agg_costs,
 									 dNumGroups));
+
+			if (grouped_rel->consider_parallel &&
+				input_rel->partial_pathlist != NIL &&
+				dNumGroups >= 2.0)
+			{
+				Path   *path = linitial(input_rel->partial_pathlist);
+				double	numGroups = dNumGroups / path->parallel_workers;
+				if (numGroups < 1.0)
+					numGroups = 1.0;
+				path = (Path*)create_agg_path(root,
+											  grouped_rel,
+											  path,
+											  grouped_rel->reltarget,
+											  AGG_BATCH_HASH,
+											  AGGSPLIT_SIMPLE,
+											  parse->groupClause,
+											  havingQual,
+											  agg_costs,
+											  numGroups);
+				path->parallel_aware = true;
+				add_partial_path(grouped_rel, path);
+			}
 		}
 
 		/*
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index fa1053f077..efe438524c 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -714,6 +714,33 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 															partial_path->rows);
 			add_partial_path(result_rel, partial_path);
 		}
+
+		/* create parallel batch hashed union */
+		if (!op->all &&
+			ppath->rows > 1.0 &&
+			grouping_is_hashable(groupList))
+		{
+			Path   *partial_path;
+			double	dNumGroups = ppath->rows / ppath->parallel_workers;
+			if (dNumGroups < 1.0)
+				dNumGroups = 1.0;
+			if (ppath->pathtarget->width * dNumGroups <= get_hash_mem() * 1024L)
+			{
+				partial_path = (Path*)create_agg_path(root,
+													  result_rel,
+													  ppath,
+													  create_pathtarget(root, tlist),
+													  AGG_BATCH_HASH,
+													  AGGSPLIT_SIMPLE,
+													  groupList,
+													  NIL,
+													  NULL,
+													  dNumGroups);
+				partial_path->parallel_aware = true;
+				add_partial_path(result_rel, partial_path);
+			}
+		}
+
 		ppath = (Path *)
 			create_gather_path(root, result_rel, ppath,
 							   result_rel->reltarget, NULL, NULL);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index cacb7d13e6..04113bae7d 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4024,6 +4024,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
 		case WAIT_EVENT_BATCH_SORT_BUILD:
 			event_name = "Batch/Sort/Building";
 			break;
+		case WAIT_EVENT_BATCH_HASH_BUILD:
+			event_name = "Batch/Hash/Building";
+			break;
 			/* no default case, so that compiler will warn */
 	}
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 43a4e36d78..ed2a4369eb 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -996,6 +996,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_batch_hashagg", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("enable batch hash agg method"),
+			NULL
+		},
+		&enable_batch_hashagg,
+		false,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_incremental_sort", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of incremental sort steps."),
diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h
index b955169538..c6968c8301 100644
--- a/src/include/executor/nodeAgg.h
+++ b/src/include/executor/nodeAgg.h
@@ -310,6 +310,8 @@ typedef struct AggStatePerHashData
 	int			largestGrpColIdx;	/* largest col required for hashing */
 	AttrNumber *hashGrpColIdxInput; /* hash col indices in input slot */
 	AttrNumber *hashGrpColIdxHash;	/* indices in hash table tuples */
+	Bitmapset  *colnos_needed;	/* all columns needed from the outer plan */
+	struct BatchStoreData *batch_store;	/* grouping set batch store hash */
 	Agg		   *aggnode;		/* original Agg node, for numGroups etc. */
 }			AggStatePerHashData;
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 14dde9fca3..ac53d0723e 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2218,6 +2218,9 @@ typedef struct AggState
 										 * ->hash_pergroup */
 	ProjectionInfo *combinedproj;	/* projection machinery */
 	SharedAggInfo *shared_info; /* one entry per worker */
+	struct Barrier *batch_barrier;		/* for parallel batch */
+	int			current_batch;
+	bool		batch_filled;
 } AggState;
 
 /* ----------------
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index ace4c98939..1b3365c241 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -762,7 +762,8 @@ typedef enum AggStrategy
 	AGG_PLAIN,					/* simple agg across all input rows */
 	AGG_SORTED,					/* grouped agg, input must be sorted */
 	AGG_HASHED,					/* grouped agg, use internal hashtable */
-	AGG_MIXED					/* grouped agg, hash and sort both used */
+	AGG_MIXED,					/* grouped agg, hash and sort both used */
+	AGG_BATCH_HASH				/* grouped agg, use batch hash */
 } AggStrategy;
 
 /*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index f7ad7881dc..941eb3a23b 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -840,6 +840,7 @@ typedef struct Agg
 	/* Note: planner provides numGroups & aggParams only in HASHED/MIXED case */
 	List	   *groupingSets;	/* grouping sets to use */
 	List	   *chain;			/* chained Agg/Sort nodes */
+	uint32		numBatches;	/* valid in HASHED */
 } Agg;
 
 /* ----------------
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 37e6a12a6f..dc0cd825f0 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -54,6 +54,7 @@ extern PGDLLIMPORT bool enable_bitmapscan;
 extern PGDLLIMPORT bool enable_tidscan;
 extern PGDLLIMPORT bool enable_sort;
 extern PGDLLIMPORT bool enable_batch_sort;
+extern PGDLLIMPORT bool enable_batch_hashagg;
 extern PGDLLIMPORT bool enable_incremental_sort;
 extern PGDLLIMPORT bool enable_hashagg;
 extern PGDLLIMPORT bool enable_nestloop;
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 816fc37739..0e8eb26111 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -19,6 +19,8 @@
 
 #define BATCH_SORT_MIN_BATCHES		2
 #define BATCH_SORT_MAX_BATCHES		512
+#define BATCH_STORE_MIN_BATCH		2
+#define BATCH_STORE_MAX_BATCH		1024
 
 /*
  * prototypes for pathnode.c
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index f0b6dae97b..2826a0b38c 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -953,7 +953,8 @@ typedef enum
 	WAIT_EVENT_SAFE_SNAPSHOT,
 	WAIT_EVENT_SYNC_REP,
 	WAIT_EVENT_XACT_GROUP_UPDATE,
-	WAIT_EVENT_BATCH_SORT_BUILD
+	WAIT_EVENT_BATCH_SORT_BUILD,
+	WAIT_EVENT_BATCH_HASH_BUILD
 } WaitEventIPC;
 
 /* ----------
diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out
index 701d52b465..bcac70894f 100644
--- a/src/test/regress/expected/groupingsets.out
+++ b/src/test/regress/expected/groupingsets.out
@@ -1739,4 +1739,69 @@ set work_mem to default;
 
 drop table gs_group_1;
 drop table gs_hash_1;
+-- parallel grouping sets
+BEGIN;
+set enable_batch_hashagg = on;
+set min_parallel_table_scan_size = 0;
+set parallel_setup_cost = 10;
+explain (costs off)
+select sum(unique1),count(unique1),two,four,ten,twenty from tenk1 group by grouping sets(two,four,ten,(two,twenty),()) order by 3,4,5,6;
+                  QUERY PLAN                  
+----------------------------------------------
+ Gather Merge
+   Workers Planned: 2
+   ->  Sort
+         Sort Key: two, four, ten, twenty
+         ->  Parallel BatchHashAggregate
+               Group Key: two, twenty
+               Group Key: two
+               Group Key: ()
+               Group Key: four
+               Group Key: ten
+               ->  Parallel Seq Scan on tenk1
+(11 rows)
+
+select sum(unique1),count(unique1),two,four,ten,twenty from tenk1 group by grouping sets(two,four,ten,(two,twenty),()) order by 3,4,5,6;
+   sum    | count | two | four | ten | twenty 
+----------+-------+-----+------+-----+--------
+  2495000 |   500 |   0 |      |     |      0
+  2496000 |   500 |   0 |      |     |      2
+  2497000 |   500 |   0 |      |     |      4
+  2498000 |   500 |   0 |      |     |      6
+  2499000 |   500 |   0 |      |     |      8
+  2500000 |   500 |   0 |      |     |     10
+  2501000 |   500 |   0 |      |     |     12
+  2502000 |   500 |   0 |      |     |     14
+  2503000 |   500 |   0 |      |     |     16
+  2504000 |   500 |   0 |      |     |     18
+ 24995000 |  5000 |   0 |      |     |       
+  2495500 |   500 |   1 |      |     |      1
+  2496500 |   500 |   1 |      |     |      3
+  2497500 |   500 |   1 |      |     |      5
+  2498500 |   500 |   1 |      |     |      7
+  2499500 |   500 |   1 |      |     |      9
+  2500500 |   500 |   1 |      |     |     11
+  2501500 |   500 |   1 |      |     |     13
+  2502500 |   500 |   1 |      |     |     15
+  2503500 |   500 |   1 |      |     |     17
+  2504500 |   500 |   1 |      |     |     19
+ 25000000 |  5000 |   1 |      |     |       
+ 12495000 |  2500 |     |    0 |     |       
+ 12497500 |  2500 |     |    1 |     |       
+ 12500000 |  2500 |     |    2 |     |       
+ 12502500 |  2500 |     |    3 |     |       
+  4995000 |  1000 |     |      |   0 |       
+  4996000 |  1000 |     |      |   1 |       
+  4997000 |  1000 |     |      |   2 |       
+  4998000 |  1000 |     |      |   3 |       
+  4999000 |  1000 |     |      |   4 |       
+  5000000 |  1000 |     |      |   5 |       
+  5001000 |  1000 |     |      |   6 |       
+  5002000 |  1000 |     |      |   7 |       
+  5003000 |  1000 |     |      |   8 |       
+  5004000 |  1000 |     |      |   9 |       
+ 49995000 | 10000 |     |      |     |       
+(37 rows)
+
+ABORT;
 -- end
diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out
index b187c1080b..ba327d7583 100644
--- a/src/test/regress/expected/partition_aggregate.out
+++ b/src/test/regress/expected/partition_aggregate.out
@@ -1523,6 +1523,7 @@ SET min_parallel_table_scan_size = 0;
 SET parallel_tuple_cost = 0;
 SET parallel_setup_cost = 0;
 SET enable_indexonlyscan = OFF;
+-- using batch sort
 EXPLAIN (COSTS OFF)
 SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
                   QUERY PLAN                  
@@ -1588,4 +1589,67 @@ SELECT count(*) FROM
  10000
 (1 row)
 
+-- using batch hash
+SET enable_batch_sort = OFF;
+SET enable_batch_hashagg = ON;
+EXPLAIN (COSTS OFF)
+SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
+               QUERY PLAN               
+----------------------------------------
+ Gather
+   Workers Planned: 2
+   ->  Parallel BatchHashAggregate
+         Group Key: unique2
+         ->  Parallel Seq Scan on tenk1
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+                  QUERY PLAN                  
+----------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Parallel BatchHashAggregate
+               Group Key: tenk1.unique2
+               ->  Parallel Seq Scan on tenk1
+(6 rows)
+
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  Parallel Hash Join
+                     Hash Cond: ((count(*)) = tenk1.unique2)
+                     ->  Parallel BatchHashAggregate
+                           Group Key: tenk1_1.unique2
+                           ->  Parallel Seq Scan on tenk1 tenk1_1
+                     ->  Parallel Hash
+                           ->  Parallel Seq Scan on tenk1
+(11 rows)
+
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+ count 
+-------
+ 10000
+(1 row)
+
 ABORT;
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index c200e38d12..8c7f3381be 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -313,6 +313,7 @@ SET min_parallel_table_scan_size =0;
 SET parallel_tuple_cost = 0;
 SET parallel_setup_cost = 0;
 SET enable_indexonlyscan = OFF;
+-- using batch sort
 EXPLAIN (costs off)
 SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
                      QUERY PLAN                     
@@ -347,4 +348,38 @@ SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
                            ->  Parallel Seq Scan on tenk1
 (9 rows)
 
+-- using batch hash
+SET enable_batch_sort = OFF;
+SET enable_batch_hashagg = ON;
+EXPLAIN (costs off)
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+                  QUERY PLAN                  
+----------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Parallel BatchHashAggregate
+               Group Key: tenk1.unique2
+               ->  Parallel Seq Scan on tenk1
+(6 rows)
+
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+explain (costs off)
+SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+                  QUERY PLAN                  
+----------------------------------------------
+ Gather
+   Workers Planned: 2
+   ->  Parallel BatchHashAggregate
+         Group Key: tenk1.unique2
+         ->  Parallel BatchHashAggregate
+               Group Key: tenk1.unique2
+               ->  Parallel Seq Scan on tenk1
+(7 rows)
+
 ABORT;
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 8ed047e520..a7219644a8 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -88,6 +88,7 @@ select count(*) = 1 as ok from pg_stat_wal;
 select name, setting from pg_settings where name like 'enable%';
               name              | setting 
 --------------------------------+---------
+ enable_batch_hashagg           | off
  enable_batch_sort              | off
  enable_bitmapscan              | on
  enable_gathermerge             | on
@@ -107,7 +108,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(19 rows)
+(20 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index 5a2be9aec9..519a50bb20 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -1059,6 +1059,7 @@ SET min_parallel_table_scan_size =0;
 SET parallel_tuple_cost = 0;
 SET parallel_setup_cost = 0;
 SET enable_indexonlyscan = OFF;
+-- using batch sort
 EXPLAIN (costs off)
 SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
                             QUERY PLAN                            
@@ -1106,4 +1107,33 @@ SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1)
  10000
 (1 row)
 
+-- using batch hash
+SET enable_batch_sort = OFF;
+SET enable_batch_hashagg = ON;
+EXPLAIN (costs off)
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Parallel BatchHashAggregate
+               Group Key: tenk1.hundred
+               ->  Parallel Append
+                     ->  Parallel Seq Scan on tenk1
+                     ->  Parallel Seq Scan on tenk1 tenk1_1
+(8 rows)
+
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+ count 
+-------
+   100
+(1 row)
+
 ABORT;
diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql
index d4e5628eba..2eff77af47 100644
--- a/src/test/regress/sql/groupingsets.sql
+++ b/src/test/regress/sql/groupingsets.sql
@@ -511,4 +511,14 @@ set work_mem to default;
 drop table gs_group_1;
 drop table gs_hash_1;
 
+-- parallel grouping sets
+BEGIN;
+set enable_batch_hashagg = on;
+set min_parallel_table_scan_size = 0;
+set parallel_setup_cost = 10;
+explain (costs off)
+select sum(unique1),count(unique1),two,four,ten,twenty from tenk1 group by grouping sets(two,four,ten,(two,twenty),()) order by 3,4,5,6;
+select sum(unique1),count(unique1),two,four,ten,twenty from tenk1 group by grouping sets(two,four,ten,(two,twenty),()) order by 3,4,5,6;
+ABORT;
+
 -- end
diff --git a/src/test/regress/sql/partition_aggregate.sql b/src/test/regress/sql/partition_aggregate.sql
index 3e50a48d37..7ecbe3ed56 100644
--- a/src/test/regress/sql/partition_aggregate.sql
+++ b/src/test/regress/sql/partition_aggregate.sql
@@ -338,6 +338,26 @@ SET min_parallel_table_scan_size = 0;
 SET parallel_tuple_cost = 0;
 SET parallel_setup_cost = 0;
 SET enable_indexonlyscan = OFF;
+-- using batch sort
+EXPLAIN (COSTS OFF)
+SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+-- using batch hash
+SET enable_batch_sort = OFF;
+SET enable_batch_hashagg = ON;
 EXPLAIN (COSTS OFF)
 SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
 EXPLAIN (COSTS OFF)
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index 3ff7acf64d..2a16d9b23d 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -143,6 +143,15 @@ SET min_parallel_table_scan_size =0;
 SET parallel_tuple_cost = 0;
 SET parallel_setup_cost = 0;
 SET enable_indexonlyscan = OFF;
+-- using batch sort
+EXPLAIN (costs off)
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+explain (costs off)
+SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+-- using batch hash
+SET enable_batch_sort = OFF;
+SET enable_batch_hashagg = ON;
 EXPLAIN (costs off)
 SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
 SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql
index a1cb1bb7ac..9fd50db549 100644
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -448,10 +448,23 @@ SET min_parallel_table_scan_size =0;
 SET parallel_tuple_cost = 0;
 SET parallel_setup_cost = 0;
 SET enable_indexonlyscan = OFF;
+-- using batch sort
 EXPLAIN (costs off)
 SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
 SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
 EXPLAIN (costs off)
 SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
 SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+-- using batch hash
+SET enable_batch_sort = OFF;
+SET enable_batch_hashagg = ON;
+EXPLAIN (costs off)
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
 ABORT;
\ No newline at end of file
-- 
2.16.3

#12Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: bucoo@sohu.com (#11)
Re: parallel distinct union and aggregate support patch

Hi,

On Wed, Oct 28, 2020 at 05:37:40PM +0800, bucoo@sohu.com wrote:

Hi
Here is patch for parallel distinct union aggregate and grouping sets support using batch hash agg.
Please review.

how to use:
set enable_batch_hashagg = on

how to work:
like batch sort, but not sort each batch, just save hash value in each rows

unfinished work:
not support rescan yet. welcome to add. Actually I don't really understand how rescan works in parallel mode.

other:
patch 1 base on branch master(80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f)
patch 1 and 2 see /messages/by-id/2020101922424962544053@sohu.com
patch 3:
extpand shared tuple store and add batch store module.
By the way, use atomic operations instead LWLock for shared tuple store get next read page.
patch 4:
using batch hash agg support parallels

Thanks for the patch!

Two generic comments:

1) It's better to always include the whole patch series - including the
parts that have not changed. Otherwise people have to scavenge the
thread and search for all the pieces, which may be a source of issues.
Also, it confuses the patch tester [1] which tries to apply patches from
a single message, so it will fail for this one.

2) I suggest you try to describe the goal of these patches, using some
example queries, explain output etc. Right now the reviewers have to
reverse engineer the patches and deduce what the intention was, which
may be causing unnecessary confusion etc. If this was my patch, I'd try
to create a couple examples (CREATE TABLE + SELECT + EXPLAIN) showing
how the patch changes the query plan, showing speedup etc.

I'd like to do a review and some testing, and this would make it much
easier for me.

kind regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#13bucoo@sohu.com
bucoo@sohu.com
In reply to: bucoo@sohu.com (#1)
Re: Re: parallel distinct union and aggregate support patch

1) It's better to always include the whole patch series - including the
parts that have not changed. Otherwise people have to scavenge the
thread and search for all the pieces, which may be a source of issues.
Also, it confuses the patch tester [1] which tries to apply patches from
a single message, so it will fail for this one.

Pathes 3 and 4 do not rely on 1 and 2 in code.
But, it will fail when you apply the apatches 3 and 4 directly, because
they are written after 1 and 2.
I can generate a new single patch if you need.

2) I suggest you try to describe the goal of these patches, using some
example queries, explain output etc. Right now the reviewers have to
reverse engineer the patches and deduce what the intention was, which
may be causing unnecessary confusion etc. If this was my patch, I'd try
to create a couple examples (CREATE TABLE + SELECT + EXPLAIN) showing
how the patch changes the query plan, showing speedup etc.

I written some example queries in to regress, include "unique" "union"
"group by" and "group by grouping sets".
here is my tests, they are not in regress
```sql
begin;
create table gtest(id integer, txt text);
insert into gtest select t1.id,'txt'||t1.id from (select generate_series(1,1000*1000) id) t1,(select generate_series(1,10) id) t2;
analyze gtest;
commit;
set jit = off;
\timing on
```
normal aggregate times
```
set enable_batch_hashagg = off;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by txt;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------
Finalize GroupAggregate (actual time=6469.279..8947.024 rows=1000000 loops=1)
Output: sum(id), txt
Group Key: gtest.txt
-> Gather Merge (actual time=6469.245..8165.930 rows=1000058 loops=1)
Output: txt, (PARTIAL sum(id))
Workers Planned: 2
Workers Launched: 2
-> Sort (actual time=6356.471..7133.832 rows=333353 loops=3)
Output: txt, (PARTIAL sum(id))
Sort Key: gtest.txt
Sort Method: external merge Disk: 11608kB
Worker 0: actual time=6447.665..7349.431 rows=317512 loops=1
Sort Method: external merge Disk: 10576kB
Worker 1: actual time=6302.882..7061.157 rows=333301 loops=1
Sort Method: external merge Disk: 11112kB
-> Partial HashAggregate (actual time=2591.487..4430.437 rows=333353 loops=3)
Output: txt, PARTIAL sum(id)
Group Key: gtest.txt
Batches: 17 Memory Usage: 4241kB Disk Usage: 113152kB
Worker 0: actual time=2584.345..4486.407 rows=317512 loops=1
Batches: 17 Memory Usage: 4241kB Disk Usage: 101392kB
Worker 1: actual time=2584.369..4393.244 rows=333301 loops=1
Batches: 17 Memory Usage: 4241kB Disk Usage: 112832kB
-> Parallel Seq Scan on public.gtest (actual time=0.691..603.990 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.104..607.146 rows=3174970 loops=1
Worker 1: actual time=0.100..603.951 rows=3332785 loops=1
Planning Time: 0.226 ms
Execution Time: 9021.058 ms
(29 rows)

Time: 9022.251 ms (00:09.022)

set enable_batch_hashagg = on;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by txt;
QUERY PLAN
-------------------------------------------------------------------------------------------------
Gather (actual time=3116.666..5740.826 rows=1000000 loops=1)
Output: (sum(id)), txt
Workers Planned: 2
Workers Launched: 2
-> Parallel BatchHashAggregate (actual time=3103.181..5464.948 rows=333333 loops=3)
Output: sum(id), txt
Group Key: gtest.txt
Worker 0: actual time=3094.550..5486.992 rows=326082 loops=1
Worker 1: actual time=3099.562..5480.111 rows=324729 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.791..656.601 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.080..646.053 rows=3057680 loops=1
Worker 1: actual time=0.070..662.754 rows=3034370 loops=1
Planning Time: 0.243 ms
Execution Time: 5788.981 ms
(15 rows)

Time: 5790.143 ms (00:05.790)
```

grouping sets times
```
set enable_batch_hashagg = off;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by grouping sets(id,txt,());
QUERY PLAN
------------------------------------------------------------------------------------------
GroupAggregate (actual time=9454.707..38921.885 rows=2000001 loops=1)
Output: sum(id), txt, id
Group Key: gtest.id
Group Key: ()
Sort Key: gtest.txt
Group Key: gtest.txt
-> Sort (actual time=9454.679..11804.071 rows=10000000 loops=1)
Output: txt, id
Sort Key: gtest.id
Sort Method: external merge Disk: 254056kB
-> Seq Scan on public.gtest (actual time=2.250..2419.031 rows=10000000 loops=1)
Output: txt, id
Planning Time: 0.230 ms
Execution Time: 39203.883 ms
(14 rows)

Time: 39205.339 ms (00:39.205)

set enable_batch_hashagg = on;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by grouping sets(id,txt,());
QUERY PLAN
-------------------------------------------------------------------------------------------------
Gather (actual time=5931.776..14353.957 rows=2000001 loops=1)
Output: (sum(id)), txt, id
Workers Planned: 2
Workers Launched: 2
-> Parallel BatchHashAggregate (actual time=5920.963..13897.852 rows=666667 loops=3)
Output: sum(id), txt, id
Group Key: gtest.id
Group Key: ()
Group Key: gtest.txt
Worker 0: actual time=5916.370..14062.461 rows=513810 loops=1
Worker 1: actual time=5916.037..13932.847 rows=775901 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.399..688.273 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.052..690.955 rows=3349990 loops=1
Worker 1: actual time=0.050..691.595 rows=3297070 loops=1
Planning Time: 0.157 ms
Execution Time: 14598.416 ms
(17 rows)

Time: 14599.437 ms (00:14.599)
```

#14Dilip Kumar
dilipbalaut@gmail.com
In reply to: bucoo@sohu.com (#13)
6 attachment(s)
Re: Re: parallel distinct union and aggregate support patch

On Thu, Oct 29, 2020 at 12:53 PM bucoo@sohu.com <bucoo@sohu.com> wrote:

1) It's better to always include the whole patch series - including the
parts that have not changed. Otherwise people have to scavenge the
thread and search for all the pieces, which may be a source of issues.
Also, it confuses the patch tester [1] which tries to apply patches from
a single message, so it will fail for this one.

Pathes 3 and 4 do not rely on 1 and 2 in code.
But, it will fail when you apply the apatches 3 and 4 directly, because
they are written after 1 and 2.
I can generate a new single patch if you need.

2) I suggest you try to describe the goal of these patches, using some
example queries, explain output etc. Right now the reviewers have to
reverse engineer the patches and deduce what the intention was, which
may be causing unnecessary confusion etc. If this was my patch, I'd try
to create a couple examples (CREATE TABLE + SELECT + EXPLAIN) showing
how the patch changes the query plan, showing speedup etc.

I written some example queries in to regress, include "unique" "union"
"group by" and "group by grouping sets".
here is my tests, they are not in regress
```sql
begin;
create table gtest(id integer, txt text);
insert into gtest select t1.id,'txt'||t1.id from (select generate_series(1,1000*1000) id) t1,(select generate_series(1,10) id) t2;
analyze gtest;
commit;
set jit = off;
\timing on
```
normal aggregate times
```
set enable_batch_hashagg = off;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by txt;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------
Finalize GroupAggregate (actual time=6469.279..8947.024 rows=1000000 loops=1)
Output: sum(id), txt
Group Key: gtest.txt
-> Gather Merge (actual time=6469.245..8165.930 rows=1000058 loops=1)
Output: txt, (PARTIAL sum(id))
Workers Planned: 2
Workers Launched: 2
-> Sort (actual time=6356.471..7133.832 rows=333353 loops=3)
Output: txt, (PARTIAL sum(id))
Sort Key: gtest.txt
Sort Method: external merge Disk: 11608kB
Worker 0: actual time=6447.665..7349.431 rows=317512 loops=1
Sort Method: external merge Disk: 10576kB
Worker 1: actual time=6302.882..7061.157 rows=333301 loops=1
Sort Method: external merge Disk: 11112kB
-> Partial HashAggregate (actual time=2591.487..4430.437 rows=333353 loops=3)
Output: txt, PARTIAL sum(id)
Group Key: gtest.txt
Batches: 17 Memory Usage: 4241kB Disk Usage: 113152kB
Worker 0: actual time=2584.345..4486.407 rows=317512 loops=1
Batches: 17 Memory Usage: 4241kB Disk Usage: 101392kB
Worker 1: actual time=2584.369..4393.244 rows=333301 loops=1
Batches: 17 Memory Usage: 4241kB Disk Usage: 112832kB
-> Parallel Seq Scan on public.gtest (actual time=0.691..603.990 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.104..607.146 rows=3174970 loops=1
Worker 1: actual time=0.100..603.951 rows=3332785 loops=1
Planning Time: 0.226 ms
Execution Time: 9021.058 ms
(29 rows)

Time: 9022.251 ms (00:09.022)

set enable_batch_hashagg = on;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by txt;
QUERY PLAN
-------------------------------------------------------------------------------------------------
Gather (actual time=3116.666..5740.826 rows=1000000 loops=1)
Output: (sum(id)), txt
Workers Planned: 2
Workers Launched: 2
-> Parallel BatchHashAggregate (actual time=3103.181..5464.948 rows=333333 loops=3)
Output: sum(id), txt
Group Key: gtest.txt
Worker 0: actual time=3094.550..5486.992 rows=326082 loops=1
Worker 1: actual time=3099.562..5480.111 rows=324729 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.791..656.601 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.080..646.053 rows=3057680 loops=1
Worker 1: actual time=0.070..662.754 rows=3034370 loops=1
Planning Time: 0.243 ms
Execution Time: 5788.981 ms
(15 rows)

Time: 5790.143 ms (00:05.790)
```

grouping sets times
```
set enable_batch_hashagg = off;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by grouping sets(id,txt,());
QUERY PLAN
------------------------------------------------------------------------------------------
GroupAggregate (actual time=9454.707..38921.885 rows=2000001 loops=1)
Output: sum(id), txt, id
Group Key: gtest.id
Group Key: ()
Sort Key: gtest.txt
Group Key: gtest.txt
-> Sort (actual time=9454.679..11804.071 rows=10000000 loops=1)
Output: txt, id
Sort Key: gtest.id
Sort Method: external merge Disk: 254056kB
-> Seq Scan on public.gtest (actual time=2.250..2419.031 rows=10000000 loops=1)
Output: txt, id
Planning Time: 0.230 ms
Execution Time: 39203.883 ms
(14 rows)

Time: 39205.339 ms (00:39.205)

set enable_batch_hashagg = on;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by grouping sets(id,txt,());
QUERY PLAN
-------------------------------------------------------------------------------------------------
Gather (actual time=5931.776..14353.957 rows=2000001 loops=1)
Output: (sum(id)), txt, id
Workers Planned: 2
Workers Launched: 2
-> Parallel BatchHashAggregate (actual time=5920.963..13897.852 rows=666667 loops=3)
Output: sum(id), txt, id
Group Key: gtest.id
Group Key: ()
Group Key: gtest.txt
Worker 0: actual time=5916.370..14062.461 rows=513810 loops=1
Worker 1: actual time=5916.037..13932.847 rows=775901 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.399..688.273 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.052..690.955 rows=3349990 loops=1
Worker 1: actual time=0.050..691.595 rows=3297070 loops=1
Planning Time: 0.157 ms
Execution Time: 14598.416 ms
(17 rows)

Time: 14599.437 ms (00:14.599)
```

I have done some performance testing with TPCH to see the impact on
the different query plan, I could see there are a lot of plan changes
across various queries but out of those, there are few queries where
these patches gave noticeable gain query13 and query17 (I have
attached the plan for these 2 queries).

Test details:
----------------
TPCH scale factor 50 (database size 112GB)
work_mem 20GB, shared buffers: 20GB max_parallel_workers_per_gather=4

Machine information:
Architecture: x86_64
CPU(s): 56
Thread(s) per core: 2
Core(s) per socket: 14
Socket(s): 2
NUMA node(s): 2
Model name: Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz

Observation:
In the TPCH test, I have noticed that the major gain we are getting in
this patch is because we are able to use the parallelism where we were
not able to use due to the limitation of the parallel aggregate.
Basically, for computing final aggregated results we need to break the
parallelism because the worker is only performing the partial
aggregate and after that, we had to gather all the partially
aggregated results and do the finalize aggregate. Now, with this
patch, since we are batching the results we are able to compute the
final aggregate within the workers itself and that enables us to get
the parallelism in more cases.

Example:
If we observe the output of plan 13(13.explain_head.out), the subquery
is performing the aggregate and the outer query is doing the grouping
on the aggregated value of the subquery, due to this we are not
selecting the parallelism in the head because in the inner aggregation
the number of groups is huge and if we select the parallelism we need
to transfer a lot of tuple through the tuple queue and we will also
have to serialize/deserialize those many transition values. And the
outer query needs the final aggregated results from the inner query so
we can not select the parallelism. Now with the batch
aggregate(13.explain_patch.out), we are able to compute the finalize
aggregation within the workers itself and that enabled us to continue
the parallelism till the top node. The execution time for this query
is now reduced to 57sec from 238sec which is 4X faster.

I will perform some more tests with different scale factors and
analyze the behavior of this.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

Attachments:

13.explain_head.outapplication/octet-stream; name=13.explain_head.outDownload
17.explain_patch.outapplication/octet-stream; name=17.explain_patch.outDownload
17.explain_head.outapplication/octet-stream; name=17.explain_head.outDownload
13.explain_patch.outapplication/octet-stream; name=13.explain_patch.outDownload
13.sqlapplication/sql; name=13.sqlDownload
17.sqlapplication/sql; name=17.sqlDownload
#15Dilip Kumar
dilipbalaut@gmail.com
In reply to: Dilip Kumar (#14)
Re: Re: parallel distinct union and aggregate support patch

On Tue, Nov 3, 2020 at 6:06 PM Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Thu, Oct 29, 2020 at 12:53 PM bucoo@sohu.com <bucoo@sohu.com> wrote:

1) It's better to always include the whole patch series - including the
parts that have not changed. Otherwise people have to scavenge the
thread and search for all the pieces, which may be a source of issues.
Also, it confuses the patch tester [1] which tries to apply patches from
a single message, so it will fail for this one.

Pathes 3 and 4 do not rely on 1 and 2 in code.
But, it will fail when you apply the apatches 3 and 4 directly, because
they are written after 1 and 2.
I can generate a new single patch if you need.

2) I suggest you try to describe the goal of these patches, using some
example queries, explain output etc. Right now the reviewers have to
reverse engineer the patches and deduce what the intention was, which
may be causing unnecessary confusion etc. If this was my patch, I'd try
to create a couple examples (CREATE TABLE + SELECT + EXPLAIN) showing
how the patch changes the query plan, showing speedup etc.

I written some example queries in to regress, include "unique" "union"
"group by" and "group by grouping sets".
here is my tests, they are not in regress
```sql
begin;
create table gtest(id integer, txt text);
insert into gtest select t1.id,'txt'||t1.id from (select generate_series(1,1000*1000) id) t1,(select generate_series(1,10) id) t2;
analyze gtest;
commit;
set jit = off;
\timing on
```
normal aggregate times
```
set enable_batch_hashagg = off;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by txt;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------
Finalize GroupAggregate (actual time=6469.279..8947.024 rows=1000000 loops=1)
Output: sum(id), txt
Group Key: gtest.txt
-> Gather Merge (actual time=6469.245..8165.930 rows=1000058 loops=1)
Output: txt, (PARTIAL sum(id))
Workers Planned: 2
Workers Launched: 2
-> Sort (actual time=6356.471..7133.832 rows=333353 loops=3)
Output: txt, (PARTIAL sum(id))
Sort Key: gtest.txt
Sort Method: external merge Disk: 11608kB
Worker 0: actual time=6447.665..7349.431 rows=317512 loops=1
Sort Method: external merge Disk: 10576kB
Worker 1: actual time=6302.882..7061.157 rows=333301 loops=1
Sort Method: external merge Disk: 11112kB
-> Partial HashAggregate (actual time=2591.487..4430.437 rows=333353 loops=3)
Output: txt, PARTIAL sum(id)
Group Key: gtest.txt
Batches: 17 Memory Usage: 4241kB Disk Usage: 113152kB
Worker 0: actual time=2584.345..4486.407 rows=317512 loops=1
Batches: 17 Memory Usage: 4241kB Disk Usage: 101392kB
Worker 1: actual time=2584.369..4393.244 rows=333301 loops=1
Batches: 17 Memory Usage: 4241kB Disk Usage: 112832kB
-> Parallel Seq Scan on public.gtest (actual time=0.691..603.990 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.104..607.146 rows=3174970 loops=1
Worker 1: actual time=0.100..603.951 rows=3332785 loops=1
Planning Time: 0.226 ms
Execution Time: 9021.058 ms
(29 rows)

Time: 9022.251 ms (00:09.022)

set enable_batch_hashagg = on;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by txt;
QUERY PLAN
-------------------------------------------------------------------------------------------------
Gather (actual time=3116.666..5740.826 rows=1000000 loops=1)
Output: (sum(id)), txt
Workers Planned: 2
Workers Launched: 2
-> Parallel BatchHashAggregate (actual time=3103.181..5464.948 rows=333333 loops=3)
Output: sum(id), txt
Group Key: gtest.txt
Worker 0: actual time=3094.550..5486.992 rows=326082 loops=1
Worker 1: actual time=3099.562..5480.111 rows=324729 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.791..656.601 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.080..646.053 rows=3057680 loops=1
Worker 1: actual time=0.070..662.754 rows=3034370 loops=1
Planning Time: 0.243 ms
Execution Time: 5788.981 ms
(15 rows)

Time: 5790.143 ms (00:05.790)
```

grouping sets times
```
set enable_batch_hashagg = off;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by grouping sets(id,txt,());
QUERY PLAN
------------------------------------------------------------------------------------------
GroupAggregate (actual time=9454.707..38921.885 rows=2000001 loops=1)
Output: sum(id), txt, id
Group Key: gtest.id
Group Key: ()
Sort Key: gtest.txt
Group Key: gtest.txt
-> Sort (actual time=9454.679..11804.071 rows=10000000 loops=1)
Output: txt, id
Sort Key: gtest.id
Sort Method: external merge Disk: 254056kB
-> Seq Scan on public.gtest (actual time=2.250..2419.031 rows=10000000 loops=1)
Output: txt, id
Planning Time: 0.230 ms
Execution Time: 39203.883 ms
(14 rows)

Time: 39205.339 ms (00:39.205)

set enable_batch_hashagg = on;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by grouping sets(id,txt,());
QUERY PLAN
-------------------------------------------------------------------------------------------------
Gather (actual time=5931.776..14353.957 rows=2000001 loops=1)
Output: (sum(id)), txt, id
Workers Planned: 2
Workers Launched: 2
-> Parallel BatchHashAggregate (actual time=5920.963..13897.852 rows=666667 loops=3)
Output: sum(id), txt, id
Group Key: gtest.id
Group Key: ()
Group Key: gtest.txt
Worker 0: actual time=5916.370..14062.461 rows=513810 loops=1
Worker 1: actual time=5916.037..13932.847 rows=775901 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.399..688.273 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.052..690.955 rows=3349990 loops=1
Worker 1: actual time=0.050..691.595 rows=3297070 loops=1
Planning Time: 0.157 ms
Execution Time: 14598.416 ms
(17 rows)

Time: 14599.437 ms (00:14.599)
```

I have done some performance testing with TPCH to see the impact on
the different query plan, I could see there are a lot of plan changes
across various queries but out of those, there are few queries where
these patches gave noticeable gain query13 and query17 (I have
attached the plan for these 2 queries).

Test details:
----------------
TPCH scale factor 50 (database size 112GB)
work_mem 20GB, shared buffers: 20GB max_parallel_workers_per_gather=4

Machine information:
Architecture: x86_64
CPU(s): 56
Thread(s) per core: 2
Core(s) per socket: 14
Socket(s): 2
NUMA node(s): 2
Model name: Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz

Observation:
In the TPCH test, I have noticed that the major gain we are getting in
this patch is because we are able to use the parallelism where we were
not able to use due to the limitation of the parallel aggregate.
Basically, for computing final aggregated results we need to break the
parallelism because the worker is only performing the partial
aggregate and after that, we had to gather all the partially
aggregated results and do the finalize aggregate. Now, with this
patch, since we are batching the results we are able to compute the
final aggregate within the workers itself and that enables us to get
the parallelism in more cases.

Example:
If we observe the output of plan 13(13.explain_head.out), the subquery
is performing the aggregate and the outer query is doing the grouping
on the aggregated value of the subquery, due to this we are not
selecting the parallelism in the head because in the inner aggregation
the number of groups is huge and if we select the parallelism we need
to transfer a lot of tuple through the tuple queue and we will also
have to serialize/deserialize those many transition values. And the
outer query needs the final aggregated results from the inner query so
we can not select the parallelism. Now with the batch
aggregate(13.explain_patch.out), we are able to compute the finalize
aggregation within the workers itself and that enabled us to continue
the parallelism till the top node. The execution time for this query
is now reduced to 57sec from 238sec which is 4X faster.

I will perform some more tests with different scale factors and
analyze the behavior of this.

I have started reviewing these patches, I have a couple of review comments.

Some general comment to make code more readable

1. Comments are missing in the patch, even there are no function
header comments to explain the overall idea about the function.
I think adding comments will make it easier to review the patch.

2. Code is not written as per the Postgres coding guideline, the
common problems observed with the patch are
a) There should be an empty line after the variable declaration section
b) In the function definition, the function return type and the
function name should not be in the same line

Change

+static bool ExecNextParallelBatchSort(BatchSortState *state)
{
}
to
static bool
ExecNextParallelBatchSort(BatchSortState *state)
{
}

c) While typecasting the variable the spacing is not used properly and
uniformly, you can refer to other code and fix it.

*Specific comments to patch 0001*

1.
+#define BATCH_SORT_MAX_BATCHES 512

Did you decide this number based on some experiment or is there some
analysis behind selecting this number?

2.
+BatchSortState* ExecInitBatchSort(BatchSort *node, EState *estate, int eflags)
+{
+ BatchSortState *state;
+ TypeCacheEntry *typentry;
....
+ for (i=0;i<node->numGroupCols;++i)
+ {
...
+ InitFunctionCallInfoData(*fcinfo, flinfo, 1, attr->attcollation, NULL, NULL);
+ fcinfo->args[0].isnull = false;
+ state->groupFuns = lappend(state->groupFuns, fcinfo);
+ }

From the variable naming, it appeared like the batch sort is dependent
upon the grouping node. I think instead of using the name
numGroupCols and groupFuns we need to use names that are more relevant
to the batch sort something like numSortKey.

3.
+ if (eflags & (EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))
+ {
+ /* for now, we only using in group aggregate */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("not support execute flag(s) %d for group sort", eflags)));
+ }

Instead of ereport, you should just put an Assert for the unsupported
flag or elog.

4.
+ state = makeNode(BatchSortState);
+ state->ps.plan = (Plan*) node;
+ state->ps.state = estate;
+ state->ps.ExecProcNode = ExecBatchSortPrepare;

I think the main executor entry function should be named ExecBatchSort
instead of ExecBatchSortPrepare, it will look more consistent with the
other executor machinery.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#16Dilip Kumar
dilipbalaut@gmail.com
In reply to: Dilip Kumar (#15)
Re: Re: parallel distinct union and aggregate support patch

On Sun, Nov 8, 2020 at 11:54 AM Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Tue, Nov 3, 2020 at 6:06 PM Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Thu, Oct 29, 2020 at 12:53 PM bucoo@sohu.com <bucoo@sohu.com> wrote:

1) It's better to always include the whole patch series - including the
parts that have not changed. Otherwise people have to scavenge the
thread and search for all the pieces, which may be a source of issues.
Also, it confuses the patch tester [1] which tries to apply patches from
a single message, so it will fail for this one.

Pathes 3 and 4 do not rely on 1 and 2 in code.
But, it will fail when you apply the apatches 3 and 4 directly, because
they are written after 1 and 2.
I can generate a new single patch if you need.

2) I suggest you try to describe the goal of these patches, using some
example queries, explain output etc. Right now the reviewers have to
reverse engineer the patches and deduce what the intention was, which
may be causing unnecessary confusion etc. If this was my patch, I'd try
to create a couple examples (CREATE TABLE + SELECT + EXPLAIN) showing
how the patch changes the query plan, showing speedup etc.

I written some example queries in to regress, include "unique" "union"
"group by" and "group by grouping sets".
here is my tests, they are not in regress
```sql
begin;
create table gtest(id integer, txt text);
insert into gtest select t1.id,'txt'||t1.id from (select generate_series(1,1000*1000) id) t1,(select generate_series(1,10) id) t2;
analyze gtest;
commit;
set jit = off;
\timing on
```
normal aggregate times
```
set enable_batch_hashagg = off;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by txt;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------
Finalize GroupAggregate (actual time=6469.279..8947.024 rows=1000000 loops=1)
Output: sum(id), txt
Group Key: gtest.txt
-> Gather Merge (actual time=6469.245..8165.930 rows=1000058 loops=1)
Output: txt, (PARTIAL sum(id))
Workers Planned: 2
Workers Launched: 2
-> Sort (actual time=6356.471..7133.832 rows=333353 loops=3)
Output: txt, (PARTIAL sum(id))
Sort Key: gtest.txt
Sort Method: external merge Disk: 11608kB
Worker 0: actual time=6447.665..7349.431 rows=317512 loops=1
Sort Method: external merge Disk: 10576kB
Worker 1: actual time=6302.882..7061.157 rows=333301 loops=1
Sort Method: external merge Disk: 11112kB
-> Partial HashAggregate (actual time=2591.487..4430.437 rows=333353 loops=3)
Output: txt, PARTIAL sum(id)
Group Key: gtest.txt
Batches: 17 Memory Usage: 4241kB Disk Usage: 113152kB
Worker 0: actual time=2584.345..4486.407 rows=317512 loops=1
Batches: 17 Memory Usage: 4241kB Disk Usage: 101392kB
Worker 1: actual time=2584.369..4393.244 rows=333301 loops=1
Batches: 17 Memory Usage: 4241kB Disk Usage: 112832kB
-> Parallel Seq Scan on public.gtest (actual time=0.691..603.990 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.104..607.146 rows=3174970 loops=1
Worker 1: actual time=0.100..603.951 rows=3332785 loops=1
Planning Time: 0.226 ms
Execution Time: 9021.058 ms
(29 rows)

Time: 9022.251 ms (00:09.022)

set enable_batch_hashagg = on;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by txt;
QUERY PLAN
-------------------------------------------------------------------------------------------------
Gather (actual time=3116.666..5740.826 rows=1000000 loops=1)
Output: (sum(id)), txt
Workers Planned: 2
Workers Launched: 2
-> Parallel BatchHashAggregate (actual time=3103.181..5464.948 rows=333333 loops=3)
Output: sum(id), txt
Group Key: gtest.txt
Worker 0: actual time=3094.550..5486.992 rows=326082 loops=1
Worker 1: actual time=3099.562..5480.111 rows=324729 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.791..656.601 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.080..646.053 rows=3057680 loops=1
Worker 1: actual time=0.070..662.754 rows=3034370 loops=1
Planning Time: 0.243 ms
Execution Time: 5788.981 ms
(15 rows)

Time: 5790.143 ms (00:05.790)
```

grouping sets times
```
set enable_batch_hashagg = off;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by grouping sets(id,txt,());
QUERY PLAN
------------------------------------------------------------------------------------------
GroupAggregate (actual time=9454.707..38921.885 rows=2000001 loops=1)
Output: sum(id), txt, id
Group Key: gtest.id
Group Key: ()
Sort Key: gtest.txt
Group Key: gtest.txt
-> Sort (actual time=9454.679..11804.071 rows=10000000 loops=1)
Output: txt, id
Sort Key: gtest.id
Sort Method: external merge Disk: 254056kB
-> Seq Scan on public.gtest (actual time=2.250..2419.031 rows=10000000 loops=1)
Output: txt, id
Planning Time: 0.230 ms
Execution Time: 39203.883 ms
(14 rows)

Time: 39205.339 ms (00:39.205)

set enable_batch_hashagg = on;
explain (costs off,analyze,verbose)
select sum(id),txt from gtest group by grouping sets(id,txt,());
QUERY PLAN
-------------------------------------------------------------------------------------------------
Gather (actual time=5931.776..14353.957 rows=2000001 loops=1)
Output: (sum(id)), txt, id
Workers Planned: 2
Workers Launched: 2
-> Parallel BatchHashAggregate (actual time=5920.963..13897.852 rows=666667 loops=3)
Output: sum(id), txt, id
Group Key: gtest.id
Group Key: ()
Group Key: gtest.txt
Worker 0: actual time=5916.370..14062.461 rows=513810 loops=1
Worker 1: actual time=5916.037..13932.847 rows=775901 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.399..688.273 rows=3333333 loops=3)
Output: id, txt
Worker 0: actual time=0.052..690.955 rows=3349990 loops=1
Worker 1: actual time=0.050..691.595 rows=3297070 loops=1
Planning Time: 0.157 ms
Execution Time: 14598.416 ms
(17 rows)

Time: 14599.437 ms (00:14.599)
```

I have done some performance testing with TPCH to see the impact on
the different query plan, I could see there are a lot of plan changes
across various queries but out of those, there are few queries where
these patches gave noticeable gain query13 and query17 (I have
attached the plan for these 2 queries).

Test details:
----------------
TPCH scale factor 50 (database size 112GB)
work_mem 20GB, shared buffers: 20GB max_parallel_workers_per_gather=4

Machine information:
Architecture: x86_64
CPU(s): 56
Thread(s) per core: 2
Core(s) per socket: 14
Socket(s): 2
NUMA node(s): 2
Model name: Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz

Observation:
In the TPCH test, I have noticed that the major gain we are getting in
this patch is because we are able to use the parallelism where we were
not able to use due to the limitation of the parallel aggregate.
Basically, for computing final aggregated results we need to break the
parallelism because the worker is only performing the partial
aggregate and after that, we had to gather all the partially
aggregated results and do the finalize aggregate. Now, with this
patch, since we are batching the results we are able to compute the
final aggregate within the workers itself and that enables us to get
the parallelism in more cases.

Example:
If we observe the output of plan 13(13.explain_head.out), the subquery
is performing the aggregate and the outer query is doing the grouping
on the aggregated value of the subquery, due to this we are not
selecting the parallelism in the head because in the inner aggregation
the number of groups is huge and if we select the parallelism we need
to transfer a lot of tuple through the tuple queue and we will also
have to serialize/deserialize those many transition values. And the
outer query needs the final aggregated results from the inner query so
we can not select the parallelism. Now with the batch
aggregate(13.explain_patch.out), we are able to compute the finalize
aggregation within the workers itself and that enabled us to continue
the parallelism till the top node. The execution time for this query
is now reduced to 57sec from 238sec which is 4X faster.

I will perform some more tests with different scale factors and
analyze the behavior of this.

I have started reviewing these patches, I have a couple of review comments.

Some general comment to make code more readable

1. Comments are missing in the patch, even there are no function
header comments to explain the overall idea about the function.
I think adding comments will make it easier to review the patch.

2. Code is not written as per the Postgres coding guideline, the
common problems observed with the patch are
a) There should be an empty line after the variable declaration section
b) In the function definition, the function return type and the
function name should not be in the same line

Change

+static bool ExecNextParallelBatchSort(BatchSortState *state)
{
}
to
static bool
ExecNextParallelBatchSort(BatchSortState *state)
{
}

c) While typecasting the variable the spacing is not used properly and
uniformly, you can refer to other code and fix it.

*Specific comments to patch 0001*

1.
+#define BATCH_SORT_MAX_BATCHES 512

Did you decide this number based on some experiment or is there some
analysis behind selecting this number?

2.
+BatchSortState* ExecInitBatchSort(BatchSort *node, EState *estate, int eflags)
+{
+ BatchSortState *state;
+ TypeCacheEntry *typentry;
....
+ for (i=0;i<node->numGroupCols;++i)
+ {
...
+ InitFunctionCallInfoData(*fcinfo, flinfo, 1, attr->attcollation, NULL, NULL);
+ fcinfo->args[0].isnull = false;
+ state->groupFuns = lappend(state->groupFuns, fcinfo);
+ }

From the variable naming, it appeared like the batch sort is dependent
upon the grouping node. I think instead of using the name
numGroupCols and groupFuns we need to use names that are more relevant
to the batch sort something like numSortKey.

3.
+ if (eflags & (EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))
+ {
+ /* for now, we only using in group aggregate */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("not support execute flag(s) %d for group sort", eflags)));
+ }

Instead of ereport, you should just put an Assert for the unsupported
flag or elog.

4.
+ state = makeNode(BatchSortState);
+ state->ps.plan = (Plan*) node;
+ state->ps.state = estate;
+ state->ps.ExecProcNode = ExecBatchSortPrepare;

I think the main executor entry function should be named ExecBatchSort
instead of ExecBatchSortPrepare, it will look more consistent with the
other executor machinery.

1.
+void cost_batchsort(Path *path, PlannerInfo *root,
+                    List *batchkeys, Cost input_cost,
+                    double tuples, int width,
+                    Cost comparison_cost, int sort_mem,
+                    uint32 numGroupCols, uint32 numBatches)
+{
+    Cost        startup_cost = input_cost;
+    Cost        run_cost = 0;
+    double        input_bytes = relation_byte_size(tuples, width);
+    double        batch_bytes = input_bytes / numBatches;
+    double        batch_tuples = tuples / numBatches;
+    long        sort_mem_bytes = sort_mem * 1024L;
+
+    if (sort_mem_bytes < (64*1024))
+        sort_mem_bytes = (64*1024);
+
+    if (!enable_batch_sort)
+        startup_cost += disable_cost;

You don't need to write a duplicate function for this, you can reuse
the cost_tuplesort function with some minor changes.

2. I have one more suggestion, currently, the batches are picked by
workers dynamically and the benefit of that is the work distribution
is quite flexible. But one downside I see with this approach is that
if we want to make this parallelism to the upper node for example
merge join, therein we can imagine the merge join with both side nodes
as BatchSort. But the problem is if the worker picks the batch
dynamically then the worker need to pick the same batch on both sides
so for that the right side node should be aware of what batch got
picked on the left side node so for doing that we might have to
introduce a different join node say BatchWiseMergeJoin. Whereas if we
make the batches as per the worker number then each sort node can be
processed independently without knowing what is happening on the other
side.

3. I have also done some performance tests especially with the small
group size, basically, the cases where parallel aggregate is not
picked due to the small group size, and with the new patch the
parallel aggregate is possible now.

Setup: I have used TPCH database with S.F 50 and executed an
aggregation query on the ORDER table

Number of rows in order table: 75000000
Total table size: 18 GB

Work_mem: 10GB

postgres=# explain (analyze, verbose) select sum(o_totalprice) from
orders group by o_custkey;

QUERY
PLAN
--------------------------------------------------------------------------------------------------------------------------------------
HashAggregate (cost=2506201.00..2570706.04 rows=5160403 width=40)
(actual time=94002.681..98733.002 rows=4999889 loops=1)
Output: sum(o_totalprice), o_custkey
Group Key: orders.o_custkey
Batches: 1 Memory Usage: 2228241kB
-> Seq Scan on public.orders (cost=0.00..2131201.00 rows=75000000
width=16) (actual time=0.042..12930.981 rows=75000000 loops=1)
Output: o_orderkey, o_custkey, o_orderstatus, o_totalprice,
o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment
Planning Time: 0.317 ms
Execution Time: 99230.242 ms

postgres=# set enable_batch_sort=on;
SET
postgres=# explain (analyze, verbose) select sum(o_totalprice) from
orders group by o_custkey;

QUERY PLAN

-------------------------------------------------------------------------------------------------------------------------------------------------
---------
Gather (cost=1616576.00..1761358.55 rows=40316 width=40) (actual
time=18516.549..28811.164 rows=4999889 loops=1)
Output: (sum(o_totalprice)), o_custkey
Workers Planned: 4
Workers Launched: 4
-> GroupAggregate (cost=1615576.00..1756326.99 rows=10079
width=40) (actual time=18506.051..28131.650 rows=999978 loops=5)
Output: sum(o_totalprice), o_custkey
Group Key: orders.o_custkey
Worker 0: actual time=18502.746..28406.868 rows=995092 loops=1
Worker 1: actual time=18502.339..28518.559 rows=1114511 loops=1
Worker 2: actual time=18503.233..28461.975 rows=985574 loops=1
Worker 3: actual time=18506.026..28409.130 rows=1005414 loops=1
-> Parallel BatchSort (cost=1615576.00..1662451.00
rows=18750000 width=16) (actual time=18505.982..21839.567
rows=15000000 loops=5)
Output: o_custkey, o_totalprice
Sort Key: orders.o_custkey
batches: 512
Worker 0: actual time=18502.666..21945.442 rows=14925544 loops=1
Worker 1: actual time=18502.270..21979.350 rows=16714443 loops=1
Worker 2: actual time=18503.144..21933.151 rows=14784292 loops=1
Worker 3: actual time=18505.950..21943.312 rows=15081559 loops=1
-> Parallel Seq Scan on public.orders
(cost=0.00..1568701.00 rows=18750000 width=16) (actual
time=0.082..4662.390 rows=15000000
loops=5)
Output: o_custkey, o_totalprice
Worker 0: actual time=0.079..4720.424
rows=15012981 loops=1
Worker 1: actual time=0.083..4710.919
rows=15675399 loops=1
Worker 2: actual time=0.082..4663.096
rows=14558663 loops=1
Worker 3: actual time=0.104..4625.940
rows=14496910 loops=1
Planning Time: 0.281 ms
Execution Time: 29504.248 ms

postgres=# set enable_batch_hashagg =on;
postgres=# set enable_batch_sort=off;
postgres=# explain (analyze, verbose) select sum(o_totalprice) from
orders group by o_custkey;

QUERY PLAN

-------------------------------------------------------------------------------------------------------------------------------------------------
---
Gather (cost=1755004.00..2287170.56 rows=5160403 width=40) (actual
time=12935.338..27064.962 rows=4999889 loops=1)
Output: (sum(o_totalprice)), o_custkey
Workers Planned: 4
Workers Launched: 4
-> Parallel BatchHashAggregate (cost=1754004.00..1770130.26
rows=1290101 width=40) (actual time=12987.830..24726.348 rows=999978
loops=5)
Output: sum(o_totalprice), o_custkey
Group Key: orders.o_custkey
Worker 0: actual time=13013.228..25078.902 rows=999277 loops=1
Worker 1: actual time=12917.375..25456.751 rows=1100607 loops=1
Worker 2: actual time=13041.088..24022.445 rows=900562 loops=1
Worker 3: actual time=13032.732..25230.101 rows=1001386 loops=1
-> Parallel Seq Scan on public.orders
(cost=0.00..1568701.00 rows=18750000 width=16) (actual
time=0.059..2764.881 rows=15000000 loops=
5)
Output: o_orderkey, o_custkey, o_orderstatus,
o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority,
o_comment
Worker 0: actual time=0.056..2754.621 rows=14924063 loops=1
Worker 1: actual time=0.063..2815.688 rows=16241825 loops=1
Worker 2: actual time=0.067..2750.927 rows=14064529 loops=1
Worker 3: actual time=0.055..2753.620 rows=14699841 loops=1
Planning Time: 0.209 ms
Execution Time: 27728.363 ms
(19 rows)

I think both parallel batch-wise grouping aggregate and the batch-wise
hash aggregate are giving very huge improvement when the typical group
size is small.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#17Heikki Linnakangas
hlinnaka@iki.fi
In reply to: Dilip Kumar (#15)
Re: parallel distinct union and aggregate support patch

I also had a quick look at the patch and the comments made so far. Summary:

1. The performance results are promising.

2. The code needs comments.

Regarding the design:

Thomas Munro mentioned the idea of a "Parallel Repartition" node that
would redistribute tuples like this. As I understand it, the difference
is that this BatchSort implementation collects all tuples in a tuplesort
or a tuplestore, while a Parallel Repartition node would just
redistribute the tuples to the workers, without buffering. The receiving
worker could put the tuples to a tuplestore or sort if needed.

I think a non-buffering Reparttion node would be simpler, and thus
better. In these patches, you have a BatchSort node, and batchstore, but
a simple Parallel Repartition node could do both. For example, to
implement distinct:

Gather
- > Unique
-> Sort
-> Parallel Redistribute
-> Parallel Seq Scan

And a Hash Agg would look like this:

Gather
- > Hash Agg
-> Parallel Redistribute
-> Parallel Seq Scan

I'm marking this as Waiting on Author in the commitfest.

- Heikki

#18Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#17)
Re: parallel distinct union and aggregate support patch

On Fri, Nov 27, 2020 at 10:55 AM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

I think a non-buffering Reparttion node would be simpler, and thus
better. In these patches, you have a BatchSort node, and batchstore, but
a simple Parallel Repartition node could do both. For example, to
implement distinct:

Gather
- > Unique
-> Sort
-> Parallel Redistribute
-> Parallel Seq Scan

And a Hash Agg would look like this:

Gather
- > Hash Agg
-> Parallel Redistribute
-> Parallel Seq Scan

I'm marking this as Waiting on Author in the commitfest.

I'm also intrigued by the parallel redistribute operator -- it seems
like it might be more flexible than this approach. However, I'm
concerned that there may be deadlock risks. If there is no buffer, or
a fixed-size buffer, the buffer might be full, and process trying to
jam tuples into the parallel redistribute would have to wait. Now if A
can wait for B and at the same time B can wait for A, deadlock will
ensue. In a naive implementation, this could happen with a single
parallel redistribute operator: worker 1 is trying to send a tuple to
worker 2, which can't receive it because it's busy sending a tuple to
worker 1. That could probably be fixed by arranging for workers to try
to try to receive data whenever they block in the middle of sending
data. However, in general there can be multiple nodes that cause
waiting in the tree: any number of Parallel Redistribute nodes, plus a
Gather, plus maybe other stuff. The cheap way out of that problem is
to use a buffer that can grow arbitrarily large, but that's not
terribly satisfying either.

--
Robert Haas
EDB: http://www.enterprisedb.com

#19Dilip Kumar
dilipbalaut@gmail.com
In reply to: Heikki Linnakangas (#17)
Re: parallel distinct union and aggregate support patch

On Fri, Nov 27, 2020 at 9:25 PM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

I also had a quick look at the patch and the comments made so far. Summary:

1. The performance results are promising.

2. The code needs comments.

Regarding the design:

Thomas Munro mentioned the idea of a "Parallel Repartition" node that
would redistribute tuples like this. As I understand it, the difference
is that this BatchSort implementation collects all tuples in a tuplesort
or a tuplestore, while a Parallel Repartition node would just
redistribute the tuples to the workers, without buffering.

I think the advantage of the "Parallel BatchSort" is that it give
flexibility to pick the batches dynamically by the worker after the
repartition. OTOH if we distribute batches directly based on the
worker number the advantage is that the operator will be quite
flexible, e.g. if we want to implement the merge join we can just
place the "Parallel Repartition" node above both side of the scan node
and we will simply get the batch wise merge join because each worker
knows their batch. Whereas if we allow workers to dynamically pick
the batch the right side node needs to know which batch to pick
because it is dynamically picked, I mean it is not as difficult
because it is the same worker but it seems less flexible.

The receiving

worker could put the tuples to a tuplestore or sort if needed.

If we are using it without buffering then the sending worker can
directly put the tuple into the respective sort/tuplestore node.

I think a non-buffering Reparttion node would be simpler, and thus
better. In these patches, you have a BatchSort node, and batchstore, but
a simple Parallel Repartition node could do both. For example, to
implement distinct:

Gather
- > Unique
-> Sort
-> Parallel Redistribute
-> Parallel Seq Scan

And a Hash Agg would look like this:

Gather
- > Hash Agg
-> Parallel Redistribute
-> Parallel Seq Scan

I'm marking this as Waiting on Author in the commitfest.

I agree that the simple parallel redistribute/repartition node will be
flexible and could do both, but I see one problem. Basically, if we
use the common operator then first the Parallel Redistribute operator
will use the tuplestore for redistributing the data as per the worker
and then each worker might use the disk again to sort their respective
data. Instead of that while redistributing the data itself we can use
the parallel sort so that each worker gets their respective batch in
form of sorted tapes.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#20bucoo@sohu.com
bucoo@sohu.com
In reply to: bucoo@sohu.com (#1)
Re: Re: parallel distinct union and aggregate support patch

1.
+#define BATCH_SORT_MAX_BATCHES 512

Did you decide this number based on some experiment or is there some
analysis behind selecting this number?

When there are too few batches, if a certain process works too slowly, it will cause unbalanced load.
When there are too many batches, FD will open and close files frequently.

2.
+BatchSortState* ExecInitBatchSort(BatchSort *node, EState *estate, int eflags)
+{
+ BatchSortState *state;
+ TypeCacheEntry *typentry;
....
+ for (i=0;i<node->numGroupCols;++i)
+ {
...
+ InitFunctionCallInfoData(*fcinfo, flinfo, 1, attr->attcollation, NULL, NULL);
+ fcinfo->args[0].isnull = false;
+ state->groupFuns = lappend(state->groupFuns, fcinfo);
+ }

From the variable naming, it appeared like the batch sort is dependent
upon the grouping node. I think instead of using the name
numGroupCols and groupFuns we need to use names that are more relevant
to the batch sort something like numSortKey.

Not all data types support both sorting and hashing calculations, such as user-defined data types.
We do not need all columns to support hash calculation when we batch, so I used two variables.

3.
+ if (eflags & (EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))
+ {
+ /* for now, we only using in group aggregate */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("not support execute flag(s) %d for group sort", eflags)));
+ }

Instead of ereport, you should just put an Assert for the unsupported
flag or elog.

In fact, this is an unfinished feature, BatchSort should also support these features, welcome to supplement.

4.
+ state = makeNode(BatchSortState);
+ state->ps.plan = (Plan*) node;
+ state->ps.state = estate;
+ state->ps.ExecProcNode = ExecBatchSortPrepare;

I think the main executor entry function should be named ExecBatchSort
instead of ExecBatchSortPrepare, it will look more consistent with the
other executor machinery.

The job of the ExecBatchSortPrepare function is to preprocess the data (batch and pre-sort),
and when its work ends, it will call "ExecSetExecProcNode(pstate, ExecBatchSort)" to return the data to the ExecBatchSort function.
There is another advantage of dividing into two functions,
It is not necessary to judge whether tuplesort is now available every time the function is processed to improve the subtle performance.
And I think this code is clearer.

#21bucoo@sohu.com
bucoo@sohu.com
In reply to: bucoo@sohu.com (#1)
1 attachment(s)
Re: Re: parallel distinct union and aggregate support patch

Now, I rewrite batch hashagg and sort, add some comment and combin too patches. base on master 2ad78a87f018260d4474eee63187e1cc73c9b976.
They are support rescan and change GUC enable_batch_hashagg/enable_batch_sort to max_hashagg_batches/max_sort_batch, default value is "0"(mean is disable).
The "max_hashagg_batches" in grouping sets each chain using this value, maybe we need a better algorithm.
Do not set "max_sort_batch" too large, because each tuplesort's work memory is "work_mem/max_sort_batch".

Next step I want use batch sort add parallel merge join(thinks Dilip Kumar) and except/intersect support after this patch commit, welcome to discuss.

Some test result:
hash group by: 17,974.797 ms -> 10,137.909 ms
sort group by: 117,475.380 ms -> 34,830.489 ms
grouping sets: 91,915.597 ms -> 24,585.103 ms
union: 95,765.297 ms -> 21,416.414 ms

---------------------------test details-------------------------------
Machine information:
Architecture: x86_64
CPU(s): 88
Thread(s) per core: 2
Core(s) per socket: 22
Socket(s): 2
NUMA node(s): 2
Model name: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz

prepare data:
begin;
create table gtest(id integer, txt text);
insert into gtest select t1.id,'txt'||t1.id from (select generate_series(1,10*1000*1000) id) t1,(select generate_series(1,10) id) t2;
analyze gtest;
commit;
set max_parallel_workers_per_gather=8;
set work_mem = '100MB';

hash aggregate:
explain (verbose,costs off,analyze)
select sum(id),txt from gtest group by txt;
QUERY PLAN
---------------------------------------------------------------------------------------------------------
Finalize HashAggregate (actual time=10832.805..17403.671 rows=10000000 loops=1)
Output: sum(id), txt
Group Key: gtest.txt
Batches: 29 Memory Usage: 102489kB Disk Usage: 404696kB
-> Gather (actual time=4389.345..7227.279 rows=10000058 loops=1)
Output: txt, (PARTIAL sum(id))
Workers Planned: 6
Workers Launched: 6
-> Partial HashAggregate (actual time=4353.147..5992.183 rows=1428580 loops=7)
Output: txt, PARTIAL sum(id)
Group Key: gtest.txt
Batches: 5 Memory Usage: 110641kB Disk Usage: 238424kB
Worker 0: actual time=4347.155..5954.088 rows=1398608 loops=1
Batches: 5 Memory Usage: 114737kB Disk Usage: 203928kB
Worker 1: actual time=4347.061..6209.121 rows=1443046 loops=1
Batches: 5 Memory Usage: 114737kB Disk Usage: 224384kB
Worker 2: actual time=4347.175..5882.065 rows=1408238 loops=1
Batches: 5 Memory Usage: 110641kB Disk Usage: 216360kB
Worker 3: actual time=4347.193..6015.830 rows=1477568 loops=1
Batches: 5 Memory Usage: 110641kB Disk Usage: 240824kB
Worker 4: actual time=4347.210..5950.730 rows=1404288 loops=1
Batches: 5 Memory Usage: 110641kB Disk Usage: 214872kB
Worker 5: actual time=4347.482..6064.460 rows=1439454 loops=1
Batches: 5 Memory Usage: 110641kB Disk Usage: 239400kB
-> Parallel Seq Scan on public.gtest (actual time=0.051..1216.378 rows=14285714 loops=7)
Output: id, txt
Worker 0: actual time=0.048..1219.133 rows=13986000 loops=1
Worker 1: actual time=0.047..1214.860 rows=14430370 loops=1
Worker 2: actual time=0.051..1222.124 rows=14082300 loops=1
Worker 3: actual time=0.061..1213.851 rows=14775580 loops=1
Worker 4: actual time=0.073..1216.712 rows=14042795 loops=1
Worker 5: actual time=0.049..1210.870 rows=14394480 loops=1
Planning Time: 0.673 ms
Execution Time: 17974.797 ms
batch hash aggregate:
set max_hashagg_batches = 100;
explain (verbose,costs off,analyze)
select sum(id),txt from gtest group by txt;
QUERY PLAN
---------------------------------------------------------------------------------------------------
Gather (actual time=5050.110..9757.292 rows=10000000 loops=1)
Output: (sum(id)), txt
Workers Planned: 6
Workers Launched: 6
-> Parallel BatchHashAggregate (actual time=5032.178..7810.979 rows=1428571 loops=7)
Output: sum(id), txt
Group Key: gtest.txt
Worker 0: actual time=5016.488..7694.715 rows=1399958 loops=1
Worker 1: actual time=5021.651..7942.628 rows=1501753 loops=1
Worker 2: actual time=5018.327..7944.842 rows=1400176 loops=1
Worker 3: actual time=5082.977..7973.635 rows=1400818 loops=1
Worker 4: actual time=5019.229..7847.522 rows=1499952 loops=1
Worker 5: actual time=5017.086..7667.116 rows=1398470 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.055..1378.237 rows=14285714 loops=7)
Output: id, txt
Worker 0: actual time=0.057..1349.870 rows=14533515 loops=1
Worker 1: actual time=0.052..1376.305 rows=13847620 loops=1
Worker 2: actual time=0.068..1382.226 rows=13836705 loops=1
Worker 3: actual time=0.071..1405.669 rows=13856130 loops=1
Worker 4: actual time=0.055..1406.186 rows=14677345 loops=1
Worker 5: actual time=0.045..1351.142 rows=15344825 loops=1
Planning Time: 0.250 ms
Execution Time: 10137.909 ms

sort aggregate:
set enable_hashagg = off;
set max_hashagg_batches = 0;
explain (verbose,costs off,analyze)
select sum(id),txt from gtest group by txt;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------
Finalize GroupAggregate (actual time=10370.559..116494.922 rows=10000000 loops=1)
Output: sum(id), txt
Group Key: gtest.txt
-> Gather Merge (actual time=10370.487..112470.148 rows=10000059 loops=1)
Output: txt, (PARTIAL sum(id))
Workers Planned: 6
Workers Launched: 6
-> Partial GroupAggregate (actual time=8608.563..24526.716 rows=1428580 loops=7)
Output: txt, PARTIAL sum(id)
Group Key: gtest.txt
Worker 0: actual time=8283.755..18641.475 rows=887626 loops=1
Worker 1: actual time=8303.984..26206.673 rows=1536832 loops=1
Worker 2: actual time=8290.611..28110.145 rows=1676544 loops=1
Worker 3: actual time=10347.326..29912.135 rows=1783536 loops=1
Worker 4: actual time=8329.604..20262.795 rows=980352 loops=1
Worker 5: actual time=8322.877..27957.446 rows=1758958 loops=1
-> Sort (actual time=8608.501..21752.009 rows=14285714 loops=7)
Output: txt, id
Sort Key: gtest.txt
Sort Method: external merge Disk: 349760kB
Worker 0: actual time=8283.648..16831.068 rows=8876115 loops=1
Sort Method: external merge Disk: 225832kB
Worker 1: actual time=8303.927..23053.078 rows=15368320 loops=1
Sort Method: external merge Disk: 391008kB
Worker 2: actual time=8290.556..24735.395 rows=16765440 loops=1
Sort Method: external merge Disk: 426552kB
Worker 3: actual time=10347.264..26438.333 rows=17835210 loops=1
Sort Method: external merge Disk: 453768kB
Worker 4: actual time=8329.534..18248.302 rows=9803520 loops=1
Sort Method: external merge Disk: 249408kB
Worker 5: actual time=8322.827..24480.383 rows=17589430 loops=1
Sort Method: external merge Disk: 447520kB
-> Parallel Seq Scan on public.gtest (actual time=51.618..1530.850 rows=14285714 loops=7)
Output: txt, id
Worker 0: actual time=49.907..1001.606 rows=8876115 loops=1
Worker 1: actual time=51.011..1665.980 rows=15368320 loops=1
Worker 2: actual time=50.087..1812.426 rows=16765440 loops=1
Worker 3: actual time=51.010..1828.299 rows=17835210 loops=1
Worker 4: actual time=42.614..1077.896 rows=9803520 loops=1
Worker 5: actual time=51.010..1790.012 rows=17589430 loops=1
Planning Time: 0.119 ms
Execution Time: 117475.380 ms
batch sort aggregate:
set max_sort_batches = 21;
explain (verbose,costs off,analyze)
select sum(id),txt from gtest group by txt;
QUERY PLAN
----------------------------------------------------------------------------------------------------------
Gather (actual time=18699.622..34438.083 rows=10000000 loops=1)
Output: (sum(id)), txt
Workers Planned: 6
Workers Launched: 6
-> GroupAggregate (actual time=18671.875..31121.607 rows=1428571 loops=7)
Output: sum(id), txt
Group Key: gtest.txt
Worker 0: actual time=18669.038..30913.680 rows=1427622 loops=1
Worker 1: actual time=18674.356..31045.516 rows=1430448 loops=1
Worker 2: actual time=18677.565..31375.340 rows=1427636 loops=1
Worker 3: actual time=18667.879..31359.458 rows=1427935 loops=1
Worker 4: actual time=18669.760..31263.414 rows=1430220 loops=1
Worker 5: actual time=18645.428..30813.141 rows=1427411 loops=1
-> Parallel BatchSort (actual time=18671.796..29348.606 rows=14285714 loops=7)
Output: txt, id
Sort Key: gtest.txt
batches: 21
Worker 0: actual time=18668.856..29172.519 rows=14276220 loops=1
Worker 1: actual time=18674.287..29280.794 rows=14304480 loops=1
Worker 2: actual time=18677.501..29569.974 rows=14276360 loops=1
Worker 3: actual time=18667.801..29558.286 rows=14279350 loops=1
Worker 4: actual time=18669.689..29468.636 rows=14302200 loops=1
Worker 5: actual time=18645.367..29076.665 rows=14274110 loops=1
-> Parallel Seq Scan on public.gtest (actual time=50.164..1893.727 rows=14285714 loops=7)
Output: txt, id
Worker 0: actual time=50.058..1818.959 rows=13953440 loops=1
Worker 1: actual time=50.974..1723.268 rows=13066735 loops=1
Worker 2: actual time=48.050..1855.469 rows=13985175 loops=1
Worker 3: actual time=49.640..1791.897 rows=12673240 loops=1
Worker 4: actual time=48.027..1932.927 rows=14586880 loops=1
Worker 5: actual time=51.151..2094.981 rows=16360290 loops=1
Planning Time: 0.160 ms
Execution Time: 34830.489 ms

normal grouping sets:
set enable_hashagg = on;
set max_sort_batches = 0;
set max_hashagg_batches = 0;
explain (costs off,verbose,analyze)
select sum(id),txt from gtest group by grouping sets(id,txt,());
QUERY PLAN
----------------------------------------------------------------------------------------------------------
MixedAggregate (actual time=4563.123..90348.608 rows=20000001 loops=1)
Output: sum(id), txt, id
Hash Key: gtest.txt
Group Key: gtest.id
Group Key: ()
Batches: 29 Memory Usage: 114737kB Disk Usage: 3241968kB
-> Gather Merge (actual time=4563.070..39429.593 rows=100000000 loops=1)
Output: txt, id
Workers Planned: 6
Workers Launched: 6
-> Sort (actual time=4493.638..7532.910 rows=14285714 loops=7)
Output: txt, id
Sort Key: gtest.id
Sort Method: external merge Disk: 353080kB
Worker 0: actual time=4474.665..7853.595 rows=14327510 loops=1
Sort Method: external merge Disk: 364528kB
Worker 1: actual time=4492.273..7796.141 rows=14613250 loops=1
Sort Method: external merge Disk: 371776kB
Worker 2: actual time=4472.937..7626.318 rows=14339905 loops=1
Sort Method: external merge Disk: 364840kB
Worker 3: actual time=4480.141..7730.419 rows=14406135 loops=1
Sort Method: external merge Disk: 366528kB
Worker 4: actual time=4490.723..7581.102 rows=13971200 loops=1
Sort Method: external merge Disk: 355096kB
Worker 5: actual time=4482.204..7894.434 rows=14464410 loops=1
Sort Method: external merge Disk: 368008kB
-> Parallel Seq Scan on public.gtest (actual time=27.040..1514.516 rows=14285714 loops=7)
Output: txt, id
Worker 0: actual time=23.111..1514.219 rows=14327510 loops=1
Worker 1: actual time=22.696..1528.771 rows=14613250 loops=1
Worker 2: actual time=23.119..1519.190 rows=14339905 loops=1
Worker 3: actual time=22.705..1525.183 rows=14406135 loops=1
Worker 4: actual time=23.134..1509.694 rows=13971200 loops=1
Worker 5: actual time=23.652..1516.585 rows=14464410 loops=1
Planning Time: 0.162 ms
Execution Time: 91915.597 ms

batch grouping sets:
set max_hashagg_batches = 100;
explain (costs off,verbose,analyze)
select sum(id),txt from gtest group by grouping sets(id,txt,());
QUERY PLAN
---------------------------------------------------------------------------------------------------
Gather (actual time=9082.581..23203.803 rows=20000001 loops=1)
Output: (sum(id)), txt, id
Workers Planned: 6
Workers Launched: 6
-> Parallel BatchHashAggregate (actual time=9040.895..15911.190 rows=2857143 loops=7)
Output: sum(id), txt, id
Group Key: gtest.id
Group Key: ()
Group Key: gtest.txt
Worker 0: actual time=9031.714..15499.292 rows=3101124 loops=1
Worker 1: actual time=9038.217..15403.655 rows=3100997 loops=1
Worker 2: actual time=9030.557..15157.267 rows=3103320 loops=1
Worker 3: actual time=9034.391..15537.851 rows=3100505 loops=1
Worker 4: actual time=9037.079..19823.359 rows=1400191 loops=1
Worker 5: actual time=9032.359..15012.338 rows=3097137 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.052..1506.109 rows=14285714 loops=7)
Output: id, txt
Worker 0: actual time=0.058..1521.705 rows=13759375 loops=1
Worker 1: actual time=0.054..1514.218 rows=13758635 loops=1
Worker 2: actual time=0.062..1531.244 rows=14456270 loops=1
Worker 3: actual time=0.050..1506.569 rows=14451930 loops=1
Worker 4: actual time=0.053..1495.908 rows=15411240 loops=1
Worker 5: actual time=0.055..1503.382 rows=14988885 loops=1
Planning Time: 0.160 ms
Execution Time: 24585.103 ms

normal union:
set max_hashagg_batches = 0;
set max_sort_batches = 0;
explain (verbose,costs false,analyze)
select * from gtest union select * from gtest;
QUERY PLAN
---------------------------------------------------------------------------------------------------------
Unique (actual time=53939.294..94666.573 rows=10000000 loops=1)
Output: gtest.id, gtest.txt
-> Sort (actual time=53939.292..76581.157 rows=200000000 loops=1)
Output: gtest.id, gtest.txt
Sort Key: gtest.id, gtest.txt
Sort Method: external merge Disk: 4871024kB
-> Append (actual time=0.020..25832.476 rows=200000000 loops=1)
-> Seq Scan on public.gtest (actual time=0.019..7074.113 rows=100000000 loops=1)
Output: gtest.id, gtest.txt
-> Seq Scan on public.gtest gtest_1 (actual time=0.006..7067.898 rows=100000000 loops=1)
Output: gtest_1.id, gtest_1.txt
Planning Time: 0.152 ms
Execution Time: 95765.297 ms

batch hash aggregate union:
set max_hashagg_batches = 100;
explain (verbose,costs false,analyze)
select * from gtest union select * from gtest;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------
Gather (actual time=11623.986..21021.317 rows=10000000 loops=1)
Output: gtest.id, gtest.txt
Workers Planned: 6
Workers Launched: 6
-> Parallel BatchHashAggregate (actual time=11636.753..16584.067 rows=1428571 loops=7)
Output: gtest.id, gtest.txt
Group Key: gtest.id, gtest.txt
Worker 0: actual time=11631.225..16846.376 rows=1500587 loops=1
Worker 1: actual time=11553.019..16233.006 rows=1397874 loops=1
Worker 2: actual time=11581.523..16807.962 rows=1499049 loops=1
Worker 3: actual time=11593.865..16416.381 rows=1399579 loops=1
Worker 4: actual time=11772.115..16783.605 rows=1400961 loops=1
Worker 5: actual time=11702.415..16571.841 rows=1400943 loops=1
-> Parallel Append (actual time=0.047..4339.450 rows=28571429 loops=7)
Worker 0: actual time=0.062..4396.130 rows=28591565 loops=1
Worker 1: actual time=0.053..4383.983 rows=29536360 loops=1
Worker 2: actual time=0.045..4305.253 rows=28282900 loops=1
Worker 3: actual time=0.053..4295.805 rows=28409625 loops=1
Worker 4: actual time=0.061..4314.450 rows=28363645 loops=1
Worker 5: actual time=0.015..4311.121 rows=29163585 loops=1
-> Parallel Seq Scan on public.gtest (actual time=0.030..1201.563 rows=14285714 loops=7)
Output: gtest.id, gtest.txt
Worker 0: actual time=0.019..281.903 rows=3277090 loops=1
Worker 1: actual time=0.050..2473.135 rows=29536360 loops=1
Worker 2: actual time=0.021..273.766 rows=3252955 loops=1
Worker 3: actual time=0.018..285.911 rows=3185145 loops=1
Worker 4: actual time=0.058..2387.626 rows=28363645 loops=1
Worker 5: actual time=0.013..2432.342 rows=29163585 loops=1
-> Parallel Seq Scan on public.gtest gtest_1 (actual time=0.048..2140.373 rows=25000000 loops=4)
Output: gtest_1.id, gtest_1.txt
Worker 0: actual time=0.059..2173.690 rows=25314475 loops=1
Worker 2: actual time=0.043..2114.314 rows=25029945 loops=1
Worker 3: actual time=0.050..2142.670 rows=25224480 loops=1
Planning Time: 0.137 ms
Execution Time: 21416.414 ms

bucoo@sohu.com

Attachments:

support-parallel-union-distinct-aggregate-using-batc.patchapplication/octet-stream; name=support-parallel-union-distinct-aggregate-using-batc.patchDownload
From 2f793936f4b1beb73dbcc13fabc8f4febc24a60d Mon Sep 17 00:00:00 2001
From: bucoo <bucoo@sohu.com>
Date: Mon, 25 Jan 2021 21:43:30 +0800
Subject: [PATCH] support parallel union/distinct/aggregate using batch hashagg
 and sort

---
 src/backend/commands/explain.c                    |  20 +
 src/backend/executor/Makefile                     |   1 +
 src/backend/executor/execAmi.c                    |   5 +
 src/backend/executor/execParallel.c               |  17 +
 src/backend/executor/execProcnode.c               |  10 +
 src/backend/executor/nodeAgg.c                    | 637 +++++++++++++++++++---
 src/backend/executor/nodeBatchSort.c              | 554 +++++++++++++++++++
 src/backend/nodes/copyfuncs.c                     |  22 +
 src/backend/nodes/outfuncs.c                      |  18 +
 src/backend/nodes/readfuncs.c                     |  19 +
 src/backend/optimizer/path/costsize.c             | 110 +++-
 src/backend/optimizer/plan/createplan.c           |  61 ++-
 src/backend/optimizer/plan/planner.c              | 305 +++++++++++
 src/backend/optimizer/plan/setrefs.c              |   1 +
 src/backend/optimizer/plan/subselect.c            |   6 +
 src/backend/optimizer/prep/prepunion.c            |  81 ++-
 src/backend/optimizer/util/pathnode.c             |  39 ++
 src/backend/optimizer/util/tlist.c                |  21 +
 src/backend/postmaster/pgstat.c                   |   6 +
 src/backend/storage/lmgr/lwlock.c                 |   2 -
 src/backend/utils/misc/guc.c                      |  26 +
 src/backend/utils/sort/Makefile                   |   3 +-
 src/backend/utils/sort/batchstore.c               | 466 ++++++++++++++++
 src/backend/utils/sort/sharedtuplestore.c         | 163 +++++-
 src/backend/utils/sort/tuplesort.c                |  20 +
 src/include/executor/nodeAgg.h                    |   2 +
 src/include/executor/nodeBatchSort.h              |  19 +
 src/include/nodes/execnodes.h                     |  19 +
 src/include/nodes/nodes.h                         |   6 +-
 src/include/nodes/pathnodes.h                     |  11 +
 src/include/nodes/plannodes.h                     |  15 +
 src/include/optimizer/cost.h                      |   7 +
 src/include/optimizer/pathnode.h                  |   7 +
 src/include/optimizer/tlist.h                     |   1 +
 src/include/pgstat.h                              |   4 +-
 src/include/storage/lwlock.h                      |   1 -
 src/include/utils/batchstore.h                    |  38 ++
 src/include/utils/sharedtuplestore.h              |  14 +
 src/include/utils/tuplesort.h                     |   1 +
 src/test/regress/expected/groupingsets.out        |  65 +++
 src/test/regress/expected/partition_aggregate.out | 138 +++++
 src/test/regress/expected/select_distinct.out     |  77 +++
 src/test/regress/expected/select_parallel.out     |  86 +++
 src/test/regress/expected/union.out               |  85 +++
 src/test/regress/sql/groupingsets.sql             |  10 +
 src/test/regress/sql/partition_aggregate.sql      |  45 ++
 src/test/regress/sql/select_distinct.sql          |  23 +
 src/test/regress/sql/select_parallel.sql          |  32 ++
 src/test/regress/sql/union.sql                    |  28 +
 49 files changed, 3223 insertions(+), 124 deletions(-)
 create mode 100644 src/backend/executor/nodeBatchSort.c
 create mode 100644 src/backend/utils/sort/batchstore.c
 create mode 100644 src/include/executor/nodeBatchSort.h
 create mode 100644 src/include/utils/batchstore.h

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 5d7eb3574c..cd38289fbb 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1283,6 +1283,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_Sort:
 			pname = sname = "Sort";
 			break;
+		case T_BatchSort:
+			pname = sname = "BatchSort";
+			break;
 		case T_IncrementalSort:
 			pname = sname = "Incremental Sort";
 			break;
@@ -1312,6 +1315,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 						pname = "MixedAggregate";
 						strategy = "Mixed";
 						break;
+					case AGG_BATCH_HASH:
+						pname = "BatchHashAggregate";
+						strategy = "BatchHashed";
+						break;
 					default:
 						pname = "Aggregate ???";
 						strategy = "???";
@@ -1946,6 +1953,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			show_sort_keys(castNode(SortState, planstate), ancestors, es);
 			show_sort_info(castNode(SortState, planstate), es);
 			break;
+		case T_BatchSort:
+			{
+				BatchSort *bsort = (BatchSort*)plan;
+				show_sort_group_keys(planstate, "Sort Key",
+									 bsort->sort.numCols, 0, bsort->sort.sortColIdx,
+									 bsort->sort.sortOperators, bsort->sort.collations,
+									 bsort->sort.nullsFirst,
+									 ancestors, es);
+				if (es->verbose)
+					ExplainPropertyInteger("batches", NULL, bsort->numBatches, es);
+			}
+			break;
 		case T_IncrementalSort:
 			show_incremental_sort_keys(castNode(IncrementalSortState, planstate),
 									   ancestors, es);
@@ -3054,6 +3073,7 @@ show_hashagg_info(AggState *aggstate, ExplainState *es)
 	int64		memPeakKb = (aggstate->hash_mem_peak + 1023) / 1024;
 
 	if (agg->aggstrategy != AGG_HASHED &&
+		agg->aggstrategy != AGG_BATCH_HASH &&
 		agg->aggstrategy != AGG_MIXED)
 		return;
 
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile
index f990c6473a..a4855a8881 100644
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@@ -33,6 +33,7 @@ OBJS = \
 	instrument.o \
 	nodeAgg.o \
 	nodeAppend.o \
+	nodeBatchSort.o \
 	nodeBitmapAnd.o \
 	nodeBitmapHeapscan.o \
 	nodeBitmapIndexscan.o \
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index 23bdb53cd1..ecbcc09b51 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -17,6 +17,7 @@
 #include "executor/execdebug.h"
 #include "executor/nodeAgg.h"
 #include "executor/nodeAppend.h"
+#include "executor/nodeBatchSort.h"
 #include "executor/nodeBitmapAnd.h"
 #include "executor/nodeBitmapHeapscan.h"
 #include "executor/nodeBitmapIndexscan.h"
@@ -253,6 +254,10 @@ ExecReScan(PlanState *node)
 			ExecReScanSort((SortState *) node);
 			break;
 
+		case T_BatchSortState:
+			ExecReScanBatchSort((BatchSortState *)node);
+			break;
+
 		case T_IncrementalSortState:
 			ExecReScanIncrementalSort((IncrementalSortState *) node);
 			break;
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index c95d5170e4..98937302c2 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -27,6 +27,7 @@
 #include "executor/executor.h"
 #include "executor/nodeAgg.h"
 #include "executor/nodeAppend.h"
+#include "executor/nodeBatchSort.h"
 #include "executor/nodeBitmapHeapscan.h"
 #include "executor/nodeCustom.h"
 #include "executor/nodeForeignscan.h"
@@ -284,6 +285,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortEstimate((SortState *) planstate, e->pcxt);
 			break;
+		case T_BatchSortState:
+			if (planstate->plan->parallel_aware)
+				ExecBatchSortEstimate((BatchSortState*)planstate, e->pcxt);
+			break;
 		case T_IncrementalSortState:
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt);
@@ -504,6 +509,10 @@ ExecParallelInitializeDSM(PlanState *planstate,
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortInitializeDSM((SortState *) planstate, d->pcxt);
 			break;
+		case T_BatchSortState:
+			if (planstate->plan->parallel_aware)
+				ExecBatchSortInitializeDSM((BatchSortState*)planstate, d->pcxt);
+			break;
 		case T_IncrementalSortState:
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt);
@@ -990,6 +999,10 @@ ExecParallelReInitializeDSM(PlanState *planstate,
 		case T_IncrementalSortState:
 			/* these nodes have DSM state, but no reinitialization is required */
 			break;
+		case T_BatchSortState:
+			if (planstate->plan->parallel_aware)
+				ExecBatchSortReInitializeDSM((BatchSortState*)planstate, pcxt);
+			break;
 
 		default:
 			break;
@@ -1340,6 +1353,10 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortInitializeWorker((SortState *) planstate, pwcxt);
 			break;
+		case T_BatchSortState:
+			if (planstate->plan->parallel_aware)
+				ExecBatchSortInitializeWorker((BatchSortState*)planstate, pwcxt);
+			break;
 		case T_IncrementalSortState:
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate,
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 414df50a05..d51750e4fb 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -75,6 +75,7 @@
 #include "executor/executor.h"
 #include "executor/nodeAgg.h"
 #include "executor/nodeAppend.h"
+#include "executor/nodeBatchSort.h"
 #include "executor/nodeBitmapAnd.h"
 #include "executor/nodeBitmapHeapscan.h"
 #include "executor/nodeBitmapIndexscan.h"
@@ -314,6 +315,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 												estate, eflags);
 			break;
 
+		case T_BatchSort:
+			result = (PlanState *) ExecInitBatchSort((BatchSort *) node,
+													 estate, eflags);
+			break;
+
 		case T_IncrementalSort:
 			result = (PlanState *) ExecInitIncrementalSort((IncrementalSort *) node,
 														   estate, eflags);
@@ -699,6 +705,10 @@ ExecEndNode(PlanState *node)
 			ExecEndSort((SortState *) node);
 			break;
 
+		case T_BatchSortState:
+			ExecEndBatchSort((BatchSortState *) node);
+			break;
+
 		case T_IncrementalSortState:
 			ExecEndIncrementalSort((IncrementalSortState *) node);
 			break;
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 601b6dab03..ad5e67eddf 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -256,7 +256,10 @@
 #include "optimizer/optimizer.h"
 #include "parser/parse_agg.h"
 #include "parser/parse_coerce.h"
+#include "pgstat.h"
+#include "storage/barrier.h"
 #include "utils/acl.h"
+#include "utils/batchstore.h"
 #include "utils/builtins.h"
 #include "utils/datum.h"
 #include "utils/dynahash.h"
@@ -311,6 +314,11 @@
  */
 #define CHUNKHDRSZ 16
 
+#define SHARED_AGG_MAGIC		UINT64CONST(0x4141bbcd61518e52)
+#define SHARED_AGG_KEY_INFO		UINT64CONST(0xD000000000000001)
+#define SHARED_AGG_KEY_BARRIER	UINT64CONST(0xD000000000000002)
+#define SHARED_AGG_KEY_FILE_SET	UINT64CONST(0xD000000000000003)
+
 /*
  * Track all tapes needed for a HashAgg that spills. We don't know the maximum
  * number of tapes needed at the start of the algorithm (because it can
@@ -446,7 +454,7 @@ static HashAggBatch *hashagg_batch_new(LogicalTapeSet *tapeset,
 									   int input_tapenum, int setno,
 									   int64 input_tuples, double input_card,
 									   int used_bits);
-static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp);
+static bool hashagg_batch_read(void *userdata, TupleTableSlot *slot, uint32 *hashp);
 static void hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo,
 							   int used_bits, double input_groups,
 							   double hashentrysize);
@@ -465,6 +473,8 @@ static void build_pertrans_for_aggref(AggStatePerTrans pertrans,
 									  Oid aggserialfn, Oid aggdeserialfn,
 									  Datum initValue, bool initValueIsNull,
 									  Oid *inputTypes, int numArguments);
+static TupleTableSlot *ExecBatchHashAggPrepare(PlanState *pstate);
+static TupleTableSlot *ExecBatchHashAggFirstReScan(PlanState *pstate);
 
 
 /*
@@ -1330,7 +1340,8 @@ finalize_aggregates(AggState *aggstate,
 		if (pertrans->numSortCols > 0)
 		{
 			Assert(aggstate->aggstrategy != AGG_HASHED &&
-				   aggstate->aggstrategy != AGG_MIXED);
+				   aggstate->aggstrategy != AGG_MIXED &&
+				   aggstate->aggstrategy != AGG_BATCH_HASH);
 
 			if (pertrans->numInputs == 1)
 				process_ordered_aggregate_single(aggstate,
@@ -1502,8 +1513,10 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 	MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
 	MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
 	Size		additionalsize;
+	bool		use_hash_iv;
 
 	Assert(aggstate->aggstrategy == AGG_HASHED ||
+		   aggstate->aggstrategy == AGG_BATCH_HASH ||
 		   aggstate->aggstrategy == AGG_MIXED);
 
 	/*
@@ -1514,6 +1527,15 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 	 */
 	additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData);
 
+	/*
+	 * In batch hash mode each worker must use the same hash algorithm,
+	 * because we need to ensure that the same records(group clauses) are in the same batch
+	 */
+	if (aggstate->aggstrategy == AGG_BATCH_HASH)
+		use_hash_iv = false;
+	else
+		use_hash_iv = DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit);
+
 	perhash->hashtable = BuildTupleHashTableExt(&aggstate->ss.ps,
 												perhash->hashslot->tts_tupleDescriptor,
 												perhash->numCols,
@@ -1526,7 +1548,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 												metacxt,
 												hashcxt,
 												tmpcxt,
-												DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
+												use_hash_iv);
 }
 
 /*
@@ -1625,10 +1647,15 @@ find_hash_columns(AggState *aggstate)
 			palloc(maxCols * sizeof(AttrNumber));
 		perhash->hashGrpColIdxHash =
 			palloc(perhash->numCols * sizeof(AttrNumber));
+		perhash->colnos_needed = bms_copy(aggregated_colnos);
 
 		/* Add all the grouping columns to colnos */
 		for (i = 0; i < perhash->numCols; i++)
+		{
 			colnos = bms_add_member(colnos, grpColIdx[i]);
+			perhash->colnos_needed = bms_add_member(perhash->colnos_needed,
+													grpColIdx[i]);
+		}
 
 		/*
 		 * First build mapping for columns directly hashed. These are the
@@ -1738,9 +1765,11 @@ hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
 	int			j = nullcheck ? 1 : 0;
 
 	Assert(aggstate->aggstrategy == AGG_HASHED ||
+		   aggstate->aggstrategy == AGG_BATCH_HASH ||
 		   aggstate->aggstrategy == AGG_MIXED);
 
-	if (aggstate->aggstrategy == AGG_HASHED)
+	if (aggstate->aggstrategy == AGG_HASHED ||
+		aggstate->aggstrategy == AGG_BATCH_HASH)
 		phase = &aggstate->phases[0];
 	else						/* AGG_MIXED */
 		phase = &aggstate->phases[1];
@@ -2168,6 +2197,9 @@ ExecAgg(PlanState *pstate)
 			case AGG_SORTED:
 				result = agg_retrieve_direct(node);
 				break;
+			case AGG_BATCH_HASH:
+				elog(ERROR, "batch hash should not run in function ExecAgg");
+				break;
 		}
 
 		if (!TupIsNull(result))
@@ -2568,35 +2600,20 @@ agg_fill_hash_table(AggState *aggstate)
 						   &aggstate->perhash[0].hashiter);
 }
 
-/*
- * If any data was spilled during hash aggregation, reset the hash table and
- * reprocess one batch of spilled data. After reprocessing a batch, the hash
- * table will again contain data, ready to be consumed by
- * agg_retrieve_hash_table_in_memory().
- *
- * Should only be called after all in memory hash table entries have been
- * finalized and emitted.
- *
- * Return false when input is exhausted and there's no more work to be done;
- * otherwise return true.
- */
-static bool
-agg_refill_hash_table(AggState *aggstate)
+static void
+agg_refill_hash_table_ex(AggState *aggstate,
+						 bool (*read_tup)(void *userdata, TupleTableSlot *slot, uint32 *hash),
+						 void *userdata,
+						 int used_bits,
+						 double input_groups,
+						 int setno)
 {
-	HashAggBatch *batch;
 	AggStatePerHash perhash;
 	HashAggSpill spill;
-	HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo;
 	bool		spill_initialized = false;
 
-	if (aggstate->hash_batches == NIL)
-		return false;
-
-	batch = linitial(aggstate->hash_batches);
-	aggstate->hash_batches = list_delete_first(aggstate->hash_batches);
-
-	hash_agg_set_limits(aggstate->hashentrysize, batch->input_card,
-						batch->used_bits, &aggstate->hash_mem_limit,
+	hash_agg_set_limits(aggstate->hashentrysize, input_groups,
+						used_bits, &aggstate->hash_mem_limit,
 						&aggstate->hash_ngroups_limit, NULL);
 
 	/*
@@ -2628,7 +2645,7 @@ agg_refill_hash_table(AggState *aggstate)
 		aggstate->phase = &aggstate->phases[aggstate->current_phase];
 	}
 
-	select_current_set(aggstate, batch->setno, true);
+	select_current_set(aggstate, setno, true);
 
 	perhash = &aggstate->perhash[aggstate->current_set];
 
@@ -2646,31 +2663,27 @@ agg_refill_hash_table(AggState *aggstate)
 		TupleTableSlot *spillslot = aggstate->hash_spill_rslot;
 		TupleTableSlot *hashslot = perhash->hashslot;
 		TupleHashEntry entry;
-		MinimalTuple tuple;
 		uint32		hash;
 		bool		isnew = false;
 		bool	   *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
 
 		CHECK_FOR_INTERRUPTS();
 
-		tuple = hashagg_batch_read(batch, &hash);
-		if (tuple == NULL)
+		if ((*read_tup)(userdata, spillslot, &hash) == false)
 			break;
 
-		ExecStoreMinimalTuple(tuple, spillslot, true);
 		aggstate->tmpcontext->ecxt_outertuple = spillslot;
 
 		prepare_hash_slot(perhash,
-						  aggstate->tmpcontext->ecxt_outertuple,
+						  spillslot,
 						  hashslot);
-		entry = LookupTupleHashEntryHash(
-										 perhash->hashtable, hashslot, p_isnew, hash);
+		entry = LookupTupleHashEntryHash(perhash->hashtable, hashslot, p_isnew, hash);
 
 		if (entry != NULL)
 		{
 			if (isnew)
 				initialize_hash_entry(aggstate, perhash->hashtable, entry);
-			aggstate->hash_pergroup[batch->setno] = entry->additional;
+			aggstate->hash_pergroup[setno] = entry->additional;
 			advance_aggregates(aggstate);
 		}
 		else
@@ -2682,13 +2695,13 @@ agg_refill_hash_table(AggState *aggstate)
 				 * that we don't assign tapes that will never be used.
 				 */
 				spill_initialized = true;
-				hashagg_spill_init(&spill, tapeinfo, batch->used_bits,
-								   batch->input_card, aggstate->hashentrysize);
+				hashagg_spill_init(&spill, aggstate->hash_tapeinfo, used_bits,
+								   input_groups, aggstate->hashentrysize);
 			}
 			/* no memory for a new group, spill */
 			hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
 
-			aggstate->hash_pergroup[batch->setno] = NULL;
+			aggstate->hash_pergroup[setno] = NULL;
 		}
 
 		/*
@@ -2698,15 +2711,13 @@ agg_refill_hash_table(AggState *aggstate)
 		ResetExprContext(aggstate->tmpcontext);
 	}
 
-	hashagg_tapeinfo_release(tapeinfo, batch->input_tapenum);
-
 	/* change back to phase 0 */
 	aggstate->current_phase = 0;
 	aggstate->phase = &aggstate->phases[aggstate->current_phase];
 
 	if (spill_initialized)
 	{
-		hashagg_spill_finish(aggstate, &spill, batch->setno);
+		hashagg_spill_finish(aggstate, &spill, setno);
 		hash_agg_update_metrics(aggstate, true, spill.npartitions);
 	}
 	else
@@ -2715,9 +2726,43 @@ agg_refill_hash_table(AggState *aggstate)
 	aggstate->hash_spill_mode = false;
 
 	/* prepare to walk the first hash table */
-	select_current_set(aggstate, batch->setno, true);
-	ResetTupleHashIterator(aggstate->perhash[batch->setno].hashtable,
-						   &aggstate->perhash[batch->setno].hashiter);
+	select_current_set(aggstate, setno, true);
+	ResetTupleHashIterator(aggstate->perhash[setno].hashtable,
+						   &aggstate->perhash[setno].hashiter);
+}
+
+/*
+ * If any data was spilled during hash aggregation, reset the hash table and
+ * reprocess one batch of spilled data. After reprocessing a batch, the hash
+ * table will again contain data, ready to be consumed by
+ * agg_retrieve_hash_table_in_memory().
+ *
+ * Should only be called after all in memory hash table entries have been
+ * finalized and emitted.
+ *
+ * Return false when input is exhausted and there's no more work to be done;
+ * otherwise return true.
+ */
+static bool
+agg_refill_hash_table(AggState *aggstate)
+{
+	HashAggBatch *batch;
+
+	if (aggstate->hash_batches == NIL)
+		return false;
+
+	batch = linitial(aggstate->hash_batches);
+	aggstate->hash_batches = list_delete_first(aggstate->hash_batches);
+
+	agg_refill_hash_table_ex(aggstate,
+							 hashagg_batch_read,
+							 batch,
+							 batch->used_bits,
+							 batch->input_card,
+							 batch->setno);
+
+	hashagg_tapeinfo_release(aggstate->hash_tapeinfo,
+							 batch->input_tapenum);
 
 	pfree(batch);
 
@@ -3056,11 +3101,12 @@ hashagg_batch_new(LogicalTapeSet *tapeset, int tapenum, int setno,
 
 /*
  * read_spilled_tuple
- * 		read the next tuple from a batch's tape.  Return NULL if no more.
+ * 		read the next tuple from a batch's tape.  Return false if no more.
  */
-static MinimalTuple
-hashagg_batch_read(HashAggBatch *batch, uint32 *hashp)
+static bool
+hashagg_batch_read(void *userdata, TupleTableSlot *slot, uint32 *hashp)
 {
+	HashAggBatch *batch = userdata;
 	LogicalTapeSet *tapeset = batch->tapeset;
 	int			tapenum = batch->input_tapenum;
 	MinimalTuple tuple;
@@ -3070,7 +3116,7 @@ hashagg_batch_read(HashAggBatch *batch, uint32 *hashp)
 
 	nread = LogicalTapeRead(tapeset, tapenum, &hash, sizeof(uint32));
 	if (nread == 0)
-		return NULL;
+		return false;
 	if (nread != sizeof(uint32))
 		ereport(ERROR,
 				(errcode_for_file_access(),
@@ -3098,7 +3144,8 @@ hashagg_batch_read(HashAggBatch *batch, uint32 *hashp)
 				 errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes",
 						tapenum, t_len - sizeof(uint32), nread)));
 
-	return tuple;
+	ExecStoreMinimalTuple(tuple, slot, true);
+	return true;
 }
 
 /*
@@ -3261,6 +3308,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	int			i = 0;
 	int			j = 0;
 	bool		use_hashing = (node->aggstrategy == AGG_HASHED ||
+							   node->aggstrategy == AGG_BATCH_HASH ||
 							   node->aggstrategy == AGG_MIXED);
 
 	/* check for unsupported flags */
@@ -3272,7 +3320,10 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	aggstate = makeNode(AggState);
 	aggstate->ss.ps.plan = (Plan *) node;
 	aggstate->ss.ps.state = estate;
-	aggstate->ss.ps.ExecProcNode = ExecAgg;
+	if (node->aggstrategy == AGG_BATCH_HASH)
+		aggstate->ss.ps.ExecProcNode = ExecBatchHashAggPrepare;
+	else
+		aggstate->ss.ps.ExecProcNode = ExecAgg;
 
 	aggstate->aggs = NIL;
 	aggstate->numaggs = 0;
@@ -3319,7 +3370,8 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 			 * additional AGG_HASHED aggs become part of phase 0, but all
 			 * others add an extra phase.
 			 */
-			if (agg->aggstrategy != AGG_HASHED)
+			if (agg->aggstrategy != AGG_HASHED &&
+				agg->aggstrategy != AGG_BATCH_HASH)
 				++numPhases;
 			else
 				++numHashes;
@@ -3366,7 +3418,8 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	 * If we are doing a hashed aggregation then the child plan does not need
 	 * to handle REWIND efficiently; see ExecReScanAgg.
 	 */
-	if (node->aggstrategy == AGG_HASHED)
+	if (node->aggstrategy == AGG_HASHED ||
+		node->aggstrategy == AGG_BATCH_HASH)
 		eflags &= ~EXEC_FLAG_REWIND;
 	outerPlan = outerPlan(node);
 	outerPlanState(aggstate) = ExecInitNode(outerPlan, estate, eflags);
@@ -3374,9 +3427,20 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	/*
 	 * initialize source tuple type.
 	 */
-	aggstate->ss.ps.outerops =
-		ExecGetResultSlotOps(outerPlanState(&aggstate->ss),
-							 &aggstate->ss.ps.outeropsfixed);
+	if (node->aggstrategy == AGG_BATCH_HASH)
+	{
+		/*
+		 * When we prepared tuples from outer tree, we always read
+		 * tuples from saved files and it is MinimalTuple
+		 */
+		aggstate->ss.ps.outerops = &TTSOpsMinimalTuple;
+		aggstate->ss.ps.outeropsfixed = true;
+	}else
+	{
+		aggstate->ss.ps.outerops =
+			ExecGetResultSlotOps(outerPlanState(&aggstate->ss),
+								 &aggstate->ss.ps.outeropsfixed);
+	}
 	aggstate->ss.ps.outeropsset = true;
 
 	ExecCreateScanSlotFromOuterPlan(estate, &aggstate->ss,
@@ -3484,6 +3548,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 		Assert(phase <= 1 || sortnode);
 
 		if (aggnode->aggstrategy == AGG_HASHED
+			|| aggnode->aggstrategy == AGG_BATCH_HASH
 			|| aggnode->aggstrategy == AGG_MIXED)
 		{
 			AggStatePerPhase phasedata = &aggstate->phases[0];
@@ -3696,7 +3761,8 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	 * hashing is being done too, then phase 0 is processed last); but if only
 	 * hashing is being done, then phase 0 is all there is.
 	 */
-	if (node->aggstrategy == AGG_HASHED)
+	if (node->aggstrategy == AGG_HASHED ||
+		node->aggstrategy == AGG_BATCH_HASH)
 	{
 		aggstate->current_phase = 0;
 		initialize_phase(aggstate, 0);
@@ -4011,7 +4077,8 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 			dohash = false;
 			dosort = true;
 		}
-		else if (phase->aggstrategy == AGG_HASHED)
+		else if (phase->aggstrategy == AGG_HASHED ||
+				 phase->aggstrategy == AGG_BATCH_HASH)
 		{
 			dohash = true;
 			dosort = false;
@@ -4470,6 +4537,53 @@ ExecReScanAgg(AggState *node)
 			return;
 		}
 	}
+	else if (node->aggstrategy == AGG_BATCH_HASH)
+	{
+		/* If we haven't yet filled the batch store then we can just return */
+		if (!node->batch_filled)
+			return;
+
+		/*
+		 * If we filled the batch store and subplan does not have
+		 * any parameter changes(except only gather parameter),
+		 * we can rescan from batch store
+		 */
+		if (outerPlan->chgParam == NULL ||
+			(aggnode->gatherParam >= 0 &&
+			 bms_membership(outerPlan->chgParam) == BMS_SINGLETON &&
+			 bms_is_member(aggnode->gatherParam, outerPlan->chgParam)))
+		{
+			/* Rescan each set's batch store */
+			for (setno = 0; setno < numGroupingSets; ++setno)
+				bs_rescan(node->perhash[setno].batch_store);
+
+			/*
+			 * We don't want call ExecBatchHashAggNextBatch() for now,
+			 * this will immediately take up a batch, which is not what we want
+			 * in parallel mode. Because it is possible that other workers
+			 * will finish processing other batches very quickly,
+			 * and we did occupy a batch and we have been in no hurry to deal with it.
+			 *
+			 * So we use a new function ExecBatchHashAggFirstReScan call
+			 * ExecBatchHashAggNextBatch
+			 */
+			ExecSetExecProcNode(&node->ss.ps, ExecBatchHashAggFirstReScan);
+
+			return;
+		}
+
+		/* Clear all set's data */
+		for (setno = 0; setno < numGroupingSets; ++setno)
+			bs_clear(node->perhash[setno].batch_store);
+
+		/* Reset build barrier */
+		if (node->batch_barrier)
+			BarrierInit(node->batch_barrier, 0);
+
+		ExecSetExecProcNode(&node->ss.ps, ExecBatchHashAggPrepare);
+		node->batch_filled = false;
+		node->current_batch_set = 0;
+	}
 
 	/* Make sure we have closed any open tuplesorts */
 	for (transno = 0; transno < node->numtrans; transno++)
@@ -4512,11 +4626,13 @@ ExecReScanAgg(AggState *node)
 	MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numaggs);
 
 	/*
-	 * With AGG_HASHED/MIXED, the hash table is allocated in a sub-context of
+	 * With AGG_HASHED/AGG_BATCH_HASH/MIXED, the hash table is allocated in a sub-context of
 	 * the hashcontext. This used to be an issue, but now, resetting a context
 	 * automatically deletes sub-contexts too.
 	 */
-	if (node->aggstrategy == AGG_HASHED || node->aggstrategy == AGG_MIXED)
+	if (node->aggstrategy == AGG_HASHED ||
+		node->aggstrategy == AGG_BATCH_HASH ||
+		node->aggstrategy == AGG_MIXED)
 	{
 		hashagg_reset_spill_state(node);
 
@@ -4533,7 +4649,8 @@ ExecReScanAgg(AggState *node)
 		hashagg_recompile_expressions(node, false, false);
 	}
 
-	if (node->aggstrategy != AGG_HASHED)
+	if (node->aggstrategy != AGG_HASHED &&
+		node->aggstrategy != AGG_BATCH_HASH)
 	{
 		/*
 		 * Reset the per-group state (in particular, mark transvalues null)
@@ -4738,12 +4855,277 @@ AggRegisterCallback(FunctionCallInfo fcinfo,
 	elog(ERROR, "aggregate function cannot register a callback in this context");
 }
 
+/* Read tuple from batch store, Return false if no more */
+static bool
+batchstore_read(void *userdata, TupleTableSlot *slot, uint32 *hashp)
+{
+	MinimalTuple mtup = bs_read_hash(userdata, hashp);
+	if (unlikely(mtup == NULL))
+		return false;
+	ExecStoreMinimalTuple(mtup, slot, false);
+	return true;
+}
+
+/*
+ * Check have next batch. if not return false
+ * else open it and fill tuples into hash table
+ */
+static bool
+ExecBatchHashAggNextBatch(AggState *node)
+{
+	while (bs_next_batch(node->perhash[node->current_batch_set].batch_store, false) == false)
+	{
+		++node->current_batch_set;
+		if (node->current_batch_set >= node->num_hashes)
+		{
+			node->agg_done = true;
+			return false;
+		}
+	}
+
+	/* refill hash table from batch store */
+	agg_refill_hash_table_ex(node,
+							 batchstore_read,
+							 node->perhash[node->current_batch_set].batch_store,
+							 0,
+							 node->perhash[node->current_batch_set].aggnode->numGroups,
+							 node->current_batch_set);
+	return true;
+}
+
+static TupleTableSlot *
+ExecBatchHashAgg(PlanState *pstate)
+{
+	AggState	   *node = castNode(AggState, pstate);
+	TupleTableSlot *result;
+
+reloop:
+	result = agg_retrieve_hash_table_in_memory(node);
+	if (unlikely(result == NULL))
+	{
+		if (agg_refill_hash_table(node) == false &&
+			ExecBatchHashAggNextBatch(node) == false)
+		{
+			return NULL;
+		}else
+		{
+			goto reloop;
+		}
+	}
+
+	return result;
+}
+
+/*
+ * Test have next batch, set real execute function and call it if have next batch,
+ * else return an empty TupleTableSlot
+ */
+TupleTableSlot *
+ExecBatchHashAggFirstReScan(PlanState *pstate)
+{
+	AggState   *node = castNode(AggState, pstate);
+
+	node->current_batch_set = 0;
+	if (ExecBatchHashAggNextBatch(node) == false)
+		return ExecClearTuple(pstate->ps_ResultTupleSlot);
+
+	ExecSetExecProcNode(&node->ss.ps, ExecBatchHashAgg);
+	return ExecBatchHashAgg(pstate);
+}
+
+/*
+ * ExecBatchHashAggPrepare -
+ *  ExecBatchHashAggPrepare receives tuples from its outer subplan and save
+ *  into batchstore. after finish save tuples set execute function as
+ *  ExecBatchHashAgg and call it.
+ */
+static TupleTableSlot *
+ExecBatchHashAggPrepare(PlanState *pstate)
+{
+	int				i,x,max_colno_needed;
+	MinimalTuple	mtup;
+	TupleTableSlot *inputslot;
+	PlanState	   *outer = outerPlanState(pstate);
+	AggState	   *node = castNode(AggState, pstate);
+	ExprContext	   *tmpcontext = node->tmpcontext;
+	bool		   *isnull;
+	Bitmapset	   *colnos_needed;
+	Bitmapset	  **colnos_neededs;
+	Assert(node->aggstrategy == AGG_BATCH_HASH);
+	Assert(node->perhash[0].batch_store == NULL ||
+		   node->batch_barrier != NULL);
+
+	if (node->agg_done)
+		return NULL;
+
+	/* create batch store if not parallel */
+	if (node->perhash[0].batch_store == NULL)
+	{
+		MemoryContext	oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(pstate));
+		Agg			   *agg = castNode(Agg, pstate->plan);
+		ListCell	   *lc;
+
+		node->perhash[0].batch_store = bs_begin_hash(agg->numBatches);
+
+		i = 1;
+		foreach (lc, agg->chain)
+		{
+			Agg *subagg = lfirst_node(Agg, lc);
+			Assert(subagg->aggstrategy == AGG_BATCH_HASH);
+			Assert(i < node->num_hashes);
+			node->perhash[i].batch_store = bs_begin_hash(subagg->numBatches);
+			++i;
+		}
+
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	if (node->batch_barrier &&
+		BarrierAttach(node->batch_barrier) > 0)
+	{
+		BarrierDetach(node->batch_barrier);
+		goto batches_already_done_;
+	}
+
+	/* read for make minimal tuple */
+	isnull = palloc(sizeof(isnull[0]) * node->hash_spill_wslot->tts_tupleDescriptor->natts);
+	memset(isnull, true, sizeof(isnull[0]) * node->hash_spill_wslot->tts_tupleDescriptor->natts);
+	max_colno_needed = node->max_colno_needed;
+
+	/* convert Attribute numbers to index(start with 0) */
+	colnos_neededs = palloc(sizeof(colnos_neededs[0]) * node->num_hashes);
+	for (i=0;i<node->num_hashes;++i)
+	{
+		AggStatePerHash	perhash = &node->perhash[i];
+		colnos_needed = NULL;
+		x = -1;
+		while ((x=bms_next_member(perhash->colnos_needed, x)) >= 0)
+		{
+			Assert(x > 0);
+			colnos_needed = bms_add_member(colnos_needed, x-1);
+		}
+		colnos_neededs[i] = colnos_needed;
+	}
+
+	for (;;)
+	{
+		CHECK_FOR_INTERRUPTS();
+		inputslot = ExecProcNode(outer);
+		if (TupIsNull(inputslot))
+			break;
+
+		tmpcontext->ecxt_outertuple = inputslot;
+		slot_getsomeattrs(inputslot, max_colno_needed);
+
+		for (i=0;i<node->num_hashes;++i)
+		{
+			AggStatePerHash	perhash = &node->perhash[i];
+			TupleTableSlot *hashslot = perhash->hashslot;
+
+			CHECK_FOR_INTERRUPTS();
+
+			/* mark unneeded columns as null */
+			memset(isnull, true, sizeof(isnull[0]) * max_colno_needed);
+			colnos_needed = colnos_neededs[i];
+			x = -1;
+			while ((x = bms_next_member(colnos_needed, x)) >= 0)
+				isnull[x] = inputslot->tts_isnull[x];
+			/* make minimal tuple from we needed columns for this set */
+			mtup = heap_form_minimal_tuple(inputslot->tts_tupleDescriptor,
+										   inputslot->tts_values,
+										   isnull);
+
+			prepare_hash_slot(perhash, inputslot, hashslot);
+
+			bs_write_hash(perhash->batch_store,
+						  mtup,
+						  TupleHashTableHash(perhash->hashtable, hashslot));
+			pfree(mtup);
+			ResetExprContext(tmpcontext);
+		}
+	}
+
+	/* Before read, must call bs_end_write() function */
+	for (i=0;i<node->num_hashes;++i)
+		bs_end_write(node->perhash[i].batch_store);
+
+	if (node->batch_barrier)
+	{
+		/* wait other workers finish write */
+		BarrierArriveAndWait(node->batch_barrier, WAIT_EVENT_BATCH_HASH_BUILD);
+		BarrierDetach(node->batch_barrier);
+	}
+
+	/* clear temp memory */
+	for (i=0;i<node->num_hashes;++i)
+		bms_free(colnos_neededs[i]);
+	pfree(colnos_neededs);
+	pfree(isnull);
+
+batches_already_done_:
+	node->batch_filled = true;
+	node->current_batch_set = 0;
+	if (ExecBatchHashAggNextBatch(node) == false)
+		return NULL;
+
+	ExecSetExecProcNode(pstate, ExecBatchHashAgg);
+	return ExecBatchHashAgg(pstate);
+}
 
 /* ----------------------------------------------------------------
  *						Parallel Query Support
  * ----------------------------------------------------------------
  */
 
+static Size
+ExecAggEstimateToc(AggState *node, ParallelContext *pcxt)
+{
+	Size				size;
+	shm_toc_estimator	estimator;
+	ListCell		   *lc;
+
+	/* don't need this if no workers */
+	if (pcxt->nworkers == 0)
+		return 0;
+	/* don't need this if not instrumenting and not batch hash agg */
+	if (!node->ss.ps.instrument &&
+		node->aggstrategy != AGG_BATCH_HASH)
+		return 0;
+
+	shm_toc_initialize_estimator(&estimator);
+	if (node->ss.ps.instrument)
+	{
+		size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation));
+		size = add_size(size, offsetof(SharedAggInfo, sinstrument));
+		shm_toc_estimate_chunk(&estimator, size);
+		shm_toc_estimate_keys(&estimator, 1);
+	}
+
+	if (node->aggstrategy == AGG_BATCH_HASH)
+	{
+		int nparticipants = pcxt->nworkers + 1;
+		shm_toc_estimate_chunk(&estimator, BUFFERALIGN(sizeof(Barrier)));
+		shm_toc_estimate_chunk(&estimator, BUFFERALIGN(sizeof(SharedFileSet)));
+		shm_toc_estimate_keys(&estimator, 2);
+
+		size = bs_parallel_hash_estimate(castNode(Agg, node->ss.ps.plan)->numBatches,
+										 nparticipants);
+		shm_toc_estimate_chunk(&estimator, BUFFERALIGN(size));
+		shm_toc_estimate_keys(&estimator, 1);
+
+		foreach (lc, castNode(Agg, node->ss.ps.plan)->chain)
+		{
+			Agg *agg = lfirst_node(Agg, lc);
+			Assert(agg->aggstrategy == AGG_BATCH_HASH);
+			size = bs_parallel_hash_estimate(agg->numBatches, nparticipants);
+			shm_toc_estimate_chunk(&estimator, BUFFERALIGN(size));
+			shm_toc_estimate_keys(&estimator, 1);
+		}
+	}
+
+	return shm_toc_estimate(&estimator);
+}
+
  /* ----------------------------------------------------------------
   *		ExecAggEstimate
   *
@@ -4753,14 +5135,7 @@ AggRegisterCallback(FunctionCallInfo fcinfo,
 void
 ExecAggEstimate(AggState *node, ParallelContext *pcxt)
 {
-	Size		size;
-
-	/* don't need this if not instrumenting or no workers */
-	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
-		return;
-
-	size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation));
-	size = add_size(size, offsetof(SharedAggInfo, sinstrument));
+	Size size = ExecAggEstimateToc(node, pcxt);
 	shm_toc_estimate_chunk(&pcxt->estimator, size);
 	shm_toc_estimate_keys(&pcxt->estimator, 1);
 }
@@ -4775,19 +5150,82 @@ void
 ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt)
 {
 	Size		size;
+	shm_toc	   *toc;
+	void	   *addr;
 
-	/* don't need this if not instrumenting or no workers */
-	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+	size = ExecAggEstimateToc(node, pcxt);
+	if (size == 0)
 		return;
 
-	size = offsetof(SharedAggInfo, sinstrument)
-		+ pcxt->nworkers * sizeof(AggregateInstrumentation);
-	node->shared_info = shm_toc_allocate(pcxt->toc, size);
-	/* ensure any unfilled slots will contain zeroes */
-	memset(node->shared_info, 0, size);
-	node->shared_info->num_workers = pcxt->nworkers;
-	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
-				   node->shared_info);
+	addr = shm_toc_allocate(pcxt->toc, size);
+	toc = shm_toc_create(SHARED_AGG_MAGIC, addr, size);
+	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, addr);
+
+	if (node->ss.ps.instrument)
+	{
+		size = offsetof(SharedAggInfo, sinstrument)
+			+ pcxt->nworkers * sizeof(AggregateInstrumentation);
+		node->shared_info = shm_toc_allocate(toc, size);
+		/* ensure any unfilled slots will contain zeroes */
+		memset(node->shared_info, 0, size);
+		node->shared_info->num_workers = pcxt->nworkers;
+		shm_toc_insert(toc, SHARED_AGG_KEY_INFO, node->shared_info);
+	}
+
+	if (node->aggstrategy == AGG_BATCH_HASH)
+	{
+		int				nparticipants = pcxt->nworkers + 1;
+		int				i = 0;
+		ListCell	   *lc;
+		Agg			   *agg;
+
+		/* Initialize shared file set */
+		SharedFileSet  *fset = shm_toc_allocate(toc, sizeof(SharedFileSet));
+		SharedFileSetInit(fset, pcxt->seg);
+		shm_toc_insert(toc, SHARED_AGG_KEY_FILE_SET, fset);
+
+		/* Initialize build barrier */
+		node->batch_barrier = shm_toc_allocate(toc, sizeof(Barrier));
+		BarrierInit(node->batch_barrier, 0);
+		shm_toc_insert(toc, SHARED_AGG_KEY_BARRIER, node->batch_barrier);
+
+		/* Initialize batch store */
+		agg = castNode(Agg, node->ss.ps.plan);
+		Assert(agg->numBatches > 0);
+		size = bs_parallel_hash_estimate(agg->numBatches, nparticipants);
+		addr = shm_toc_allocate(toc, size);
+		shm_toc_insert(toc, 0, addr);
+		node->perhash[0].batch_store = bs_init_parallel_hash(agg->numBatches,
+															 nparticipants,
+															 0,
+															 addr,
+															 pcxt->seg,
+															 fset,
+															 "BatchHashAgg");
+
+		/* Initialize batch store for other sets(if has) */
+		i = 1;
+		foreach (lc, agg->chain)
+		{
+			Agg	   *subagg = lfirst_node(Agg, lc);
+			char	name[30];
+			Assert(subagg->aggstrategy == AGG_BATCH_HASH &&
+				   subagg->numBatches > 0);
+			Assert(i < node->num_hashes);
+			size = bs_parallel_hash_estimate(subagg->numBatches, nparticipants);
+			addr = shm_toc_allocate(toc, size);
+			shm_toc_insert(toc, i, addr);
+			sprintf(name, "BatchHashAgg%d", i);
+			node->perhash[i].batch_store = bs_init_parallel_hash(subagg->numBatches,
+																 nparticipants,
+																 0,
+																 addr,
+																 pcxt->seg,
+																 fset,
+																 name);
+			++i;
+		}
+	}
 }
 
 /* ----------------------------------------------------------------
@@ -4799,8 +5237,43 @@ ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt)
 void
 ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt)
 {
-	node->shared_info =
-		shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+	shm_toc	   *toc;
+	void	   *addr = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+	if (addr == NULL)
+	{
+		Assert(node->aggstrategy != AGG_BATCH_HASH);
+		return;
+	}
+	toc = shm_toc_attach(SHARED_AGG_MAGIC, addr);
+	node->shared_info = shm_toc_lookup(toc, SHARED_AGG_KEY_INFO, true);
+
+	if (node->aggstrategy == AGG_BATCH_HASH)
+	{
+		int				i;
+		ListCell	   *lc;
+		Agg			   *agg = castNode(Agg, node->ss.ps.plan);
+		SharedFileSet  *fset = shm_toc_lookup(toc, SHARED_AGG_KEY_FILE_SET, false);
+
+		node->batch_barrier = shm_toc_lookup(toc, SHARED_AGG_KEY_BARRIER, false);
+		node->perhash[0].batch_store =
+			bs_attach_parallel_hash(shm_toc_lookup(toc, 0, false),
+									pwcxt->seg,
+									fset,
+									ParallelWorkerNumber+1);
+
+		i = 1;
+		foreach (lc, agg->chain)
+		{
+			Assert(lfirst_node(Agg, lc)->aggstrategy == AGG_BATCH_HASH);
+			Assert (i<node->num_hashes);
+			node->perhash[i].batch_store =
+				bs_attach_parallel_hash(shm_toc_lookup(toc, i, false),
+										pwcxt->seg,
+										fset,
+										ParallelWorkerNumber+1);
+			++i;
+		}
+	}
 }
 
 /* ----------------------------------------------------------------
diff --git a/src/backend/executor/nodeBatchSort.c b/src/backend/executor/nodeBatchSort.c
new file mode 100644
index 0000000000..a157909374
--- /dev/null
+++ b/src/backend/executor/nodeBatchSort.c
@@ -0,0 +1,554 @@
+#include "postgres.h"
+
+#include "common/hashfn.h"
+#include "executor/executor.h"
+#include "executor/nodeBatchSort.h"
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/atomics.h"
+#include "storage/barrier.h"
+#include "utils/builtins.h"
+#include "utils/tuplesort.h"
+#include "utils/typcache.h"
+
+typedef struct ParallelBatchSort
+{
+	Barrier				barrier;		/* data build barrier */
+	pg_atomic_uint32	attached;		/* how many worker attached */
+	pg_atomic_uint32	cur_batch;		/* current not final sort batch number */
+	Size				tuplesort_size;	/* MAXIMUM_ALIGNOF*n */
+}ParallelBatchSort;
+
+#define PARALLEL_BATCH_SORT_SIZE		MAXALIGN(sizeof(ParallelBatchSort))
+#define PARALLEL_BATCH_SORT_SHARED(p,n)	\
+	(Sharedsort*)(((char*)(p)) + PARALLEL_BATCH_SORT_SIZE + (p)->tuplesort_size * (n))
+
+#define BUILD_BATCH_DONE	1
+
+/*
+ * if we can scan next batch, then return true and open it,
+ * else return false
+ */
+static bool
+ExecNextParallelBatchSort(BatchSortState *state)
+{
+	ParallelBatchSort  *parallel = state->parallel;
+	BatchSort		   *plan = castNode(BatchSort, state->ps.plan);
+	SortCoordinateData	coord;
+	uint32				cur_batch;
+	Assert(parallel != NULL);
+
+	/* do we opened other batch? */
+	if (state->curBatch >= 0 &&
+		state->curBatch < plan->numBatches &&
+		state->batches[state->curBatch] != NULL)
+	{
+		/* close it */
+		tuplesort_end(state->batches[state->curBatch]);
+		state->batches[state->curBatch] = NULL;
+	}
+
+	/* get next batch number and set next number in shared memory */
+	cur_batch = pg_atomic_fetch_add_u32(&parallel->cur_batch, 1);
+	if (cur_batch >= plan->numBatches)
+	{
+		/* out of batch range, no more batch */
+		state->curBatch = plan->numBatches;
+		return false;
+	}
+
+	/* open tuplesort as leader */
+	Assert(state->batches[cur_batch] == NULL);
+	state->curBatch = cur_batch;
+	coord.isWorker = false;		/* we are leader */
+	coord.nParticipants = pg_atomic_read_u32(&parallel->attached);
+	coord.sharedsort = PARALLEL_BATCH_SORT_SHARED(parallel, cur_batch);
+	state->batches[cur_batch] = tuplesort_begin_heap(ExecGetResultType(outerPlanState(state)),
+													 plan->sort.numCols,
+													 plan->sort.sortColIdx,
+													 plan->sort.sortOperators,
+													 plan->sort.collations,
+													 plan->sort.nullsFirst,
+													 work_mem,
+													 &coord,
+													 false);
+	tuplesort_performsort(state->batches[cur_batch]);
+	return true;
+}
+
+/*
+ * return a empty TupleTableSlot
+ */
+static TupleTableSlot *
+ExecEmptyBatchSort(PlanState *pstate)
+{
+	return ExecClearTuple(pstate->ps_ResultTupleSlot);
+}
+
+/*
+ * scan tuple slot from tuplesort,
+ * all batch must be sort completed(called function ExecBatchSortPrepare)
+ */
+static TupleTableSlot *
+ExecBatchSort(PlanState *pstate)
+{
+	TupleTableSlot *slot = pstate->ps_ResultTupleSlot;
+	BatchSortState *state = castNode(BatchSortState, pstate);
+	Assert(state->sort_Done);
+
+re_get_:
+	/*
+	 * Get tuple from current tuplesort
+	 */
+	if (likely(tuplesort_gettupleslot(state->batches[state->curBatch],
+									  true,
+									  false,
+									  slot,
+									  NULL)))
+		return slot;
+
+	/*
+	 * Is all batches scanned?
+	 */
+	if (state->curBatch < castNode(BatchSort, pstate->plan)->numBatches-1)
+	{
+		/* Try next batch */
+		if (state->parallel)
+		{
+			if (ExecNextParallelBatchSort(state) == false)
+			{
+				/* No nore batches */
+				ExecSetExecProcNode(pstate, ExecEmptyBatchSort);
+				return ExecClearTuple(slot);
+			}
+			/*
+			 * OK, we can scan next batch,
+			 * function ExecNextParallelBatchSort changed curBatch
+			 */
+		}else
+		{
+			state->curBatch++;
+		}
+		goto re_get_;
+	}
+
+	/*
+	 * no more batches, return empty TupleTableSlot
+	 * tuplesort_gettupleslot cleaned slot
+	 */
+	Assert(TupIsNull(slot));
+	return slot;
+}
+
+/*
+ * Sorts tuples from the outer subree of the node using tuplesort
+ */
+static TupleTableSlot *
+ExecBatchSortPrepare(PlanState *pstate)
+{
+	BatchSort		   *node = castNode(BatchSort, pstate->plan);
+	BatchSortState	   *state = castNode(BatchSortState, pstate);
+	PlanState		   *outerNode = outerPlanState(pstate);
+	TupleTableSlot	   *slot;
+	ListCell		   *lc;
+	ParallelBatchSort  *parallel = state->parallel;
+	SortCoordinateData	coord;
+	FunctionCallInfo	fcinfo;
+	uint32				hash;
+	int					i;
+	AttrNumber			maxAttr;
+	Assert(state->sort_Done == false ||
+		   (parallel && BarrierPhase(&parallel->barrier) >= BUILD_BATCH_DONE));
+	Assert(list_length(state->groupFuns) == node->numGroupCols);
+
+	/* Is parallel mode */
+	if (parallel)
+	{
+		/* Attach to barrier and test current state */
+		if (BarrierAttach(&parallel->barrier) >= BUILD_BATCH_DONE)
+		{
+			/*
+			 * all tuples sorted by other workers
+			 * we don't need scan tuple from outer subtree
+			 * and can not change batches state(too late)
+			 */
+			goto build_already_done_;
+		}
+		/* let other workers known we are attached */
+		pg_atomic_add_fetch_u32(&parallel->attached, 1);
+	}
+
+	/* Make all tuplesort */
+	for (i=node->numBatches;i>0;)
+	{
+		--i;
+		if (parallel)
+		{
+			coord.isWorker = true;		/* We are not leader */
+			coord.nParticipants = -1;
+			coord.sharedsort = PARALLEL_BATCH_SORT_SHARED(parallel, i);
+		}
+		state->batches[i] = tuplesort_begin_heap(ExecGetResultType(outerNode),
+												 node->sort.numCols,
+												 node->sort.sortColIdx,
+												 node->sort.sortOperators,
+												 node->sort.collations,
+												 node->sort.nullsFirst,
+												 work_mem / node->numBatches,
+												 parallel ? &coord : NULL,
+												 false);
+	}
+
+	/*
+	 * Get max using attribute number,
+	 * we don't want call slot_getsomeattrs() more than once
+	 */
+	maxAttr = 0;
+	for (i=node->numGroupCols;i>0;)
+	{
+		if (maxAttr < node->grpColIdx[--i])
+			maxAttr = node->grpColIdx[i];
+	}
+	for (i=node->sort.numCols;i>0;)
+	{
+		if (maxAttr < node->sort.sortColIdx[--i])
+			maxAttr = node->sort.sortColIdx[i];
+	}
+	Assert(maxAttr > 0);
+
+	for (;;)
+	{
+		CHECK_FOR_INTERRUPTS();
+		slot = ExecProcNode(outerNode);
+		if (TupIsNull(slot))
+			break;
+
+		/* Get all we using datums from tuple */
+		slot_getsomeattrs(slot, maxAttr);
+
+		/* Get hash value */
+		hash = 0;
+		i = 0;
+		foreach(lc, state->groupFuns)
+		{
+			AttrNumber att = node->grpColIdx[i++]-1;
+			if (slot->tts_isnull[att] == false)
+			{
+				fcinfo = lfirst(lc);
+				fcinfo->args[0].value = slot->tts_values[att];
+				hash = hash_combine(hash, DatumGetUInt32(FunctionCallInvoke(fcinfo)));
+				Assert(fcinfo->isnull == false);
+			}
+		}
+
+		/* Put tuple to differed batch using hash value */
+		tuplesort_puttupleslot(state->batches[hash%node->numBatches], slot);
+	}
+
+	/* Finish all tuplesort put */
+	for (i=node->numBatches;i>0;)
+	{
+		--i;
+		tuplesort_performsort(state->batches[i]);
+		if (parallel)
+		{
+			/*
+			 * In parallel mode we need reopen tuplesort as leader
+			 * and almost impossible get all batches,
+			 * so we close it for free resource
+			 */
+			tuplesort_end(state->batches[i]);
+			state->batches[i] = NULL;
+		}
+	}
+
+	if (parallel)
+	{
+		/* Wait other workers finish sort */
+		BarrierArriveAndWait(&parallel->barrier, WAIT_EVENT_BATCH_SORT_BUILD);
+		Assert(BarrierPhase(&parallel->barrier) >= BUILD_BATCH_DONE);
+	}
+
+build_already_done_:
+	/* All tuplesort already done */
+	state->sort_Done = true;
+
+	/*
+	 * Change execute function.
+	 * Separating the preparation and scanning data functions helps to improve performance,
+	 * because we don't have to judge whether the data is ready every time
+	 */
+	if (parallel)
+	{
+		/* we don't need barrier any more */
+		BarrierDetach(&parallel->barrier);
+
+		if (ExecNextParallelBatchSort(state))
+			ExecSetExecProcNode(pstate, ExecBatchSort);	/* got a batch, we can scan it */
+		else
+			ExecSetExecProcNode(pstate, ExecEmptyBatchSort);	/* no more batch, so no more tuple */
+	}else
+	{
+		/* scan tuplesort from first batch */
+		state->curBatch = 0;
+		ExecSetExecProcNode(pstate, ExecBatchSort);
+	}
+
+	/* return first tuple or empty */
+	return (*pstate->ExecProcNodeReal)(pstate);
+}
+
+/*
+ * Create the run-time state information for the batch sort node
+ */
+BatchSortState *
+ExecInitBatchSort(BatchSort *node, EState *estate, int eflags)
+{
+	BatchSortState *state;
+	TypeCacheEntry *typentry;
+	TupleDesc		desc;
+	int				i;
+
+	state = makeNode(BatchSortState);
+	state->ps.plan = (Plan*) node;
+	state->ps.state = estate;
+	state->ps.ExecProcNode = ExecBatchSortPrepare;
+
+	state->sort_Done = false;
+	state->batches = palloc0(node->numBatches * sizeof(Tuplesortstate*));
+
+	outerPlanState(state) = ExecInitNode(outerPlan(node), estate, eflags);
+
+	/*
+	 * Initialize return slot and type. No need to initialize projection info
+	 * because this node doesn't do projections.
+	 */
+	ExecInitResultTupleSlotTL(&state->ps, &TTSOpsMinimalTuple);
+	state->ps.ps_ProjInfo = NULL;
+
+	/*
+	 * create hash function call info data
+	 */
+	Assert(node->numGroupCols > 0);
+	desc = ExecGetResultType(outerPlanState(state));
+	for (i=0;i<node->numGroupCols;++i)
+	{
+		FmgrInfo			   *flinfo;
+		FunctionCallInfo		fcinfo;
+		Form_pg_attribute		attr = TupleDescAttr(desc, node->grpColIdx[i]-1);
+		typentry = lookup_type_cache(attr->atttypid, TYPECACHE_HASH_PROC);
+		if (!OidIsValid(typentry->hash_proc))
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_FUNCTION),
+					 errmsg("could not identify an extended hash function for type %s",
+							format_type_be(attr->atttypid))));
+
+		/* Alloc memory for hash function call info data */
+		flinfo = palloc0(sizeof(*flinfo));
+		fcinfo = palloc0(SizeForFunctionCallInfo(1));	/* hash function only have one argument */
+		fmgr_info(typentry->hash_proc, flinfo);
+		InitFunctionCallInfoData(*fcinfo, flinfo, 1, attr->attcollation, NULL, NULL);
+		fcinfo->args[0].isnull = false;	/* first argument always not null */
+
+		/* Save call info data to run-time state */
+		state->groupFuns = lappend(state->groupFuns, fcinfo);
+	}
+
+	return state;
+}
+
+/*
+ * Free tuplesort resources
+ * and set "all tuplesort already done" as false
+ */
+static void
+CleanBatchSort(BatchSortState *node)
+{
+	int i;
+
+	ExecClearTuple(node->ps.ps_ResultTupleSlot);
+	if (node->sort_Done)
+	{
+		for (i=castNode(BatchSort, node->ps.plan)->numBatches;i>0;)
+		{
+			if (node->batches[--i] != NULL)
+			{
+				tuplesort_end(node->batches[i]);
+				node->batches[i] = NULL;
+			}
+		}
+		node->sort_Done = false;
+	}
+}
+
+void
+ExecEndBatchSort(BatchSortState *node)
+{
+	ExecClearTuple(node->ps.ps_ResultTupleSlot);
+	CleanBatchSort(node);
+	ExecEndNode(outerPlanState(node));
+}
+
+void
+ExecReScanBatchSort(BatchSortState *node)
+{
+	int					i;
+	Bitmapset		   *outerParams;
+	ParallelBatchSort  *parallel;
+	BatchSort		   *plan;
+
+	/* If we haven't sorted yet, just return. */
+	if (node->sort_Done == false)
+		return;
+
+	parallel = node->parallel;
+	outerParams = outerPlanState(node)->chgParam;
+	plan = castNode(BatchSort, node->ps.plan);
+
+	/* must drop pointer to sort result tuple */
+	ExecClearTuple(node->ps.ps_ResultTupleSlot);
+
+	if (bms_is_empty(outerParams) ||
+		/*
+		 * In parallel mod in parallel mode don't need rescan outer
+		 * when it only have gather param change,
+		 * because we cached all tuple from it and we can read in any worker
+		 */
+		(parallel != NULL &&
+		 plan->gather_param >= 0 &&
+		 bms_membership(outerParams) == BMS_SINGLETON &&
+		 bms_is_member(plan->gather_param, outerParams)))
+	{
+		if (parallel)
+		{
+			/* restart batches */
+			pg_atomic_write_u32(&parallel->cur_batch, 0);
+
+			/*
+			 * We want call ExecNextParallelBatchSort() function when
+			 * parent call ExecProcNode(), not now. Because it is possible that
+			 * other workers will finish processing other batches very quickly,
+			 * and we did occupy a batch and we have been in no hurry to deal with it.
+			 * So we don't want occupy a batch for now.
+			 */
+			ExecSetExecProcNode(&node->ps, ExecBatchSortPrepare);
+		}else
+		{
+			/* restart batches */
+			node->curBatch = 0;
+
+			/* rescan tuplesort */
+			i = plan->numBatches;
+			while (i>0)
+				tuplesort_rescan(node->batches[--i]);
+			ExecSetExecProcNode(&node->ps, ExecBatchSort);
+		}
+	}else
+	{
+		/* clean opend tuplesort */
+		CleanBatchSort(node);
+
+		if (parallel)
+		{
+			/* reset tuplesort */
+			i = plan->numBatches;
+			while (i>0)
+			{
+				--i;
+				tuplesort_reset_shared(PARALLEL_BATCH_SORT_SHARED(parallel, i));
+			}
+
+			/* reset shared memory */
+			BarrierInit(&parallel->barrier, 0);
+			pg_atomic_write_u32(&parallel->attached, 0);
+			pg_atomic_write_u32(&parallel->cur_batch, 0);
+		}
+		ExecSetExecProcNode(&node->ps, ExecBatchSortPrepare);
+		node->sort_Done = false;
+	}
+}
+
+void
+ExecShutdownBatchSort(BatchSortState *node)
+{
+	CleanBatchSort(node);
+}
+
+/* ----------------------------------------------------------------
+ *						Parallel Query Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ *		ExecBatchSortEstimate
+ *
+ *		Estimate space required to propagate sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBatchSortEstimate(BatchSortState *node, ParallelContext *pcxt)
+{
+	Size size = mul_size(MAXALIGN(tuplesort_estimate_shared(pcxt->nworkers+1)),
+						 castNode(BatchSort, node->ps.plan)->numBatches);
+	size = add_size(size, PARALLEL_BATCH_SORT_SIZE);
+
+	shm_toc_estimate_chunk(&pcxt->estimator, size);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* Initialize shared memory for batch sort */
+static void
+InitializeBatchSortParallel(ParallelBatchSort *parallel,
+							int num_batches,
+							int num_workers,
+							dsm_segment *seg)
+{
+	int i;
+	BarrierInit(&parallel->barrier, 0);
+	pg_atomic_init_u32(&parallel->attached, 0);
+	pg_atomic_init_u32(&parallel->cur_batch, 0);
+	for (i=0;i<num_batches;++i)
+	{
+		tuplesort_initialize_shared(PARALLEL_BATCH_SORT_SHARED(parallel, i),
+									num_workers,
+									seg);
+	}
+}
+
+void
+ExecBatchSortInitializeDSM(BatchSortState *node, ParallelContext *pcxt)
+{
+	ParallelBatchSort  *parallel;
+	BatchSort		   *plan = castNode(BatchSort, node->ps.plan);
+	Size				tuplesort_size = MAXALIGN(tuplesort_estimate_shared(pcxt->nworkers+1));
+	Size				size = mul_size(tuplesort_size, plan->numBatches);
+	size = add_size(PARALLEL_BATCH_SORT_SIZE, size);
+
+	node->parallel = parallel = shm_toc_allocate(pcxt->toc, size);
+	parallel->tuplesort_size = tuplesort_size;
+	InitializeBatchSortParallel(parallel, plan->numBatches, pcxt->nworkers+1, pcxt->seg);
+	shm_toc_insert(pcxt->toc, plan->sort.plan.plan_node_id, parallel);
+}
+
+void
+ExecBatchSortReInitializeDSM(BatchSortState *node, ParallelContext *pcxt)
+{
+	pg_atomic_write_u32(&node->parallel->cur_batch, 0);
+}
+
+void
+ExecBatchSortInitializeWorker(BatchSortState *node, ParallelWorkerContext *pwcxt)
+{
+	uint32				i;
+	BatchSort		   *plan = castNode(BatchSort, node->ps.plan);
+	ParallelBatchSort  *parallel = shm_toc_lookup(pwcxt->toc,
+												  plan->sort.plan.plan_node_id,
+												  false);
+	node->parallel = parallel;
+	for (i=0;i<plan->numBatches;++i)
+	{
+		tuplesort_attach_shared(PARALLEL_BATCH_SORT_SHARED(parallel, i),
+								pwcxt->seg);
+	}
+}
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index ba3ccc712c..7ee5d673f1 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -959,6 +959,23 @@ _copySort(const Sort *from)
 	return newnode;
 }
 
+/*
+ * _copyBatchSort
+ */
+static BatchSort *
+_copyBatchSort(const BatchSort *from)
+{
+	BatchSort	   *newnode = makeNode(BatchSort);
+
+	CopySortFields(&from->sort, &newnode->sort);
+
+	COPY_SCALAR_FIELD(numGroupCols);
+	COPY_SCALAR_FIELD(numBatches);
+	COPY_SCALAR_FIELD(gather_param);
+	COPY_POINTER_FIELD(grpColIdx, from->numGroupCols * sizeof(AttrNumber));
+
+	return newnode;
+}
 
 /*
  * _copyIncrementalSort
@@ -1024,6 +1041,8 @@ _copyAgg(const Agg *from)
 	COPY_BITMAPSET_FIELD(aggParams);
 	COPY_NODE_FIELD(groupingSets);
 	COPY_NODE_FIELD(chain);
+	COPY_SCALAR_FIELD(numBatches);
+	COPY_SCALAR_FIELD(gatherParam);
 
 	return newnode;
 }
@@ -4948,6 +4967,9 @@ copyObjectImpl(const void *from)
 		case T_Sort:
 			retval = _copySort(from);
 			break;
+		case T_BatchSort:
+			retval = _copyBatchSort(from);
+			break;
 		case T_IncrementalSort:
 			retval = _copyIncrementalSort(from);
 			break;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 8392be6d44..61187685a6 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -785,6 +785,8 @@ _outAgg(StringInfo str, const Agg *node)
 	WRITE_BITMAPSET_FIELD(aggParams);
 	WRITE_NODE_FIELD(groupingSets);
 	WRITE_NODE_FIELD(chain);
+	WRITE_INT_FIELD(numBatches);
+	WRITE_INT_FIELD(gatherParam);
 }
 
 static void
@@ -854,6 +856,19 @@ _outSort(StringInfo str, const Sort *node)
 	_outSortInfo(str, node);
 }
 
+static void
+_outBatchSort(StringInfo str, const BatchSort *node)
+{
+	WRITE_NODE_TYPE("BATCHSORT");
+
+	_outSortInfo(str, &node->sort);
+
+	WRITE_INT_FIELD(numGroupCols);
+	WRITE_INT_FIELD(numBatches);
+	WRITE_INT_FIELD(gather_param);
+	WRITE_ATTRNUMBER_ARRAY(grpColIdx, node->numGroupCols);
+}
+
 static void
 _outIncrementalSort(StringInfo str, const IncrementalSort *node)
 {
@@ -3836,6 +3851,9 @@ outNode(StringInfo str, const void *obj)
 			case T_Sort:
 				_outSort(str, obj);
 				break;
+			case T_BatchSort:
+				_outBatchSort(str, obj);
+				break;
 			case T_IncrementalSort:
 				_outIncrementalSort(str, obj);
 				break;
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index d2c8d58070..e0d225aa85 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -2182,6 +2182,21 @@ _readSort(void)
 	READ_DONE();
 }
 
+static BatchSort *
+_readBatchSort(void)
+{
+	READ_LOCALS(BatchSort);
+
+	ReadCommonSort(&local_node->sort);
+
+	READ_INT_FIELD(numGroupCols);
+	READ_INT_FIELD(numBatches);
+	READ_INT_FIELD(gather_param);
+	READ_ATTRNUMBER_ARRAY(grpColIdx, local_node->numGroupCols);
+
+	READ_DONE();
+}
+
 /*
  * _readIncrementalSort
  */
@@ -2236,6 +2251,8 @@ _readAgg(void)
 	READ_BITMAPSET_FIELD(aggParams);
 	READ_NODE_FIELD(groupingSets);
 	READ_NODE_FIELD(chain);
+	READ_INT_FIELD(numBatches);
+	READ_INT_FIELD(gatherParam);
 
 	READ_DONE();
 }
@@ -2835,6 +2852,8 @@ parseNodeString(void)
 		return_value = _readMaterial();
 	else if (MATCH("SORT", 4))
 		return_value = _readSort();
+	else if (MATCH("BATCHSORT", 9))
+		return_value = _readBatchSort();
 	else if (MATCH("INCREMENTALSORT", 15))
 		return_value = _readIncrementalSort();
 	else if (MATCH("GROUP", 5))
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 380336518f..c0a75e31c1 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -147,6 +147,8 @@ bool		enable_partitionwise_aggregate = false;
 bool		enable_parallel_append = true;
 bool		enable_parallel_hash = true;
 bool		enable_partition_pruning = true;
+int			max_sort_batches = 0;
+int			max_hashagg_batches = 0;
 
 typedef struct
 {
@@ -1958,6 +1960,84 @@ cost_sort(Path *path, PlannerInfo *root,
 	path->total_cost = startup_cost + run_cost;
 }
 
+void cost_batchsort(Path *path, PlannerInfo *root,
+					List *batchkeys, Cost input_cost,
+					double tuples, int width,
+					Cost comparison_cost, int sort_mem,
+					uint32 numGroupCols, uint32 numBatches)
+{
+	Cost		startup_cost = input_cost;
+	Cost		run_cost = 0;
+	double		input_bytes = relation_byte_size(tuples, width);
+	double		batch_bytes = input_bytes / numBatches;
+	double		batch_tuples = tuples / numBatches;
+	long		sort_mem_bytes = sort_mem * 1024L;
+
+	if (sort_mem_bytes < (64*1024))
+		sort_mem_bytes = (64*1024);
+
+	/* hash cost */
+	startup_cost += cpu_operator_cost * numGroupCols * tuples;
+
+	path->rows = tuples;
+
+	/*
+	 * We want to be sure the cost of a sort is never estimated as zero, even
+	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
+	 */
+	if (tuples < 2.0)
+		tuples = 2.0;
+
+	if (batch_bytes > sort_mem_bytes)
+	{
+		/*
+		 * We'll have to use a disk-based sort of all the tuples
+		 */
+		double		npages = ceil(batch_bytes / BLCKSZ);
+		double		nruns = batch_bytes / sort_mem_bytes;
+		double		mergeorder = tuplesort_merge_order(sort_mem_bytes);
+		double		log_runs;
+		double		npageaccesses;
+
+		/*
+		 * CPU costs
+		 *
+		 * Assume about N log2 N comparisons
+		 */
+		startup_cost += comparison_cost * batch_tuples * LOG2(batch_tuples) * numBatches;
+
+		/* Disk costs */
+
+		/* Compute logM(r) as log(r) / log(M) */
+		if (nruns > mergeorder)
+			log_runs = ceil(log(nruns) / log(mergeorder));
+		else
+			log_runs = 1.0;
+		npageaccesses = 2.0 * npages * log_runs;
+		/* Assume 3/4ths of accesses are sequential, 1/4th are not */
+		startup_cost += npageaccesses * numBatches *
+			(seq_page_cost * 0.75 + random_page_cost * 0.25);
+
+	}else
+	{
+		/* We'll use plain quicksort on all the input tuples */
+		startup_cost += comparison_cost * tuples * LOG2(tuples);
+	}
+
+	/*
+	 * Also charge a small amount (arbitrarily set equal to operator cost) per
+	 * extracted tuple.  We don't charge cpu_tuple_cost because a Sort node
+	 * doesn't do qual-checking or projection, so it has less overhead than
+	 * most plan nodes.  Note it's correct to use tuples not output_tuples
+	 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
+	 * counting the LIMIT otherwise.
+	 */
+	run_cost += cpu_operator_cost * tuples;
+
+	path->startup_cost = startup_cost;
+	path->total_cost = startup_cost + run_cost;
+}
+
 /*
  * append_nonpartial_cost
  *	  Estimate the cost of the non-partial paths in a Parallel Append.
@@ -2332,7 +2412,7 @@ cost_agg(Path *path, PlannerInfo *root,
 	/* Use all-zero per-aggregate costs if NULL is passed */
 	if (aggcosts == NULL)
 	{
-		Assert(aggstrategy == AGG_HASHED);
+		Assert(aggstrategy == AGG_HASHED || aggstrategy == AGG_BATCH_HASH);
 		MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
 		aggcosts = &dummy_aggcosts;
 	}
@@ -2391,10 +2471,13 @@ cost_agg(Path *path, PlannerInfo *root,
 	}
 	else
 	{
-		/* must be AGG_HASHED */
+		/* must be AGG_HASHED or AGG_BATCH_HASH */
+		Assert(aggstrategy == AGG_HASHED || max_hashagg_batches > 0);
 		startup_cost = input_total_cost;
-		if (!enable_hashagg)
+		if (aggstrategy == AGG_HASHED && !enable_hashagg)
+		{
 			startup_cost += disable_cost;
+		}
 		startup_cost += aggcosts->transCost.startup;
 		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
 		/* cost of computing hash value */
@@ -2406,6 +2489,15 @@ cost_agg(Path *path, PlannerInfo *root,
 		/* cost of retrieving from hash table */
 		total_cost += cpu_tuple_cost * numGroups;
 		output_tuples = numGroups;
+
+		if (aggstrategy == AGG_BATCH_HASH)
+		{
+			double	nbytes = relation_byte_size(input_tuples, input_width);
+			double	npages = ceil(nbytes / BLCKSZ);
+			double	material_cost = (seq_page_cost * npages);
+			startup_cost += material_cost;
+			total_cost += material_cost;
+		}
 	}
 
 	/*
@@ -2421,7 +2513,9 @@ cost_agg(Path *path, PlannerInfo *root,
 	 * Accrue writes (spilled tuples) to startup_cost and to total_cost;
 	 * accrue reads only to total_cost.
 	 */
-	if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED)
+	if (aggstrategy == AGG_HASHED ||
+		aggstrategy == AGG_BATCH_HASH ||
+		aggstrategy == AGG_MIXED)
 	{
 		double		pages;
 		double		pages_written = 0.0;
@@ -2434,6 +2528,14 @@ cost_agg(Path *path, PlannerInfo *root,
 		int			num_partitions;
 		int			depth;
 
+		if (aggstrategy == AGG_BATCH_HASH &&
+			numGroups > max_hashagg_batches)
+		{
+			numGroups /= max_hashagg_batches;
+			if (numGroups < 1.0)
+				numGroups = 1.0;
+		}
+
 		/*
 		 * Estimate number of batches based on the computed limits. If less
 		 * than or equal to one, all groups are expected to fit in memory;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 25d4750ca6..baa645529c 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -30,6 +30,7 @@
 #include "optimizer/cost.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paramassign.h"
+#include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "optimizer/placeholder.h"
 #include "optimizer/plancat.h"
@@ -98,6 +99,7 @@ static Plan *create_projection_plan(PlannerInfo *root,
 									int flags);
 static Plan *inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe);
 static Sort *create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags);
+static BatchSort *create_batchsort_plan(PlannerInfo *root, BatchSortPath *best_path, int flags);
 static IncrementalSort *create_incrementalsort_plan(PlannerInfo *root,
 													IncrementalSortPath *best_path, int flags);
 static Group *create_group_plan(PlannerInfo *root, GroupPath *best_path);
@@ -468,6 +470,11 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags)
 											 (SortPath *) best_path,
 											 flags);
 			break;
+		case T_BatchSort:
+			plan = (Plan *) create_batchsort_plan(root,
+												  (BatchSortPath*) best_path,
+												  flags);
+			break;
 		case T_IncrementalSort:
 			plan = (Plan *) create_incrementalsort_plan(root,
 														(IncrementalSortPath *) best_path,
@@ -2009,6 +2016,40 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags)
 	return plan;
 }
 
+static BatchSort *create_batchsort_plan(PlannerInfo *root, BatchSortPath *best_path, int flags)
+{
+	BatchSort	   *plan;
+	Plan		   *subplan;
+
+	subplan = create_plan_recurse(root, best_path->subpath,
+								  flags | CP_SMALL_TLIST);
+
+	plan = makeNode(BatchSort);
+	subplan = prepare_sort_from_pathkeys(subplan,
+										 best_path->batchkeys,
+										 IS_OTHER_REL(best_path->subpath->parent) ?
+											 best_path->path.parent->relids : NULL,
+										 NULL,
+										 false,
+										 &plan->sort.numCols,
+										 &plan->sort.sortColIdx,
+										 &plan->sort.sortOperators,
+										 &plan->sort.collations,
+										 &plan->sort.nullsFirst);
+	plan->sort.plan.targetlist = subplan->targetlist;
+	plan->sort.plan.qual = NIL;
+	outerPlan(plan) = subplan;
+	innerPlan(plan) = NULL;
+	plan->numBatches = best_path->numBatches;
+	plan->numGroupCols = list_length(best_path->batchgroup);
+	plan->gather_param = -1;
+	plan->grpColIdx = extract_grouping_cols(best_path->batchgroup,
+											subplan->targetlist);
+
+	copy_generic_path_info(&plan->sort.plan, &best_path->path);
+	return plan;
+}
+
 /*
  * create_incrementalsort_plan
  *
@@ -2085,6 +2126,12 @@ create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flag
 {
 	Unique	   *plan;
 	Plan	   *subplan;
+	List	   *pathkeys;
+
+	if (IsA(best_path->subpath, BatchSortPath))
+		pathkeys = ((BatchSortPath*)best_path->subpath)->batchkeys;
+	else
+		pathkeys = best_path->path.pathkeys;
 
 	/*
 	 * Unique doesn't project, so tlist requirements pass through; moreover we
@@ -2094,7 +2141,7 @@ create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flag
 								  flags | CP_LABEL_TLIST);
 
 	plan = make_unique_from_pathkeys(subplan,
-									 best_path->path.pathkeys,
+									 pathkeys,
 									 best_path->numkeys);
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
@@ -2282,7 +2329,9 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 			if (!rollup->is_hashed)
 				is_first_sort = false;
 
-			if (rollup->is_hashed)
+			if (best_path->aggstrategy == AGG_BATCH_HASH)
+				strat = AGG_BATCH_HASH;
+			else if (rollup->is_hashed)
 				strat = AGG_HASHED;
 			else if (list_length(linitial(rollup->gsets)) == 0)
 				strat = AGG_PLAIN;
@@ -6373,12 +6422,20 @@ make_agg(List *tlist, List *qual,
 	node->aggParams = NULL;		/* SS_finalize_plan() will fill this */
 	node->groupingSets = groupingSets;
 	node->chain = chain;
+	node->gatherParam = -1;
 
 	plan->qual = qual;
 	plan->targetlist = tlist;
 	plan->lefttree = lefttree;
 	plan->righttree = NULL;
 
+	if (aggstrategy == AGG_BATCH_HASH)
+	{
+		node->numBatches = (int32)numGroups;
+		if (node->numBatches > max_hashagg_batches)
+			node->numBatches = max_hashagg_batches;
+	}
+
 	return node;
 }
 
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 4e6497ff32..0307ee094d 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -175,6 +175,12 @@ static void consider_groupingsets_paths(PlannerInfo *root,
 										grouping_sets_data *gd,
 										const AggClauseCosts *agg_costs,
 										double dNumGroups);
+static void consider_parallel_hash_groupingsets_paths(PlannerInfo *root,
+													  RelOptInfo *grouped_rel,
+													  Path *path,
+													  grouping_sets_data *gd,
+													  const AggClauseCosts *agg_costs,
+													  double dNumGroups);
 static RelOptInfo *create_window_paths(PlannerInfo *root,
 									   RelOptInfo *input_rel,
 									   PathTarget *input_target,
@@ -3866,6 +3872,11 @@ create_grouping_paths(PlannerInfo *root,
 		extra.havingQual = parse->havingQual;
 		extra.targetList = parse->targetList;
 		extra.partial_costs_set = false;
+		if (parse->groupClause != NIL &&
+			(gd == NULL || gd->rollups == NIL))
+			extra.hashable_groups = grouping_get_hashable(parse->groupClause);
+		else
+			extra.hashable_groups = NIL;
 
 		/*
 		 * Determine whether partitionwise aggregation is in theory possible.
@@ -4522,6 +4533,78 @@ consider_groupingsets_paths(PlannerInfo *root,
 										  dNumGroups));
 }
 
+static void
+consider_parallel_hash_groupingsets_paths(PlannerInfo *root,
+										  RelOptInfo *grouped_rel,
+										  Path *path,
+										  grouping_sets_data *gd,
+										  const AggClauseCosts *agg_costs,
+										  double dNumGroups)
+{
+	int			hash_mem = get_hash_mem();
+	List	   *new_rollups = NIL;
+	List	   *sets_data;
+	ListCell   *lc;
+	RollupData *rollup;
+	GroupingSetData *gs;
+	double		hashsize;
+	double		numGroups;
+
+	sets_data = list_copy(gd->unsortable_sets);
+	foreach (lc, gd->rollups)
+	{
+		rollup = lfirst_node(RollupData, lc);
+		if (rollup->hashable == false)
+		{
+			list_free(sets_data);
+			return;
+		}
+		sets_data = list_concat(sets_data, rollup->gsets_data);
+	}
+	foreach (lc, sets_data)
+	{
+		gs = lfirst_node(GroupingSetData, lc);
+		numGroups = gs->numGroups / max_hashagg_batches;
+		if (numGroups < 1.0)
+			numGroups = 1.0;
+		hashsize = estimate_hashagg_tablesize(root,
+											  path,
+											  agg_costs,
+											  numGroups);
+		if (hashsize > hash_mem * 1024L)
+		{
+			list_free(sets_data);
+			list_free_deep(new_rollups);
+			return;
+		}
+
+		rollup = makeNode(RollupData);
+		rollup->groupClause = preprocess_groupclause(root, gs->set);
+		rollup->gsets_data = list_make1(gs);
+		rollup->gsets = remap_to_groupclause_idx(rollup->groupClause,
+												 rollup->gsets_data,
+												 gd->tleref_to_colnum_map);
+		rollup->numGroups = gs->numGroups;
+		rollup->hashable = true;
+		rollup->is_hashed = true;
+		new_rollups = lappend(new_rollups, rollup);
+	}
+
+	numGroups = dNumGroups / path->parallel_workers;
+	if (numGroups < list_length(new_rollups))
+		numGroups = list_length(new_rollups);
+	path = (Path*)create_groupingsets_path(root,
+										   grouped_rel,
+										   path,
+										   (List*) root->parse->havingQual,
+										   AGG_BATCH_HASH,
+										   new_rollups,
+										   agg_costs,
+										   numGroups);
+	path->parallel_aware = true;
+	add_partial_path(grouped_rel, path);
+}
+
 /*
  * create_window_paths
  *
@@ -4795,6 +4878,8 @@ create_distinct_paths(PlannerInfo *root,
 											  cheapest_input_path->rows,
 											  NULL);
 	}
+	distinct_rel->rows = numDistinctRows;
+	distinct_rel->reltarget = root->upper_targets[UPPERREL_DISTINCT];
 
 	/*
 	 * Consider sort-based implementations of DISTINCT, if possible.
@@ -4814,6 +4899,7 @@ create_distinct_paths(PlannerInfo *root,
 		 * the other.)
 		 */
 		List	   *needed_pathkeys;
+		List	   *hashable_clause;
 
 		if (parse->hasDistinctOn &&
 			list_length(root->distinct_pathkeys) <
@@ -4860,6 +4946,39 @@ create_distinct_paths(PlannerInfo *root,
 										  path,
 										  list_length(root->distinct_pathkeys),
 										  numDistinctRows));
+
+		/* add parallel unique */
+		if (max_sort_batches > 0 &&
+			distinct_rel->consider_parallel &&
+			input_rel->partial_pathlist != NIL &&
+			numDistinctRows >= 2.0 &&
+			(hashable_clause = grouping_get_hashable(parse->distinctClause)) != NIL)
+		{
+			double	numPartialDistinctRows;
+			uint32	num_batchs = (uint32)numDistinctRows;
+			if (num_batchs > max_sort_batches)
+				num_batchs = max_sort_batches;
+
+			foreach (lc, input_rel->partial_pathlist)
+			{
+				Path *path = (Path*)create_batchsort_path(root,
+														  distinct_rel,
+														  lfirst(lc),
+														  needed_pathkeys,
+														  hashable_clause,
+														  num_batchs,
+														  true);
+				numPartialDistinctRows = numDistinctRows / path->parallel_workers;
+				if (numPartialDistinctRows < 1.0)
+					numPartialDistinctRows = 1.0;
+				path = (Path*)create_upper_unique_path(root,
+													   distinct_rel,
+													   path,
+													   list_length(root->distinct_pathkeys),
+													   numPartialDistinctRows);
+				add_partial_path(distinct_rel, path);
+			}
+		}
 	}
 
 	/*
@@ -4895,8 +5014,34 @@ create_distinct_paths(PlannerInfo *root,
 								 NIL,
 								 NULL,
 								 numDistinctRows));
+
+		/* Generate parallel batch hashed aggregate path */
+		if (max_hashagg_batches > 0 &&
+			distinct_rel->consider_parallel &&
+			input_rel->partial_pathlist != NIL &&
+			numDistinctRows > 1.0)
+		{
+			Path *path = linitial(input_rel->partial_pathlist);
+			double numRows = numDistinctRows / path->parallel_workers;
+			if (numRows < 1.0)
+				numRows = 1.0;
+			path = (Path *)create_agg_path(root,
+										   distinct_rel,
+										   path,
+										   path->pathtarget,
+										   AGG_BATCH_HASH,
+										   AGGSPLIT_SIMPLE,
+										   parse->distinctClause,
+										   NIL,
+										   NULL,
+										   numRows);
+			path->parallel_aware = true;
+			add_partial_path(distinct_rel, path);
+		}
 	}
 
+	generate_useful_gather_paths(root, distinct_rel, false);
+
 	/* Give a helpful error if we failed to find any implementation */
 	if (distinct_rel->pathlist == NIL)
 		ereport(ERROR,
@@ -6654,6 +6799,58 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			}
 		}
 
+		/*
+		 * create simple agg using parallel
+		 */
+		if (max_sort_batches > 0 &&
+			grouped_rel->consider_parallel &&
+			extra->hashable_groups != NIL &&
+			input_rel->partial_pathlist != NIL &&
+			dNumGroups >= 2.0)
+		{
+			Path	   *path;
+			double		numGroups;
+			uint32		numBatches = (uint32)dNumGroups;
+
+			if (numBatches > max_sort_batches)
+				numBatches = max_sort_batches;
+			numGroups = dNumGroups / numBatches;
+			if (numGroups < 1.0)
+				numGroups = 1.0;
+
+			Assert(parse->groupingSets == NIL);
+			Assert(parse->groupClause != NIL);
+			foreach (lc, input_rel->partial_pathlist)
+			{
+				path = (Path*)create_batchsort_path(root,
+													grouped_rel,
+													lfirst(lc),
+													root->group_pathkeys,
+													extra->hashable_groups,
+													numBatches,
+													true);
+				if (parse->hasAggs)
+					path = (Path*)create_agg_path(root,
+												  grouped_rel,
+												  path,
+												  grouped_rel->reltarget,
+												  AGG_SORTED,
+												  AGGSPLIT_SIMPLE,
+												  parse->groupClause,
+												  havingQual,
+												  agg_costs,
+												  numGroups);
+				else
+					path = (Path*)create_group_path(root,
+													grouped_rel,
+													path,
+													parse->groupClause,
+													havingQual,
+													numGroups);
+				add_partial_path(grouped_rel, path);
+			}
+		}
+
 		/*
 		 * Instead of operating directly on the input relation, we can
 		 * consider finalizing a partially aggregated path.
@@ -6757,6 +6954,56 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 											   havingQual,
 											   dNumGroups));
 			}
+
+			/* create parallel batch sort aggregate paths */
+			if (max_sort_batches > 0 &&
+				grouped_rel->consider_parallel &&
+				extra->hashable_groups != NIL &&
+				partially_grouped_rel->partial_pathlist != NIL &&
+				dNumGroups >= 2.0)
+			{
+				Path	   *path;
+				double		numGroups;
+				uint32		numBatches = (uint32)dNumGroups;
+
+				if (numBatches > max_sort_batches)
+					numBatches = max_sort_batches;
+				numGroups = dNumGroups / numBatches;
+				if (numGroups < 1.0)
+					numGroups = 1.0;
+
+				Assert(parse->groupingSets == NIL);
+				Assert(parse->groupClause != NIL);
+				foreach (lc, partially_grouped_rel->partial_pathlist)
+				{
+					path = (Path*)create_batchsort_path(root,
+														grouped_rel,
+														lfirst(lc),
+														root->group_pathkeys,
+														extra->hashable_groups,
+														numBatches,
+														true);
+					if (parse->hasAggs)
+						path = (Path*)create_agg_path(root,
+													  grouped_rel,
+													  path,
+													  grouped_rel->reltarget,
+													  AGG_SORTED,
+													  AGGSPLIT_FINAL_DESERIAL,
+													  parse->groupClause,
+													  havingQual,
+													  agg_costs,
+													  numGroups);
+					else
+						path = (Path*)create_group_path(root,
+														grouped_rel,
+														path,
+														parse->groupClause,
+														havingQual,
+														numGroups);
+					add_partial_path(grouped_rel, path);
+				}
+			}
 		}
 	}
 
@@ -6770,6 +7017,15 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			consider_groupingsets_paths(root, grouped_rel,
 										cheapest_path, false, true,
 										gd, agg_costs, dNumGroups);
+			if (max_hashagg_batches > 0 &&
+				grouped_rel->consider_parallel &&
+				input_rel->partial_pathlist != NIL)
+				consider_parallel_hash_groupingsets_paths(root,
+														  grouped_rel,
+														  linitial(input_rel->partial_pathlist),
+														  gd,
+														  agg_costs,
+														  dNumGroups);
 		}
 		else
 		{
@@ -6787,6 +7043,29 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 									 havingQual,
 									 agg_costs,
 									 dNumGroups));
+
+			if (max_hashagg_batches > 0 &&
+				grouped_rel->consider_parallel &&
+				input_rel->partial_pathlist != NIL &&
+				dNumGroups >= 2.0)
+			{
+				Path   *path = linitial(input_rel->partial_pathlist);
+				double	numGroups = dNumGroups / path->parallel_workers;
+				if (numGroups < 1.0)
+					numGroups = 1.0;
+				path = (Path*)create_agg_path(root,
+											  grouped_rel,
+											  path,
+											  grouped_rel->reltarget,
+											  AGG_BATCH_HASH,
+											  AGGSPLIT_SIMPLE,
+											  parse->groupClause,
+											  havingQual,
+											  agg_costs,
+											  numGroups);
+				path->parallel_aware = true;
+				add_partial_path(grouped_rel, path);
+			}
 		}
 
 		/*
@@ -6809,6 +7088,32 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 									 agg_final_costs,
 									 dNumGroups));
 		}
+
+		/*
+		 * Generate a Finalize BatchHashAgg
+		 */
+		if (max_hashagg_batches > 0 &&
+			dNumGroups >= 2.0 &&
+			partially_grouped_rel &&
+			partially_grouped_rel->partial_pathlist)
+		{
+			Path   *path = linitial(partially_grouped_rel->partial_pathlist);
+			double	numGroups = dNumGroups / path->parallel_workers;
+			if (numGroups < 1.0)
+				numGroups = 1.0;
+			path = (Path*)create_agg_path(root,
+										  grouped_rel,
+										  path,
+										  grouped_rel->reltarget,
+										  AGG_BATCH_HASH,
+										  AGGSPLIT_FINAL_DESERIAL,
+										  parse->groupClause,
+										  havingQual,
+										  agg_final_costs,
+										  numGroups);
+			path->parallel_aware = true;
+			add_partial_path(grouped_rel, path);
+		}
 	}
 
 	/*
diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c
index c3c36be13e..d2060dbeff 100644
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -737,6 +737,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
 
 		case T_Material:
 		case T_Sort:
+		case T_BatchSort:
 		case T_IncrementalSort:
 		case T_Unique:
 		case T_SetOp:
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index 6d4cc1bcce..4556ec9d2a 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -2698,6 +2698,8 @@ finalize_plan(PlannerInfo *root, Plan *plan,
 										  &aggcontext);
 					agg->aggParams = aggcontext.paramids;
 				}
+
+				agg->gatherParam = gather_param;
 			}
 			break;
 
@@ -2759,6 +2761,10 @@ finalize_plan(PlannerInfo *root, Plan *plan,
 			/* no node-type-specific fields need fixing */
 			break;
 
+		case T_BatchSort:
+			((BatchSort*)plan)->gather_param = gather_param;
+			break;
+
 		default:
 			elog(ERROR, "unrecognized node type: %d",
 				 (int) nodeTag(plan));
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 86f794c193..1faab8a6ef 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -67,7 +67,7 @@ static List *plan_union_children(PlannerInfo *root,
 								 List *refnames_tlist,
 								 List **tlist_list);
 static Path *make_union_unique(SetOperationStmt *op, Path *path, List *tlist,
-							   PlannerInfo *root);
+							   PlannerInfo *root, List *groupList, List *sortKeys);
 static void postprocess_setop_rel(PlannerInfo *root, RelOptInfo *rel);
 static bool choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 								Path *input_path,
@@ -354,6 +354,7 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
 			rel = generate_nonunion_paths(op, root,
 										  refnames_tlist,
 										  pTargetList);
+		generate_useful_gather_paths(root, rel, false);
 		if (pNumGroups)
 			*pNumGroups = rel->rows;
 
@@ -552,6 +553,8 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	List	   *tlist_list;
 	List	   *tlist;
 	Path	   *path;
+	List	   *groupList = NIL;
+	List	   *sortKeys = NIL;
 
 	/*
 	 * If plain UNION, tell children to fetch all tuples.
@@ -587,6 +590,14 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 
 	*pTargetList = tlist;
 
+	if (!op->all)
+	{
+		/* Identify the grouping semantics */
+		groupList = generate_setop_grouplist(op, tlist);
+		if (grouping_is_sortable(groupList))
+			sortKeys = make_pathkeys_for_sortclauses(root, groupList, tlist);
+	}
+
 	/* Build path lists and relid set. */
 	foreach(lc, rellist)
 	{
@@ -627,7 +638,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	 * node(s) to remove duplicates.
 	 */
 	if (!op->all)
-		path = make_union_unique(op, path, tlist, root);
+		path = make_union_unique(op, path, tlist, root, groupList, sortKeys);
 
 	add_path(result_rel, path);
 
@@ -646,6 +657,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	{
 		Path	   *ppath;
 		ListCell   *lc;
+		List	   *hashable_list;
 		int			parallel_workers = 0;
 
 		/* Find the highest number of workers requested for any subpath. */
@@ -678,11 +690,64 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 							   NIL, NULL,
 							   parallel_workers, enable_parallel_append,
 							   NIL, -1);
+
+		/* create parallel batch sort union */
+		if (!op->all &&
+			max_sort_batches > 0 &&
+			sortKeys != NIL &&
+			ppath->rows >= 2.0 &&
+			(hashable_list = grouping_get_hashable(groupList)) != NIL)
+		{
+			Path   *partial_path;
+			uint32	numBatches = ppath->rows;
+			if (numBatches > max_sort_batches)
+				numBatches = max_sort_batches;
+			Assert(list_length(sortKeys) >= list_length(hashable_list));
+			partial_path = (Path*)create_batchsort_path(root,
+														result_rel,
+														ppath,
+														sortKeys,
+														hashable_list,
+														numBatches,
+														true);
+			partial_path = (Path*) create_upper_unique_path(root,
+															result_rel,
+															partial_path,
+															list_length(sortKeys),
+															partial_path->rows);
+			add_partial_path(result_rel, partial_path);
+		}
+
+		/* create parallel batch hashed union */
+		if (!op->all &&
+			max_hashagg_batches > 0 &&
+			ppath->rows > 1.0 &&
+			grouping_is_hashable(groupList) &&
+			ppath->pathtarget->width * ppath->rows / max_hashagg_batches <= get_hash_mem() * 1024L)
+		{
+			Path   *partial_path;
+			double	dNumGroups = ppath->rows / ppath->parallel_workers;
+			if (dNumGroups < 1.0)
+				dNumGroups = 1.0;
+			partial_path = (Path*)create_agg_path(root,
+												  result_rel,
+												  ppath,
+												  create_pathtarget(root, tlist),
+												  AGG_BATCH_HASH,
+												  AGGSPLIT_SIMPLE,
+												  groupList,
+												  NIL,
+												  NULL,
+												  dNumGroups);
+			partial_path->parallel_aware = true;
+			add_partial_path(result_rel, partial_path);
+		}
+
 		ppath = (Path *)
 			create_gather_path(root, result_rel, ppath,
 							   result_rel->reltarget, NULL, NULL);
 		if (!op->all)
-			ppath = make_union_unique(op, ppath, tlist, root);
+			ppath = make_union_unique(op, ppath, tlist, root, groupList, sortKeys);
 		add_path(result_rel, ppath);
 	}
 
@@ -933,15 +998,11 @@ plan_union_children(PlannerInfo *root,
  */
 static Path *
 make_union_unique(SetOperationStmt *op, Path *path, List *tlist,
-				  PlannerInfo *root)
+				  PlannerInfo *root, List *groupList, List *sortKeys)
 {
 	RelOptInfo *result_rel = fetch_upper_rel(root, UPPERREL_SETOP, NULL);
-	List	   *groupList;
 	double		dNumGroups;
 
-	/* Identify the grouping semantics */
-	groupList = generate_setop_grouplist(op, tlist);
-
 	/*
 	 * XXX for the moment, take the number of distinct groups as equal to the
 	 * total input size, ie, the worst case.  This is too conservative, but
@@ -976,9 +1037,7 @@ make_union_unique(SetOperationStmt *op, Path *path, List *tlist,
 				create_sort_path(root,
 								 result_rel,
 								 path,
-								 make_pathkeys_for_sortclauses(root,
-															   groupList,
-															   tlist),
+								 sortKeys,
 								 -1.0);
 		path = (Path *) create_upper_unique_path(root,
 												 result_rel,
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index d465b9e213..f88762fada 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -2880,6 +2880,45 @@ create_sort_path(PlannerInfo *root,
 	return pathnode;
 }
 
+BatchSortPath *
+create_batchsort_path(PlannerInfo *root,
+					  RelOptInfo *rel,
+					  Path *subpath,
+					  List *pathkeys,
+					  List *groupClause,
+					  uint32 numBatches,
+					  bool parallel_sort)
+{
+	BatchSortPath   *pathnode = makeNode(BatchSortPath);
+	Assert(numBatches > 0);
+
+	pathnode->path.pathtype = T_BatchSort;
+	pathnode->path.parent = rel;
+	/* Sort doesn't project, so use source path's pathtarget */
+	pathnode->path.pathtarget = subpath->pathtarget;
+	/* For now, assume we are above any joins, so no parameterization */
+	pathnode->path.param_info = NULL;
+	pathnode->path.parallel_aware = parallel_sort;
+	pathnode->path.parallel_safe = rel->consider_parallel &&
+		subpath->parallel_safe;
+	pathnode->path.parallel_workers = subpath->parallel_workers;
+	pathnode->batchkeys = pathkeys;
+	pathnode->batchgroup = groupClause;
+	pathnode->numBatches = numBatches;
+
+	pathnode->subpath = subpath;
+
+	cost_batchsort(&pathnode->path, root, pathkeys,
+				   subpath->total_cost, subpath->rows,
+				   subpath->pathtarget->width,
+				   0.0,				/* XXX comparison_cost shouldn't be 0? */
+				   work_mem/numBatches,
+				   list_length(groupClause),
+				   numBatches);
+
+	return pathnode;
+}
+
 /*
  * create_group_path
  *	  Creates a pathnode that represents performing grouping of presorted input
diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c
index 89853a0630..7725139573 100644
--- a/src/backend/optimizer/util/tlist.c
+++ b/src/backend/optimizer/util/tlist.c
@@ -588,6 +588,27 @@ grouping_is_hashable(List *groupClause)
 	return true;
 }

+/*
+ * grouping_get_hashable - extract all hashable clause
+ *
+ * Some time we don't all clauses can be hash
+ */
+List *
+grouping_get_hashable(List *groupClause)
+{
+	ListCell   *lc;
+	List	   *result = NIL;
+
+	foreach (lc, groupClause)
+	{
+		SortGroupClause *groupcl = lfirst_node(SortGroupClause, lc);
+
+		if (groupcl->hashable)
+			result = lappend(result, groupcl);
+	}
+
+	return result;
+}
 
 /*****************************************************************************
  *		PathTarget manipulation functions
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 3f24a33ef1..5025724fd4 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4045,6 +4045,12 @@ pgstat_get_wait_ipc(WaitEventIPC w)
 		case WAIT_EVENT_XACT_GROUP_UPDATE:
 			event_name = "XactGroupUpdate";
 			break;
+		case WAIT_EVENT_BATCH_SORT_BUILD:
+			event_name = "Batch/Sort/Building";
+			break;
+		case WAIT_EVENT_BATCH_HASH_BUILD:
+			event_name = "Batch/Hash/Building";
+			break;
 			/* no default case, so that compiler will warn */
 	}
 
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index db7e59f8b7..1fd839904c 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -170,8 +170,6 @@ static const char *const BuiltinTrancheNames[] = {
 	"PerSessionRecordType",
 	/* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */
 	"PerSessionRecordTypmod",
-	/* LWTRANCHE_SHARED_TUPLESTORE: */
-	"SharedTupleStore",
 	/* LWTRANCHE_SHARED_TIDBITMAP: */
 	"SharedTidBitmap",
 	/* LWTRANCHE_PARALLEL_APPEND: */
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 17579eeaca..d9d1ea728e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3422,6 +3422,32 @@ static struct config_int ConfigureNamesInt[] =
 		check_huge_page_size, NULL, NULL
 	},
 
+	{
+		{"max_sort_batches", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Sets the maximum number of batches for sort"),
+			NULL
+		},
+		&max_sort_batches,
+		/* boot and min value 0 mean is disable */
+		0, 0,
+		/* I think too many batches will make each batch not enough working memory when sorting */
+		MAX_PARALLEL_WORKER_LIMIT * 3,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"max_hashagg_batches", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Sets the maximum number of batches for hash aggregate"),
+			NULL
+		},
+		&max_hashagg_batches,
+		/* boot and min value 0 mean is disable, I think max_files_per_process/2 is a good value */
+		0, 0,
+		/* I think too many batches will make FD will open and close files frequently, this is guessed-at value */
+		10000,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"debug_invalidate_system_caches_always", PGC_SUSET, DEVELOPER_OPTIONS,
 			gettext_noop("Aggressively invalidate system caches for debugging purposes."),
diff --git a/src/backend/utils/sort/Makefile b/src/backend/utils/sort/Makefile
index 7ac3659261..f82f5aa8cd 100644
--- a/src/backend/utils/sort/Makefile
+++ b/src/backend/utils/sort/Makefile
@@ -19,7 +19,8 @@ OBJS = \
 	sharedtuplestore.o \
 	sortsupport.o \
 	tuplesort.o \
-	tuplestore.o
+	tuplestore.o \
+	batchstore.o
 
 tuplesort.o: qsort_tuple.c
 
diff --git a/src/backend/utils/sort/batchstore.c b/src/backend/utils/sort/batchstore.c
new file mode 100644
index 0000000000..4578c718d3
--- /dev/null
+++ b/src/backend/utils/sort/batchstore.c
@@ -0,0 +1,466 @@
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/parallel.h"
+#include "commands/tablespace.h"
+#include "executor/nodeHash.h"
+#include "port/atomics.h"
+#include "storage/buffile.h"
+#include "utils/batchstore.h"
+#include "utils/memutils.h"
+#include "utils/sharedtuplestore.h"
+
+#define InvalidBatch UINT32_MAX
+
+typedef enum BatchMethod
+{
+	BSM_HASH = 1,
+	BSM_PARALLEL_HASH
+}BatchMethod;
+
+typedef struct BatchStoreParallelHashData
+{
+	pg_atomic_uint32	cur_batches;
+	uint32				num_batches;
+	uint32				num_participants;
+}BatchStoreParallelHashData;
+
+typedef struct BatchStoreData
+{
+	BatchStoreFuncs	func;
+	BatchMethod	method;
+	uint32		num_batches;
+	void	   *cur_batch_ptr;
+	uint32		cur_batch_num;
+	union
+	{
+		/* for hash */
+		struct
+		{
+			StringInfoData	hash_read_buf;
+		};
+		/* for parallel hash */
+		struct
+		{
+			dsm_segment		   *dsm_seg;
+			MemoryContext		accessor_mcontext;
+			/* in shared memory, next idle parallel hash batch number */
+			pg_atomic_uint32   *shm_ph_batch_num;
+		};
+	};
+	void	 *all_batches[FLEXIBLE_ARRAY_MEMBER];
+}BatchStoreData;
+
+static void bs_write_normal_hash(BatchStore bs, MinimalTuple mtup, uint32 hash);
+static MinimalTuple bs_read_normal_hash(BatchStore bs, uint32 *hash);
+
+static void bs_write_parallel_hash(BatchStore bs, MinimalTuple mtup, uint32 hash);
+static void bs_write_parallel_one_batch_hash(BatchStore bs, MinimalTuple mtup, uint32 hash);
+static MinimalTuple bs_read_parallel_hash(BatchStore bs, uint32 *hash);
+
+/*
+ * make an empty batch store
+ */
+static inline BatchStore
+make_empty_batch_store(uint32 num_batches, BatchMethod method)
+{
+	BatchStore bs = palloc0(offsetof(BatchStoreData, all_batches) +
+								sizeof(void*) * num_batches);
+
+	bs->method = method;
+	bs->num_batches = num_batches;
+	bs->cur_batch_num = InvalidBatch;
+
+	return bs;
+}
+
+/*
+ * Create a normal batch store
+ */
+BatchStore
+bs_begin_hash(uint32 num_batches)
+{
+	BatchStore bs = make_empty_batch_store(num_batches, BSM_HASH);
+
+	/* Initialize hash read buffer for MinimalTuple */
+	initStringInfo(&bs->hash_read_buf);
+	enlargeStringInfo(&bs->hash_read_buf, MINIMAL_TUPLE_DATA_OFFSET);
+	MemSet(bs->hash_read_buf.data, 0, MINIMAL_TUPLE_DATA_OFFSET);
+
+	PrepareTempTablespaces();
+
+	bs->func.hash_write = bs_write_normal_hash;
+	bs->func.hash_read = bs_read_normal_hash;
+	return bs;
+}
+
+size_t
+bs_parallel_hash_estimate(uint32 num_batches, uint32 nparticipants)
+{
+	return MAXALIGN(sizeof(struct BatchStoreParallelHashData)) +
+				MAXALIGN(sts_estimate(nparticipants)) * num_batches;
+}
+
+/*
+ * Create or attach shared batch store
+ */
+static BatchStore
+bs_begin_parallel_hash(BatchStoreParallelHash bsph,
+					   uint32 my_participant_num, bool init,
+					   SharedFileSet *fileset, const char *name,
+					   dsm_segment *dsm_seg)
+{
+	uint32			i;
+	MemoryContext	oldcontext;
+	char		   *addr;
+	char			buffer[24];
+	Size			sts_size = MAXALIGN(sts_estimate(bsph->num_participants));
+	BatchStore		bs = make_empty_batch_store(bsph->num_batches, BSM_PARALLEL_HASH);
+
+	bs->shm_ph_batch_num = &bsph->cur_batches;
+
+	bs->accessor_mcontext = AllocSetContextCreate(CurrentMemoryContext,
+												  "batch parallel hash",
+												  ALLOCSET_DEFAULT_SIZES);
+	oldcontext = MemoryContextSwitchTo(bs->accessor_mcontext);
+	addr = ((char*)bsph) + MAXALIGN(sizeof(*bsph));
+	for (i=bsph->num_batches;i>0;)
+	{
+		--i;
+		if (init)
+		{
+			sprintf(buffer, "%s_%u", name, i);
+			bs->all_batches[i] = sts_initialize((SharedTuplestore*)addr,
+												bsph->num_participants,
+												my_participant_num,
+												sizeof(uint32),
+												0,
+												fileset,
+												buffer);
+		}else
+		{
+			bs->all_batches[i] = sts_attach((SharedTuplestore*)addr,
+											my_participant_num,
+											fileset);
+		}
+		addr += sts_size;
+	}
+	MemoryContextSwitchTo(oldcontext);
+
+	bs->dsm_seg = dsm_seg;
+	bs->func.hash_read = bs_read_parallel_hash;
+	if (bs->num_batches == 1)
+		bs->func.hash_write = bs_write_parallel_one_batch_hash;
+	else
+		bs->func.hash_write = bs_write_parallel_hash;
+
+	return bs;
+}
+
+/*
+ * Create a parallel batch store
+ */
+BatchStore
+bs_init_parallel_hash(uint32 num_batches,
+					  uint32 nparticipants, uint32 my_participant_num,
+					  BatchStoreParallelHash bsph, dsm_segment *dsm_seg,
+					  SharedFileSet *fileset, const char *name)
+{
+	Assert(name != NULL && fileset != NULL);
+	bsph->num_batches = num_batches;
+	bsph->num_participants = nparticipants;
+	pg_atomic_init_u32(&bsph->cur_batches, InvalidBatch);
+
+	return bs_begin_parallel_hash(bsph, my_participant_num, true, fileset, name, dsm_seg);
+}
+
+/*
+ * Attach from parallel batch store
+ */
+BatchStore
+bs_attach_parallel_hash(BatchStoreParallelHash bsph, dsm_segment *dsm_seg,
+						SharedFileSet *fileset, uint32 my_participant_num)
+{
+	return bs_begin_parallel_hash(bsph, my_participant_num, false, fileset, NULL, dsm_seg);
+}
+
+/* Destory batch store */
+void
+bs_destory(BatchStore bs)
+{
+	uint32	i;
+	if (bs == NULL)
+		return;
+
+	switch(bs->method)
+	{
+	case BSM_HASH:
+		for(i=0;i<bs->num_batches;++i)
+		{
+			if (bs->all_batches[i])
+				BufFileClose(bs->all_batches[i]);
+		}
+		pfree(bs->hash_read_buf.data);
+		break;
+	case BSM_PARALLEL_HASH:
+		{
+			BatchStoreParallelHash bsph = (BatchStoreParallelHash)(((char*)bs->shm_ph_batch_num) -
+											offsetof(BatchStoreParallelHashData, cur_batches));
+			uint32 count = bsph->num_batches;
+			while (count > 0)
+				sts_detach(bs->all_batches[--count]);
+			MemoryContextDelete(bs->accessor_mcontext);
+		}
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+
+	pfree(bs);
+}
+
+/*
+ * Write MinimalTuple and hash value to normal batch store
+ */
+static void
+bs_write_normal_hash(BatchStore bs, MinimalTuple mtup, uint32 hash)
+{
+	uint32 batch = hash % bs->num_batches;
+	uint32 data_len = mtup->t_len - MINIMAL_TUPLE_DATA_OFFSET;
+	BufFile *buffile = bs->all_batches[batch];
+
+	if (unlikely(buffile == NULL))
+	{
+		MemoryContext oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(bs));
+		buffile = BufFileCreateTemp(false);
+		bs->all_batches[batch] = buffile;
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	BufFileWrite(buffile, &hash, sizeof(hash));
+	BufFileWrite(buffile, &mtup->t_len, sizeof(mtup->t_len));
+	BufFileWrite(buffile, ((char*)mtup) + MINIMAL_TUPLE_DATA_OFFSET, data_len);
+}
+
+/*
+ * Read MinimalTuple and hash value from normal batch store
+ */
+static MinimalTuple
+bs_read_normal_hash(BatchStore bs, uint32 *hash)
+{
+	MinimalTuple	mtup;
+	size_t			nread;
+	uint32			head[2];
+	uint32			data_len;
+
+	/* Read hash value and tuple length */
+	nread = BufFileRead(bs->cur_batch_ptr, head, sizeof(head));
+	if (nread == 0)
+		return NULL;
+
+	if (nread != sizeof(head))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from batch store temporary file: %m")));
+	*hash = head[0];
+
+	/* Enlarge buffer and read tuple data */
+	enlargeStringInfo(&bs->hash_read_buf, head[1]);
+	mtup = (MinimalTuple)bs->hash_read_buf.data;
+	mtup->t_len = head[1];
+	data_len = head[1] - MINIMAL_TUPLE_DATA_OFFSET;
+	if (BufFileRead(bs->cur_batch_ptr, ((char*)mtup) + MINIMAL_TUPLE_DATA_OFFSET, data_len) != data_len)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from batch store temporary file: %m")));
+
+	return mtup;
+}
+
+/* end write batch store, ready for read */
+void
+bs_end_write(BatchStore bs)
+{
+	uint32 i;
+	switch(bs->method)
+	{
+	case BSM_HASH:
+		/* nothing to do */
+		break;
+	case BSM_PARALLEL_HASH:
+		for (i=bs->num_batches;i>0;)
+			sts_end_write(bs->all_batches[--i]);
+		bs->cur_batch_ptr = NULL;
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+}
+
+/*
+ * Write MinimalTuple and hash value to parallel batch store
+ */
+static void
+bs_write_parallel_hash(BatchStore bs, MinimalTuple mtup, uint32 hash)
+{
+	sts_puttuple(bs->all_batches[hash%bs->num_batches],
+				 &hash,
+				 mtup);
+}
+
+/*
+ * Write MinimalTuple and hash value to parallel batch store,
+ * only for only have one batch
+ */
+static void
+bs_write_parallel_one_batch_hash(BatchStore bs, MinimalTuple mtup, uint32 hash)
+{
+	Assert(bs->num_batches == 1);
+	sts_puttuple(bs->all_batches[0],
+				 &hash,
+				 mtup);
+}
+
+/*
+ * Read MinimalTuple and hash value from parallel batch store
+ */
+static MinimalTuple
+bs_read_parallel_hash(BatchStore bs, uint32 *hash)
+{
+	return sts_scan_next(bs->cur_batch_ptr, hash);
+}
+
+/*
+ * Get next batch from batch store, return false if no more.
+ */
+bool
+bs_next_batch(BatchStore bs, bool no_parallel)
+{
+	uint32 batch;
+	switch(bs->method)
+	{
+	case BSM_HASH:
+
+		batch = bs->cur_batch_num;
+		++batch;
+
+		for (;batch < bs->num_batches;++batch)
+		{
+			if (bs->all_batches[batch])
+			{
+				bs->cur_batch_ptr = bs->all_batches[batch];
+				bs->cur_batch_num = batch;
+				if (BufFileSeek(bs->cur_batch_ptr, 0, 0, SEEK_SET) != 0)
+				{
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("can not seek batch store file to head")));
+				}
+				return true;
+			}
+		}
+		break;
+	case BSM_PARALLEL_HASH:
+		if (no_parallel)
+		{
+			batch = bs->cur_batch_num;
+			++batch;
+		}else
+		{
+			batch = pg_atomic_add_fetch_u32(bs->shm_ph_batch_num, 1);
+		}
+
+		if (batch < bs->num_batches)
+		{
+			bs->cur_batch_num = batch;
+			bs->cur_batch_ptr = bs->all_batches[batch];
+			sts_begin_scan(bs->cur_batch_ptr);
+			return true;
+		}
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+
+	return false;
+}
+
+void
+bs_rescan(BatchStore bs)
+{
+	switch(bs->method)
+	{
+	case BSM_HASH:
+		break;
+	case BSM_PARALLEL_HASH:
+		for (uint32 i=bs->num_batches;i>0;)
+			sts_reinitialize(bs->all_batches[--i]);
+		pg_atomic_write_u32(bs->shm_ph_batch_num, InvalidBatch);
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+	bs->cur_batch_ptr = NULL;
+	bs->cur_batch_num = InvalidBatch;
+}
+
+/*
+ * Delete all the contents of a batch store
+ */
+void
+bs_clear(BatchStore bs)
+{
+	uint32	i;
+	switch (bs->method)
+	{
+	case BSM_HASH:
+		for (i=bs->num_batches;i>0;)
+		{
+			--i;
+			if (bs->all_batches[i])
+			{
+				BufFileClose(bs->all_batches[i]);
+				bs->all_batches[i] = 0;
+			}
+		}
+		break;
+	case BSM_PARALLEL_HASH:
+		pg_atomic_write_u32(bs->shm_ph_batch_num, InvalidBatch);
+		for (i=bs->num_batches;i>0;)
+			sts_clear(bs->all_batches[--i]);
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+	bs->cur_batch_ptr = NULL;
+	bs->cur_batch_num = InvalidBatch;
+}
+
+void
+bs_end_cur_batch(BatchStore bs)
+{
+	switch(bs->method)
+	{
+	case BSM_HASH:
+		bs->cur_batch_ptr = NULL;
+		break;
+	case BSM_PARALLEL_HASH:
+		sts_end_scan(bs->cur_batch_ptr);
+		bs->cur_batch_ptr = NULL;
+		break;
+	default:
+		ereport(ERROR,
+				(errmsg("unknown batch store method %u", bs->method)));
+		break;
+	}
+}
diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c
index 57e35db4f8..03472c080e 100644
--- a/src/backend/utils/sort/sharedtuplestore.c
+++ b/src/backend/utils/sort/sharedtuplestore.c
@@ -24,9 +24,10 @@
 #include "access/htup.h"
 #include "access/htup_details.h"
 #include "miscadmin.h"
+#include "port/atomics.h"
 #include "storage/buffile.h"
-#include "storage/lwlock.h"
 #include "storage/sharedfileset.h"
+#include "utils/memutils.h"
 #include "utils/sharedtuplestore.h"
 
 /*
@@ -50,8 +51,7 @@ typedef struct SharedTuplestoreChunk
 /* Per-participant shared state. */
 typedef struct SharedTuplestoreParticipant
 {
-	LWLock		lock;
-	BlockNumber read_page;		/* Page number for next read. */
+	pg_atomic_uint32 read_page;	/* Page number for next read. */
 	BlockNumber npages;			/* Number of pages written. */
 	bool		writing;		/* Used only for assertions. */
 } SharedTuplestoreParticipant;
@@ -72,6 +72,8 @@ struct SharedTuplestore
 struct SharedTuplestoreAccessor
 {
 	int			participant;	/* My participant number. */
+	bool		is_read_only;	/* is read only attach? */
+	bool		is_normal_scan;	/* is not parallel scan? */
 	SharedTuplestore *sts;		/* The shared state. */
 	SharedFileSet *fileset;		/* The SharedFileSet holding files. */
 	MemoryContext context;		/* Memory context for buffers. */
@@ -155,9 +157,8 @@ sts_initialize(SharedTuplestore *sts, int participants,
 
 	for (i = 0; i < participants; ++i)
 	{
-		LWLockInitialize(&sts->participants[i].lock,
-						 LWTRANCHE_SHARED_TUPLESTORE);
-		sts->participants[i].read_page = 0;
+		pg_atomic_init_u32(&sts->participants[i].read_page, 0);
+		sts->participants[i].npages = 0;
 		sts->participants[i].writing = false;
 	}
 
@@ -192,6 +193,27 @@ sts_attach(SharedTuplestore *sts,
 	return accessor;
 }
 
+/*
+ * Like function sts_attach, but only for read, can't write any tuple
+ */
+SharedTuplestoreAccessor *
+sts_attach_read_only(SharedTuplestore *sts,
+					 SharedFileSet *fileset)
+{
+	SharedTuplestoreAccessor *accessor;
+
+	Assert(sts->nparticipants > 0);
+
+	accessor = palloc0(sizeof(SharedTuplestoreAccessor));
+	accessor->is_read_only = true;
+	accessor->participant = 0;
+	accessor->sts = sts;
+	accessor->fileset = fileset;
+	accessor->context = CurrentMemoryContext;
+
+	return accessor;
+}
+
 static void
 sts_flush_chunk(SharedTuplestoreAccessor *accessor)
 {
@@ -242,10 +264,42 @@ sts_reinitialize(SharedTuplestoreAccessor *accessor)
 	 */
 	for (i = 0; i < accessor->sts->nparticipants; ++i)
 	{
-		accessor->sts->participants[i].read_page = 0;
+		pg_atomic_init_u32(&accessor->sts->participants[i].read_page, 0);
 	}
 }
 
+/*
+ * Delete all the contents of shared tuplestore, and reset read page to the start
+ */
+void
+sts_clear(SharedTuplestoreAccessor *accessor)
+{
+	int					i;
+	char				name[MAXPGPATH];
+	SharedTuplestore   *sts = accessor->sts;
+
+	/* Must not in read and write */
+	Assert(accessor->read_file == NULL &&
+		   accessor->write_file == NULL);
+
+	/* Delete all created files */
+	for (i=0;i<sts->nparticipants;++i)
+	{
+		Assert(sts->participants[i].writing == false);
+		if (sts->participants[i].npages > 0)
+		{
+			sts_filename(name, accessor, i);
+			BufFileDeleteShared(accessor->fileset, name);
+			pg_atomic_write_u32(&sts->participants[i].read_page, 0);
+			sts->participants[i].npages = 0;
+		}
+	}
+
+	accessor->write_page = 0;
+	accessor->write_pointer = NULL;
+	accessor->write_end = NULL;
+}
+
 /*
  * Begin scanning the contents in parallel.
  */
@@ -272,6 +326,8 @@ sts_begin_parallel_scan(SharedTuplestoreAccessor *accessor)
 	accessor->read_participant = accessor->participant;
 	accessor->read_file = NULL;
 	accessor->read_next_page = 0;
+	accessor->read_ntuples = 0;
+	accessor->read_ntuples_available = 0;
 }
 
 /*
@@ -302,15 +358,23 @@ sts_puttuple(SharedTuplestoreAccessor *accessor, void *meta_data,
 {
 	size_t		size;
 
+	if (unlikely(accessor->is_read_only))
+	{
+		ereport(ERROR,
+				(errmsg("shard tuplestore is attached read only")));
+	}
+
 	/* Do we have our own file yet? */
 	if (accessor->write_file == NULL)
 	{
 		SharedTuplestoreParticipant *participant;
-		char		name[MAXPGPATH];
+		char			name[MAXPGPATH];
+		MemoryContext	oldcontext = MemoryContextSwitchTo(accessor->context);
 
 		/* Create one.  Only this backend will write into it. */
 		sts_filename(name, accessor, accessor->participant);
 		accessor->write_file = BufFileCreateShared(accessor->fileset, name);
+		MemoryContextSwitchTo(oldcontext);
 
 		/* Set up the shared state for this backend's file. */
 		participant = &accessor->sts->participants[accessor->participant];
@@ -532,20 +596,36 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data)
 		/* Find the location of a new chunk to read. */
 		p = &accessor->sts->participants[accessor->read_participant];
 
-		LWLockAcquire(&p->lock, LW_EXCLUSIVE);
-		/* We can skip directly past overflow pages we know about. */
-		if (p->read_page < accessor->read_next_page)
-			p->read_page = accessor->read_next_page;
-		eof = p->read_page >= p->npages;
-		if (!eof)
+		if (accessor->is_normal_scan)
+		{
+			eof = accessor->read_next_page >= p->npages;
+			if (!eof)
+			{
+				read_page = accessor->read_next_page;
+				accessor->read_next_page += STS_CHUNK_PAGES;
+			}
+		}else
 		{
 			/* Claim the next chunk. */
-			read_page = p->read_page;
-			/* Advance the read head for the next reader. */
-			p->read_page += STS_CHUNK_PAGES;
-			accessor->read_next_page = p->read_page;
+			read_page = pg_atomic_read_u32(&p->read_page);
+			/* We can skip directly past overflow pages we know about. */
+			while (read_page < accessor->read_next_page)
+			{
+				if (pg_atomic_compare_exchange_u32(&p->read_page,
+												   &read_page,
+												   accessor->read_next_page))
+					break;
+			}
+			while ((eof = read_page >= p->npages) == false)
+			{
+				/* Advance the read head for the next reader. */
+				accessor->read_next_page = read_page + STS_CHUNK_PAGES;
+				if (pg_atomic_compare_exchange_u32(&p->read_page,
+												   &read_page,
+												   accessor->read_next_page))
+					break;
+			}
 		}
-		LWLockRelease(&p->lock);
 
 		if (!eof)
 		{
@@ -556,10 +636,12 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data)
 			if (accessor->read_file == NULL)
 			{
 				char		name[MAXPGPATH];
+				MemoryContext oldcontext = MemoryContextSwitchTo(accessor->context);
 
 				sts_filename(name, accessor, accessor->read_participant);
 				accessor->read_file =
 					BufFileOpenShared(accessor->fileset, name, O_RDONLY);
+				MemoryContextSwitchTo(oldcontext);
 			}
 
 			/* Seek and load the chunk header. */
@@ -626,3 +708,46 @@ sts_filename(char *name, SharedTuplestoreAccessor *accessor, int participant)
 {
 	snprintf(name, MAXPGPATH, "%s.p%d", accessor->sts->name, participant);
 }
+
+/*
+ * Begin scanning the contents.
+ */
+void
+sts_begin_scan(SharedTuplestoreAccessor *accessor)
+{
+	sts_begin_parallel_scan(accessor);
+	accessor->is_normal_scan = true;
+}
+
+/*
+ * Finish a normal scan, freeing associated backend-local resources.
+ */
+void
+sts_end_scan(SharedTuplestoreAccessor *accessor)
+{
+	Assert(accessor->is_normal_scan);
+	sts_end_parallel_scan(accessor);
+	accessor->is_normal_scan = false;
+}
+
+/*
+ * Get the next tuple in the current normal scan.
+ */
+MinimalTuple
+sts_scan_next(SharedTuplestoreAccessor *accessor,
+					   void *meta_data)
+{
+	Assert(accessor->is_normal_scan);
+	return sts_parallel_scan_next(accessor, meta_data);
+}
+
+/*
+ * Detach SharedTuplestore and freeing associated backend-local resources.
+ */
+void
+sts_detach(SharedTuplestoreAccessor *accessor)
+{
+	sts_end_write(accessor);
+	sts_end_parallel_scan(accessor);
+	pfree(accessor);
+}
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 7d0f96afb7..fc38d26a68 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -4573,6 +4573,26 @@ tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
 	}
 }
 
+/*
+ * tuplesort_reset_shared - reset shared tuplesort state
+ *
+ * Must be called from leader process before workers are launched,
+ * can be reuse after this call
+ */
+void
+tuplesort_reset_shared(Sharedsort *shared)
+{
+	int			i;
+
+	shared->currentWorker = 0;
+	shared->workersFinished = 0;
+	SharedFileSetDeleteAll(&shared->fileset);
+	for (i = 0; i < shared->nTapes; i++)
+	{
+		shared->tapes[i].firstblocknumber = 0L;
+	}
+}
+
 /*
  * tuplesort_attach_shared - attach to shared tuplesort state
  *
diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h
index 398446d11f..68b277b379 100644
--- a/src/include/executor/nodeAgg.h
+++ b/src/include/executor/nodeAgg.h
@@ -310,6 +310,8 @@ typedef struct AggStatePerHashData
 	int			largestGrpColIdx;	/* largest col required for hashing */
 	AttrNumber *hashGrpColIdxInput; /* hash col indices in input slot */
 	AttrNumber *hashGrpColIdxHash;	/* indices in hash table tuples */
+	Bitmapset  *colnos_needed;	/* all columns needed from the outer plan */
+	struct BatchStoreData *batch_store;	/* grouping set batch store hash */
 	Agg		   *aggnode;		/* original Agg node, for numGroups etc. */
 }			AggStatePerHashData;
 
diff --git a/src/include/executor/nodeBatchSort.h b/src/include/executor/nodeBatchSort.h
new file mode 100644
index 0000000000..66c68e0125
--- /dev/null
+++ b/src/include/executor/nodeBatchSort.h
@@ -0,0 +1,19 @@
+
+#ifndef NODE_BATCH_SORT_H
+#define NODE_BATCH_SORT_H
+
+#include "access/parallel.h"
+#include "nodes/execnodes.h"
+
+extern BatchSortState *ExecInitBatchSort(BatchSort *node, EState *estate, int eflags);
+extern void ExecEndBatchSort(BatchSortState *node);
+extern void ExecReScanBatchSort(BatchSortState *node);
+
+/* parallel scan support */
+extern void ExecBatchSortEstimate(BatchSortState *node, ParallelContext *pcxt);
+extern void ExecBatchSortInitializeDSM(BatchSortState *node, ParallelContext *pcxt);
+extern void ExecBatchSortReInitializeDSM(BatchSortState *node, ParallelContext *pcxt);
+extern void ExecBatchSortInitializeWorker(BatchSortState *node, ParallelWorkerContext *pwcxt);
+extern void ExecShutdownBatchSort(BatchSortState *node);
+
+#endif							/* NODE_BATCH_SORT_H */
\ No newline at end of file
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 48c3f570fa..8ad2a24ab7 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2011,6 +2011,22 @@ typedef struct SortState
 	SharedSortInfo *shared_info;	/* one entry per worker */
 } SortState;
 
+/* ----------------
+ *	 BatchSortState information
+ * ----------------
+ */
+typedef struct BatchSortState
+{
+	PlanState	ps;				/* its first field is NodeTag */
+	struct Tuplesortstate
+			  **batches;		/* private state of tuplesort.c */
+	List	   *groupFuns;		/* hash function call info for each group-key */
+	struct ParallelBatchSort
+			   *parallel;		/* parallel info, private in nodeBatchSort.c */
+	int			curBatch;		/* current batch index */
+	bool		sort_Done;		/* sort completed yet? */
+}BatchSortState;
+
 /* ----------------
  *	 Instrumentation information for IncrementalSort
  * ----------------
@@ -2202,6 +2218,9 @@ typedef struct AggState
 										 * ->hash_pergroup */
 	ProjectionInfo *combinedproj;	/* projection machinery */
 	SharedAggInfo *shared_info; /* one entry per worker */
+	struct Barrier *batch_barrier;		/* for parallel batch */
+	int			current_batch_set;		/* current batch grouping set */
+	bool		batch_filled;			/* is all batches filled? */
 } AggState;
 
 /* ----------------
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index caed683ba9..8da83c7269 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -74,6 +74,7 @@ typedef enum NodeTag
 	T_HashJoin,
 	T_Material,
 	T_Sort,
+	T_BatchSort,
 	T_IncrementalSort,
 	T_Group,
 	T_Agg,
@@ -131,6 +132,7 @@ typedef enum NodeTag
 	T_HashJoinState,
 	T_MaterialState,
 	T_SortState,
+	T_BatchSortState,
 	T_IncrementalSortState,
 	T_GroupState,
 	T_AggState,
@@ -245,6 +247,7 @@ typedef enum NodeTag
 	T_ProjectionPath,
 	T_ProjectSetPath,
 	T_SortPath,
+	T_BatchSortPath,
 	T_IncrementalSortPath,
 	T_GroupPath,
 	T_UpperUniquePath,
@@ -759,7 +762,8 @@ typedef enum AggStrategy
 	AGG_PLAIN,					/* simple agg across all input rows */
 	AGG_SORTED,					/* grouped agg, input must be sorted */
 	AGG_HASHED,					/* grouped agg, use internal hashtable */
-	AGG_MIXED					/* grouped agg, hash and sort both used */
+	AGG_MIXED,					/* grouped agg, hash and sort both used */
+	AGG_BATCH_HASH				/* grouped agg, use batch hash */
 } AggStrategy;
 
 /*
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index cde2637798..2db484a7da 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1654,6 +1654,16 @@ typedef struct SortPath
 	Path	   *subpath;		/* path representing input source */
 } SortPath;
 
+typedef struct BatchSortPath
+{
+	Path		path;
+	Path	   *subpath;		/* path representing input source */
+	List	   *batchkeys;		/* our result is not all ordered, only for each batch,
+								 * so we can not use Path::pathkeys */
+	List	   *batchgroup;		/* a list of SortGroupClause for hash */
+	uint32		numBatches;
+}BatchSortPath;
+
 /*
  * IncrementalSortPath represents an incremental sort step
  *
@@ -2500,6 +2510,7 @@ typedef struct
 	bool		target_parallel_safe;
 	Node	   *havingQual;
 	List	   *targetList;
+	List	   *hashable_groups;
 	PartitionwiseAggregateType patype;
 } GroupPathExtraData;
 
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 43160439f0..675850de13 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -774,6 +774,19 @@ typedef struct Sort
 	bool	   *nullsFirst;		/* NULLS FIRST/LAST directions */
 } Sort;
 
+/* ----------------
+ *		batch sort node
+ * ----------------
+ */
+typedef struct BatchSort
+{
+	Sort		sort;
+	int			numGroupCols;	/* number of group-key columns */
+	int			numBatches;		/* number of group */
+	int			gather_param;	/* ID of Param of plan Gather or GatherMerge */
+	AttrNumber *grpColIdx;		/* their indexes in the target list */
+}BatchSort;
+
 /* ----------------
  *		incremental sort node
  * ----------------
@@ -828,6 +841,8 @@ typedef struct Agg
 	/* Note: planner provides numGroups & aggParams only in HASHED/MIXED case */
 	List	   *groupingSets;	/* grouping sets to use */
 	List	   *chain;			/* chained Agg/Sort nodes */
+	int			numBatches;		/* valid in AGG_BATCH_HASH */
+	int			gatherParam;	/* param of gather */
 } Agg;
 
 /* ----------------
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index ed2e4af4be..f44796ad68 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -66,6 +66,8 @@ extern PGDLLIMPORT bool enable_parallel_append;
 extern PGDLLIMPORT bool enable_parallel_hash;
 extern PGDLLIMPORT bool enable_partition_pruning;
 extern PGDLLIMPORT int constraint_exclusion;
+extern PGDLLIMPORT int max_sort_batches;
+extern PGDLLIMPORT int max_hashagg_batches;
 
 extern double index_pages_fetched(double tuples_fetched, BlockNumber pages,
 								  double index_pages, PlannerInfo *root);
@@ -102,6 +104,11 @@ extern void cost_sort(Path *path, PlannerInfo *root,
 					  List *pathkeys, Cost input_cost, double tuples, int width,
 					  Cost comparison_cost, int sort_mem,
 					  double limit_tuples);
+extern void cost_batchsort(Path *path, PlannerInfo *root,
+						   List *batchkeys, Cost input_cost,
+						   double tuples, int width,
+						   Cost comparison_cost, int sort_mem,
+						   uint32 numGroupCols, uint32 numBatchs);
 extern void cost_incremental_sort(Path *path,
 								  PlannerInfo *root, List *pathkeys, int presorted_keys,
 								  Cost input_startup_cost, Cost input_total_cost,
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 23dec14cbd..9f87d71b95 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -195,6 +195,13 @@ extern IncrementalSortPath *create_incremental_sort_path(PlannerInfo *root,
 														 List *pathkeys,
 														 int presorted_keys,
 														 double limit_tuples);
+extern BatchSortPath *create_batchsort_path(PlannerInfo *root,
+											RelOptInfo *rel,
+											Path *subpath,
+											List *pathkeys,
+											List *groupClause,
+											uint32 numBatches,
+											bool parallel_sort);
 extern GroupPath *create_group_path(PlannerInfo *root,
 									RelOptInfo *rel,
 									Path *subpath,
diff --git a/src/include/optimizer/tlist.h b/src/include/optimizer/tlist.h
index e081ef2d5e..188a91e0c2 100644
--- a/src/include/optimizer/tlist.h
+++ b/src/include/optimizer/tlist.h
@@ -36,6 +36,7 @@ extern Oid *extract_grouping_collations(List *groupClause, List *tlist);
 extern AttrNumber *extract_grouping_cols(List *groupClause, List *tlist);
 extern bool grouping_is_sortable(List *groupClause);
 extern bool grouping_is_hashable(List *groupClause);
+extern List *grouping_get_hashable(List *groupClause);
 
 extern PathTarget *make_pathtarget_from_tlist(List *tlist);
 extern List *make_tlist_from_pathtarget(PathTarget *target);
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index c38b689710..67138fa566 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -964,7 +964,9 @@ typedef enum
 	WAIT_EVENT_REPLICATION_SLOT_DROP,
 	WAIT_EVENT_SAFE_SNAPSHOT,
 	WAIT_EVENT_SYNC_REP,
-	WAIT_EVENT_XACT_GROUP_UPDATE
+	WAIT_EVENT_XACT_GROUP_UPDATE,
+	WAIT_EVENT_BATCH_SORT_BUILD,
+	WAIT_EVENT_BATCH_HASH_BUILD
 } WaitEventIPC;
 
 /* ----------
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index cbf2510fbf..cee0a540c9 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -214,7 +214,6 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_PER_SESSION_DSA,
 	LWTRANCHE_PER_SESSION_RECORD_TYPE,
 	LWTRANCHE_PER_SESSION_RECORD_TYPMOD,
-	LWTRANCHE_SHARED_TUPLESTORE,
 	LWTRANCHE_SHARED_TIDBITMAP,
 	LWTRANCHE_PARALLEL_APPEND,
 	LWTRANCHE_PER_XACT_PREDICATE_LIST,
diff --git a/src/include/utils/batchstore.h b/src/include/utils/batchstore.h
new file mode 100644
index 0000000000..f8d276a0d7
--- /dev/null
+++ b/src/include/utils/batchstore.h
@@ -0,0 +1,38 @@
+#ifndef BATCH_STORE_H
+#define BATCH_STORE_H
+
+#include "access/htup.h"
+#include "storage/dsm.h"
+#include "storage/sharedfileset.h"
+
+typedef struct BatchStoreData* BatchStore;
+typedef struct BatchStoreParallelHashData* BatchStoreParallelHash;
+
+typedef struct BatchStoreFuncs
+{
+	void (*hash_write)(BatchStore bs, MinimalTuple mtup, uint32 hash);
+	MinimalTuple (*hash_read)(BatchStore bs, uint32 *hash);
+}BatchStoreFuncs;
+
+#define bs_write_hash(bs, mtup, hash) (*((BatchStoreFuncs*)bs)->hash_write)(bs, mtup, hash)
+#define bs_read_hash(bs, phash) (*((BatchStoreFuncs*)bs)->hash_read)(bs, phash)
+
+extern BatchStore bs_begin_hash(uint32 num_batches);
+
+extern size_t bs_parallel_hash_estimate(uint32 num_batches, uint32 nparticipants);
+extern BatchStore bs_init_parallel_hash(uint32 num_batches,
+										uint32 nparticipants, uint32 my_participant_num,
+										BatchStoreParallelHash bsph, dsm_segment *dsm_seg,
+										SharedFileSet *fileset, const char *name);
+extern BatchStore bs_attach_parallel_hash(BatchStoreParallelHash bsph, dsm_segment *dsm_seg,
+										  SharedFileSet *fileset, uint32 my_participant_num);
+
+extern void bs_destory(BatchStore bs);
+extern void bs_clear(BatchStore bs);
+
+extern void bs_end_write(BatchStore bs);
+
+extern bool bs_next_batch(BatchStore bs, bool no_parallel);
+extern void bs_rescan(BatchStore bs);
+extern void bs_end_cur_batch(BatchStore bs);
+#endif /* BATCH_STORE_H */
\ No newline at end of file
diff --git a/src/include/utils/sharedtuplestore.h b/src/include/utils/sharedtuplestore.h
index 01ad6efe51..1848dc8d78 100644
--- a/src/include/utils/sharedtuplestore.h
+++ b/src/include/utils/sharedtuplestore.h
@@ -43,10 +43,15 @@ extern SharedTuplestoreAccessor *sts_attach(SharedTuplestore *sts,
 											int my_participant_number,
 											SharedFileSet *fileset);
 
+extern SharedTuplestoreAccessor *sts_attach_read_only(SharedTuplestore *sts,
+													  SharedFileSet *fileset);
+
 extern void sts_end_write(SharedTuplestoreAccessor *accessor);
 
 extern void sts_reinitialize(SharedTuplestoreAccessor *accessor);
 
+extern void sts_clear(SharedTuplestoreAccessor *accessor);
+
 extern void sts_begin_parallel_scan(SharedTuplestoreAccessor *accessor);
 
 extern void sts_end_parallel_scan(SharedTuplestoreAccessor *accessor);
@@ -58,4 +63,13 @@ extern void sts_puttuple(SharedTuplestoreAccessor *accessor,
 extern MinimalTuple sts_parallel_scan_next(SharedTuplestoreAccessor *accessor,
 										   void *meta_data);
 
+extern void sts_begin_scan(SharedTuplestoreAccessor *accessor);
+
+extern void sts_end_scan(SharedTuplestoreAccessor *accessor);
+
+extern MinimalTuple sts_scan_next(SharedTuplestoreAccessor *accessor,
+								  void *meta_data);
+
+extern void sts_detach(SharedTuplestoreAccessor *accessor);
+
 #endif							/* SHAREDTUPLESTORE_H */
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index f94949370b..19f92ee946 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -266,6 +266,7 @@ extern Size tuplesort_estimate_shared(int nworkers);
 extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers,
 										dsm_segment *seg);
 extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg);
+extern void tuplesort_reset_shared(Sharedsort *shared);
 
 /*
  * These routines may only be called if randomAccess was specified 'true'.
diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out
index 7c844c6e09..bc1310ad23 100644
--- a/src/test/regress/expected/groupingsets.out
+++ b/src/test/regress/expected/groupingsets.out
@@ -1929,4 +1929,69 @@ set work_mem to default;
 
 drop table gs_group_1;
 drop table gs_hash_1;
+-- parallel grouping sets
+BEGIN;
+set max_hashagg_batches = 512;
+set min_parallel_table_scan_size = 0;
+set parallel_setup_cost = 10;
+explain (costs off)
+select sum(unique1),count(unique1),two,four,ten,twenty from tenk1 group by grouping sets(two,four,ten,(two,twenty),()) order by 3,4,5,6;
+                  QUERY PLAN                  
+----------------------------------------------
+ Gather Merge
+   Workers Planned: 2
+   ->  Sort
+         Sort Key: two, four, ten, twenty
+         ->  Parallel BatchHashAggregate
+               Group Key: two, twenty
+               Group Key: two
+               Group Key: ()
+               Group Key: four
+               Group Key: ten
+               ->  Parallel Seq Scan on tenk1
+(11 rows)
+
+select sum(unique1),count(unique1),two,four,ten,twenty from tenk1 group by grouping sets(two,four,ten,(two,twenty),()) order by 3,4,5,6;
+   sum    | count | two | four | ten | twenty 
+----------+-------+-----+------+-----+--------
+  2495000 |   500 |   0 |      |     |      0
+  2496000 |   500 |   0 |      |     |      2
+  2497000 |   500 |   0 |      |     |      4
+  2498000 |   500 |   0 |      |     |      6
+  2499000 |   500 |   0 |      |     |      8
+  2500000 |   500 |   0 |      |     |     10
+  2501000 |   500 |   0 |      |     |     12
+  2502000 |   500 |   0 |      |     |     14
+  2503000 |   500 |   0 |      |     |     16
+  2504000 |   500 |   0 |      |     |     18
+ 24995000 |  5000 |   0 |      |     |       
+  2495500 |   500 |   1 |      |     |      1
+  2496500 |   500 |   1 |      |     |      3
+  2497500 |   500 |   1 |      |     |      5
+  2498500 |   500 |   1 |      |     |      7
+  2499500 |   500 |   1 |      |     |      9
+  2500500 |   500 |   1 |      |     |     11
+  2501500 |   500 |   1 |      |     |     13
+  2502500 |   500 |   1 |      |     |     15
+  2503500 |   500 |   1 |      |     |     17
+  2504500 |   500 |   1 |      |     |     19
+ 25000000 |  5000 |   1 |      |     |       
+ 12495000 |  2500 |     |    0 |     |       
+ 12497500 |  2500 |     |    1 |     |       
+ 12500000 |  2500 |     |    2 |     |       
+ 12502500 |  2500 |     |    3 |     |       
+  4995000 |  1000 |     |      |   0 |       
+  4996000 |  1000 |     |      |   1 |       
+  4997000 |  1000 |     |      |   2 |       
+  4998000 |  1000 |     |      |   3 |       
+  4999000 |  1000 |     |      |   4 |       
+  5000000 |  1000 |     |      |   5 |       
+  5001000 |  1000 |     |      |   6 |       
+  5002000 |  1000 |     |      |   7 |       
+  5003000 |  1000 |     |      |   8 |       
+  5004000 |  1000 |     |      |   9 |       
+ 49995000 | 10000 |     |      |     |       
+(37 rows)
+
+ABORT;
 -- end
diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out
index dfa4b036b5..899291f9e9 100644
--- a/src/test/regress/expected/partition_aggregate.out
+++ b/src/test/regress/expected/partition_aggregate.out
@@ -1517,3 +1517,141 @@ SELECT x, sum(y), avg(y), count(*) FROM pagg_tab_para GROUP BY x HAVING avg(y) <
  21 | 6000 | 6.0000000000000000 |  1000
 (6 rows)
 
+-- simple agg in parallel
+BEGIN;
+SET max_sort_batches = 6;
+SET min_parallel_table_scan_size = 0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+-- using batch sort
+EXPLAIN (COSTS OFF)
+SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
+                  QUERY PLAN                  
+----------------------------------------------
+ Gather
+   Workers Planned: 2
+   ->  GroupAggregate
+         Group Key: unique2
+         ->  Parallel BatchSort
+               Sort Key: unique2
+               ->  Parallel Seq Scan on tenk1
+(7 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  GroupAggregate
+               Group Key: tenk1.unique2
+               ->  Parallel BatchSort
+                     Sort Key: tenk1.unique2
+                     ->  Parallel Seq Scan on tenk1
+(8 rows)
+
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  Parallel Hash Join
+                     Hash Cond: ((count(*)) = tenk1.unique2)
+                     ->  GroupAggregate
+                           Group Key: tenk1_1.unique2
+                           ->  Parallel BatchSort
+                                 Sort Key: tenk1_1.unique2
+                                 ->  Parallel Seq Scan on tenk1 tenk1_1
+                     ->  Parallel Hash
+                           ->  Parallel Seq Scan on tenk1
+(13 rows)
+
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+ count 
+-------
+ 10000
+(1 row)
+
+-- using batch hash
+SET max_sort_batches = 0;
+SET max_hashagg_batches = 512;
+EXPLAIN (COSTS OFF)
+SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
+               QUERY PLAN               
+----------------------------------------
+ Gather
+   Workers Planned: 2
+   ->  Parallel BatchHashAggregate
+         Group Key: unique2
+         ->  Parallel Seq Scan on tenk1
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+                  QUERY PLAN                  
+----------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Parallel BatchHashAggregate
+               Group Key: tenk1.unique2
+               ->  Parallel Seq Scan on tenk1
+(6 rows)
+
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  Parallel Hash Join
+                     Hash Cond: ((count(*)) = tenk1.unique2)
+                     ->  Parallel BatchHashAggregate
+                           Group Key: tenk1_1.unique2
+                           ->  Parallel Seq Scan on tenk1 tenk1_1
+                     ->  Parallel Hash
+                           ->  Parallel Seq Scan on tenk1
+(11 rows)
+
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+ count 
+-------
+ 10000
+(1 row)
+
+ABORT;
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index 11c6f50fbf..27d77a33ba 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -306,3 +306,80 @@ SELECT null IS NOT DISTINCT FROM null as "yes";
  t
 (1 row)
 
+-- parallel distinct
+BEGIN;
+SET max_sort_batches = 6;
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+-- using batch sort
+EXPLAIN (costs off)
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Unique
+               ->  Parallel BatchSort
+                     Sort Key: tenk1.unique2
+                     ->  Parallel Seq Scan on tenk1
+(7 rows)
+
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+explain (costs off)
+SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Gather
+   Workers Planned: 2
+   ->  Unique
+         ->  Parallel BatchSort
+               Sort Key: tenk1.unique2
+               ->  Unique
+                     ->  Parallel BatchSort
+                           Sort Key: tenk1.unique2
+                           ->  Parallel Seq Scan on tenk1
+(9 rows)
+
+-- using batch hash
+SET max_sort_batches = 0;
+SET max_hashagg_batches = 512;
+EXPLAIN (costs off)
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+                  QUERY PLAN                  
+----------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Parallel BatchHashAggregate
+               Group Key: tenk1.unique2
+               ->  Parallel Seq Scan on tenk1
+(6 rows)
+
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+explain (costs off)
+SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+                  QUERY PLAN                  
+----------------------------------------------
+ Gather
+   Workers Planned: 2
+   ->  Parallel BatchHashAggregate
+         Group Key: tenk1.unique2
+         ->  Parallel BatchHashAggregate
+               Group Key: tenk1.unique2
+               ->  Parallel Seq Scan on tenk1
+(7 rows)
+
+ABORT;
diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out
index 9b0c418db7..0d399f3d18 100644
--- a/src/test/regress/expected/select_parallel.out
+++ b/src/test/regress/expected/select_parallel.out
@@ -845,6 +845,92 @@ select * from
  VVVVxx  |  2500 | 3
 (12 rows)
 
+--test rescan for batch sort
+SET max_sort_batches = 12;
+explain (costs off)
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Sort
+   Sort Key: "*VALUES*".column1, tenk1.string4
+   ->  Nested Loop Left Join
+         ->  Values Scan on "*VALUES*"
+         ->  Gather
+               Workers Planned: 4
+               ->  GroupAggregate
+                     Group Key: tenk1.string4
+                     ->  Parallel BatchSort
+                           Sort Key: tenk1.string4
+                           ->  Parallel Seq Scan on tenk1
+(11 rows)
+
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+ string4 | count | x 
+---------+-------+---
+ AAAAxx  |  2500 | 1
+ HHHHxx  |  2500 | 1
+ OOOOxx  |  2500 | 1
+ VVVVxx  |  2500 | 1
+ AAAAxx  |  2500 | 2
+ HHHHxx  |  2500 | 2
+ OOOOxx  |  2500 | 2
+ VVVVxx  |  2500 | 2
+ AAAAxx  |  2500 | 3
+ HHHHxx  |  2500 | 3
+ OOOOxx  |  2500 | 3
+ VVVVxx  |  2500 | 3
+(12 rows)
+
+reset max_sort_batches;
+--test rescan for batch hash agg
+SET max_hashagg_batches = 512;
+explain (costs off)
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Sort
+   Sort Key: "*VALUES*".column1, tenk1.string4
+   ->  Nested Loop Left Join
+         ->  Values Scan on "*VALUES*"
+         ->  Gather Merge
+               Workers Planned: 4
+               ->  Sort
+                     Sort Key: tenk1.string4
+                     ->  Parallel BatchHashAggregate
+                           Group Key: tenk1.string4
+                           ->  Parallel Seq Scan on tenk1
+(11 rows)
+
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+ string4 | count | x 
+---------+-------+---
+ AAAAxx  |  2500 | 1
+ HHHHxx  |  2500 | 1
+ OOOOxx  |  2500 | 1
+ VVVVxx  |  2500 | 1
+ AAAAxx  |  2500 | 2
+ HHHHxx  |  2500 | 2
+ OOOOxx  |  2500 | 2
+ VVVVxx  |  2500 | 2
+ AAAAxx  |  2500 | 3
+ HHHHxx  |  2500 | 3
+ OOOOxx  |  2500 | 3
+ VVVVxx  |  2500 | 3
+(12 rows)
+
+reset max_hashagg_batches;
 reset enable_material;
 reset enable_hashagg;
 -- check parallelized int8 aggregate (bug #14897)
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index 75f78db8f5..8083a4802a 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -1420,3 +1420,88 @@ where (x = 0) or (q1 >= q2 and q1 <= q2);
  4567890123456789 |  4567890123456789 | 1
 (6 rows)
 
+-- parallel union
+BEGIN;
+SET max_sort_batches = 6;
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+-- using batch sort
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Unique
+               ->  Parallel BatchSort
+                     Sort Key: tenk1.unique2
+                     ->  Parallel Append
+                           ->  Parallel Seq Scan on tenk1
+                           ->  Parallel Seq Scan on tenk1 tenk1_1
+(9 rows)
+
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  Parallel Hash Join
+                     Hash Cond: (tenk1_1.unique2 = tenk1.unique2)
+                     ->  Unique
+                           ->  Parallel BatchSort
+                                 Sort Key: tenk1_1.unique2
+                                 ->  Parallel Append
+                                       ->  Parallel Seq Scan on tenk1 tenk1_1
+                                       ->  Parallel Seq Scan on tenk1 tenk1_2
+                     ->  Parallel Hash
+                           ->  Parallel Seq Scan on tenk1
+(14 rows)
+
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+ count 
+-------
+ 10000
+(1 row)
+
+-- using batch hash
+SET max_sort_batches = 0;
+SET max_hashagg_batches = 512;
+EXPLAIN (costs off)
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Parallel BatchHashAggregate
+               Group Key: tenk1.hundred
+               ->  Parallel Append
+                     ->  Parallel Seq Scan on tenk1
+                     ->  Parallel Seq Scan on tenk1 tenk1_1
+(8 rows)
+
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+ count 
+-------
+   100
+(1 row)
+
+ABORT;
diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql
index 18ae803e9d..1913ee78ba 100644
--- a/src/test/regress/sql/groupingsets.sql
+++ b/src/test/regress/sql/groupingsets.sql
@@ -529,4 +529,14 @@ set work_mem to default;
 drop table gs_group_1;
 drop table gs_hash_1;
 
+-- parallel grouping sets
+BEGIN;
+set max_hashagg_batches = 512;
+set min_parallel_table_scan_size = 0;
+set parallel_setup_cost = 10;
+explain (costs off)
+select sum(unique1),count(unique1),two,four,ten,twenty from tenk1 group by grouping sets(two,four,ten,(two,twenty),()) order by 3,4,5,6;
+select sum(unique1),count(unique1),two,four,ten,twenty from tenk1 group by grouping sets(two,four,ten,(two,twenty),()) order by 3,4,5,6;
+ABORT;
+
 -- end
diff --git a/src/test/regress/sql/partition_aggregate.sql b/src/test/regress/sql/partition_aggregate.sql
index c17294b15b..2948d8c2b4 100644
--- a/src/test/regress/sql/partition_aggregate.sql
+++ b/src/test/regress/sql/partition_aggregate.sql
@@ -331,3 +331,48 @@ RESET parallel_setup_cost;
 EXPLAIN (COSTS OFF)
 SELECT x, sum(y), avg(y), count(*) FROM pagg_tab_para GROUP BY x HAVING avg(y) < 7 ORDER BY 1, 2, 3;
 SELECT x, sum(y), avg(y), count(*) FROM pagg_tab_para GROUP BY x HAVING avg(y) < 7 ORDER BY 1, 2, 3;
+
+-- simple agg in parallel
+BEGIN;
+SET max_sort_batches = 6;
+SET min_parallel_table_scan_size = 0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+-- using batch sort
+EXPLAIN (COSTS OFF)
+SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+-- using batch hash
+SET max_sort_batches = 0;
+SET max_hashagg_batches = 512;
+EXPLAIN (COSTS OFF)
+SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+ABORT;
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index 33102744eb..c98926309a 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -135,3 +135,26 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no";
 SELECT 2 IS NOT DISTINCT FROM 2 as "yes";
 SELECT 2 IS NOT DISTINCT FROM null as "no";
 SELECT null IS NOT DISTINCT FROM null as "yes";
+
+-- parallel distinct
+BEGIN;
+SET max_sort_batches = 6;
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+-- using batch sort
+EXPLAIN (costs off)
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+explain (costs off)
+SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+-- using batch hash
+SET max_sort_batches = 0;
+SET max_hashagg_batches = 512;
+EXPLAIN (costs off)
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+explain (costs off)
+SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+ABORT;
\ No newline at end of file
diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql
index 5a01a98b26..7ee69e1b97 100644
--- a/src/test/regress/sql/select_parallel.sql
+++ b/src/test/regress/sql/select_parallel.sql
@@ -312,6 +312,38 @@ select * from
    from tenk1 group by string4 order by string4) ss
   right join (values (1),(2),(3)) v(x) on true;
 
+--test rescan for batch sort
+SET max_sort_batches = 12;
+
+explain (costs off)
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+
+reset max_sort_batches;
+
+--test rescan for batch hash agg
+SET max_hashagg_batches = 512;
+
+explain (costs off)
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+
+reset max_hashagg_batches;
+
 reset enable_material;
 
 reset enable_hashagg;
diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql
index ce22f34c71..171f6e41ec 100644
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -540,3 +540,31 @@ select * from
    union all
    select *, 1 as x from int8_tbl b) ss
 where (x = 0) or (q1 >= q2 and q1 <= q2);
+
+-- parallel union
+BEGIN;
+SET max_sort_batches = 6;
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+-- using batch sort
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+-- using batch hash
+SET max_sort_batches = 0;
+SET max_hashagg_batches = 512;
+EXPLAIN (costs off)
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+ABORT;
\ No newline at end of file
-- 
2.16.3

#22David Steele
david@pgmasters.net
In reply to: bucoo@sohu.com (#21)
Re: parallel distinct union and aggregate support patch

On 1/25/21 9:14 AM, bucoo@sohu.com wrote:

Now, I rewrite batch hashagg and sort, add some comment and combin too
patches. base on master 2ad78a87f018260d4474eee63187e1cc73c9b976.
They are support rescan and change GUC
enable_batch_hashagg/enable_batch_sort to
max_hashagg_batches/max_sort_batch, default value is "0"(mean is disable).
The "max_hashagg_batches" in grouping sets each chain using this value,
maybe we need a better algorithm.
Do not set "max_sort_batch" too large, because each tuplesort's work
memory is "work_mem/max_sort_batch".

Next step I want use batch sort add parallel merge join(thinks Dilip
Kumar) and except/intersect support after this patch commit, welcome to
discuss.

This patch has not gotten any review in the last two CFs and is unlikely
to be committed for PG14 so I have moved it to the 2021-07 CF. A rebase
is also required so marked Waiting for Author.

I can see this is a work in progress, but you may want to consider the
several suggestions that an unbuffered approach might be better.

Regards,
--
-David
david@pgmasters.net

#23bucoo@sohu.com
bucoo@sohu.com
In reply to: bucoo@sohu.com (#1)
Re: Re: parallel distinct union and aggregate support patch

This patch has not gotten any review in the last two CFs and is unlikely
to be committed for PG14 so I have moved it to the 2021-07 CF. A rebase
is also required so marked Waiting for Author.

I can see this is a work in progress, but you may want to consider the
several suggestions that an unbuffered approach might be better.

I have written a plan with similar functions, It is known that the following two situations do not work well.
1. Under "Parallel Append" plan
Gather
-> Parallel Append
-> Agg
-> Parallel Redistribute(1)
-> ...
-> Agg
-> Parallel Redistribute(2)
-> ...
when parallel worker 1 execute "Parallel Redistribute(1)" and worker execute "Parallel Redistribute(2)",
both "Parallel Redistribute" plan can not send tuples to other worker(both worker are stuck),
because outher worker's memory buffer run out soon.

2. Under "Nestloop" plan
Gather
-> Nestloop(1)
-> Nestloop(2)
-> Parallel Redistribute
-> ...
-> IndexScan
-> Agg
At some point might be the case: parallel worker 1 executing Agg and "Parallel Redistribute" plan's memory buffer is full,
worker 2 executing "Parallel Redistribute" and it waiting worker 1 eat "Parallel Redistribute" plan's memory buffer,
it's stuck.

bucoo@sohu.com

#24David Rowley
dgrowleyml@gmail.com
In reply to: bucoo@sohu.com (#23)
Re: Re: parallel distinct union and aggregate support patch

On Tue, 30 Mar 2021 at 22:33, bucoo@sohu.com <bucoo@sohu.com> wrote:

I have written a plan with similar functions, It is known that the following two situations do not work well.

I read through this thread and also wondered about a Parallel
Partition type operator. It also seems to me that if it could be done
this way then you could just plug in existing nodes to get Sorting and
Aggregation rather than having to modify existing nodes to get them to
do what you need.

From what I've seen looking over the thread, a few people suggested
this and I didn't see anywhere where you responded to them about the
idea. Just so you're aware, contributing to PostgreSQL is not a case
of throwing code at a wall and seeing which parts stick. You need to
interact and respond to people reviewing your work. This is especially
true for the people who actually have the authority to merge any of
your work with the main code repo.

It seems to me you might be getting off to a bad start and you might
not be aware of this process. So I hope this email will help put you
on track.

Some of the people that you've not properly responded to include:

Thomas Munro: PostgreSQL committer. Wrote Parallel Hash Join.
Robert Hass: PostgreSQL committer. Wrote much of the original parallel
query code
Heikki Linnakangas: PostgreSQL committer. Worked on many parts of the
planner and executor. Also works for the company that develops
Greenplum, a massively parallel processing RDBMS, based on Postgres.

You might find other information in [1]https://www.postgresql.org/community/contributors/.

If I wanted to do what you want to do, I think those 3 people might be
some of the last people I'd pick to ignore questions from! :-)

Also, I'd say also copying in Tom Lane randomly when he's not shown
any interest in the patch here is likely not a good way of making
forward progress. You might find that it might bubble up on his radar
if you start constructively interacting with the people who have
questioned your design. I'd say that should be your next step.

The probability of anyone merging any of your code without properly
discussing the design with the appropriate people are either very
close to zero or actually zero.

I hope this email helps you get on track.

David

[1]: https://www.postgresql.org/community/contributors/

#25bucoo@sohu.com
bucoo@sohu.com
In reply to: bucoo@sohu.com (#1)
Re: Re: parallel distinct union and aggregate support patch

Sorry, this email was marked spam by sohu, so I didn't notice it, and last few months I work hard for merge PostgreSQL 14 to our cluster version(github.com/ADBSQL/AntDB).

I have an idea how to make "Parallel Redistribute" work, even under "Parallel Append" and "Nestloop". but "grouping sets" can not work in parallel mode using "Parallel Redistribute".
Wait days please, path coming soon.

From: David Rowley
Date: 2021-07-06 10:48
To: bucoo@sohu.com
CC: David Steele; pgsql-hackers; tgl; Dilip Kumar; Thomas Munro; Tomas Vondra; hlinnaka; robertmhaas; pgsql
Subject: Re: Re: parallel distinct union and aggregate support patch
On Tue, 30 Mar 2021 at 22:33, bucoo@sohu.com <bucoo@sohu.com> wrote:

I have written a plan with similar functions, It is known that the following two situations do not work well.

I read through this thread and also wondered about a Parallel
Partition type operator. It also seems to me that if it could be done
this way then you could just plug in existing nodes to get Sorting and
Aggregation rather than having to modify existing nodes to get them to
do what you need.

From what I've seen looking over the thread, a few people suggested
this and I didn't see anywhere where you responded to them about the
idea. Just so you're aware, contributing to PostgreSQL is not a case
of throwing code at a wall and seeing which parts stick. You need to
interact and respond to people reviewing your work. This is especially
true for the people who actually have the authority to merge any of
your work with the main code repo.

It seems to me you might be getting off to a bad start and you might
not be aware of this process. So I hope this email will help put you
on track.

Some of the people that you've not properly responded to include:

Thomas Munro: PostgreSQL committer. Wrote Parallel Hash Join.
Robert Hass: PostgreSQL committer. Wrote much of the original parallel
query code
Heikki Linnakangas: PostgreSQL committer. Worked on many parts of the
planner and executor. Also works for the company that develops
Greenplum, a massively parallel processing RDBMS, based on Postgres.

You might find other information in [1]https://www.postgresql.org/community/contributors/.

If I wanted to do what you want to do, I think those 3 people might be
some of the last people I'd pick to ignore questions from! :-)

Also, I'd say also copying in Tom Lane randomly when he's not shown
any interest in the patch here is likely not a good way of making
forward progress. You might find that it might bubble up on his radar
if you start constructively interacting with the people who have
questioned your design. I'd say that should be your next step.

The probability of anyone merging any of your code without properly
discussing the design with the appropriate people are either very
close to zero or actually zero.

I hope this email helps you get on track.

David

[1]: https://www.postgresql.org/community/contributors/

#26bucoo@sohu.com
bucoo@sohu.com
In reply to: bucoo@sohu.com (#1)
2 attachment(s)
Re: Re: parallel distinct union and aggregate support patch

That are busy days, sorry patchs too later.
Here is an unbuffered plan Redistribute for parallel aggregate/distinct/union,
like this(when new GUC param redistribute_query_size large then 0):
Gather
-> Finalize HashAggregate
-> Parallel Redistribute
-> Partial HashAggregate
-> Parallel Seq Scan on test
0001-xxx.patch:
Fix cost_subqueryscan() get wrong parallel cost, it always same as none parallel path.
If not apply this patch parallel union always can't be choose.

How Redistribute work:
Each have N*MQ + 1*SharedTuplestore, N is parallel workers number(include leader).
1. Alloc shared memory for Redistribute(using plan parallel worker number).
2. Leader worker after all parallel workers launched change "final_worker_num" to launched workers number.
3. Each worker try to get a unique part number. part number count is "final_worker_num".
4. If get a invalid part number return null tuple.
5. Try read tuple from MQ, if get a tuple then return it, else goto next step.
6-0. Get tuple from outer, if get a tuple compute mod as "hash value % final_worker_num", else goto step 7.
6-1. If mod equal our part number then return this tuple.
6-2. Use mod get part's MQ and try write tuple to the MQ, if write success got step 6-0.
6-3. Write tuple to part's SharedTuplestore.
7. Read tuple from MQ, if get a tuple then return it, else close all opend MQ and goto next step.
8. Read tuple from SharedTuplestore, if get a tuple then return it, else close it and goto next step.
9. Try get next unique part number, if get an invalid part number then return null tuple, else goto step 7.

In step "6-2" we can't use shm_mq_send() function, because it maybe write partial data,
if this happend we must write remaining data to this MQ, so we must wait other worker read some date from this MQ.
However, we do't want to wait(this may cause all worker to wait for each other).
So, I write a new function named shm_mq_send_once(). It like shm_mq_send, but return would block immediately when
no space for write data and "do not write any data" to MQ.
This will cause a problem, when MQ ring size small then tuple size, it never write to MQ(write to SharedTuplestore).
So it's best to make sure that MQ has enough space for tuple(change GUC param "redistribute_query_size").

Execute comparison
prepare data:
begin;
create table gtest(id integer, txt text);
insert into gtest select t1.id,'txt'||t1.id from (select generate_series(1,10*1000*1000) id) t1,(select generate_series(1,10) id) t2;
analyze gtest;
commit;
set max_parallel_workers_per_gather=8;
set work_mem = '256MB';

hash aggregate
explain (verbose,analyze,costs off)
select sum(id),txt from gtest group by txt;
QUERY PLAN
---------------------------------------------------------------------------------------------------------
Finalize HashAggregate (actual time=11733.519..19075.309 rows=10000000 loops=1)
Output: sum(id), txt
Group Key: gtest.txt
Batches: 21 Memory Usage: 262201kB Disk Usage: 359808kB
-> Gather (actual time=5540.052..8029.550 rows=10000056 loops=1)
Output: txt, (PARTIAL sum(id))
Workers Planned: 6
Workers Launched: 6
-> Partial HashAggregate (actual time=5534.690..5914.643 rows=1428579 loops=7)
Output: txt, PARTIAL sum(id)
Group Key: gtest.txt
Batches: 1 Memory Usage: 188433kB
Worker 0: actual time=5533.956..5913.461 rows=1443740 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 1: actual time=5533.552..5913.595 rows=1400439 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 2: actual time=5533.553..5913.357 rows=1451759 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 3: actual time=5533.834..5907.952 rows=1379830 loops=1
Batches: 1 Memory Usage: 180241kB
Worker 4: actual time=5533.782..5912.408 rows=1428060 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 5: actual time=5534.271..5910.458 rows=1426987 loops=1
Batches: 1 Memory Usage: 188433kB
-> Parallel Seq Scan on public.gtest (actual time=0.022..1523.231 rows=14285714 loops=7)
Output: id, txt
Worker 0: actual time=0.032..1487.403 rows=14437315 loops=1
Worker 1: actual time=0.016..1635.675 rows=14004315 loops=1
Worker 2: actual time=0.015..1482.005 rows=14517505 loops=1
Worker 3: actual time=0.017..1664.469 rows=13798225 loops=1
Worker 4: actual time=0.018..1471.233 rows=14280520 loops=1
Worker 5: actual time=0.030..1463.973 rows=14269790 loops=1
Planning Time: 0.075 ms
Execution Time: 19575.976 ms

parallel hash aggregate
set redistribute_query_size = '256kB';
explain (verbose,analyze,costs off)
select sum(id),txt from gtest group by txt;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------
Gather (actual time=9710.061..11372.560 rows=10000000 loops=1)
Output: (sum(id)), txt
Workers Planned: 6
Workers Launched: 6
-> Finalize HashAggregate (actual time=9703.098..10082.575 rows=1428571 loops=7)
Output: sum(id), txt
Group Key: gtest.txt
Batches: 1 Memory Usage: 188433kB
Worker 0: actual time=9701.365..10077.995 rows=1428857 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 1: actual time=9701.415..10095.876 rows=1430065 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 2: actual time=9701.315..10077.635 rows=1425811 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 3: actual time=9703.047..10088.985 rows=1427745 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 4: actual time=9703.166..10077.937 rows=1431644 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 5: actual time=9701.809..10076.922 rows=1426156 loops=1
Batches: 1 Memory Usage: 188433kB
-> Parallel Redistribute (actual time=5593.440..9036.392 rows=1428579 loops=7)
Output: txt, (PARTIAL sum(id))
Hash Key: gtest.txt
Parts: 1 Disk Usage: 0kB Disk Rows: 0
Worker 0: actual time=5591.812..9036.394 rows=1428865 loops=1
Parts: 1 Disk Usage: 0kB Disk Rows: 0
Worker 1: actual time=5591.773..9002.576 rows=1430072 loops=1
Parts: 1 Disk Usage: 0kB Disk Rows: 0
Worker 2: actual time=5591.774..9039.341 rows=1425817 loops=1
Parts: 1 Disk Usage: 0kB Disk Rows: 0
Worker 3: actual time=5593.635..9040.148 rows=1427753 loops=1
Parts: 1 Disk Usage: 0kB Disk Rows: 0
Worker 4: actual time=5593.565..9044.528 rows=1431652 loops=1
Parts: 1 Disk Usage: 0kB Disk Rows: 0
Worker 5: actual time=5592.220..9043.953 rows=1426167 loops=1
Parts: 1 Disk Usage: 0kB Disk Rows: 0
-> Partial HashAggregate (actual time=5566.237..5990.671 rows=1428579 loops=7)
Output: txt, PARTIAL sum(id)
Group Key: gtest.txt
Batches: 1 Memory Usage: 188433kB
Worker 0: actual time=5565.941..5997.635 rows=1449687 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 1: actual time=5565.930..6073.977 rows=1400013 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 2: actual time=5565.945..5975.454 rows=1446727 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 3: actual time=5567.673..5981.978 rows=1396379 loops=1
Batches: 1 Memory Usage: 180241kB
Worker 4: actual time=5567.622..5972.500 rows=1415832 loops=1
Batches: 1 Memory Usage: 188433kB
Worker 5: actual time=5566.148..5962.503 rows=1415665 loops=1
Batches: 1 Memory Usage: 188433kB
-> Parallel Seq Scan on public.gtest (actual time=0.022..1520.647 rows=14285714 loops=7)
Output: id, txt
Worker 0: actual time=0.021..1476.653 rows=14496785 loops=1
Worker 1: actual time=0.020..1519.023 rows=14000060 loops=1
Worker 2: actual time=0.020..1476.707 rows=14467185 loops=1
Worker 3: actual time=0.019..1654.088 rows=13963715 loops=1
Worker 4: actual time=0.027..1527.803 rows=14158235 loops=1
Worker 5: actual time=0.030..1514.247 rows=14156570 loops=1
Planning Time: 0.080 ms
Execution Time: 11830.773 ms

Attachments:

0001-fix-cost_subqueryscan-get-worning-parallel-cost.patchapplication/octet-stream; name=0001-fix-cost_subqueryscan-get-worning-parallel-cost.patchDownload
From a8f3ca34095dae3645c17a4c3191ad478e3c126f Mon Sep 17 00:00:00 2001
From: bucoo <bucoo@sohu.com>
Date: Wed, 15 Sep 2021 14:20:22 +0800
Subject: [PATCH 1/2] fix cost_subqueryscan get worning parallel cost

---
 src/backend/optimizer/path/costsize.c         | 22 +++++++++++++++++++
 .../regress/expected/incremental_sort.out     | 10 ++++-----
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 1fd53b40bb..1096a236d5 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1426,6 +1426,28 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
 	startup_cost += path->path.pathtarget->cost.startup;
 	run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
 
+	/* Adjust costing for parallelism, if used. */
+	if (path->path.parallel_workers > 0)
+	{
+		double		parallel_divisor = get_parallel_divisor(&path->path);
+
+		/* The CPU cost is divided among all the workers. */
+		run_cost /= parallel_divisor;
+
+		/*
+		 * It may be possible to amortize some of the I/O cost, but probably
+		 * not very much, because most operating systems already do aggressive
+		 * prefetching.  For now, we assume that the disk run cost can't be
+		 * amortized at all.
+		 */
+
+		/*
+		 * In the case of a parallel plan, the row count needs to represent
+		 * the number of tuples processed per worker.
+		 */
+		path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);
+	}
+
 	path->path.startup_cost += startup_cost;
 	path->path.total_cost += startup_cost + run_cost;
 }
diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out
index 545e301e48..b10ecb8cb8 100644
--- a/src/test/regress/expected/incremental_sort.out
+++ b/src/test/regress/expected/incremental_sort.out
@@ -1487,14 +1487,12 @@ explain (costs off) select * from t union select * from t order by 1,3;
    ->  Unique
          ->  Sort
                Sort Key: t.a, t.b, t.c
-               ->  Append
-                     ->  Gather
-                           Workers Planned: 2
+               ->  Gather
+                     Workers Planned: 2
+                     ->  Parallel Append
                            ->  Parallel Seq Scan on t
-                     ->  Gather
-                           Workers Planned: 2
                            ->  Parallel Seq Scan on t t_1
-(13 rows)
+(11 rows)
 
 -- Full sort, not just incremental sort can be pushed below a gather merge path
 -- by generate_useful_gather_paths.
-- 
2.25.1

0002-parallel-redistribute-plan.patchapplication/octet-stream; name=0002-parallel-redistribute-plan.patchDownload
From 999cec40f6725ccb7a5ceea1700d25f6e3d48b80 Mon Sep 17 00:00:00 2001
From: bucoo <bucoo@sohu.com>
Date: Wed, 15 Sep 2021 15:13:34 +0800
Subject: [PATCH 2/2] parallel union/aggregate/distinct support using
 redistribute plan

---
 src/backend/commands/explain.c                |  59 ++
 src/backend/executor/Makefile                 |   1 +
 src/backend/executor/execParallel.c           |  43 +
 src/backend/executor/execProcnode.c           |  10 +
 src/backend/executor/nodeGather.c             |   1 +
 src/backend/executor/nodeGatherMerge.c        |   1 +
 src/backend/executor/nodeRedistribute.c       | 941 ++++++++++++++++++
 src/backend/nodes/copyfuncs.c                 |  25 +
 src/backend/nodes/outfuncs.c                  |  28 +
 src/backend/nodes/readfuncs.c                 |  18 +
 src/backend/optimizer/path/costsize.c         |  15 +
 src/backend/optimizer/plan/createplan.c       |  31 +-
 src/backend/optimizer/plan/planner.c          | 260 +++++
 src/backend/optimizer/plan/setrefs.c          |   1 +
 src/backend/optimizer/plan/subselect.c        |   1 +
 src/backend/optimizer/prep/prepunion.c        |  79 +-
 src/backend/optimizer/util/pathnode.c         |  26 +
 src/backend/optimizer/util/tlist.c            |  21 +
 src/backend/storage/ipc/shm_mq.c              | 100 ++
 src/backend/utils/activity/wait_event.c       |   6 +
 src/backend/utils/misc/guc.c                  |  12 +
 src/backend/utils/sort/sharedtuplestore.c     |  53 +-
 src/include/executor/execParallel.h           |   1 +
 src/include/executor/nodeRedistribute.h       |  21 +
 src/include/nodes/execnodes.h                 |  39 +
 src/include/nodes/nodes.h                     |   3 +
 src/include/nodes/pathnodes.h                 |  10 +
 src/include/nodes/plannodes.h                 |  14 +
 src/include/optimizer/cost.h                  |   2 +
 src/include/optimizer/pathnode.h              |   4 +
 src/include/optimizer/tlist.h                 |   1 +
 src/include/storage/shm_mq.h                  |   4 +
 src/include/utils/sharedtuplestore.h          |   7 +
 src/include/utils/wait_event.h                |   4 +-
 .../test_shm_mq/expected/test_shm_mq.out      |   6 +
 .../modules/test_shm_mq/sql/test_shm_mq.sql   |   1 +
 src/test/modules/test_shm_mq/test.c           |   6 +-
 .../modules/test_shm_mq/test_shm_mq--1.0.sql  |   3 +-
 .../regress/expected/partition_aggregate.out  |  75 ++
 src/test/regress/expected/select_distinct.out |  46 +
 src/test/regress/expected/select_parallel.out |  48 +
 src/test/regress/expected/union.out           |  95 ++
 src/test/regress/sql/partition_aggregate.sql  |  26 +
 src/test/regress/sql/select_distinct.sql      |  15 +
 src/test/regress/sql/select_parallel.sql      |  19 +
 src/test/regress/sql/union.sql                |  32 +
 46 files changed, 2187 insertions(+), 27 deletions(-)
 create mode 100644 src/backend/executor/nodeRedistribute.c
 create mode 100644 src/include/executor/nodeRedistribute.h

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 10644dfac4..68c1468138 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -117,6 +117,7 @@ static void show_tidbitmap_info(BitmapHeapScanState *planstate,
 static void show_instrumentation_count(const char *qlabel, int which,
 									   PlanState *planstate, ExplainState *es);
 static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
+static void show_redistribute_info(RedistributeState *node, ExplainState *es);
 static void show_eval_params(Bitmapset *bms_params, ExplainState *es);
 static const char *explain_get_index_name(Oid indexId);
 static void show_buffer_usage(ExplainState *es, const BufferUsage *usage,
@@ -1388,6 +1389,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_Hash:
 			pname = sname = "Hash";
 			break;
+		case T_Redistribute:
+			pname = sname = "Redistribute";
+			break;
 		default:
 			pname = sname = "???";
 			break;
@@ -2019,6 +2023,14 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			show_memoize_info(castNode(MemoizeState, planstate), ancestors,
 							  es);
 			break;
+		case T_Redistribute:
+			show_sort_group_keys(planstate, "Hash Key",
+								 ((Redistribute*)plan)->numCols, 0,
+								 ((Redistribute*)plan)->hashColIdx,
+								 NULL, NULL, NULL,
+								 ancestors, es);
+			show_redistribute_info(castNode(RedistributeState, planstate), es);
+			break;
 		default:
 			break;
 	}
@@ -3429,6 +3441,53 @@ show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es)
 	}
 }
 
+/*
+ * Show extra information for a Redistribute node's one worker.
+ */
+static void
+show_redistribute_one_info(RedistributeInstrumentation *instrument, ExplainState *es)
+{
+	if (es->format == EXPLAIN_FORMAT_TEXT)
+	{
+		ExplainIndentText(es);
+		appendStringInfo(es->str, "Parts: %d", instrument->parts_got);
+		appendStringInfo(es->str, "  Disk Usage: " UINT64_FORMAT "kB", instrument->disk_used / 1024);
+		appendStringInfo(es->str, "  Disk Rows: " UINT64_FORMAT, instrument->disk_rows);
+		appendStringInfoChar(es->str, '\n');
+	}else
+	{
+		ExplainPropertyInteger("Parts", NULL, instrument->parts_got, es);
+		ExplainPropertyInteger("Disk Usage", "kB", instrument->disk_used / 1024, es);
+		ExplainPropertyInteger("Disk Rows", NULL, instrument->disk_rows, es);
+	}
+}
+
+/*
+ * Show extra information for a Redistribute node.
+ */
+static void
+show_redistribute_info(RedistributeState *node, ExplainState *es)
+{
+	SharedRedistributeInfo *info = node->shared_instrument;
+	int						i;
+	if (info == NULL)
+		return;
+
+	if (info->sinstrument[0].parts_got != 0)
+		show_redistribute_one_info(&info->sinstrument[0], es);
+
+	for (i=1;i<info->num_workers;++i)
+	{
+		if (info->sinstrument[i].parts_got == 0)
+			continue;
+		if (es->workers_state)
+			ExplainOpenWorker(i-1, es);
+		show_redistribute_one_info(&info->sinstrument[i], es);
+		if (es->workers_state)
+			ExplainCloseWorker(i-1, es);
+	}
+}
+
 /*
  * Show initplan params evaluated at Gather or Gather Merge node.
  */
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile
index 11118d0ce0..4adea66848 100644
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@@ -61,6 +61,7 @@ OBJS = \
 	nodeNestloop.o \
 	nodeProjectSet.o \
 	nodeRecursiveunion.o \
+	nodeRedistribute.o \
 	nodeResult.o \
 	nodeSamplescan.o \
 	nodeSeqscan.o \
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index f8a4a40e7b..40248669bf 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -36,6 +36,7 @@
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeMemoize.h"
+#include "executor/nodeRedistribute.h"
 #include "executor/nodeSeqscan.h"
 #include "executor/nodeSort.h"
 #include "executor/nodeSubplan.h"
@@ -297,6 +298,9 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecMemoizeEstimate((MemoizeState *) planstate, e->pcxt);
 			break;
+		case T_RedistributeState:
+			ExecRedistributeEstimate((RedistributeState *) planstate, e->pcxt);
+			break;
 		default:
 			break;
 	}
@@ -521,6 +525,9 @@ ExecParallelInitializeDSM(PlanState *planstate,
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecMemoizeInitializeDSM((MemoizeState *) planstate, d->pcxt);
 			break;
+		case T_RedistributeState:
+			ExecRedistributeInitializeDSM((RedistributeState *) planstate, d->pcxt);
+			break;;
 		default:
 			break;
 	}
@@ -994,6 +1001,10 @@ ExecParallelReInitializeDSM(PlanState *planstate,
 				ExecHashJoinReInitializeDSM((HashJoinState *) planstate,
 											pcxt);
 			break;
+		case T_RedistributeState:
+			ExecRedistributeReInitializeDSM((RedistributeState *) planstate,
+											pcxt);
+			break;
 		case T_HashState:
 		case T_SortState:
 		case T_IncrementalSortState:
@@ -1008,6 +1019,32 @@ ExecParallelReInitializeDSM(PlanState *planstate,
 	return planstate_tree_walker(planstate, ExecParallelReInitializeDSM, pcxt);
 }
 
+static bool
+ExecParallelLaunchedWalker(PlanState *planstate, ParallelContext *pcxt)
+{
+	if (planstate == NULL)
+		return false;
+
+	switch (nodeTag(planstate))
+	{
+		case T_RedistributeState:
+			ExecRedistributeParallelLaunched((RedistributeState *)planstate,
+											 pcxt);
+			break;
+
+		default:
+			break;
+	}
+
+	return planstate_tree_walker(planstate, ExecParallelLaunchedWalker, pcxt);
+}
+
+void
+ExecParallelLaunched(PlanState *planstate, ParallelContext *pcxt)
+{
+	(void)planstate_tree_walker(planstate, ExecParallelLaunchedWalker, pcxt);
+}
+
 /*
  * Copy instrumentation information about this node and its descendants from
  * dynamic shared memory.
@@ -1070,6 +1107,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 		case T_MemoizeState:
 			ExecMemoizeRetrieveInstrumentation((MemoizeState *) planstate);
 			break;
+		case T_RedistributeState:
+			ExecRedistributeRetrieveInstrumentation((RedistributeState *) planstate);
+			break;
 		default:
 			break;
 	}
@@ -1366,6 +1406,9 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecMemoizeInitializeWorker((MemoizeState *) planstate, pwcxt);
 			break;
+		case T_RedistributeState:
+			ExecRedistributeInitializeWorker((RedistributeState *) planstate, pwcxt);
+			break;
 		default:
 			break;
 	}
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 1752b9bfd8..22ba0e4290 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -102,6 +102,7 @@
 #include "executor/nodeNestloop.h"
 #include "executor/nodeProjectSet.h"
 #include "executor/nodeRecursiveunion.h"
+#include "executor/nodeRedistribute.h"
 #include "executor/nodeResult.h"
 #include "executor/nodeSamplescan.h"
 #include "executor/nodeSeqscan.h"
@@ -381,6 +382,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 												 estate, eflags);
 			break;
 
+		case T_Redistribute:
+			result = (PlanState *) ExecInitRedistribute((Redistribute *)node,
+														estate, eflags);
+			break;
+
 		default:
 			elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
 			result = NULL;		/* keep compiler quiet */
@@ -756,6 +762,10 @@ ExecEndNode(PlanState *node)
 			ExecEndLimit((LimitState *) node);
 			break;
 
+		case T_RedistributeState:
+			ExecEndRedistribute((RedistributeState *) node);
+			break;
+
 		default:
 			elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
 			break;
diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c
index 734142b7b1..ed5f0068d8 100644
--- a/src/backend/executor/nodeGather.c
+++ b/src/backend/executor/nodeGather.c
@@ -184,6 +184,7 @@ ExecGather(PlanState *pstate)
 			 */
 			pcxt = node->pei->pcxt;
 			LaunchParallelWorkers(pcxt);
+			ExecParallelLaunched(outerPlanState(node), pcxt);
 			/* We save # workers launched for the benefit of EXPLAIN */
 			node->nworkers_launched = pcxt->nworkers_launched;
 
diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c
index 03f02a19aa..10f1341347 100644
--- a/src/backend/executor/nodeGatherMerge.c
+++ b/src/backend/executor/nodeGatherMerge.c
@@ -225,6 +225,7 @@ ExecGatherMerge(PlanState *pstate)
 			/* Try to launch workers. */
 			pcxt = node->pei->pcxt;
 			LaunchParallelWorkers(pcxt);
+			ExecParallelLaunched(outerPlanState(node), pcxt);
 			/* We save # workers launched for the benefit of EXPLAIN */
 			node->nworkers_launched = pcxt->nworkers_launched;
 
diff --git a/src/backend/executor/nodeRedistribute.c b/src/backend/executor/nodeRedistribute.c
new file mode 100644
index 0000000000..9ca199d902
--- /dev/null
+++ b/src/backend/executor/nodeRedistribute.c
@@ -0,0 +1,941 @@
+
+#include "postgres.h"
+
+#include "common/hashfn.h"
+#include "executor/executor.h"
+#include "executor/nodeRedistribute.h"
+#include "miscadmin.h"
+#include "storage/barrier.h"
+#include "storage/shm_mq.h"
+#include "utils/sharedtuplestore.h"
+#include "utils/typcache.h"
+#include "utils/wait_event.h"
+
+#define INVALID_PART_NUMBER			0xffffffff
+
+/*
+ * local scan status
+ */
+#define	STATUS_LOCALLY_END			0x01	/* outer returned NULL slot */
+#define STATUS_QUERY_STS_END		0x02	/* all tuples on STS scanned */
+#define STATUS_READY_STS_READ		0x04	/* STS ready to scan */
+
+/*
+ * all workers working status
+ */
+#define PRW_WAIT_WORKER_NUMBER		0	/* waiting leader write worker number to shared memory */
+#define PRW_WRITING					1	/* writing tuple(s) to STS and MQ */
+#define PRW_END_STS					2	/* all attached worker called sts_end_write(ready for read) */
+
+#if 1
+#define REDISTRIBUTE_DEBUG(...)	ereport(LOG, errbacktrace(), errmsg(__VA_ARGS__))
+#else
+#define REDISTRIBUTE_DEBUG(...)	((void)0)
+#endif
+
+#define SharedRedistributeOffset(shmem_, offset_)	\
+	(AssertMacro((shmem_)->offset_ >= offsetof(RedistributeShmem, padding)), \
+	 ((char*)(shmem_)) + (shmem_)->offset_)
+
+typedef struct RedistributeWriteInfo
+{
+	SharedTuplestoreAccessor   *sta;
+	shm_mq_handle			   *mqwrite;
+}RedistributeWriteInfo;
+
+typedef struct RedistributeShmem
+{
+	Barrier				part_barrier;				/* see PRW_XXX */
+	pg_atomic_uint32	final_worker_num;			/* how many workers final start successed */
+	pg_atomic_uint32	next_idle_part;				/* next idle part index */
+	uint32				plan_worker_num;			/* how many workers plan to start */
+	Size				sts_shmem_offset;			/* all SharedTuplestore offset (start with this struct) */
+	Size				sts_shmem_size;				/* each SharedTuplestore size in bytes */
+	Size				mq_shmem_offset;			/* all shm_mq offset (start with this struct) */
+	Size				mq_shmem_size;				/* each shm_mq size in bytes */
+	Size				mq_shmem_part_size;			/* equal plan_worker_num * mq_shmem_size */
+	Size				instrument_shmem_offset;	/* RedistributeInstrumentations offset (start with this struct), 0 for not alloc */
+	SharedFileSet		fileset;					/* using by SharedTuplestore */
+
+	char				padding[FLEXIBLE_ARRAY_MEMBER];
+}RedistributeShmem;
+
+typedef struct RedistributeHashInfo
+{
+	FmgrInfo	flinfo;
+	union
+	{
+		FunctionCallInfoBaseData	fcinfo;
+		char	fcinfo_data[SizeForFunctionCallInfo(1)];
+	};
+	AttrNumber	attindex;
+}RedistributeHashInfo;
+
+int	redistribute_query_size = 0;	/* GUC variable */
+
+#define WaitMyLatch(wait_event_info)	\
+	WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, wait_event_info)
+
+/*
+ * This function should not be call.
+ * When resource not ready, just report an error,
+ * else some function will call ExecSetExecProcNode using different execute function
+ */
+static TupleTableSlot *
+ExecRedistributeError(PlanState *pstate)
+{
+	ereport(ERROR,
+			errcode(ERRCODE_INTERNAL_ERROR),
+			errmsg("DSM not initialized for Redistribute plan"));
+
+	return NULL;	/* keep compiler quiet */
+}
+
+static TupleTableSlot *
+ExecCopyOuterSlot(TupleTableSlot *result, TupleTableSlot *outer)
+{
+	int		natts = result->tts_tupleDescriptor->natts;
+	Assert(natts == outer->tts_tupleDescriptor->natts);
+
+	if (result->tts_ops == outer->tts_ops)
+		return outer;
+
+	/* copy outer slot to result slot and return */
+	ExecClearTuple(result);
+	slot_getsomeattrs(outer, natts);
+	memcpy(result->tts_values, outer->tts_values, sizeof(outer->tts_values[0]) * natts);
+	memcpy(result->tts_isnull, outer->tts_isnull, sizeof(outer->tts_isnull[0]) * natts);
+	return ExecStoreVirtualTuple(result);
+}
+
+/*
+ * ExecRedistributeDirect
+ * this function for no any one parallel work start successed
+ */
+static TupleTableSlot *
+ExecRedistributeDirect(PlanState *pstate)
+{
+	TupleTableSlot *outer_slot = ExecProcNode(outerPlanState(pstate));
+	if (unlikely(TupIsNull(outer_slot)))
+		return ExecClearTuple(pstate->ps_ResultTupleSlot);
+
+	return ExecCopyOuterSlot(pstate->ps_ResultTupleSlot, outer_slot);
+}
+
+static shm_mq*
+GetRedistributeSharedMemoryQueue(RedistributeShmem *shmem, uint32 part, uint32 worknum)
+{
+	/* get shm_mq start address */
+	char *addr = SharedRedistributeOffset(shmem, mq_shmem_offset);
+
+	/* get part start address */
+	addr += shmem->mq_shmem_part_size * part;
+
+	/* get offset of part */
+	addr += shmem->mq_shmem_size * worknum;
+
+	return (shm_mq*)addr;
+}
+
+static void
+ExecInitRedistributeWriter(RedistributeState *node, uint32 my_work_num)
+{
+	MemoryContext		oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(node));
+	RedistributeShmem  *shmem = node->shmem;
+	uint32				nworkers = pg_atomic_read_u32(&shmem->final_worker_num);
+	uint32				mq_offset = shmem->sts_shmem_size + shmem->mq_shmem_part_size;
+	uint32				i;
+
+	for (i=0;i<nworkers;++i)
+	{
+		RedistributeWriteInfo  *writer;
+		char				   *addr;
+		shm_mq				   *mq;
+		if (unlikely(i == my_work_num))
+			continue;
+
+		addr = SharedRedistributeOffset(shmem, sts_shmem_offset);
+		addr += shmem->sts_shmem_size * i;
+		writer = &node->writer[i];
+		writer->sta = sts_attach((SharedTuplestore*)addr, my_work_num, &shmem->fileset);
+
+		REDISTRIBUTE_DEBUG("writer sts_attach(%p)=%p mq_attach %p part %u index %u",
+						   addr, writer->sta, addr + mq_offset, my_work_num, i);
+
+		mq = GetRedistributeSharedMemoryQueue(shmem, i, my_work_num);
+		shm_mq_set_sender(mq, MyProc);
+		writer->mqwrite = shm_mq_attach(mq, NULL, NULL);
+	}
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+static void
+ExecInitRedistributeReader(RedistributeState *node)
+{
+	MemoryContext		oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(node));
+	RedistributeShmem  *shmem = node->shmem;
+	char			   *start;
+	uint32				i;
+	uint32				nworkers = pg_atomic_read_u32(&shmem->final_worker_num);
+	Assert(node->current_part < nworkers);
+
+	/* attach SharedTuplestore */
+	start = SharedRedistributeOffset(shmem, sts_shmem_offset);
+	start += node->current_part * shmem->sts_shmem_size;
+	node->sta = sts_attach((SharedTuplestore*)start,
+						   node->current_part,
+						   &shmem->fileset);
+	REDISTRIBUTE_DEBUG("reader sts_attach(%p)=%p part %u", start, node->sta, node->current_part);
+
+	/* attach shm_mq */
+	node->nextreader = 0;
+	node->nreaders = 0;
+	for (i=0;i<nworkers;++i)
+	{
+		shm_mq *mq;
+		if (unlikely(i == node->current_part))
+			continue;
+
+		mq = GetRedistributeSharedMemoryQueue(shmem, node->current_part, i);
+		REDISTRIBUTE_DEBUG("reader mq_attach %p part %u", mq, node->current_part);
+		shm_mq_set_receiver(mq, MyProc);
+		node->mqreader[node->nreaders] = shm_mq_attach(mq, NULL, NULL);
+		++(node->nreaders);
+	}
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+static bool
+ExecNextRedistributePart(RedistributeState *node)
+{
+	RedistributeShmem  *shmem = node->shmem;
+	uint32				count_part = pg_atomic_read_u32(&shmem->final_worker_num);
+	uint32				idle_part = pg_atomic_read_u32(&shmem->next_idle_part);
+
+	while (idle_part < count_part)
+	{
+		if (pg_atomic_compare_exchange_u32(&shmem->next_idle_part,
+										   &idle_part,
+										   idle_part + 1))
+		{
+			node->current_part = idle_part;
+			if (node->instrument)
+				++(node->instrument->parts_got);
+			return true;
+		}
+	}
+
+	node->current_part = INVALID_PART_NUMBER;
+	return false;
+}
+
+/*
+ * initialize redistribute hash function info list
+ * and alloc reader and writer array
+ */
+static void
+InitializeRedistributeExecute(RedistributeState *node)
+{
+	AttrNumber				maxattr;
+	int						i;
+	TypeCacheEntry		   *typeCache;
+	RedistributeHashInfo   *info;
+	Form_pg_attribute		attr;
+	TupleDesc				desc = ExecGetResultType(outerPlanState(node));
+	List				   *list = NIL;
+	Redistribute		   *plan = castNode(Redistribute, node->ps.plan);
+	RedistributeShmem	   *shmem = node->shmem;
+	MemoryContext			oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(node));
+
+	/* initialize hash functions */
+	if (node->hash_funcs == NIL)
+	{
+		maxattr = 0;
+		for (i=0;i<plan->numCols;++i)
+		{
+			info = palloc0(sizeof(*info));
+
+			info->attindex = plan->hashColIdx[i]-1;
+			attr = TupleDescAttr(desc, info->attindex);
+
+			typeCache = lookup_type_cache(attr->atttypid, TYPECACHE_HASH_PROC);
+
+			fmgr_info(typeCache->hash_proc, &info->flinfo);
+			InitFunctionCallInfoData(info->fcinfo, &info->flinfo, 1, attr->attcollation, NULL, NULL);
+			info->fcinfo.args[0].isnull = false;
+			list = lappend(list, info);
+
+			if (plan->hashColIdx[i] > maxattr)
+				maxattr = plan->hashColIdx[i];
+		}
+		node->hash_funcs = list;
+		node->hash_max_attr = maxattr;
+	}
+
+	/* alloc reader and writer array */
+	Assert(pg_atomic_read_u32(&shmem->final_worker_num) > 1);
+	Assert(shmem->plan_worker_num >= pg_atomic_read_u32(&shmem->final_worker_num));
+	if (node->mqreader == NULL)
+		node->mqreader = palloc0(sizeof(node->mqreader[0]) * shmem->plan_worker_num);
+	if (node->writer == NULL)
+	{
+		node->writer = palloc0(sizeof(node->writer[0]) * shmem->plan_worker_num);
+		node->nwriter = shmem->plan_worker_num;
+	}
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+static uint32
+RedistributeGetHash(RedistributeState *node, TupleTableSlot *input)
+{
+	ListCell			   *lc;
+	RedistributeHashInfo   *info;
+	Datum					datum;
+	uint32					hash = 0;
+
+	/* get all values we need */
+	slot_getsomeattrs(input, node->hash_max_attr);
+
+	foreach (lc, node->hash_funcs)
+	{
+		info = lfirst(lc);
+
+		/* skip NULL */
+		if (input->tts_isnull[info->attindex])
+			continue;
+
+		info->fcinfo.args[0].value = input->tts_values[info->attindex];
+		datum = FunctionCallInvoke(&info->fcinfo);
+		if (unlikely(info->fcinfo.isnull))
+			elog(ERROR, "hash function %u returned NULL", info->flinfo.fn_oid);
+
+		hash = hash_combine(hash, DatumGetUInt32(datum));
+	}
+
+	return hash;
+}
+
+/* like function gather_readnext */
+static MinimalTuple
+GetTupleFromMQ(RedistributeState *node)
+{
+	shm_mq_handle  *mq;
+	void		   *data;
+	Size			nbytes;
+	shm_mq_result	mq_result;
+	int				nvisited = 0;
+
+	for (;;)
+	{
+		/* Check for async events, particularly messages from workers. */
+		CHECK_FOR_INTERRUPTS();
+
+		Assert(node->nextreader < node->nreaders);
+		mq = node->mqreader[node->nextreader];
+
+		mq_result = shm_mq_receive(mq, &nbytes, &data, true);
+retest_:
+		if (unlikely(mq_result == SHM_MQ_DETACHED))
+		{
+			shm_mq_detach(mq);
+			node->mqreader[node->nextreader] = NULL;
+			--(node->nreaders);
+			if (node->nreaders == 0)
+				return NULL;
+
+			memmove(&node->mqreader[node->nextreader],
+					&node->mqreader[node->nextreader + 1],
+					sizeof(node->mqreader[0]) * (node->nreaders - node->nextreader));
+			if (node->nextreader >= node->nreaders)
+				node->nextreader = 0;
+		}
+		else if (mq_result == SHM_MQ_WOULD_BLOCK)
+		{
+			/*
+			 * When SharedTuplestore ready to scan and MQ sender not attached
+			 * then it will no longer attach, and we can assume that the queue has ended
+			 */
+			if (node->status_flags & STATUS_READY_STS_READ &&
+				shm_mq_receiver_get_sender(mq) == NULL)
+			{
+				mq_result = SHM_MQ_DETACHED;
+				goto retest_;
+			}
+
+			/*
+			 * Advance nextreader pointer in round-robin fashion.  Note that we
+			 * only reach this code if we weren't able to get a tuple from the
+			 * current worker.  We used to advance the nextreader pointer after
+			 * every tuple, but it turns out to be much more efficient to keep
+			 * reading from the same queue until that would require blocking.
+			 */
+			node->nextreader++;
+			if (node->nextreader >= node->nreaders)
+				node->nextreader = 0;
+
+			/* Have we visited every (surviving) shm_mq_handle? */
+			nvisited++;
+			if (nvisited >= node->nreaders)
+			{
+				return NULL;
+			}
+		}else
+		{
+			Assert(mq_result == SHM_MQ_SUCCESS);
+			Assert(((MinimalTuple)data)->t_len == nbytes);
+			return (MinimalTuple)data;
+		}
+	}
+}
+
+static void
+PutSlotToQuery(RedistributeWriteInfo *writer, TupleTableSlot *slot,
+			   RedistributeInstrumentation *instrument)
+{
+	MinimalTuple	mtup;
+	shm_mq_result	mq_result;
+	bool			shouldFree;
+
+	mtup = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+	mq_result = shm_mq_send_once(writer->mqwrite, mtup->t_len, mtup);
+	switch (mq_result)
+	{
+	case SHM_MQ_SUCCESS:
+		/* nothing todo */
+		break;
+	case SHM_MQ_WOULD_BLOCK:
+		sts_puttuple(writer->sta, NULL, mtup);
+		if (instrument)
+			++(instrument->disk_rows);
+		break;
+	case SHM_MQ_DETACHED:
+		/* other worker should report an error */
+		break;
+	}
+
+	if (shouldFree)
+		heap_free_minimal_tuple(mtup);
+}
+
+static void
+ExecRedistributeEndWrite(RedistributeState *node)
+{
+	RedistributeWriteInfo *writer;
+	uint32	i = node->nwriter;
+
+	while (i>0)
+	{
+		--i;
+		writer = &node->writer[i];
+		if (writer->sta)
+		{
+			int64	size;
+			REDISTRIBUTE_DEBUG("writer sts_close(%p) part %u", writer->sta, i);
+			size = sts_close(writer->sta);
+			writer->sta = NULL;
+			if (node->instrument)
+				node->instrument->disk_used += size;
+		}
+		if (writer->mqwrite != NULL)
+		{
+			shm_mq_detach(writer->mqwrite);
+			writer->mqwrite = NULL;
+		}
+	}
+}
+
+static TupleTableSlot *
+ExecRedistributeReadOnly(PlanState *pstate)
+{
+	MinimalTuple		mtup;
+	RedistributeState  *node = castNode(RedistributeState, pstate);
+
+loop_:
+	Assert(node->status_flags & STATUS_LOCALLY_END);
+
+	/* try read tuple from MQ and STS */
+	while (node->nreaders > 0)
+	{
+		/* try read tuple from MQ */
+		if ((mtup = GetTupleFromMQ(node)) != NULL)
+			return ExecStoreMinimalTuple(mtup, pstate->ps_ResultTupleSlot, false);
+
+		/*
+		 * GetTupleFromMQ maybe change nreaders,
+		 * so test it again.
+		 */
+		if (unlikely(node->nreaders == 0))
+			break;
+
+		/* test for all attached worker called sts_end_write */
+		if ((node->status_flags & STATUS_READY_STS_READ) == 0)
+		{
+			Barrier *part_barrier = &node->shmem->part_barrier;
+			int part_phase = BarrierAttach(part_barrier);
+			BarrierDetach(part_barrier);
+
+			if (part_phase >= PRW_END_STS)
+			{
+				sts_begin_parallel_scan(node->sta);
+				node->status_flags |= STATUS_READY_STS_READ;
+				SetLatch(MyLatch);
+			}
+		}
+
+		/* try read tuple from STS */
+		if ((node->status_flags & (STATUS_READY_STS_READ|STATUS_QUERY_STS_END)) == STATUS_READY_STS_READ)
+		{
+			mtup = sts_parallel_scan_next(node->sta, NULL);
+			if (mtup != NULL)
+				return ExecStoreMinimalTuple(mtup, pstate->ps_ResultTupleSlot, false);
+			REDISTRIBUTE_DEBUG("reader sts_close(%p)", node->sta);
+			sts_close(node->sta);
+			node->sta = NULL;
+			node->status_flags |= STATUS_QUERY_STS_END;
+		}
+
+		/* wait latch */
+		WaitMyLatch(WAIT_EVENT_MQ_RECEIVE);
+		ResetLatch(MyLatch);
+	}
+
+	if ((node->status_flags & STATUS_READY_STS_READ) == 0)
+	{
+		/* wait for all worker call sts_end_write */
+		Barrier *part_barrier = &node->shmem->part_barrier;
+		BarrierAttach(part_barrier);
+		while (BarrierPhase(part_barrier) < PRW_END_STS)
+			BarrierArriveAndWait(part_barrier, WAIT_EVENT_REDISTRIBUTE_SHARED_TUPLESTORE);
+		BarrierDetach(part_barrier);
+
+		sts_begin_parallel_scan(node->sta);
+		node->status_flags |= STATUS_READY_STS_READ;
+	}
+
+	if ((node->status_flags & STATUS_QUERY_STS_END) == 0)
+	{
+		mtup = sts_parallel_scan_next(node->sta, NULL);
+		if (mtup != NULL)
+			return ExecStoreMinimalTuple(mtup, pstate->ps_ResultTupleSlot, false);
+		REDISTRIBUTE_DEBUG("reader sts_close(%p)", node->sta);
+		sts_close(node->sta);
+		node->sta = NULL;
+		node->status_flags |= STATUS_QUERY_STS_END;
+	}
+
+	/* try read next part */
+	if (ExecNextRedistributePart(node))
+	{
+		ExecInitRedistributeReader(node);
+		sts_begin_parallel_scan(node->sta);
+		node->status_flags &= ~STATUS_QUERY_STS_END;
+		goto loop_;
+	}
+
+	return ExecClearTuple(pstate->ps_ResultTupleSlot);
+}
+
+/*
+ * fetch tuples from outer and other workers
+ */
+static TupleTableSlot *
+ExecRedistributeMixed(PlanState *pstate)
+{
+	uint32				count_part;
+	uint32				hash;
+	uint32				mod;
+	MinimalTuple		mtup;
+	TupleTableSlot	   *slot;
+	RedistributeState  *node = castNode(RedistributeState, pstate);
+
+	/* first try read tuple from MQ */
+	if (node->nreaders > 0)
+	{
+		mtup = GetTupleFromMQ(node);
+		if (mtup != NULL)
+			return ExecStoreMinimalTuple(mtup, pstate->ps_ResultTupleSlot, false);
+	}
+
+	count_part = pg_atomic_read_u32(&node->shmem->final_worker_num);
+	while ((node->status_flags & STATUS_LOCALLY_END) == 0)
+	{
+		slot = ExecProcNode(outerPlanState(pstate));
+		if (TupIsNull(slot))
+		{
+			Assert(BarrierPhase(&node->shmem->part_barrier) == PRW_WRITING);
+			ExecRedistributeEndWrite(node);
+			BarrierArriveAndDetach(&node->shmem->part_barrier);
+			node->status_flags |= STATUS_LOCALLY_END;
+
+			break;
+		}
+
+		hash = RedistributeGetHash(node, slot);
+		Assert(node->current_part != INVALID_PART_NUMBER);
+		Assert(node->current_part < count_part);
+		mod = hash % count_part;
+		if (mod == node->current_part)
+			return ExecCopyOuterSlot(pstate->ps_ResultTupleSlot, slot);
+
+		PutSlotToQuery(&node->writer[mod], slot, node->instrument);
+
+		/* try read tuple from MQ again */
+		if (node->nreaders > 0 &&
+			(mtup = GetTupleFromMQ(node)) != NULL)
+		{
+			return ExecStoreMinimalTuple(mtup, pstate->ps_ResultTupleSlot, false);
+		}
+	}
+
+	/* no more tuples from outer, change to only read tuples from other workers */
+	ExecSetExecProcNode(pstate, ExecRedistributeReadOnly);
+	return ExecRedistributeReadOnly(pstate);
+}
+
+/*
+ * Now, we know how many parallel work start succcessed
+ */
+static TupleTableSlot *
+ExecRedistributeReady(PlanState *pstate)
+{
+	RedistributeState  *node = castNode(RedistributeState, pstate);
+	RedistributeShmem  *shmem = node->shmem;
+	int					part_phase;
+
+	/* attach instrument if exist */
+	if (shmem->instrument_shmem_offset > 0)
+	{
+		int		index = ParallelWorkerNumber + 1;
+		Assert(index < shmem->plan_worker_num);
+		node->instrument = (RedistributeInstrumentation*)SharedRedistributeOffset(shmem, instrument_shmem_offset);
+		node->instrument = &node->instrument[index];
+	}
+
+	/* wait leader change final_worker_num */
+	part_phase = BarrierAttach(&shmem->part_barrier);
+	while (part_phase <= PRW_WAIT_WORKER_NUMBER)
+	{
+		/* only leader can change final_worker_num */
+		BarrierArriveAndWait(&shmem->part_barrier, WAIT_EVENT_REDISTRIBUTE_PARALLEL_START);
+		part_phase = BarrierPhase(&shmem->part_barrier);
+	}
+
+	/* get next part for read and write */
+	if (ExecNextRedistributePart(node) == false)
+	{
+		Assert(part_phase >= PRW_END_STS);
+		BarrierDetach(&shmem->part_barrier);
+		return ExecClearTuple(pstate->ps_ResultTupleSlot);
+	}
+
+	/* intialize execute and read resource */
+	InitializeRedistributeExecute(node);
+	ExecInitRedistributeReader(node);
+
+	if (part_phase >= PRW_END_STS)
+	{
+		/*
+		 * all outer's tuple fetched, don't need fetch anymore
+		 */
+		node->status_flags |= STATUS_LOCALLY_END;
+		BarrierDetach(&shmem->part_barrier);
+		ExecSetExecProcNode(pstate, ExecRedistributeReadOnly);
+	}else
+	{
+		ExecInitRedistributeWriter(node, node->current_part);
+		ExecSetExecProcNode(pstate, ExecRedistributeMixed);
+	}
+
+	Assert(part_phase != PRW_WRITING ||
+		   node->current_part != INVALID_PART_NUMBER);
+
+	return (*pstate->ExecProcNodeReal)(pstate);
+}
+
+RedistributeState *
+ExecInitRedistribute(Redistribute *node, EState *estate, int eflags)
+{
+	RedistributeState *state;
+
+	Assert(outerPlan(node) != NULL);
+	Assert(innerPlan(node) == NULL);
+
+	/*
+	 * create state struct
+	 */
+	state = makeNode(RedistributeState);
+	state->ps.plan = (Plan*)node;
+	state->ps.state = estate;
+	/*
+	 * ExecRedistributeInitializeDSM and ExecRedistributeInitializeWorker
+	 * will change execute function
+	 */
+	state->ps.ExecProcNode = ExecRedistributeError;
+
+	/*
+	 * Miscellaneous initialization
+	 *
+	 * create expression context for node
+	 */
+	ExecAssignExprContext(estate, &state->ps);
+
+	/*
+	 * initialize child nodes
+	 */
+	outerPlanState(state) = ExecInitNode(outerPlan(node), estate, eflags);
+
+	/*
+	 * tuple table and result type initialization
+	 */
+	ExecInitResultTupleSlotTL(&state->ps, &TTSOpsMinimalTuple);
+
+	return state;
+}
+
+/* release parallel work resource */
+static void
+ExecEndRedistributeInternal(RedistributeState *node)
+{
+	uint32	i;
+
+	if (node->writer != NULL &&
+		node->shmem != NULL)
+		ExecRedistributeEndWrite(node);
+
+	if (node->nreaders > 0)
+	{
+		for (i=0;i<node->nreaders;++i)
+			shm_mq_detach(node->mqreader[i]);
+		node->nreaders = node->nextreader = 0;
+	}
+
+	if (node->sta != NULL)
+	{
+		sts_close(node->sta);
+		node->sta = NULL;
+	}
+}
+
+void
+ExecEndRedistribute(RedistributeState *node)
+{
+	ExecEndRedistributeInternal(node);
+	ExecEndNode(outerPlanState(node));
+}
+
+void
+ExecReScanRedistribute(RedistributeState *node)
+{
+	ExecEndRedistributeInternal(node);
+	ExecReScan(outerPlanState(node));
+}
+
+/* parallel scan support */
+static Size
+ComputeRedistributeShmemSize(RedistributeShmem *shmem, int instrument)
+{
+	Size	size;
+	uint32	nworks = shmem->plan_worker_num;
+	Assert(nworks > 1);
+
+	/* RedistributeShmem size */
+	size = MAXALIGN(offsetof(RedistributeShmem, padding));
+
+	/* SharedTuplestore size */
+	Assert(size == MAXALIGN(size));
+	shmem->sts_shmem_offset = size;
+	shmem->sts_shmem_size = MAXALIGN(sts_estimate(nworks));
+	size = add_size(size, mul_size(shmem->sts_shmem_size, nworks));
+
+	/* shm_mq size */
+	Assert(redistribute_query_size > 0);
+	Assert(size == MAXALIGN(size));
+	shmem->mq_shmem_offset = size;
+	shmem->mq_shmem_size = ((Size)(redistribute_query_size)) * 1024;
+	shmem->mq_shmem_part_size = mul_size(shmem->mq_shmem_size, nworks);
+	size = add_size(size, mul_size(shmem->mq_shmem_part_size, nworks));
+
+	/* instrument size if need */
+	if (instrument)
+	{
+		Assert(size == MAXALIGN(size));
+		shmem->instrument_shmem_offset = size;
+		size = add_size(size, sizeof(RedistributeInstrumentation) * nworks);
+	}else
+	{
+		shmem->instrument_shmem_offset = 0;
+	}
+
+	return size;
+}
+
+void
+ExecRedistributeEstimate(RedistributeState *node, ParallelContext *pcxt)
+{
+	RedistributeShmem tmp;
+	tmp.plan_worker_num = pcxt->nworkers + 1;
+
+	shm_toc_estimate_chunk(&pcxt->estimator,
+						   ComputeRedistributeShmemSize(&tmp, node->ps.state->es_instrument));
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+static void
+InitializeRedistributeSharedMemory(RedistributeShmem *shmem,
+								   dsm_segment *seg)
+{
+	uint32	i,j;
+	char	name[MAXPGPATH];
+	char   *addr;
+	Assert(shmem->plan_worker_num > 1);
+
+	/* base init */
+	BarrierInit(&shmem->part_barrier, 0);
+	BarrierAttach(&shmem->part_barrier);	/* let other worker wait */
+	pg_atomic_init_u32(&shmem->final_worker_num, 0);
+	pg_atomic_init_u32(&shmem->next_idle_part, 0);
+	if (seg != NULL)	/* not reinitialize */
+		SharedFileSetInit(&shmem->fileset, seg);
+
+	/* initialize all SharedTuplestore */
+	Assert(shmem->sts_shmem_offset >= offsetof(RedistributeShmem, padding) &&
+		   shmem->sts_shmem_size > 0);
+	addr = SharedRedistributeOffset(shmem, sts_shmem_offset);
+	for (i=0;i<shmem->plan_worker_num;++i)
+	{
+		AssertPointerAlignment(addr, MAXIMUM_ALIGNOF);
+		snprintf(name, sizeof(name), "rd%d", i);
+		sts_create((SharedTuplestore*)addr,
+				   shmem->plan_worker_num,
+				   0,
+				   SHARED_TUPLESTORE_SINGLE_PASS,
+				   name);
+		addr += shmem->sts_shmem_size;
+	}
+
+	/* initialize shm_mq */
+	Assert(shmem->mq_shmem_offset >= offsetof(RedistributeShmem, padding) &&
+		   shmem->mq_shmem_size > 0);
+	addr = SharedRedistributeOffset(shmem, mq_shmem_offset);
+	for (i=0;i<shmem->plan_worker_num;++i)
+	{
+		/* initialize message query for each worker */
+		for (j=0;j<shmem->plan_worker_num;++j)
+		{
+			AssertPointerAlignment(addr, MAXIMUM_ALIGNOF);
+			shm_mq_create(addr, shmem->mq_shmem_size);
+			addr += shmem->mq_shmem_size;
+		}
+	}
+
+	/* initialize instrument if alloced */
+	if (shmem->instrument_shmem_offset > 0)
+	{
+		Assert(shmem->instrument_shmem_offset >= offsetof(RedistributeShmem, padding));
+		addr = SharedRedistributeOffset(shmem, instrument_shmem_offset);
+		AssertPointerAlignment(addr, MAXIMUM_ALIGNOF);
+
+		MemSet(addr, 0, sizeof(RedistributeInstrumentation) * shmem->plan_worker_num);
+	}
+}
+
+void
+ExecRedistributeInitializeDSM(RedistributeState *node, ParallelContext *pcxt)
+{
+	RedistributeShmem tmp;
+
+	tmp.plan_worker_num = pcxt->nworkers + 1;
+	node->shmem = shm_toc_allocate(pcxt->toc,
+								   ComputeRedistributeShmemSize(&tmp, node->ps.state->es_instrument));
+	*node->shmem = tmp;
+	shm_toc_insert(pcxt->toc,
+				   node->ps.plan->plan_node_id,
+				   node->shmem);
+
+	InitializeRedistributeSharedMemory(node->shmem,
+									   pcxt->seg);
+}
+
+void
+ExecRedistributeReInitializeDSM(RedistributeState *node, ParallelContext *pcxt)
+{
+	RedistributeShmem *shmem = node->shmem;
+	if (shmem)
+	{
+		SharedFileSetDeleteAll(&shmem->fileset);
+		InitializeRedistributeSharedMemory(shmem, NULL);
+	}
+	ExecSetExecProcNode(&node->ps, ExecRedistributeError);
+}
+
+/*
+ * find shared memory and change execute proc function
+ */
+void
+ExecRedistributeInitializeWorker(RedistributeState *node, ParallelWorkerContext *pwcxt)
+{
+	node->shmem = shm_toc_lookup(pwcxt->toc,
+								 node->ps.plan->plan_node_id,
+								 false);
+	ExecSetExecProcNode(&node->ps, ExecRedistributeReady);
+}
+
+/*
+ * Call after all parallel workers startup, make sure got parallel
+ * workers startup success count number(ParallelContext::nworkers_launched).
+ */
+void
+ExecRedistributeParallelLaunched(RedistributeState *node, ParallelContext *pcxt)
+{
+	RedistributeShmem *shmem;
+	if (pcxt->nworkers_launched > 0)
+	{
+		/*
+		 * All workers should be waiting leader change final_worker_num,
+		 * Leader already attached part_barrier at initialize DSM
+		 */
+		shmem = node->shmem;
+		Assert(BarrierPhase(&shmem->part_barrier) == PRW_WAIT_WORKER_NUMBER);
+		pg_atomic_write_u32(&shmem->final_worker_num, pcxt->nworkers_launched + 1);
+		BarrierArriveAndDetach(&shmem->part_barrier);
+
+		ExecSetExecProcNode(&node->ps, ExecRedistributeReady);
+	}
+	else
+	{
+		/*
+		 * have no any parallel work start success,
+		 * just return tuples from outer
+		 */
+		ExecSetExecProcNode(&node->ps, ExecRedistributeDirect);
+	}
+}
+
+/* ----------------------------------------------------------------
+ *		ExecRedistributeRetrieveInstrumentation
+ *
+ *		Transfer redistribute statistics from DSM to private memory.
+ * ----------------------------------------------------------------
+ */
+void
+ExecRedistributeRetrieveInstrumentation(RedistributeState *node)
+{
+	Size					size;
+	SharedRedistributeInfo *info;
+	RedistributeShmem	   *shmem = node->shmem;
+
+	if (shmem == NULL ||
+		shmem->instrument_shmem_offset == 0)
+		return;
+
+	size = shmem->plan_worker_num * sizeof(RedistributeInstrumentation);
+	info = palloc(size + offsetof(SharedRedistributeInfo, sinstrument));
+	memcpy(&info->sinstrument[0],
+		   SharedRedistributeOffset(shmem, instrument_shmem_offset),
+		   size);
+	info->num_workers = shmem->plan_worker_num;
+	node->shared_instrument = info;
+}
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 228387eaee..201e2a9a1b 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -1235,6 +1235,28 @@ _copyLimit(const Limit *from)
 	return newnode;
 }
 
+/*
+ * _copyRedistribute
+ */
+static Redistribute *
+_copyRedistribute(const Redistribute *from)
+{
+	Redistribute	   *newnode = makeNode(Redistribute);
+
+	/*
+	 * copy node superclass fields
+	 */
+	CopyPlanFields((const Plan *) from, (Plan *) newnode);
+
+	/*
+	 * copy remainder of node
+	 */
+	COPY_SCALAR_FIELD(numCols);
+	COPY_POINTER_FIELD(hashColIdx, from->numCols * sizeof(from->hashColIdx[0]));
+
+	return newnode;
+}
+
 /*
  * _copyNestLoopParam
  */
@@ -5128,6 +5150,9 @@ copyObjectImpl(const void *from)
 		case T_Limit:
 			retval = _copyLimit(from);
 			break;
+		case T_Redistribute:
+			retval = _copyRedistribute(from);
+			break;
 		case T_NestLoopParam:
 			retval = _copyNestLoopParam(from);
 			break;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index f2a6a6e7a0..4e60386b1f 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -971,6 +971,17 @@ _outLimit(StringInfo str, const Limit *node)
 	WRITE_OID_ARRAY(uniqCollations, node->uniqNumCols);
 }
 
+static void
+_outRedistribute(StringInfo str, const Redistribute *node)
+{
+	WRITE_NODE_TYPE("REDISTRIBUTE");
+
+	_outPlanInfo(str, (const Plan *) node);
+
+	WRITE_INT_FIELD(numCols);
+	WRITE_ATTRNUMBER_ARRAY(hashColIdx, node->numCols);
+}
+
 static void
 _outNestLoopParam(StringInfo str, const NestLoopParam *node)
 {
@@ -2216,6 +2227,17 @@ _outLimitPath(StringInfo str, const LimitPath *node)
 	WRITE_ENUM_FIELD(limitOption, LimitOption);
 }
 
+static void
+_outRedistributePath(StringInfo str, const RedistributePath *node)
+{
+	WRITE_NODE_TYPE("REDISTRIBUTEPATH");
+
+	_outPathInfo(str, (const Path *) node);
+
+	WRITE_NODE_FIELD(subpath);
+	WRITE_NODE_FIELD(hashClause);
+}
+
 static void
 _outGatherMergePath(StringInfo str, const GatherMergePath *node)
 {
@@ -3987,6 +4009,9 @@ outNode(StringInfo str, const void *obj)
 			case T_Limit:
 				_outLimit(str, obj);
 				break;
+			case T_Redistribute:
+				_outRedistribute(str, obj);
+				break;
 			case T_NestLoopParam:
 				_outNestLoopParam(str, obj);
 				break;
@@ -4257,6 +4282,9 @@ outNode(StringInfo str, const void *obj)
 			case T_LimitPath:
 				_outLimitPath(str, obj);
 				break;
+			case T_RedistributePath:
+				_outRedistributePath(str, obj);
+				break;
 			case T_GatherMergePath:
 				_outGatherMergePath(str, obj);
 				break;
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 0dd1ad7dfc..e02b8355b9 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -2493,6 +2493,22 @@ _readLimit(void)
 	READ_DONE();
 }
 
+/*
+ * _readRedistribute
+ */
+static Redistribute *
+_readRedistribute(void)
+{
+	READ_LOCALS(Redistribute);
+
+	ReadCommonPlan(&local_node->plan);
+
+	READ_INT_FIELD(numCols);
+	READ_ATTRNUMBER_ARRAY(hashColIdx, local_node->numCols);
+
+	READ_DONE();
+}
+
 /*
  * _readNestLoopParam
  */
@@ -2949,6 +2965,8 @@ parseNodeString(void)
 		return_value = _readLockRows();
 	else if (MATCH("LIMIT", 5))
 		return_value = _readLimit();
+	else if (MATCH("REDISTRIBUTE", 12))
+		return_value = _readRedistribute();
 	else if (MATCH("NESTLOOPPARAM", 13))
 		return_value = _readNestLoopParam();
 	else if (MATCH("PLANROWMARK", 11))
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 1096a236d5..c81cedc74f 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -4113,6 +4113,21 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 	path->jpath.path.total_cost = startup_cost + run_cost;
 }
 
+void
+cost_redistribute(RedistributePath *path)
+{
+	Cost	run_cost;
+	double	rows = path->path.rows;
+
+	/* hash cost */
+	run_cost = cpu_operator_cost * list_length(path->hashClause) * rows;
+
+	/* communication cost */
+	run_cost += parallel_tuple_cost * (rows - rows / (path->path.parallel_workers+1));
+
+	path->path.startup_cost = path->subpath->startup_cost;
+	path->path.total_cost = path->subpath->total_cost + run_cost;
+}
 
 /*
  * cost_subplan
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index a5f6d678cc..84488c9858 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -311,7 +311,9 @@ static ModifyTable *make_modifytable(PlannerInfo *root, Plan *subplan,
 									 List *rowMarks, OnConflictExpr *onconflict, int epqParam);
 static GatherMerge *create_gather_merge_plan(PlannerInfo *root,
 											 GatherMergePath *best_path);
-
+static Redistribute *create_redistribute_plan(PlannerInfo *root,
+											  RedistributePath *best_path,
+											  int flags);
 
 /*
  * create_plan
@@ -536,6 +538,11 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags)
 			plan = (Plan *) create_gather_merge_plan(root,
 													 (GatherMergePath *) best_path);
 			break;
+		case T_Redistribute:
+			plan = (Plan *) create_redistribute_plan(root,
+													 (RedistributePath *) best_path,
+													 flags);
+			break;
 		default:
 			elog(ERROR, "unrecognized node type: %d",
 				 (int) best_path->pathtype);
@@ -1922,6 +1929,28 @@ create_gather_merge_plan(PlannerInfo *root, GatherMergePath *best_path)
 	return gm_plan;
 }
 
+static Redistribute *
+create_redistribute_plan(PlannerInfo *root, RedistributePath *best_path, int flags)
+{
+	Redistribute   *plan;
+	Plan		   *subplan;
+
+	subplan = create_plan_recurse(root, best_path->subpath,
+								  flags | CP_SMALL_TLIST);
+
+	plan = makeNode(Redistribute);
+	plan->plan.targetlist = subplan->targetlist;
+	plan->plan.qual = NIL;
+	outerPlan(plan) = subplan;
+	innerPlan(plan) = NULL;
+	plan->numCols = list_length(best_path->hashClause);
+	plan->hashColIdx = extract_grouping_cols(best_path->hashClause,
+											 subplan->targetlist);
+
+	copy_generic_path_info(&plan->plan, &best_path->path);
+	return plan;
+}
+
 /*
  * create_projection_plan
  *
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 1e42d75465..77f9d43797 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -192,6 +192,9 @@ static RelOptInfo *create_distinct_paths(PlannerInfo *root,
 static void create_partial_distinct_paths(PlannerInfo *root,
 										  RelOptInfo *input_rel,
 										  RelOptInfo *final_distinct_rel);
+static void create_parallel_distinct_paths(PlannerInfo *root,
+										   RelOptInfo *input_rel,
+										   RelOptInfo *distinct_rel);
 static RelOptInfo *create_final_distinct_paths(PlannerInfo *root,
 											   RelOptInfo *input_rel,
 											   RelOptInfo *distinct_rel);
@@ -3361,6 +3364,7 @@ create_grouping_paths(PlannerInfo *root,
 		extra.havingQual = parse->havingQual;
 		extra.targetList = parse->targetList;
 		extra.partial_costs_set = false;
+		extra.hashableList = grouping_get_hashable(parse->groupClause);
 
 		/*
 		 * Determine whether partitionwise aggregation is in theory possible.
@@ -4257,6 +4261,7 @@ create_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel)
 	distinct_rel->userid = input_rel->userid;
 	distinct_rel->useridiscurrent = input_rel->useridiscurrent;
 	distinct_rel->fdwroutine = input_rel->fdwroutine;
+	distinct_rel->reltarget = root->upper_targets[UPPERREL_DISTINCT];
 
 	/* build distinct paths based on input_rel's pathlist */
 	create_final_distinct_paths(root, input_rel, distinct_rel);
@@ -4264,6 +4269,11 @@ create_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel)
 	/* now build distinct paths based on input_rel's partial_pathlist */
 	create_partial_distinct_paths(root, input_rel, distinct_rel);
 
+	/* now build parallel distinct paths based on input_rel's partial_pathlist */
+	create_parallel_distinct_paths(root, input_rel, distinct_rel);
+
+	generate_useful_gather_paths(root, distinct_rel, false);
+
 	/* Give a helpful error if we failed to create any paths */
 	if (distinct_rel->pathlist == NIL)
 		ereport(ERROR,
@@ -4565,6 +4575,95 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
 	return distinct_rel;
 }
 
+static void
+create_parallel_distinct_paths(PlannerInfo *root,
+							   RelOptInfo *input_rel,
+							   RelOptInfo *distinct_rel)
+{
+	List	   *hashable_clause;
+	Query	   *parse;
+	List	   *distinctExprs;
+	double		numDistinctRows;
+	Path	   *cheapest_partial_path;
+	Path	   *path;
+
+	if (distinct_rel->consider_parallel == false ||
+		redistribute_query_size <= 0 ||
+		input_rel->partial_pathlist == NIL)
+		return;
+
+	parse = root->parse;
+
+	hashable_clause = grouping_get_hashable(parse->distinctClause);
+	if (hashable_clause == NIL)
+		return;
+
+	cheapest_partial_path = linitial(input_rel->partial_pathlist);
+
+	distinctExprs = get_sortgrouplist_exprs(parse->distinctClause,
+											parse->targetList);
+
+	/* estimate how many distinct rows we'll get from each worker */
+	numDistinctRows = estimate_num_groups(root, distinctExprs,
+										  input_rel->cheapest_total_path->rows,
+										  NULL, NULL);
+	numDistinctRows /= (cheapest_partial_path->parallel_workers + 1);
+	if (numDistinctRows < 1.0)
+		numDistinctRows = 1.0;
+
+	if (grouping_is_sortable(parse->distinctClause))
+	{
+		List	   *needed_pathkeys;
+
+		/* For explicit-sort case, always use the more rigorous clause */
+		if (list_length(root->distinct_pathkeys) <
+			list_length(root->sort_pathkeys))
+		{
+			needed_pathkeys = root->sort_pathkeys;
+			/* Assert checks that parser didn't mess up... */
+			Assert(pathkeys_contained_in(root->distinct_pathkeys,
+										 needed_pathkeys));
+		}
+		else
+			needed_pathkeys = root->distinct_pathkeys;
+
+		path = (Path *) create_redistribute_path(root,
+												 distinct_rel,
+												 cheapest_partial_path,
+												 hashable_clause);
+		path = (Path *) create_sort_path(root,
+										 distinct_rel,
+										 path,
+										 needed_pathkeys,
+										 -1.0);
+		path = (Path *)create_upper_unique_path(root,
+												distinct_rel,
+												path,
+												list_length(root->distinct_pathkeys),
+												numDistinctRows);
+		add_partial_path(distinct_rel, path);
+	}
+
+	if (grouping_is_hashable(parse->distinctClause))
+	{
+		path = (Path *) create_redistribute_path(root,
+												 distinct_rel,
+												 cheapest_partial_path,
+												 hashable_clause);
+		path = (Path *)create_agg_path(root,
+									   distinct_rel,
+									   path,
+									   path->pathtarget,
+									   AGG_HASHED,
+									   AGGSPLIT_SIMPLE,
+									   parse->distinctClause,
+									   NIL,
+									   NULL,
+									   numDistinctRows);
+		add_partial_path(distinct_rel, path);
+	}
+}
+
 /*
  * create_ordered_paths
  *
@@ -6294,6 +6393,57 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			}
 		}
 
+		if (parse->groupingSets == NIL && /* not support grouping sets */
+			extra->hashableList != NIL &&
+			grouped_rel->consider_parallel &&
+			input_rel->partial_pathlist != NIL &&
+			redistribute_query_size > 0)
+		{
+			Path   *path = linitial(input_rel->partial_pathlist);
+			double	rows = dNumGroups / (path->parallel_workers + 1);
+			if (rows < 1.0)
+				rows = 1.0;
+
+			/* Redistribute tuples using hashable group columns */
+			path = (Path*) create_redistribute_path(root,
+													grouped_rel,
+													path,
+													extra->hashableList);
+
+			/* Sort the path */
+			path = (Path*) create_sort_path(root,
+											grouped_rel,
+											path,
+											root->group_pathkeys,
+											-1.0);
+
+			Assert(parse->groupClause != NIL);
+			if (parse->hasAggs)
+			{
+				add_partial_path(grouped_rel, (Path *)
+						 create_agg_path(root,
+										 grouped_rel,
+										 path,
+										 grouped_rel->reltarget,
+										 AGG_SORTED,
+										 AGGSPLIT_SIMPLE,
+										 parse->groupClause,
+										 havingQual,
+										 agg_costs,
+										 rows));
+			}
+			else
+			{
+				add_partial_path(grouped_rel, (Path *)
+								 create_group_path(root,
+												   grouped_rel,
+												   path,
+												   parse->groupClause,
+												   havingQual,
+												   dNumGroups));
+			}
+		}
+
 		/*
 		 * Instead of operating directly on the input relation, we can
 		 * consider finalizing a partially aggregated path.
@@ -6397,6 +6547,58 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 											   havingQual,
 											   dNumGroups));
 			}
+
+			if (parse->groupingSets == NIL && /* not support grouping sets */
+				extra->hashableList != NIL &&
+				grouped_rel->consider_parallel &&
+				partially_grouped_rel->partial_pathlist != NIL &&
+				redistribute_query_size > 0)
+			{
+				Path   *path = linitial(partially_grouped_rel->partial_pathlist);
+				double	rows = dNumGroups / (path->parallel_workers + 1);
+				if (rows < 1.0)
+					rows = 1.0;
+
+				/* Redistribute tuples using hashable group columns */
+				path = (Path*) create_redistribute_path(root,
+														grouped_rel,
+														path,
+														extra->hashableList);
+
+				/* Sort the path */
+				path = (Path*) create_sort_path(root,
+												grouped_rel,
+												path,
+												root->group_pathkeys,
+												-1.0);
+
+				Assert(parse->groupClause != NIL);
+				if (parse->hasAggs)
+				{
+					add_partial_path(grouped_rel, (Path *)
+							 create_agg_path(root,
+											 grouped_rel,
+											 path,
+											 grouped_rel->reltarget,
+											 AGG_SORTED,
+											 AGGSPLIT_FINAL_DESERIAL,
+											 parse->groupClause,
+											 havingQual,
+											 agg_costs,
+											 rows));
+				}
+				else
+				{
+					add_partial_path(grouped_rel, (Path *)
+									 create_group_path(root,
+													   grouped_rel,
+													   path,
+													   parse->groupClause,
+													   havingQual,
+													   dNumGroups));
+				}
+			}
+
 		}
 	}
 
@@ -6427,6 +6629,34 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 									 havingQual,
 									 agg_costs,
 									 dNumGroups));
+
+			if (redistribute_query_size > 0 &&
+				extra->hashableList != NIL &&
+				grouped_rel->consider_parallel &&
+				input_rel->partial_pathlist != NIL)
+			{
+				Path   *path = linitial(input_rel->partial_pathlist);
+				double	rows = dNumGroups / (path->parallel_workers + 1);
+				if (rows < 1.0)
+					rows = 1.0;
+
+				/* Redistribute tuples using hashable group columns */
+				path = (Path*) create_redistribute_path(root,
+														grouped_rel,
+														path,
+														extra->hashableList);
+
+				path = (Path *) create_agg_path(root, grouped_rel,
+												path,
+												grouped_rel->reltarget,
+												AGG_HASHED,
+												AGGSPLIT_SIMPLE,
+												parse->groupClause,
+												havingQual,
+												agg_costs,
+												rows);
+				add_partial_path(grouped_rel, path);
+			}
 		}
 
 		/*
@@ -6448,6 +6678,36 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 									 havingQual,
 									 agg_final_costs,
 									 dNumGroups));
+
+			if (redistribute_query_size > 0 &&
+				extra->hashableList != NIL &&
+				grouped_rel->consider_parallel &&
+				partially_grouped_rel->partial_pathlist != NIL)
+			{
+				double	rows;
+				path = linitial(partially_grouped_rel->partial_pathlist);
+				rows = dNumGroups / (path->parallel_workers + 1);
+				if (rows < 1.0)
+					rows = 1.0;
+
+				/* Redistribute tuples using hashable group columns */
+				path = (Path*) create_redistribute_path(root,
+														grouped_rel,
+														path,
+														extra->hashableList);
+
+				path = (Path *) create_agg_path(root,
+												grouped_rel,
+												path,
+												grouped_rel->reltarget,
+												AGG_HASHED,
+												AGGSPLIT_FINAL_DESERIAL,
+												parse->groupClause,
+												havingQual,
+												agg_final_costs,
+												rows);
+				add_partial_path(grouped_rel, path);
+			}
 		}
 	}
 
diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c
index 3d23a2e5ac..0f03eb2cff 100644
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -810,6 +810,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
 		case T_IncrementalSort:
 		case T_Unique:
 		case T_SetOp:
+		case T_Redistribute:
 
 			/*
 			 * These plan types don't actually bother to evaluate their
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index c9f7a09d10..ef9aaeb4c4 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -2756,6 +2756,7 @@ finalize_plan(PlannerInfo *root, Plan *plan,
 		case T_Unique:
 		case T_SetOp:
 		case T_Group:
+		case T_Redistribute:
 			/* no node-type-specific fields need fixing */
 			break;
 
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index e9256a2d4d..5bb5acf5d1 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -67,7 +67,7 @@ static List *plan_union_children(PlannerInfo *root,
 								 List *refnames_tlist,
 								 List **tlist_list);
 static Path *make_union_unique(SetOperationStmt *op, Path *path, List *tlist,
-							   PlannerInfo *root);
+							   PlannerInfo *root, List *groupList, List *sortKeys);
 static void postprocess_setop_rel(PlannerInfo *root, RelOptInfo *rel);
 static bool choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 								Path *input_path,
@@ -355,6 +355,7 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
 			rel = generate_nonunion_paths(op, root,
 										  refnames_tlist,
 										  pTargetList);
+		generate_useful_gather_paths(root, rel, false);
 		if (pNumGroups)
 			*pNumGroups = rel->rows;
 
@@ -553,6 +554,8 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	List	   *tlist_list;
 	List	   *tlist;
 	Path	   *path;
+	List	   *groupList = NIL;
+	List	   *sortKeys = NIL;
 
 	/*
 	 * If plain UNION, tell children to fetch all tuples.
@@ -588,6 +591,15 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 
 	*pTargetList = tlist;
 
+	if (!op->all)
+	{
+		/* Identify the grouping semantics */
+		groupList = generate_setop_grouplist(op, tlist);
+		if (groupList != NIL &&
+			grouping_is_sortable(groupList))
+			sortKeys = make_pathkeys_for_sortclauses(root, groupList, tlist);
+	}
+
 	/* Build path lists and relid set. */
 	foreach(lc, rellist)
 	{
@@ -628,7 +640,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	 * node(s) to remove duplicates.
 	 */
 	if (!op->all)
-		path = make_union_unique(op, path, tlist, root);
+		path = make_union_unique(op, path, tlist, root, groupList, sortKeys);
 
 	add_path(result_rel, path);
 
@@ -679,11 +691,62 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 							   NIL, NULL,
 							   parallel_workers, enable_parallel_append,
 							   -1);
+
+		if (!op->all &&
+			redistribute_query_size > 0)
+		{
+			Path   *parallel_path;
+			List   *hashable_list;
+
+			/* create parallel sort union */
+			if (sortKeys != NIL &&
+				(hashable_list = grouping_get_hashable(groupList)) != NIL)
+			{
+				Assert(list_length(sortKeys) >= list_length(hashable_list));
+
+				parallel_path = (Path*) create_redistribute_path(root,
+																 result_rel,
+																 ppath,
+																 hashable_list);
+				parallel_path = (Path*) create_sort_path(root,
+														 result_rel,
+														 parallel_path,
+														 sortKeys,
+														 -1.0);
+				parallel_path = (Path*) create_upper_unique_path(root,
+																result_rel,
+																parallel_path,
+																list_length(sortKeys),
+																ppath->rows);
+				add_partial_path(result_rel, parallel_path);
+			}
+
+			/* create parallel hash union */
+			if (grouping_is_hashable(groupList))
+			{
+				parallel_path = (Path*) create_redistribute_path(root,
+																 result_rel,
+																 ppath,
+																 groupList);
+				parallel_path = (Path*) create_agg_path(root,
+														result_rel,
+														parallel_path,
+														create_pathtarget(root, tlist),
+														AGG_HASHED,
+														AGGSPLIT_SIMPLE,
+														groupList,
+														NIL,
+														NULL,
+														ppath->rows);
+				add_partial_path(result_rel, parallel_path);
+			}
+		}
+
 		ppath = (Path *)
 			create_gather_path(root, result_rel, ppath,
 							   result_rel->reltarget, NULL, NULL);
 		if (!op->all)
-			ppath = make_union_unique(op, ppath, tlist, root);
+			ppath = make_union_unique(op, ppath, tlist, root, groupList, sortKeys);
 		add_path(result_rel, ppath);
 	}
 
@@ -934,15 +997,11 @@ plan_union_children(PlannerInfo *root,
  */
 static Path *
 make_union_unique(SetOperationStmt *op, Path *path, List *tlist,
-				  PlannerInfo *root)
+				  PlannerInfo *root, List *groupList, List *sortKeys)
 {
 	RelOptInfo *result_rel = fetch_upper_rel(root, UPPERREL_SETOP, NULL);
-	List	   *groupList;
 	double		dNumGroups;
 
-	/* Identify the grouping semantics */
-	groupList = generate_setop_grouplist(op, tlist);
-
 	/*
 	 * XXX for the moment, take the number of distinct groups as equal to the
 	 * total input size, ie, the worst case.  This is too conservative, but
@@ -977,9 +1036,7 @@ make_union_unique(SetOperationStmt *op, Path *path, List *tlist,
 				create_sort_path(root,
 								 result_rel,
 								 path,
-								 make_pathkeys_for_sortclauses(root,
-															   groupList,
-															   tlist),
+								 sortKeys,
 								 -1.0);
 		path = (Path *) create_upper_unique_path(root,
 												 result_rel,
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index cedb3848dd..a515edc827 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -3822,6 +3822,32 @@ adjust_limit_rows_costs(double *rows,	/* in/out parameter */
 	}
 }
 
+RedistributePath *
+create_redistribute_path(PlannerInfo *root,
+						 RelOptInfo *rel,
+						 Path *subpath,
+						 List *groupClause)
+{
+	RedistributePath   *path;
+	Assert(rel->consider_parallel && subpath->parallel_safe);
+
+	path = makeNode(RedistributePath);
+	path->path.pathtype = T_Redistribute;
+	path->path.parent = rel;
+	/* Redistribute doesn't project, so use source path's pathtarget */
+	path->path.pathtarget = subpath->pathtarget;
+	path->path.parallel_aware = true;
+	path->path.parallel_safe = true;
+	path->path.parallel_workers = subpath->parallel_workers;
+	path->path.rows = subpath->rows;
+	path->path.pathkeys = subpath->pathkeys;
+	path->subpath = subpath;
+	path->hashClause = groupClause;
+
+	cost_redistribute(path);
+
+	return path;
+}
 
 /*
  * reparameterize_path
diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c
index 311579d059..d6b42ce67a 100644
--- a/src/backend/optimizer/util/tlist.c
+++ b/src/backend/optimizer/util/tlist.c
@@ -560,6 +560,27 @@ grouping_is_hashable(List *groupClause)
 	return true;
 }
 
+/*
+ * grouping_get_hashable - extract all hashable clause
+ *
+ * Some time we don't all clauses can be hash
+ */
+List *
+grouping_get_hashable(List *groupClause)
+{
+	ListCell   *lc;
+	List	   *result = NIL;
+
+	foreach (lc, groupClause)
+	{
+		SortGroupClause *groupcl = lfirst_node(SortGroupClause, lc);
+
+		if (groupcl->hashable)
+			result = lappend(result, groupcl);
+	}
+
+	return result;
+}
 
 /*****************************************************************************
  *		PathTarget manipulation functions
diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c
index 91a7093e03..8b6ffc595e 100644
--- a/src/backend/storage/ipc/shm_mq.c
+++ b/src/backend/storage/ipc/shm_mq.c
@@ -259,6 +259,68 @@ shm_mq_get_sender(shm_mq *mq)
 	return sender;
 }
 
+/*
+ * Receiver get peer
+ */
+PGPROC *
+shm_mq_receiver_get_sender(shm_mq_handle *mqh)
+{
+	PGPROC	   *sender;
+	shm_mq	   *mq = mqh->mqh_queue;
+	Assert(mq->mq_receiver == MyProc);
+
+	/*
+	 * If the counterparty is known to have attached, we can read mq_receiver
+	 * without acquiring the spinlock and assume it isn't NULL.  Otherwise,
+	 * more caution is needed.
+	 */
+	if (mqh->mqh_counterparty_attached)
+	{
+		sender = mq->mq_sender;
+	}
+	else
+	{
+		SpinLockAcquire(&mq->mq_mutex);
+		sender = mq->mq_sender;
+		SpinLockRelease(&mq->mq_mutex);
+		if (sender != NULL)
+			mqh->mqh_counterparty_attached = true;
+	}
+
+	return sender;
+}
+
+/*
+ * Sender get peer
+ */
+PGPROC *
+shm_mq_sender_get_reciver(shm_mq_handle *mqh)
+{
+	PGPROC	   *receiver;
+	shm_mq	   *mq = mqh->mqh_queue;
+	Assert(mq->mq_sender == MyProc);
+
+	/*
+	 * If the counterparty is known to have attached, we can read mq_receiver
+	 * without acquiring the spinlock and assume it isn't NULL.  Otherwise,
+	 * more caution is needed.
+	 */
+	if (mqh->mqh_counterparty_attached)
+	{
+		receiver = mq->mq_receiver;
+	}
+	else
+	{
+		SpinLockAcquire(&mq->mq_mutex);
+		receiver = mq->mq_receiver;
+		SpinLockRelease(&mq->mq_mutex);
+		if (receiver != NULL)
+			mqh->mqh_counterparty_attached = true;
+	}
+
+	return receiver;
+}
+
 /*
  * Attach to a shared message queue so we can send or receive messages.
  *
@@ -523,6 +585,44 @@ shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait)
 	return SHM_MQ_SUCCESS;
 }
 
+/*
+ * like shm_mq_send(nowait), but return SHM_MQ_WOULD_BLOCK immediately when
+ * no space for write data, not write any data to queue
+ */
+shm_mq_result
+shm_mq_send_once(shm_mq_handle *mqh, Size nbytes, const void *data)
+{
+	PGPROC	   *receiver;
+	shm_mq	   *mq = mqh->mqh_queue;
+	uint64		rb = pg_atomic_read_u64(&mq->mq_bytes_read);
+	uint64		wb = pg_atomic_read_u64(&mq->mq_bytes_written);
+	Size		written;
+	shm_mq_result result;
+
+	Assert(mq->mq_sender == MyProc);
+	Assert(mqh->mqh_length_word_complete == false);
+	Assert(mqh->mqh_partial_bytes == 0);
+
+	/* test space */
+	if (mq->mq_ring_size - (wb - rb) < nbytes + sizeof(Size))
+		return SHM_MQ_WOULD_BLOCK;
+
+	/* write date */
+	if ((result = shm_mq_send_bytes(mqh, sizeof(Size), &nbytes, true, &written)) != SHM_MQ_SUCCESS ||
+		(result = shm_mq_send_bytes(mqh, nbytes, data, true, &written)) != SHM_MQ_SUCCESS)
+	{
+		/* should not get SHM_MQ_WOULD_BLOCK */
+		Assert(result == SHM_MQ_DETACHED);
+		return result;
+	}
+
+	/* Notify receiver of the newly-written data(if attached), and return. */
+	receiver = shm_mq_sender_get_reciver(mqh);
+	if (receiver != NULL)
+		SetLatch(&receiver->procLatch);
+	return SHM_MQ_SUCCESS;
+}
+
 /*
  * Receive a message from a shared message queue.
  *
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index ef7e6bfb77..f86c2f36ae 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -451,6 +451,12 @@ pgstat_get_wait_ipc(WaitEventIPC w)
 		case WAIT_EVENT_XACT_GROUP_UPDATE:
 			event_name = "XactGroupUpdate";
 			break;
+		case WAIT_EVENT_REDISTRIBUTE_PARALLEL_START:
+			event_name = "RedistributeParallelStartup";
+			break;
+		case WAIT_EVENT_REDISTRIBUTE_SHARED_TUPLESTORE:
+			event_name = "RedistributeSharedTuplestore";
+			break;
 			/* no default case, so that compiler will warn */
 	}
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 23236fa4c3..6642fa84bc 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -137,6 +137,7 @@ extern char *temp_tablespaces;
 extern bool ignore_checksum_failure;
 extern bool ignore_invalid_pages;
 extern bool synchronize_seqscans;
+extern int	redistribute_query_size;	/* in nodeRedistribute.c */
 
 #ifdef TRACE_SYNCSCAN
 extern bool trace_syncscan;
@@ -3558,6 +3559,17 @@ static struct config_int ConfigureNamesInt[] =
 		check_client_connection_check_interval, NULL, NULL
 	},
 
+	{
+		{"redistribute_query_size", PGC_USERSET, 0,
+			gettext_noop("Sets the maximum shared memory to be used by redistribute plan"),
+			NULL,
+			GUC_UNIT_KB,
+		},
+		&redistribute_query_size,
+		0, 0, MAX_KILOBYTES,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c
index 033088f9bc..13071d43a7 100644
--- a/src/backend/utils/sort/sharedtuplestore.c
+++ b/src/backend/utils/sort/sharedtuplestore.c
@@ -131,11 +131,22 @@ sts_initialize(SharedTuplestore *sts, int participants,
 			   SharedFileSet *fileset,
 			   const char *name)
 {
-	SharedTuplestoreAccessor *accessor;
-	int			i;
-
-	Assert(my_participant_number < participants);
+	sts_create(sts, participants, meta_data_size, flags, name);
+	return sts_attach(sts, my_participant_number, fileset);
+}
 
+/*
+ * The caller must supply a SharedFileSet, which is essentially a directory
+ * that will be cleaned up automatically, and a name which must be unique
+ * across all SharedTuplestores created in the same SharedFileSet.
+ */
+void
+sts_create(SharedTuplestore *sts, int participants,
+		   size_t meta_data_size,
+		   int flags,
+		   const char *name)
+{
+	int		i;
 	sts->nparticipants = participants;
 	sts->meta_data_size = meta_data_size;
 	sts->flags = flags;
@@ -160,14 +171,6 @@ sts_initialize(SharedTuplestore *sts, int participants,
 		sts->participants[i].read_page = 0;
 		sts->participants[i].writing = false;
 	}
-
-	accessor = palloc0(sizeof(SharedTuplestoreAccessor));
-	accessor->participant = my_participant_number;
-	accessor->sts = sts;
-	accessor->fileset = fileset;
-	accessor->context = CurrentMemoryContext;
-
-	return accessor;
 }
 
 /*
@@ -205,6 +208,32 @@ sts_flush_chunk(SharedTuplestoreAccessor *accessor)
 		STS_CHUNK_PAGES;
 }
 
+/*
+ * close opend files and free memory resource
+ * return written size
+ */
+int64
+sts_close(SharedTuplestoreAccessor *accessor)
+{
+	int64	size = 0;
+	if (accessor->write_file != NULL)
+	{
+		sts_flush_chunk(accessor);
+		size = BufFileSize(accessor->write_file);
+		BufFileClose(accessor->write_file);
+		pfree(accessor->write_chunk);
+		accessor->write_chunk = NULL;
+		accessor->write_file = NULL;
+		accessor->sts->participants[accessor->participant].writing = false;
+	}
+
+	if (accessor->read_file)
+		BufFileClose(accessor->read_file);
+	pfree(accessor);
+
+	return size;
+}
+
 /*
  * Finish writing tuples.  This must be called by all backends that have
  * written data before any backend begins reading it.
diff --git a/src/include/executor/execParallel.h b/src/include/executor/execParallel.h
index 3888175a2f..be67633358 100644
--- a/src/include/executor/execParallel.h
+++ b/src/include/executor/execParallel.h
@@ -45,6 +45,7 @@ extern void ExecParallelFinish(ParallelExecutorInfo *pei);
 extern void ExecParallelCleanup(ParallelExecutorInfo *pei);
 extern void ExecParallelReinitialize(PlanState *planstate,
 									 ParallelExecutorInfo *pei, Bitmapset *sendParam);
+extern void ExecParallelLaunched(PlanState *planstate, ParallelContext *pcxt);
 
 extern void ParallelQueryMain(dsm_segment *seg, shm_toc *toc);
 
diff --git a/src/include/executor/nodeRedistribute.h b/src/include/executor/nodeRedistribute.h
new file mode 100644
index 0000000000..960d434eb2
--- /dev/null
+++ b/src/include/executor/nodeRedistribute.h
@@ -0,0 +1,21 @@
+#ifndef NODEREDISTRIBUTE_H
+#define NODEREDISTRIBUTE_H
+
+#include "access/parallel.h"
+#include "nodes/execnodes.h"
+
+extern RedistributeState *ExecInitRedistribute(Redistribute *node, EState *estate, int eflags);
+extern void ExecEndRedistribute(RedistributeState *node);
+extern void ExecReScanRedistribute(RedistributeState *node);
+
+/* parallel scan support */
+extern void ExecRedistributeEstimate(RedistributeState *node, ParallelContext *pcxt);
+extern void ExecRedistributeInitializeDSM(RedistributeState *node, ParallelContext *pcxt);
+extern void ExecRedistributeReInitializeDSM(RedistributeState *node, ParallelContext *pcxt);
+extern void ExecRedistributeInitializeWorker(RedistributeState *node,
+											 ParallelWorkerContext *pwcxt);
+extern void ExecRedistributeParallelLaunched(RedistributeState *node,
+											 ParallelContext *pcxt);
+extern void ExecRedistributeRetrieveInstrumentation(RedistributeState *node);
+
+#endif							/* NODEREDISTRIBUTE_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 37cb4f3d59..0c506faeb9 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2640,4 +2640,43 @@ typedef struct LimitState
 	TupleTableSlot *last_slot;	/* slot for evaluation of ties */
 } LimitState;
 
+/* ---------------------
+ *	per-worker redistribute information
+ * ---------------------
+ */
+typedef struct RedistributeInstrumentation
+{
+	Size	disk_used;		/* bytes of disk space used */
+	Size	disk_rows;		/* rows of disk writed */
+	uint32	parts_got;		/* parts fetched */
+}RedistributeInstrumentation;
+
+/* ----------------
+ *	 Shared memory container for per-worker aggregate information
+ * ----------------
+ */
+typedef struct SharedRedistributeInfo
+{
+	int			num_workers;
+	RedistributeInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
+} SharedRedistributeInfo;
+
+typedef struct RedistributeState
+{
+	PlanState		ps;
+	List		   *hash_funcs;				/* for compute hash with redistribute */
+	int				hash_max_attr;			/* max attribute from subplan's output */
+	struct RedistributeShmem *shmem;		/* private in nodeRedistribute.c */
+	struct RedistributeWriteInfo *writer;	/* private in nodeRedistribute.c */
+	struct SharedTuplestoreAccessor *sta;	/* in disk tuples from other workers */
+	struct shm_mq_handle  **mqreader;		/* array with nreaders active entries */
+	uint32			nreaders;				/* number of working workers for writing */
+	uint32			nextreader;				/* next one to try to read from */
+	uint32			nwriter;				/* number of array writer */
+	uint32			status_flags;			/* execute status flags */
+	uint32			current_part;			/* current partition number */
+	RedistributeInstrumentation *instrument;
+	SharedRedistributeInfo *shared_instrument;
+}RedistributeState;
+
 #endif							/* EXECNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index b3ee4194d3..b935c56474 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -87,6 +87,7 @@ typedef enum NodeTag
 	T_SetOp,
 	T_LockRows,
 	T_Limit,
+	T_Redistribute,
 	/* these aren't subclasses of Plan: */
 	T_NestLoopParam,
 	T_PlanRowMark,
@@ -146,6 +147,7 @@ typedef enum NodeTag
 	T_SetOpState,
 	T_LockRowsState,
 	T_LimitState,
+	T_RedistributeState,
 
 	/*
 	 * TAGS FOR PRIMITIVE NODES (primnodes.h)
@@ -262,6 +264,7 @@ typedef enum NodeTag
 	T_LockRowsPath,
 	T_ModifyTablePath,
 	T_LimitPath,
+	T_RedistributePath,
 	/* these aren't subclasses of Path: */
 	T_EquivalenceClass,
 	T_EquivalenceMember,
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index c8a8eec6e2..0c73682ae5 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1908,6 +1908,15 @@ typedef struct LimitPath
 	LimitOption limitOption;	/* FETCH FIRST with ties or exact number */
 } LimitPath;
 
+/*
+ * Redistribute represents redistribute tuples
+ */
+typedef struct RedistributePath
+{
+	Path		path;
+	Path	   *subpath;		/* path representing input source */
+	List	   *hashClause;		/* a list of SortGroupClause's */
+}RedistributePath;
 
 /*
  * Restriction clause info.
@@ -2596,6 +2605,7 @@ typedef struct
 	bool		target_parallel_safe;
 	Node	   *havingQual;
 	List	   *targetList;
+	List	   *hashableList;	/* hashable group list */
 	PartitionwiseAggregateType patype;
 } GroupPathExtraData;
 
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index ec9a8b0c81..48f08149f1 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -1034,6 +1034,20 @@ typedef struct Limit
 } Limit;
 
 
+/* ----------------
+ *		limit node
+ *
+ * Note: as of Postgres 8.2, the offset and count expressions are expected
+ * to yield int8, rather than int4 as before.
+ * ----------------
+ */
+typedef struct Redistribute
+{
+	Plan		plan;
+	int			numCols;		/* number of hash columns */
+	AttrNumber *hashColIdx;		/* their indexes in the target list */
+}Redistribute;
+
 /*
  * RowMarkType -
  *	  enums for types of row-marking operations
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 2113bc82de..17180c01f8 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -68,6 +68,7 @@ extern PGDLLIMPORT bool enable_parallel_hash;
 extern PGDLLIMPORT bool enable_partition_pruning;
 extern PGDLLIMPORT bool enable_async_append;
 extern PGDLLIMPORT int constraint_exclusion;
+extern PGDLLIMPORT int redistribute_query_size; /* in nodeRedistribute.c */
 
 extern double index_pages_fetched(double tuples_fetched, BlockNumber pages,
 								  double index_pages, PlannerInfo *root);
@@ -169,6 +170,7 @@ extern void cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
 							  RelOptInfo *rel, ParamPathInfo *param_info,
 							  Cost input_startup_cost, Cost input_total_cost,
 							  double *rows);
+extern void cost_redistribute(RedistributePath *path);
 extern void cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan);
 extern void cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root);
 extern void cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root);
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index f704d39980..3ceecc2bf2 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -290,6 +290,10 @@ extern Path *reparameterize_path(PlannerInfo *root, Path *path,
 								 double loop_count);
 extern Path *reparameterize_path_by_child(PlannerInfo *root, Path *path,
 										  RelOptInfo *child_rel);
+extern RedistributePath *create_redistribute_path(PlannerInfo *root,
+												  RelOptInfo *rel,
+												  Path *subpath,
+												  List *groupClause);
 
 /*
  * prototypes for relnode.c
diff --git a/src/include/optimizer/tlist.h b/src/include/optimizer/tlist.h
index d62a09665a..afa06de584 100644
--- a/src/include/optimizer/tlist.h
+++ b/src/include/optimizer/tlist.h
@@ -35,6 +35,7 @@ extern Oid *extract_grouping_collations(List *groupClause, List *tlist);
 extern AttrNumber *extract_grouping_cols(List *groupClause, List *tlist);
 extern bool grouping_is_sortable(List *groupClause);
 extern bool grouping_is_hashable(List *groupClause);
+extern List *grouping_get_hashable(List *groupClause);
 
 extern PathTarget *make_pathtarget_from_tlist(List *tlist);
 extern List *make_tlist_from_pathtarget(PathTarget *target);
diff --git a/src/include/storage/shm_mq.h b/src/include/storage/shm_mq.h
index e693f3f760..2bd6f70161 100644
--- a/src/include/storage/shm_mq.h
+++ b/src/include/storage/shm_mq.h
@@ -54,6 +54,8 @@ extern void shm_mq_set_sender(shm_mq *mq, PGPROC *);
 /* Accessor methods for sender and receiver. */
 extern PGPROC *shm_mq_get_receiver(shm_mq *);
 extern PGPROC *shm_mq_get_sender(shm_mq *);
+extern PGPROC *shm_mq_receiver_get_sender(shm_mq_handle *mqh);
+extern PGPROC *shm_mq_sender_get_reciver(shm_mq_handle *mqh);
 
 /* Set up backend-local queue state. */
 extern shm_mq_handle *shm_mq_attach(shm_mq *mq, dsm_segment *seg,
@@ -73,6 +75,8 @@ extern shm_mq_result shm_mq_send(shm_mq_handle *mqh,
 								 Size nbytes, const void *data, bool nowait);
 extern shm_mq_result shm_mq_sendv(shm_mq_handle *mqh,
 								  shm_mq_iovec *iov, int iovcnt, bool nowait);
+extern shm_mq_result shm_mq_send_once(shm_mq_handle *mqh,
+									  Size nbytes, const void *data);
 extern shm_mq_result shm_mq_receive(shm_mq_handle *mqh,
 									Size *nbytesp, void **datap, bool nowait);
 
diff --git a/src/include/utils/sharedtuplestore.h b/src/include/utils/sharedtuplestore.h
index 01ad6efe51..ff6df228bc 100644
--- a/src/include/utils/sharedtuplestore.h
+++ b/src/include/utils/sharedtuplestore.h
@@ -39,10 +39,17 @@ extern SharedTuplestoreAccessor *sts_initialize(SharedTuplestore *sts,
 												SharedFileSet *fileset,
 												const char *name);
 
+extern void sts_create(SharedTuplestore *sts, int participants,
+					   size_t meta_data_size,
+					   int flags,
+					   const char *name);
+
 extern SharedTuplestoreAccessor *sts_attach(SharedTuplestore *sts,
 											int my_participant_number,
 											SharedFileSet *fileset);
 
+extern int64 sts_close(SharedTuplestoreAccessor *accessor);
+
 extern void sts_end_write(SharedTuplestoreAccessor *accessor);
 
 extern void sts_reinitialize(SharedTuplestoreAccessor *accessor);
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index 6007827b44..2487737e42 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -125,7 +125,9 @@ typedef enum
 	WAIT_EVENT_SYNC_REP,
 	WAIT_EVENT_WAL_RECEIVER_EXIT,
 	WAIT_EVENT_WAL_RECEIVER_WAIT_START,
-	WAIT_EVENT_XACT_GROUP_UPDATE
+	WAIT_EVENT_XACT_GROUP_UPDATE,
+	WAIT_EVENT_REDISTRIBUTE_PARALLEL_START,
+	WAIT_EVENT_REDISTRIBUTE_SHARED_TUPLESTORE
 } WaitEventIPC;
 
 /* ----------
diff --git a/src/test/modules/test_shm_mq/expected/test_shm_mq.out b/src/test/modules/test_shm_mq/expected/test_shm_mq.out
index c4858b0c20..ed5d526d89 100644
--- a/src/test/modules/test_shm_mq/expected/test_shm_mq.out
+++ b/src/test/modules/test_shm_mq/expected/test_shm_mq.out
@@ -34,3 +34,9 @@ SELECT test_shm_mq_pipelined(16384, (select string_agg(chr(32+(random()*95)::int
  
 (1 row)
 
+SELECT test_shm_mq_pipelined(16384, (select string_agg(chr(32+(random()*95)::int), '') from generate_series(1,4000)), 600, 3, true, true);
+ test_shm_mq_pipelined 
+-----------------------
+ 
+(1 row)
+
diff --git a/src/test/modules/test_shm_mq/sql/test_shm_mq.sql b/src/test/modules/test_shm_mq/sql/test_shm_mq.sql
index 9de19d304a..247417b616 100644
--- a/src/test/modules/test_shm_mq/sql/test_shm_mq.sql
+++ b/src/test/modules/test_shm_mq/sql/test_shm_mq.sql
@@ -10,3 +10,4 @@ SELECT test_shm_mq(1024, 'a', 2001, 1);
 SELECT test_shm_mq(32768, (select string_agg(chr(32+(random()*95)::int), '') from generate_series(1,(100+900*random())::int)), 10000, 1);
 SELECT test_shm_mq(100, (select string_agg(chr(32+(random()*95)::int), '') from generate_series(1,(100+200*random())::int)), 10000, 1);
 SELECT test_shm_mq_pipelined(16384, (select string_agg(chr(32+(random()*95)::int), '') from generate_series(1,270000)), 200, 3);
+SELECT test_shm_mq_pipelined(16384, (select string_agg(chr(32+(random()*95)::int), '') from generate_series(1,4000)), 600, 3, true, true);
diff --git a/src/test/modules/test_shm_mq/test.c b/src/test/modules/test_shm_mq/test.c
index 2d8d695f97..adef9e106b 100644
--- a/src/test/modules/test_shm_mq/test.c
+++ b/src/test/modules/test_shm_mq/test.c
@@ -136,6 +136,7 @@ test_shm_mq_pipelined(PG_FUNCTION_ARGS)
 	int32		loop_count = PG_GETARG_INT32(2);
 	int32		nworkers = PG_GETARG_INT32(3);
 	bool		verify = PG_GETARG_BOOL(4);
+	bool		use_once = PG_GETARG_BOOL(5);
 	int32		send_count = 0;
 	int32		receive_count = 0;
 	dsm_segment *seg;
@@ -177,7 +178,10 @@ test_shm_mq_pipelined(PG_FUNCTION_ARGS)
 		 */
 		if (send_count < loop_count)
 		{
-			res = shm_mq_send(outqh, message_size, message_contents, true);
+			if (use_once)
+				res = shm_mq_send_once(outqh, message_size, message_contents);
+			else
+				res = shm_mq_send(outqh, message_size, message_contents, true);
 			if (res == SHM_MQ_SUCCESS)
 			{
 				++send_count;
diff --git a/src/test/modules/test_shm_mq/test_shm_mq--1.0.sql b/src/test/modules/test_shm_mq/test_shm_mq--1.0.sql
index 56db05d93d..e8fc9e9348 100644
--- a/src/test/modules/test_shm_mq/test_shm_mq--1.0.sql
+++ b/src/test/modules/test_shm_mq/test_shm_mq--1.0.sql
@@ -14,6 +14,7 @@ CREATE FUNCTION test_shm_mq_pipelined(queue_size pg_catalog.int8,
 					   message pg_catalog.text,
 					   repeat_count pg_catalog.int4 default 1,
 					   num_workers pg_catalog.int4 default 1,
-					   verify pg_catalog.bool default true)
+					   verify pg_catalog.bool default true,
+					   use_once pg_catalog.bool default false)
     RETURNS pg_catalog.void STRICT
 	AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out
index dfa4b036b5..698c761bda 100644
--- a/src/test/regress/expected/partition_aggregate.out
+++ b/src/test/regress/expected/partition_aggregate.out
@@ -1517,3 +1517,78 @@ SELECT x, sum(y), avg(y), count(*) FROM pagg_tab_para GROUP BY x HAVING avg(y) <
  21 | 6000 | 6.0000000000000000 |  1000
 (6 rows)
 
+-- simple agg in parallel
+BEGIN;
+SET redistribute_query_size = '128kB';
+SET min_parallel_table_scan_size = 0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+EXPLAIN (COSTS OFF)
+SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
+                  QUERY PLAN                  
+----------------------------------------------
+ Gather
+   Workers Planned: 2
+   ->  HashAggregate
+         Group Key: unique2
+         ->  Parallel Redistribute
+               Hash Key: unique2
+               ->  Parallel Seq Scan on tenk1
+(7 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  HashAggregate
+                     Group Key: tenk1.unique2
+                     ->  Parallel Redistribute
+                           Hash Key: tenk1.unique2
+                           ->  Parallel Seq Scan on tenk1
+(9 rows)
+
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  Parallel Hash Join
+                     Hash Cond: ((count(*)) = tenk1.unique2)
+                     ->  HashAggregate
+                           Group Key: tenk1_1.unique2
+                           ->  Parallel Redistribute
+                                 Hash Key: tenk1_1.unique2
+                                 ->  Parallel Seq Scan on tenk1 tenk1_1
+                     ->  Parallel Hash
+                           ->  Parallel Seq Scan on tenk1
+(13 rows)
+
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+ count 
+-------
+ 10000
+(1 row)
+
+ABORT;
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index 58122c6f88..2c0b865677 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -375,3 +375,49 @@ SELECT null IS NOT DISTINCT FROM null as "yes";
  t
 (1 row)
 
+-- parallel distinct
+BEGIN;
+SET redistribute_query_size = '128kB';
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+EXPLAIN (costs off)
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  HashAggregate
+               Group Key: tenk1.unique2
+               ->  Parallel Redistribute
+                     Hash Key: tenk1.unique2
+                     ->  Parallel Seq Scan on tenk1
+(8 rows)
+
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+explain (costs off)
+SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+                           QUERY PLAN                           
+----------------------------------------------------------------
+ Gather Merge
+   Workers Planned: 2
+   ->  Unique
+         ->  Sort
+               Sort Key: tenk1.unique2
+               ->  Parallel Redistribute
+                     Hash Key: tenk1.unique2
+                     ->  HashAggregate
+                           Group Key: tenk1.unique2
+                           ->  Parallel Redistribute
+                                 Hash Key: tenk1.unique2
+                                 ->  Parallel Seq Scan on tenk1
+(12 rows)
+
+ABORT;
diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out
index 4ea1aa7dfd..34a61b62e0 100644
--- a/src/test/regress/expected/select_parallel.out
+++ b/src/test/regress/expected/select_parallel.out
@@ -845,6 +845,54 @@ select * from
 
 reset enable_material;
 reset enable_hashagg;
+--test rescan for redistribute
+SET redistribute_query_size = '128kB';
+SET enable_sort = off;
+explain (costs off)
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Sort
+   Sort Key: "*VALUES*".column1, tenk1.string4
+   ->  Nested Loop Left Join
+         ->  Values Scan on "*VALUES*"
+         ->  Materialize
+               ->  Gather
+                     Workers Planned: 4
+                     ->  Finalize HashAggregate
+                           Group Key: tenk1.string4
+                           ->  Parallel Redistribute
+                                 Hash Key: tenk1.string4
+                                 ->  Partial HashAggregate
+                                       Group Key: tenk1.string4
+                                       ->  Parallel Seq Scan on tenk1
+(14 rows)
+
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+ string4 | count | x 
+---------+-------+---
+ AAAAxx  |  2500 | 1
+ HHHHxx  |  2500 | 1
+ OOOOxx  |  2500 | 1
+ VVVVxx  |  2500 | 1
+ AAAAxx  |  2500 | 2
+ HHHHxx  |  2500 | 2
+ OOOOxx  |  2500 | 2
+ VVVVxx  |  2500 | 2
+ AAAAxx  |  2500 | 3
+ HHHHxx  |  2500 | 3
+ OOOOxx  |  2500 | 3
+ VVVVxx  |  2500 | 3
+(12 rows)
+
+reset enable_sort;
+reset redistribute_query_size;
 -- check parallelized int8 aggregate (bug #14897)
 explain (costs off)
 select avg(unique1::int8) from tenk1;
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index dece7310cf..29fb62bb61 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -1432,3 +1432,98 @@ where (x = 0) or (q1 >= q2 and q1 <= q2);
  4567890123456789 |  4567890123456789 | 1
 (6 rows)
 
+-- parallel union
+BEGIN;
+SET redistribute_query_size = '128kB';
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+-- using redistribute+sort sort
+SET enable_hashagg = OFF;
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  Unique
+                     ->  Sort
+                           Sort Key: tenk1.unique2
+                           ->  Parallel Redistribute
+                                 Hash Key: tenk1.unique2
+                                 ->  Parallel Append
+                                       ->  Parallel Seq Scan on tenk1
+                                       ->  Parallel Seq Scan on tenk1 tenk1_1
+(12 rows)
+
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+ count 
+-------
+ 10000
+(1 row)
+
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+                                        QUERY PLAN                                        
+------------------------------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  Parallel Hash Join
+                     Hash Cond: (tenk1.unique2 = tenk1_1.unique2)
+                     ->  Parallel Seq Scan on tenk1
+                     ->  Parallel Hash
+                           ->  Unique
+                                 ->  Sort
+                                       Sort Key: tenk1_1.unique2
+                                       ->  Parallel Redistribute
+                                             Hash Key: tenk1_1.unique2
+                                             ->  Parallel Append
+                                                   ->  Parallel Seq Scan on tenk1 tenk1_1
+                                                   ->  Parallel Seq Scan on tenk1 tenk1_2
+(16 rows)
+
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+ count 
+-------
+ 10000
+(1 row)
+
+reset enable_hashagg;
+-- using batch hash
+SET enable_sort = OFF;
+EXPLAIN (costs off)
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 2
+         ->  Partial Aggregate
+               ->  HashAggregate
+                     Group Key: tenk1.hundred
+                     ->  Parallel Redistribute
+                           Hash Key: tenk1.hundred
+                           ->  Parallel Append
+                                 ->  Parallel Seq Scan on tenk1
+                                 ->  Parallel Seq Scan on tenk1 tenk1_1
+(11 rows)
+
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+ count 
+-------
+   100
+(1 row)
+
+reset enable_sort;
+ABORT;
diff --git a/src/test/regress/sql/partition_aggregate.sql b/src/test/regress/sql/partition_aggregate.sql
index c17294b15b..091f5d3eaa 100644
--- a/src/test/regress/sql/partition_aggregate.sql
+++ b/src/test/regress/sql/partition_aggregate.sql
@@ -331,3 +331,29 @@ RESET parallel_setup_cost;
 EXPLAIN (COSTS OFF)
 SELECT x, sum(y), avg(y), count(*) FROM pagg_tab_para GROUP BY x HAVING avg(y) < 7 ORDER BY 1, 2, 3;
 SELECT x, sum(y), avg(y), count(*) FROM pagg_tab_para GROUP BY x HAVING avg(y) < 7 ORDER BY 1, 2, 3;
+
+-- simple agg in parallel
+BEGIN;
+SET redistribute_query_size = '128kB';
+SET min_parallel_table_scan_size = 0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+
+EXPLAIN (COSTS OFF)
+SELECT unique2,count(*) FROM tenk1 GROUP BY 1;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+SELECT count(*) FROM (SELECT unique2,count(*) FROM tenk1 GROUP BY 1) foo;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+SELECT count(*) FROM
+  (SELECT unique2,count(*) id FROM tenk1 GROUP BY 1) t1
+    INNER JOIN
+  (SELECT unique2  id FROM tenk1) t2
+  USING(id);
+ABORT;
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index 1bfe59c26f..d90ef7bc27 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -174,3 +174,18 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no";
 SELECT 2 IS NOT DISTINCT FROM 2 as "yes";
 SELECT 2 IS NOT DISTINCT FROM null as "no";
 SELECT null IS NOT DISTINCT FROM null as "yes";
+
+-- parallel distinct
+BEGIN;
+SET redistribute_query_size = '128kB';
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+
+EXPLAIN (costs off)
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+SELECT COUNT(*) FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+explain (costs off)
+SELECT DISTINCT * FROM (SELECT DISTINCT unique2 FROM tenk1) foo;
+ABORT;
\ No newline at end of file
diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql
index f924731248..c7603ab02a 100644
--- a/src/test/regress/sql/select_parallel.sql
+++ b/src/test/regress/sql/select_parallel.sql
@@ -314,6 +314,25 @@ reset enable_material;
 
 reset enable_hashagg;
 
+--test rescan for redistribute
+SET redistribute_query_size = '128kB';
+SET enable_sort = off;
+
+explain (costs off)
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+
+select * from
+  (select string4, count(unique2)
+   from tenk1 group by string4) ss
+  right join (values (1),(2),(3)) v(x) on true order by 3,1;
+
+reset enable_sort;
+
+reset redistribute_query_size;
+
 -- check parallelized int8 aggregate (bug #14897)
 explain (costs off)
 select avg(unique1::int8) from tenk1;
diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql
index ca8c9b4d12..acaf88759f 100644
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -540,3 +540,35 @@ select * from
    union all
    select *, 1 as x from int8_tbl b) ss
 where (x = 0) or (q1 >= q2 and q1 <= q2);
+
+-- parallel union
+BEGIN;
+SET redistribute_query_size = '128kB';
+SET min_parallel_table_scan_size =0;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET enable_indexonlyscan = OFF;
+
+-- using redistribute+sort sort
+SET enable_hashagg = OFF;
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) foo;
+EXPLAIN (costs off)
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+SELECT count(*) FROM (SELECT unique2 FROM tenk1 UNION SELECT unique2 FROM tenk1) t1 INNER JOIN tenk1 USING(unique2);
+reset enable_hashagg;
+
+-- using batch hash
+SET enable_sort = OFF;
+EXPLAIN (costs off)
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+SELECT count(*) from (
+  SELECT hundred from tenk1
+    union
+  SELECT hundred from tenk1) foo;
+reset enable_sort;
+ABORT;
-- 
2.25.1

#27Daniel Gustafsson
daniel@yesql.se
In reply to: David Steele (#22)
Re: parallel distinct union and aggregate support patch

On 29 Mar 2021, at 15:36, David Steele <david@pgmasters.net> wrote:

A rebase is also required so marked Waiting for Author.

Many months on and this patch still needs a rebase to apply, and the thread has
stalled. I'm marking this Returned with Feedback. Please feel free to open a
new entry if you return to this patch.

--
Daniel Gustafsson https://vmware.com/