should INSERT SELECT use a BulkInsertState?

Started by Justin Pryzbyover 5 years ago27 messages

pryzby@telsasoft.com

over 5 years ago

Seems to me it should, at least conditionally. At least if there's a function
scan or a relation or ..

I mentioned a bit about our use-case here:
/messages/by-id/20200219173742.GA30939@telsasoft.com
=> I'd prefer our loaders to write their own data rather than dirtying large
fractions of buffer cache and leaving it around for other backends to clean up.

commit 7f9e061363e58f30eee0cccc8a0e46f637bf137b
Author: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri May 8 02:17:32 2020 -0500

Make INSERT SELECT use a BulkInsertState

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 20a4c474cc..6da4325225 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -578,7 +578,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   mtstate->bistate,
 										   specToken);

/* insert index entries for tuple */
@@ -617,7 +617,7 @@ ExecInsert(ModifyTableState *mtstate,
/* insert the tuple normally */
table_tuple_insert(resultRelationDesc, slot,
estate->es_output_cid,
- 0, NULL);
+ 0, mtstate->bistate);

/* insert index entries for tuple */
if (resultRelInfo->ri_NumIndices > 0)
@@ -2332,6 +2332,14 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)

 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+	if (operation == CMD_INSERT)
+	{
+		Plan *p = linitial(node->plans);
+		Assert(nplans == 1);
+		if (!IsA(p, Result) && !IsA(p, ValuesScan))
+			mtstate->bistate = GetBulkInsertState();
+	}

/* set up epqstate with dummy subplan data for the moment */
EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2809,6 +2817,9 @@ ExecEndModifyTable(ModifyTableState *node)
*/
for (i = 0; i < node->mt_nplans; i++)
ExecEndNode(node->mt_plans[i]);
+
+ if (node->bistate)
+ FreeBulkInsertState(node->bistate);
}

 void
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 4fee043bb2..daf365f181 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H

+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -1177,6 +1178,7 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* State for bulk insert like INSERT SELECT */

/*
* Slot for storing tuples in the root partitioned table's rowtype during

pryzby@telsasoft.com

over 5 years ago

In reply to: Justin Pryzby (#1)

1 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Fri, May 08, 2020 at 02:25:45AM -0500, Justin Pryzby wrote:

Seems to me it should, at least conditionally. At least if there's a function
scan or a relation or ..

I mentioned a bit about our use-case here:
/messages/by-id/20200219173742.GA30939@telsasoft.com
=> I'd prefer our loaders to write their own data rather than dirtying large
fractions of buffer cache and leaving it around for other backends to clean up.

Nobody suggested otherwise so I added here and cleaned up to pass tests.
https://commitfest.postgresql.org/28/2553/

--
Justin

Attachments:

v1-0001-Make-INSERT-SELECT-use-a-BulkInsertState.patchtext/x-diff; charset=us-asciiDownload

From ba5cf05960a097cf82c10a29af81f4f66a9274a6 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v1] Make INSERT SELECT use a BulkInsertState

---
 src/backend/executor/nodeModifyTable.c | 21 +++++++++++++++++++--
 src/include/nodes/execnodes.h          |  2 ++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 20a4c474cc..aa85245f39 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -578,7 +578,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   NULL, /* Bulk insert not supported */
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -617,7 +617,7 @@ ExecInsert(ModifyTableState *mtstate,
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -2332,6 +2332,17 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+	if (operation == CMD_INSERT &&
+			node->onConflictAction != ONCONFLICT_UPDATE &&
+			node->rootResultRelIndex < 0)
+	{
+		Plan *p = linitial(node->plans);
+		Assert(nplans == 1);
+
+		if (!IsA(p, Result) && !IsA(p, ProjectSet) && !IsA(p, ValuesScan))
+			mtstate->bistate = GetBulkInsertState();
+	}
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2776,6 +2787,12 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert((getTargetResultRelInfo(node))->ri_RelationDesc, 0);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 4fee043bb2..daf365f181 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -1177,6 +1178,7 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* State for bulk insert like INSERT SELECT */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
-- 
2.17.0

Michael Paquier

michael@paquier.xyz

over 5 years ago

In reply to: Justin Pryzby (#1)

Re: should INSERT SELECT use a BulkInsertState?

On Fri, May 08, 2020 at 02:25:45AM -0500, Justin Pryzby wrote:

Seems to me it should, at least conditionally. At least if there's a function
scan or a relation or ..

I mentioned a bit about our use-case here:
/messages/by-id/20200219173742.GA30939@telsasoft.com
=> I'd prefer our loaders to write their own data rather than dirtying large
fractions of buffer cache and leaving it around for other backends to clean up.

Does it matter in terms of performance and for which cases does it
actually matter?

diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 4fee043bb2..daf365f181 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
#ifndef EXECNODES_H
#define EXECNODES_H

+#include "access/heapam.h"
#include "access/tupconvert.h"
#include "executor/instrument.h"
#include "fmgr.h"
@@ -1177,6 +1178,7 @@ typedef struct ModifyTableState
List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* State for bulk insert like INSERT SELECT */

I think that this needs more thoughts. You are introducing a
dependency between some generic execution-related nodes and heap, a
table AM.
--
Michael

andres@anarazel.de

over 5 years ago

In reply to: Justin Pryzby (#1)

Re: should INSERT SELECT use a BulkInsertState?

Hi,

On 2020-05-08 02:25:45 -0500, Justin Pryzby wrote:

Seems to me it should, at least conditionally. At least if there's a function
scan or a relation or ..

Well, the problem is that this can cause very very significant
regressions. As in 10x slower or more. The ringbuffer can cause constant
XLogFlush() calls (due to the lsn interlock), and the eviction from
shared_buffers (regardless of actual available) will mean future vacuums
etc will be much slower. I think this is likely to cause pretty
widespread regressions on upgrades.

Now, it sucks that we have this problem in the general facility that's
supposed to be used for this kind of bulk operation. But I don't really
see it realistic as expanding use of bulk insert strategies unless we
have some more fundamental fixes.

Regards,

Andres

Daniel Gustafsson

daniel@yesql.se

over 5 years ago

In reply to: Andres Freund (#4)

Re: should INSERT SELECT use a BulkInsertState?

On 4 Jun 2020, at 19:30, Andres Freund <andres@anarazel.de> wrote:
On 2020-05-08 02:25:45 -0500, Justin Pryzby wrote:

Seems to me it should, at least conditionally. At least if there's a function
scan or a relation or ..

Well, the problem is that this can cause very very significant
regressions. As in 10x slower or more. The ringbuffer can cause constant
XLogFlush() calls (due to the lsn interlock), and the eviction from
shared_buffers (regardless of actual available) will mean future vacuums
etc will be much slower. I think this is likely to cause pretty
widespread regressions on upgrades.

Now, it sucks that we have this problem in the general facility that's
supposed to be used for this kind of bulk operation. But I don't really
see it realistic as expanding use of bulk insert strategies unless we
have some more fundamental fixes.

Based on the above, and the lack of activity in the thread, it sounds like this
patch should be marked Returned with Feedback; but Justin: you set it to
Waiting on Author at the start of the commitfest, are you working on a new
version?

cheers ./daniel

pryzby@telsasoft.com

over 5 years ago

In reply to: Andres Freund (#4)

1 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Thu, Jun 04, 2020 at 10:30:47AM -0700, Andres Freund wrote:

On 2020-05-08 02:25:45 -0500, Justin Pryzby wrote:

Seems to me it should, at least conditionally. At least if there's a function
scan or a relation or ..

Well, the problem is that this can cause very very significant
regressions. As in 10x slower or more. The ringbuffer can cause constant
XLogFlush() calls (due to the lsn interlock), and the eviction from
shared_buffers (regardless of actual available) will mean future vacuums
etc will be much slower. I think this is likely to cause pretty
widespread regressions on upgrades.

Now, it sucks that we have this problem in the general facility that's
supposed to be used for this kind of bulk operation. But I don't really
see it realistic as expanding use of bulk insert strategies unless we
have some more fundamental fixes.

I made this conditional on BEGIN BULK/SET bulk, so I'll solicit comments on that.

postgres=# \t on \\ \set QUIET \\ VACUUM FULL t; \dt+ t \\ begin ; \timing on \\ INSERT INTO t SELECT * FROM t; rollback; SELECT COUNT(1), usagecount FROM pg_buffercache GROUP BY 2 ORDER BY 2;
| public | t | table | pryzbyj | 35 MB |
|Time: 9497.318 ms (00:09.497)
| 33 | 1
| 3 | 2
| 18 | 3
| 5 | 4
| 4655 | 5
| 11670 |

vs

postgres=# \t on \\ \set QUIET \\ VACUUM FULL t; \dt+ t \\ begin BULK ; \timing on \\ INSERT INTO t SELECT * FROM t; rollback; SELECT COUNT(1), usagecount FROM pg_buffercache GROUP BY 2 ORDER BY 2;
| public | t | table | pryzbyj | 35 MB |
|Time: 8268.780 ms (00:08.269)
| 2080 | 1
| 3 | 2
| 19 | 4
| 234 | 5
| 14048 |

And:

postgres=# begin ; \x \\ \t \\ SELECT statement_timestamp(); \o /dev/null \\ SELECT 'INSERT INTO t VALUES(0)' FROM generate_series(1,999999); \set ECHO errors \\ \set QUIET on \\ \o \\ \gexec \\ SELECT statement_timestamp(); abort; \x \\ SELECT COUNT(1), usagecount FROM pg_buffercache GROUP BY 2 ORDER BY 2; a
|statement_timestamp | 2020-07-12 20:31:43.717328-05
|statement_timestamp | 2020-07-12 20:36:16.692469-05
|
| 52 | 1
| 24 | 2
| 17 | 3
| 6 | 4
| 4531 | 5
| 11754 |

vs

postgres=# begin BULK ; \x \\ \t \\ SELECT statement_timestamp(); \o /dev/null \\ SELECT 'INSERT INTO t VALUES(0)' FROM generate_series(1,999999); \set ECHO errors \\ \set QUIET on \\ \o \\ \gexec \\ SELECT statement_timestamp(); abort; \x \\ SELECT COUNT(1), usagecount FROM pg_buffercache GROUP BY 2 ORDER BY 2; a
|statement_timestamp | 2020-07-12 20:43:47.089538-05
|statement_timestamp | 2020-07-12 20:48:04.798138-05
|
| 4456 | 1
| 22 | 2
| 1 | 3
| 7 | 4
| 79 | 5
| 11819 |

--
Justin

Attachments:

v2-0001-Allow-INSERT-SELECT-to-use-a-BulkInsertState.patchtext/x-diff; charset=us-asciiDownload

From 0362537e0f3a8496ac760574931db66c59f7c1ba Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v2] Allow INSERT SELECT to use a BulkInsertState

---
 src/backend/executor/nodeModifyTable.c | 23 +++++++++++++++++++++--
 src/backend/parser/gram.y              |  6 +++++-
 src/backend/tcop/utility.c             |  4 ++++
 src/backend/utils/misc/guc.c           | 12 +++++++++++-
 src/include/executor/nodeModifyTable.h |  2 ++
 src/include/nodes/execnodes.h          |  2 ++
 src/include/parser/kwlist.h            |  1 +
 7 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 20a4c474cc..5ff4a2e901 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -75,6 +75,8 @@ static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node);
 static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate);
 static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node,
 												   int whichplan);
+/* guc */
+bool insert_in_bulk = false;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -578,7 +580,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   NULL, /* Bulk insert not supported */
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -617,7 +619,7 @@ ExecInsert(ModifyTableState *mtstate,
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -2332,6 +2334,17 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+	if (operation == CMD_INSERT &&
+			node->onConflictAction != ONCONFLICT_UPDATE &&
+			node->rootResultRelIndex < 0)
+	{
+		// Plan *p = linitial(node->plans);
+		Assert(nplans == 1);
+
+		if (insert_in_bulk)
+			mtstate->bistate = GetBulkInsertState();
+	}
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2776,6 +2789,12 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert((getTargetResultRelInfo(node))->ri_RelationDesc, 0);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index dbb47d4982..38c4d1d1e7 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -629,7 +629,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 	ASSERTION ASSIGNMENT ASYMMETRIC AT ATTACH ATTRIBUTE AUTHORIZATION
 
 	BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
-	BOOLEAN_P BOTH BY
+	BOOLEAN_P BOTH BY BULK
 
 	CACHE CALL CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
 	CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
@@ -9841,6 +9841,9 @@ transaction_mode_item:
 			| NOT DEFERRABLE
 					{ $$ = makeDefElem("transaction_deferrable",
 									   makeIntConst(false, @1), @1); }
+			| BULK
+					{ $$ = makeDefElem("bulk",
+									   makeIntConst(true, @1), @1); }
 		;
 
 /* Syntax with commas is SQL-spec, without commas is Postgres historical */
@@ -15041,6 +15044,7 @@ unreserved_keyword:
 			| BACKWARD
 			| BEFORE
 			| BEGIN_P
+			| BULK
 			| BY
 			| CACHE
 			| CALL
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 9b0c376c8c..061a022374 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -611,6 +611,10 @@ standard_ProcessUtility(PlannedStmt *pstmt,
 									SetPGVariable("transaction_deferrable",
 												  list_make1(item->arg),
 												  true);
+								else if (strcmp(item->defname, "bulk") == 0)
+									SetPGVariable("bulk_insert",
+												  list_make1(item->arg),
+												  true);
 							}
 						}
 						break;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 442e5af2b2..66a7dcf5a9 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -51,6 +51,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "executor/nodeModifyTable.h"
 #include "optimizer/geqo.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paths.h"
@@ -917,7 +918,6 @@ static const unit_conversion time_unit_conversion_table[] =
  *	  variable_is_guc_list_quote() in src/bin/pg_dump/dumputils.c.
  */
 
-
 /******** option records follow ********/
 
 static struct config_bool ConfigureNamesBool[] =
@@ -2041,6 +2041,16 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"bulk_insert", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Sets the transaction to bulk insert mode."),
+			gettext_noop("A ring buffer of limited size will be used."),
+		},
+		&insert_in_bulk,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 4ec4ebdabc..c0a16e6098 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -15,6 +15,8 @@
 
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT bool insert_in_bulk;
+
 extern void ExecComputeStoredGenerated(EState *estate, TupleTableSlot *slot, CmdType cmdtype);
 
 extern ModifyTableState *ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 0187989fd1..2679cbbb9b 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -1177,6 +1178,7 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* State for bulk insert like INSERT SELECT */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 08f22ce211..80dfbac702 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -60,6 +60,7 @@ PG_KEYWORD("binary", BINARY, TYPE_FUNC_NAME_KEYWORD)
 PG_KEYWORD("bit", BIT, COL_NAME_KEYWORD)
 PG_KEYWORD("boolean", BOOLEAN_P, COL_NAME_KEYWORD)
 PG_KEYWORD("both", BOTH, RESERVED_KEYWORD)
+PG_KEYWORD("bulk", BULK, UNRESERVED_KEYWORD)
 PG_KEYWORD("by", BY, UNRESERVED_KEYWORD)
 PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD)
 PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD)
-- 
2.17.0

pryzby@telsasoft.com

over 5 years ago

In reply to: Justin Pryzby (#6)

1 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Sun, Jul 12, 2020 at 08:57:00PM -0500, Justin Pryzby wrote:

On Thu, Jun 04, 2020 at 10:30:47AM -0700, Andres Freund wrote:

On 2020-05-08 02:25:45 -0500, Justin Pryzby wrote:

Seems to me it should, at least conditionally. At least if there's a function
scan or a relation or ..

Well, the problem is that this can cause very very significant
regressions. As in 10x slower or more. The ringbuffer can cause constant
XLogFlush() calls (due to the lsn interlock), and the eviction from
shared_buffers (regardless of actual available) will mean future vacuums
etc will be much slower. I think this is likely to cause pretty
widespread regressions on upgrades.

Now, it sucks that we have this problem in the general facility that's
supposed to be used for this kind of bulk operation. But I don't really
see it realistic as expanding use of bulk insert strategies unless we
have some more fundamental fixes.

I made this conditional on BEGIN BULK/SET bulk, so I'll solicit comments on that.

@cfbot: rebased

Attachments:

v3-0001-Allow-INSERT-SELECT-to-use-a-BulkInsertState.patchtext/x-diff; charset=us-asciiDownload

From acfc6ef7b84a6753a49b7f4c9d5b77a0abbfd37c Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v3] Allow INSERT SELECT to use a BulkInsertState

---
 src/backend/executor/nodeModifyTable.c | 23 +++++++++++++++++++++--
 src/backend/parser/gram.y              |  7 ++++++-
 src/backend/tcop/utility.c             |  4 ++++
 src/backend/utils/misc/guc.c           | 12 +++++++++++-
 src/include/executor/nodeModifyTable.h |  2 ++
 src/include/nodes/execnodes.h          |  2 ++
 src/include/parser/kwlist.h            |  1 +
 7 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 9812089161..464ad5e346 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -75,6 +75,8 @@ static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node);
 static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate);
 static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node,
 												   int whichplan);
+/* guc */
+bool insert_in_bulk = false;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -578,7 +580,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   NULL, /* Bulk insert not supported */
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -617,7 +619,7 @@ ExecInsert(ModifyTableState *mtstate,
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -2332,6 +2334,17 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+	if (operation == CMD_INSERT &&
+			node->onConflictAction != ONCONFLICT_UPDATE &&
+			node->rootResultRelIndex < 0)
+	{
+		// Plan *p = linitial(node->plans);
+		Assert(nplans == 1);
+
+		if (insert_in_bulk)
+			mtstate->bistate = GetBulkInsertState();
+	}
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2776,6 +2789,12 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert((getTargetResultRelInfo(node))->ri_RelationDesc, 0);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 017940bdcd..0bc2108a2b 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -631,7 +631,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 	ASSERTION ASSIGNMENT ASYMMETRIC AT ATTACH ATTRIBUTE AUTHORIZATION
 
 	BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
-	BOOLEAN_P BOTH BY
+	BOOLEAN_P BOTH BY BULK
 
 	CACHE CALL CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
 	CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
@@ -9846,6 +9846,9 @@ transaction_mode_item:
 			| NOT DEFERRABLE
 					{ $$ = makeDefElem("transaction_deferrable",
 									   makeIntConst(false, @1), @1); }
+			| BULK
+					{ $$ = makeDefElem("bulk",
+									   makeIntConst(true, @1), @1); }
 		;
 
 /* Syntax with commas is SQL-spec, without commas is Postgres historical */
@@ -15052,6 +15055,7 @@ unreserved_keyword:
 			| BACKWARD
 			| BEFORE
 			| BEGIN_P
+			| BULK
 			| BY
 			| CACHE
 			| CALL
@@ -15563,6 +15567,7 @@ bare_label_keyword:
 			| BIT
 			| BOOLEAN_P
 			| BOTH
+			| BULK
 			| BY
 			| CACHE
 			| CALL
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 9a35147b26..cb3933b0e9 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -611,6 +611,10 @@ standard_ProcessUtility(PlannedStmt *pstmt,
 									SetPGVariable("transaction_deferrable",
 												  list_make1(item->arg),
 												  true);
+								else if (strcmp(item->defname, "bulk") == 0)
+									SetPGVariable("bulk_insert",
+												  list_make1(item->arg),
+												  true);
 							}
 						}
 						break;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 596bcb7b84..409c0d6a5a 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -54,6 +54,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "executor/nodeModifyTable.h"
 #include "optimizer/geqo.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paths.h"
@@ -922,7 +923,6 @@ static const unit_conversion time_unit_conversion_table[] =
  *	  variable_is_guc_list_quote() in src/bin/pg_dump/dumputils.c.
  */
 
-
 /******** option records follow ********/
 
 static struct config_bool ConfigureNamesBool[] =
@@ -2036,6 +2036,16 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"bulk_insert", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Sets the transaction to bulk insert mode."),
+			gettext_noop("A ring buffer of limited size will be used."),
+		},
+		&insert_in_bulk,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 4ec4ebdabc..c0a16e6098 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -15,6 +15,8 @@
 
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT bool insert_in_bulk;
+
 extern void ExecComputeStoredGenerated(EState *estate, TupleTableSlot *slot, CmdType cmdtype);
 
 extern ModifyTableState *ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index a5ab1aed14..9d480f1fb1 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -1176,6 +1177,7 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* State for bulk insert like INSERT SELECT */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 71dcdf2889..0991da11e7 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -60,6 +60,7 @@ PG_KEYWORD("binary", BINARY, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("bit", BIT, COL_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("boolean", BOOLEAN_P, COL_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("both", BOTH, RESERVED_KEYWORD, BARE_LABEL)
+PG_KEYWORD("bulk", BULK, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("by", BY, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD, BARE_LABEL)
-- 
2.17.0

pryzby@telsasoft.com

about 5 years ago

In reply to: Justin Pryzby (#7)

1 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Sat, Sep 19, 2020 at 08:32:15AM -0500, Justin Pryzby wrote:

On Sun, Jul 12, 2020 at 08:57:00PM -0500, Justin Pryzby wrote:

On Thu, Jun 04, 2020 at 10:30:47AM -0700, Andres Freund wrote:

On 2020-05-08 02:25:45 -0500, Justin Pryzby wrote:

Seems to me it should, at least conditionally. At least if there's a function
scan or a relation or ..

Well, the problem is that this can cause very very significant
regressions. As in 10x slower or more. The ringbuffer can cause constant
XLogFlush() calls (due to the lsn interlock), and the eviction from
shared_buffers (regardless of actual available) will mean future vacuums
etc will be much slower. I think this is likely to cause pretty
widespread regressions on upgrades.

Now, it sucks that we have this problem in the general facility that's
supposed to be used for this kind of bulk operation. But I don't really
see it realistic as expanding use of bulk insert strategies unless we
have some more fundamental fixes.

I made this conditional on BEGIN BULK/SET bulk, so I'll solicit comments on that.

@cfbot: rebased

again

--
Justin

Attachments:

v4-0001-Allow-INSERT-SELECT-to-use-a-BulkInsertState.patchtext/x-diff; charset=us-asciiDownload

From 7f856d0597a34a98be848c337612f1671497f52f Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v4] Allow INSERT SELECT to use a BulkInsertState

---
 src/backend/executor/nodeModifyTable.c | 22 ++++++++++++++++++++--
 src/backend/parser/gram.y              |  7 ++++++-
 src/backend/tcop/utility.c             |  4 ++++
 src/backend/utils/misc/guc.c           | 11 +++++++++++
 src/include/executor/nodeModifyTable.h |  2 ++
 src/include/nodes/execnodes.h          |  2 ++
 src/include/parser/kwlist.h            |  1 +
 7 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 0c055ed408..d19d6d1a85 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -76,6 +76,8 @@ static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node);
 static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate);
 static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node,
 												   int whichplan);
+/* guc */
+bool insert_in_bulk = false;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -598,7 +600,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   NULL, /* Bulk insert not supported */
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -638,7 +640,7 @@ ExecInsert(ModifyTableState *mtstate,
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -2365,6 +2367,16 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+	if (operation == CMD_INSERT &&
+			node->onConflictAction != ONCONFLICT_UPDATE &&
+			node->rootRelation == 0)
+	{
+		Assert(nplans == 1);
+
+		if (insert_in_bulk)
+			mtstate->bistate = GetBulkInsertState();
+	}
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2805,6 +2817,12 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert((getTargetResultRelInfo(node))->ri_RelationDesc, 0);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 480d168346..b87e31e36a 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -631,7 +631,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 	ASSERTION ASSIGNMENT ASYMMETRIC AT ATTACH ATTRIBUTE AUTHORIZATION
 
 	BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
-	BOOLEAN_P BOTH BY
+	BOOLEAN_P BOTH BY BULK
 
 	CACHE CALL CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
 	CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
@@ -9873,6 +9873,9 @@ transaction_mode_item:
 			| NOT DEFERRABLE
 					{ $$ = makeDefElem("transaction_deferrable",
 									   makeIntConst(false, @1), @1); }
+			| BULK
+					{ $$ = makeDefElem("bulk",
+									   makeIntConst(true, @1), @1); }
 		;
 
 /* Syntax with commas is SQL-spec, without commas is Postgres historical */
@@ -15079,6 +15082,7 @@ unreserved_keyword:
 			| BACKWARD
 			| BEFORE
 			| BEGIN_P
+			| BULK
 			| BY
 			| CACHE
 			| CALL
@@ -15590,6 +15594,7 @@ bare_label_keyword:
 			| BIT
 			| BOOLEAN_P
 			| BOTH
+			| BULK
 			| BY
 			| CACHE
 			| CALL
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 9a35147b26..cb3933b0e9 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -611,6 +611,10 @@ standard_ProcessUtility(PlannedStmt *pstmt,
 									SetPGVariable("transaction_deferrable",
 												  list_make1(item->arg),
 												  true);
+								else if (strcmp(item->defname, "bulk") == 0)
+									SetPGVariable("bulk_insert",
+												  list_make1(item->arg),
+												  true);
 							}
 						}
 						break;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index a62d64eaa4..5f2fc9004b 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -54,6 +54,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "executor/nodeModifyTable.h"
 #include "optimizer/geqo.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paths.h"
@@ -2036,6 +2037,16 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"bulk_insert", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Sets the transaction to bulk insert mode."),
+			gettext_noop("A ring buffer of limited size will be used."),
+		},
+		&insert_in_bulk,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 46a2dc9511..09c312a052 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -15,6 +15,8 @@
 
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT bool insert_in_bulk;
+
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
 									   CmdType cmdtype);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index b7e9e5d539..50af9dabf5 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -1159,6 +1160,7 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* State for bulk insert like INSERT SELECT */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 71dcdf2889..0991da11e7 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -60,6 +60,7 @@ PG_KEYWORD("binary", BINARY, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("bit", BIT, COL_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("boolean", BOOLEAN_P, COL_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("both", BOTH, RESERVED_KEYWORD, BARE_LABEL)
+PG_KEYWORD("bulk", BULK, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("by", BY, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD, BARE_LABEL)
-- 
2.17.0

simon@2ndquadrant.com

about 5 years ago

In reply to: Andres Freund (#4)

Re: should INSERT SELECT use a BulkInsertState?

On Thu, 4 Jun 2020 at 18:31, Andres Freund <andres@anarazel.de> wrote:

On 2020-05-08 02:25:45 -0500, Justin Pryzby wrote:

Seems to me it should, at least conditionally. At least if there's a function
scan or a relation or ..

Well, the problem is that this can cause very very significant
regressions. As in 10x slower or more. The ringbuffer can cause constant
XLogFlush() calls (due to the lsn interlock), and the eviction from
shared_buffers (regardless of actual available) will mean future vacuums
etc will be much slower. I think this is likely to cause pretty
widespread regressions on upgrades.

Now, it sucks that we have this problem in the general facility that's
supposed to be used for this kind of bulk operation. But I don't really
see it realistic as expanding use of bulk insert strategies unless we
have some more fundamental fixes.

Are you saying that *anything* that uses the BulkInsertState is
generally broken? We use it for VACUUM and COPY writes, so you are
saying they are broken??

When we put that in, the use of the ringbuffer for writes required a
much larger number of blocks to smooth out the extra XLogFlush()
calls, but overall it was a clear win in those earlier tests. Perhaps
the ring buffer needs to be increased, or made configurable. The
eviction behavior was/is deliberate, to avoid large data loads
spoiling cache - perhaps that could also be configurable for the case
where data fits in shared buffers.

Anyway, if we can discuss what you see as broken, we can fix that and
then extend the usage to other cases, such as INSERT SELECT.

--
Simon Riggs http://www.EnterpriseDB.com/

simon@2ndquadrant.com

about 5 years ago

In reply to: Justin Pryzby (#8)

Re: should INSERT SELECT use a BulkInsertState?

On Fri, 16 Oct 2020 at 22:05, Justin Pryzby <pryzby@telsasoft.com> wrote:

I made this conditional on BEGIN BULK/SET bulk, so I'll solicit comments on that.

I think it would be better if this was self-tuning. So that we don't
allocate a bulkinsert state until we've done say 100 (?) rows
inserted.

If there are other conditions under which this is non-optimal
(Andres?), we can also autodetect that and avoid them.

You should also use table_multi_insert() since that will give further
performance gains by reducing block access overheads. Switching from
single row to multi-row should also only happen once we've loaded a
few rows, so we don't introduce overahads for smaller SQL statements.

--
Simon Riggs http://www.EnterpriseDB.com/

pryzby@telsasoft.com

about 5 years ago

In reply to: Simon Riggs (#10)

2 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Thu, Oct 22, 2020 at 01:29:53PM +0100, Simon Riggs wrote:

On Fri, 16 Oct 2020 at 22:05, Justin Pryzby <pryzby@telsasoft.com> wrote:

I made this conditional on BEGIN BULK/SET bulk, so I'll solicit comments on that.

I think it would be better if this was self-tuning. So that we don't
allocate a bulkinsert state until we've done say 100 (?) rows
inserted.

I made it an optional, non-default behavior in response to the legitimate
concern for performance regression for the cases where a loader needs to be as
fast as possible - as compared with our case, where we want instead to optimize
for our reports by making the loaders responsible for their own writes, rather
than leaving behind many dirty pages, and clobbering the cache, too.

Also, INSERT SELECT doesn't immediately help us (telsasoft), since we use
INSERT .. VALUES () .. ON CONFLICT. This would handle that case, which is
great, even though that wasn't a design goal. It could also be an integer GUC
to allow configuring the size of the ring buffer.

You should also use table_multi_insert() since that will give further
performance gains by reducing block access overheads. Switching from
single row to multi-row should also only happen once we've loaded a
few rows, so we don't introduce overahads for smaller SQL statements.

Good idea...multi_insert (which reduces the overhead of individual inserts) is
mostly independent from BulkInsert state (which uses a ring-buffer to avoid
dirtying the cache). I made this 0002.

This makes INSERT SELECT several times faster, and not clobber the cache too.

Time: 4700.606 ms (00:04.701)
123 | 1
37 | 2
20 | 3
11 | 4
4537 | 5
11656 |

Time: 1125.302 ms (00:01.125)
2171 | 1
37 | 2
20 | 3
11 | 4
111 | 5
14034 |

When enabled, this passes nearly all regression tests, and all but 2 of the
changes are easily understood. The 2nd patch still needs work.

--
Justin

Attachments:

v5-0001-Allow-INSERT-SELECT-to-use-a-BulkInsertState.patchtext/x-diff; charset=us-asciiDownload

From 16057608bd58f54a5e365433ded18757aca8ec48 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v5 1/2] Allow INSERT SELECT to use a BulkInsertState

---
 src/backend/executor/nodeModifyTable.c | 22 ++++++++++++++++++++--
 src/backend/parser/gram.y              |  7 ++++++-
 src/backend/tcop/utility.c             |  4 ++++
 src/backend/utils/misc/guc.c           | 11 +++++++++++
 src/include/executor/nodeModifyTable.h |  2 ++
 src/include/nodes/execnodes.h          |  3 +++
 src/include/parser/kwlist.h            |  1 +
 src/test/regress/expected/insert.out   | 23 +++++++++++++++++++++++
 src/test/regress/sql/insert.sql        | 13 +++++++++++++
 9 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 29e07b7228..26ff964105 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -72,6 +72,8 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 											   ResultRelInfo *targetRelInfo,
 											   TupleTableSlot *slot,
 											   ResultRelInfo **partRelInfo);
+/* guc */
+bool insert_in_bulk = false;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -594,7 +596,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   NULL, /* Bulk insert not supported */
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -631,10 +633,17 @@ ExecInsert(ModifyTableState *mtstate,
 		}
 		else
 		{
+			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
+			{
+				if (mtstate->bistate)
+					ReleaseBulkInsertStatePin(mtstate->bistate);
+				mtstate->prevResultRelInfo = resultRelInfo;
+			}
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -2232,6 +2241,9 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+	if (operation == CMD_INSERT && insert_in_bulk)
+		mtstate->bistate = GetBulkInsertState();
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2698,6 +2710,12 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 480d168346..1e3c23f723 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -631,7 +631,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 	ASSERTION ASSIGNMENT ASYMMETRIC AT ATTACH ATTRIBUTE AUTHORIZATION
 
 	BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
-	BOOLEAN_P BOTH BY
+	BOOLEAN_P BOTH BULK BY
 
 	CACHE CALL CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
 	CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
@@ -9873,6 +9873,9 @@ transaction_mode_item:
 			| NOT DEFERRABLE
 					{ $$ = makeDefElem("transaction_deferrable",
 									   makeIntConst(false, @1), @1); }
+			| BULK
+					{ $$ = makeDefElem("bulk",
+									   makeIntConst(true, @1), @1); }
 		;
 
 /* Syntax with commas is SQL-spec, without commas is Postgres historical */
@@ -15079,6 +15082,7 @@ unreserved_keyword:
 			| BACKWARD
 			| BEFORE
 			| BEGIN_P
+			| BULK
 			| BY
 			| CACHE
 			| CALL
@@ -15590,6 +15594,7 @@ bare_label_keyword:
 			| BIT
 			| BOOLEAN_P
 			| BOTH
+			| BULK
 			| BY
 			| CACHE
 			| CALL
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 9a35147b26..cb3933b0e9 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -611,6 +611,10 @@ standard_ProcessUtility(PlannedStmt *pstmt,
 									SetPGVariable("transaction_deferrable",
 												  list_make1(item->arg),
 												  true);
+								else if (strcmp(item->defname, "bulk") == 0)
+									SetPGVariable("bulk_insert",
+												  list_make1(item->arg),
+												  true);
 							}
 						}
 						break;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index a62d64eaa4..5f2fc9004b 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -54,6 +54,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "executor/nodeModifyTable.h"
 #include "optimizer/geqo.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paths.h"
@@ -2036,6 +2037,16 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"bulk_insert", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Sets the transaction to bulk insert mode."),
+			gettext_noop("A ring buffer of limited size will be used."),
+		},
+		&insert_in_bulk,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 46a2dc9511..09c312a052 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -15,6 +15,8 @@
 
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT bool insert_in_bulk;
+
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
 									   CmdType cmdtype);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 6c0a7d68d6..e034562877 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -1176,6 +1177,8 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT */
+	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 71dcdf2889..0991da11e7 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -60,6 +60,7 @@ PG_KEYWORD("binary", BINARY, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("bit", BIT, COL_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("boolean", BOOLEAN_P, COL_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("both", BOTH, RESERVED_KEYWORD, BARE_LABEL)
+PG_KEYWORD("bulk", BULK, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("by", BY, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD, BARE_LABEL)
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index da50ee3b67..da0dae6240 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -462,6 +462,29 @@ Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'),
             part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED,
             part_default DEFAULT, PARTITIONED
 
+-- bulk inserts
+truncate hash_parted;
+begin bulk;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+ a  
+----
+ 12
+ 11
+(2 rows)
+
+-- exercise bulk insert to partitions
+insert into hash_parted select generate_series(1,9999);
+select count(1) from hash_parted;
+ count 
+-------
+ 10001
+(1 row)
+
+commit;
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 963faa1614..d3a94f053b 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -280,6 +280,19 @@ from hash_parted order by part;
 -- partitions
 \d+ list_parted
 
+-- bulk inserts
+truncate hash_parted;
+begin bulk;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+-- exercise bulk insert to partitions
+insert into hash_parted select generate_series(1,9999);
+select count(1) from hash_parted;
+commit;
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
-- 
2.17.0

v5-0002-Make-INSERT-SELECT-use-multi_insert.patchtext/x-diff; charset=us-asciiDownload

From 5df306987950859ff532347ea5d6a6be954a25be Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 24 Oct 2020 22:49:01 -0500
Subject: [PATCH v5 2/2] Make INSERT SELECT use multi_insert

TODO: CTAS and matview ?

See also: 86b85044e823a304d2a265abc030254d39efe7df
---
 src/backend/commands/copy.c            | 185 +------------------------
 src/backend/executor/nodeModifyTable.c | 167 +++++++++++++++++++---
 src/include/commands/copy.h            | 185 +++++++++++++++++++++++++
 src/include/nodes/execnodes.h          |   9 +-
 src/test/regress/expected/insert.out   |  16 +++
 src/test/regress/sql/insert.sql        |   5 +
 6 files changed, 358 insertions(+), 209 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 36ddcdccdb..b613e6e43d 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -31,7 +31,6 @@
 #include "commands/defrem.h"
 #include "commands/trigger.h"
 #include "executor/execPartition.h"
-#include "executor/executor.h"
 #include "executor/nodeModifyTable.h"
 #include "executor/tuptable.h"
 #include "foreign/fdwapi.h"
@@ -241,54 +240,6 @@ typedef struct
 } DR_copy;
 
 
-/*
- * No more than this many tuples per CopyMultiInsertBuffer
- *
- * Caution: Don't make this too big, as we could end up with this many
- * CopyMultiInsertBuffer items stored in CopyMultiInsertInfo's
- * multiInsertBuffers list.  Increasing this can cause quadratic growth in
- * memory requirements during copies into partitioned tables with a large
- * number of partitions.
- */
-#define MAX_BUFFERED_TUPLES		1000
-
-/*
- * Flush buffers if there are >= this many bytes, as counted by the input
- * size, of tuples stored.
- */
-#define MAX_BUFFERED_BYTES		65535
-
-/* Trim the list of buffers back down to this number after flushing */
-#define MAX_PARTITION_BUFFERS	32
-
-/* Stores multi-insert data related to a single relation in CopyFrom. */
-typedef struct CopyMultiInsertBuffer
-{
-	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
-	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
-	BulkInsertState bistate;	/* BulkInsertState for this rel */
-	int			nused;			/* number of 'slots' containing tuples */
-	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
-												 * stream */
-} CopyMultiInsertBuffer;
-
-/*
- * Stores one or many CopyMultiInsertBuffers and details about the size and
- * number of tuples which are stored in them.  This allows multiple buffers to
- * exist at once when COPYing into a partitioned table.
- */
-typedef struct CopyMultiInsertInfo
-{
-	List	   *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
-	int			bufferedTuples; /* number of tuples buffered over all buffers */
-	int			bufferedBytes;	/* number of bytes from all buffered tuples */
-	CopyState	cstate;			/* Copy state for this CopyMultiInsertInfo */
-	EState	   *estate;			/* Executor state used for COPY */
-	CommandId	mycid;			/* Command Id used for COPY */
-	int			ti_options;		/* table insert options */
-} CopyMultiInsertInfo;
-
-
 /*
  * These macros centralize code used to process line_buf and raw_buf buffers.
  * They are macros because they often do continue/break control and to avoid
@@ -2386,48 +2337,13 @@ limit_printout_length(const char *str)
 	return res;
 }
 
-/*
- * Allocate memory and initialize a new CopyMultiInsertBuffer for this
- * ResultRelInfo.
- */
-static CopyMultiInsertBuffer *
-CopyMultiInsertBufferInit(ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = (CopyMultiInsertBuffer *) palloc(sizeof(CopyMultiInsertBuffer));
-	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
-	buffer->resultRelInfo = rri;
-	buffer->bistate = GetBulkInsertState();
-	buffer->nused = 0;
-
-	return buffer;
-}
-
-/*
- * Make a new buffer for this ResultRelInfo.
- */
-static inline void
-CopyMultiInsertInfoSetupBuffer(CopyMultiInsertInfo *miinfo,
-							   ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = CopyMultiInsertBufferInit(rri);
-
-	/* Setup back-link so we can easily find this buffer again */
-	rri->ri_CopyMultiInsertBuffer = buffer;
-	/* Record that we're tracking this buffer */
-	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-}
-
 /*
  * Initialize an already allocated CopyMultiInsertInfo.
  *
  * If rri is a non-partitioned table then a CopyMultiInsertBuffer is set up
  * for that table.
  */
-static void
+void
 CopyMultiInsertInfoInit(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
 						CopyState cstate, EState *estate, CommandId mycid,
 						int ti_options)
@@ -2449,27 +2365,6 @@ CopyMultiInsertInfoInit(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
 		CopyMultiInsertInfoSetupBuffer(miinfo, rri);
 }
 
-/*
- * Returns true if the buffers are full
- */
-static inline bool
-CopyMultiInsertInfoIsFull(CopyMultiInsertInfo *miinfo)
-{
-	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
-		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
-		return true;
-	return false;
-}
-
-/*
- * Returns true if we have no buffered tuples
- */
-static inline bool
-CopyMultiInsertInfoIsEmpty(CopyMultiInsertInfo *miinfo)
-{
-	return miinfo->bufferedTuples == 0;
-}
-
 /*
  * Write the tuples stored in 'buffer' out to the table.
  */
@@ -2554,35 +2449,6 @@ CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo,
 	cstate->cur_lineno = save_cur_lineno;
 }
 
-/*
- * Drop used slots and free member for this buffer.
- *
- * The buffer must be flushed before cleanup.
- */
-static inline void
-CopyMultiInsertBufferCleanup(CopyMultiInsertInfo *miinfo,
-							 CopyMultiInsertBuffer *buffer)
-{
-	int			i;
-
-	/* Ensure buffer was flushed */
-	Assert(buffer->nused == 0);
-
-	/* Remove back-link to ourself */
-	buffer->resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
-
-	FreeBulkInsertState(buffer->bistate);
-
-	/* Since we only create slots on demand, just drop the non-null ones. */
-	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
-		ExecDropSingleTupleTableSlot(buffer->slots[i]);
-
-	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
-							 miinfo->ti_options);
-
-	pfree(buffer);
-}
-
 /*
  * Write out all stored tuples in all buffers out to the tables.
  *
@@ -2639,7 +2505,7 @@ CopyMultiInsertInfoFlush(CopyMultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
 /*
  * Cleanup allocated buffers and free memory
  */
-static inline void
+void
 CopyMultiInsertInfoCleanup(CopyMultiInsertInfo *miinfo)
 {
 	ListCell   *lc;
@@ -2650,53 +2516,6 @@ CopyMultiInsertInfoCleanup(CopyMultiInsertInfo *miinfo)
 	list_free(miinfo->multiInsertBuffers);
 }
 
-/*
- * Get the next TupleTableSlot that the next tuple should be stored in.
- *
- * Callers must ensure that the buffer is not full.
- *
- * Note: 'miinfo' is unused but has been included for consistency with the
- * other functions in this area.
- */
-static inline TupleTableSlot *
-CopyMultiInsertInfoNextFreeSlot(CopyMultiInsertInfo *miinfo,
-								ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-	int			nused = buffer->nused;
-
-	Assert(buffer != NULL);
-	Assert(nused < MAX_BUFFERED_TUPLES);
-
-	if (buffer->slots[nused] == NULL)
-		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
-	return buffer->slots[nused];
-}
-
-/*
- * Record the previously reserved TupleTableSlot that was reserved by
- * CopyMultiInsertInfoNextFreeSlot as being consumed.
- */
-static inline void
-CopyMultiInsertInfoStore(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						 TupleTableSlot *slot, int tuplen, uint64 lineno)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-
-	Assert(buffer != NULL);
-	Assert(slot == buffer->slots[buffer->nused]);
-
-	/* Store the line number so we can properly report any errors later */
-	buffer->linenos[buffer->nused] = lineno;
-
-	/* Record this slot as being used */
-	buffer->nused++;
-
-	/* Update how many tuples are stored and their size */
-	miinfo->bufferedTuples++;
-	miinfo->bufferedBytes += tuplen;
-}
-
 /*
  * Copy FROM file to relation.
  */
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 26ff964105..1437acfc22 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -43,6 +43,7 @@
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "commands/trigger.h"
+#include "commands/copy.h"
 #include "executor/execPartition.h"
 #include "executor/executor.h"
 #include "executor/nodeModifyTable.h"
@@ -362,6 +363,65 @@ ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 	MemoryContextSwitchTo(oldContext);
 }
 
+/*
+ * This is copied from CopyMultiInsertInfoFlush and modified to avoid
+ * dragging in CopyState..
+ */
+static void
+InsertMultiInsertInfoFlush(CopyMultiInsertInfo *miinfo)
+{
+	MemoryContext oldcontext;
+	ModifyTableState *mtstate = (ModifyTableState*)miinfo->cstate;
+
+	ListCell   *lc;
+	foreach(lc, miinfo->multiInsertBuffers)
+	{
+		CopyMultiInsertBuffer *buffer = (CopyMultiInsertBuffer *) lfirst(lc);
+		ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
+
+		oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(miinfo->estate));
+		table_multi_insert(resultRelInfo->ri_RelationDesc,
+				buffer->slots,
+				buffer->nused,
+				miinfo->mycid,
+				miinfo->ti_options,
+				buffer->bistate);
+		MemoryContextSwitchTo(oldcontext);
+
+		for (int i = 0; i < buffer->nused; ++i)
+		{
+			if (resultRelInfo->ri_NumIndices > 0)
+			{
+				List *recheckIndexes;
+				recheckIndexes = ExecInsertIndexTuples(resultRelInfo,
+					buffer->slots[i], miinfo->estate,
+					false, NULL, NIL);
+				ExecARInsertTriggers(miinfo->estate, resultRelInfo,
+						buffer->slots[i], recheckIndexes,
+						mtstate->mt_transition_capture);
+				list_free(recheckIndexes);
+			}
+			else if (resultRelInfo->ri_TrigDesc != NULL &&
+					(resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
+					 resultRelInfo->ri_TrigDesc->trig_insert_new_table))
+			{
+				ExecARInsertTriggers(miinfo->estate, resultRelInfo,
+						buffer->slots[i], NIL,
+						mtstate->mt_transition_capture);
+			}
+
+			ExecClearTuple(buffer->slots[i]);
+		}
+		buffer->nused = 0;
+	}
+
+	// TODO: MAX_PARTITION_BUFFERS
+
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+}
+
+
 /* ----------------------------------------------------------------
  *		ExecInsert
  *
@@ -391,6 +451,7 @@ ExecInsert(ModifyTableState *mtstate,
 	ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
 	OnConflictAction onconflict = node->onConflictAction;
 	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+	TupleTableSlot *batchslot = NULL;
 
 	/*
 	 * If the input result relation is a partitioned table, find the leaf
@@ -631,6 +692,34 @@ ExecInsert(ModifyTableState *mtstate,
 
 			/* Since there was no insertion conflict, we're done */
 		}
+		else if (mtstate->miinfo != NULL &&
+			/* Has unsupported trigger? */
+				(resultRelInfo->ri_TrigDesc == NULL ||
+				 (
+				  // !resultRelInfo->ri_TrigDesc->trig_insert_before_row
+				  !resultRelInfo->ri_TrigDesc->trig_insert_after_row
+				  // && !resultRelInfo->ri_TrigDesc->trig_insert_instead_row
+				  && !resultRelInfo->ri_TrigDesc->trig_insert_after_statement
+				 )
+				) &&
+			/* Has Fdw? */
+				!resultRelInfo->ri_FdwRoutine
+				// !cstate->volatile_defexprs)
+				// !(contain_volatile_functions(cstate->whereClause))
+			)
+		{
+			if (resultRelInfo->ri_CopyMultiInsertBuffer == NULL)
+				CopyMultiInsertInfoSetupBuffer(mtstate->miinfo,
+						resultRelInfo);
+
+			batchslot = CopyMultiInsertInfoNextFreeSlot(mtstate->miinfo, resultRelInfo);
+			ExecCopySlot(batchslot, slot);
+
+			CopyMultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot, 0, 0); // XXX: tuplen/lineno
+
+			if (CopyMultiInsertInfoIsFull(mtstate->miinfo))
+				InsertMultiInsertInfoFlush(mtstate->miinfo);
+		}
 		else
 		{
 			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
@@ -640,6 +729,13 @@ ExecInsert(ModifyTableState *mtstate,
 				mtstate->prevResultRelInfo = resultRelInfo;
 			}
 
+			/*
+			 * Flush pending inserts if this partition can't use
+			 * batching, so rows are visible to triggers etc.
+			 */
+			if (mtstate->miinfo)
+				InsertMultiInsertInfoFlush(mtstate->miinfo);
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
@@ -659,32 +755,36 @@ ExecInsert(ModifyTableState *mtstate,
 		setLastTid(&slot->tts_tid);
 	}
 
-	/*
-	 * If this insert is the result of a partition key update that moved the
-	 * tuple to a new partition, put this row into the transition NEW TABLE,
-	 * if there is one. We need to do this separately for DELETE and INSERT
-	 * because they happen on different tables.
-	 */
-	ar_insert_trig_tcs = mtstate->mt_transition_capture;
-	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
-		&& mtstate->mt_transition_capture->tcs_update_new_table)
+	/* Triggers were already run in the batch insert case */
+	if (batchslot == NULL)
 	{
-		ExecARUpdateTriggers(estate, resultRelInfo, NULL,
-							 NULL,
-							 slot,
-							 NULL,
-							 mtstate->mt_transition_capture);
-
 		/*
-		 * We've already captured the NEW TABLE row, so make sure any AR
-		 * INSERT trigger fired below doesn't capture it again.
+		 * If this insert is the result of a partition key update that moved the
+		 * tuple to a new partition, put this row into the transition NEW TABLE,
+		 * if there is one. We need to do this separately for DELETE and INSERT
+		 * because they happen on different tables.
 		 */
-		ar_insert_trig_tcs = NULL;
-	}
+		ar_insert_trig_tcs = mtstate->mt_transition_capture;
+		if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+			&& mtstate->mt_transition_capture->tcs_update_new_table)
+		{
+			ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+								 NULL,
+								 slot,
+								 NULL,
+								 mtstate->mt_transition_capture);
 
-	/* AFTER ROW INSERT Triggers */
-	ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
-						 ar_insert_trig_tcs);
+			/*
+			 * We've already captured the NEW TABLE row, so make sure any AR
+			 * INSERT trigger fired below doesn't capture it again.
+			 */
+			ar_insert_trig_tcs = NULL;
+		}
+
+		/* AFTER ROW INSERT Triggers */
+		ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
+							 ar_insert_trig_tcs);
+	}
 
 	list_free(recheckIndexes);
 
@@ -2242,9 +2342,25 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
 	mtstate->bistate = NULL;
+	mtstate->miinfo = NULL;
 	if (operation == CMD_INSERT && insert_in_bulk)
+	{
 		mtstate->bistate = GetBulkInsertState();
 
+		/*
+		 * For partitioned tables we don't support multi-inserts when there
+		 * are any statement level insert triggers.
+		 */
+		if (node->rootRelation == 0 ||
+			 !mtstate->rootResultRelInfo->ri_TrigDesc ||
+			 !mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_new_table)
+		{
+			mtstate->miinfo = calloc(sizeof(*mtstate->miinfo), 1);
+			CopyMultiInsertInfoInit(mtstate->miinfo, mtstate->rootResultRelInfo,
+					(void*)mtstate, estate, GetCurrentCommandId(true), 0);
+		}
+	}
+
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
 	mtstate->fireBSTriggers = true;
@@ -2716,6 +2832,13 @@ ExecEndModifyTable(ModifyTableState *node)
 		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
 	}
 
+	if (node->miinfo)
+	{
+		if (!CopyMultiInsertInfoIsEmpty(node->miinfo))
+			 InsertMultiInsertInfoFlush(node->miinfo);
+		CopyMultiInsertInfoCleanup(node->miinfo);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h
index c639833565..8b03f69cf5 100644
--- a/src/include/commands/copy.h
+++ b/src/include/commands/copy.h
@@ -14,6 +14,7 @@
 #ifndef COPY_H
 #define COPY_H
 
+#include "executor/executor.h"
 #include "nodes/execnodes.h"
 #include "nodes/parsenodes.h"
 #include "parser/parse_node.h"
@@ -41,4 +42,188 @@ extern uint64 CopyFrom(CopyState cstate);
 
 extern DestReceiver *CreateCopyDestReceiver(void);
 
+/*
+ * No more than this many tuples per CopyMultiInsertBuffer
+ *
+ * Caution: Don't make this too big, as we could end up with this many
+ * CopyMultiInsertBuffer items stored in CopyMultiInsertInfo's
+ * multiInsertBuffers list.  Increasing this can cause quadratic growth in
+ * memory requirements during copies into partitioned tables with a large
+ * number of partitions.
+ */
+#define MAX_BUFFERED_TUPLES				1000
+
+/*
+ * Flush buffers if there are >= this many bytes, as counted by the input
+ * size, of tuples stored.
+ */
+	#define MAX_BUFFERED_BYTES			65535
+
+/* Trim the list of buffers back down to this number after flushing */
+#define MAX_PARTITION_BUFFERS			32
+
+/* Stores multi-insert data related to a single relation in CopyFrom/INSERT. */
+typedef struct CopyMultiInsertBuffer
+{
+	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
+	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
+	BulkInsertState bistate;		/* BulkInsertState for this rel */
+	int				nused;			/* number of 'slots' containing tuples */
+	uint64			linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
+							 * stream */
+} CopyMultiInsertBuffer;
+
+/*
+ * Stores one or many CopyMultiInsertBuffers and details about the size and
+ * number of tuples which are stored in them.  This allows multiple buffers to
+ * exist at once when COPYing into a partitioned table.
+ */
+typedef struct CopyMultiInsertInfo
+{
+	List		*multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
+	int			bufferedTuples; /* number of tuples buffered over all buffers */
+	int			bufferedBytes;  /* number of bytes from all buffered tuples */
+	CopyState	cstate;			/* Copy state for this CopyMultiInsertInfo (not used for insert) */
+	EState		*estate;		/* Executor state used for COPY/INSERT */
+	CommandId	mycid;			/* Command Id used for COPY/INSERT */
+	int			ti_options;	/* table insert options */
+} CopyMultiInsertInfo;
+
+void CopyMultiInsertInfoInit(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
+						CopyState cstate, EState *estate, CommandId mycid,
+						int ti_options);
+void CopyMultiInsertInfoCleanup(CopyMultiInsertInfo *miinfo);
+
+/*
+ * Allocate memory and initialize a new CopyMultiInsertBuffer for this
+ * ResultRelInfo.
+ */
+static inline CopyMultiInsertBuffer *
+CopyMultiInsertBufferInit(ResultRelInfo *rri)
+{
+	CopyMultiInsertBuffer *buffer;
+
+	buffer = (CopyMultiInsertBuffer *) palloc(sizeof(CopyMultiInsertBuffer));
+	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
+	buffer->resultRelInfo = rri;
+	buffer->bistate = GetBulkInsertState();
+	buffer->nused = 0;
+
+	return buffer;
+}
+
+/*
+ * Make a new buffer for this ResultRelInfo.
+ */
+static inline void
+CopyMultiInsertInfoSetupBuffer(CopyMultiInsertInfo *miinfo,
+							   ResultRelInfo *rri)
+{
+	CopyMultiInsertBuffer *buffer;
+
+	buffer = CopyMultiInsertBufferInit(rri);
+
+	/* Setup back-link so we can easily find this buffer again */
+	rri->ri_CopyMultiInsertBuffer = buffer;
+	/* Record that we're tracking this buffer */
+	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+}
+
+/*
+ * Returns true if the buffers are full
+ */
+static inline bool
+CopyMultiInsertInfoIsFull(CopyMultiInsertInfo *miinfo)
+{
+	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
+		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
+		return true;
+	return false;
+}
+
+/*
+ * Returns true if we have no buffered tuples
+ */
+static inline bool
+CopyMultiInsertInfoIsEmpty(CopyMultiInsertInfo *miinfo)
+{
+	return miinfo->bufferedTuples == 0;
+}
+
+/*
+ * Drop used slots and free member for this buffer.
+ *
+ * The buffer must be flushed before cleanup.
+ */
+static inline void
+CopyMultiInsertBufferCleanup(CopyMultiInsertInfo *miinfo,
+							 CopyMultiInsertBuffer *buffer)
+{
+	int			i;
+
+	/* Ensure buffer was flushed */
+	Assert(buffer->nused == 0);
+
+	/* Remove back-link to ourself */
+	buffer->resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
+
+	FreeBulkInsertState(buffer->bistate);
+
+	/* Since we only create slots on demand, just drop the non-null ones. */
+	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
+		ExecDropSingleTupleTableSlot(buffer->slots[i]);
+
+	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
+							 miinfo->ti_options);
+
+	pfree(buffer);
+}
+
+/*
+ * Get the next TupleTableSlot that the next tuple should be stored in.
+ *
+ * Callers must ensure that the buffer is not full.
+ *
+ * Note: 'miinfo' is unused but has been included for consistency with the
+ * other functions in this area.
+ */
+static inline TupleTableSlot *
+CopyMultiInsertInfoNextFreeSlot(CopyMultiInsertInfo *miinfo,
+								ResultRelInfo *rri)
+{
+	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
+	int			nused = buffer->nused;
+
+	Assert(buffer != NULL);
+	Assert(nused < MAX_BUFFERED_TUPLES);
+
+	if (buffer->slots[nused] == NULL)
+		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
+	return buffer->slots[nused];
+}
+
+/*
+ * Record the previously reserved TupleTableSlot that was reserved by
+ * CopyMultiInsertInfoNextFreeSlot as being consumed.
+ */
+static inline void
+CopyMultiInsertInfoStore(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
+						 TupleTableSlot *slot, int tuplen, uint64 lineno)
+{
+	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
+
+	Assert(buffer != NULL);
+	Assert(slot == buffer->slots[buffer->nused]);
+
+	/* Store the line number so we can properly report any errors later */
+	buffer->linenos[buffer->nused] = lineno;
+
+	/* Record this slot as being used */
+	buffer->nused++;
+
+	/* Update how many tuples are stored and their size */
+	miinfo->bufferedTuples++;
+	miinfo->bufferedBytes += tuplen;
+}
+
 #endif							/* COPY_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index e034562877..bcb9986013 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -40,8 +40,8 @@ struct ExprState;
 struct ExprContext;
 struct RangeTblEntry;			/* avoid including parsenodes.h here */
 struct ExprEvalStep;			/* avoid including execExpr.h everywhere */
-struct CopyMultiInsertBuffer;
-
+// struct CopyMultiInsertBuffer;
+// struct CopyMultiInsertInfo;
 
 /* ----------------
  *		ExprState node
@@ -499,7 +499,7 @@ typedef struct ResultRelInfo
 	 */
 	TupleConversionMap *ri_ChildToRootMap;
 
-	/* for use by copy.c when performing multi-inserts */
+	/* for use by copy.c/modifyTable when performing multi-inserts */
 	struct CopyMultiInsertBuffer *ri_CopyMultiInsertBuffer;
 } ResultRelInfo;
 
@@ -1177,8 +1177,9 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
-	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT */
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT, when miinfo cannot be used */
 	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
+	struct CopyMultiInsertInfo *miinfo;
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index da0dae6240..e0c83d7427 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -485,6 +485,22 @@ select count(1) from hash_parted;
 (1 row)
 
 commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Index Only Scan using hpart1_a_idx on hpart1 hash_parted
+   Index Cond: (a = 13)
+(2 rows)
+
+select * from hash_parted where a=13;
+ a  
+----
+ 13
+(1 row)
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index d3a94f053b..99ec18d9a2 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -292,6 +292,11 @@ select * from hash_parted;
 insert into hash_parted select generate_series(1,9999);
 select count(1) from hash_parted;
 commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+select * from hash_parted where a=13;
 
 -- cleanup
 drop table range_parted, list_parted;
-- 
2.17.0

luc@swarm64.com

about 5 years ago

In reply to: Justin Pryzby (#11)

Re: should INSERT SELECT use a BulkInsertState?

On 30.10.20 05:51, Justin Pryzby wrote:

On Thu, Oct 22, 2020 at 01:29:53PM +0100, Simon Riggs wrote:

On Fri, 16 Oct 2020 at 22:05, Justin Pryzby <pryzby@telsasoft.com> wrote:

I made this conditional on BEGIN BULK/SET bulk, so I'll solicit comments on that.

I think it would be better if this was self-tuning. So that we don't
allocate a bulkinsert state until we've done say 100 (?) rows
inserted.

I made it an optional, non-default behavior in response to the legitimate
concern for performance regression for the cases where a loader needs to be as
fast as possible - as compared with our case, where we want instead to optimize
for our reports by making the loaders responsible for their own writes, rather
than leaving behind many dirty pages, and clobbering the cache, too.

Also, INSERT SELECT doesn't immediately help us (telsasoft), since we use
INSERT .. VALUES () .. ON CONFLICT. This would handle that case, which is
great, even though that wasn't a design goal. It could also be an integer GUC
to allow configuring the size of the ring buffer.

You should also use table_multi_insert() since that will give further
performance gains by reducing block access overheads. Switching from
single row to multi-row should also only happen once we've loaded a
few rows, so we don't introduce overahads for smaller SQL statements.

Good idea...multi_insert (which reduces the overhead of individual inserts) is
mostly independent from BulkInsert state (which uses a ring-buffer to avoid
dirtying the cache). I made this 0002.

This makes INSERT SELECT several times faster, and not clobber the cache too.

Time: 4700.606 ms (00:04.701)
123 | 1
37 | 2
20 | 3
11 | 4
4537 | 5
11656 |

Time: 1125.302 ms (00:01.125)
2171 | 1
37 | 2
20 | 3
11 | 4
111 | 5
14034 |

When enabled, this passes nearly all regression tests, and all but 2 of the
changes are easily understood. The 2nd patch still needs work.

Hi,

Came across this thread because I'm working on an improvement for the
relation extension to improve the speed of the bulkinsert itself in
(highly) parallel cases and would like to make sure that our approaches
work nicely together.

Given what I've seen and tried so far with various benchmarks I would
also really like to see a different approach here. The "BEGIN BULK" can
be problematic for example if you mix small amounts of inserts and big
amounts in the same transaction, or if your application possibly does a
bulk insert but otherwise mostly OLTP transactions.

To me the idea from Simon sounds good to only use a bulk insert state
after inserting e.g. a 1000 rows, and this also seems more applicable to
most applications compared to requiring a change to any application that
wishes to have faster ingest.

Another approach could be to combine this, for example, with a few extra
requirements to limit the amount of regressions and first learn more how
this behaves in the field.
We could, for example, only (just throwing out some ideas), require that:
- the relation has a certain size
- a BufferStrategy a maximum certain size is used
- there is a certain amount of lock waiters on relation extension. (like
we do with bulk extend)
- we have extended the relation for at least e.g. 4 MB and not used the
FSM anymore thereby proving that we are doing bulk operations instead of
random small extensions everywhere into the relation that use the FSM.

Another thing is that we first try to improve the bulk operation
facilities in general and then have another shot at this? Not sure if
there is some benchmark / query that shows where such a 10x slowdown
would appear but maybe that would be worth a look as well possibly.

Regards,
Luc

pryzby@telsasoft.com

about 5 years ago

In reply to: Luc Vlaming (#12)

Re: should INSERT SELECT use a BulkInsertState?

On Mon, Nov 02, 2020 at 07:53:45AM +0100, Luc Vlaming wrote:

On 30.10.20 05:51, Justin Pryzby wrote:

On Thu, Oct 22, 2020 at 01:29:53PM +0100, Simon Riggs wrote:

On Fri, 16 Oct 2020 at 22:05, Justin Pryzby <pryzby@telsasoft.com> wrote:

I made this conditional on BEGIN BULK/SET bulk, so I'll solicit comments on that.

I think it would be better if this was self-tuning. So that we don't
allocate a bulkinsert state until we've done say 100 (?) rows
inserted.

I made it an optional, non-default behavior in response to the legitimate
concern for performance regression for the cases where a loader needs to be as
fast as possible - as compared with our case, where we want instead to optimize
for our reports by making the loaders responsible for their own writes, rather
than leaving behind many dirty pages, and clobbering the cache, too.

Also, INSERT SELECT doesn't immediately help us (telsasoft), since we use
INSERT .. VALUES () .. ON CONFLICT. This would handle that case, which is
great, even though that wasn't a design goal. It could also be an integer GUC
to allow configuring the size of the ring buffer.

You should also use table_multi_insert() since that will give further
performance gains by reducing block access overheads. Switching from
single row to multi-row should also only happen once we've loaded a
few rows, so we don't introduce overahads for smaller SQL statements.

Good idea...multi_insert (which reduces the overhead of individual inserts) is
mostly independent from BulkInsert state (which uses a ring-buffer to avoid
dirtying the cache). I made this 0002.

This makes INSERT SELECT several times faster, and not clobber the cache too.

Time: 4700.606 ms (00:04.701)
123 | 1
37 | 2
20 | 3
11 | 4
4537 | 5
11656 |

Time: 1125.302 ms (00:01.125)
2171 | 1
37 | 2
20 | 3
11 | 4
111 | 5
14034 |

When enabled, this passes nearly all regression tests, and all but 2 of the
changes are easily understood. The 2nd patch still needs work.

Hi,

Came across this thread because I'm working on an improvement for the
relation extension to improve the speed of the bulkinsert itself in (highly)
parallel cases and would like to make sure that our approaches work nicely

Thanks for looking.

Since this is a GUC, I thought it would accomodate users optimizing for either
inserts vs selects, as well as users who don't want to change their application
(they can "ALTER SYSTEM SET bulk_insert=on"). I'm not thrilled about making a
new guc, but that seems to be required for "begin bulk", which was the obvious
way to make it an 'opt-in' feature.

I guess it'd be easy to add a counter to ModifyTableState, although it makes
the code a bit less clean and conceivably performs "discontinuously" - inserts
100rows/sec for the first 999 rows and then 200rows/sec afterwards.

If you "mix" small inserts and big inserts, it would be a bad strategy to
optimize for the small ones. Anyway, in a quick test, small inserts were not
slower.
/messages/by-id/20200713015700.GA23581@telsasoft.com

Do you have an example that regresses with bulk insert ?

The two patches are separate, and it's possible they should be enabled
differently or independently.

--
Justin

Show quoted text

Given what I've seen and tried so far with various benchmarks I would also
really like to see a different approach here. The "BEGIN BULK" can be
problematic for example if you mix small amounts of inserts and big amounts
in the same transaction, or if your application possibly does a bulk insert
but otherwise mostly OLTP transactions.

To me the idea from Simon sounds good to only use a bulk insert state after
inserting e.g. a 1000 rows, and this also seems more applicable to most
applications compared to requiring a change to any application that wishes
to have faster ingest.

Another approach could be to combine this, for example, with a few extra
requirements to limit the amount of regressions and first learn more how
this behaves in the field.
We could, for example, only (just throwing out some ideas), require that:
- the relation has a certain size
- a BufferStrategy a maximum certain size is used
- there is a certain amount of lock waiters on relation extension. (like we
do with bulk extend)
- we have extended the relation for at least e.g. 4 MB and not used the FSM
anymore thereby proving that we are doing bulk operations instead of random
small extensions everywhere into the relation that use the FSM.

Another thing is that we first try to improve the bulk operation facilities
in general and then have another shot at this? Not sure if there is some
benchmark / query that shows where such a 10x slowdown would appear but
maybe that would be worth a look as well possibly.

Regards,
Luc

pryzby@telsasoft.com

about 5 years ago

In reply to: Justin Pryzby (#13)

3 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Mon, Nov 02, 2020 at 12:45:51PM -0600, Justin Pryzby wrote:

On Mon, Nov 02, 2020 at 07:53:45AM +0100, Luc Vlaming wrote:

On 30.10.20 05:51, Justin Pryzby wrote:

On Thu, Oct 22, 2020 at 01:29:53PM +0100, Simon Riggs wrote:

On Fri, 16 Oct 2020 at 22:05, Justin Pryzby <pryzby@telsasoft.com> wrote:

I made this conditional on BEGIN BULK/SET bulk, so I'll solicit comments on that.

I think it would be better if this was self-tuning. So that we don't
allocate a bulkinsert state until we've done say 100 (?) rows
inserted.

I made it an optional, non-default behavior in response to the legitimate
concern for performance regression for the cases where a loader needs to be as
fast as possible - as compared with our case, where we want instead to optimize
for our reports by making the loaders responsible for their own writes, rather
than leaving behind many dirty pages, and clobbering the cache, too.

Also, INSERT SELECT doesn't immediately help us (telsasoft), since we use
INSERT .. VALUES () .. ON CONFLICT. This would handle that case, which is
great, even though that wasn't a design goal. It could also be an integer GUC
to allow configuring the size of the ring buffer.

You should also use table_multi_insert() since that will give further
performance gains by reducing block access overheads. Switching from
single row to multi-row should also only happen once we've loaded a
few rows, so we don't introduce overahads for smaller SQL statements.

Good idea...multi_insert (which reduces the overhead of individual inserts) is
mostly independent from BulkInsert state (which uses a ring-buffer to avoid
dirtying the cache). I made this 0002.

This makes INSERT SELECT several times faster, and not clobber the cache too.

- Rebased on Heikki's copy.c split;
- Rename structures without "Copy" prefix;
- Move MultiInsert* from copyfrom.c to (tentatively) nodeModifyTable.h;
- Move cur_lineno and transition_capture into MultiInsertInfo;

This switches to multi insert after a configurable number of tuples.
If set to -1, that provides the historic behavior that bulk inserts
can leave behind many dirty buffers. Perhaps that should be the default.

I guess this shouldn't be in copy.h or in commands/* at all.
It'll be included by both: commands/copyfrom_internal.h and
executor/nodeModifyTable.h. Maybe it should go in util or lib...
I don't know how to do it without including executor.h, which seems
to be undesirable.

--
Justin

Attachments:

v6-0002-Make-INSERT-SELECT-use-multi_insert.patchtext/x-diff; charset=us-asciiDownload

From 6aee47afb4e0f0730dc03398782ec102a5253aad Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 24 Oct 2020 22:49:01 -0500
Subject: [PATCH v6 2/3] Make INSERT SELECT use multi_insert

TODO: CTAS and matview ?

Renames structures;
Move MultipleInsert functions from copyfrom.c to (tentatively) nodeModifyTable.h;
Move cur_lineno and transition_capture into MultiInsertInfo;

See also: 86b85044e823a304d2a265abc030254d39efe7df
---
 src/backend/commands/copyfrom.c          | 396 ++---------------------
 src/backend/commands/copyfromparse.c     |  10 +-
 src/backend/executor/execMain.c          |   2 +-
 src/backend/executor/execPartition.c     |   2 +-
 src/backend/executor/nodeModifyTable.c   | 110 +++++--
 src/include/commands/copyfrom_internal.h |   3 +-
 src/include/executor/nodeModifyTable.h   | 367 +++++++++++++++++++++
 src/include/nodes/execnodes.h            |  14 +-
 src/test/regress/expected/insert.out     |  16 +
 src/test/regress/sql/insert.sql          |   5 +
 src/tools/pgindent/typedefs.list         |   4 +-
 11 files changed, 515 insertions(+), 414 deletions(-)

diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 1b14e9a6eb..5d65458f3b 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -44,54 +44,6 @@
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
 
-/*
- * No more than this many tuples per CopyMultiInsertBuffer
- *
- * Caution: Don't make this too big, as we could end up with this many
- * CopyMultiInsertBuffer items stored in CopyMultiInsertInfo's
- * multiInsertBuffers list.  Increasing this can cause quadratic growth in
- * memory requirements during copies into partitioned tables with a large
- * number of partitions.
- */
-#define MAX_BUFFERED_TUPLES		1000
-
-/*
- * Flush buffers if there are >= this many bytes, as counted by the input
- * size, of tuples stored.
- */
-#define MAX_BUFFERED_BYTES		65535
-
-/* Trim the list of buffers back down to this number after flushing */
-#define MAX_PARTITION_BUFFERS	32
-
-/* Stores multi-insert data related to a single relation in CopyFrom. */
-typedef struct CopyMultiInsertBuffer
-{
-	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
-	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
-	BulkInsertState bistate;	/* BulkInsertState for this rel */
-	int			nused;			/* number of 'slots' containing tuples */
-	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
-												 * stream */
-} CopyMultiInsertBuffer;
-
-/*
- * Stores one or many CopyMultiInsertBuffers and details about the size and
- * number of tuples which are stored in them.  This allows multiple buffers to
- * exist at once when COPYing into a partitioned table.
- */
-typedef struct CopyMultiInsertInfo
-{
-	List	   *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
-	int			bufferedTuples; /* number of tuples buffered over all buffers */
-	int			bufferedBytes;	/* number of bytes from all buffered tuples */
-	CopyFromState	cstate;			/* Copy state for this CopyMultiInsertInfo */
-	EState	   *estate;			/* Executor state used for COPY */
-	CommandId	mycid;			/* Command Id used for COPY */
-	int			ti_options;		/* table insert options */
-} CopyMultiInsertInfo;
-
-
 /* non-export function prototypes */
 static char *limit_printout_length(const char *str);
 
@@ -109,7 +61,7 @@ CopyFromErrorCallback(void *arg)
 	char		curlineno_str[32];
 
 	snprintf(curlineno_str, sizeof(curlineno_str), UINT64_FORMAT,
-			 cstate->cur_lineno);
+			 cstate->miinfo->cur_lineno);
 
 	if (cstate->opts.binary)
 	{
@@ -204,317 +156,6 @@ limit_printout_length(const char *str)
 	return res;
 }
 
-/*
- * Allocate memory and initialize a new CopyMultiInsertBuffer for this
- * ResultRelInfo.
- */
-static CopyMultiInsertBuffer *
-CopyMultiInsertBufferInit(ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = (CopyMultiInsertBuffer *) palloc(sizeof(CopyMultiInsertBuffer));
-	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
-	buffer->resultRelInfo = rri;
-	buffer->bistate = GetBulkInsertState();
-	buffer->nused = 0;
-
-	return buffer;
-}
-
-/*
- * Make a new buffer for this ResultRelInfo.
- */
-static inline void
-CopyMultiInsertInfoSetupBuffer(CopyMultiInsertInfo *miinfo,
-							   ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = CopyMultiInsertBufferInit(rri);
-
-	/* Setup back-link so we can easily find this buffer again */
-	rri->ri_CopyMultiInsertBuffer = buffer;
-	/* Record that we're tracking this buffer */
-	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-}
-
-/*
- * Initialize an already allocated CopyMultiInsertInfo.
- *
- * If rri is a non-partitioned table then a CopyMultiInsertBuffer is set up
- * for that table.
- */
-static void
-CopyMultiInsertInfoInit(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						CopyFromState cstate, EState *estate, CommandId mycid,
-						int ti_options)
-{
-	miinfo->multiInsertBuffers = NIL;
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-	miinfo->cstate = cstate;
-	miinfo->estate = estate;
-	miinfo->mycid = mycid;
-	miinfo->ti_options = ti_options;
-
-	/*
-	 * Only setup the buffer when not dealing with a partitioned table.
-	 * Buffers for partitioned tables will just be setup when we need to send
-	 * tuples their way for the first time.
-	 */
-	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
-		CopyMultiInsertInfoSetupBuffer(miinfo, rri);
-}
-
-/*
- * Returns true if the buffers are full
- */
-static inline bool
-CopyMultiInsertInfoIsFull(CopyMultiInsertInfo *miinfo)
-{
-	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
-		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
-		return true;
-	return false;
-}
-
-/*
- * Returns true if we have no buffered tuples
- */
-static inline bool
-CopyMultiInsertInfoIsEmpty(CopyMultiInsertInfo *miinfo)
-{
-	return miinfo->bufferedTuples == 0;
-}
-
-/*
- * Write the tuples stored in 'buffer' out to the table.
- */
-static inline void
-CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo,
-						   CopyMultiInsertBuffer *buffer)
-{
-	MemoryContext oldcontext;
-	int			i;
-	uint64		save_cur_lineno;
-	CopyFromState	cstate = miinfo->cstate;
-	EState	   *estate = miinfo->estate;
-	CommandId	mycid = miinfo->mycid;
-	int			ti_options = miinfo->ti_options;
-	bool		line_buf_valid = cstate->line_buf_valid;
-	int			nused = buffer->nused;
-	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
-	TupleTableSlot **slots = buffer->slots;
-
-	/*
-	 * Print error context information correctly, if one of the operations
-	 * below fail.
-	 */
-	cstate->line_buf_valid = false;
-	save_cur_lineno = cstate->cur_lineno;
-
-	/*
-	 * table_multi_insert may leak memory, so switch to short-lived memory
-	 * context before calling it.
-	 */
-	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
-	table_multi_insert(resultRelInfo->ri_RelationDesc,
-					   slots,
-					   nused,
-					   mycid,
-					   ti_options,
-					   buffer->bistate);
-	MemoryContextSwitchTo(oldcontext);
-
-	for (i = 0; i < nused; i++)
-	{
-		/*
-		 * If there are any indexes, update them for all the inserted tuples,
-		 * and run AFTER ROW INSERT triggers.
-		 */
-		if (resultRelInfo->ri_NumIndices > 0)
-		{
-			List	   *recheckIndexes;
-
-			cstate->cur_lineno = buffer->linenos[i];
-			recheckIndexes =
-				ExecInsertIndexTuples(resultRelInfo,
-									  buffer->slots[i], estate, false, NULL,
-									  NIL);
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], recheckIndexes,
-								 cstate->transition_capture);
-			list_free(recheckIndexes);
-		}
-
-		/*
-		 * There's no indexes, but see if we need to run AFTER ROW INSERT
-		 * triggers anyway.
-		 */
-		else if (resultRelInfo->ri_TrigDesc != NULL &&
-				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
-				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
-		{
-			cstate->cur_lineno = buffer->linenos[i];
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], NIL, cstate->transition_capture);
-		}
-
-		ExecClearTuple(slots[i]);
-	}
-
-	/* Mark that all slots are free */
-	buffer->nused = 0;
-
-	/* reset cur_lineno and line_buf_valid to what they were */
-	cstate->line_buf_valid = line_buf_valid;
-	cstate->cur_lineno = save_cur_lineno;
-}
-
-/*
- * Drop used slots and free member for this buffer.
- *
- * The buffer must be flushed before cleanup.
- */
-static inline void
-CopyMultiInsertBufferCleanup(CopyMultiInsertInfo *miinfo,
-							 CopyMultiInsertBuffer *buffer)
-{
-	int			i;
-
-	/* Ensure buffer was flushed */
-	Assert(buffer->nused == 0);
-
-	/* Remove back-link to ourself */
-	buffer->resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
-
-	FreeBulkInsertState(buffer->bistate);
-
-	/* Since we only create slots on demand, just drop the non-null ones. */
-	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
-		ExecDropSingleTupleTableSlot(buffer->slots[i]);
-
-	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
-							 miinfo->ti_options);
-
-	pfree(buffer);
-}
-
-/*
- * Write out all stored tuples in all buffers out to the tables.
- *
- * Once flushed we also trim the tracked buffers list down to size by removing
- * the buffers created earliest first.
- *
- * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
- * used.  When cleaning up old buffers we'll never remove the one for
- * 'curr_rri'.
- */
-static inline void
-CopyMultiInsertInfoFlush(CopyMultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-	{
-		CopyMultiInsertBuffer *buffer = (CopyMultiInsertBuffer *) lfirst(lc);
-
-		CopyMultiInsertBufferFlush(miinfo, buffer);
-	}
-
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-
-	/*
-	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
-	 * remove buffers starting with the ones we created first.  It seems less
-	 * likely that these older ones will be needed than the ones that were
-	 * just created.
-	 */
-	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
-	{
-		CopyMultiInsertBuffer *buffer;
-
-		buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-
-		/*
-		 * We never want to remove the buffer that's currently being used, so
-		 * if we happen to find that then move it to the end of the list.
-		 */
-		if (buffer->resultRelInfo == curr_rri)
-		{
-			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-			buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-		}
-
-		CopyMultiInsertBufferCleanup(miinfo, buffer);
-		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-	}
-}
-
-/*
- * Cleanup allocated buffers and free memory
- */
-static inline void
-CopyMultiInsertInfoCleanup(CopyMultiInsertInfo *miinfo)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-		CopyMultiInsertBufferCleanup(miinfo, lfirst(lc));
-
-	list_free(miinfo->multiInsertBuffers);
-}
-
-/*
- * Get the next TupleTableSlot that the next tuple should be stored in.
- *
- * Callers must ensure that the buffer is not full.
- *
- * Note: 'miinfo' is unused but has been included for consistency with the
- * other functions in this area.
- */
-static inline TupleTableSlot *
-CopyMultiInsertInfoNextFreeSlot(CopyMultiInsertInfo *miinfo,
-								ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-	int			nused = buffer->nused;
-
-	Assert(buffer != NULL);
-	Assert(nused < MAX_BUFFERED_TUPLES);
-
-	if (buffer->slots[nused] == NULL)
-		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
-	return buffer->slots[nused];
-}
-
-/*
- * Record the previously reserved TupleTableSlot that was reserved by
- * CopyMultiInsertInfoNextFreeSlot as being consumed.
- */
-static inline void
-CopyMultiInsertInfoStore(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						 TupleTableSlot *slot, int tuplen, uint64 lineno)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-
-	Assert(buffer != NULL);
-	Assert(slot == buffer->slots[buffer->nused]);
-
-	/* Store the line number so we can properly report any errors later */
-	buffer->linenos[buffer->nused] = lineno;
-
-	/* Record this slot as being used */
-	buffer->nused++;
-
-	/* Update how many tuples are stored and their size */
-	miinfo->bufferedTuples++;
-	miinfo->bufferedBytes += tuplen;
-}
-
 /*
  * Copy FROM file to relation.
  */
@@ -536,7 +177,7 @@ CopyFrom(CopyFromState cstate)
 	int			ti_options = 0; /* start with default options for insert */
 	BulkInsertState bistate = NULL;
 	CopyInsertMethod insertMethod;
-	CopyMultiInsertInfo multiInsertInfo = {0};	/* pacify compiler */
+	MultiInsertInfo multiInsertInfo = {0};	/* pacify compiler */
 	uint64		processed = 0;
 	bool		has_before_insert_row_trig;
 	bool		has_instead_insert_row_trig;
@@ -723,7 +364,7 @@ CopyFrom(CopyFromState cstate)
 		 * For partitioned tables we can't support multi-inserts when there
 		 * are any statement level insert triggers. It might be possible to
 		 * allow partitioned tables with such triggers in the future, but for
-		 * now, CopyMultiInsertInfoFlush expects that any before row insert
+		 * now, MultiInsertInfoFlush expects that any before row insert
 		 * and statement level insert triggers are on the same relation.
 		 */
 		insertMethod = CIM_SINGLE;
@@ -771,7 +412,8 @@ CopyFrom(CopyFromState cstate)
 		else
 			insertMethod = CIM_MULTI;
 
-		CopyMultiInsertInfoInit(&multiInsertInfo, resultRelInfo, cstate,
+		MultiInsertInfoInit(&multiInsertInfo, resultRelInfo,
+								cstate->transition_capture,
 								estate, mycid, ti_options);
 	}
 
@@ -809,6 +451,7 @@ CopyFrom(CopyFromState cstate)
 	errcallback.arg = (void *) cstate;
 	errcallback.previous = error_context_stack;
 	error_context_stack = &errcallback;
+	cstate->miinfo = &multiInsertInfo;
 
 	for (;;)
 	{
@@ -834,7 +477,7 @@ CopyFrom(CopyFromState cstate)
 			Assert(resultRelInfo == target_resultRelInfo);
 			Assert(insertMethod == CIM_MULTI);
 
-			myslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+			myslot = MultiInsertInfoNextFreeSlot(&multiInsertInfo,
 													 resultRelInfo);
 		}
 
@@ -903,18 +546,18 @@ CopyFrom(CopyFromState cstate)
 				/* Set the multi-insert buffer to use for this partition. */
 				if (leafpart_use_multi_insert)
 				{
-					if (resultRelInfo->ri_CopyMultiInsertBuffer == NULL)
-						CopyMultiInsertInfoSetupBuffer(&multiInsertInfo,
+					if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+						MultiInsertInfoSetupBuffer(&multiInsertInfo,
 													   resultRelInfo);
 				}
 				else if (insertMethod == CIM_MULTI_CONDITIONAL &&
-						 !CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
+						 !MultiInsertInfoIsEmpty(&multiInsertInfo))
 				{
 					/*
 					 * Flush pending inserts if this partition can't use
 					 * batching, so rows are visible to triggers etc.
 					 */
-					CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					MultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
 				}
 
 				if (bistate != NULL)
@@ -960,7 +603,7 @@ CopyFrom(CopyFromState cstate)
 				/* no other path available for partitioned table */
 				Assert(insertMethod == CIM_MULTI_CONDITIONAL);
 
-				batchslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+				batchslot = MultiInsertInfoNextFreeSlot(&multiInsertInfo,
 															resultRelInfo);
 
 				if (map != NULL)
@@ -1040,17 +683,17 @@ CopyFrom(CopyFromState cstate)
 					ExecMaterializeSlot(myslot);
 
 					/* Add this tuple to the tuple buffer */
-					CopyMultiInsertInfoStore(&multiInsertInfo,
+					MultiInsertInfoStore(&multiInsertInfo,
 											 resultRelInfo, myslot,
 											 cstate->line_buf.len,
-											 cstate->cur_lineno);
+											 multiInsertInfo.cur_lineno);
 
 					/*
 					 * If enough inserts have queued up, then flush all
 					 * buffers out to their tables.
 					 */
-					if (CopyMultiInsertInfoIsFull(&multiInsertInfo))
-						CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					if (MultiInsertInfoIsFull(&multiInsertInfo))
+						MultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
 				}
 				else
 				{
@@ -1109,8 +752,8 @@ CopyFrom(CopyFromState cstate)
 	/* Flush any remaining buffered tuples */
 	if (insertMethod != CIM_SINGLE)
 	{
-		if (!CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
-			CopyMultiInsertInfoFlush(&multiInsertInfo, NULL);
+		if (!MultiInsertInfoIsEmpty(&multiInsertInfo))
+			MultiInsertInfoFlush(&multiInsertInfo, NULL);
 	}
 
 	/* Done, clean up */
@@ -1144,7 +787,7 @@ CopyFrom(CopyFromState cstate)
 
 	/* Tear down the multi-insert buffer data */
 	if (insertMethod != CIM_SINGLE)
-		CopyMultiInsertInfoCleanup(&multiInsertInfo);
+		MultiInsertInfoCleanup(&multiInsertInfo);
 
 	/* Close all the partitioned tables, leaf partitions, and their indices */
 	if (proute)
@@ -1323,7 +966,6 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->reached_eof = false;
 	cstate->eol_type = EOL_UNKNOWN;
 	cstate->cur_relname = RelationGetRelationName(cstate->rel);
-	cstate->cur_lineno = 0;
 	cstate->cur_attname = NULL;
 	cstate->cur_attval = NULL;
 
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 2eb08a260c..6370be3641 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -456,14 +456,14 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
 	Assert(!cstate->opts.binary);
 
 	/* on input just throw the header line away */
-	if (cstate->cur_lineno == 0 && cstate->opts.header_line)
+	if (cstate->miinfo->cur_lineno == 0 && cstate->opts.header_line)
 	{
-		cstate->cur_lineno++;
+		cstate->miinfo->cur_lineno++;
 		if (CopyReadLine(cstate))
 			return false;		/* done */
 	}
 
-	cstate->cur_lineno++;
+	cstate->miinfo->cur_lineno++;
 
 	/* Actually read the line into memory here */
 	done = CopyReadLine(cstate);
@@ -605,7 +605,7 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 		int16		fld_count;
 		ListCell   *cur;
 
-		cstate->cur_lineno++;
+		cstate->miinfo->cur_lineno++;
 
 		if (!CopyGetInt16(cstate, &fld_count))
 		{
@@ -913,7 +913,7 @@ CopyReadLineText(CopyFromState cstate)
 			 * at all --- is cur_lineno a physical or logical count?)
 			 */
 			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
-				cstate->cur_lineno++;
+				cstate->miinfo->cur_lineno++;
 		}
 
 		/* Process \r */
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 7179f589f9..855a89b570 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1247,7 +1247,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
 													 * ExecInitRoutingInfo */
 	resultRelInfo->ri_PartitionTupleSlot = NULL;	/* ditto */
 	resultRelInfo->ri_ChildToRootMap = NULL;
-	resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	resultRelInfo->ri_MultiInsertBuffer = NULL;
 }
 
 /*
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 86594bd056..1f8ba785db 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -994,7 +994,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 		partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
 		partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);
 
-	partRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	partRelInfo->ri_MultiInsertBuffer = NULL;
 
 	/*
 	 * Keep track of it in the PartitionTupleRouting->partitions array.
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 26ff964105..f8cd537b24 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -43,6 +43,7 @@
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "commands/trigger.h"
+#include "commands/copy.h"
 #include "executor/execPartition.h"
 #include "executor/executor.h"
 #include "executor/nodeModifyTable.h"
@@ -362,6 +363,7 @@ ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 	MemoryContextSwitchTo(oldContext);
 }
 
+
 /* ----------------------------------------------------------------
  *		ExecInsert
  *
@@ -391,6 +393,7 @@ ExecInsert(ModifyTableState *mtstate,
 	ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
 	OnConflictAction onconflict = node->onConflictAction;
 	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+	TupleTableSlot *batchslot = NULL;
 
 	/*
 	 * If the input result relation is a partitioned table, find the leaf
@@ -631,6 +634,34 @@ ExecInsert(ModifyTableState *mtstate,
 
 			/* Since there was no insertion conflict, we're done */
 		}
+		else if (mtstate->miinfo != NULL &&
+			/* Has unsupported trigger? */
+				(resultRelInfo->ri_TrigDesc == NULL ||
+				 (
+				  // !resultRelInfo->ri_TrigDesc->trig_insert_before_row
+				  !resultRelInfo->ri_TrigDesc->trig_insert_after_row
+				  // && !resultRelInfo->ri_TrigDesc->trig_insert_instead_row
+				  && !resultRelInfo->ri_TrigDesc->trig_insert_after_statement
+				 )
+				) &&
+			/* Has Fdw? */
+				!resultRelInfo->ri_FdwRoutine
+				// !cstate->volatile_defexprs)
+				// !(contain_volatile_functions(cstate->whereClause))
+			)
+		{
+			if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+				MultiInsertInfoSetupBuffer(mtstate->miinfo,
+						resultRelInfo);
+
+			batchslot = MultiInsertInfoNextFreeSlot(mtstate->miinfo, resultRelInfo);
+			ExecCopySlot(batchslot, slot);
+
+			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot, 0, 0); // XXX: tuplen/lineno
+
+			if (MultiInsertInfoIsFull(mtstate->miinfo))
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+		}
 		else
 		{
 			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
@@ -640,6 +671,13 @@ ExecInsert(ModifyTableState *mtstate,
 				mtstate->prevResultRelInfo = resultRelInfo;
 			}
 
+			/*
+			 * Flush pending inserts if this partition can't use
+			 * batching, so rows are visible to triggers etc.
+			 */
+			if (mtstate->miinfo)
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
@@ -659,32 +697,36 @@ ExecInsert(ModifyTableState *mtstate,
 		setLastTid(&slot->tts_tid);
 	}
 
-	/*
-	 * If this insert is the result of a partition key update that moved the
-	 * tuple to a new partition, put this row into the transition NEW TABLE,
-	 * if there is one. We need to do this separately for DELETE and INSERT
-	 * because they happen on different tables.
-	 */
-	ar_insert_trig_tcs = mtstate->mt_transition_capture;
-	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
-		&& mtstate->mt_transition_capture->tcs_update_new_table)
+	/* Triggers were already run in the batch insert case */
+	if (batchslot == NULL)
 	{
-		ExecARUpdateTriggers(estate, resultRelInfo, NULL,
-							 NULL,
-							 slot,
-							 NULL,
-							 mtstate->mt_transition_capture);
-
 		/*
-		 * We've already captured the NEW TABLE row, so make sure any AR
-		 * INSERT trigger fired below doesn't capture it again.
+		 * If this insert is the result of a partition key update that moved the
+		 * tuple to a new partition, put this row into the transition NEW TABLE,
+		 * if there is one. We need to do this separately for DELETE and INSERT
+		 * because they happen on different tables.
 		 */
-		ar_insert_trig_tcs = NULL;
-	}
+		ar_insert_trig_tcs = mtstate->mt_transition_capture;
+		if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+			&& mtstate->mt_transition_capture->tcs_update_new_table)
+		{
+			ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+								 NULL,
+								 slot,
+								 NULL,
+								 mtstate->mt_transition_capture);
 
-	/* AFTER ROW INSERT Triggers */
-	ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
-						 ar_insert_trig_tcs);
+			/*
+			 * We've already captured the NEW TABLE row, so make sure any AR
+			 * INSERT trigger fired below doesn't capture it again.
+			 */
+			ar_insert_trig_tcs = NULL;
+		}
+
+		/* AFTER ROW INSERT Triggers */
+		ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
+							 ar_insert_trig_tcs);
+	}
 
 	list_free(recheckIndexes);
 
@@ -2242,9 +2284,26 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
 	mtstate->bistate = NULL;
+	mtstate->miinfo = NULL;
 	if (operation == CMD_INSERT && insert_in_bulk)
+	{
 		mtstate->bistate = GetBulkInsertState();
 
+		/*
+		 * For partitioned tables we don't support multi-inserts when there
+		 * are any statement level insert triggers.
+		 */
+		if (node->rootRelation == 0 ||
+			 !mtstate->rootResultRelInfo->ri_TrigDesc ||
+			 !mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_new_table)
+		{
+			mtstate->miinfo = calloc(sizeof(*mtstate->miinfo), 1);
+			MultiInsertInfoInit(mtstate->miinfo, mtstate->rootResultRelInfo,
+					 mtstate->mt_transition_capture,
+					 estate, GetCurrentCommandId(true), 0);
+		}
+	}
+
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
 	mtstate->fireBSTriggers = true;
@@ -2716,6 +2775,13 @@ ExecEndModifyTable(ModifyTableState *node)
 		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
 	}
 
+	if (node->miinfo)
+	{
+		if (!MultiInsertInfoIsEmpty(node->miinfo))
+			 MultiInsertInfoFlush(node->miinfo, node->resultRelInfo); // root ?
+		MultiInsertInfoCleanup(node->miinfo);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index c15ea803c3..c13768a9b2 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -15,6 +15,7 @@
 #define COPYFROM_INTERNAL_H
 
 #include "commands/copy.h"
+#include "executor/nodeModifyTable.h"
 #include "commands/trigger.h"
 
 /*
@@ -92,7 +93,7 @@ typedef struct CopyFromStateData
 
 	/* these are just for error messages, see CopyFromErrorCallback */
 	const char *cur_relname;	/* table name for error messages */
-	uint64		cur_lineno;		/* line number for error messages */
+	MultiInsertInfo *miinfo;	/* Link back to miinfo for error callback */
 	const char *cur_attname;	/* current att for error messages */
 	const char *cur_attval;		/* current att value for error messages */
 
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 09c312a052..80d82a9715 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -13,6 +13,8 @@
 #ifndef NODEMODIFYTABLE_H
 #define NODEMODIFYTABLE_H
 
+#include "commands/trigger.h"
+#include "executor/executor.h" // XXX
 #include "nodes/execnodes.h"
 
 extern PGDLLIMPORT bool insert_in_bulk;
@@ -25,4 +27,369 @@ extern ModifyTableState *ExecInitModifyTable(ModifyTable *node, EState *estate,
 extern void ExecEndModifyTable(ModifyTableState *node);
 extern void ExecReScanModifyTable(ModifyTableState *node);
 
+/* Bulk insert stuff which used to live in copy.c */
+
+/*
+ * No more than this many tuples per MultiInsertBuffer
+ *
+ * Caution: Don't make this too big, as we could end up with this many
+ * MultiInsertBuffer items stored in MultiInsertInfo's
+ * multiInsertBuffers list.  Increasing this can cause quadratic growth in
+ * memory requirements during copies into partitioned tables with a large
+ * number of partitions.
+ */
+#define MAX_BUFFERED_TUPLES		1000
+
+/*
+ * Flush buffers if there are >= this many bytes, as counted by the input
+ * size, of tuples stored.
+ */
+#define MAX_BUFFERED_BYTES		65535
+
+/* Trim the list of buffers back down to this number after flushing */
+#define MAX_PARTITION_BUFFERS	32
+
+/* Stores multi-insert data related to a single relation in CopyFrom. */
+typedef struct MultiInsertBuffer
+{
+	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
+	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
+	BulkInsertState bistate;	/* BulkInsertState for this rel */
+	int			nused;			/* number of 'slots' containing tuples */
+	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
+												 * stream */
+} MultiInsertBuffer;
+
+/*
+ * Stores one or many MultiInsertBuffers and details about the size and
+ * number of tuples which are stored in them.  This allows multiple buffers to
+ * exist at once when COPYing into a partitioned table.
+ */
+typedef struct MultiInsertInfo
+{
+	List	   *multiInsertBuffers; /* List of tracked MultiInsertBuffers */
+	int			bufferedTuples; /* number of tuples buffered over all buffers */
+	int			bufferedBytes;	/* number of bytes from all buffered tuples */
+	TransitionCaptureState	*transition_capture;
+	EState	   *estate;			/* Executor state used for COPY */
+	CommandId	mycid;			/* Command Id used for COPY */
+	int			ti_options;		/* table insert options */
+
+	/* Line number for errors in copyfrom.c */
+	uint64		cur_lineno;
+	bool		line_buf_valid;
+} MultiInsertInfo;
+
+
+/*
+ * Allocate memory and initialize a new MultiInsertBuffer for this
+ * ResultRelInfo.
+ */
+static MultiInsertBuffer *
+MultiInsertBufferInit(ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = (MultiInsertBuffer *) palloc(sizeof(MultiInsertBuffer));
+	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
+	buffer->resultRelInfo = rri;
+	buffer->bistate = GetBulkInsertState();
+	buffer->nused = 0;
+
+	return buffer;
+}
+
+/*
+ * Make a new buffer for this ResultRelInfo.
+ */
+static inline void
+MultiInsertInfoSetupBuffer(MultiInsertInfo *miinfo,
+							   ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = MultiInsertBufferInit(rri);
+
+	/* Setup back-link so we can easily find this buffer again */
+	rri->ri_MultiInsertBuffer = buffer;
+	/* Record that we're tracking this buffer */
+	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+}
+
+/*
+ * Initialize an already allocated MultiInsertInfo.
+ *
+ * If rri is a non-partitioned table then a MultiInsertBuffer is set up
+ * for that table.
+ */
+static inline void
+MultiInsertInfoInit(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						TransitionCaptureState *transition_capture,
+						EState *estate, CommandId mycid, int ti_options)
+{
+	miinfo->multiInsertBuffers = NIL;
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+	miinfo->transition_capture = transition_capture;
+	miinfo->estate = estate;
+	miinfo->mycid = mycid;
+	miinfo->ti_options = ti_options;
+	miinfo->cur_lineno = 0;
+
+	/*
+	 * Only setup the buffer when not dealing with a partitioned table.
+	 * Buffers for partitioned tables will just be setup when we need to send
+	 * tuples their way for the first time.
+	 */
+	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		MultiInsertInfoSetupBuffer(miinfo, rri);
+}
+
+/*
+ * Returns true if the buffers are full
+ */
+static inline bool
+MultiInsertInfoIsFull(MultiInsertInfo *miinfo)
+{
+	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
+		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
+		return true;
+	return false;
+}
+
+/*
+ * Returns true if we have no buffered tuples
+ */
+static inline bool
+MultiInsertInfoIsEmpty(MultiInsertInfo *miinfo)
+{
+	return miinfo->bufferedTuples == 0;
+}
+
+/*
+ * Write the tuples stored in 'buffer' out to the table.
+ */
+static inline void
+MultiInsertBufferFlush(MultiInsertInfo *miinfo,
+						   MultiInsertBuffer *buffer)
+{
+	MemoryContext oldcontext;
+	int			i;
+	uint64		save_cur_lineno;
+	EState	   *estate = miinfo->estate;
+	CommandId	mycid = miinfo->mycid;
+	int			ti_options = miinfo->ti_options;
+	bool		line_buf_valid = miinfo->line_buf_valid;
+	int			nused = buffer->nused;
+	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
+	TupleTableSlot **slots = buffer->slots;
+
+	/*
+	 * Print error context information correctly, if one of the operations
+	 * below fail.
+	 */
+	miinfo->line_buf_valid = false;
+	save_cur_lineno = miinfo->cur_lineno;
+
+	/*
+	 * table_multi_insert may leak memory, so switch to short-lived memory
+	 * context before calling it.
+	 */
+	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); // XXX requires executor.h
+	table_multi_insert(resultRelInfo->ri_RelationDesc,
+					   slots,
+					   nused,
+					   mycid,
+					   ti_options,
+					   buffer->bistate);
+	MemoryContextSwitchTo(oldcontext);
+
+	for (i = 0; i < nused; i++)
+	{
+		/*
+		 * If there are any indexes, update them for all the inserted tuples,
+		 * and run AFTER ROW INSERT triggers.
+		 */
+		if (resultRelInfo->ri_NumIndices > 0)
+		{
+			List	   *recheckIndexes;
+
+			miinfo->cur_lineno = buffer->linenos[i];
+			recheckIndexes =
+				ExecInsertIndexTuples(resultRelInfo,
+									  buffer->slots[i], estate, false, NULL,
+									  NIL);
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], recheckIndexes,
+								 miinfo->transition_capture);
+			list_free(recheckIndexes);
+		}
+
+		/*
+		 * There's no indexes, but see if we need to run AFTER ROW INSERT
+		 * triggers anyway.
+		 */
+		else if (resultRelInfo->ri_TrigDesc != NULL &&
+				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
+				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
+		{
+			miinfo->cur_lineno = buffer->linenos[i];
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], NIL, miinfo->transition_capture);
+		}
+
+		ExecClearTuple(slots[i]);
+	}
+
+	/* Mark that all slots are free */
+	buffer->nused = 0;
+
+	/* reset cur_lineno and line_buf_valid to what they were */
+	miinfo->line_buf_valid = line_buf_valid;
+	miinfo->cur_lineno = save_cur_lineno;
+}
+
+/*
+ * Drop used slots and free member for this buffer.
+ *
+ * The buffer must be flushed before cleanup.
+ */
+static inline void
+MultiInsertBufferCleanup(MultiInsertInfo *miinfo,
+							 MultiInsertBuffer *buffer)
+{
+	int			i;
+
+	/* Ensure buffer was flushed */
+	Assert(buffer->nused == 0);
+
+	/* Remove back-link to ourself */
+	buffer->resultRelInfo->ri_MultiInsertBuffer = NULL;
+
+	FreeBulkInsertState(buffer->bistate);
+
+	/* Since we only create slots on demand, just drop the non-null ones. */
+	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
+		ExecDropSingleTupleTableSlot(buffer->slots[i]);
+
+	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
+							 miinfo->ti_options);
+
+	pfree(buffer);
+}
+
+/*
+ * Write out all stored tuples in all buffers out to the tables.
+ *
+ * Once flushed we also trim the tracked buffers list down to size by removing
+ * the buffers created earliest first.
+ *
+ * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
+ * used.  When cleaning up old buffers we'll never remove the one for
+ * 'curr_rri'.
+ */
+static inline void
+MultiInsertInfoFlush(MultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+	{
+		MultiInsertBuffer *buffer = (MultiInsertBuffer *) lfirst(lc);
+
+		MultiInsertBufferFlush(miinfo, buffer);
+	}
+
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+
+	/*
+	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
+	 * remove buffers starting with the ones we created first.  It seems less
+	 * likely that these older ones will be needed than the ones that were
+	 * just created.
+	 */
+	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
+	{
+		MultiInsertBuffer *buffer;
+
+		buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+
+		/*
+		 * We never want to remove the buffer that's currently being used, so
+		 * if we happen to find that then move it to the end of the list.
+		 */
+		if (buffer->resultRelInfo == curr_rri)
+		{
+			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+			buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+		}
+
+		MultiInsertBufferCleanup(miinfo, buffer);
+		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+	}
+}
+
+/*
+ * Cleanup allocated buffers and free memory
+ */
+static inline void
+MultiInsertInfoCleanup(MultiInsertInfo *miinfo)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+		MultiInsertBufferCleanup(miinfo, lfirst(lc));
+
+	list_free(miinfo->multiInsertBuffers);
+}
+
+/*
+ * Get the next TupleTableSlot that the next tuple should be stored in.
+ *
+ * Callers must ensure that the buffer is not full.
+ *
+ * Note: 'miinfo' is unused but has been included for consistency with the
+ * other functions in this area.
+ */
+static inline TupleTableSlot *
+MultiInsertInfoNextFreeSlot(MultiInsertInfo *miinfo,
+								ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+	int			nused = buffer->nused;
+
+	Assert(buffer != NULL);
+	Assert(nused < MAX_BUFFERED_TUPLES);
+
+	if (buffer->slots[nused] == NULL)
+		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
+	return buffer->slots[nused];
+}
+
+/*
+ * Record the previously reserved TupleTableSlot that was reserved by
+ * MultiInsertInfoNextFreeSlot as being consumed.
+ */
+static inline void
+MultiInsertInfoStore(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						 TupleTableSlot *slot, int tuplen, uint64 lineno)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+
+	Assert(buffer != NULL);
+	Assert(slot == buffer->slots[buffer->nused]);
+
+	/* Store the line number so we can properly report any errors later */
+	buffer->linenos[buffer->nused] = lineno;
+
+	/* Record this slot as being used */
+	buffer->nused++;
+
+	/* Update how many tuples are stored and their size */
+	miinfo->bufferedTuples++;
+	miinfo->bufferedBytes += tuplen;
+}
+
 #endif							/* NODEMODIFYTABLE_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index c79d13aa44..985a98f854 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -33,6 +33,9 @@
 #include "utils/tuplesort.h"
 #include "utils/tuplestore.h"
 
+/* This would be a circular inclusion */
+// #include "executor/nodeModifyTable.h"
+
 struct PlanState;				/* forward references in this file */
 struct ParallelHashJoinState;
 struct ExecRowMark;
@@ -40,8 +43,8 @@ struct ExprState;
 struct ExprContext;
 struct RangeTblEntry;			/* avoid including parsenodes.h here */
 struct ExprEvalStep;			/* avoid including execExpr.h everywhere */
-struct CopyMultiInsertBuffer;
-
+// struct MultiInsertBuffer;
+// struct MultiInsertInfo;
 
 /* ----------------
  *		ExprState node
@@ -499,8 +502,8 @@ typedef struct ResultRelInfo
 	 */
 	TupleConversionMap *ri_ChildToRootMap;
 
-	/* for use by copyfrom.c when performing multi-inserts */
-	struct CopyMultiInsertBuffer *ri_CopyMultiInsertBuffer;
+	/* for use by copyfrom.c/modifyTable when performing multi-inserts */
+	struct MultiInsertBuffer *ri_MultiInsertBuffer;
 } ResultRelInfo;
 
 /* ----------------
@@ -1177,8 +1180,9 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
-	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT */
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT, when miinfo cannot be used */
 	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
+	struct MultiInsertInfo	*miinfo;
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index da0dae6240..e0c83d7427 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -485,6 +485,22 @@ select count(1) from hash_parted;
 (1 row)
 
 commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Index Only Scan using hpart1_a_idx on hpart1 hash_parted
+   Index Cond: (a = 13)
+(2 rows)
+
+select * from hash_parted where a=13;
+ a  
+----
+ 13
+(1 row)
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index d3a94f053b..99ec18d9a2 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -292,6 +292,11 @@ select * from hash_parted;
 insert into hash_parted select generate_series(1,9999);
 select count(1) from hash_parted;
 commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+select * from hash_parted where a=13;
 
 -- cleanup
 drop table range_parted, list_parted;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index fde701bfd4..1bc9db8af4 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -423,8 +423,6 @@ ConvertRowtypeExpr
 CookedConstraint
 CopyDest
 CopyInsertMethod
-CopyMultiInsertBuffer
-CopyMultiInsertInfo
 CopyState
 CopyStateData
 CopyStmt
@@ -1389,6 +1387,8 @@ ModifyTableState
 MorphOpaque
 MsgType
 MultiAssignRef
+MultiInsertBuffer
+MultiInsertInfo
 MultiSortSupport
 MultiSortSupportData
 MultiXactId
-- 
2.17.0

v6-0003-Dynamically-switch-to-multi-insert-mode.patchtext/x-diff; charset=us-asciiDownload

From 1605d4b12cc638ebcb80c044f0bc7ff55e119fa4 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Mon, 23 Nov 2020 17:57:24 -0600
Subject: [PATCH v6 3/3] Dynamically switch to multi-insert mode..

by popular request
---
 src/backend/executor/nodeModifyTable.c | 47 +++++++++++++++-----------
 src/backend/tcop/utility.c             |  4 ---
 src/backend/utils/misc/guc.c           | 19 +++++------
 src/include/executor/nodeModifyTable.h |  3 +-
 src/include/nodes/execnodes.h          |  1 +
 src/test/regress/expected/insert.out   |  4 +++
 src/test/regress/sql/insert.sql        |  2 ++
 7 files changed, 46 insertions(+), 34 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index f8cd537b24..96d5a5d164 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -74,7 +74,7 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 											   TupleTableSlot *slot,
 											   ResultRelInfo **partRelInfo);
 /* guc */
-bool insert_in_bulk = false;
+int bulk_insert_ntuples = 1000;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -413,6 +413,33 @@ ExecInsert(ModifyTableState *mtstate,
 
 	resultRelationDesc = resultRelInfo->ri_RelationDesc;
 
+	/* Use bulk insert after a threshold number of tuples */
+	mtstate->ntuples++;
+	if (mtstate->bistate == NULL &&
+			mtstate->operation == CMD_INSERT &&
+			onconflict == ONCONFLICT_NONE &&
+			mtstate->ntuples > bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+	{
+		elog(DEBUG1, "enabling bulk insert");
+		mtstate->bistate = GetBulkInsertState();
+
+		/*
+		 * For partitioned tables we don't support multi-inserts when there
+		 * are any statement level insert triggers.
+		 */
+		if (proute == 0 ||
+			 !mtstate->rootResultRelInfo->ri_TrigDesc ||
+			 !mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_new_table)
+		{
+			elog(DEBUG1, "enabling multi insert");
+			mtstate->miinfo = calloc(sizeof(*mtstate->miinfo), 1);
+			MultiInsertInfoInit(mtstate->miinfo, mtstate->rootResultRelInfo,
+					 mtstate->mt_transition_capture,
+					 estate, GetCurrentCommandId(true), 0);
+		}
+	}
+
 	/*
 	 * BEFORE ROW INSERT Triggers.
 	 *
@@ -2285,24 +2312,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 	mtstate->mt_nplans = nplans;
 	mtstate->bistate = NULL;
 	mtstate->miinfo = NULL;
-	if (operation == CMD_INSERT && insert_in_bulk)
-	{
-		mtstate->bistate = GetBulkInsertState();
-
-		/*
-		 * For partitioned tables we don't support multi-inserts when there
-		 * are any statement level insert triggers.
-		 */
-		if (node->rootRelation == 0 ||
-			 !mtstate->rootResultRelInfo->ri_TrigDesc ||
-			 !mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_new_table)
-		{
-			mtstate->miinfo = calloc(sizeof(*mtstate->miinfo), 1);
-			MultiInsertInfoInit(mtstate->miinfo, mtstate->rootResultRelInfo,
-					 mtstate->mt_transition_capture,
-					 estate, GetCurrentCommandId(true), 0);
-		}
-	}
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index a0a4034409..81ac9b1cb2 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -611,10 +611,6 @@ standard_ProcessUtility(PlannedStmt *pstmt,
 									SetPGVariable("transaction_deferrable",
 												  list_make1(item->arg),
 												  true);
-								else if (strcmp(item->defname, "bulk") == 0)
-									SetPGVariable("bulk_insert",
-												  list_make1(item->arg),
-												  true);
 							}
 						}
 						break;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 170c96e749..3ed6b0650c 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2037,16 +2037,6 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
-	{
-		{"bulk_insert", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets the transaction to bulk insert mode."),
-			gettext_noop("A ring buffer of limited size will be used."),
-		},
-		&insert_in_bulk,
-		false,
-		NULL, NULL, NULL
-	},
-
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
@@ -3410,6 +3400,15 @@ static struct config_int ConfigureNamesInt[] =
 		check_huge_page_size, NULL, NULL
 	},
 
+	{
+		{"bulk_insert_ntuples", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Enable bulk insertions after this number of tuples."),
+			gettext_noop("A ring buffer of limited size will be used and updates done in batch"),
+		},
+		&bulk_insert_ntuples,
+		1000, -1, INT_MAX,
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 80d82a9715..16a04151c8 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -17,7 +17,7 @@
 #include "executor/executor.h" // XXX
 #include "nodes/execnodes.h"
 
-extern PGDLLIMPORT bool insert_in_bulk;
+extern PGDLLIMPORT int bulk_insert_ntuples;
 
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
@@ -74,6 +74,7 @@ typedef struct MultiInsertInfo
 	EState	   *estate;			/* Executor state used for COPY */
 	CommandId	mycid;			/* Command Id used for COPY */
 	int			ti_options;		/* table insert options */
+	// size_t		ntuples;		/* Number of rows inserted */
 
 	/* Line number for errors in copyfrom.c */
 	uint64		cur_lineno;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 985a98f854..535380d276 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1183,6 +1183,7 @@ typedef struct ModifyTableState
 	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT, when miinfo cannot be used */
 	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
 	struct MultiInsertInfo	*miinfo;
+	size_t		ntuples;	/* Number of tuples inserted; XXX: could be in miinfo ? */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index e0c83d7427..b894180152 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -477,7 +477,11 @@ select * from hash_parted;
 (2 rows)
 
 -- exercise bulk insert to partitions
+SET client_min_messages=debug;
 insert into hash_parted select generate_series(1,9999);
+DEBUG:  enabling bulk insert
+DEBUG:  enabling multi insert
+RESET client_min_messages;
 select count(1) from hash_parted;
  count 
 -------
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 99ec18d9a2..ff9b57af5e 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -289,7 +289,9 @@ insert into hash_parted values(11);
 insert into hpart0 values(12);
 select * from hash_parted;
 -- exercise bulk insert to partitions
+SET client_min_messages=debug;
 insert into hash_parted select generate_series(1,9999);
+RESET client_min_messages;
 select count(1) from hash_parted;
 commit;
 -- test that index was updated
-- 
2.17.0

v6-0001-Allow-INSERT-SELECT-to-use-a-BulkInsertState.patchtext/x-diff; charset=us-asciiDownload

From d67c82e870c640e6f4ba25b3da5acf54df7165d2 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v6 1/3] Allow INSERT SELECT to use a BulkInsertState

---
 src/backend/executor/nodeModifyTable.c | 22 ++++++++++++++++++++--
 src/backend/parser/gram.y              |  7 ++++++-
 src/backend/tcop/utility.c             |  4 ++++
 src/backend/utils/misc/guc.c           | 11 +++++++++++
 src/include/executor/nodeModifyTable.h |  2 ++
 src/include/nodes/execnodes.h          |  3 +++
 src/include/parser/kwlist.h            |  1 +
 src/test/regress/expected/insert.out   | 23 +++++++++++++++++++++++
 src/test/regress/sql/insert.sql        | 13 +++++++++++++
 9 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 29e07b7228..26ff964105 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -72,6 +72,8 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 											   ResultRelInfo *targetRelInfo,
 											   TupleTableSlot *slot,
 											   ResultRelInfo **partRelInfo);
+/* guc */
+bool insert_in_bulk = false;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -594,7 +596,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   NULL, /* Bulk insert not supported */
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -631,10 +633,17 @@ ExecInsert(ModifyTableState *mtstate,
 		}
 		else
 		{
+			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
+			{
+				if (mtstate->bistate)
+					ReleaseBulkInsertStatePin(mtstate->bistate);
+				mtstate->prevResultRelInfo = resultRelInfo;
+			}
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -2232,6 +2241,9 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+	if (operation == CMD_INSERT && insert_in_bulk)
+		mtstate->bistate = GetBulkInsertState();
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2698,6 +2710,12 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index efc9c99754..5915c8c414 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -631,7 +631,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 	ASSERTION ASSIGNMENT ASYMMETRIC AT ATTACH ATTRIBUTE AUTHORIZATION
 
 	BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
-	BOOLEAN_P BOTH BY
+	BOOLEAN_P BOTH BULK BY
 
 	CACHE CALL CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
 	CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
@@ -9886,6 +9886,9 @@ transaction_mode_item:
 			| NOT DEFERRABLE
 					{ $$ = makeDefElem("transaction_deferrable",
 									   makeIntConst(false, @1), @1); }
+			| BULK
+					{ $$ = makeDefElem("bulk",
+									   makeIntConst(true, @1), @1); }
 		;
 
 /* Syntax with commas is SQL-spec, without commas is Postgres historical */
@@ -15157,6 +15160,7 @@ unreserved_keyword:
 			| BACKWARD
 			| BEFORE
 			| BEGIN_P
+			| BULK
 			| BY
 			| CACHE
 			| CALL
@@ -15668,6 +15672,7 @@ bare_label_keyword:
 			| BIT
 			| BOOLEAN_P
 			| BOTH
+			| BULK
 			| BY
 			| CACHE
 			| CALL
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 81ac9b1cb2..a0a4034409 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -611,6 +611,10 @@ standard_ProcessUtility(PlannedStmt *pstmt,
 									SetPGVariable("transaction_deferrable",
 												  list_make1(item->arg),
 												  true);
+								else if (strcmp(item->defname, "bulk") == 0)
+									SetPGVariable("bulk_insert",
+												  list_make1(item->arg),
+												  true);
 							}
 						}
 						break;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index bb34630e8e..170c96e749 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -54,6 +54,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "executor/nodeModifyTable.h"
 #include "optimizer/geqo.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paths.h"
@@ -2036,6 +2037,16 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"bulk_insert", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Sets the transaction to bulk insert mode."),
+			gettext_noop("A ring buffer of limited size will be used."),
+		},
+		&insert_in_bulk,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 46a2dc9511..09c312a052 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -15,6 +15,8 @@
 
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT bool insert_in_bulk;
+
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
 									   CmdType cmdtype);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index f6824bf2e1..c79d13aa44 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -1176,6 +1177,8 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT */
+	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 71dcdf2889..0991da11e7 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -60,6 +60,7 @@ PG_KEYWORD("binary", BINARY, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("bit", BIT, COL_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("boolean", BOOLEAN_P, COL_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("both", BOTH, RESERVED_KEYWORD, BARE_LABEL)
+PG_KEYWORD("bulk", BULK, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("by", BY, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD, BARE_LABEL)
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index da50ee3b67..da0dae6240 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -462,6 +462,29 @@ Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'),
             part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED,
             part_default DEFAULT, PARTITIONED
 
+-- bulk inserts
+truncate hash_parted;
+begin bulk;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+ a  
+----
+ 12
+ 11
+(2 rows)
+
+-- exercise bulk insert to partitions
+insert into hash_parted select generate_series(1,9999);
+select count(1) from hash_parted;
+ count 
+-------
+ 10001
+(1 row)
+
+commit;
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 963faa1614..d3a94f053b 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -280,6 +280,19 @@ from hash_parted order by part;
 -- partitions
 \d+ list_parted
 
+-- bulk inserts
+truncate hash_parted;
+begin bulk;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+-- exercise bulk insert to partitions
+insert into hash_parted select generate_series(1,9999);
+select count(1) from hash_parted;
+commit;
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
-- 
2.17.0

pryzby@telsasoft.com

about 5 years ago

In reply to: Justin Pryzby (#14)

3 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Mon, Nov 23, 2020 at 08:00:20PM -0600, Justin Pryzby wrote:

On Mon, Nov 02, 2020 at 12:45:51PM -0600, Justin Pryzby wrote:

On Mon, Nov 02, 2020 at 07:53:45AM +0100, Luc Vlaming wrote:

On 30.10.20 05:51, Justin Pryzby wrote:

On Thu, Oct 22, 2020 at 01:29:53PM +0100, Simon Riggs wrote:

On Fri, 16 Oct 2020 at 22:05, Justin Pryzby <pryzby@telsasoft.com> wrote:

I made this conditional on BEGIN BULK/SET bulk, so I'll solicit comments on that.

I think it would be better if this was self-tuning. So that we don't
allocate a bulkinsert state until we've done say 100 (?) rows
inserted.

I made it an optional, non-default behavior in response to the legitimate
concern for performance regression for the cases where a loader needs to be as
fast as possible - as compared with our case, where we want instead to optimize
for our reports by making the loaders responsible for their own writes, rather
than leaving behind many dirty pages, and clobbering the cache, too.

Also, INSERT SELECT doesn't immediately help us (telsasoft), since we use
INSERT .. VALUES () .. ON CONFLICT. This would handle that case, which is
great, even though that wasn't a design goal. It could also be an integer GUC
to allow configuring the size of the ring buffer.

You should also use table_multi_insert() since that will give further
performance gains by reducing block access overheads. Switching from
single row to multi-row should also only happen once we've loaded a
few rows, so we don't introduce overahads for smaller SQL statements.

Good idea...multi_insert (which reduces the overhead of individual inserts) is
mostly independent from BulkInsert state (which uses a ring-buffer to avoid
dirtying the cache). I made this 0002.

This makes INSERT SELECT several times faster, and not clobber the cache too.

- Rebased on Heikki's copy.c split;
- Rename structures without "Copy" prefix;
- Move MultiInsert* from copyfrom.c to (tentatively) nodeModifyTable.h;
- Move cur_lineno and transition_capture into MultiInsertInfo;

This switches to multi insert after a configurable number of tuples.
If set to -1, that provides the historic behavior that bulk inserts
can leave behind many dirty buffers. Perhaps that should be the default.

I guess this shouldn't be in copy.h or in commands/* at all.
It'll be included by both: commands/copyfrom_internal.h and
executor/nodeModifyTable.h. Maybe it should go in util or lib...
I don't know how to do it without including executor.h, which seems
to be undesirable.

Attached resolves issue with FDW contrib by including the MultiInsertInfo
structure rather than a pointer and makes the logic more closely match
copyfrom.c related to partition/triggers.

I had made this a conditional based on the concern that bulk insert state would
cause regression. But then it occurred to me that COPY uses a bulk insert
unconditionally. Should COPY be conditional, too ? Or maybe that's ok, since
COPY is assumed to be a bulk operation.

--
Justin

Attachments:

v7-0001-Allow-INSERT-SELECT-to-use-a-BulkInsertState.patchtext/x-diff; charset=us-asciiDownload

From 886709926523f480255b4897d5bb08984be26a29 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v7 1/3] Allow INSERT SELECT to use a BulkInsertState

---
 src/backend/executor/nodeModifyTable.c | 22 ++++++++++++++++++++--
 src/backend/parser/gram.y              |  7 ++++++-
 src/backend/tcop/utility.c             |  4 ++++
 src/backend/utils/misc/guc.c           | 11 +++++++++++
 src/include/executor/nodeModifyTable.h |  2 ++
 src/include/nodes/execnodes.h          |  3 +++
 src/include/parser/kwlist.h            |  1 +
 src/test/regress/expected/insert.out   | 23 +++++++++++++++++++++++
 src/test/regress/sql/insert.sql        | 13 +++++++++++++
 9 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index e0f24283b8..f65ae2c0d6 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -72,6 +72,8 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 											   ResultRelInfo *targetRelInfo,
 											   TupleTableSlot *slot,
 											   ResultRelInfo **partRelInfo);
+/* guc */
+bool insert_in_bulk = false;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -594,7 +596,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   NULL, /* Bulk insert not supported */
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -631,10 +633,17 @@ ExecInsert(ModifyTableState *mtstate,
 		}
 		else
 		{
+			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
+			{
+				if (mtstate->bistate)
+					ReleaseBulkInsertStatePin(mtstate->bistate);
+				mtstate->prevResultRelInfo = resultRelInfo;
+			}
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -2229,6 +2238,9 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+	if (operation == CMD_INSERT && insert_in_bulk)
+		mtstate->bistate = GetBulkInsertState();
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2695,6 +2707,12 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index efc9c99754..5915c8c414 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -631,7 +631,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 	ASSERTION ASSIGNMENT ASYMMETRIC AT ATTACH ATTRIBUTE AUTHORIZATION
 
 	BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
-	BOOLEAN_P BOTH BY
+	BOOLEAN_P BOTH BULK BY
 
 	CACHE CALL CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
 	CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
@@ -9886,6 +9886,9 @@ transaction_mode_item:
 			| NOT DEFERRABLE
 					{ $$ = makeDefElem("transaction_deferrable",
 									   makeIntConst(false, @1), @1); }
+			| BULK
+					{ $$ = makeDefElem("bulk",
+									   makeIntConst(true, @1), @1); }
 		;
 
 /* Syntax with commas is SQL-spec, without commas is Postgres historical */
@@ -15157,6 +15160,7 @@ unreserved_keyword:
 			| BACKWARD
 			| BEFORE
 			| BEGIN_P
+			| BULK
 			| BY
 			| CACHE
 			| CALL
@@ -15668,6 +15672,7 @@ bare_label_keyword:
 			| BIT
 			| BOOLEAN_P
 			| BOTH
+			| BULK
 			| BY
 			| CACHE
 			| CALL
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 81ac9b1cb2..a0a4034409 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -611,6 +611,10 @@ standard_ProcessUtility(PlannedStmt *pstmt,
 									SetPGVariable("transaction_deferrable",
 												  list_make1(item->arg),
 												  true);
+								else if (strcmp(item->defname, "bulk") == 0)
+									SetPGVariable("bulk_insert",
+												  list_make1(item->arg),
+												  true);
 							}
 						}
 						break;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 245a3472bc..c470314134 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -54,6 +54,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "executor/nodeModifyTable.h"
 #include "optimizer/geqo.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paths.h"
@@ -2036,6 +2037,16 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"bulk_insert", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Sets the transaction to bulk insert mode."),
+			gettext_noop("A ring buffer of limited size will be used."),
+		},
+		&insert_in_bulk,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 46a2dc9511..09c312a052 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -15,6 +15,8 @@
 
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT bool insert_in_bulk;
+
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
 									   CmdType cmdtype);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 61ba4c3666..024b0e4da4 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -1165,6 +1166,8 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT */
+	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 71dcdf2889..0991da11e7 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -60,6 +60,7 @@ PG_KEYWORD("binary", BINARY, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("bit", BIT, COL_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("boolean", BOOLEAN_P, COL_NAME_KEYWORD, BARE_LABEL)
 PG_KEYWORD("both", BOTH, RESERVED_KEYWORD, BARE_LABEL)
+PG_KEYWORD("bulk", BULK, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("by", BY, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD, BARE_LABEL)
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index da50ee3b67..da0dae6240 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -462,6 +462,29 @@ Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'),
             part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED,
             part_default DEFAULT, PARTITIONED
 
+-- bulk inserts
+truncate hash_parted;
+begin bulk;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+ a  
+----
+ 12
+ 11
+(2 rows)
+
+-- exercise bulk insert to partitions
+insert into hash_parted select generate_series(1,9999);
+select count(1) from hash_parted;
+ count 
+-------
+ 10001
+(1 row)
+
+commit;
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 963faa1614..d3a94f053b 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -280,6 +280,19 @@ from hash_parted order by part;
 -- partitions
 \d+ list_parted
 
+-- bulk inserts
+truncate hash_parted;
+begin bulk;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+-- exercise bulk insert to partitions
+insert into hash_parted select generate_series(1,9999);
+select count(1) from hash_parted;
+commit;
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
-- 
2.17.0

v7-0002-Make-INSERT-SELECT-use-multi_insert.patchtext/x-diff; charset=us-asciiDownload

From e7d23b52c67a8ffb471c9f21f38ab20816ac2309 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 24 Oct 2020 22:49:01 -0500
Subject: [PATCH v7 2/3] Make INSERT SELECT use multi_insert

Renames structures;
Move MultipleInsert functions from copyfrom.c to (tentatively) nodeModifyTable.h;
Move into MultiInsertInfo: transition_capture and cur_lineno (via cstate->miinfo);

See also: 86b85044e823a304d2a265abc030254d39efe7df
---
 src/backend/commands/copyfrom.c          | 394 +----------------------
 src/backend/commands/copyfromparse.c     |  10 +-
 src/backend/executor/execMain.c          |   2 +-
 src/backend/executor/execPartition.c     |   2 +-
 src/backend/executor/nodeModifyTable.c   | 156 +++++++--
 src/include/commands/copyfrom_internal.h |   5 +-
 src/include/executor/nodeModifyTable.h   | 367 +++++++++++++++++++++
 src/include/nodes/execnodes.h            |  14 +-
 src/test/regress/expected/insert.out     |  16 +
 src/test/regress/sql/insert.sql          |   5 +
 src/tools/pgindent/typedefs.list         |   4 +-
 11 files changed, 559 insertions(+), 416 deletions(-)

diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 1b14e9a6eb..c4fe75df8e 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -44,54 +44,6 @@
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
 
-/*
- * No more than this many tuples per CopyMultiInsertBuffer
- *
- * Caution: Don't make this too big, as we could end up with this many
- * CopyMultiInsertBuffer items stored in CopyMultiInsertInfo's
- * multiInsertBuffers list.  Increasing this can cause quadratic growth in
- * memory requirements during copies into partitioned tables with a large
- * number of partitions.
- */
-#define MAX_BUFFERED_TUPLES		1000
-
-/*
- * Flush buffers if there are >= this many bytes, as counted by the input
- * size, of tuples stored.
- */
-#define MAX_BUFFERED_BYTES		65535
-
-/* Trim the list of buffers back down to this number after flushing */
-#define MAX_PARTITION_BUFFERS	32
-
-/* Stores multi-insert data related to a single relation in CopyFrom. */
-typedef struct CopyMultiInsertBuffer
-{
-	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
-	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
-	BulkInsertState bistate;	/* BulkInsertState for this rel */
-	int			nused;			/* number of 'slots' containing tuples */
-	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
-												 * stream */
-} CopyMultiInsertBuffer;
-
-/*
- * Stores one or many CopyMultiInsertBuffers and details about the size and
- * number of tuples which are stored in them.  This allows multiple buffers to
- * exist at once when COPYing into a partitioned table.
- */
-typedef struct CopyMultiInsertInfo
-{
-	List	   *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
-	int			bufferedTuples; /* number of tuples buffered over all buffers */
-	int			bufferedBytes;	/* number of bytes from all buffered tuples */
-	CopyFromState	cstate;			/* Copy state for this CopyMultiInsertInfo */
-	EState	   *estate;			/* Executor state used for COPY */
-	CommandId	mycid;			/* Command Id used for COPY */
-	int			ti_options;		/* table insert options */
-} CopyMultiInsertInfo;
-
-
 /* non-export function prototypes */
 static char *limit_printout_length(const char *str);
 
@@ -109,7 +61,7 @@ CopyFromErrorCallback(void *arg)
 	char		curlineno_str[32];
 
 	snprintf(curlineno_str, sizeof(curlineno_str), UINT64_FORMAT,
-			 cstate->cur_lineno);
+			 cstate->miinfo.cur_lineno);
 
 	if (cstate->opts.binary)
 	{
@@ -204,317 +156,6 @@ limit_printout_length(const char *str)
 	return res;
 }
 
-/*
- * Allocate memory and initialize a new CopyMultiInsertBuffer for this
- * ResultRelInfo.
- */
-static CopyMultiInsertBuffer *
-CopyMultiInsertBufferInit(ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = (CopyMultiInsertBuffer *) palloc(sizeof(CopyMultiInsertBuffer));
-	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
-	buffer->resultRelInfo = rri;
-	buffer->bistate = GetBulkInsertState();
-	buffer->nused = 0;
-
-	return buffer;
-}
-
-/*
- * Make a new buffer for this ResultRelInfo.
- */
-static inline void
-CopyMultiInsertInfoSetupBuffer(CopyMultiInsertInfo *miinfo,
-							   ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = CopyMultiInsertBufferInit(rri);
-
-	/* Setup back-link so we can easily find this buffer again */
-	rri->ri_CopyMultiInsertBuffer = buffer;
-	/* Record that we're tracking this buffer */
-	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-}
-
-/*
- * Initialize an already allocated CopyMultiInsertInfo.
- *
- * If rri is a non-partitioned table then a CopyMultiInsertBuffer is set up
- * for that table.
- */
-static void
-CopyMultiInsertInfoInit(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						CopyFromState cstate, EState *estate, CommandId mycid,
-						int ti_options)
-{
-	miinfo->multiInsertBuffers = NIL;
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-	miinfo->cstate = cstate;
-	miinfo->estate = estate;
-	miinfo->mycid = mycid;
-	miinfo->ti_options = ti_options;
-
-	/*
-	 * Only setup the buffer when not dealing with a partitioned table.
-	 * Buffers for partitioned tables will just be setup when we need to send
-	 * tuples their way for the first time.
-	 */
-	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
-		CopyMultiInsertInfoSetupBuffer(miinfo, rri);
-}
-
-/*
- * Returns true if the buffers are full
- */
-static inline bool
-CopyMultiInsertInfoIsFull(CopyMultiInsertInfo *miinfo)
-{
-	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
-		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
-		return true;
-	return false;
-}
-
-/*
- * Returns true if we have no buffered tuples
- */
-static inline bool
-CopyMultiInsertInfoIsEmpty(CopyMultiInsertInfo *miinfo)
-{
-	return miinfo->bufferedTuples == 0;
-}
-
-/*
- * Write the tuples stored in 'buffer' out to the table.
- */
-static inline void
-CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo,
-						   CopyMultiInsertBuffer *buffer)
-{
-	MemoryContext oldcontext;
-	int			i;
-	uint64		save_cur_lineno;
-	CopyFromState	cstate = miinfo->cstate;
-	EState	   *estate = miinfo->estate;
-	CommandId	mycid = miinfo->mycid;
-	int			ti_options = miinfo->ti_options;
-	bool		line_buf_valid = cstate->line_buf_valid;
-	int			nused = buffer->nused;
-	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
-	TupleTableSlot **slots = buffer->slots;
-
-	/*
-	 * Print error context information correctly, if one of the operations
-	 * below fail.
-	 */
-	cstate->line_buf_valid = false;
-	save_cur_lineno = cstate->cur_lineno;
-
-	/*
-	 * table_multi_insert may leak memory, so switch to short-lived memory
-	 * context before calling it.
-	 */
-	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
-	table_multi_insert(resultRelInfo->ri_RelationDesc,
-					   slots,
-					   nused,
-					   mycid,
-					   ti_options,
-					   buffer->bistate);
-	MemoryContextSwitchTo(oldcontext);
-
-	for (i = 0; i < nused; i++)
-	{
-		/*
-		 * If there are any indexes, update them for all the inserted tuples,
-		 * and run AFTER ROW INSERT triggers.
-		 */
-		if (resultRelInfo->ri_NumIndices > 0)
-		{
-			List	   *recheckIndexes;
-
-			cstate->cur_lineno = buffer->linenos[i];
-			recheckIndexes =
-				ExecInsertIndexTuples(resultRelInfo,
-									  buffer->slots[i], estate, false, NULL,
-									  NIL);
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], recheckIndexes,
-								 cstate->transition_capture);
-			list_free(recheckIndexes);
-		}
-
-		/*
-		 * There's no indexes, but see if we need to run AFTER ROW INSERT
-		 * triggers anyway.
-		 */
-		else if (resultRelInfo->ri_TrigDesc != NULL &&
-				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
-				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
-		{
-			cstate->cur_lineno = buffer->linenos[i];
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], NIL, cstate->transition_capture);
-		}
-
-		ExecClearTuple(slots[i]);
-	}
-
-	/* Mark that all slots are free */
-	buffer->nused = 0;
-
-	/* reset cur_lineno and line_buf_valid to what they were */
-	cstate->line_buf_valid = line_buf_valid;
-	cstate->cur_lineno = save_cur_lineno;
-}
-
-/*
- * Drop used slots and free member for this buffer.
- *
- * The buffer must be flushed before cleanup.
- */
-static inline void
-CopyMultiInsertBufferCleanup(CopyMultiInsertInfo *miinfo,
-							 CopyMultiInsertBuffer *buffer)
-{
-	int			i;
-
-	/* Ensure buffer was flushed */
-	Assert(buffer->nused == 0);
-
-	/* Remove back-link to ourself */
-	buffer->resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
-
-	FreeBulkInsertState(buffer->bistate);
-
-	/* Since we only create slots on demand, just drop the non-null ones. */
-	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
-		ExecDropSingleTupleTableSlot(buffer->slots[i]);
-
-	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
-							 miinfo->ti_options);
-
-	pfree(buffer);
-}
-
-/*
- * Write out all stored tuples in all buffers out to the tables.
- *
- * Once flushed we also trim the tracked buffers list down to size by removing
- * the buffers created earliest first.
- *
- * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
- * used.  When cleaning up old buffers we'll never remove the one for
- * 'curr_rri'.
- */
-static inline void
-CopyMultiInsertInfoFlush(CopyMultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-	{
-		CopyMultiInsertBuffer *buffer = (CopyMultiInsertBuffer *) lfirst(lc);
-
-		CopyMultiInsertBufferFlush(miinfo, buffer);
-	}
-
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-
-	/*
-	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
-	 * remove buffers starting with the ones we created first.  It seems less
-	 * likely that these older ones will be needed than the ones that were
-	 * just created.
-	 */
-	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
-	{
-		CopyMultiInsertBuffer *buffer;
-
-		buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-
-		/*
-		 * We never want to remove the buffer that's currently being used, so
-		 * if we happen to find that then move it to the end of the list.
-		 */
-		if (buffer->resultRelInfo == curr_rri)
-		{
-			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-			buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-		}
-
-		CopyMultiInsertBufferCleanup(miinfo, buffer);
-		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-	}
-}
-
-/*
- * Cleanup allocated buffers and free memory
- */
-static inline void
-CopyMultiInsertInfoCleanup(CopyMultiInsertInfo *miinfo)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-		CopyMultiInsertBufferCleanup(miinfo, lfirst(lc));
-
-	list_free(miinfo->multiInsertBuffers);
-}
-
-/*
- * Get the next TupleTableSlot that the next tuple should be stored in.
- *
- * Callers must ensure that the buffer is not full.
- *
- * Note: 'miinfo' is unused but has been included for consistency with the
- * other functions in this area.
- */
-static inline TupleTableSlot *
-CopyMultiInsertInfoNextFreeSlot(CopyMultiInsertInfo *miinfo,
-								ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-	int			nused = buffer->nused;
-
-	Assert(buffer != NULL);
-	Assert(nused < MAX_BUFFERED_TUPLES);
-
-	if (buffer->slots[nused] == NULL)
-		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
-	return buffer->slots[nused];
-}
-
-/*
- * Record the previously reserved TupleTableSlot that was reserved by
- * CopyMultiInsertInfoNextFreeSlot as being consumed.
- */
-static inline void
-CopyMultiInsertInfoStore(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						 TupleTableSlot *slot, int tuplen, uint64 lineno)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-
-	Assert(buffer != NULL);
-	Assert(slot == buffer->slots[buffer->nused]);
-
-	/* Store the line number so we can properly report any errors later */
-	buffer->linenos[buffer->nused] = lineno;
-
-	/* Record this slot as being used */
-	buffer->nused++;
-
-	/* Update how many tuples are stored and their size */
-	miinfo->bufferedTuples++;
-	miinfo->bufferedBytes += tuplen;
-}
-
 /*
  * Copy FROM file to relation.
  */
@@ -536,7 +177,6 @@ CopyFrom(CopyFromState cstate)
 	int			ti_options = 0; /* start with default options for insert */
 	BulkInsertState bistate = NULL;
 	CopyInsertMethod insertMethod;
-	CopyMultiInsertInfo multiInsertInfo = {0};	/* pacify compiler */
 	uint64		processed = 0;
 	bool		has_before_insert_row_trig;
 	bool		has_instead_insert_row_trig;
@@ -723,7 +363,7 @@ CopyFrom(CopyFromState cstate)
 		 * For partitioned tables we can't support multi-inserts when there
 		 * are any statement level insert triggers. It might be possible to
 		 * allow partitioned tables with such triggers in the future, but for
-		 * now, CopyMultiInsertInfoFlush expects that any before row insert
+		 * now, MultiInsertInfoFlush expects that any before row insert
 		 * and statement level insert triggers are on the same relation.
 		 */
 		insertMethod = CIM_SINGLE;
@@ -771,7 +411,8 @@ CopyFrom(CopyFromState cstate)
 		else
 			insertMethod = CIM_MULTI;
 
-		CopyMultiInsertInfoInit(&multiInsertInfo, resultRelInfo, cstate,
+		MultiInsertInfoInit(&cstate->miinfo, resultRelInfo,
+								cstate->transition_capture,
 								estate, mycid, ti_options);
 	}
 
@@ -834,7 +475,7 @@ CopyFrom(CopyFromState cstate)
 			Assert(resultRelInfo == target_resultRelInfo);
 			Assert(insertMethod == CIM_MULTI);
 
-			myslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+			myslot = MultiInsertInfoNextFreeSlot(&cstate->miinfo,
 													 resultRelInfo);
 		}
 
@@ -903,18 +544,18 @@ CopyFrom(CopyFromState cstate)
 				/* Set the multi-insert buffer to use for this partition. */
 				if (leafpart_use_multi_insert)
 				{
-					if (resultRelInfo->ri_CopyMultiInsertBuffer == NULL)
-						CopyMultiInsertInfoSetupBuffer(&multiInsertInfo,
+					if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+						MultiInsertInfoSetupBuffer(&cstate->miinfo,
 													   resultRelInfo);
 				}
 				else if (insertMethod == CIM_MULTI_CONDITIONAL &&
-						 !CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
+						 !MultiInsertInfoIsEmpty(&cstate->miinfo))
 				{
 					/*
 					 * Flush pending inserts if this partition can't use
 					 * batching, so rows are visible to triggers etc.
 					 */
-					CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					MultiInsertInfoFlush(&cstate->miinfo, resultRelInfo);
 				}
 
 				if (bistate != NULL)
@@ -960,7 +601,7 @@ CopyFrom(CopyFromState cstate)
 				/* no other path available for partitioned table */
 				Assert(insertMethod == CIM_MULTI_CONDITIONAL);
 
-				batchslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+				batchslot = MultiInsertInfoNextFreeSlot(&cstate->miinfo,
 															resultRelInfo);
 
 				if (map != NULL)
@@ -1040,17 +681,17 @@ CopyFrom(CopyFromState cstate)
 					ExecMaterializeSlot(myslot);
 
 					/* Add this tuple to the tuple buffer */
-					CopyMultiInsertInfoStore(&multiInsertInfo,
+					MultiInsertInfoStore(&cstate->miinfo,
 											 resultRelInfo, myslot,
 											 cstate->line_buf.len,
-											 cstate->cur_lineno);
+											 cstate->miinfo.cur_lineno);
 
 					/*
 					 * If enough inserts have queued up, then flush all
 					 * buffers out to their tables.
 					 */
-					if (CopyMultiInsertInfoIsFull(&multiInsertInfo))
-						CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					if (MultiInsertInfoIsFull(&cstate->miinfo))
+						MultiInsertInfoFlush(&cstate->miinfo, resultRelInfo);
 				}
 				else
 				{
@@ -1109,8 +750,8 @@ CopyFrom(CopyFromState cstate)
 	/* Flush any remaining buffered tuples */
 	if (insertMethod != CIM_SINGLE)
 	{
-		if (!CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
-			CopyMultiInsertInfoFlush(&multiInsertInfo, NULL);
+		if (!MultiInsertInfoIsEmpty(&cstate->miinfo))
+			MultiInsertInfoFlush(&cstate->miinfo, NULL);
 	}
 
 	/* Done, clean up */
@@ -1144,7 +785,7 @@ CopyFrom(CopyFromState cstate)
 
 	/* Tear down the multi-insert buffer data */
 	if (insertMethod != CIM_SINGLE)
-		CopyMultiInsertInfoCleanup(&multiInsertInfo);
+		MultiInsertInfoCleanup(&cstate->miinfo);
 
 	/* Close all the partitioned tables, leaf partitions, and their indices */
 	if (proute)
@@ -1323,7 +964,6 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->reached_eof = false;
 	cstate->eol_type = EOL_UNKNOWN;
 	cstate->cur_relname = RelationGetRelationName(cstate->rel);
-	cstate->cur_lineno = 0;
 	cstate->cur_attname = NULL;
 	cstate->cur_attval = NULL;
 
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index b7a37bcdbd..6eb3c1be79 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -456,14 +456,14 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
 	Assert(!cstate->opts.binary);
 
 	/* on input just throw the header line away */
-	if (cstate->cur_lineno == 0 && cstate->opts.header_line)
+	if (cstate->miinfo.cur_lineno == 0 && cstate->opts.header_line)
 	{
-		cstate->cur_lineno++;
+		cstate->miinfo.cur_lineno++;
 		if (CopyReadLine(cstate))
 			return false;		/* done */
 	}
 
-	cstate->cur_lineno++;
+	cstate->miinfo.cur_lineno++;
 
 	/* Actually read the line into memory here */
 	done = CopyReadLine(cstate);
@@ -605,7 +605,7 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 		int16		fld_count;
 		ListCell   *cur;
 
-		cstate->cur_lineno++;
+		cstate->miinfo.cur_lineno++;
 
 		if (!CopyGetInt16(cstate, &fld_count))
 		{
@@ -913,7 +913,7 @@ CopyReadLineText(CopyFromState cstate)
 			 * at all --- is cur_lineno a physical or logical count?)
 			 */
 			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
-				cstate->cur_lineno++;
+				cstate->miinfo.cur_lineno++;
 		}
 
 		/* Process \r */
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 7179f589f9..855a89b570 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1247,7 +1247,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
 													 * ExecInitRoutingInfo */
 	resultRelInfo->ri_PartitionTupleSlot = NULL;	/* ditto */
 	resultRelInfo->ri_ChildToRootMap = NULL;
-	resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	resultRelInfo->ri_MultiInsertBuffer = NULL;
 }
 
 /*
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 86594bd056..1f8ba785db 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -994,7 +994,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 		partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
 		partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);
 
-	partRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	partRelInfo->ri_MultiInsertBuffer = NULL;
 
 	/*
 	 * Keep track of it in the PartitionTupleRouting->partitions array.
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index f65ae2c0d6..05f70f140e 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -43,6 +43,7 @@
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "commands/trigger.h"
+#include "commands/copy.h"
 #include "executor/execPartition.h"
 #include "executor/executor.h"
 #include "executor/nodeModifyTable.h"
@@ -391,6 +392,8 @@ ExecInsert(ModifyTableState *mtstate,
 	ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
 	OnConflictAction onconflict = node->onConflictAction;
 	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+	TupleTableSlot *batchslot = NULL;
+	bool	use_multi_insert = false;
 
 	/*
 	 * If the input result relation is a partitioned table, find the leaf
@@ -410,6 +413,37 @@ ExecInsert(ModifyTableState *mtstate,
 
 	resultRelationDesc = resultRelInfo->ri_RelationDesc;
 
+	if (!mtstate->miinfo ||
+			mtstate->operation != CMD_INSERT || onconflict != ONCONFLICT_NONE)
+		; /* If multi-inserts aren't possible at all, don't check further .. */
+	else if (proute == NULL)
+		use_multi_insert = true;
+	else
+	{
+		/*
+		 * If a partitioned table itself allows multi-insert, and bistate
+		 * indicates we've inserted the threshold number of tuples, check if
+		 * the partition also supports it.
+		 */
+
+		/* Determine which triggers exist on this partition */
+		// XXX copyfrom.c only checks triggers when the partition changes,
+		// so maybe use_multi_insert should be in mtstate ?
+		bool has_before_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
+				resultRelInfo->ri_TrigDesc->trig_insert_before_row);
+
+		bool has_instead_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
+				resultRelInfo->ri_TrigDesc->trig_insert_instead_row);
+
+		/*
+		 * Disable multi-inserts when the partition has BEFORE/INSTEAD
+		 * OF triggers, or if the partition is a foreign partition.
+		 */
+		use_multi_insert = !has_before_insert_row_trig &&
+			!has_instead_insert_row_trig &&
+			resultRelInfo->ri_FdwRoutine == NULL;
+	}
+
 	/*
 	 * BEFORE ROW INSERT Triggers.
 	 *
@@ -631,6 +665,19 @@ ExecInsert(ModifyTableState *mtstate,
 
 			/* Since there was no insertion conflict, we're done */
 		}
+		else if (use_multi_insert)
+		{
+			if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+				MultiInsertInfoSetupBuffer(mtstate->miinfo, resultRelInfo);
+
+			batchslot = MultiInsertInfoNextFreeSlot(mtstate->miinfo, resultRelInfo);
+			ExecCopySlot(batchslot, slot);
+
+			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot, 0, 0); // XXX: tuplen/lineno
+
+			if (MultiInsertInfoIsFull(mtstate->miinfo))
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+		}
 		else
 		{
 			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
@@ -640,6 +687,13 @@ ExecInsert(ModifyTableState *mtstate,
 				mtstate->prevResultRelInfo = resultRelInfo;
 			}
 
+			/*
+			 * Flush pending inserts if this partition can't use
+			 * batching, so rows are visible to triggers etc.
+			 */
+			if (mtstate->miinfo)
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
@@ -656,32 +710,36 @@ ExecInsert(ModifyTableState *mtstate,
 	if (canSetTag)
 		(estate->es_processed)++;
 
-	/*
-	 * If this insert is the result of a partition key update that moved the
-	 * tuple to a new partition, put this row into the transition NEW TABLE,
-	 * if there is one. We need to do this separately for DELETE and INSERT
-	 * because they happen on different tables.
-	 */
-	ar_insert_trig_tcs = mtstate->mt_transition_capture;
-	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
-		&& mtstate->mt_transition_capture->tcs_update_new_table)
+	/* Triggers were already run in the batch insert case */
+	if (batchslot == NULL)
 	{
-		ExecARUpdateTriggers(estate, resultRelInfo, NULL,
-							 NULL,
-							 slot,
-							 NULL,
-							 mtstate->mt_transition_capture);
-
 		/*
-		 * We've already captured the NEW TABLE row, so make sure any AR
-		 * INSERT trigger fired below doesn't capture it again.
+		 * If this insert is the result of a partition key update that moved the
+		 * tuple to a new partition, put this row into the transition NEW TABLE,
+		 * if there is one. We need to do this separately for DELETE and INSERT
+		 * because they happen on different tables.
 		 */
-		ar_insert_trig_tcs = NULL;
-	}
+		ar_insert_trig_tcs = mtstate->mt_transition_capture;
+		if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+			&& mtstate->mt_transition_capture->tcs_update_new_table)
+		{
+			ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+								 NULL,
+								 slot,
+								 NULL,
+								 mtstate->mt_transition_capture);
 
-	/* AFTER ROW INSERT Triggers */
-	ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
-						 ar_insert_trig_tcs);
+			/*
+			 * We've already captured the NEW TABLE row, so make sure any AR
+			 * INSERT trigger fired below doesn't capture it again.
+			 */
+			ar_insert_trig_tcs = NULL;
+		}
+
+		/* AFTER ROW INSERT Triggers */
+		ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
+							 ar_insert_trig_tcs);
+	}
 
 	list_free(recheckIndexes);
 
@@ -2238,9 +2296,52 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
-	mtstate->bistate = NULL;
-	if (operation == CMD_INSERT && insert_in_bulk)
+
+	if (insert_in_bulk && operation == CMD_INSERT &&
+			node->onConflictAction == ONCONFLICT_NONE)
 		mtstate->bistate = GetBulkInsertState();
+	else
+		mtstate->bistate = NULL;
+
+	/*
+	 * Set miinfo if it can support multi-insert. This is the equivalent of
+	 * CIM_SINGLE et al in copyfrom.c
+	 */
+
+	if (operation != CMD_INSERT ||
+			node->onConflictAction != ONCONFLICT_NONE ||
+			!insert_in_bulk)
+		mtstate->miinfo = NULL;
+	else if (mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
+			(mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_before_row ||
+			 // mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_after_row || // XXX or any row level triggers at all?
+			 mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_instead_row))
+		/*
+		 * Can't support multi-inserts when there are any BEFORE/INSTEAD OF
+		 * triggers on the table.
+		 */
+		mtstate->miinfo = NULL;
+	else if (node->rootRelation > 0 &&
+			mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
+			 mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_new_table)
+
+		/*
+		 * For partitioned tables we can't support multi-inserts when there
+		 * are any statement level insert triggers.
+		 */
+		mtstate->miinfo = NULL;
+	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL
+			/* || cstate->volatile_defexprs */ )
+		// XXX contain_volatile_functions_not_nextval((Node *) defexpr);
+		/* Can't support multi-inserts to foreign tables or if there are any */
+		mtstate->miinfo = NULL;
+	else
+	{
+		mtstate->miinfo = calloc(sizeof(*mtstate->miinfo), 1);
+		MultiInsertInfoInit(mtstate->miinfo, mtstate->rootResultRelInfo,
+				mtstate->mt_transition_capture,
+				estate, GetCurrentCommandId(true), 0);
+	}
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2713,6 +2814,13 @@ ExecEndModifyTable(ModifyTableState *node)
 		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
 	}
 
+	if (node->miinfo)
+	{
+		if (!MultiInsertInfoIsEmpty(node->miinfo))
+			 MultiInsertInfoFlush(node->miinfo, node->resultRelInfo); // root ?
+		MultiInsertInfoCleanup(node->miinfo);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index c15ea803c3..c0603e13ea 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -15,6 +15,7 @@
 #define COPYFROM_INTERNAL_H
 
 #include "commands/copy.h"
+#include "executor/nodeModifyTable.h"
 #include "commands/trigger.h"
 
 /*
@@ -92,10 +93,12 @@ typedef struct CopyFromStateData
 
 	/* these are just for error messages, see CopyFromErrorCallback */
 	const char *cur_relname;	/* table name for error messages */
-	uint64		cur_lineno;		/* line number for error messages */
 	const char *cur_attname;	/* current att for error messages */
 	const char *cur_attval;		/* current att value for error messages */
 
+	/* For bulk inserts and for error callback */
+	MultiInsertInfo miinfo;
+
 	/*
 	 * Working state
 	 */
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 09c312a052..ebe62c2e40 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -13,6 +13,8 @@
 #ifndef NODEMODIFYTABLE_H
 #define NODEMODIFYTABLE_H
 
+#include "commands/trigger.h"
+#include "executor/executor.h" // XXX
 #include "nodes/execnodes.h"
 
 extern PGDLLIMPORT bool insert_in_bulk;
@@ -25,4 +27,369 @@ extern ModifyTableState *ExecInitModifyTable(ModifyTable *node, EState *estate,
 extern void ExecEndModifyTable(ModifyTableState *node);
 extern void ExecReScanModifyTable(ModifyTableState *node);
 
+/* Bulk insert stuff which used to live in copy.c */
+
+/*
+ * No more than this many tuples per MultiInsertBuffer
+ *
+ * Caution: Don't make this too big, as we could end up with this many
+ * MultiInsertBuffer items stored in MultiInsertInfo's
+ * multiInsertBuffers list.  Increasing this can cause quadratic growth in
+ * memory requirements during copies into partitioned tables with a large
+ * number of partitions.
+ */
+#define MAX_BUFFERED_TUPLES		1000
+
+/*
+ * Flush buffers if there are >= this many bytes, as counted by the input
+ * size, of tuples stored.
+ */
+#define MAX_BUFFERED_BYTES		65535
+
+/* Trim the list of buffers back down to this number after flushing */
+#define MAX_PARTITION_BUFFERS	32
+
+/* Stores multi-insert data related to a single relation in CopyFrom. */
+typedef struct MultiInsertBuffer
+{
+	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
+	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
+	BulkInsertState bistate;	/* BulkInsertState for this rel */
+	int			nused;			/* number of 'slots' containing tuples */
+	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
+												 * stream */
+} MultiInsertBuffer;
+
+/*
+ * Stores one or many MultiInsertBuffers and details about the size and
+ * number of tuples which are stored in them.  This allows multiple buffers to
+ * exist at once when COPY/INSERTing into a partitioned table.
+ */
+typedef struct MultiInsertInfo
+{
+	List	   *multiInsertBuffers; /* List of tracked MultiInsertBuffers */
+	int			bufferedTuples; /* number of tuples buffered over all buffers */
+	int			bufferedBytes;	/* number of bytes from all buffered tuples */
+	TransitionCaptureState	*transition_capture;
+	EState	   *estate;			/* Executor state */
+	CommandId	mycid;			/* Command Id */
+	int			ti_options;		/* table insert options */
+
+	/* Line number for errors in copyfrom.c */
+	uint64		cur_lineno;
+	bool		line_buf_valid;
+} MultiInsertInfo;
+
+
+/*
+ * Allocate memory and initialize a new MultiInsertBuffer for this
+ * ResultRelInfo.
+ */
+static MultiInsertBuffer *
+MultiInsertBufferInit(ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = (MultiInsertBuffer *) palloc(sizeof(MultiInsertBuffer));
+	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
+	buffer->resultRelInfo = rri;
+	buffer->bistate = GetBulkInsertState();
+	buffer->nused = 0;
+
+	return buffer;
+}
+
+/*
+ * Make a new buffer for this ResultRelInfo.
+ */
+static inline void
+MultiInsertInfoSetupBuffer(MultiInsertInfo *miinfo,
+							   ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = MultiInsertBufferInit(rri);
+
+	/* Setup back-link so we can easily find this buffer again */
+	rri->ri_MultiInsertBuffer = buffer;
+	/* Record that we're tracking this buffer */
+	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+}
+
+/*
+ * Initialize an already allocated MultiInsertInfo.
+ *
+ * If rri is a non-partitioned table then a MultiInsertBuffer is set up
+ * for that table.
+ */
+static inline void
+MultiInsertInfoInit(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						TransitionCaptureState *transition_capture,
+						EState *estate, CommandId mycid, int ti_options)
+{
+	miinfo->multiInsertBuffers = NIL;
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+	miinfo->transition_capture = transition_capture;
+	miinfo->estate = estate;
+	miinfo->mycid = mycid;
+	miinfo->ti_options = ti_options;
+	miinfo->cur_lineno = 0;
+
+	/*
+	 * Only setup the buffer when not dealing with a partitioned table.
+	 * Buffers for partitioned tables will just be setup when we need to send
+	 * tuples their way for the first time.
+	 */
+	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		MultiInsertInfoSetupBuffer(miinfo, rri);
+}
+
+/*
+ * Returns true if the buffers are full
+ */
+static inline bool
+MultiInsertInfoIsFull(MultiInsertInfo *miinfo)
+{
+	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
+		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
+		return true;
+	return false;
+}
+
+/*
+ * Returns true if we have no buffered tuples
+ */
+static inline bool
+MultiInsertInfoIsEmpty(MultiInsertInfo *miinfo)
+{
+	return miinfo->bufferedTuples == 0;
+}
+
+/*
+ * Write the tuples stored in 'buffer' out to the table.
+ */
+static inline void
+MultiInsertBufferFlush(MultiInsertInfo *miinfo,
+						   MultiInsertBuffer *buffer)
+{
+	MemoryContext oldcontext;
+	int			i;
+	uint64		save_cur_lineno;
+	EState	   *estate = miinfo->estate;
+	CommandId	mycid = miinfo->mycid;
+	int			ti_options = miinfo->ti_options;
+	bool		line_buf_valid = miinfo->line_buf_valid;
+	int			nused = buffer->nused;
+	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
+	TupleTableSlot **slots = buffer->slots;
+
+	/*
+	 * Print error context information correctly, if one of the operations
+	 * below fail.
+	 */
+	miinfo->line_buf_valid = false;
+	save_cur_lineno = miinfo->cur_lineno;
+
+	/*
+	 * table_multi_insert may leak memory, so switch to short-lived memory
+	 * context before calling it.
+	 */
+	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); // XXX requires executor.h
+	table_multi_insert(resultRelInfo->ri_RelationDesc,
+					   slots,
+					   nused,
+					   mycid,
+					   ti_options,
+					   buffer->bistate);
+	MemoryContextSwitchTo(oldcontext);
+
+	for (i = 0; i < nused; i++)
+	{
+		/*
+		 * If there are any indexes, update them for all the inserted tuples,
+		 * and run AFTER ROW INSERT triggers.
+		 */
+		if (resultRelInfo->ri_NumIndices > 0)
+		{
+			List	   *recheckIndexes;
+
+			miinfo->cur_lineno = buffer->linenos[i];
+			recheckIndexes =
+				ExecInsertIndexTuples(resultRelInfo,
+									  buffer->slots[i], estate, false, NULL,
+									  NIL);
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], recheckIndexes,
+								 miinfo->transition_capture);
+			list_free(recheckIndexes);
+		}
+
+		/*
+		 * There's no indexes, but see if we need to run AFTER ROW INSERT
+		 * triggers anyway.
+		 */
+		else if (resultRelInfo->ri_TrigDesc != NULL &&
+				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
+				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
+		{
+			miinfo->cur_lineno = buffer->linenos[i];
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], NIL, miinfo->transition_capture);
+		}
+
+		ExecClearTuple(slots[i]);
+	}
+
+	/* Mark that all slots are free */
+	buffer->nused = 0;
+
+	/* reset cur_lineno and line_buf_valid to what they were */
+	miinfo->line_buf_valid = line_buf_valid;
+	miinfo->cur_lineno = save_cur_lineno;
+}
+
+/*
+ * Drop used slots and free member for this buffer.
+ *
+ * The buffer must be flushed before cleanup.
+ */
+static inline void
+MultiInsertBufferCleanup(MultiInsertInfo *miinfo,
+							 MultiInsertBuffer *buffer)
+{
+	int			i;
+
+	/* Ensure buffer was flushed */
+	Assert(buffer->nused == 0);
+
+	/* Remove back-link to ourself */
+	buffer->resultRelInfo->ri_MultiInsertBuffer = NULL;
+
+	FreeBulkInsertState(buffer->bistate);
+
+	/* Since we only create slots on demand, just drop the non-null ones. */
+	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
+		ExecDropSingleTupleTableSlot(buffer->slots[i]);
+
+	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
+							 miinfo->ti_options);
+
+	pfree(buffer);
+}
+
+/*
+ * Write out all stored tuples in all buffers out to the tables.
+ *
+ * Once flushed we also trim the tracked buffers list down to size by removing
+ * the buffers created earliest first.
+ *
+ * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
+ * used.  When cleaning up old buffers we'll never remove the one for
+ * 'curr_rri'.
+ */
+static inline void
+MultiInsertInfoFlush(MultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+	{
+		MultiInsertBuffer *buffer = (MultiInsertBuffer *) lfirst(lc);
+
+		MultiInsertBufferFlush(miinfo, buffer);
+	}
+
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+
+	/*
+	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
+	 * remove buffers starting with the ones we created first.  It seems less
+	 * likely that these older ones will be needed than the ones that were
+	 * just created.
+	 */
+	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
+	{
+		MultiInsertBuffer *buffer;
+
+		buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+
+		/*
+		 * We never want to remove the buffer that's currently being used, so
+		 * if we happen to find that then move it to the end of the list.
+		 */
+		if (buffer->resultRelInfo == curr_rri)
+		{
+			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+			buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+		}
+
+		MultiInsertBufferCleanup(miinfo, buffer);
+		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+	}
+}
+
+/*
+ * Cleanup allocated buffers and free memory
+ */
+static inline void
+MultiInsertInfoCleanup(MultiInsertInfo *miinfo)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+		MultiInsertBufferCleanup(miinfo, lfirst(lc));
+
+	list_free(miinfo->multiInsertBuffers);
+}
+
+/*
+ * Get the next TupleTableSlot that the next tuple should be stored in.
+ *
+ * Callers must ensure that the buffer is not full.
+ *
+ * Note: 'miinfo' is unused but has been included for consistency with the
+ * other functions in this area.
+ */
+static inline TupleTableSlot *
+MultiInsertInfoNextFreeSlot(MultiInsertInfo *miinfo,
+								ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+	int			nused = buffer->nused;
+
+	Assert(buffer != NULL);
+	Assert(nused < MAX_BUFFERED_TUPLES);
+
+	if (buffer->slots[nused] == NULL)
+		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
+	return buffer->slots[nused];
+}
+
+/*
+ * Record the previously reserved TupleTableSlot that was reserved by
+ * MultiInsertInfoNextFreeSlot as being consumed.
+ */
+static inline void
+MultiInsertInfoStore(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						 TupleTableSlot *slot, int tuplen, uint64 lineno)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+
+	Assert(buffer != NULL);
+	Assert(slot == buffer->slots[buffer->nused]);
+
+	/* Store the line number so we can properly report any errors later */
+	buffer->linenos[buffer->nused] = lineno;
+
+	/* Record this slot as being used */
+	buffer->nused++;
+
+	/* Update how many tuples are stored and their size */
+	miinfo->bufferedTuples++;
+	miinfo->bufferedBytes += tuplen;
+}
+
 #endif							/* NODEMODIFYTABLE_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 024b0e4da4..ab7b8fb51b 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -33,6 +33,9 @@
 #include "utils/tuplesort.h"
 #include "utils/tuplestore.h"
 
+/* This would be a circular inclusion */
+// #include "executor/nodeModifyTable.h"
+
 struct PlanState;				/* forward references in this file */
 struct ParallelHashJoinState;
 struct ExecRowMark;
@@ -40,8 +43,8 @@ struct ExprState;
 struct ExprContext;
 struct RangeTblEntry;			/* avoid including parsenodes.h here */
 struct ExprEvalStep;			/* avoid including execExpr.h everywhere */
-struct CopyMultiInsertBuffer;
-
+// struct MultiInsertBuffer;
+// struct MultiInsertInfo;
 
 /* ----------------
  *		ExprState node
@@ -499,8 +502,8 @@ typedef struct ResultRelInfo
 	 */
 	TupleConversionMap *ri_ChildToRootMap;
 
-	/* for use by copyfrom.c when performing multi-inserts */
-	struct CopyMultiInsertBuffer *ri_CopyMultiInsertBuffer;
+	/* for use by copyfrom.c/modifyTable when performing multi-inserts */
+	struct MultiInsertBuffer *ri_MultiInsertBuffer;
 } ResultRelInfo;
 
 /* ----------------
@@ -1166,8 +1169,9 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
-	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT */
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT, when miinfo cannot be used */
 	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
+	struct MultiInsertInfo	*miinfo;
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index da0dae6240..e0c83d7427 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -485,6 +485,22 @@ select count(1) from hash_parted;
 (1 row)
 
 commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Index Only Scan using hpart1_a_idx on hpart1 hash_parted
+   Index Cond: (a = 13)
+(2 rows)
+
+select * from hash_parted where a=13;
+ a  
+----
+ 13
+(1 row)
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index d3a94f053b..99ec18d9a2 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -292,6 +292,11 @@ select * from hash_parted;
 insert into hash_parted select generate_series(1,9999);
 select count(1) from hash_parted;
 commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+select * from hash_parted where a=13;
 
 -- cleanup
 drop table range_parted, list_parted;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index b8ca8cffd9..42415c9c4c 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -422,8 +422,6 @@ ConvertRowtypeExpr
 CookedConstraint
 CopyDest
 CopyInsertMethod
-CopyMultiInsertBuffer
-CopyMultiInsertInfo
 CopyState
 CopyStateData
 CopyStmt
@@ -1388,6 +1386,8 @@ ModifyTableState
 MorphOpaque
 MsgType
 MultiAssignRef
+MultiInsertBuffer
+MultiInsertInfo
 MultiSortSupport
 MultiSortSupportData
 MultiXactId
-- 
2.17.0

v7-0003-Dynamically-switch-to-multi-insert-mode.patchtext/x-diff; charset=us-asciiDownload

From 24d46071082aa4a87c39829ea4908c298799a940 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Mon, 23 Nov 2020 17:57:24 -0600
Subject: [PATCH v7 3/3] Dynamically switch to multi-insert mode..

by popular request
---
 src/backend/executor/nodeModifyTable.c | 45 +++++++++++++++++++-------
 src/backend/tcop/utility.c             |  4 ---
 src/backend/utils/misc/guc.c           | 19 ++++++-----
 src/include/executor/nodeModifyTable.h |  3 +-
 src/include/nodes/execnodes.h          |  1 +
 src/test/regress/expected/insert.out   |  4 +++
 src/test/regress/sql/insert.sql        |  2 ++
 7 files changed, 51 insertions(+), 27 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 05f70f140e..9b774d502a 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -74,7 +74,7 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 											   TupleTableSlot *slot,
 											   ResultRelInfo **partRelInfo);
 /* guc */
-bool insert_in_bulk = false;
+int bulk_insert_ntuples = 1000;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -413,11 +413,29 @@ ExecInsert(ModifyTableState *mtstate,
 
 	resultRelationDesc = resultRelInfo->ri_RelationDesc;
 
+	/* Use bulk insert after a threshold number of tuples */
+	// XXX: maybe this should only be done if it's not a partitioned table or
+	// if the partitions don't support miinfo, which uses its own bistates
+	mtstate->ntuples++;
+	if (mtstate->bistate == NULL &&
+			mtstate->operation == CMD_INSERT &&
+			onconflict == ONCONFLICT_NONE &&
+			mtstate->ntuples > bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+	{
+		elog(DEBUG1, "enabling bulk insert");
+		mtstate->bistate = GetBulkInsertState();
+	}
+
 	if (!mtstate->miinfo ||
 			mtstate->operation != CMD_INSERT || onconflict != ONCONFLICT_NONE)
 		; /* If multi-inserts aren't possible at all, don't check further .. */
 	else if (proute == NULL)
-		use_multi_insert = true;
+	{
+		if (mtstate->miinfo->ntuples++ >= bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+			use_multi_insert = true;
+	}
 	else
 	{
 		/*
@@ -438,12 +456,21 @@ ExecInsert(ModifyTableState *mtstate,
 		/*
 		 * Disable multi-inserts when the partition has BEFORE/INSTEAD
 		 * OF triggers, or if the partition is a foreign partition.
+		 * The number of tuples eligible for multi-insert is tracked separately
+		 * from the total number of tuples in case it's not supported for some
+		 * partitions.
 		 */
-		use_multi_insert = !has_before_insert_row_trig &&
+		if (!has_before_insert_row_trig &&
 			!has_instead_insert_row_trig &&
-			resultRelInfo->ri_FdwRoutine == NULL;
+			resultRelInfo->ri_FdwRoutine == NULL &&
+			mtstate->miinfo->ntuples++ >= bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+			use_multi_insert = true;
 	}
 
+	if (use_multi_insert && mtstate->miinfo->ntuples - 1 == bulk_insert_ntuples)
+		elog(DEBUG1, "enabling multi insert");
+
 	/*
 	 * BEFORE ROW INSERT Triggers.
 	 *
@@ -2296,12 +2323,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
-
-	if (insert_in_bulk && operation == CMD_INSERT &&
-			node->onConflictAction == ONCONFLICT_NONE)
-		mtstate->bistate = GetBulkInsertState();
-	else
-		mtstate->bistate = NULL;
+	mtstate->bistate = NULL;
 
 	/*
 	 * Set miinfo if it can support multi-insert. This is the equivalent of
@@ -2309,8 +2331,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 	 */
 
 	if (operation != CMD_INSERT ||
-			node->onConflictAction != ONCONFLICT_NONE ||
-			!insert_in_bulk)
+			node->onConflictAction != ONCONFLICT_NONE)
 		mtstate->miinfo = NULL;
 	else if (mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
 			(mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_before_row ||
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index a0a4034409..81ac9b1cb2 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -611,10 +611,6 @@ standard_ProcessUtility(PlannedStmt *pstmt,
 									SetPGVariable("transaction_deferrable",
 												  list_make1(item->arg),
 												  true);
-								else if (strcmp(item->defname, "bulk") == 0)
-									SetPGVariable("bulk_insert",
-												  list_make1(item->arg),
-												  true);
 							}
 						}
 						break;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index c470314134..1126740021 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2037,16 +2037,6 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
-	{
-		{"bulk_insert", PGC_USERSET, CLIENT_CONN_STATEMENT,
-			gettext_noop("Sets the transaction to bulk insert mode."),
-			gettext_noop("A ring buffer of limited size will be used."),
-		},
-		&insert_in_bulk,
-		false,
-		NULL, NULL, NULL
-	},
-
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
@@ -3410,6 +3400,15 @@ static struct config_int ConfigureNamesInt[] =
 		check_huge_page_size, NULL, NULL
 	},
 
+	{
+		{"bulk_insert_ntuples", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Enable bulk insertions after this number of tuples."),
+			gettext_noop("A ring buffer of limited size will be used and updates done in batch"),
+		},
+		&bulk_insert_ntuples,
+		1000, -1, INT_MAX,
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index ebe62c2e40..71de7cf80e 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -17,7 +17,7 @@
 #include "executor/executor.h" // XXX
 #include "nodes/execnodes.h"
 
-extern PGDLLIMPORT bool insert_in_bulk;
+extern PGDLLIMPORT int bulk_insert_ntuples;
 
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
@@ -74,6 +74,7 @@ typedef struct MultiInsertInfo
 	EState	   *estate;			/* Executor state */
 	CommandId	mycid;			/* Command Id */
 	int			ti_options;		/* table insert options */
+	size_t		ntuples;		/* Number of rows *eligible* for multi-insert */
 
 	/* Line number for errors in copyfrom.c */
 	uint64		cur_lineno;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index ab7b8fb51b..477b326d06 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1172,6 +1172,7 @@ typedef struct ModifyTableState
 	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT, when miinfo cannot be used */
 	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
 	struct MultiInsertInfo	*miinfo;
+	size_t		ntuples;	/* Number of tuples inserted; */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index e0c83d7427..b894180152 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -477,7 +477,11 @@ select * from hash_parted;
 (2 rows)
 
 -- exercise bulk insert to partitions
+SET client_min_messages=debug;
 insert into hash_parted select generate_series(1,9999);
+DEBUG:  enabling bulk insert
+DEBUG:  enabling multi insert
+RESET client_min_messages;
 select count(1) from hash_parted;
  count 
 -------
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 99ec18d9a2..ff9b57af5e 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -289,7 +289,9 @@ insert into hash_parted values(11);
 insert into hpart0 values(12);
 select * from hash_parted;
 -- exercise bulk insert to partitions
+SET client_min_messages=debug;
 insert into hash_parted select generate_series(1,9999);
+RESET client_min_messages;
 select count(1) from hash_parted;
 commit;
 -- test that index was updated
-- 
2.17.0

pryzby@telsasoft.com

about 5 years ago

In reply to: Justin Pryzby (#15)

Re: should INSERT SELECT use a BulkInsertState?

One loose end in this patch is how to check for volatile default expressions.

copyfrom.c is a utility statement, so it can look at the parser's column list:
COPY table(c1,c2)...

However, for INSERT, in nodeModifyTable.c, it looks like parsing, rewriting,
and planning are done, at which point I don't know if there's a good way to
find that. The default expressions will have been rewritten into the planned
statement.

We need the list of columns whose default is volatile, excluding columns for
which a non-default value is specified.

INSERT INTO table (c1,c2) VALUES (1,default);

We'd want the list of any column in the table with a volatile default,
excluding columns c1, but not-excluding explicit default columns c2 or any
implicit default columns (c3, etc).

Any idea ?

--
Justin

Bharath Rupireddy

bharath.rupireddyforpostgres@gmail.com

about 5 years ago

In reply to: Justin Pryzby (#16)

Re: should INSERT SELECT use a BulkInsertState?

On Wed, Dec 2, 2020 at 10:24 PM Justin Pryzby <pryzby@telsasoft.com> wrote:

One loose end in this patch is how to check for volatile default expressions.

copyfrom.c is a utility statement, so it can look at the parser's column list:
COPY table(c1,c2)...

However, for INSERT, in nodeModifyTable.c, it looks like parsing, rewriting,
and planning are done, at which point I don't know if there's a good way to
find that. The default expressions will have been rewritten into the planned
statement.

We need the list of columns whose default is volatile, excluding columns for
which a non-default value is specified.

INSERT INTO table (c1,c2) VALUES (1,default);

We'd want the list of any column in the table with a volatile default,
excluding columns c1, but not-excluding explicit default columns c2 or any
implicit default columns (c3, etc).

Any idea ?

I think we should be doing all the necessary checks in the planner and
have a flag in the planned stmt to indicate whether to go with multi
insert or not. For the required checks, we can have a look at how the
existing COPY decides to go with either CIM_MULTI or CIM_SINGLE.

Now, the question of how we can get to know whether a given relation
has default expressions or volatile expressions, it is worth to look
at build_column_default() and contain_volatile_functions().

I prefer to have the multi insert deciding code in COPY and INSERT
SELECT, in a single common function which can be reused. Though COPY
has somethings like default expressions and others ready unlike INSERT
SELECT, we can try to keep them under a common function and say for
COPY we can skip some code and for INSERT SELECT we can do extra work
to find default expressions.

Although unrelated, for parallel inserts in INSERT SELECT[1]/messages/by-id/CAJcOf-fy3P+kDArvmbEtdQTxFMf7Rn2=V-sqCnMmKO3QKBsgPA@mail.gmail.com, in the
planner there are some checks to see if the parallelism is safe or
not. Check max_parallel_hazard_for_modify() in
v8-0001-Enable-parallel-SELECT-for-INSERT-INTO-.-SELECT.patch from
[1]: /messages/by-id/CAJcOf-fy3P+kDArvmbEtdQTxFMf7Rn2=V-sqCnMmKO3QKBsgPA@mail.gmail.com
code.

[1]: /messages/by-id/CAJcOf-fy3P+kDArvmbEtdQTxFMf7Rn2=V-sqCnMmKO3QKBsgPA@mail.gmail.com

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com

pryzby@telsasoft.com

about 5 years ago

In reply to: Bharath Rupireddy (#17)

4 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Thu, Dec 03, 2020 at 10:59:34AM +0530, Bharath Rupireddy wrote:

On Wed, Dec 2, 2020 at 10:24 PM Justin Pryzby <pryzby@telsasoft.com> wrote:

One loose end in this patch is how to check for volatile default expressions.

I think we should be doing all the necessary checks in the planner and
have a flag in the planned stmt to indicate whether to go with multi
insert or not. For the required checks, we can have a look at how the
existing COPY decides to go with either CIM_MULTI or CIM_SINGLE.

Yes, you can see that I've copied the checks from copy.
Like copy, some checks are done once, in ExecInitModifyTable, outside of the
ExecModifyTable "loop".

This squishes some commits together.
And uses bistate for ON CONFLICT.
And attempts to use memory context for tuple size.

For the bufferedBytes check, I'm not sure what's best. Copy flushes buffers
after 65k of input line length, but that's totally different from tuple slot
memory context size, which is what I used for insert. Maybe COPY should also
use slot size? Or maybe the threshold to flush needs to be set in miinfo,
rather than a #define, and differ between COPY and INSERT.

--
Justin

Attachments:

v8-0001-INSERT-SELECT-to-use-BulkInsertState-and-multi_in.patchtext/x-diff; charset=us-asciiDownload

From f83313efc8612a5e94f1f13a87d80fb0c393c7b0 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v8 1/4] INSERT SELECT to use BulkInsertState and multi_insert

Renames structures;
Move MultipleInsert functions from copyfrom.c to (tentatively) nodeModifyTable.h;
Move into MultiInsertInfo: transition_capture and cur_lineno (via cstate->miinfo);

Dynamically switch to multi-insert mode based on the number of insertions.
This is intended to accomodate 1) the original use case of INSERT using a small
ring buffer to avoid leaving behind dirty buffers; and, 2) Automatically using
multi-inserts for batch operations; 3) allow the old behavior of leaving behind
dirty buffers, which might allow INSERT to run more quickly, at the cost of
leaving behind many dirty buffers which other backends may have to write out.

XXX: for (1), the bulk-insert state is used even if not multi-insert, including
for a VALUES.

TODO: use cstate->miinfo.cur_lineno++ instead of mtstate->miinfo->ntuples
---
 src/backend/commands/copyfrom.c          | 394 +----------------------
 src/backend/commands/copyfromparse.c     |  10 +-
 src/backend/executor/execMain.c          |   2 +-
 src/backend/executor/execPartition.c     |   2 +-
 src/backend/executor/nodeModifyTable.c   | 196 +++++++++--
 src/backend/utils/misc/guc.c             |  10 +
 src/include/commands/copyfrom_internal.h |   5 +-
 src/include/executor/nodeModifyTable.h   | 370 +++++++++++++++++++++
 src/include/nodes/execnodes.h            |  16 +-
 src/test/regress/expected/insert.out     |  43 +++
 src/test/regress/sql/insert.sql          |  20 ++
 src/tools/pgindent/typedefs.list         |   4 +-
 12 files changed, 657 insertions(+), 415 deletions(-)

diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 1b14e9a6eb..c4fe75df8e 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -44,54 +44,6 @@
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
 
-/*
- * No more than this many tuples per CopyMultiInsertBuffer
- *
- * Caution: Don't make this too big, as we could end up with this many
- * CopyMultiInsertBuffer items stored in CopyMultiInsertInfo's
- * multiInsertBuffers list.  Increasing this can cause quadratic growth in
- * memory requirements during copies into partitioned tables with a large
- * number of partitions.
- */
-#define MAX_BUFFERED_TUPLES		1000
-
-/*
- * Flush buffers if there are >= this many bytes, as counted by the input
- * size, of tuples stored.
- */
-#define MAX_BUFFERED_BYTES		65535
-
-/* Trim the list of buffers back down to this number after flushing */
-#define MAX_PARTITION_BUFFERS	32
-
-/* Stores multi-insert data related to a single relation in CopyFrom. */
-typedef struct CopyMultiInsertBuffer
-{
-	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
-	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
-	BulkInsertState bistate;	/* BulkInsertState for this rel */
-	int			nused;			/* number of 'slots' containing tuples */
-	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
-												 * stream */
-} CopyMultiInsertBuffer;
-
-/*
- * Stores one or many CopyMultiInsertBuffers and details about the size and
- * number of tuples which are stored in them.  This allows multiple buffers to
- * exist at once when COPYing into a partitioned table.
- */
-typedef struct CopyMultiInsertInfo
-{
-	List	   *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
-	int			bufferedTuples; /* number of tuples buffered over all buffers */
-	int			bufferedBytes;	/* number of bytes from all buffered tuples */
-	CopyFromState	cstate;			/* Copy state for this CopyMultiInsertInfo */
-	EState	   *estate;			/* Executor state used for COPY */
-	CommandId	mycid;			/* Command Id used for COPY */
-	int			ti_options;		/* table insert options */
-} CopyMultiInsertInfo;
-
-
 /* non-export function prototypes */
 static char *limit_printout_length(const char *str);
 
@@ -109,7 +61,7 @@ CopyFromErrorCallback(void *arg)
 	char		curlineno_str[32];
 
 	snprintf(curlineno_str, sizeof(curlineno_str), UINT64_FORMAT,
-			 cstate->cur_lineno);
+			 cstate->miinfo.cur_lineno);
 
 	if (cstate->opts.binary)
 	{
@@ -204,317 +156,6 @@ limit_printout_length(const char *str)
 	return res;
 }
 
-/*
- * Allocate memory and initialize a new CopyMultiInsertBuffer for this
- * ResultRelInfo.
- */
-static CopyMultiInsertBuffer *
-CopyMultiInsertBufferInit(ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = (CopyMultiInsertBuffer *) palloc(sizeof(CopyMultiInsertBuffer));
-	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
-	buffer->resultRelInfo = rri;
-	buffer->bistate = GetBulkInsertState();
-	buffer->nused = 0;
-
-	return buffer;
-}
-
-/*
- * Make a new buffer for this ResultRelInfo.
- */
-static inline void
-CopyMultiInsertInfoSetupBuffer(CopyMultiInsertInfo *miinfo,
-							   ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = CopyMultiInsertBufferInit(rri);
-
-	/* Setup back-link so we can easily find this buffer again */
-	rri->ri_CopyMultiInsertBuffer = buffer;
-	/* Record that we're tracking this buffer */
-	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-}
-
-/*
- * Initialize an already allocated CopyMultiInsertInfo.
- *
- * If rri is a non-partitioned table then a CopyMultiInsertBuffer is set up
- * for that table.
- */
-static void
-CopyMultiInsertInfoInit(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						CopyFromState cstate, EState *estate, CommandId mycid,
-						int ti_options)
-{
-	miinfo->multiInsertBuffers = NIL;
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-	miinfo->cstate = cstate;
-	miinfo->estate = estate;
-	miinfo->mycid = mycid;
-	miinfo->ti_options = ti_options;
-
-	/*
-	 * Only setup the buffer when not dealing with a partitioned table.
-	 * Buffers for partitioned tables will just be setup when we need to send
-	 * tuples their way for the first time.
-	 */
-	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
-		CopyMultiInsertInfoSetupBuffer(miinfo, rri);
-}
-
-/*
- * Returns true if the buffers are full
- */
-static inline bool
-CopyMultiInsertInfoIsFull(CopyMultiInsertInfo *miinfo)
-{
-	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
-		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
-		return true;
-	return false;
-}
-
-/*
- * Returns true if we have no buffered tuples
- */
-static inline bool
-CopyMultiInsertInfoIsEmpty(CopyMultiInsertInfo *miinfo)
-{
-	return miinfo->bufferedTuples == 0;
-}
-
-/*
- * Write the tuples stored in 'buffer' out to the table.
- */
-static inline void
-CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo,
-						   CopyMultiInsertBuffer *buffer)
-{
-	MemoryContext oldcontext;
-	int			i;
-	uint64		save_cur_lineno;
-	CopyFromState	cstate = miinfo->cstate;
-	EState	   *estate = miinfo->estate;
-	CommandId	mycid = miinfo->mycid;
-	int			ti_options = miinfo->ti_options;
-	bool		line_buf_valid = cstate->line_buf_valid;
-	int			nused = buffer->nused;
-	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
-	TupleTableSlot **slots = buffer->slots;
-
-	/*
-	 * Print error context information correctly, if one of the operations
-	 * below fail.
-	 */
-	cstate->line_buf_valid = false;
-	save_cur_lineno = cstate->cur_lineno;
-
-	/*
-	 * table_multi_insert may leak memory, so switch to short-lived memory
-	 * context before calling it.
-	 */
-	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
-	table_multi_insert(resultRelInfo->ri_RelationDesc,
-					   slots,
-					   nused,
-					   mycid,
-					   ti_options,
-					   buffer->bistate);
-	MemoryContextSwitchTo(oldcontext);
-
-	for (i = 0; i < nused; i++)
-	{
-		/*
-		 * If there are any indexes, update them for all the inserted tuples,
-		 * and run AFTER ROW INSERT triggers.
-		 */
-		if (resultRelInfo->ri_NumIndices > 0)
-		{
-			List	   *recheckIndexes;
-
-			cstate->cur_lineno = buffer->linenos[i];
-			recheckIndexes =
-				ExecInsertIndexTuples(resultRelInfo,
-									  buffer->slots[i], estate, false, NULL,
-									  NIL);
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], recheckIndexes,
-								 cstate->transition_capture);
-			list_free(recheckIndexes);
-		}
-
-		/*
-		 * There's no indexes, but see if we need to run AFTER ROW INSERT
-		 * triggers anyway.
-		 */
-		else if (resultRelInfo->ri_TrigDesc != NULL &&
-				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
-				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
-		{
-			cstate->cur_lineno = buffer->linenos[i];
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], NIL, cstate->transition_capture);
-		}
-
-		ExecClearTuple(slots[i]);
-	}
-
-	/* Mark that all slots are free */
-	buffer->nused = 0;
-
-	/* reset cur_lineno and line_buf_valid to what they were */
-	cstate->line_buf_valid = line_buf_valid;
-	cstate->cur_lineno = save_cur_lineno;
-}
-
-/*
- * Drop used slots and free member for this buffer.
- *
- * The buffer must be flushed before cleanup.
- */
-static inline void
-CopyMultiInsertBufferCleanup(CopyMultiInsertInfo *miinfo,
-							 CopyMultiInsertBuffer *buffer)
-{
-	int			i;
-
-	/* Ensure buffer was flushed */
-	Assert(buffer->nused == 0);
-
-	/* Remove back-link to ourself */
-	buffer->resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
-
-	FreeBulkInsertState(buffer->bistate);
-
-	/* Since we only create slots on demand, just drop the non-null ones. */
-	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
-		ExecDropSingleTupleTableSlot(buffer->slots[i]);
-
-	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
-							 miinfo->ti_options);
-
-	pfree(buffer);
-}
-
-/*
- * Write out all stored tuples in all buffers out to the tables.
- *
- * Once flushed we also trim the tracked buffers list down to size by removing
- * the buffers created earliest first.
- *
- * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
- * used.  When cleaning up old buffers we'll never remove the one for
- * 'curr_rri'.
- */
-static inline void
-CopyMultiInsertInfoFlush(CopyMultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-	{
-		CopyMultiInsertBuffer *buffer = (CopyMultiInsertBuffer *) lfirst(lc);
-
-		CopyMultiInsertBufferFlush(miinfo, buffer);
-	}
-
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-
-	/*
-	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
-	 * remove buffers starting with the ones we created first.  It seems less
-	 * likely that these older ones will be needed than the ones that were
-	 * just created.
-	 */
-	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
-	{
-		CopyMultiInsertBuffer *buffer;
-
-		buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-
-		/*
-		 * We never want to remove the buffer that's currently being used, so
-		 * if we happen to find that then move it to the end of the list.
-		 */
-		if (buffer->resultRelInfo == curr_rri)
-		{
-			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-			buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-		}
-
-		CopyMultiInsertBufferCleanup(miinfo, buffer);
-		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-	}
-}
-
-/*
- * Cleanup allocated buffers and free memory
- */
-static inline void
-CopyMultiInsertInfoCleanup(CopyMultiInsertInfo *miinfo)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-		CopyMultiInsertBufferCleanup(miinfo, lfirst(lc));
-
-	list_free(miinfo->multiInsertBuffers);
-}
-
-/*
- * Get the next TupleTableSlot that the next tuple should be stored in.
- *
- * Callers must ensure that the buffer is not full.
- *
- * Note: 'miinfo' is unused but has been included for consistency with the
- * other functions in this area.
- */
-static inline TupleTableSlot *
-CopyMultiInsertInfoNextFreeSlot(CopyMultiInsertInfo *miinfo,
-								ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-	int			nused = buffer->nused;
-
-	Assert(buffer != NULL);
-	Assert(nused < MAX_BUFFERED_TUPLES);
-
-	if (buffer->slots[nused] == NULL)
-		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
-	return buffer->slots[nused];
-}
-
-/*
- * Record the previously reserved TupleTableSlot that was reserved by
- * CopyMultiInsertInfoNextFreeSlot as being consumed.
- */
-static inline void
-CopyMultiInsertInfoStore(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						 TupleTableSlot *slot, int tuplen, uint64 lineno)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-
-	Assert(buffer != NULL);
-	Assert(slot == buffer->slots[buffer->nused]);
-
-	/* Store the line number so we can properly report any errors later */
-	buffer->linenos[buffer->nused] = lineno;
-
-	/* Record this slot as being used */
-	buffer->nused++;
-
-	/* Update how many tuples are stored and their size */
-	miinfo->bufferedTuples++;
-	miinfo->bufferedBytes += tuplen;
-}
-
 /*
  * Copy FROM file to relation.
  */
@@ -536,7 +177,6 @@ CopyFrom(CopyFromState cstate)
 	int			ti_options = 0; /* start with default options for insert */
 	BulkInsertState bistate = NULL;
 	CopyInsertMethod insertMethod;
-	CopyMultiInsertInfo multiInsertInfo = {0};	/* pacify compiler */
 	uint64		processed = 0;
 	bool		has_before_insert_row_trig;
 	bool		has_instead_insert_row_trig;
@@ -723,7 +363,7 @@ CopyFrom(CopyFromState cstate)
 		 * For partitioned tables we can't support multi-inserts when there
 		 * are any statement level insert triggers. It might be possible to
 		 * allow partitioned tables with such triggers in the future, but for
-		 * now, CopyMultiInsertInfoFlush expects that any before row insert
+		 * now, MultiInsertInfoFlush expects that any before row insert
 		 * and statement level insert triggers are on the same relation.
 		 */
 		insertMethod = CIM_SINGLE;
@@ -771,7 +411,8 @@ CopyFrom(CopyFromState cstate)
 		else
 			insertMethod = CIM_MULTI;
 
-		CopyMultiInsertInfoInit(&multiInsertInfo, resultRelInfo, cstate,
+		MultiInsertInfoInit(&cstate->miinfo, resultRelInfo,
+								cstate->transition_capture,
 								estate, mycid, ti_options);
 	}
 
@@ -834,7 +475,7 @@ CopyFrom(CopyFromState cstate)
 			Assert(resultRelInfo == target_resultRelInfo);
 			Assert(insertMethod == CIM_MULTI);
 
-			myslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+			myslot = MultiInsertInfoNextFreeSlot(&cstate->miinfo,
 													 resultRelInfo);
 		}
 
@@ -903,18 +544,18 @@ CopyFrom(CopyFromState cstate)
 				/* Set the multi-insert buffer to use for this partition. */
 				if (leafpart_use_multi_insert)
 				{
-					if (resultRelInfo->ri_CopyMultiInsertBuffer == NULL)
-						CopyMultiInsertInfoSetupBuffer(&multiInsertInfo,
+					if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+						MultiInsertInfoSetupBuffer(&cstate->miinfo,
 													   resultRelInfo);
 				}
 				else if (insertMethod == CIM_MULTI_CONDITIONAL &&
-						 !CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
+						 !MultiInsertInfoIsEmpty(&cstate->miinfo))
 				{
 					/*
 					 * Flush pending inserts if this partition can't use
 					 * batching, so rows are visible to triggers etc.
 					 */
-					CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					MultiInsertInfoFlush(&cstate->miinfo, resultRelInfo);
 				}
 
 				if (bistate != NULL)
@@ -960,7 +601,7 @@ CopyFrom(CopyFromState cstate)
 				/* no other path available for partitioned table */
 				Assert(insertMethod == CIM_MULTI_CONDITIONAL);
 
-				batchslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+				batchslot = MultiInsertInfoNextFreeSlot(&cstate->miinfo,
 															resultRelInfo);
 
 				if (map != NULL)
@@ -1040,17 +681,17 @@ CopyFrom(CopyFromState cstate)
 					ExecMaterializeSlot(myslot);
 
 					/* Add this tuple to the tuple buffer */
-					CopyMultiInsertInfoStore(&multiInsertInfo,
+					MultiInsertInfoStore(&cstate->miinfo,
 											 resultRelInfo, myslot,
 											 cstate->line_buf.len,
-											 cstate->cur_lineno);
+											 cstate->miinfo.cur_lineno);
 
 					/*
 					 * If enough inserts have queued up, then flush all
 					 * buffers out to their tables.
 					 */
-					if (CopyMultiInsertInfoIsFull(&multiInsertInfo))
-						CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					if (MultiInsertInfoIsFull(&cstate->miinfo))
+						MultiInsertInfoFlush(&cstate->miinfo, resultRelInfo);
 				}
 				else
 				{
@@ -1109,8 +750,8 @@ CopyFrom(CopyFromState cstate)
 	/* Flush any remaining buffered tuples */
 	if (insertMethod != CIM_SINGLE)
 	{
-		if (!CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
-			CopyMultiInsertInfoFlush(&multiInsertInfo, NULL);
+		if (!MultiInsertInfoIsEmpty(&cstate->miinfo))
+			MultiInsertInfoFlush(&cstate->miinfo, NULL);
 	}
 
 	/* Done, clean up */
@@ -1144,7 +785,7 @@ CopyFrom(CopyFromState cstate)
 
 	/* Tear down the multi-insert buffer data */
 	if (insertMethod != CIM_SINGLE)
-		CopyMultiInsertInfoCleanup(&multiInsertInfo);
+		MultiInsertInfoCleanup(&cstate->miinfo);
 
 	/* Close all the partitioned tables, leaf partitions, and their indices */
 	if (proute)
@@ -1323,7 +964,6 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->reached_eof = false;
 	cstate->eol_type = EOL_UNKNOWN;
 	cstate->cur_relname = RelationGetRelationName(cstate->rel);
-	cstate->cur_lineno = 0;
 	cstate->cur_attname = NULL;
 	cstate->cur_attval = NULL;
 
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 34ed3cfcd5..606268be04 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -456,14 +456,14 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
 	Assert(!cstate->opts.binary);
 
 	/* on input just throw the header line away */
-	if (cstate->cur_lineno == 0 && cstate->opts.header_line)
+	if (cstate->miinfo.cur_lineno == 0 && cstate->opts.header_line)
 	{
-		cstate->cur_lineno++;
+		cstate->miinfo.cur_lineno++;
 		if (CopyReadLine(cstate))
 			return false;		/* done */
 	}
 
-	cstate->cur_lineno++;
+	cstate->miinfo.cur_lineno++;
 
 	/* Actually read the line into memory here */
 	done = CopyReadLine(cstate);
@@ -604,7 +604,7 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 		int16		fld_count;
 		ListCell   *cur;
 
-		cstate->cur_lineno++;
+		cstate->miinfo.cur_lineno++;
 
 		if (!CopyGetInt16(cstate, &fld_count))
 		{
@@ -912,7 +912,7 @@ CopyReadLineText(CopyFromState cstate)
 			 * at all --- is cur_lineno a physical or logical count?)
 			 */
 			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
-				cstate->cur_lineno++;
+				cstate->miinfo.cur_lineno++;
 		}
 
 		/* Process \r */
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 7179f589f9..855a89b570 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1247,7 +1247,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
 													 * ExecInitRoutingInfo */
 	resultRelInfo->ri_PartitionTupleSlot = NULL;	/* ditto */
 	resultRelInfo->ri_ChildToRootMap = NULL;
-	resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	resultRelInfo->ri_MultiInsertBuffer = NULL;
 }
 
 /*
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 86594bd056..1f8ba785db 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -994,7 +994,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 		partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
 		partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);
 
-	partRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	partRelInfo->ri_MultiInsertBuffer = NULL;
 
 	/*
 	 * Keep track of it in the PartitionTupleRouting->partitions array.
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index e0f24283b8..3428d9f48a 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -43,6 +43,7 @@
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "commands/trigger.h"
+#include "commands/copy.h"
 #include "executor/execPartition.h"
 #include "executor/executor.h"
 #include "executor/nodeModifyTable.h"
@@ -72,6 +73,8 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 											   ResultRelInfo *targetRelInfo,
 											   TupleTableSlot *slot,
 											   ResultRelInfo **partRelInfo);
+/* guc */
+int bulk_insert_ntuples = 1000;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -389,6 +392,8 @@ ExecInsert(ModifyTableState *mtstate,
 	ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
 	OnConflictAction onconflict = node->onConflictAction;
 	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+	TupleTableSlot *batchslot = NULL;
+	bool	use_multi_insert = false;
 
 	/*
 	 * If the input result relation is a partitioned table, find the leaf
@@ -408,6 +413,66 @@ ExecInsert(ModifyTableState *mtstate,
 
 	resultRelationDesc = resultRelInfo->ri_RelationDesc;
 
+	/* Use bulk insert after a threshold number of tuples */
+	// XXX: maybe this should only be done if it's not a partitioned table or
+	// if the partitions don't support miinfo, which uses its own bistates
+	mtstate->ntuples++;
+	if (mtstate->bistate == NULL &&
+			mtstate->operation == CMD_INSERT &&
+			mtstate->ntuples > bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+	{
+		elog(DEBUG1, "enabling bulk insert");
+		mtstate->bistate = GetBulkInsertState();
+	}
+
+	if (!mtstate->miinfo)
+	{
+		/*
+		 * If multi-inserts aren't possible for this statement at all, so don't
+		 * check further
+		 */
+	} else if (proute == NULL)
+	{
+		if (mtstate->miinfo->ntuples++ >= bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+			use_multi_insert = true;
+	}
+	else
+	{
+		/*
+		 * If a partitioned table itself allows multi-insert, and bistate
+		 * indicates we've inserted the threshold number of tuples, check if
+		 * the partition also supports it.
+		 */
+
+		/* Determine which triggers exist on this partition */
+		// XXX copyfrom.c only checks triggers when the partition changes,
+		// so maybe use_multi_insert should be in mtstate ?
+		bool has_before_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
+				resultRelInfo->ri_TrigDesc->trig_insert_before_row);
+
+		bool has_instead_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
+				resultRelInfo->ri_TrigDesc->trig_insert_instead_row);
+
+		/*
+		 * Disable multi-inserts when the partition has BEFORE/INSTEAD
+		 * OF triggers, or if the partition is a foreign partition.
+		 * The number of tuples eligible for multi-insert is tracked separately
+		 * from the total number of tuples in case it's not supported for some
+		 * partitions.
+		 */
+		if (!has_before_insert_row_trig &&
+			!has_instead_insert_row_trig &&
+			resultRelInfo->ri_FdwRoutine == NULL &&
+			mtstate->miinfo->ntuples++ >= bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+			use_multi_insert = true;
+	}
+
+	if (use_multi_insert && mtstate->miinfo->ntuples - 1 == bulk_insert_ntuples)
+		elog(DEBUG1, "enabling multi insert");
+
 	/*
 	 * BEFORE ROW INSERT Triggers.
 	 *
@@ -594,7 +659,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   mtstate->bistate,
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -629,12 +694,39 @@ ExecInsert(ModifyTableState *mtstate,
 
 			/* Since there was no insertion conflict, we're done */
 		}
+		else if (use_multi_insert)
+		{
+			if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+				MultiInsertInfoSetupBuffer(mtstate->miinfo, resultRelInfo);
+
+			batchslot = MultiInsertInfoNextFreeSlot(mtstate->miinfo, resultRelInfo);
+			ExecCopySlot(batchslot, slot);
+
+			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot, 0, 0); // XXX: tuplen/lineno
+
+			if (MultiInsertInfoIsFull(mtstate->miinfo))
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+		}
 		else
 		{
+			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
+			{
+				if (mtstate->bistate)
+					ReleaseBulkInsertStatePin(mtstate->bistate);
+				mtstate->prevResultRelInfo = resultRelInfo;
+			}
+
+			/*
+			 * Flush pending inserts if this partition can't use
+			 * batching, so rows are visible to triggers etc.
+			 */
+			if (mtstate->miinfo)
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -647,32 +739,36 @@ ExecInsert(ModifyTableState *mtstate,
 	if (canSetTag)
 		(estate->es_processed)++;
 
-	/*
-	 * If this insert is the result of a partition key update that moved the
-	 * tuple to a new partition, put this row into the transition NEW TABLE,
-	 * if there is one. We need to do this separately for DELETE and INSERT
-	 * because they happen on different tables.
-	 */
-	ar_insert_trig_tcs = mtstate->mt_transition_capture;
-	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
-		&& mtstate->mt_transition_capture->tcs_update_new_table)
+	/* Triggers were already run in the batch insert case */
+	if (batchslot == NULL)
 	{
-		ExecARUpdateTriggers(estate, resultRelInfo, NULL,
-							 NULL,
-							 slot,
-							 NULL,
-							 mtstate->mt_transition_capture);
-
 		/*
-		 * We've already captured the NEW TABLE row, so make sure any AR
-		 * INSERT trigger fired below doesn't capture it again.
+		 * If this insert is the result of a partition key update that moved the
+		 * tuple to a new partition, put this row into the transition NEW TABLE,
+		 * if there is one. We need to do this separately for DELETE and INSERT
+		 * because they happen on different tables.
 		 */
-		ar_insert_trig_tcs = NULL;
-	}
+		ar_insert_trig_tcs = mtstate->mt_transition_capture;
+		if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+			&& mtstate->mt_transition_capture->tcs_update_new_table)
+		{
+			ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+								 NULL,
+								 slot,
+								 NULL,
+								 mtstate->mt_transition_capture);
+
+			/*
+			 * We've already captured the NEW TABLE row, so make sure any AR
+			 * INSERT trigger fired below doesn't capture it again.
+			 */
+			ar_insert_trig_tcs = NULL;
+		}
 
-	/* AFTER ROW INSERT Triggers */
-	ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
-						 ar_insert_trig_tcs);
+		/* AFTER ROW INSERT Triggers */
+		ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
+							 ar_insert_trig_tcs);
+	}
 
 	list_free(recheckIndexes);
 
@@ -2229,6 +2325,45 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+
+	/*
+	 * Set miinfo if it can support multi-insert. This is the equivalent of
+	 * CIM_MULTI_* et al in copyfrom.c
+	 */
+
+	if (operation != CMD_INSERT ||
+			node->onConflictAction != ONCONFLICT_NONE)
+		mtstate->miinfo = NULL;
+	else if (mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
+			(mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_before_row ||
+			 // mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_after_row || // XXX or any row level triggers at all?
+			 mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_instead_row))
+		/*
+		 * Can't support multi-inserts when there are any BEFORE/INSTEAD OF
+		 * triggers on the table.
+		 */
+		mtstate->miinfo = NULL;
+	else if (node->rootRelation > 0 &&
+			mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
+			 mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_new_table)
+		/*
+		 * For partitioned tables we can't support multi-inserts when there
+		 * are any statement level insert triggers.
+		 */
+		mtstate->miinfo = NULL;
+	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL
+			/* || cstate->volatile_defexprs */ )
+		// XXX contain_volatile_functions_not_nextval((Node *) defexpr);
+		/* Can't support multi-inserts to foreign tables or if there are any */
+		mtstate->miinfo = NULL;
+	else
+	{
+		mtstate->miinfo = calloc(sizeof(*mtstate->miinfo), 1);
+		MultiInsertInfoInit(mtstate->miinfo, mtstate->rootResultRelInfo,
+				mtstate->mt_transition_capture,
+				estate, GetCurrentCommandId(true), 0);
+	}
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2695,6 +2830,19 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
+	}
+
+	if (node->miinfo)
+	{
+		if (!MultiInsertInfoIsEmpty(node->miinfo))
+			 MultiInsertInfoFlush(node->miinfo, node->resultRelInfo); // root ?
+		MultiInsertInfoCleanup(node->miinfo);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 635d91d50a..1401217616 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -54,6 +54,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "executor/nodeModifyTable.h"
 #include "optimizer/geqo.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paths.h"
@@ -3399,6 +3400,15 @@ static struct config_int ConfigureNamesInt[] =
 		check_huge_page_size, NULL, NULL
 	},
 
+	{
+		{"bulk_insert_ntuples", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Enable bulk insertions after this number of tuples."),
+			gettext_noop("A ring buffer of limited size will be used and updates done in batch"),
+		},
+		&bulk_insert_ntuples,
+		1000, -1, INT_MAX,
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index c15ea803c3..c0603e13ea 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -15,6 +15,7 @@
 #define COPYFROM_INTERNAL_H
 
 #include "commands/copy.h"
+#include "executor/nodeModifyTable.h"
 #include "commands/trigger.h"
 
 /*
@@ -92,10 +93,12 @@ typedef struct CopyFromStateData
 
 	/* these are just for error messages, see CopyFromErrorCallback */
 	const char *cur_relname;	/* table name for error messages */
-	uint64		cur_lineno;		/* line number for error messages */
 	const char *cur_attname;	/* current att for error messages */
 	const char *cur_attval;		/* current att value for error messages */
 
+	/* For bulk inserts and for error callback */
+	MultiInsertInfo miinfo;
+
 	/*
 	 * Working state
 	 */
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 46a2dc9511..71de7cf80e 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -13,8 +13,12 @@
 #ifndef NODEMODIFYTABLE_H
 #define NODEMODIFYTABLE_H
 
+#include "commands/trigger.h"
+#include "executor/executor.h" // XXX
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT int bulk_insert_ntuples;
+
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
 									   CmdType cmdtype);
@@ -23,4 +27,370 @@ extern ModifyTableState *ExecInitModifyTable(ModifyTable *node, EState *estate,
 extern void ExecEndModifyTable(ModifyTableState *node);
 extern void ExecReScanModifyTable(ModifyTableState *node);
 
+/* Bulk insert stuff which used to live in copy.c */
+
+/*
+ * No more than this many tuples per MultiInsertBuffer
+ *
+ * Caution: Don't make this too big, as we could end up with this many
+ * MultiInsertBuffer items stored in MultiInsertInfo's
+ * multiInsertBuffers list.  Increasing this can cause quadratic growth in
+ * memory requirements during copies into partitioned tables with a large
+ * number of partitions.
+ */
+#define MAX_BUFFERED_TUPLES		1000
+
+/*
+ * Flush buffers if there are >= this many bytes, as counted by the input
+ * size, of tuples stored.
+ */
+#define MAX_BUFFERED_BYTES		65535
+
+/* Trim the list of buffers back down to this number after flushing */
+#define MAX_PARTITION_BUFFERS	32
+
+/* Stores multi-insert data related to a single relation in CopyFrom. */
+typedef struct MultiInsertBuffer
+{
+	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
+	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
+	BulkInsertState bistate;	/* BulkInsertState for this rel */
+	int			nused;			/* number of 'slots' containing tuples */
+	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
+												 * stream */
+} MultiInsertBuffer;
+
+/*
+ * Stores one or many MultiInsertBuffers and details about the size and
+ * number of tuples which are stored in them.  This allows multiple buffers to
+ * exist at once when COPY/INSERTing into a partitioned table.
+ */
+typedef struct MultiInsertInfo
+{
+	List	   *multiInsertBuffers; /* List of tracked MultiInsertBuffers */
+	int			bufferedTuples; /* number of tuples buffered over all buffers */
+	int			bufferedBytes;	/* number of bytes from all buffered tuples */
+	TransitionCaptureState	*transition_capture;
+	EState	   *estate;			/* Executor state */
+	CommandId	mycid;			/* Command Id */
+	int			ti_options;		/* table insert options */
+	size_t		ntuples;		/* Number of rows *eligible* for multi-insert */
+
+	/* Line number for errors in copyfrom.c */
+	uint64		cur_lineno;
+	bool		line_buf_valid;
+} MultiInsertInfo;
+
+
+/*
+ * Allocate memory and initialize a new MultiInsertBuffer for this
+ * ResultRelInfo.
+ */
+static MultiInsertBuffer *
+MultiInsertBufferInit(ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = (MultiInsertBuffer *) palloc(sizeof(MultiInsertBuffer));
+	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
+	buffer->resultRelInfo = rri;
+	buffer->bistate = GetBulkInsertState();
+	buffer->nused = 0;
+
+	return buffer;
+}
+
+/*
+ * Make a new buffer for this ResultRelInfo.
+ */
+static inline void
+MultiInsertInfoSetupBuffer(MultiInsertInfo *miinfo,
+							   ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = MultiInsertBufferInit(rri);
+
+	/* Setup back-link so we can easily find this buffer again */
+	rri->ri_MultiInsertBuffer = buffer;
+	/* Record that we're tracking this buffer */
+	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+}
+
+/*
+ * Initialize an already allocated MultiInsertInfo.
+ *
+ * If rri is a non-partitioned table then a MultiInsertBuffer is set up
+ * for that table.
+ */
+static inline void
+MultiInsertInfoInit(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						TransitionCaptureState *transition_capture,
+						EState *estate, CommandId mycid, int ti_options)
+{
+	miinfo->multiInsertBuffers = NIL;
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+	miinfo->transition_capture = transition_capture;
+	miinfo->estate = estate;
+	miinfo->mycid = mycid;
+	miinfo->ti_options = ti_options;
+	miinfo->cur_lineno = 0;
+
+	/*
+	 * Only setup the buffer when not dealing with a partitioned table.
+	 * Buffers for partitioned tables will just be setup when we need to send
+	 * tuples their way for the first time.
+	 */
+	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		MultiInsertInfoSetupBuffer(miinfo, rri);
+}
+
+/*
+ * Returns true if the buffers are full
+ */
+static inline bool
+MultiInsertInfoIsFull(MultiInsertInfo *miinfo)
+{
+	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
+		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
+		return true;
+	return false;
+}
+
+/*
+ * Returns true if we have no buffered tuples
+ */
+static inline bool
+MultiInsertInfoIsEmpty(MultiInsertInfo *miinfo)
+{
+	return miinfo->bufferedTuples == 0;
+}
+
+/*
+ * Write the tuples stored in 'buffer' out to the table.
+ */
+static inline void
+MultiInsertBufferFlush(MultiInsertInfo *miinfo,
+						   MultiInsertBuffer *buffer)
+{
+	MemoryContext oldcontext;
+	int			i;
+	uint64		save_cur_lineno;
+	EState	   *estate = miinfo->estate;
+	CommandId	mycid = miinfo->mycid;
+	int			ti_options = miinfo->ti_options;
+	bool		line_buf_valid = miinfo->line_buf_valid;
+	int			nused = buffer->nused;
+	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
+	TupleTableSlot **slots = buffer->slots;
+
+	/*
+	 * Print error context information correctly, if one of the operations
+	 * below fail.
+	 */
+	miinfo->line_buf_valid = false;
+	save_cur_lineno = miinfo->cur_lineno;
+
+	/*
+	 * table_multi_insert may leak memory, so switch to short-lived memory
+	 * context before calling it.
+	 */
+	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); // XXX requires executor.h
+	table_multi_insert(resultRelInfo->ri_RelationDesc,
+					   slots,
+					   nused,
+					   mycid,
+					   ti_options,
+					   buffer->bistate);
+	MemoryContextSwitchTo(oldcontext);
+
+	for (i = 0; i < nused; i++)
+	{
+		/*
+		 * If there are any indexes, update them for all the inserted tuples,
+		 * and run AFTER ROW INSERT triggers.
+		 */
+		if (resultRelInfo->ri_NumIndices > 0)
+		{
+			List	   *recheckIndexes;
+
+			miinfo->cur_lineno = buffer->linenos[i];
+			recheckIndexes =
+				ExecInsertIndexTuples(resultRelInfo,
+									  buffer->slots[i], estate, false, NULL,
+									  NIL);
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], recheckIndexes,
+								 miinfo->transition_capture);
+			list_free(recheckIndexes);
+		}
+
+		/*
+		 * There's no indexes, but see if we need to run AFTER ROW INSERT
+		 * triggers anyway.
+		 */
+		else if (resultRelInfo->ri_TrigDesc != NULL &&
+				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
+				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
+		{
+			miinfo->cur_lineno = buffer->linenos[i];
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], NIL, miinfo->transition_capture);
+		}
+
+		ExecClearTuple(slots[i]);
+	}
+
+	/* Mark that all slots are free */
+	buffer->nused = 0;
+
+	/* reset cur_lineno and line_buf_valid to what they were */
+	miinfo->line_buf_valid = line_buf_valid;
+	miinfo->cur_lineno = save_cur_lineno;
+}
+
+/*
+ * Drop used slots and free member for this buffer.
+ *
+ * The buffer must be flushed before cleanup.
+ */
+static inline void
+MultiInsertBufferCleanup(MultiInsertInfo *miinfo,
+							 MultiInsertBuffer *buffer)
+{
+	int			i;
+
+	/* Ensure buffer was flushed */
+	Assert(buffer->nused == 0);
+
+	/* Remove back-link to ourself */
+	buffer->resultRelInfo->ri_MultiInsertBuffer = NULL;
+
+	FreeBulkInsertState(buffer->bistate);
+
+	/* Since we only create slots on demand, just drop the non-null ones. */
+	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
+		ExecDropSingleTupleTableSlot(buffer->slots[i]);
+
+	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
+							 miinfo->ti_options);
+
+	pfree(buffer);
+}
+
+/*
+ * Write out all stored tuples in all buffers out to the tables.
+ *
+ * Once flushed we also trim the tracked buffers list down to size by removing
+ * the buffers created earliest first.
+ *
+ * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
+ * used.  When cleaning up old buffers we'll never remove the one for
+ * 'curr_rri'.
+ */
+static inline void
+MultiInsertInfoFlush(MultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+	{
+		MultiInsertBuffer *buffer = (MultiInsertBuffer *) lfirst(lc);
+
+		MultiInsertBufferFlush(miinfo, buffer);
+	}
+
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+
+	/*
+	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
+	 * remove buffers starting with the ones we created first.  It seems less
+	 * likely that these older ones will be needed than the ones that were
+	 * just created.
+	 */
+	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
+	{
+		MultiInsertBuffer *buffer;
+
+		buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+
+		/*
+		 * We never want to remove the buffer that's currently being used, so
+		 * if we happen to find that then move it to the end of the list.
+		 */
+		if (buffer->resultRelInfo == curr_rri)
+		{
+			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+			buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+		}
+
+		MultiInsertBufferCleanup(miinfo, buffer);
+		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+	}
+}
+
+/*
+ * Cleanup allocated buffers and free memory
+ */
+static inline void
+MultiInsertInfoCleanup(MultiInsertInfo *miinfo)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+		MultiInsertBufferCleanup(miinfo, lfirst(lc));
+
+	list_free(miinfo->multiInsertBuffers);
+}
+
+/*
+ * Get the next TupleTableSlot that the next tuple should be stored in.
+ *
+ * Callers must ensure that the buffer is not full.
+ *
+ * Note: 'miinfo' is unused but has been included for consistency with the
+ * other functions in this area.
+ */
+static inline TupleTableSlot *
+MultiInsertInfoNextFreeSlot(MultiInsertInfo *miinfo,
+								ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+	int			nused = buffer->nused;
+
+	Assert(buffer != NULL);
+	Assert(nused < MAX_BUFFERED_TUPLES);
+
+	if (buffer->slots[nused] == NULL)
+		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
+	return buffer->slots[nused];
+}
+
+/*
+ * Record the previously reserved TupleTableSlot that was reserved by
+ * MultiInsertInfoNextFreeSlot as being consumed.
+ */
+static inline void
+MultiInsertInfoStore(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						 TupleTableSlot *slot, int tuplen, uint64 lineno)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+
+	Assert(buffer != NULL);
+	Assert(slot == buffer->slots[buffer->nused]);
+
+	/* Store the line number so we can properly report any errors later */
+	buffer->linenos[buffer->nused] = lineno;
+
+	/* Record this slot as being used */
+	buffer->nused++;
+
+	/* Update how many tuples are stored and their size */
+	miinfo->bufferedTuples++;
+	miinfo->bufferedBytes += tuplen;
+}
+
 #endif							/* NODEMODIFYTABLE_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 61ba4c3666..477b326d06 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -32,6 +33,9 @@
 #include "utils/tuplesort.h"
 #include "utils/tuplestore.h"
 
+/* This would be a circular inclusion */
+// #include "executor/nodeModifyTable.h"
+
 struct PlanState;				/* forward references in this file */
 struct ParallelHashJoinState;
 struct ExecRowMark;
@@ -39,8 +43,8 @@ struct ExprState;
 struct ExprContext;
 struct RangeTblEntry;			/* avoid including parsenodes.h here */
 struct ExprEvalStep;			/* avoid including execExpr.h everywhere */
-struct CopyMultiInsertBuffer;
-
+// struct MultiInsertBuffer;
+// struct MultiInsertInfo;
 
 /* ----------------
  *		ExprState node
@@ -498,8 +502,8 @@ typedef struct ResultRelInfo
 	 */
 	TupleConversionMap *ri_ChildToRootMap;
 
-	/* for use by copyfrom.c when performing multi-inserts */
-	struct CopyMultiInsertBuffer *ri_CopyMultiInsertBuffer;
+	/* for use by copyfrom.c/modifyTable when performing multi-inserts */
+	struct MultiInsertBuffer *ri_MultiInsertBuffer;
 } ResultRelInfo;
 
 /* ----------------
@@ -1165,6 +1169,10 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT, when miinfo cannot be used */
+	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
+	struct MultiInsertInfo	*miinfo;
+	size_t		ntuples;	/* Number of tuples inserted; */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index da50ee3b67..bc4c1a4fc2 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -462,6 +462,49 @@ Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'),
             part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED,
             part_default DEFAULT, PARTITIONED
 
+-- bulk inserts
+truncate hash_parted;
+begin;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+ a  
+----
+ 12
+ 11
+(2 rows)
+
+-- exercise bulk insert to partitions
+SET client_min_messages=debug1;
+insert into hash_parted select generate_series(1,9999);
+DEBUG:  enabling bulk insert
+DEBUG:  enabling multi insert
+RESET client_min_messages;
+select count(1) from hash_parted;
+ count 
+-------
+ 10001
+(1 row)
+
+commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Index Only Scan using hpart1_a_idx on hpart1 hash_parted
+   Index Cond: (a = 13)
+(2 rows)
+
+select * from hash_parted where a=13;
+ a  
+----
+ 13
+(1 row)
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 963faa1614..a74eb3826a 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -280,6 +280,26 @@ from hash_parted order by part;
 -- partitions
 \d+ list_parted
 
+-- bulk inserts
+truncate hash_parted;
+begin;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+-- exercise bulk insert to partitions
+SET client_min_messages=debug1;
+insert into hash_parted select generate_series(1,9999);
+RESET client_min_messages;
+select count(1) from hash_parted;
+commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+select * from hash_parted where a=13;
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index cf63acbf6f..72653c16e4 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -422,8 +422,6 @@ ConvertRowtypeExpr
 CookedConstraint
 CopyDest
 CopyInsertMethod
-CopyMultiInsertBuffer
-CopyMultiInsertInfo
 CopyState
 CopyStateData
 CopyStmt
@@ -1388,6 +1386,8 @@ ModifyTableState
 MorphOpaque
 MsgType
 MultiAssignRef
+MultiInsertBuffer
+MultiInsertInfo
 MultiSortSupport
 MultiSortSupportData
 MultiXactId
-- 
2.17.0

v8-0002-WIP-Check-for-volatile-defaults.patchtext/x-diff; charset=us-asciiDownload

From 0dc38dced6fbf302f1174af60754341998c3ef4f Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Tue, 1 Dec 2020 23:11:31 -0600
Subject: [PATCH v8 2/4] WIP: Check for volatile defaults

We want to check if any column *uses* a volatile default value, but after
parsing and rewriting, that information appears to be lost about which column
values are defaults and which were specified.  insertedcols doesn't appear to
be useful for this.  So add a field to track if a TargetEntry is planned with
column default.
---
 src/backend/executor/nodeModifyTable.c | 64 ++++++++++++++++++++++++--
 src/backend/nodes/copyfuncs.c          |  1 +
 src/backend/nodes/equalfuncs.c         |  1 +
 src/backend/nodes/makefuncs.c          |  1 +
 src/backend/nodes/outfuncs.c           |  1 +
 src/backend/nodes/readfuncs.c          |  1 +
 src/backend/optimizer/util/tlist.c     |  1 +
 src/backend/rewrite/rewriteHandler.c   |  3 ++
 src/include/nodes/primnodes.h          |  2 +
 9 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 3428d9f48a..47a5271b91 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -50,6 +50,7 @@
 #include "foreign/fdwapi.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
+#include "optimizer/optimizer.h"
 #include "rewrite/rewriteHandler.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
@@ -2258,6 +2259,61 @@ ExecModifyTable(PlanState *pstate)
 	return NULL;
 }
 
+/*
+ * Determine if a table has volatile column defaults which are used by a given
+ * planned statement (if the column is not specified or specified as DEFAULT).
+ * This works only for INSERT.
+ */
+static bool
+has_volatile_defaults(ResultRelInfo *resultRelInfo, ModifyTable *node)
+{
+	TupleDesc	tupDesc = RelationGetDescr(resultRelInfo->ri_RelationDesc);
+	Plan		*plan;
+
+	Assert(list_length(node->plans) == 1);
+	plan = linitial(node->plans);
+
+	for (int attnum = 1; attnum <= tupDesc->natts; attnum++)
+	{
+		Form_pg_attribute att = TupleDescAttr(tupDesc, attnum - 1);
+		Expr		*defexpr;
+		TargetEntry	*tle;
+
+		/* We don't need to check dropped/generated attributes */
+		if (att->attisdropped || att->attgenerated)
+			continue;
+
+		tle = list_nth(plan->targetlist, attnum - 1);
+		Assert(tle != NULL);
+		Assert(tle->resno == attnum);
+
+		/*
+		 * If the column was specified with a non-default value, then don't
+		 * check the volatility of its default
+		 */
+		if (!tle->isdefault)
+			continue;
+
+		/* Check the column's default value if one exists */
+		defexpr = (Expr *) build_column_default(resultRelInfo->ri_RelationDesc, attnum);
+		if (defexpr == NULL)
+			continue;
+
+		/* Run the expression through planner */
+		// defexpr = expression_planner(defexpr);
+		// (void) ExecInitExpr(defexpr, NULL);
+		expression_planner(defexpr);
+
+		if (contain_volatile_functions_not_nextval((Node *) defexpr))
+		{
+			elog(DEBUG1, "found volatile att %d", attnum);
+			return true;
+		}
+	}
+
+	return false;
+}
+
 /* ----------------------------------------------------------------
  *		ExecInitModifyTable
  * ----------------------------------------------------------------
@@ -2352,10 +2408,10 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 		 * are any statement level insert triggers.
 		 */
 		mtstate->miinfo = NULL;
-	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL
-			/* || cstate->volatile_defexprs */ )
-		// XXX contain_volatile_functions_not_nextval((Node *) defexpr);
-		/* Can't support multi-inserts to foreign tables or if there are any */
+	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL ||
+			has_volatile_defaults(mtstate->rootResultRelInfo, node))
+		/* Can't support multi-inserts to foreign tables or if there are any
+		 * volatile default expressions in the table. */
 		mtstate->miinfo = NULL;
 	else
 	{
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 910906f639..3bfa59f1a5 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -2177,6 +2177,7 @@ _copyTargetEntry(const TargetEntry *from)
 	COPY_SCALAR_FIELD(resorigtbl);
 	COPY_SCALAR_FIELD(resorigcol);
 	COPY_SCALAR_FIELD(resjunk);
+	COPY_SCALAR_FIELD(isdefault);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index 687609f59e..84530468b8 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -769,6 +769,7 @@ _equalTargetEntry(const TargetEntry *a, const TargetEntry *b)
 	COMPARE_SCALAR_FIELD(resorigtbl);
 	COMPARE_SCALAR_FIELD(resorigcol);
 	COMPARE_SCALAR_FIELD(resjunk);
+	COMPARE_SCALAR_FIELD(isdefault);
 
 	return true;
 }
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index ee033ae779..1cd14fed3d 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -254,6 +254,7 @@ makeTargetEntry(Expr *expr,
 	tle->ressortgroupref = 0;
 	tle->resorigtbl = InvalidOid;
 	tle->resorigcol = 0;
+	tle->isdefault = false;
 
 	tle->resjunk = resjunk;
 
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 9c73c605a4..ef6d14e072 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -1660,6 +1660,7 @@ _outTargetEntry(StringInfo str, const TargetEntry *node)
 	WRITE_OID_FIELD(resorigtbl);
 	WRITE_INT_FIELD(resorigcol);
 	WRITE_BOOL_FIELD(resjunk);
+	WRITE_BOOL_FIELD(isdefault);
 }
 
 static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 169d5581b9..ba2288f7b0 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1274,6 +1274,7 @@ _readTargetEntry(void)
 	READ_OID_FIELD(resorigtbl);
 	READ_INT_FIELD(resorigcol);
 	READ_BOOL_FIELD(resjunk);
+	READ_BOOL_FIELD(isdefault);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c
index 02a3c6b165..1043ee7b66 100644
--- a/src/backend/optimizer/util/tlist.c
+++ b/src/backend/optimizer/util/tlist.c
@@ -354,6 +354,7 @@ apply_tlist_labeling(List *dest_tlist, List *src_tlist)
 		dest_tle->resorigtbl = src_tle->resorigtbl;
 		dest_tle->resorigcol = src_tle->resorigcol;
 		dest_tle->resjunk = src_tle->resjunk;
+		dest_tle->isdefault = src_tle->isdefault;
 	}
 }
 
diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c
index c25012f325..9998db2268 100644
--- a/src/backend/rewrite/rewriteHandler.c
+++ b/src/backend/rewrite/rewriteHandler.c
@@ -986,10 +986,13 @@ rewriteTargetListIU(List *targetList,
 			}
 
 			if (new_expr)
+			{
 				new_tle = makeTargetEntry((Expr *) new_expr,
 										  attrno,
 										  pstrdup(NameStr(att_tup->attname)),
 										  false);
+				new_tle->isdefault = true;
+			}
 		}
 
 		/*
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index cdbe781c73..c5e626f175 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -1428,6 +1428,8 @@ typedef struct TargetEntry
 	AttrNumber	resorigcol;		/* column's number in source table */
 	bool		resjunk;		/* set to true to eliminate the attribute from
 								 * final target list */
+	bool		isdefault;		/* true if using the column default, either
+								 * by "DEFAULT" or omission of the column */
 } TargetEntry;
 
 
-- 
2.17.0

v8-0003-COPY-flush-multi-insert-buffer-based-on-accumulat.patchtext/x-diff; charset=us-asciiDownload

From 7930b240635af0045e95591001eac9cba16cfd70 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 5 Dec 2020 08:52:14 -0600
Subject: [PATCH v8 3/4] COPY: flush multi-insert buffer based on accumulated
 size of tuples..

..rather than line length
---
 src/backend/commands/copyfrom.c        | 2 +-
 src/include/executor/nodeModifyTable.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index c4fe75df8e..a8bbfcc71f 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -683,7 +683,7 @@ CopyFrom(CopyFromState cstate)
 					/* Add this tuple to the tuple buffer */
 					MultiInsertInfoStore(&cstate->miinfo,
 											 resultRelInfo, myslot,
-											 cstate->line_buf.len,
+											 MemoryContextMemAllocated(myslot->tts_mcxt, true),
 											 cstate->miinfo.cur_lineno);
 
 					/*
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 71de7cf80e..e6bb27aade 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -41,10 +41,10 @@ extern void ExecReScanModifyTable(ModifyTableState *node);
 #define MAX_BUFFERED_TUPLES		1000
 
 /*
- * Flush buffers if there are >= this many bytes, as counted by the input
- * size, of tuples stored.
+ * Flush buffers if there are >= this many bytes of tuples stored, as counted
+ * by the slot's memory contexts.
  */
-#define MAX_BUFFERED_BYTES		65535
+#define MAX_BUFFERED_BYTES		(1024*1024*8)
 
 /* Trim the list of buffers back down to this number after flushing */
 #define MAX_PARTITION_BUFFERS	32
-- 
2.17.0

v8-0004-WIP-check-tuple-size.patchtext/x-diff; charset=us-asciiDownload

From ff2a04f3958953f4956e8bbaa7d4ad434e623468 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Tue, 1 Dec 2020 17:20:25 -0600
Subject: [PATCH v8 4/4] WIP: check tuple size

Or maybe INSERT should flush buffer based only on the *number* of tuples, and
not their size ?
---
 src/backend/executor/nodeModifyTable.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 47a5271b91..b4f523e32d 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -703,7 +703,13 @@ ExecInsert(ModifyTableState *mtstate,
 			batchslot = MultiInsertInfoNextFreeSlot(mtstate->miinfo, resultRelInfo);
 			ExecCopySlot(batchslot, slot);
 
-			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot, 0, 0); // XXX: tuplen/lineno
+			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot,
+					// sizeof(void*) * batchslot->tts_nvalid, /* tuple size - underestimate */
+					MemoryContextMemAllocated(batchslot->tts_mcxt, true), /* tuple size */
+					mtstate->ntuples); /* lineno */
+
+			elog(DEBUG2, "bufferedBytes %d; tuples %ld",
+					mtstate->miinfo->bufferedBytes, mtstate->ntuples);
 
 			if (MultiInsertInfoIsFull(mtstate->miinfo))
 				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
-- 
2.17.0

pryzby@telsasoft.com

almost 5 years ago

In reply to: Justin Pryzby (#18)

4 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Sat, Dec 05, 2020 at 01:59:41PM -0600, Justin Pryzby wrote:

On Thu, Dec 03, 2020 at 10:59:34AM +0530, Bharath Rupireddy wrote:

On Wed, Dec 2, 2020 at 10:24 PM Justin Pryzby <pryzby@telsasoft.com> wrote:

One loose end in this patch is how to check for volatile default expressions.

I think we should be doing all the necessary checks in the planner and
have a flag in the planned stmt to indicate whether to go with multi
insert or not. For the required checks, we can have a look at how the
existing COPY decides to go with either CIM_MULTI or CIM_SINGLE.

Yes, you can see that I've copied the checks from copy.
Like copy, some checks are done once, in ExecInitModifyTable, outside of the
ExecModifyTable "loop".

This squishes some commits together.
And uses bistate for ON CONFLICT.
And attempts to use memory context for tuple size.

Rebased on 9dc718bdf2b1a574481a45624d42b674332e2903

I guess my patch should/may be subsumed by this other one - I'm fine with that.
https://commitfest.postgresql.org/31/2871/

Note that my interest here is just in bistate, to avoid leaving behind many
dirty buffers, not improved performance of COPY.

--
Justin

Attachments:

v9-0001-INSERT-SELECT-to-use-BulkInsertState-and-multi_in.patchtext/x-diff; charset=us-asciiDownload

From 65fd9d9352634e4d0ad49685a988c5e4e157b9d8 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v9 1/4] INSERT SELECT to use BulkInsertState and multi_insert

Renames structures;
Move MultipleInsert functions from copyfrom.c to (tentatively) nodeModifyTable.h;
Move into MultiInsertInfo: transition_capture and cur_lineno (via cstate->miinfo);

Dynamically switch to multi-insert mode based on the number of insertions.
This is intended to accomodate 1) the original use case of INSERT using a small
ring buffer to avoid leaving behind dirty buffers; and, 2) Automatically using
multi-inserts for batch operations; 3) allow the old behavior of leaving behind
dirty buffers, which might allow INSERT to run more quickly, at the cost of
leaving behind many dirty buffers which other backends may have to write out.

XXX: for (1), the bulk-insert state is used even if not multi-insert, including
for a VALUES.

TODO: use cstate->miinfo.cur_lineno++ instead of mtstate->miinfo->ntuples
---
 src/backend/commands/copyfrom.c          | 394 +----------------------
 src/backend/commands/copyfromparse.c     |  10 +-
 src/backend/executor/execMain.c          |   2 +-
 src/backend/executor/execPartition.c     |   2 +-
 src/backend/executor/nodeModifyTable.c   | 196 +++++++++--
 src/backend/utils/misc/guc.c             |  10 +
 src/include/commands/copyfrom_internal.h |   5 +-
 src/include/executor/nodeModifyTable.h   | 371 +++++++++++++++++++++
 src/include/nodes/execnodes.h            |  16 +-
 src/test/regress/expected/insert.out     |  43 +++
 src/test/regress/sql/insert.sql          |  20 ++
 src/tools/pgindent/typedefs.list         |   4 +-
 12 files changed, 658 insertions(+), 415 deletions(-)

diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index c39cc736ed..8221a2c5d3 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -46,54 +46,6 @@
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
 
-/*
- * No more than this many tuples per CopyMultiInsertBuffer
- *
- * Caution: Don't make this too big, as we could end up with this many
- * CopyMultiInsertBuffer items stored in CopyMultiInsertInfo's
- * multiInsertBuffers list.  Increasing this can cause quadratic growth in
- * memory requirements during copies into partitioned tables with a large
- * number of partitions.
- */
-#define MAX_BUFFERED_TUPLES		1000
-
-/*
- * Flush buffers if there are >= this many bytes, as counted by the input
- * size, of tuples stored.
- */
-#define MAX_BUFFERED_BYTES		65535
-
-/* Trim the list of buffers back down to this number after flushing */
-#define MAX_PARTITION_BUFFERS	32
-
-/* Stores multi-insert data related to a single relation in CopyFrom. */
-typedef struct CopyMultiInsertBuffer
-{
-	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
-	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
-	BulkInsertState bistate;	/* BulkInsertState for this rel */
-	int			nused;			/* number of 'slots' containing tuples */
-	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
-												 * stream */
-} CopyMultiInsertBuffer;
-
-/*
- * Stores one or many CopyMultiInsertBuffers and details about the size and
- * number of tuples which are stored in them.  This allows multiple buffers to
- * exist at once when COPYing into a partitioned table.
- */
-typedef struct CopyMultiInsertInfo
-{
-	List	   *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
-	int			bufferedTuples; /* number of tuples buffered over all buffers */
-	int			bufferedBytes;	/* number of bytes from all buffered tuples */
-	CopyFromState	cstate;			/* Copy state for this CopyMultiInsertInfo */
-	EState	   *estate;			/* Executor state used for COPY */
-	CommandId	mycid;			/* Command Id used for COPY */
-	int			ti_options;		/* table insert options */
-} CopyMultiInsertInfo;
-
-
 /* non-export function prototypes */
 static char *limit_printout_length(const char *str);
 
@@ -111,7 +63,7 @@ CopyFromErrorCallback(void *arg)
 	char		curlineno_str[32];
 
 	snprintf(curlineno_str, sizeof(curlineno_str), UINT64_FORMAT,
-			 cstate->cur_lineno);
+			 cstate->miinfo.cur_lineno);
 
 	if (cstate->opts.binary)
 	{
@@ -206,317 +158,6 @@ limit_printout_length(const char *str)
 	return res;
 }
 
-/*
- * Allocate memory and initialize a new CopyMultiInsertBuffer for this
- * ResultRelInfo.
- */
-static CopyMultiInsertBuffer *
-CopyMultiInsertBufferInit(ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = (CopyMultiInsertBuffer *) palloc(sizeof(CopyMultiInsertBuffer));
-	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
-	buffer->resultRelInfo = rri;
-	buffer->bistate = GetBulkInsertState();
-	buffer->nused = 0;
-
-	return buffer;
-}
-
-/*
- * Make a new buffer for this ResultRelInfo.
- */
-static inline void
-CopyMultiInsertInfoSetupBuffer(CopyMultiInsertInfo *miinfo,
-							   ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = CopyMultiInsertBufferInit(rri);
-
-	/* Setup back-link so we can easily find this buffer again */
-	rri->ri_CopyMultiInsertBuffer = buffer;
-	/* Record that we're tracking this buffer */
-	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-}
-
-/*
- * Initialize an already allocated CopyMultiInsertInfo.
- *
- * If rri is a non-partitioned table then a CopyMultiInsertBuffer is set up
- * for that table.
- */
-static void
-CopyMultiInsertInfoInit(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						CopyFromState cstate, EState *estate, CommandId mycid,
-						int ti_options)
-{
-	miinfo->multiInsertBuffers = NIL;
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-	miinfo->cstate = cstate;
-	miinfo->estate = estate;
-	miinfo->mycid = mycid;
-	miinfo->ti_options = ti_options;
-
-	/*
-	 * Only setup the buffer when not dealing with a partitioned table.
-	 * Buffers for partitioned tables will just be setup when we need to send
-	 * tuples their way for the first time.
-	 */
-	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
-		CopyMultiInsertInfoSetupBuffer(miinfo, rri);
-}
-
-/*
- * Returns true if the buffers are full
- */
-static inline bool
-CopyMultiInsertInfoIsFull(CopyMultiInsertInfo *miinfo)
-{
-	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
-		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
-		return true;
-	return false;
-}
-
-/*
- * Returns true if we have no buffered tuples
- */
-static inline bool
-CopyMultiInsertInfoIsEmpty(CopyMultiInsertInfo *miinfo)
-{
-	return miinfo->bufferedTuples == 0;
-}
-
-/*
- * Write the tuples stored in 'buffer' out to the table.
- */
-static inline void
-CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo,
-						   CopyMultiInsertBuffer *buffer)
-{
-	MemoryContext oldcontext;
-	int			i;
-	uint64		save_cur_lineno;
-	CopyFromState	cstate = miinfo->cstate;
-	EState	   *estate = miinfo->estate;
-	CommandId	mycid = miinfo->mycid;
-	int			ti_options = miinfo->ti_options;
-	bool		line_buf_valid = cstate->line_buf_valid;
-	int			nused = buffer->nused;
-	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
-	TupleTableSlot **slots = buffer->slots;
-
-	/*
-	 * Print error context information correctly, if one of the operations
-	 * below fail.
-	 */
-	cstate->line_buf_valid = false;
-	save_cur_lineno = cstate->cur_lineno;
-
-	/*
-	 * table_multi_insert may leak memory, so switch to short-lived memory
-	 * context before calling it.
-	 */
-	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
-	table_multi_insert(resultRelInfo->ri_RelationDesc,
-					   slots,
-					   nused,
-					   mycid,
-					   ti_options,
-					   buffer->bistate);
-	MemoryContextSwitchTo(oldcontext);
-
-	for (i = 0; i < nused; i++)
-	{
-		/*
-		 * If there are any indexes, update them for all the inserted tuples,
-		 * and run AFTER ROW INSERT triggers.
-		 */
-		if (resultRelInfo->ri_NumIndices > 0)
-		{
-			List	   *recheckIndexes;
-
-			cstate->cur_lineno = buffer->linenos[i];
-			recheckIndexes =
-				ExecInsertIndexTuples(resultRelInfo,
-									  buffer->slots[i], estate, false, false,
-									  NULL, NIL);
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], recheckIndexes,
-								 cstate->transition_capture);
-			list_free(recheckIndexes);
-		}
-
-		/*
-		 * There's no indexes, but see if we need to run AFTER ROW INSERT
-		 * triggers anyway.
-		 */
-		else if (resultRelInfo->ri_TrigDesc != NULL &&
-				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
-				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
-		{
-			cstate->cur_lineno = buffer->linenos[i];
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], NIL, cstate->transition_capture);
-		}
-
-		ExecClearTuple(slots[i]);
-	}
-
-	/* Mark that all slots are free */
-	buffer->nused = 0;
-
-	/* reset cur_lineno and line_buf_valid to what they were */
-	cstate->line_buf_valid = line_buf_valid;
-	cstate->cur_lineno = save_cur_lineno;
-}
-
-/*
- * Drop used slots and free member for this buffer.
- *
- * The buffer must be flushed before cleanup.
- */
-static inline void
-CopyMultiInsertBufferCleanup(CopyMultiInsertInfo *miinfo,
-							 CopyMultiInsertBuffer *buffer)
-{
-	int			i;
-
-	/* Ensure buffer was flushed */
-	Assert(buffer->nused == 0);
-
-	/* Remove back-link to ourself */
-	buffer->resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
-
-	FreeBulkInsertState(buffer->bistate);
-
-	/* Since we only create slots on demand, just drop the non-null ones. */
-	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
-		ExecDropSingleTupleTableSlot(buffer->slots[i]);
-
-	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
-							 miinfo->ti_options);
-
-	pfree(buffer);
-}
-
-/*
- * Write out all stored tuples in all buffers out to the tables.
- *
- * Once flushed we also trim the tracked buffers list down to size by removing
- * the buffers created earliest first.
- *
- * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
- * used.  When cleaning up old buffers we'll never remove the one for
- * 'curr_rri'.
- */
-static inline void
-CopyMultiInsertInfoFlush(CopyMultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-	{
-		CopyMultiInsertBuffer *buffer = (CopyMultiInsertBuffer *) lfirst(lc);
-
-		CopyMultiInsertBufferFlush(miinfo, buffer);
-	}
-
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-
-	/*
-	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
-	 * remove buffers starting with the ones we created first.  It seems less
-	 * likely that these older ones will be needed than the ones that were
-	 * just created.
-	 */
-	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
-	{
-		CopyMultiInsertBuffer *buffer;
-
-		buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-
-		/*
-		 * We never want to remove the buffer that's currently being used, so
-		 * if we happen to find that then move it to the end of the list.
-		 */
-		if (buffer->resultRelInfo == curr_rri)
-		{
-			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-			buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-		}
-
-		CopyMultiInsertBufferCleanup(miinfo, buffer);
-		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-	}
-}
-
-/*
- * Cleanup allocated buffers and free memory
- */
-static inline void
-CopyMultiInsertInfoCleanup(CopyMultiInsertInfo *miinfo)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-		CopyMultiInsertBufferCleanup(miinfo, lfirst(lc));
-
-	list_free(miinfo->multiInsertBuffers);
-}
-
-/*
- * Get the next TupleTableSlot that the next tuple should be stored in.
- *
- * Callers must ensure that the buffer is not full.
- *
- * Note: 'miinfo' is unused but has been included for consistency with the
- * other functions in this area.
- */
-static inline TupleTableSlot *
-CopyMultiInsertInfoNextFreeSlot(CopyMultiInsertInfo *miinfo,
-								ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-	int			nused = buffer->nused;
-
-	Assert(buffer != NULL);
-	Assert(nused < MAX_BUFFERED_TUPLES);
-
-	if (buffer->slots[nused] == NULL)
-		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
-	return buffer->slots[nused];
-}
-
-/*
- * Record the previously reserved TupleTableSlot that was reserved by
- * CopyMultiInsertInfoNextFreeSlot as being consumed.
- */
-static inline void
-CopyMultiInsertInfoStore(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						 TupleTableSlot *slot, int tuplen, uint64 lineno)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-
-	Assert(buffer != NULL);
-	Assert(slot == buffer->slots[buffer->nused]);
-
-	/* Store the line number so we can properly report any errors later */
-	buffer->linenos[buffer->nused] = lineno;
-
-	/* Record this slot as being used */
-	buffer->nused++;
-
-	/* Update how many tuples are stored and their size */
-	miinfo->bufferedTuples++;
-	miinfo->bufferedBytes += tuplen;
-}
-
 /*
  * Copy FROM file to relation.
  */
@@ -538,7 +179,6 @@ CopyFrom(CopyFromState cstate)
 	int			ti_options = 0; /* start with default options for insert */
 	BulkInsertState bistate = NULL;
 	CopyInsertMethod insertMethod;
-	CopyMultiInsertInfo multiInsertInfo = {0};	/* pacify compiler */
 	uint64		processed = 0;
 	bool		has_before_insert_row_trig;
 	bool		has_instead_insert_row_trig;
@@ -725,7 +365,7 @@ CopyFrom(CopyFromState cstate)
 		 * For partitioned tables we can't support multi-inserts when there
 		 * are any statement level insert triggers. It might be possible to
 		 * allow partitioned tables with such triggers in the future, but for
-		 * now, CopyMultiInsertInfoFlush expects that any before row insert
+		 * now, MultiInsertInfoFlush expects that any before row insert
 		 * and statement level insert triggers are on the same relation.
 		 */
 		insertMethod = CIM_SINGLE;
@@ -773,7 +413,8 @@ CopyFrom(CopyFromState cstate)
 		else
 			insertMethod = CIM_MULTI;
 
-		CopyMultiInsertInfoInit(&multiInsertInfo, resultRelInfo, cstate,
+		MultiInsertInfoInit(&cstate->miinfo, resultRelInfo,
+								cstate->transition_capture,
 								estate, mycid, ti_options);
 	}
 
@@ -836,7 +477,7 @@ CopyFrom(CopyFromState cstate)
 			Assert(resultRelInfo == target_resultRelInfo);
 			Assert(insertMethod == CIM_MULTI);
 
-			myslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+			myslot = MultiInsertInfoNextFreeSlot(&cstate->miinfo,
 													 resultRelInfo);
 		}
 
@@ -905,18 +546,18 @@ CopyFrom(CopyFromState cstate)
 				/* Set the multi-insert buffer to use for this partition. */
 				if (leafpart_use_multi_insert)
 				{
-					if (resultRelInfo->ri_CopyMultiInsertBuffer == NULL)
-						CopyMultiInsertInfoSetupBuffer(&multiInsertInfo,
+					if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+						MultiInsertInfoSetupBuffer(&cstate->miinfo,
 													   resultRelInfo);
 				}
 				else if (insertMethod == CIM_MULTI_CONDITIONAL &&
-						 !CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
+						 !MultiInsertInfoIsEmpty(&cstate->miinfo))
 				{
 					/*
 					 * Flush pending inserts if this partition can't use
 					 * batching, so rows are visible to triggers etc.
 					 */
-					CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					MultiInsertInfoFlush(&cstate->miinfo, resultRelInfo);
 				}
 
 				if (bistate != NULL)
@@ -962,7 +603,7 @@ CopyFrom(CopyFromState cstate)
 				/* no other path available for partitioned table */
 				Assert(insertMethod == CIM_MULTI_CONDITIONAL);
 
-				batchslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+				batchslot = MultiInsertInfoNextFreeSlot(&cstate->miinfo,
 															resultRelInfo);
 
 				if (map != NULL)
@@ -1042,17 +683,17 @@ CopyFrom(CopyFromState cstate)
 					ExecMaterializeSlot(myslot);
 
 					/* Add this tuple to the tuple buffer */
-					CopyMultiInsertInfoStore(&multiInsertInfo,
+					MultiInsertInfoStore(&cstate->miinfo,
 											 resultRelInfo, myslot,
 											 cstate->line_buf.len,
-											 cstate->cur_lineno);
+											 cstate->miinfo.cur_lineno);
 
 					/*
 					 * If enough inserts have queued up, then flush all
 					 * buffers out to their tables.
 					 */
-					if (CopyMultiInsertInfoIsFull(&multiInsertInfo))
-						CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					if (MultiInsertInfoIsFull(&cstate->miinfo))
+						MultiInsertInfoFlush(&cstate->miinfo, resultRelInfo);
 				}
 				else
 				{
@@ -1113,8 +754,8 @@ CopyFrom(CopyFromState cstate)
 	/* Flush any remaining buffered tuples */
 	if (insertMethod != CIM_SINGLE)
 	{
-		if (!CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
-			CopyMultiInsertInfoFlush(&multiInsertInfo, NULL);
+		if (!MultiInsertInfoIsEmpty(&cstate->miinfo))
+			MultiInsertInfoFlush(&cstate->miinfo, NULL);
 	}
 
 	/* Done, clean up */
@@ -1148,7 +789,7 @@ CopyFrom(CopyFromState cstate)
 
 	/* Tear down the multi-insert buffer data */
 	if (insertMethod != CIM_SINGLE)
-		CopyMultiInsertInfoCleanup(&multiInsertInfo);
+		MultiInsertInfoCleanup(&cstate->miinfo);
 
 	/* Close all the partitioned tables, leaf partitions, and their indices */
 	if (proute)
@@ -1327,7 +968,6 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->reached_eof = false;
 	cstate->eol_type = EOL_UNKNOWN;
 	cstate->cur_relname = RelationGetRelationName(cstate->rel);
-	cstate->cur_lineno = 0;
 	cstate->cur_attname = NULL;
 	cstate->cur_attval = NULL;
 
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 4c74067f84..02efad8b31 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -460,14 +460,14 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
 	Assert(!cstate->opts.binary);
 
 	/* on input just throw the header line away */
-	if (cstate->cur_lineno == 0 && cstate->opts.header_line)
+	if (cstate->miinfo.cur_lineno == 0 && cstate->opts.header_line)
 	{
-		cstate->cur_lineno++;
+		cstate->miinfo.cur_lineno++;
 		if (CopyReadLine(cstate))
 			return false;		/* done */
 	}
 
-	cstate->cur_lineno++;
+	cstate->miinfo.cur_lineno++;
 
 	/* Actually read the line into memory here */
 	done = CopyReadLine(cstate);
@@ -608,7 +608,7 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 		int16		fld_count;
 		ListCell   *cur;
 
-		cstate->cur_lineno++;
+		cstate->miinfo.cur_lineno++;
 
 		if (!CopyGetInt16(cstate, &fld_count))
 		{
@@ -916,7 +916,7 @@ CopyReadLineText(CopyFromState cstate)
 			 * at all --- is cur_lineno a physical or logical count?)
 			 */
 			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
-				cstate->cur_lineno++;
+				cstate->miinfo.cur_lineno++;
 		}
 
 		/* Process \r */
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index f4dd47acc7..79dbf0b3e8 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1247,7 +1247,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
 													 * ExecInitRoutingInfo */
 	resultRelInfo->ri_PartitionTupleSlot = NULL;	/* ditto */
 	resultRelInfo->ri_ChildToRootMap = NULL;
-	resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	resultRelInfo->ri_MultiInsertBuffer = NULL;
 }
 
 /*
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 941731a0a9..067bfd11ba 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -993,7 +993,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 		partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
 		partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);
 
-	partRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	partRelInfo->ri_MultiInsertBuffer = NULL;
 
 	/*
 	 * Keep track of it in the PartitionTupleRouting->partitions array.
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 921e695419..a53cbeeb2c 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -43,6 +43,7 @@
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "commands/trigger.h"
+#include "commands/copy.h"
 #include "executor/execPartition.h"
 #include "executor/executor.h"
 #include "executor/nodeModifyTable.h"
@@ -72,6 +73,8 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 											   ResultRelInfo *targetRelInfo,
 											   TupleTableSlot *slot,
 											   ResultRelInfo **partRelInfo);
+/* guc */
+int bulk_insert_ntuples = 1000;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -389,6 +392,8 @@ ExecInsert(ModifyTableState *mtstate,
 	ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
 	OnConflictAction onconflict = node->onConflictAction;
 	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+	TupleTableSlot *batchslot = NULL;
+	bool	use_multi_insert = false;
 
 	/*
 	 * If the input result relation is a partitioned table, find the leaf
@@ -408,6 +413,66 @@ ExecInsert(ModifyTableState *mtstate,
 
 	resultRelationDesc = resultRelInfo->ri_RelationDesc;
 
+	/* Use bulk insert after a threshold number of tuples */
+	// XXX: maybe this should only be done if it's not a partitioned table or
+	// if the partitions don't support miinfo, which uses its own bistates
+	mtstate->ntuples++;
+	if (mtstate->bistate == NULL &&
+			mtstate->operation == CMD_INSERT &&
+			mtstate->ntuples > bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+	{
+		elog(DEBUG1, "enabling bulk insert");
+		mtstate->bistate = GetBulkInsertState();
+	}
+
+	if (!mtstate->miinfo)
+	{
+		/*
+		 * If multi-inserts aren't possible for this statement at all, so don't
+		 * check further
+		 */
+	} else if (proute == NULL)
+	{
+		if (mtstate->miinfo->ntuples++ >= bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+			use_multi_insert = true;
+	}
+	else
+	{
+		/*
+		 * If a partitioned table itself allows multi-insert, and bistate
+		 * indicates we've inserted the threshold number of tuples, check if
+		 * the partition also supports it.
+		 */
+
+		/* Determine which triggers exist on this partition */
+		// XXX copyfrom.c only checks triggers when the partition changes,
+		// so maybe use_multi_insert should be in mtstate ?
+		bool has_before_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
+				resultRelInfo->ri_TrigDesc->trig_insert_before_row);
+
+		bool has_instead_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
+				resultRelInfo->ri_TrigDesc->trig_insert_instead_row);
+
+		/*
+		 * Disable multi-inserts when the partition has BEFORE/INSTEAD
+		 * OF triggers, or if the partition is a foreign partition.
+		 * The number of tuples eligible for multi-insert is tracked separately
+		 * from the total number of tuples in case it's not supported for some
+		 * partitions.
+		 */
+		if (!has_before_insert_row_trig &&
+			!has_instead_insert_row_trig &&
+			resultRelInfo->ri_FdwRoutine == NULL &&
+			mtstate->miinfo->ntuples++ >= bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+			use_multi_insert = true;
+	}
+
+	if (use_multi_insert && mtstate->miinfo->ntuples - 1 == bulk_insert_ntuples)
+		elog(DEBUG1, "enabling multi insert");
+
 	/*
 	 * BEFORE ROW INSERT Triggers.
 	 *
@@ -594,7 +659,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   mtstate->bistate,
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -629,12 +694,39 @@ ExecInsert(ModifyTableState *mtstate,
 
 			/* Since there was no insertion conflict, we're done */
 		}
+		else if (use_multi_insert)
+		{
+			if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+				MultiInsertInfoSetupBuffer(mtstate->miinfo, resultRelInfo);
+
+			batchslot = MultiInsertInfoNextFreeSlot(mtstate->miinfo, resultRelInfo);
+			ExecCopySlot(batchslot, slot);
+
+			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot, 0, 0); // XXX: tuplen/lineno
+
+			if (MultiInsertInfoIsFull(mtstate->miinfo))
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+		}
 		else
 		{
+			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
+			{
+				if (mtstate->bistate)
+					ReleaseBulkInsertStatePin(mtstate->bistate);
+				mtstate->prevResultRelInfo = resultRelInfo;
+			}
+
+			/*
+			 * Flush pending inserts if this partition can't use
+			 * batching, so rows are visible to triggers etc.
+			 */
+			if (mtstate->miinfo)
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -647,32 +739,36 @@ ExecInsert(ModifyTableState *mtstate,
 	if (canSetTag)
 		(estate->es_processed)++;
 
-	/*
-	 * If this insert is the result of a partition key update that moved the
-	 * tuple to a new partition, put this row into the transition NEW TABLE,
-	 * if there is one. We need to do this separately for DELETE and INSERT
-	 * because they happen on different tables.
-	 */
-	ar_insert_trig_tcs = mtstate->mt_transition_capture;
-	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
-		&& mtstate->mt_transition_capture->tcs_update_new_table)
+	/* Triggers were already run in the batch insert case */
+	if (batchslot == NULL)
 	{
-		ExecARUpdateTriggers(estate, resultRelInfo, NULL,
-							 NULL,
-							 slot,
-							 NULL,
-							 mtstate->mt_transition_capture);
-
 		/*
-		 * We've already captured the NEW TABLE row, so make sure any AR
-		 * INSERT trigger fired below doesn't capture it again.
+		 * If this insert is the result of a partition key update that moved the
+		 * tuple to a new partition, put this row into the transition NEW TABLE,
+		 * if there is one. We need to do this separately for DELETE and INSERT
+		 * because they happen on different tables.
 		 */
-		ar_insert_trig_tcs = NULL;
-	}
+		ar_insert_trig_tcs = mtstate->mt_transition_capture;
+		if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+			&& mtstate->mt_transition_capture->tcs_update_new_table)
+		{
+			ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+								 NULL,
+								 slot,
+								 NULL,
+								 mtstate->mt_transition_capture);
+
+			/*
+			 * We've already captured the NEW TABLE row, so make sure any AR
+			 * INSERT trigger fired below doesn't capture it again.
+			 */
+			ar_insert_trig_tcs = NULL;
+		}
 
-	/* AFTER ROW INSERT Triggers */
-	ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
-						 ar_insert_trig_tcs);
+		/* AFTER ROW INSERT Triggers */
+		ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
+							 ar_insert_trig_tcs);
+	}
 
 	list_free(recheckIndexes);
 
@@ -2229,6 +2325,45 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+
+	/*
+	 * Set miinfo if it can support multi-insert. This is the equivalent of
+	 * CIM_MULTI_* et al in copyfrom.c
+	 */
+
+	if (operation != CMD_INSERT ||
+			node->onConflictAction != ONCONFLICT_NONE)
+		mtstate->miinfo = NULL;
+	else if (mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
+			(mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_before_row ||
+			 // mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_after_row || // XXX or any row level triggers at all?
+			 mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_instead_row))
+		/*
+		 * Can't support multi-inserts when there are any BEFORE/INSTEAD OF
+		 * triggers on the table.
+		 */
+		mtstate->miinfo = NULL;
+	else if (node->rootRelation > 0 &&
+			mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
+			 mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_new_table)
+		/*
+		 * For partitioned tables we can't support multi-inserts when there
+		 * are any statement level insert triggers.
+		 */
+		mtstate->miinfo = NULL;
+	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL
+			/* || cstate->volatile_defexprs */ )
+		// XXX contain_volatile_functions_not_nextval((Node *) defexpr);
+		/* Can't support multi-inserts to foreign tables or if there are any */
+		mtstate->miinfo = NULL;
+	else
+	{
+		mtstate->miinfo = calloc(sizeof(*mtstate->miinfo), 1);
+		MultiInsertInfoInit(mtstate->miinfo, mtstate->rootResultRelInfo,
+				mtstate->mt_transition_capture,
+				estate, GetCurrentCommandId(true), 0);
+	}
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2693,6 +2828,19 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
+	}
+
+	if (node->miinfo)
+	{
+		if (!MultiInsertInfoIsEmpty(node->miinfo))
+			 MultiInsertInfoFlush(node->miinfo, node->resultRelInfo); // root ?
+		MultiInsertInfoCleanup(node->miinfo);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 17579eeaca..b6f493e0c8 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -54,6 +54,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "executor/nodeModifyTable.h"
 #include "optimizer/geqo.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paths.h"
@@ -3445,6 +3446,15 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"bulk_insert_ntuples", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Enable bulk insertions after this number of tuples."),
+			gettext_noop("A ring buffer of limited size will be used and updates done in batch"),
+		},
+		&bulk_insert_ntuples,
+		1000, -1, INT_MAX,
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index e37942df39..372414b1c0 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -15,6 +15,7 @@
 #define COPYFROM_INTERNAL_H
 
 #include "commands/copy.h"
+#include "executor/nodeModifyTable.h"
 #include "commands/trigger.h"
 
 /*
@@ -92,10 +93,12 @@ typedef struct CopyFromStateData
 
 	/* these are just for error messages, see CopyFromErrorCallback */
 	const char *cur_relname;	/* table name for error messages */
-	uint64		cur_lineno;		/* line number for error messages */
 	const char *cur_attname;	/* current att for error messages */
 	const char *cur_attval;		/* current att value for error messages */
 
+	/* For bulk inserts and for error callback */
+	MultiInsertInfo miinfo;
+
 	/*
 	 * Working state
 	 */
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 83e2965531..30542a542a 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -13,8 +13,12 @@
 #ifndef NODEMODIFYTABLE_H
 #define NODEMODIFYTABLE_H
 
+#include "commands/trigger.h"
+#include "executor/executor.h" // XXX
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT int bulk_insert_ntuples;
+
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
 									   CmdType cmdtype);
@@ -23,4 +27,371 @@ extern ModifyTableState *ExecInitModifyTable(ModifyTable *node, EState *estate,
 extern void ExecEndModifyTable(ModifyTableState *node);
 extern void ExecReScanModifyTable(ModifyTableState *node);
 
+/* Bulk insert stuff which used to live in copy.c */
+
+/*
+ * No more than this many tuples per MultiInsertBuffer
+ *
+ * Caution: Don't make this too big, as we could end up with this many
+ * MultiInsertBuffer items stored in MultiInsertInfo's
+ * multiInsertBuffers list.  Increasing this can cause quadratic growth in
+ * memory requirements during copies into partitioned tables with a large
+ * number of partitions.
+ */
+#define MAX_BUFFERED_TUPLES		1000
+
+/*
+ * Flush buffers if there are >= this many bytes, as counted by the input
+ * size, of tuples stored.
+ */
+#define MAX_BUFFERED_BYTES		65535
+
+/* Trim the list of buffers back down to this number after flushing */
+#define MAX_PARTITION_BUFFERS	32
+
+/* Stores multi-insert data related to a single relation in CopyFrom. */
+typedef struct MultiInsertBuffer
+{
+	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
+	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
+	BulkInsertState bistate;	/* BulkInsertState for this rel */
+	int			nused;			/* number of 'slots' containing tuples */
+	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
+												 * stream */
+} MultiInsertBuffer;
+
+/*
+ * Stores one or many MultiInsertBuffers and details about the size and
+ * number of tuples which are stored in them.  This allows multiple buffers to
+ * exist at once when COPY/INSERTing into a partitioned table.
+ */
+typedef struct MultiInsertInfo
+{
+	List	   *multiInsertBuffers; /* List of tracked MultiInsertBuffers */
+	int			bufferedTuples; /* number of tuples buffered over all buffers */
+	int			bufferedBytes;	/* number of bytes from all buffered tuples */
+	TransitionCaptureState	*transition_capture;
+	EState	   *estate;			/* Executor state */
+	CommandId	mycid;			/* Command Id */
+	int			ti_options;		/* table insert options */
+	size_t		ntuples;		/* Number of rows *eligible* for multi-insert */
+
+	/* Line number for errors in copyfrom.c */
+	uint64		cur_lineno;
+	bool		line_buf_valid;
+} MultiInsertInfo;
+
+
+/*
+ * Allocate memory and initialize a new MultiInsertBuffer for this
+ * ResultRelInfo.
+ */
+static MultiInsertBuffer *
+MultiInsertBufferInit(ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = (MultiInsertBuffer *) palloc(sizeof(MultiInsertBuffer));
+	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
+	buffer->resultRelInfo = rri;
+	buffer->bistate = GetBulkInsertState();
+	buffer->nused = 0;
+
+	return buffer;
+}
+
+/*
+ * Make a new buffer for this ResultRelInfo.
+ */
+static inline void
+MultiInsertInfoSetupBuffer(MultiInsertInfo *miinfo,
+							   ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = MultiInsertBufferInit(rri);
+
+	/* Setup back-link so we can easily find this buffer again */
+	rri->ri_MultiInsertBuffer = buffer;
+	/* Record that we're tracking this buffer */
+	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+}
+
+/*
+ * Initialize an already allocated MultiInsertInfo.
+ *
+ * If rri is a non-partitioned table then a MultiInsertBuffer is set up
+ * for that table.
+ */
+static inline void
+MultiInsertInfoInit(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						TransitionCaptureState *transition_capture,
+						EState *estate, CommandId mycid, int ti_options)
+{
+	miinfo->multiInsertBuffers = NIL;
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+	miinfo->transition_capture = transition_capture;
+	miinfo->estate = estate;
+	miinfo->mycid = mycid;
+	miinfo->ti_options = ti_options;
+	miinfo->cur_lineno = 0;
+
+	/*
+	 * Only setup the buffer when not dealing with a partitioned table.
+	 * Buffers for partitioned tables will just be setup when we need to send
+	 * tuples their way for the first time.
+	 */
+	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		MultiInsertInfoSetupBuffer(miinfo, rri);
+}
+
+/*
+ * Returns true if the buffers are full
+ */
+static inline bool
+MultiInsertInfoIsFull(MultiInsertInfo *miinfo)
+{
+	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
+		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
+		return true;
+	return false;
+}
+
+/*
+ * Returns true if we have no buffered tuples
+ */
+static inline bool
+MultiInsertInfoIsEmpty(MultiInsertInfo *miinfo)
+{
+	return miinfo->bufferedTuples == 0;
+}
+
+/*
+ * Write the tuples stored in 'buffer' out to the table.
+ */
+static inline void
+MultiInsertBufferFlush(MultiInsertInfo *miinfo,
+						   MultiInsertBuffer *buffer)
+{
+	MemoryContext oldcontext;
+	int			i;
+	uint64		save_cur_lineno;
+	EState	   *estate = miinfo->estate;
+	CommandId	mycid = miinfo->mycid;
+	int			ti_options = miinfo->ti_options;
+	bool		line_buf_valid = miinfo->line_buf_valid;
+	int			nused = buffer->nused;
+	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
+	TupleTableSlot **slots = buffer->slots;
+
+	/*
+	 * Print error context information correctly, if one of the operations
+	 * below fail.
+	 */
+	miinfo->line_buf_valid = false;
+	save_cur_lineno = miinfo->cur_lineno;
+
+	/*
+	 * table_multi_insert may leak memory, so switch to short-lived memory
+	 * context before calling it.
+	 */
+	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); // XXX requires executor.h
+	table_multi_insert(resultRelInfo->ri_RelationDesc,
+					   slots,
+					   nused,
+					   mycid,
+					   ti_options,
+					   buffer->bistate);
+	MemoryContextSwitchTo(oldcontext);
+
+	for (i = 0; i < nused; i++)
+	{
+		/*
+		 * If there are any indexes, update them for all the inserted tuples,
+		 * and run AFTER ROW INSERT triggers.
+		 */
+		if (resultRelInfo->ri_NumIndices > 0)
+		{
+			List	   *recheckIndexes;
+
+			miinfo->cur_lineno = buffer->linenos[i];
+			recheckIndexes =
+				ExecInsertIndexTuples(resultRelInfo,
+									  buffer->slots[i], estate, false, false,
+									  NULL, NIL);
+
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], recheckIndexes,
+								 miinfo->transition_capture);
+			list_free(recheckIndexes);
+		}
+
+		/*
+		 * There's no indexes, but see if we need to run AFTER ROW INSERT
+		 * triggers anyway.
+		 */
+		else if (resultRelInfo->ri_TrigDesc != NULL &&
+				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
+				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
+		{
+			miinfo->cur_lineno = buffer->linenos[i];
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], NIL, miinfo->transition_capture);
+		}
+
+		ExecClearTuple(slots[i]);
+	}
+
+	/* Mark that all slots are free */
+	buffer->nused = 0;
+
+	/* reset cur_lineno and line_buf_valid to what they were */
+	miinfo->line_buf_valid = line_buf_valid;
+	miinfo->cur_lineno = save_cur_lineno;
+}
+
+/*
+ * Drop used slots and free member for this buffer.
+ *
+ * The buffer must be flushed before cleanup.
+ */
+static inline void
+MultiInsertBufferCleanup(MultiInsertInfo *miinfo,
+							 MultiInsertBuffer *buffer)
+{
+	int			i;
+
+	/* Ensure buffer was flushed */
+	Assert(buffer->nused == 0);
+
+	/* Remove back-link to ourself */
+	buffer->resultRelInfo->ri_MultiInsertBuffer = NULL;
+
+	FreeBulkInsertState(buffer->bistate);
+
+	/* Since we only create slots on demand, just drop the non-null ones. */
+	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
+		ExecDropSingleTupleTableSlot(buffer->slots[i]);
+
+	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
+							 miinfo->ti_options);
+
+	pfree(buffer);
+}
+
+/*
+ * Write out all stored tuples in all buffers out to the tables.
+ *
+ * Once flushed we also trim the tracked buffers list down to size by removing
+ * the buffers created earliest first.
+ *
+ * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
+ * used.  When cleaning up old buffers we'll never remove the one for
+ * 'curr_rri'.
+ */
+static inline void
+MultiInsertInfoFlush(MultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+	{
+		MultiInsertBuffer *buffer = (MultiInsertBuffer *) lfirst(lc);
+
+		MultiInsertBufferFlush(miinfo, buffer);
+	}
+
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+
+	/*
+	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
+	 * remove buffers starting with the ones we created first.  It seems less
+	 * likely that these older ones will be needed than the ones that were
+	 * just created.
+	 */
+	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
+	{
+		MultiInsertBuffer *buffer;
+
+		buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+
+		/*
+		 * We never want to remove the buffer that's currently being used, so
+		 * if we happen to find that then move it to the end of the list.
+		 */
+		if (buffer->resultRelInfo == curr_rri)
+		{
+			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+			buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+		}
+
+		MultiInsertBufferCleanup(miinfo, buffer);
+		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+	}
+}
+
+/*
+ * Cleanup allocated buffers and free memory
+ */
+static inline void
+MultiInsertInfoCleanup(MultiInsertInfo *miinfo)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+		MultiInsertBufferCleanup(miinfo, lfirst(lc));
+
+	list_free(miinfo->multiInsertBuffers);
+}
+
+/*
+ * Get the next TupleTableSlot that the next tuple should be stored in.
+ *
+ * Callers must ensure that the buffer is not full.
+ *
+ * Note: 'miinfo' is unused but has been included for consistency with the
+ * other functions in this area.
+ */
+static inline TupleTableSlot *
+MultiInsertInfoNextFreeSlot(MultiInsertInfo *miinfo,
+								ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+	int			nused = buffer->nused;
+
+	Assert(buffer != NULL);
+	Assert(nused < MAX_BUFFERED_TUPLES);
+
+	if (buffer->slots[nused] == NULL)
+		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
+	return buffer->slots[nused];
+}
+
+/*
+ * Record the previously reserved TupleTableSlot that was reserved by
+ * MultiInsertInfoNextFreeSlot as being consumed.
+ */
+static inline void
+MultiInsertInfoStore(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						 TupleTableSlot *slot, int tuplen, uint64 lineno)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+
+	Assert(buffer != NULL);
+	Assert(slot == buffer->slots[buffer->nused]);
+
+	/* Store the line number so we can properly report any errors later */
+	buffer->linenos[buffer->nused] = lineno;
+
+	/* Record this slot as being used */
+	buffer->nused++;
+
+	/* Update how many tuples are stored and their size */
+	miinfo->bufferedTuples++;
+	miinfo->bufferedBytes += tuplen;
+}
+
 #endif							/* NODEMODIFYTABLE_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 48c3f570fa..8c7ca6627d 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -32,6 +33,9 @@
 #include "utils/tuplesort.h"
 #include "utils/tuplestore.h"
 
+/* This would be a circular inclusion */
+// #include "executor/nodeModifyTable.h"
+
 struct PlanState;				/* forward references in this file */
 struct ParallelHashJoinState;
 struct ExecRowMark;
@@ -39,8 +43,8 @@ struct ExprState;
 struct ExprContext;
 struct RangeTblEntry;			/* avoid including parsenodes.h here */
 struct ExprEvalStep;			/* avoid including execExpr.h everywhere */
-struct CopyMultiInsertBuffer;
-
+// struct MultiInsertBuffer;
+// struct MultiInsertInfo;
 
 /* ----------------
  *		ExprState node
@@ -498,8 +502,8 @@ typedef struct ResultRelInfo
 	 */
 	TupleConversionMap *ri_ChildToRootMap;
 
-	/* for use by copyfrom.c when performing multi-inserts */
-	struct CopyMultiInsertBuffer *ri_CopyMultiInsertBuffer;
+	/* for use by copyfrom.c/modifyTable when performing multi-inserts */
+	struct MultiInsertBuffer *ri_MultiInsertBuffer;
 } ResultRelInfo;
 
 /* ----------------
@@ -1165,6 +1169,10 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT, when miinfo cannot be used */
+	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
+	struct MultiInsertInfo	*miinfo;
+	size_t		ntuples;	/* Number of tuples inserted; */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index da50ee3b67..bc4c1a4fc2 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -462,6 +462,49 @@ Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'),
             part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED,
             part_default DEFAULT, PARTITIONED
 
+-- bulk inserts
+truncate hash_parted;
+begin;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+ a  
+----
+ 12
+ 11
+(2 rows)
+
+-- exercise bulk insert to partitions
+SET client_min_messages=debug1;
+insert into hash_parted select generate_series(1,9999);
+DEBUG:  enabling bulk insert
+DEBUG:  enabling multi insert
+RESET client_min_messages;
+select count(1) from hash_parted;
+ count 
+-------
+ 10001
+(1 row)
+
+commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Index Only Scan using hpart1_a_idx on hpart1 hash_parted
+   Index Cond: (a = 13)
+(2 rows)
+
+select * from hash_parted where a=13;
+ a  
+----
+ 13
+(1 row)
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 963faa1614..a74eb3826a 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -280,6 +280,26 @@ from hash_parted order by part;
 -- partitions
 \d+ list_parted
 
+-- bulk inserts
+truncate hash_parted;
+begin;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+-- exercise bulk insert to partitions
+SET client_min_messages=debug1;
+insert into hash_parted select generate_series(1,9999);
+RESET client_min_messages;
+select count(1) from hash_parted;
+commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+select * from hash_parted where a=13;
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 943142ced8..2274f78843 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -423,8 +423,6 @@ ConvertRowtypeExpr
 CookedConstraint
 CopyDest
 CopyInsertMethod
-CopyMultiInsertBuffer
-CopyMultiInsertInfo
 CopyState
 CopyStateData
 CopyStmt
@@ -1401,6 +1399,8 @@ ModifyTableState
 MorphOpaque
 MsgType
 MultiAssignRef
+MultiInsertBuffer
+MultiInsertInfo
 MultiSortSupport
 MultiSortSupportData
 MultiXactId
-- 
2.17.0

v9-0002-WIP-Check-for-volatile-defaults.patchtext/x-diff; charset=us-asciiDownload

From 9443f342039fb35db6df7940183ef324777db1c5 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Tue, 1 Dec 2020 23:11:31 -0600
Subject: [PATCH v9 2/4] WIP: Check for volatile defaults

We want to check if any column *uses* a volatile default value, but after
parsing and rewriting, that information appears to be lost about which column
values are defaults and which were specified.  insertedcols doesn't appear to
be useful for this.  So add a field to track if a TargetEntry is planned with
column default.
---
 src/backend/executor/nodeModifyTable.c | 64 ++++++++++++++++++++++++--
 src/backend/nodes/copyfuncs.c          |  1 +
 src/backend/nodes/equalfuncs.c         |  1 +
 src/backend/nodes/makefuncs.c          |  1 +
 src/backend/nodes/outfuncs.c           |  1 +
 src/backend/nodes/readfuncs.c          |  1 +
 src/backend/optimizer/util/tlist.c     |  1 +
 src/backend/rewrite/rewriteHandler.c   |  3 ++
 src/include/nodes/primnodes.h          |  2 +
 9 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index a53cbeeb2c..2059428e2a 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -50,6 +50,7 @@
 #include "foreign/fdwapi.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
+#include "optimizer/optimizer.h"
 #include "rewrite/rewriteHandler.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
@@ -2258,6 +2259,61 @@ ExecModifyTable(PlanState *pstate)
 	return NULL;
 }
 
+/*
+ * Determine if a table has volatile column defaults which are used by a given
+ * planned statement (if the column is not specified or specified as DEFAULT).
+ * This works only for INSERT.
+ */
+static bool
+has_volatile_defaults(ResultRelInfo *resultRelInfo, ModifyTable *node)
+{
+	TupleDesc	tupDesc = RelationGetDescr(resultRelInfo->ri_RelationDesc);
+	Plan		*plan;
+
+	Assert(list_length(node->plans) == 1);
+	plan = linitial(node->plans);
+
+	for (int attnum = 1; attnum <= tupDesc->natts; attnum++)
+	{
+		Form_pg_attribute att = TupleDescAttr(tupDesc, attnum - 1);
+		Expr		*defexpr;
+		TargetEntry	*tle;
+
+		/* We don't need to check dropped/generated attributes */
+		if (att->attisdropped || att->attgenerated)
+			continue;
+
+		tle = list_nth(plan->targetlist, attnum - 1);
+		Assert(tle != NULL);
+		Assert(tle->resno == attnum);
+
+		/*
+		 * If the column was specified with a non-default value, then don't
+		 * check the volatility of its default
+		 */
+		if (!tle->isdefault)
+			continue;
+
+		/* Check the column's default value if one exists */
+		defexpr = (Expr *) build_column_default(resultRelInfo->ri_RelationDesc, attnum);
+		if (defexpr == NULL)
+			continue;
+
+		/* Run the expression through planner */
+		// defexpr = expression_planner(defexpr);
+		// (void) ExecInitExpr(defexpr, NULL);
+		expression_planner(defexpr);
+
+		if (contain_volatile_functions_not_nextval((Node *) defexpr))
+		{
+			elog(DEBUG1, "found volatile att %d", attnum);
+			return true;
+		}
+	}
+
+	return false;
+}
+
 /* ----------------------------------------------------------------
  *		ExecInitModifyTable
  * ----------------------------------------------------------------
@@ -2352,10 +2408,10 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 		 * are any statement level insert triggers.
 		 */
 		mtstate->miinfo = NULL;
-	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL
-			/* || cstate->volatile_defexprs */ )
-		// XXX contain_volatile_functions_not_nextval((Node *) defexpr);
-		/* Can't support multi-inserts to foreign tables or if there are any */
+	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL ||
+			has_volatile_defaults(mtstate->rootResultRelInfo, node))
+		/* Can't support multi-inserts to foreign tables or if there are any
+		 * volatile default expressions in the table. */
 		mtstate->miinfo = NULL;
 	else
 	{
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index ba3ccc712c..2008e8e5d6 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -2178,6 +2178,7 @@ _copyTargetEntry(const TargetEntry *from)
 	COPY_SCALAR_FIELD(resorigtbl);
 	COPY_SCALAR_FIELD(resorigcol);
 	COPY_SCALAR_FIELD(resjunk);
+	COPY_SCALAR_FIELD(isdefault);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index a2ef853dc2..aa3cdf3729 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -770,6 +770,7 @@ _equalTargetEntry(const TargetEntry *a, const TargetEntry *b)
 	COMPARE_SCALAR_FIELD(resorigtbl);
 	COMPARE_SCALAR_FIELD(resorigcol);
 	COMPARE_SCALAR_FIELD(resjunk);
+	COMPARE_SCALAR_FIELD(isdefault);
 
 	return true;
 }
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index 01c110cd2f..aeeba7032f 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -254,6 +254,7 @@ makeTargetEntry(Expr *expr,
 	tle->ressortgroupref = 0;
 	tle->resorigtbl = InvalidOid;
 	tle->resorigcol = 0;
+	tle->isdefault = false;
 
 	tle->resjunk = resjunk;
 
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 8392be6d44..924ffda1c6 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -1661,6 +1661,7 @@ _outTargetEntry(StringInfo str, const TargetEntry *node)
 	WRITE_OID_FIELD(resorigtbl);
 	WRITE_INT_FIELD(resorigcol);
 	WRITE_BOOL_FIELD(resjunk);
+	WRITE_BOOL_FIELD(isdefault);
 }
 
 static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index d2c8d58070..642b6c63c5 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1275,6 +1275,7 @@ _readTargetEntry(void)
 	READ_OID_FIELD(resorigtbl);
 	READ_INT_FIELD(resorigcol);
 	READ_BOOL_FIELD(resjunk);
+	READ_BOOL_FIELD(isdefault);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c
index 89853a0630..7ef1517027 100644
--- a/src/backend/optimizer/util/tlist.c
+++ b/src/backend/optimizer/util/tlist.c
@@ -349,6 +349,7 @@ apply_tlist_labeling(List *dest_tlist, List *src_tlist)
 		dest_tle->resorigtbl = src_tle->resorigtbl;
 		dest_tle->resorigcol = src_tle->resorigcol;
 		dest_tle->resjunk = src_tle->resjunk;
+		dest_tle->isdefault = src_tle->isdefault;
 	}
 }
 
diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c
index 0c7508a0d8..7ab13e51e5 100644
--- a/src/backend/rewrite/rewriteHandler.c
+++ b/src/backend/rewrite/rewriteHandler.c
@@ -986,10 +986,13 @@ rewriteTargetListIU(List *targetList,
 			}
 
 			if (new_expr)
+			{
 				new_tle = makeTargetEntry((Expr *) new_expr,
 										  attrno,
 										  pstrdup(NameStr(att_tup->attname)),
 										  false);
+				new_tle->isdefault = true;
+			}
 		}
 
 		/*
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index d4ce037088..888bd36a07 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -1437,6 +1437,8 @@ typedef struct TargetEntry
 	AttrNumber	resorigcol;		/* column's number in source table */
 	bool		resjunk;		/* set to true to eliminate the attribute from
 								 * final target list */
+	bool		isdefault;		/* true if using the column default, either
+								 * by "DEFAULT" or omission of the column */
 } TargetEntry;
 
 
-- 
2.17.0

v9-0003-COPY-flush-multi-insert-buffer-based-on-accumulat.patchtext/x-diff; charset=us-asciiDownload

From 1705f4a471d59032e716db6618882e2cf7decebd Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 5 Dec 2020 08:52:14 -0600
Subject: [PATCH v9 3/4] COPY: flush multi-insert buffer based on accumulated
 size of tuples..

..rather than line length
---
 src/backend/commands/copyfrom.c        | 2 +-
 src/include/executor/nodeModifyTable.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 8221a2c5d3..e5ea909dfe 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -685,7 +685,7 @@ CopyFrom(CopyFromState cstate)
 					/* Add this tuple to the tuple buffer */
 					MultiInsertInfoStore(&cstate->miinfo,
 											 resultRelInfo, myslot,
-											 cstate->line_buf.len,
+											 MemoryContextMemAllocated(myslot->tts_mcxt, true),
 											 cstate->miinfo.cur_lineno);
 
 					/*
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 30542a542a..87b689099c 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -41,10 +41,10 @@ extern void ExecReScanModifyTable(ModifyTableState *node);
 #define MAX_BUFFERED_TUPLES		1000
 
 /*
- * Flush buffers if there are >= this many bytes, as counted by the input
- * size, of tuples stored.
+ * Flush buffers if there are >= this many bytes of tuples stored, as counted
+ * by the slot's memory contexts.
  */
-#define MAX_BUFFERED_BYTES		65535
+#define MAX_BUFFERED_BYTES		(1024*1024*8)
 
 /* Trim the list of buffers back down to this number after flushing */
 #define MAX_PARTITION_BUFFERS	32
-- 
2.17.0

v9-0004-WIP-check-tuple-size.patchtext/x-diff; charset=us-asciiDownload

From 9443ca50187f46eaceaa2b53383d9a668afecca9 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Tue, 1 Dec 2020 17:20:25 -0600
Subject: [PATCH v9 4/4] WIP: check tuple size

Or maybe INSERT should flush buffer based only on the *number* of tuples, and
not their size ?
---
 src/backend/executor/nodeModifyTable.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 2059428e2a..04b0598183 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -703,7 +703,13 @@ ExecInsert(ModifyTableState *mtstate,
 			batchslot = MultiInsertInfoNextFreeSlot(mtstate->miinfo, resultRelInfo);
 			ExecCopySlot(batchslot, slot);
 
-			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot, 0, 0); // XXX: tuplen/lineno
+			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot,
+					// sizeof(void*) * batchslot->tts_nvalid, /* tuple size - underestimate */
+					MemoryContextMemAllocated(batchslot->tts_mcxt, true), /* tuple size */
+					mtstate->ntuples); /* lineno */
+
+			elog(DEBUG2, "bufferedBytes %d; tuples %ld",
+					mtstate->miinfo->bufferedBytes, mtstate->ntuples);
 
 			if (MultiInsertInfoIsFull(mtstate->miinfo))
 				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
-- 
2.17.0

pryzby@telsasoft.com

almost 5 years ago

In reply to: Justin Pryzby (#11)

4 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Mon, Feb 22, 2021 at 02:25:22AM +0000, houzj.fnst@fujitsu.com wrote:

Yes, you can see that I've copied the checks from copy.
Like copy, some checks are done once, in ExecInitModifyTable, outside
of the ExecModifyTable "loop".

This squishes some commits together.
And uses bistate for ON CONFLICT.
And attempts to use memory context for tuple size.

Rebased on 9dc718bdf2b1a574481a45624d42b674332e2903

I guess my patch should/may be subsumed by this other one - I'm fine with
that.
https://commitfest.postgresql.org/31/2871/

Note that my interest here is just in bistate, to avoid leaving behind many dirty
buffers, not improved performance of COPY.

I am very interested in this patch, and I plan to do some experiments with the patch.
Can you please rebase the patch because it seems can not applied to the master now.

Thanks for your interest.

I was sitting on a rebased version since the bulk FDW patch will cause
conflicts, and since this should maybe be built on top of the table-am patch
(2871). Have fun :)

--
Justin

Attachments:

v10-0001-INSERT-SELECT-to-use-BulkInsertState-and-multi_i.patchtext/x-diff; charset=us-asciiDownload

From e2b93b3b3aaa32f680193b42d91a80bab40768a4 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v10 1/4] INSERT SELECT to use BulkInsertState and multi_insert

Renames structures;
Move MultipleInsert functions from copyfrom.c to (tentatively) nodeModifyTable.h;
Move into MultiInsertInfo: transition_capture and cur_lineno (via cstate->miinfo);

Dynamically switch to multi-insert mode based on the number of insertions.
This is intended to accomodate 1) the original use case of INSERT using a small
ring buffer to avoid leaving behind dirty buffers; and, 2) Automatically using
multi-inserts for batch operations; 3) allow the old behavior of leaving behind
dirty buffers, which might allow INSERT to run more quickly, at the cost of
leaving behind many dirty buffers which other backends may have to write out.

XXX: for (1), the bulk-insert state is used even if not multi-insert, including
for a VALUES.

TODO: use cstate->miinfo.cur_lineno++ instead of mtstate->miinfo->ntuples
---
 src/backend/commands/copyfrom.c          | 394 +----------------------
 src/backend/commands/copyfromparse.c     |  10 +-
 src/backend/executor/execMain.c          |   2 +-
 src/backend/executor/execPartition.c     |   2 +-
 src/backend/executor/nodeModifyTable.c   | 196 +++++++++--
 src/backend/utils/misc/guc.c             |  10 +
 src/include/commands/copyfrom_internal.h |   5 +-
 src/include/executor/nodeModifyTable.h   | 371 +++++++++++++++++++++
 src/include/nodes/execnodes.h            |  16 +-
 src/test/regress/expected/insert.out     |  43 +++
 src/test/regress/sql/insert.sql          |  20 ++
 src/tools/pgindent/typedefs.list         |   4 +-
 12 files changed, 658 insertions(+), 415 deletions(-)

diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 796ca7b3f7..5b8a1e4b61 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -46,54 +46,6 @@
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
 
-/*
- * No more than this many tuples per CopyMultiInsertBuffer
- *
- * Caution: Don't make this too big, as we could end up with this many
- * CopyMultiInsertBuffer items stored in CopyMultiInsertInfo's
- * multiInsertBuffers list.  Increasing this can cause quadratic growth in
- * memory requirements during copies into partitioned tables with a large
- * number of partitions.
- */
-#define MAX_BUFFERED_TUPLES		1000
-
-/*
- * Flush buffers if there are >= this many bytes, as counted by the input
- * size, of tuples stored.
- */
-#define MAX_BUFFERED_BYTES		65535
-
-/* Trim the list of buffers back down to this number after flushing */
-#define MAX_PARTITION_BUFFERS	32
-
-/* Stores multi-insert data related to a single relation in CopyFrom. */
-typedef struct CopyMultiInsertBuffer
-{
-	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
-	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
-	BulkInsertState bistate;	/* BulkInsertState for this rel */
-	int			nused;			/* number of 'slots' containing tuples */
-	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
-												 * stream */
-} CopyMultiInsertBuffer;
-
-/*
- * Stores one or many CopyMultiInsertBuffers and details about the size and
- * number of tuples which are stored in them.  This allows multiple buffers to
- * exist at once when COPYing into a partitioned table.
- */
-typedef struct CopyMultiInsertInfo
-{
-	List	   *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
-	int			bufferedTuples; /* number of tuples buffered over all buffers */
-	int			bufferedBytes;	/* number of bytes from all buffered tuples */
-	CopyFromState	cstate;			/* Copy state for this CopyMultiInsertInfo */
-	EState	   *estate;			/* Executor state used for COPY */
-	CommandId	mycid;			/* Command Id used for COPY */
-	int			ti_options;		/* table insert options */
-} CopyMultiInsertInfo;
-
-
 /* non-export function prototypes */
 static char *limit_printout_length(const char *str);
 
@@ -111,7 +63,7 @@ CopyFromErrorCallback(void *arg)
 	char		curlineno_str[32];
 
 	snprintf(curlineno_str, sizeof(curlineno_str), UINT64_FORMAT,
-			 cstate->cur_lineno);
+			 cstate->miinfo.cur_lineno);
 
 	if (cstate->opts.binary)
 	{
@@ -206,317 +158,6 @@ limit_printout_length(const char *str)
 	return res;
 }
 
-/*
- * Allocate memory and initialize a new CopyMultiInsertBuffer for this
- * ResultRelInfo.
- */
-static CopyMultiInsertBuffer *
-CopyMultiInsertBufferInit(ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = (CopyMultiInsertBuffer *) palloc(sizeof(CopyMultiInsertBuffer));
-	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
-	buffer->resultRelInfo = rri;
-	buffer->bistate = GetBulkInsertState();
-	buffer->nused = 0;
-
-	return buffer;
-}
-
-/*
- * Make a new buffer for this ResultRelInfo.
- */
-static inline void
-CopyMultiInsertInfoSetupBuffer(CopyMultiInsertInfo *miinfo,
-							   ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = CopyMultiInsertBufferInit(rri);
-
-	/* Setup back-link so we can easily find this buffer again */
-	rri->ri_CopyMultiInsertBuffer = buffer;
-	/* Record that we're tracking this buffer */
-	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-}
-
-/*
- * Initialize an already allocated CopyMultiInsertInfo.
- *
- * If rri is a non-partitioned table then a CopyMultiInsertBuffer is set up
- * for that table.
- */
-static void
-CopyMultiInsertInfoInit(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						CopyFromState cstate, EState *estate, CommandId mycid,
-						int ti_options)
-{
-	miinfo->multiInsertBuffers = NIL;
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-	miinfo->cstate = cstate;
-	miinfo->estate = estate;
-	miinfo->mycid = mycid;
-	miinfo->ti_options = ti_options;
-
-	/*
-	 * Only setup the buffer when not dealing with a partitioned table.
-	 * Buffers for partitioned tables will just be setup when we need to send
-	 * tuples their way for the first time.
-	 */
-	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
-		CopyMultiInsertInfoSetupBuffer(miinfo, rri);
-}
-
-/*
- * Returns true if the buffers are full
- */
-static inline bool
-CopyMultiInsertInfoIsFull(CopyMultiInsertInfo *miinfo)
-{
-	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
-		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
-		return true;
-	return false;
-}
-
-/*
- * Returns true if we have no buffered tuples
- */
-static inline bool
-CopyMultiInsertInfoIsEmpty(CopyMultiInsertInfo *miinfo)
-{
-	return miinfo->bufferedTuples == 0;
-}
-
-/*
- * Write the tuples stored in 'buffer' out to the table.
- */
-static inline void
-CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo,
-						   CopyMultiInsertBuffer *buffer)
-{
-	MemoryContext oldcontext;
-	int			i;
-	uint64		save_cur_lineno;
-	CopyFromState	cstate = miinfo->cstate;
-	EState	   *estate = miinfo->estate;
-	CommandId	mycid = miinfo->mycid;
-	int			ti_options = miinfo->ti_options;
-	bool		line_buf_valid = cstate->line_buf_valid;
-	int			nused = buffer->nused;
-	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
-	TupleTableSlot **slots = buffer->slots;
-
-	/*
-	 * Print error context information correctly, if one of the operations
-	 * below fail.
-	 */
-	cstate->line_buf_valid = false;
-	save_cur_lineno = cstate->cur_lineno;
-
-	/*
-	 * table_multi_insert may leak memory, so switch to short-lived memory
-	 * context before calling it.
-	 */
-	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
-	table_multi_insert(resultRelInfo->ri_RelationDesc,
-					   slots,
-					   nused,
-					   mycid,
-					   ti_options,
-					   buffer->bistate);
-	MemoryContextSwitchTo(oldcontext);
-
-	for (i = 0; i < nused; i++)
-	{
-		/*
-		 * If there are any indexes, update them for all the inserted tuples,
-		 * and run AFTER ROW INSERT triggers.
-		 */
-		if (resultRelInfo->ri_NumIndices > 0)
-		{
-			List	   *recheckIndexes;
-
-			cstate->cur_lineno = buffer->linenos[i];
-			recheckIndexes =
-				ExecInsertIndexTuples(resultRelInfo,
-									  buffer->slots[i], estate, false, false,
-									  NULL, NIL);
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], recheckIndexes,
-								 cstate->transition_capture);
-			list_free(recheckIndexes);
-		}
-
-		/*
-		 * There's no indexes, but see if we need to run AFTER ROW INSERT
-		 * triggers anyway.
-		 */
-		else if (resultRelInfo->ri_TrigDesc != NULL &&
-				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
-				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
-		{
-			cstate->cur_lineno = buffer->linenos[i];
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], NIL, cstate->transition_capture);
-		}
-
-		ExecClearTuple(slots[i]);
-	}
-
-	/* Mark that all slots are free */
-	buffer->nused = 0;
-
-	/* reset cur_lineno and line_buf_valid to what they were */
-	cstate->line_buf_valid = line_buf_valid;
-	cstate->cur_lineno = save_cur_lineno;
-}
-
-/*
- * Drop used slots and free member for this buffer.
- *
- * The buffer must be flushed before cleanup.
- */
-static inline void
-CopyMultiInsertBufferCleanup(CopyMultiInsertInfo *miinfo,
-							 CopyMultiInsertBuffer *buffer)
-{
-	int			i;
-
-	/* Ensure buffer was flushed */
-	Assert(buffer->nused == 0);
-
-	/* Remove back-link to ourself */
-	buffer->resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
-
-	FreeBulkInsertState(buffer->bistate);
-
-	/* Since we only create slots on demand, just drop the non-null ones. */
-	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
-		ExecDropSingleTupleTableSlot(buffer->slots[i]);
-
-	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
-							 miinfo->ti_options);
-
-	pfree(buffer);
-}
-
-/*
- * Write out all stored tuples in all buffers out to the tables.
- *
- * Once flushed we also trim the tracked buffers list down to size by removing
- * the buffers created earliest first.
- *
- * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
- * used.  When cleaning up old buffers we'll never remove the one for
- * 'curr_rri'.
- */
-static inline void
-CopyMultiInsertInfoFlush(CopyMultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-	{
-		CopyMultiInsertBuffer *buffer = (CopyMultiInsertBuffer *) lfirst(lc);
-
-		CopyMultiInsertBufferFlush(miinfo, buffer);
-	}
-
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-
-	/*
-	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
-	 * remove buffers starting with the ones we created first.  It seems less
-	 * likely that these older ones will be needed than the ones that were
-	 * just created.
-	 */
-	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
-	{
-		CopyMultiInsertBuffer *buffer;
-
-		buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-
-		/*
-		 * We never want to remove the buffer that's currently being used, so
-		 * if we happen to find that then move it to the end of the list.
-		 */
-		if (buffer->resultRelInfo == curr_rri)
-		{
-			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-			buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-		}
-
-		CopyMultiInsertBufferCleanup(miinfo, buffer);
-		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-	}
-}
-
-/*
- * Cleanup allocated buffers and free memory
- */
-static inline void
-CopyMultiInsertInfoCleanup(CopyMultiInsertInfo *miinfo)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-		CopyMultiInsertBufferCleanup(miinfo, lfirst(lc));
-
-	list_free(miinfo->multiInsertBuffers);
-}
-
-/*
- * Get the next TupleTableSlot that the next tuple should be stored in.
- *
- * Callers must ensure that the buffer is not full.
- *
- * Note: 'miinfo' is unused but has been included for consistency with the
- * other functions in this area.
- */
-static inline TupleTableSlot *
-CopyMultiInsertInfoNextFreeSlot(CopyMultiInsertInfo *miinfo,
-								ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-	int			nused = buffer->nused;
-
-	Assert(buffer != NULL);
-	Assert(nused < MAX_BUFFERED_TUPLES);
-
-	if (buffer->slots[nused] == NULL)
-		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
-	return buffer->slots[nused];
-}
-
-/*
- * Record the previously reserved TupleTableSlot that was reserved by
- * CopyMultiInsertInfoNextFreeSlot as being consumed.
- */
-static inline void
-CopyMultiInsertInfoStore(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						 TupleTableSlot *slot, int tuplen, uint64 lineno)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-
-	Assert(buffer != NULL);
-	Assert(slot == buffer->slots[buffer->nused]);
-
-	/* Store the line number so we can properly report any errors later */
-	buffer->linenos[buffer->nused] = lineno;
-
-	/* Record this slot as being used */
-	buffer->nused++;
-
-	/* Update how many tuples are stored and their size */
-	miinfo->bufferedTuples++;
-	miinfo->bufferedBytes += tuplen;
-}
-
 /*
  * Copy FROM file to relation.
  */
@@ -538,7 +179,6 @@ CopyFrom(CopyFromState cstate)
 	int			ti_options = 0; /* start with default options for insert */
 	BulkInsertState bistate = NULL;
 	CopyInsertMethod insertMethod;
-	CopyMultiInsertInfo multiInsertInfo = {0};	/* pacify compiler */
 	uint64		processed = 0;
 	bool		has_before_insert_row_trig;
 	bool		has_instead_insert_row_trig;
@@ -726,7 +366,7 @@ CopyFrom(CopyFromState cstate)
 		 * For partitioned tables we can't support multi-inserts when there
 		 * are any statement level insert triggers. It might be possible to
 		 * allow partitioned tables with such triggers in the future, but for
-		 * now, CopyMultiInsertInfoFlush expects that any before row insert
+		 * now, MultiInsertInfoFlush expects that any before row insert
 		 * and statement level insert triggers are on the same relation.
 		 */
 		insertMethod = CIM_SINGLE;
@@ -774,7 +414,8 @@ CopyFrom(CopyFromState cstate)
 		else
 			insertMethod = CIM_MULTI;
 
-		CopyMultiInsertInfoInit(&multiInsertInfo, resultRelInfo, cstate,
+		MultiInsertInfoInit(&cstate->miinfo, resultRelInfo,
+								cstate->transition_capture,
 								estate, mycid, ti_options);
 	}
 
@@ -837,7 +478,7 @@ CopyFrom(CopyFromState cstate)
 			Assert(resultRelInfo == target_resultRelInfo);
 			Assert(insertMethod == CIM_MULTI);
 
-			myslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+			myslot = MultiInsertInfoNextFreeSlot(&cstate->miinfo,
 													 resultRelInfo);
 		}
 
@@ -906,18 +547,18 @@ CopyFrom(CopyFromState cstate)
 				/* Set the multi-insert buffer to use for this partition. */
 				if (leafpart_use_multi_insert)
 				{
-					if (resultRelInfo->ri_CopyMultiInsertBuffer == NULL)
-						CopyMultiInsertInfoSetupBuffer(&multiInsertInfo,
+					if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+						MultiInsertInfoSetupBuffer(&cstate->miinfo,
 													   resultRelInfo);
 				}
 				else if (insertMethod == CIM_MULTI_CONDITIONAL &&
-						 !CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
+						 !MultiInsertInfoIsEmpty(&cstate->miinfo))
 				{
 					/*
 					 * Flush pending inserts if this partition can't use
 					 * batching, so rows are visible to triggers etc.
 					 */
-					CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					MultiInsertInfoFlush(&cstate->miinfo, resultRelInfo);
 				}
 
 				if (bistate != NULL)
@@ -963,7 +604,7 @@ CopyFrom(CopyFromState cstate)
 				/* no other path available for partitioned table */
 				Assert(insertMethod == CIM_MULTI_CONDITIONAL);
 
-				batchslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+				batchslot = MultiInsertInfoNextFreeSlot(&cstate->miinfo,
 															resultRelInfo);
 
 				if (map != NULL)
@@ -1043,17 +684,17 @@ CopyFrom(CopyFromState cstate)
 					ExecMaterializeSlot(myslot);
 
 					/* Add this tuple to the tuple buffer */
-					CopyMultiInsertInfoStore(&multiInsertInfo,
+					MultiInsertInfoStore(&cstate->miinfo,
 											 resultRelInfo, myslot,
 											 cstate->line_buf.len,
-											 cstate->cur_lineno);
+											 cstate->miinfo.cur_lineno);
 
 					/*
 					 * If enough inserts have queued up, then flush all
 					 * buffers out to their tables.
 					 */
-					if (CopyMultiInsertInfoIsFull(&multiInsertInfo))
-						CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					if (MultiInsertInfoIsFull(&cstate->miinfo))
+						MultiInsertInfoFlush(&cstate->miinfo, resultRelInfo);
 				}
 				else
 				{
@@ -1114,8 +755,8 @@ CopyFrom(CopyFromState cstate)
 	/* Flush any remaining buffered tuples */
 	if (insertMethod != CIM_SINGLE)
 	{
-		if (!CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
-			CopyMultiInsertInfoFlush(&multiInsertInfo, NULL);
+		if (!MultiInsertInfoIsEmpty(&cstate->miinfo))
+			MultiInsertInfoFlush(&cstate->miinfo, NULL);
 	}
 
 	/* Done, clean up */
@@ -1149,7 +790,7 @@ CopyFrom(CopyFromState cstate)
 
 	/* Tear down the multi-insert buffer data */
 	if (insertMethod != CIM_SINGLE)
-		CopyMultiInsertInfoCleanup(&multiInsertInfo);
+		MultiInsertInfoCleanup(&cstate->miinfo);
 
 	/* Close all the partitioned tables, leaf partitions, and their indices */
 	if (proute)
@@ -1328,7 +969,6 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->reached_eof = false;
 	cstate->eol_type = EOL_UNKNOWN;
 	cstate->cur_relname = RelationGetRelationName(cstate->rel);
-	cstate->cur_lineno = 0;
 	cstate->cur_attname = NULL;
 	cstate->cur_attval = NULL;
 
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 315b16fd7a..3fbf8cb431 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -460,14 +460,14 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
 	Assert(!cstate->opts.binary);
 
 	/* on input just throw the header line away */
-	if (cstate->cur_lineno == 0 && cstate->opts.header_line)
+	if (cstate->miinfo.cur_lineno == 0 && cstate->opts.header_line)
 	{
-		cstate->cur_lineno++;
+		cstate->miinfo.cur_lineno++;
 		if (CopyReadLine(cstate))
 			return false;		/* done */
 	}
 
-	cstate->cur_lineno++;
+	cstate->miinfo.cur_lineno++;
 
 	/* Actually read the line into memory here */
 	done = CopyReadLine(cstate);
@@ -608,7 +608,7 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 		int16		fld_count;
 		ListCell   *cur;
 
-		cstate->cur_lineno++;
+		cstate->miinfo.cur_lineno++;
 
 		if (!CopyGetInt16(cstate, &fld_count))
 		{
@@ -916,7 +916,7 @@ CopyReadLineText(CopyFromState cstate)
 			 * at all --- is cur_lineno a physical or logical count?)
 			 */
 			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
-				cstate->cur_lineno++;
+				cstate->miinfo.cur_lineno++;
 		}
 
 		/* Process \r */
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index c74ce36ffb..245e173021 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1233,7 +1233,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
 													 * ExecInitRoutingInfo */
 	resultRelInfo->ri_PartitionTupleSlot = NULL;	/* ditto */
 	resultRelInfo->ri_ChildToRootMap = NULL;
-	resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	resultRelInfo->ri_MultiInsertBuffer = NULL;
 }
 
 /*
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index b8da4c5967..b0f4b68b6e 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -1011,7 +1011,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 
 	Assert(partRelInfo->ri_BatchSize >= 1);
 
-	partRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	partRelInfo->ri_MultiInsertBuffer = NULL;
 
 	/*
 	 * Keep track of it in the PartitionTupleRouting->partitions array.
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 2993ba43e3..0907b3ebd5 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -43,6 +43,7 @@
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "commands/trigger.h"
+#include "commands/copy.h"
 #include "executor/execPartition.h"
 #include "executor/executor.h"
 #include "executor/nodeModifyTable.h"
@@ -79,6 +80,8 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 											   ResultRelInfo *targetRelInfo,
 											   TupleTableSlot *slot,
 											   ResultRelInfo **partRelInfo);
+/* guc */
+int bulk_insert_ntuples = 1000;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -397,6 +400,8 @@ ExecInsert(ModifyTableState *mtstate,
 	OnConflictAction onconflict = node->onConflictAction;
 	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
 	MemoryContext oldContext;
+	TupleTableSlot *batchslot = NULL;
+	bool	use_multi_insert = false;
 
 	/*
 	 * If the input result relation is a partitioned table, find the leaf
@@ -416,6 +421,66 @@ ExecInsert(ModifyTableState *mtstate,
 
 	resultRelationDesc = resultRelInfo->ri_RelationDesc;
 
+	/* Use bulk insert after a threshold number of tuples */
+	// XXX: maybe this should only be done if it's not a partitioned table or
+	// if the partitions don't support miinfo, which uses its own bistates
+	mtstate->ntuples++;
+	if (mtstate->bistate == NULL &&
+			mtstate->operation == CMD_INSERT &&
+			mtstate->ntuples > bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+	{
+		elog(DEBUG1, "enabling bulk insert");
+		mtstate->bistate = GetBulkInsertState();
+	}
+
+	if (!mtstate->miinfo)
+	{
+		/*
+		 * If multi-inserts aren't possible for this statement at all, so don't
+		 * check further
+		 */
+	} else if (proute == NULL)
+	{
+		if (mtstate->miinfo->ntuples++ >= bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+			use_multi_insert = true;
+	}
+	else
+	{
+		/*
+		 * If a partitioned table itself allows multi-insert, and bistate
+		 * indicates we've inserted the threshold number of tuples, check if
+		 * the partition also supports it.
+		 */
+
+		/* Determine which triggers exist on this partition */
+		// XXX copyfrom.c only checks triggers when the partition changes,
+		// so maybe use_multi_insert should be in mtstate ?
+		bool has_before_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
+				resultRelInfo->ri_TrigDesc->trig_insert_before_row);
+
+		bool has_instead_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
+				resultRelInfo->ri_TrigDesc->trig_insert_instead_row);
+
+		/*
+		 * Disable multi-inserts when the partition has BEFORE/INSTEAD
+		 * OF triggers, or if the partition is a foreign partition.
+		 * The number of tuples eligible for multi-insert is tracked separately
+		 * from the total number of tuples in case it's not supported for some
+		 * partitions.
+		 */
+		if (!has_before_insert_row_trig &&
+			!has_instead_insert_row_trig &&
+			resultRelInfo->ri_FdwRoutine == NULL &&
+			mtstate->miinfo->ntuples++ >= bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+			use_multi_insert = true;
+	}
+
+	if (use_multi_insert && mtstate->miinfo->ntuples - 1 == bulk_insert_ntuples)
+		elog(DEBUG1, "enabling multi insert");
+
 	/*
 	 * BEFORE ROW INSERT Triggers.
 	 *
@@ -651,7 +716,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   mtstate->bistate,
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -686,12 +751,39 @@ ExecInsert(ModifyTableState *mtstate,
 
 			/* Since there was no insertion conflict, we're done */
 		}
+		else if (use_multi_insert)
+		{
+			if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+				MultiInsertInfoSetupBuffer(mtstate->miinfo, resultRelInfo);
+
+			batchslot = MultiInsertInfoNextFreeSlot(mtstate->miinfo, resultRelInfo);
+			ExecCopySlot(batchslot, slot);
+
+			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot, 0, 0); // XXX: tuplen/lineno
+
+			if (MultiInsertInfoIsFull(mtstate->miinfo))
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+		}
 		else
 		{
+			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
+			{
+				if (mtstate->bistate)
+					ReleaseBulkInsertStatePin(mtstate->bistate);
+				mtstate->prevResultRelInfo = resultRelInfo;
+			}
+
+			/*
+			 * Flush pending inserts if this partition can't use
+			 * batching, so rows are visible to triggers etc.
+			 */
+			if (mtstate->miinfo)
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -704,32 +796,36 @@ ExecInsert(ModifyTableState *mtstate,
 	if (canSetTag)
 		(estate->es_processed)++;
 
-	/*
-	 * If this insert is the result of a partition key update that moved the
-	 * tuple to a new partition, put this row into the transition NEW TABLE,
-	 * if there is one. We need to do this separately for DELETE and INSERT
-	 * because they happen on different tables.
-	 */
-	ar_insert_trig_tcs = mtstate->mt_transition_capture;
-	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
-		&& mtstate->mt_transition_capture->tcs_update_new_table)
+	/* Triggers were already run in the batch insert case */
+	if (batchslot == NULL)
 	{
-		ExecARUpdateTriggers(estate, resultRelInfo, NULL,
-							 NULL,
-							 slot,
-							 NULL,
-							 mtstate->mt_transition_capture);
-
 		/*
-		 * We've already captured the NEW TABLE row, so make sure any AR
-		 * INSERT trigger fired below doesn't capture it again.
+		 * If this insert is the result of a partition key update that moved the
+		 * tuple to a new partition, put this row into the transition NEW TABLE,
+		 * if there is one. We need to do this separately for DELETE and INSERT
+		 * because they happen on different tables.
 		 */
-		ar_insert_trig_tcs = NULL;
-	}
+		ar_insert_trig_tcs = mtstate->mt_transition_capture;
+		if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+			&& mtstate->mt_transition_capture->tcs_update_new_table)
+		{
+			ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+								 NULL,
+								 slot,
+								 NULL,
+								 mtstate->mt_transition_capture);
+
+			/*
+			 * We've already captured the NEW TABLE row, so make sure any AR
+			 * INSERT trigger fired below doesn't capture it again.
+			 */
+			ar_insert_trig_tcs = NULL;
+		}
 
-	/* AFTER ROW INSERT Triggers */
-	ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
-						 ar_insert_trig_tcs);
+		/* AFTER ROW INSERT Triggers */
+		ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
+							 ar_insert_trig_tcs);
+	}
 
 	list_free(recheckIndexes);
 
@@ -2372,6 +2468,45 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+
+	/*
+	 * Set miinfo if it can support multi-insert. This is the equivalent of
+	 * CIM_MULTI_* et al in copyfrom.c
+	 */
+
+	if (operation != CMD_INSERT ||
+			node->onConflictAction != ONCONFLICT_NONE)
+		mtstate->miinfo = NULL;
+	else if (mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
+			(mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_before_row ||
+			 // mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_after_row || // XXX or any row level triggers at all?
+			 mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_instead_row))
+		/*
+		 * Can't support multi-inserts when there are any BEFORE/INSTEAD OF
+		 * triggers on the table.
+		 */
+		mtstate->miinfo = NULL;
+	else if (node->rootRelation > 0 &&
+			mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
+			 mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_new_table)
+		/*
+		 * For partitioned tables we can't support multi-inserts when there
+		 * are any statement level insert triggers.
+		 */
+		mtstate->miinfo = NULL;
+	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL
+			/* || cstate->volatile_defexprs */ )
+		// XXX contain_volatile_functions_not_nextval((Node *) defexpr);
+		/* Can't support multi-inserts to foreign tables or if there are any */
+		mtstate->miinfo = NULL;
+	else
+	{
+		mtstate->miinfo = calloc(sizeof(*mtstate->miinfo), 1);
+		MultiInsertInfoInit(mtstate->miinfo, mtstate->rootResultRelInfo,
+				mtstate->mt_transition_capture,
+				estate, GetCurrentCommandId(true), 0);
+	}
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2864,6 +2999,19 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
+	}
+
+	if (node->miinfo)
+	{
+		if (!MultiInsertInfoIsEmpty(node->miinfo))
+			 MultiInsertInfoFlush(node->miinfo, node->resultRelInfo); // root ?
+		MultiInsertInfoCleanup(node->miinfo);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 00018abb7d..e08577851f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -54,6 +54,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "executor/nodeModifyTable.h"
 #include "optimizer/geqo.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paths.h"
@@ -3445,6 +3446,15 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"bulk_insert_ntuples", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Enable bulk insertions after this number of tuples."),
+			gettext_noop("A ring buffer of limited size will be used and updates done in batch"),
+		},
+		&bulk_insert_ntuples,
+		1000, -1, INT_MAX,
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index e37942df39..372414b1c0 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -15,6 +15,7 @@
 #define COPYFROM_INTERNAL_H
 
 #include "commands/copy.h"
+#include "executor/nodeModifyTable.h"
 #include "commands/trigger.h"
 
 /*
@@ -92,10 +93,12 @@ typedef struct CopyFromStateData
 
 	/* these are just for error messages, see CopyFromErrorCallback */
 	const char *cur_relname;	/* table name for error messages */
-	uint64		cur_lineno;		/* line number for error messages */
 	const char *cur_attname;	/* current att for error messages */
 	const char *cur_attval;		/* current att value for error messages */
 
+	/* For bulk inserts and for error callback */
+	MultiInsertInfo miinfo;
+
 	/*
 	 * Working state
 	 */
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 83e2965531..30542a542a 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -13,8 +13,12 @@
 #ifndef NODEMODIFYTABLE_H
 #define NODEMODIFYTABLE_H
 
+#include "commands/trigger.h"
+#include "executor/executor.h" // XXX
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT int bulk_insert_ntuples;
+
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
 									   CmdType cmdtype);
@@ -23,4 +27,371 @@ extern ModifyTableState *ExecInitModifyTable(ModifyTable *node, EState *estate,
 extern void ExecEndModifyTable(ModifyTableState *node);
 extern void ExecReScanModifyTable(ModifyTableState *node);
 
+/* Bulk insert stuff which used to live in copy.c */
+
+/*
+ * No more than this many tuples per MultiInsertBuffer
+ *
+ * Caution: Don't make this too big, as we could end up with this many
+ * MultiInsertBuffer items stored in MultiInsertInfo's
+ * multiInsertBuffers list.  Increasing this can cause quadratic growth in
+ * memory requirements during copies into partitioned tables with a large
+ * number of partitions.
+ */
+#define MAX_BUFFERED_TUPLES		1000
+
+/*
+ * Flush buffers if there are >= this many bytes, as counted by the input
+ * size, of tuples stored.
+ */
+#define MAX_BUFFERED_BYTES		65535
+
+/* Trim the list of buffers back down to this number after flushing */
+#define MAX_PARTITION_BUFFERS	32
+
+/* Stores multi-insert data related to a single relation in CopyFrom. */
+typedef struct MultiInsertBuffer
+{
+	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
+	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
+	BulkInsertState bistate;	/* BulkInsertState for this rel */
+	int			nused;			/* number of 'slots' containing tuples */
+	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
+												 * stream */
+} MultiInsertBuffer;
+
+/*
+ * Stores one or many MultiInsertBuffers and details about the size and
+ * number of tuples which are stored in them.  This allows multiple buffers to
+ * exist at once when COPY/INSERTing into a partitioned table.
+ */
+typedef struct MultiInsertInfo
+{
+	List	   *multiInsertBuffers; /* List of tracked MultiInsertBuffers */
+	int			bufferedTuples; /* number of tuples buffered over all buffers */
+	int			bufferedBytes;	/* number of bytes from all buffered tuples */
+	TransitionCaptureState	*transition_capture;
+	EState	   *estate;			/* Executor state */
+	CommandId	mycid;			/* Command Id */
+	int			ti_options;		/* table insert options */
+	size_t		ntuples;		/* Number of rows *eligible* for multi-insert */
+
+	/* Line number for errors in copyfrom.c */
+	uint64		cur_lineno;
+	bool		line_buf_valid;
+} MultiInsertInfo;
+
+
+/*
+ * Allocate memory and initialize a new MultiInsertBuffer for this
+ * ResultRelInfo.
+ */
+static MultiInsertBuffer *
+MultiInsertBufferInit(ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = (MultiInsertBuffer *) palloc(sizeof(MultiInsertBuffer));
+	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
+	buffer->resultRelInfo = rri;
+	buffer->bistate = GetBulkInsertState();
+	buffer->nused = 0;
+
+	return buffer;
+}
+
+/*
+ * Make a new buffer for this ResultRelInfo.
+ */
+static inline void
+MultiInsertInfoSetupBuffer(MultiInsertInfo *miinfo,
+							   ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = MultiInsertBufferInit(rri);
+
+	/* Setup back-link so we can easily find this buffer again */
+	rri->ri_MultiInsertBuffer = buffer;
+	/* Record that we're tracking this buffer */
+	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+}
+
+/*
+ * Initialize an already allocated MultiInsertInfo.
+ *
+ * If rri is a non-partitioned table then a MultiInsertBuffer is set up
+ * for that table.
+ */
+static inline void
+MultiInsertInfoInit(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						TransitionCaptureState *transition_capture,
+						EState *estate, CommandId mycid, int ti_options)
+{
+	miinfo->multiInsertBuffers = NIL;
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+	miinfo->transition_capture = transition_capture;
+	miinfo->estate = estate;
+	miinfo->mycid = mycid;
+	miinfo->ti_options = ti_options;
+	miinfo->cur_lineno = 0;
+
+	/*
+	 * Only setup the buffer when not dealing with a partitioned table.
+	 * Buffers for partitioned tables will just be setup when we need to send
+	 * tuples their way for the first time.
+	 */
+	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		MultiInsertInfoSetupBuffer(miinfo, rri);
+}
+
+/*
+ * Returns true if the buffers are full
+ */
+static inline bool
+MultiInsertInfoIsFull(MultiInsertInfo *miinfo)
+{
+	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
+		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
+		return true;
+	return false;
+}
+
+/*
+ * Returns true if we have no buffered tuples
+ */
+static inline bool
+MultiInsertInfoIsEmpty(MultiInsertInfo *miinfo)
+{
+	return miinfo->bufferedTuples == 0;
+}
+
+/*
+ * Write the tuples stored in 'buffer' out to the table.
+ */
+static inline void
+MultiInsertBufferFlush(MultiInsertInfo *miinfo,
+						   MultiInsertBuffer *buffer)
+{
+	MemoryContext oldcontext;
+	int			i;
+	uint64		save_cur_lineno;
+	EState	   *estate = miinfo->estate;
+	CommandId	mycid = miinfo->mycid;
+	int			ti_options = miinfo->ti_options;
+	bool		line_buf_valid = miinfo->line_buf_valid;
+	int			nused = buffer->nused;
+	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
+	TupleTableSlot **slots = buffer->slots;
+
+	/*
+	 * Print error context information correctly, if one of the operations
+	 * below fail.
+	 */
+	miinfo->line_buf_valid = false;
+	save_cur_lineno = miinfo->cur_lineno;
+
+	/*
+	 * table_multi_insert may leak memory, so switch to short-lived memory
+	 * context before calling it.
+	 */
+	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); // XXX requires executor.h
+	table_multi_insert(resultRelInfo->ri_RelationDesc,
+					   slots,
+					   nused,
+					   mycid,
+					   ti_options,
+					   buffer->bistate);
+	MemoryContextSwitchTo(oldcontext);
+
+	for (i = 0; i < nused; i++)
+	{
+		/*
+		 * If there are any indexes, update them for all the inserted tuples,
+		 * and run AFTER ROW INSERT triggers.
+		 */
+		if (resultRelInfo->ri_NumIndices > 0)
+		{
+			List	   *recheckIndexes;
+
+			miinfo->cur_lineno = buffer->linenos[i];
+			recheckIndexes =
+				ExecInsertIndexTuples(resultRelInfo,
+									  buffer->slots[i], estate, false, false,
+									  NULL, NIL);
+
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], recheckIndexes,
+								 miinfo->transition_capture);
+			list_free(recheckIndexes);
+		}
+
+		/*
+		 * There's no indexes, but see if we need to run AFTER ROW INSERT
+		 * triggers anyway.
+		 */
+		else if (resultRelInfo->ri_TrigDesc != NULL &&
+				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
+				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
+		{
+			miinfo->cur_lineno = buffer->linenos[i];
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], NIL, miinfo->transition_capture);
+		}
+
+		ExecClearTuple(slots[i]);
+	}
+
+	/* Mark that all slots are free */
+	buffer->nused = 0;
+
+	/* reset cur_lineno and line_buf_valid to what they were */
+	miinfo->line_buf_valid = line_buf_valid;
+	miinfo->cur_lineno = save_cur_lineno;
+}
+
+/*
+ * Drop used slots and free member for this buffer.
+ *
+ * The buffer must be flushed before cleanup.
+ */
+static inline void
+MultiInsertBufferCleanup(MultiInsertInfo *miinfo,
+							 MultiInsertBuffer *buffer)
+{
+	int			i;
+
+	/* Ensure buffer was flushed */
+	Assert(buffer->nused == 0);
+
+	/* Remove back-link to ourself */
+	buffer->resultRelInfo->ri_MultiInsertBuffer = NULL;
+
+	FreeBulkInsertState(buffer->bistate);
+
+	/* Since we only create slots on demand, just drop the non-null ones. */
+	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
+		ExecDropSingleTupleTableSlot(buffer->slots[i]);
+
+	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
+							 miinfo->ti_options);
+
+	pfree(buffer);
+}
+
+/*
+ * Write out all stored tuples in all buffers out to the tables.
+ *
+ * Once flushed we also trim the tracked buffers list down to size by removing
+ * the buffers created earliest first.
+ *
+ * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
+ * used.  When cleaning up old buffers we'll never remove the one for
+ * 'curr_rri'.
+ */
+static inline void
+MultiInsertInfoFlush(MultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+	{
+		MultiInsertBuffer *buffer = (MultiInsertBuffer *) lfirst(lc);
+
+		MultiInsertBufferFlush(miinfo, buffer);
+	}
+
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+
+	/*
+	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
+	 * remove buffers starting with the ones we created first.  It seems less
+	 * likely that these older ones will be needed than the ones that were
+	 * just created.
+	 */
+	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
+	{
+		MultiInsertBuffer *buffer;
+
+		buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+
+		/*
+		 * We never want to remove the buffer that's currently being used, so
+		 * if we happen to find that then move it to the end of the list.
+		 */
+		if (buffer->resultRelInfo == curr_rri)
+		{
+			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+			buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+		}
+
+		MultiInsertBufferCleanup(miinfo, buffer);
+		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+	}
+}
+
+/*
+ * Cleanup allocated buffers and free memory
+ */
+static inline void
+MultiInsertInfoCleanup(MultiInsertInfo *miinfo)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+		MultiInsertBufferCleanup(miinfo, lfirst(lc));
+
+	list_free(miinfo->multiInsertBuffers);
+}
+
+/*
+ * Get the next TupleTableSlot that the next tuple should be stored in.
+ *
+ * Callers must ensure that the buffer is not full.
+ *
+ * Note: 'miinfo' is unused but has been included for consistency with the
+ * other functions in this area.
+ */
+static inline TupleTableSlot *
+MultiInsertInfoNextFreeSlot(MultiInsertInfo *miinfo,
+								ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+	int			nused = buffer->nused;
+
+	Assert(buffer != NULL);
+	Assert(nused < MAX_BUFFERED_TUPLES);
+
+	if (buffer->slots[nused] == NULL)
+		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
+	return buffer->slots[nused];
+}
+
+/*
+ * Record the previously reserved TupleTableSlot that was reserved by
+ * MultiInsertInfoNextFreeSlot as being consumed.
+ */
+static inline void
+MultiInsertInfoStore(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						 TupleTableSlot *slot, int tuplen, uint64 lineno)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+
+	Assert(buffer != NULL);
+	Assert(slot == buffer->slots[buffer->nused]);
+
+	/* Store the line number so we can properly report any errors later */
+	buffer->linenos[buffer->nused] = lineno;
+
+	/* Record this slot as being used */
+	buffer->nused++;
+
+	/* Update how many tuples are stored and their size */
+	miinfo->bufferedTuples++;
+	miinfo->bufferedBytes += tuplen;
+}
+
 #endif							/* NODEMODIFYTABLE_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 943931f65d..19668bbf66 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -32,6 +33,9 @@
 #include "utils/tuplesort.h"
 #include "utils/tuplestore.h"
 
+/* This would be a circular inclusion */
+// #include "executor/nodeModifyTable.h"
+
 struct PlanState;				/* forward references in this file */
 struct ParallelHashJoinState;
 struct ExecRowMark;
@@ -39,8 +43,8 @@ struct ExprState;
 struct ExprContext;
 struct RangeTblEntry;			/* avoid including parsenodes.h here */
 struct ExprEvalStep;			/* avoid including execExpr.h everywhere */
-struct CopyMultiInsertBuffer;
-
+// struct MultiInsertBuffer;
+// struct MultiInsertInfo;
 
 /* ----------------
  *		ExprState node
@@ -511,8 +515,8 @@ typedef struct ResultRelInfo
 	 */
 	TupleConversionMap *ri_ChildToRootMap;
 
-	/* for use by copyfrom.c when performing multi-inserts */
-	struct CopyMultiInsertBuffer *ri_CopyMultiInsertBuffer;
+	/* for use by copyfrom.c/modifyTable when performing multi-inserts */
+	struct MultiInsertBuffer *ri_MultiInsertBuffer;
 } ResultRelInfo;
 
 /* ----------------
@@ -1178,6 +1182,10 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT, when miinfo cannot be used */
+	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
+	struct MultiInsertInfo	*miinfo;
+	size_t		ntuples;	/* Number of tuples inserted; */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index da50ee3b67..bc4c1a4fc2 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -462,6 +462,49 @@ Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'),
             part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED,
             part_default DEFAULT, PARTITIONED
 
+-- bulk inserts
+truncate hash_parted;
+begin;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+ a  
+----
+ 12
+ 11
+(2 rows)
+
+-- exercise bulk insert to partitions
+SET client_min_messages=debug1;
+insert into hash_parted select generate_series(1,9999);
+DEBUG:  enabling bulk insert
+DEBUG:  enabling multi insert
+RESET client_min_messages;
+select count(1) from hash_parted;
+ count 
+-------
+ 10001
+(1 row)
+
+commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Index Only Scan using hpart1_a_idx on hpart1 hash_parted
+   Index Cond: (a = 13)
+(2 rows)
+
+select * from hash_parted where a=13;
+ a  
+----
+ 13
+(1 row)
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 963faa1614..a74eb3826a 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -280,6 +280,26 @@ from hash_parted order by part;
 -- partitions
 \d+ list_parted
 
+-- bulk inserts
+truncate hash_parted;
+begin;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+-- exercise bulk insert to partitions
+SET client_min_messages=debug1;
+insert into hash_parted select generate_series(1,9999);
+RESET client_min_messages;
+select count(1) from hash_parted;
+commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+select * from hash_parted where a=13;
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index bab4f3adb3..e07588fb6c 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -423,8 +423,6 @@ ConvertRowtypeExpr
 CookedConstraint
 CopyDest
 CopyInsertMethod
-CopyMultiInsertBuffer
-CopyMultiInsertInfo
 CopyState
 CopyStateData
 CopyStmt
@@ -1402,6 +1400,8 @@ ModifyTableState
 MorphOpaque
 MsgType
 MultiAssignRef
+MultiInsertBuffer
+MultiInsertInfo
 MultiSortSupport
 MultiSortSupportData
 MultiXactId
-- 
2.17.0

v10-0002-WIP-Check-for-volatile-defaults.patchtext/x-diff; charset=us-asciiDownload

From 2799ea649a720305d74bc77baeed5387cdcbc49d Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Tue, 1 Dec 2020 23:11:31 -0600
Subject: [PATCH v10 2/4] WIP: Check for volatile defaults

We want to check if any column *uses* a volatile default value, but after
parsing and rewriting, that information appears to be lost about which column
values are defaults and which were specified.  insertedcols doesn't appear to
be useful for this.  So add a field to track if a TargetEntry is planned with
column default.
---
 src/backend/executor/nodeModifyTable.c | 64 ++++++++++++++++++++++++--
 src/backend/nodes/copyfuncs.c          |  1 +
 src/backend/nodes/equalfuncs.c         |  1 +
 src/backend/nodes/makefuncs.c          |  1 +
 src/backend/nodes/outfuncs.c           |  1 +
 src/backend/nodes/readfuncs.c          |  1 +
 src/backend/optimizer/util/tlist.c     |  1 +
 src/backend/rewrite/rewriteHandler.c   |  3 ++
 src/include/nodes/primnodes.h          |  2 +
 9 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 0907b3ebd5..e018946c0e 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -50,6 +50,7 @@
 #include "foreign/fdwapi.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
+#include "optimizer/optimizer.h"
 #include "rewrite/rewriteHandler.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
@@ -2401,6 +2402,61 @@ ExecModifyTable(PlanState *pstate)
 	return NULL;
 }
 
+/*
+ * Determine if a table has volatile column defaults which are used by a given
+ * planned statement (if the column is not specified or specified as DEFAULT).
+ * This works only for INSERT.
+ */
+static bool
+has_volatile_defaults(ResultRelInfo *resultRelInfo, ModifyTable *node)
+{
+	TupleDesc	tupDesc = RelationGetDescr(resultRelInfo->ri_RelationDesc);
+	Plan		*plan;
+
+	Assert(list_length(node->plans) == 1);
+	plan = linitial(node->plans);
+
+	for (int attnum = 1; attnum <= tupDesc->natts; attnum++)
+	{
+		Form_pg_attribute att = TupleDescAttr(tupDesc, attnum - 1);
+		Expr		*defexpr;
+		TargetEntry	*tle;
+
+		/* We don't need to check dropped/generated attributes */
+		if (att->attisdropped || att->attgenerated)
+			continue;
+
+		tle = list_nth(plan->targetlist, attnum - 1);
+		Assert(tle != NULL);
+		Assert(tle->resno == attnum);
+
+		/*
+		 * If the column was specified with a non-default value, then don't
+		 * check the volatility of its default
+		 */
+		if (!tle->isdefault)
+			continue;
+
+		/* Check the column's default value if one exists */
+		defexpr = (Expr *) build_column_default(resultRelInfo->ri_RelationDesc, attnum);
+		if (defexpr == NULL)
+			continue;
+
+		/* Run the expression through planner */
+		// defexpr = expression_planner(defexpr);
+		// (void) ExecInitExpr(defexpr, NULL);
+		expression_planner(defexpr);
+
+		if (contain_volatile_functions_not_nextval((Node *) defexpr))
+		{
+			elog(DEBUG1, "found volatile att %d", attnum);
+			return true;
+		}
+	}
+
+	return false;
+}
+
 /* ----------------------------------------------------------------
  *		ExecInitModifyTable
  * ----------------------------------------------------------------
@@ -2495,10 +2551,10 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 		 * are any statement level insert triggers.
 		 */
 		mtstate->miinfo = NULL;
-	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL
-			/* || cstate->volatile_defexprs */ )
-		// XXX contain_volatile_functions_not_nextval((Node *) defexpr);
-		/* Can't support multi-inserts to foreign tables or if there are any */
+	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL ||
+			has_volatile_defaults(mtstate->rootResultRelInfo, node))
+		/* Can't support multi-inserts to foreign tables or if there are any
+		 * volatile default expressions in the table. */
 		mtstate->miinfo = NULL;
 	else
 	{
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 65bbc18ecb..0088539df7 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -2178,6 +2178,7 @@ _copyTargetEntry(const TargetEntry *from)
 	COPY_SCALAR_FIELD(resorigtbl);
 	COPY_SCALAR_FIELD(resorigcol);
 	COPY_SCALAR_FIELD(resjunk);
+	COPY_SCALAR_FIELD(isdefault);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index c2d73626fc..12260f803f 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -770,6 +770,7 @@ _equalTargetEntry(const TargetEntry *a, const TargetEntry *b)
 	COMPARE_SCALAR_FIELD(resorigtbl);
 	COMPARE_SCALAR_FIELD(resorigcol);
 	COMPARE_SCALAR_FIELD(resjunk);
+	COMPARE_SCALAR_FIELD(isdefault);
 
 	return true;
 }
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index 01c110cd2f..aeeba7032f 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -254,6 +254,7 @@ makeTargetEntry(Expr *expr,
 	tle->ressortgroupref = 0;
 	tle->resorigtbl = InvalidOid;
 	tle->resorigcol = 0;
+	tle->isdefault = false;
 
 	tle->resjunk = resjunk;
 
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index f5dcedf6e8..50f5bfda8e 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -1661,6 +1661,7 @@ _outTargetEntry(StringInfo str, const TargetEntry *node)
 	WRITE_OID_FIELD(resorigtbl);
 	WRITE_INT_FIELD(resorigcol);
 	WRITE_BOOL_FIELD(resjunk);
+	WRITE_BOOL_FIELD(isdefault);
 }
 
 static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 4388aae71d..973b558eec 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1315,6 +1315,7 @@ _readTargetEntry(void)
 	READ_OID_FIELD(resorigtbl);
 	READ_INT_FIELD(resorigcol);
 	READ_BOOL_FIELD(resjunk);
+	READ_BOOL_FIELD(isdefault);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c
index 89853a0630..7ef1517027 100644
--- a/src/backend/optimizer/util/tlist.c
+++ b/src/backend/optimizer/util/tlist.c
@@ -349,6 +349,7 @@ apply_tlist_labeling(List *dest_tlist, List *src_tlist)
 		dest_tle->resorigtbl = src_tle->resorigtbl;
 		dest_tle->resorigcol = src_tle->resorigcol;
 		dest_tle->resjunk = src_tle->resjunk;
+		dest_tle->isdefault = src_tle->isdefault;
 	}
 }
 
diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c
index 0672f497c6..3d93e24004 100644
--- a/src/backend/rewrite/rewriteHandler.c
+++ b/src/backend/rewrite/rewriteHandler.c
@@ -987,10 +987,13 @@ rewriteTargetListIU(List *targetList,
 			}
 
 			if (new_expr)
+			{
 				new_tle = makeTargetEntry((Expr *) new_expr,
 										  attrno,
 										  pstrdup(NameStr(att_tup->attname)),
 										  false);
+				new_tle->isdefault = true;
+			}
 		}
 
 		/*
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index d4ce037088..888bd36a07 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -1437,6 +1437,8 @@ typedef struct TargetEntry
 	AttrNumber	resorigcol;		/* column's number in source table */
 	bool		resjunk;		/* set to true to eliminate the attribute from
 								 * final target list */
+	bool		isdefault;		/* true if using the column default, either
+								 * by "DEFAULT" or omission of the column */
 } TargetEntry;
 
 
-- 
2.17.0

v10-0003-COPY-flush-multi-insert-buffer-based-on-accumula.patchtext/x-diff; charset=us-asciiDownload

From 1f1647182bb9c2620bde6af054c5fb27a2534f1c Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 5 Dec 2020 08:52:14 -0600
Subject: [PATCH v10 3/4] COPY: flush multi-insert buffer based on accumulated
 size of tuples..

..rather than line length
---
 src/backend/commands/copyfrom.c        | 2 +-
 src/include/executor/nodeModifyTable.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 5b8a1e4b61..f36c29ce7f 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -686,7 +686,7 @@ CopyFrom(CopyFromState cstate)
 					/* Add this tuple to the tuple buffer */
 					MultiInsertInfoStore(&cstate->miinfo,
 											 resultRelInfo, myslot,
-											 cstate->line_buf.len,
+											 MemoryContextMemAllocated(myslot->tts_mcxt, true),
 											 cstate->miinfo.cur_lineno);
 
 					/*
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 30542a542a..87b689099c 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -41,10 +41,10 @@ extern void ExecReScanModifyTable(ModifyTableState *node);
 #define MAX_BUFFERED_TUPLES		1000
 
 /*
- * Flush buffers if there are >= this many bytes, as counted by the input
- * size, of tuples stored.
+ * Flush buffers if there are >= this many bytes of tuples stored, as counted
+ * by the slot's memory contexts.
  */
-#define MAX_BUFFERED_BYTES		65535
+#define MAX_BUFFERED_BYTES		(1024*1024*8)
 
 /* Trim the list of buffers back down to this number after flushing */
 #define MAX_PARTITION_BUFFERS	32
-- 
2.17.0

v10-0004-WIP-check-tuple-size.patchtext/x-diff; charset=us-asciiDownload

From 85cf321bfab8efeb51affa97bb4226646264340e Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Tue, 1 Dec 2020 17:20:25 -0600
Subject: [PATCH v10 4/4] WIP: check tuple size

Or maybe INSERT should flush buffer based only on the *number* of tuples, and
not their size ?
---
 src/backend/executor/nodeModifyTable.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index e018946c0e..ec9e43ac55 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -760,7 +760,13 @@ ExecInsert(ModifyTableState *mtstate,
 			batchslot = MultiInsertInfoNextFreeSlot(mtstate->miinfo, resultRelInfo);
 			ExecCopySlot(batchslot, slot);
 
-			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot, 0, 0); // XXX: tuplen/lineno
+			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot,
+					// sizeof(void*) * batchslot->tts_nvalid, /* tuple size - underestimate */
+					MemoryContextMemAllocated(batchslot->tts_mcxt, true), /* tuple size */
+					mtstate->ntuples); /* lineno */
+
+			elog(DEBUG2, "bufferedBytes %d; tuples %ld",
+					mtstate->miinfo->bufferedBytes, mtstate->ntuples);
 
 			if (MultiInsertInfoIsFull(mtstate->miinfo))
 				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
-- 
2.17.0

Import Notes

Reply to msg id not found: OS0PR01MB57168194265E54E96632B41B94819@OS0PR01MB5716.jpnprd01.prod.outlook.com

houzj.fnst@fujitsu.com

houzj.fnst@fujitsu.com

almost 5 years ago

In reply to: Justin Pryzby (#20)

RE: should INSERT SELECT use a BulkInsertState?

I am very interested in this patch, and I plan to do some experiments with the

patch.

Can you please rebase the patch because it seems can not applied to the

master now.

Thanks for your interest.

I was sitting on a rebased version since the bulk FDW patch will cause conflicts,
and since this should maybe be built on top of the table-am patch (2871).
Have fun :)

Hi,

When I testing with the patch, I found I can not use "\d tablename".
It reports the following error, it this related to the patch?

--------------------------------------------------------------------------
ERROR: did not find '}' at end of input node at character 141
STATEMENT: SELECT pol.polname, pol.polpermissive,
CASE WHEN pol.polroles = '{0}' THEN NULL ELSE pg_catalog.array_to_string(array(select rolname from pg_catalog.pg_roles where oid = any (pol.polroles) order by 1),',') END,
pg_catalog.pg_get_expr(pol.polqual, pol.polrelid),
pg_catalog.pg_get_expr(pol.polwithcheck, pol.polrelid),
CASE pol.polcmd
WHEN 'r' THEN 'SELECT'
WHEN 'a' THEN 'INSERT'
WHEN 'w' THEN 'UPDATE'
WHEN 'd' THEN 'DELETE'
END AS cmd
FROM pg_catalog.pg_policy pol
WHERE pol.polrelid = '58112' ORDER BY 1;
ERROR: did not find '}' at end of input node
LINE 2: ...catalog.array_to_string(array(select rolname from pg_catalog...
--------------------------------------------------------------------------

Best regards,
houzj

houzj.fnst@fujitsu.com

houzj.fnst@fujitsu.com

almost 5 years ago

In reply to: houzj.fnst@fujitsu.com (#21)

RE: should INSERT SELECT use a BulkInsertState?

I am very interested in this patch, and I plan to do some
experiments with the

patch.

Can you please rebase the patch because it seems can not applied to
the

master now.

Thanks for your interest.

I was sitting on a rebased version since the bulk FDW patch will cause
conflicts, and since this should maybe be built on top of the table-am patch

(2871).

Have fun :)

Hi,

When I testing with the patch, I found I can not use "\d tablename".
It reports the following error, it this related to the patch?

Sorry, solved by re-initdb.

Best regards,
houzj

ibrar.ahmad@gmail.com

almost 5 years ago

In reply to: houzj.fnst@fujitsu.com (#22)

1 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Mon, Mar 8, 2021 at 2:18 PM houzj.fnst@fujitsu.com <
houzj.fnst@fujitsu.com> wrote:

I am very interested in this patch, and I plan to do some
experiments with the

patch.

Can you please rebase the patch because it seems can not applied to
the

master now.

Thanks for your interest.

I was sitting on a rebased version since the bulk FDW patch will cause
conflicts, and since this should maybe be built on top of the table-am

patch

(2871).

Have fun :)

Hi,

When I testing with the patch, I found I can not use "\d tablename".
It reports the following error, it this related to the patch?

Sorry, solved by re-initdb.

Best regards,
houzj

One of the patch

(v10-0001-INSERT-SELECT-to-use-BulkInsertState-and-multi_i.patch) from the
patchset does not apply.

http://cfbot.cputube.org/patch_32_2553.log

1 out of 13 hunks FAILED -- saving rejects to file
src/backend/commands/copyfrom.c.rej

It is a minor change, therefore I fixed that to make cfbot happy. Please
take a look if that works for you.

--
Ibrar Ahmed

Attachments:

v11-0001-INSERT-SELECT-to-use-BulkInsertState-and-multi_i.patchapplication/octet-stream; name=v11-0001-INSERT-SELECT-to-use-BulkInsertState-and-multi_i.patchDownload

From e2b93b3b3aaa32f680193b42d91a80bab40768a4 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 8 May 2020 02:17:32 -0500
Subject: [PATCH v10 1/4] INSERT SELECT to use BulkInsertState and multi_insert

Renames structures;
Move MultipleInsert functions from copyfrom.c to (tentatively) nodeModifyTable.h;
Move into MultiInsertInfo: transition_capture and cur_lineno (via cstate->miinfo);

Dynamically switch to multi-insert mode based on the number of insertions.
This is intended to accomodate 1) the original use case of INSERT using a small
ring buffer to avoid leaving behind dirty buffers; and, 2) Automatically using
multi-inserts for batch operations; 3) allow the old behavior of leaving behind
dirty buffers, which might allow INSERT to run more quickly, at the cost of
leaving behind many dirty buffers which other backends may have to write out. 

XXX: for (1), the bulk-insert state is used even if not multi-insert, including
for a VALUES.

TODO: use cstate->miinfo.cur_lineno++ instead of mtstate->miinfo->ntuples
---
diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 2ed696d429..0e1c483183 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -46,54 +46,6 @@
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
 
-/*
- * No more than this many tuples per CopyMultiInsertBuffer
- *
- * Caution: Don't make this too big, as we could end up with this many
- * CopyMultiInsertBuffer items stored in CopyMultiInsertInfo's
- * multiInsertBuffers list.  Increasing this can cause quadratic growth in
- * memory requirements during copies into partitioned tables with a large
- * number of partitions.
- */
-#define MAX_BUFFERED_TUPLES		1000
-
-/*
- * Flush buffers if there are >= this many bytes, as counted by the input
- * size, of tuples stored.
- */
-#define MAX_BUFFERED_BYTES		65535
-
-/* Trim the list of buffers back down to this number after flushing */
-#define MAX_PARTITION_BUFFERS	32
-
-/* Stores multi-insert data related to a single relation in CopyFrom. */
-typedef struct CopyMultiInsertBuffer
-{
-	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
-	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
-	BulkInsertState bistate;	/* BulkInsertState for this rel */
-	int			nused;			/* number of 'slots' containing tuples */
-	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
-												 * stream */
-} CopyMultiInsertBuffer;
-
-/*
- * Stores one or many CopyMultiInsertBuffers and details about the size and
- * number of tuples which are stored in them.  This allows multiple buffers to
- * exist at once when COPYing into a partitioned table.
- */
-typedef struct CopyMultiInsertInfo
-{
-	List	   *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
-	int			bufferedTuples; /* number of tuples buffered over all buffers */
-	int			bufferedBytes;	/* number of bytes from all buffered tuples */
-	CopyFromState	cstate;			/* Copy state for this CopyMultiInsertInfo */
-	EState	   *estate;			/* Executor state used for COPY */
-	CommandId	mycid;			/* Command Id used for COPY */
-	int			ti_options;		/* table insert options */
-} CopyMultiInsertInfo;
-
-
 /* non-export function prototypes */
 static char *limit_printout_length(const char *str);
 
@@ -111,7 +63,7 @@ CopyFromErrorCallback(void *arg)
 	char		curlineno_str[32];
 
 	snprintf(curlineno_str, sizeof(curlineno_str), UINT64_FORMAT,
-			 cstate->cur_lineno);
+			 cstate->miinfo.cur_lineno);
 
 	if (cstate->opts.binary)
 	{
@@ -206,317 +158,6 @@ limit_printout_length(const char *str)
 	return res;
 }
 
-/*
- * Allocate memory and initialize a new CopyMultiInsertBuffer for this
- * ResultRelInfo.
- */
-static CopyMultiInsertBuffer *
-CopyMultiInsertBufferInit(ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = (CopyMultiInsertBuffer *) palloc(sizeof(CopyMultiInsertBuffer));
-	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
-	buffer->resultRelInfo = rri;
-	buffer->bistate = GetBulkInsertState();
-	buffer->nused = 0;
-
-	return buffer;
-}
-
-/*
- * Make a new buffer for this ResultRelInfo.
- */
-static inline void
-CopyMultiInsertInfoSetupBuffer(CopyMultiInsertInfo *miinfo,
-							   ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer;
-
-	buffer = CopyMultiInsertBufferInit(rri);
-
-	/* Setup back-link so we can easily find this buffer again */
-	rri->ri_CopyMultiInsertBuffer = buffer;
-	/* Record that we're tracking this buffer */
-	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-}
-
-/*
- * Initialize an already allocated CopyMultiInsertInfo.
- *
- * If rri is a non-partitioned table then a CopyMultiInsertBuffer is set up
- * for that table.
- */
-static void
-CopyMultiInsertInfoInit(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						CopyFromState cstate, EState *estate, CommandId mycid,
-						int ti_options)
-{
-	miinfo->multiInsertBuffers = NIL;
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-	miinfo->cstate = cstate;
-	miinfo->estate = estate;
-	miinfo->mycid = mycid;
-	miinfo->ti_options = ti_options;
-
-	/*
-	 * Only setup the buffer when not dealing with a partitioned table.
-	 * Buffers for partitioned tables will just be setup when we need to send
-	 * tuples their way for the first time.
-	 */
-	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
-		CopyMultiInsertInfoSetupBuffer(miinfo, rri);
-}
-
-/*
- * Returns true if the buffers are full
- */
-static inline bool
-CopyMultiInsertInfoIsFull(CopyMultiInsertInfo *miinfo)
-{
-	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
-		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
-		return true;
-	return false;
-}
-
-/*
- * Returns true if we have no buffered tuples
- */
-static inline bool
-CopyMultiInsertInfoIsEmpty(CopyMultiInsertInfo *miinfo)
-{
-	return miinfo->bufferedTuples == 0;
-}
-
-/*
- * Write the tuples stored in 'buffer' out to the table.
- */
-static inline void
-CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo,
-						   CopyMultiInsertBuffer *buffer)
-{
-	MemoryContext oldcontext;
-	int			i;
-	uint64		save_cur_lineno;
-	CopyFromState	cstate = miinfo->cstate;
-	EState	   *estate = miinfo->estate;
-	CommandId	mycid = miinfo->mycid;
-	int			ti_options = miinfo->ti_options;
-	bool		line_buf_valid = cstate->line_buf_valid;
-	int			nused = buffer->nused;
-	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
-	TupleTableSlot **slots = buffer->slots;
-
-	/*
-	 * Print error context information correctly, if one of the operations
-	 * below fail.
-	 */
-	cstate->line_buf_valid = false;
-	save_cur_lineno = cstate->cur_lineno;
-
-	/*
-	 * table_multi_insert may leak memory, so switch to short-lived memory
-	 * context before calling it.
-	 */
-	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
-	table_multi_insert(resultRelInfo->ri_RelationDesc,
-					   slots,
-					   nused,
-					   mycid,
-					   ti_options,
-					   buffer->bistate);
-	MemoryContextSwitchTo(oldcontext);
-
-	for (i = 0; i < nused; i++)
-	{
-		/*
-		 * If there are any indexes, update them for all the inserted tuples,
-		 * and run AFTER ROW INSERT triggers.
-		 */
-		if (resultRelInfo->ri_NumIndices > 0)
-		{
-			List	   *recheckIndexes;
-
-			cstate->cur_lineno = buffer->linenos[i];
-			recheckIndexes =
-				ExecInsertIndexTuples(resultRelInfo,
-									  buffer->slots[i], estate, false, false,
-									  NULL, NIL);
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], recheckIndexes,
-								 cstate->transition_capture);
-			list_free(recheckIndexes);
-		}
-
-		/*
-		 * There's no indexes, but see if we need to run AFTER ROW INSERT
-		 * triggers anyway.
-		 */
-		else if (resultRelInfo->ri_TrigDesc != NULL &&
-				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
-				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
-		{
-			cstate->cur_lineno = buffer->linenos[i];
-			ExecARInsertTriggers(estate, resultRelInfo,
-								 slots[i], NIL, cstate->transition_capture);
-		}
-
-		ExecClearTuple(slots[i]);
-	}
-
-	/* Mark that all slots are free */
-	buffer->nused = 0;
-
-	/* reset cur_lineno and line_buf_valid to what they were */
-	cstate->line_buf_valid = line_buf_valid;
-	cstate->cur_lineno = save_cur_lineno;
-}
-
-/*
- * Drop used slots and free member for this buffer.
- *
- * The buffer must be flushed before cleanup.
- */
-static inline void
-CopyMultiInsertBufferCleanup(CopyMultiInsertInfo *miinfo,
-							 CopyMultiInsertBuffer *buffer)
-{
-	int			i;
-
-	/* Ensure buffer was flushed */
-	Assert(buffer->nused == 0);
-
-	/* Remove back-link to ourself */
-	buffer->resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
-
-	FreeBulkInsertState(buffer->bistate);
-
-	/* Since we only create slots on demand, just drop the non-null ones. */
-	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
-		ExecDropSingleTupleTableSlot(buffer->slots[i]);
-
-	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
-							 miinfo->ti_options);
-
-	pfree(buffer);
-}
-
-/*
- * Write out all stored tuples in all buffers out to the tables.
- *
- * Once flushed we also trim the tracked buffers list down to size by removing
- * the buffers created earliest first.
- *
- * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
- * used.  When cleaning up old buffers we'll never remove the one for
- * 'curr_rri'.
- */
-static inline void
-CopyMultiInsertInfoFlush(CopyMultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-	{
-		CopyMultiInsertBuffer *buffer = (CopyMultiInsertBuffer *) lfirst(lc);
-
-		CopyMultiInsertBufferFlush(miinfo, buffer);
-	}
-
-	miinfo->bufferedTuples = 0;
-	miinfo->bufferedBytes = 0;
-
-	/*
-	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
-	 * remove buffers starting with the ones we created first.  It seems less
-	 * likely that these older ones will be needed than the ones that were
-	 * just created.
-	 */
-	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
-	{
-		CopyMultiInsertBuffer *buffer;
-
-		buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-
-		/*
-		 * We never want to remove the buffer that's currently being used, so
-		 * if we happen to find that then move it to the end of the list.
-		 */
-		if (buffer->resultRelInfo == curr_rri)
-		{
-			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
-			buffer = (CopyMultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
-		}
-
-		CopyMultiInsertBufferCleanup(miinfo, buffer);
-		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
-	}
-}
-
-/*
- * Cleanup allocated buffers and free memory
- */
-static inline void
-CopyMultiInsertInfoCleanup(CopyMultiInsertInfo *miinfo)
-{
-	ListCell   *lc;
-
-	foreach(lc, miinfo->multiInsertBuffers)
-		CopyMultiInsertBufferCleanup(miinfo, lfirst(lc));
-
-	list_free(miinfo->multiInsertBuffers);
-}
-
-/*
- * Get the next TupleTableSlot that the next tuple should be stored in.
- *
- * Callers must ensure that the buffer is not full.
- *
- * Note: 'miinfo' is unused but has been included for consistency with the
- * other functions in this area.
- */
-static inline TupleTableSlot *
-CopyMultiInsertInfoNextFreeSlot(CopyMultiInsertInfo *miinfo,
-								ResultRelInfo *rri)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-	int			nused = buffer->nused;
-
-	Assert(buffer != NULL);
-	Assert(nused < MAX_BUFFERED_TUPLES);
-
-	if (buffer->slots[nused] == NULL)
-		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
-	return buffer->slots[nused];
-}
-
-/*
- * Record the previously reserved TupleTableSlot that was reserved by
- * CopyMultiInsertInfoNextFreeSlot as being consumed.
- */
-static inline void
-CopyMultiInsertInfoStore(CopyMultiInsertInfo *miinfo, ResultRelInfo *rri,
-						 TupleTableSlot *slot, int tuplen, uint64 lineno)
-{
-	CopyMultiInsertBuffer *buffer = rri->ri_CopyMultiInsertBuffer;
-
-	Assert(buffer != NULL);
-	Assert(slot == buffer->slots[buffer->nused]);
-
-	/* Store the line number so we can properly report any errors later */
-	buffer->linenos[buffer->nused] = lineno;
-
-	/* Record this slot as being used */
-	buffer->nused++;
-
-	/* Update how many tuples are stored and their size */
-	miinfo->bufferedTuples++;
-	miinfo->bufferedBytes += tuplen;
-}
-
 /*
  * Copy FROM file to relation.
  */
@@ -538,7 +179,6 @@ CopyFrom(CopyFromState cstate)
 	int			ti_options = 0; /* start with default options for insert */
 	BulkInsertState bistate = NULL;
 	CopyInsertMethod insertMethod;
-	CopyMultiInsertInfo multiInsertInfo = {0};	/* pacify compiler */
 	int64		processed = 0;
 	int64		excluded = 0;
 	bool		has_before_insert_row_trig;
@@ -727,7 +367,7 @@ CopyFrom(CopyFromState cstate)
 		 * For partitioned tables we can't support multi-inserts when there
 		 * are any statement level insert triggers. It might be possible to
 		 * allow partitioned tables with such triggers in the future, but for
-		 * now, CopyMultiInsertInfoFlush expects that any before row insert
+		 * now, MultiInsertInfoFlush expects that any before row insert
 		 * and statement level insert triggers are on the same relation.
 		 */
 		insertMethod = CIM_SINGLE;
@@ -775,7 +415,8 @@ CopyFrom(CopyFromState cstate)
 		else
 			insertMethod = CIM_MULTI;
 
-		CopyMultiInsertInfoInit(&multiInsertInfo, resultRelInfo, cstate,
+		MultiInsertInfoInit(&cstate->miinfo, resultRelInfo,
+								cstate->transition_capture,
 								estate, mycid, ti_options);
 	}
 
@@ -838,7 +479,7 @@ CopyFrom(CopyFromState cstate)
 			Assert(resultRelInfo == target_resultRelInfo);
 			Assert(insertMethod == CIM_MULTI);
 
-			myslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+			myslot = MultiInsertInfoNextFreeSlot(&cstate->miinfo,
 													 resultRelInfo);
 		}
 
@@ -915,18 +556,18 @@ CopyFrom(CopyFromState cstate)
 				/* Set the multi-insert buffer to use for this partition. */
 				if (leafpart_use_multi_insert)
 				{
-					if (resultRelInfo->ri_CopyMultiInsertBuffer == NULL)
-						CopyMultiInsertInfoSetupBuffer(&multiInsertInfo,
+					if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+						MultiInsertInfoSetupBuffer(&cstate->miinfo,
 													   resultRelInfo);
 				}
 				else if (insertMethod == CIM_MULTI_CONDITIONAL &&
-						 !CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
+						 !MultiInsertInfoIsEmpty(&cstate->miinfo))
 				{
 					/*
 					 * Flush pending inserts if this partition can't use
 					 * batching, so rows are visible to triggers etc.
 					 */
-					CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					MultiInsertInfoFlush(&cstate->miinfo, resultRelInfo);
 				}
 
 				if (bistate != NULL)
@@ -972,7 +613,7 @@ CopyFrom(CopyFromState cstate)
 				/* no other path available for partitioned table */
 				Assert(insertMethod == CIM_MULTI_CONDITIONAL);
 
-				batchslot = CopyMultiInsertInfoNextFreeSlot(&multiInsertInfo,
+				batchslot = MultiInsertInfoNextFreeSlot(&cstate->miinfo,
 															resultRelInfo);
 
 				if (map != NULL)
@@ -1052,17 +693,17 @@ CopyFrom(CopyFromState cstate)
 					ExecMaterializeSlot(myslot);
 
 					/* Add this tuple to the tuple buffer */
-					CopyMultiInsertInfoStore(&multiInsertInfo,
+					MultiInsertInfoStore(&cstate->miinfo,
 											 resultRelInfo, myslot,
 											 cstate->line_buf.len,
-											 cstate->cur_lineno);
+											 cstate->miinfo.cur_lineno);
 
 					/*
 					 * If enough inserts have queued up, then flush all
 					 * buffers out to their tables.
 					 */
-					if (CopyMultiInsertInfoIsFull(&multiInsertInfo))
-						CopyMultiInsertInfoFlush(&multiInsertInfo, resultRelInfo);
+					if (MultiInsertInfoIsFull(&cstate->miinfo))
+						MultiInsertInfoFlush(&cstate->miinfo, resultRelInfo);
 				}
 				else
 				{
@@ -1124,8 +765,8 @@ CopyFrom(CopyFromState cstate)
 	/* Flush any remaining buffered tuples */
 	if (insertMethod != CIM_SINGLE)
 	{
-		if (!CopyMultiInsertInfoIsEmpty(&multiInsertInfo))
-			CopyMultiInsertInfoFlush(&multiInsertInfo, NULL);
+		if (!MultiInsertInfoIsEmpty(&cstate->miinfo))
+			MultiInsertInfoFlush(&cstate->miinfo, NULL);
 	}
 
 	/* Done, clean up */
@@ -1152,7 +793,7 @@ CopyFrom(CopyFromState cstate)
 
 	/* Tear down the multi-insert buffer data */
 	if (insertMethod != CIM_SINGLE)
-		CopyMultiInsertInfoCleanup(&multiInsertInfo);
+		MultiInsertInfoCleanup(&cstate->miinfo);
 
 	/* Close all the partitioned tables, leaf partitions, and their indices */
 	if (proute)
@@ -1341,7 +982,6 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->reached_eof = false;
 	cstate->eol_type = EOL_UNKNOWN;
 	cstate->cur_relname = RelationGetRelationName(cstate->rel);
-	cstate->cur_lineno = 0;
 	cstate->cur_attname = NULL;
 	cstate->cur_attval = NULL;
 
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index ce24a1528b..2cae98e8f7 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -426,14 +426,14 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
 	Assert(!cstate->opts.binary);
 
 	/* on input just throw the header line away */
-	if (cstate->cur_lineno == 0 && cstate->opts.header_line)
+	if (cstate->miinfo.cur_lineno == 0 && cstate->opts.header_line)
 	{
-		cstate->cur_lineno++;
+		cstate->miinfo.cur_lineno++;
 		if (CopyReadLine(cstate))
 			return false;		/* done */
 	}
 
-	cstate->cur_lineno++;
+	cstate->miinfo.cur_lineno++;
 
 	/* Actually read the line into memory here */
 	done = CopyReadLine(cstate);
@@ -574,7 +574,7 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 		int16		fld_count;
 		ListCell   *cur;
 
-		cstate->cur_lineno++;
+		cstate->miinfo.cur_lineno++;
 
 		if (!CopyGetInt16(cstate, &fld_count))
 		{
@@ -877,7 +877,7 @@ CopyReadLineText(CopyFromState cstate)
 			 * at all --- is cur_lineno a physical or logical count?)
 			 */
 			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
-				cstate->cur_lineno++;
+				cstate->miinfo.cur_lineno++;
 		}
 
 		/* Process \r */
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 0648dd82ba..19031a483b 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1233,7 +1233,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
 													 * ExecInitRoutingInfo */
 	resultRelInfo->ri_PartitionTupleSlot = NULL;	/* ditto */
 	resultRelInfo->ri_ChildToRootMap = NULL;
-	resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	resultRelInfo->ri_MultiInsertBuffer = NULL;
 }
 
 /*
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index b8da4c5967..b0f4b68b6e 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -1011,7 +1011,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 
 	Assert(partRelInfo->ri_BatchSize >= 1);
 
-	partRelInfo->ri_CopyMultiInsertBuffer = NULL;
+	partRelInfo->ri_MultiInsertBuffer = NULL;
 
 	/*
 	 * Keep track of it in the PartitionTupleRouting->partitions array.
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 2993ba43e3..0907b3ebd5 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -43,6 +43,7 @@
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "commands/trigger.h"
+#include "commands/copy.h"
 #include "executor/execPartition.h"
 #include "executor/executor.h"
 #include "executor/nodeModifyTable.h"
@@ -79,6 +80,8 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 											   ResultRelInfo *targetRelInfo,
 											   TupleTableSlot *slot,
 											   ResultRelInfo **partRelInfo);
+/* guc */
+int bulk_insert_ntuples = 1000;
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -397,6 +400,8 @@ ExecInsert(ModifyTableState *mtstate,
 	OnConflictAction onconflict = node->onConflictAction;
 	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
 	MemoryContext oldContext;
+	TupleTableSlot *batchslot = NULL;
+	bool	use_multi_insert = false;
 
 	/*
 	 * If the input result relation is a partitioned table, find the leaf
@@ -416,6 +421,66 @@ ExecInsert(ModifyTableState *mtstate,
 
 	resultRelationDesc = resultRelInfo->ri_RelationDesc;
 
+	/* Use bulk insert after a threshold number of tuples */
+	// XXX: maybe this should only be done if it's not a partitioned table or
+	// if the partitions don't support miinfo, which uses its own bistates
+	mtstate->ntuples++;
+	if (mtstate->bistate == NULL &&
+			mtstate->operation == CMD_INSERT &&
+			mtstate->ntuples > bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+	{
+		elog(DEBUG1, "enabling bulk insert");
+		mtstate->bistate = GetBulkInsertState();
+	}
+
+	if (!mtstate->miinfo)
+	{
+		/*
+		 * If multi-inserts aren't possible for this statement at all, so don't
+		 * check further
+		 */
+	} else if (proute == NULL)
+	{
+		if (mtstate->miinfo->ntuples++ >= bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+			use_multi_insert = true;
+	}
+	else
+	{
+		/*
+		 * If a partitioned table itself allows multi-insert, and bistate
+		 * indicates we've inserted the threshold number of tuples, check if
+		 * the partition also supports it.
+		 */
+
+		/* Determine which triggers exist on this partition */
+		// XXX copyfrom.c only checks triggers when the partition changes,
+		// so maybe use_multi_insert should be in mtstate ?
+		bool has_before_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
+				resultRelInfo->ri_TrigDesc->trig_insert_before_row);
+
+		bool has_instead_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
+				resultRelInfo->ri_TrigDesc->trig_insert_instead_row);
+
+		/*
+		 * Disable multi-inserts when the partition has BEFORE/INSTEAD
+		 * OF triggers, or if the partition is a foreign partition.
+		 * The number of tuples eligible for multi-insert is tracked separately
+		 * from the total number of tuples in case it's not supported for some
+		 * partitions.
+		 */
+		if (!has_before_insert_row_trig &&
+			!has_instead_insert_row_trig &&
+			resultRelInfo->ri_FdwRoutine == NULL &&
+			mtstate->miinfo->ntuples++ >= bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+			use_multi_insert = true;
+	}
+
+	if (use_multi_insert && mtstate->miinfo->ntuples - 1 == bulk_insert_ntuples)
+		elog(DEBUG1, "enabling multi insert");
+
 	/*
 	 * BEFORE ROW INSERT Triggers.
 	 *
@@ -651,7 +716,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   mtstate->bistate,
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -686,12 +751,39 @@ ExecInsert(ModifyTableState *mtstate,
 
 			/* Since there was no insertion conflict, we're done */
 		}
+		else if (use_multi_insert)
+		{
+			if (resultRelInfo->ri_MultiInsertBuffer == NULL)
+				MultiInsertInfoSetupBuffer(mtstate->miinfo, resultRelInfo);
+
+			batchslot = MultiInsertInfoNextFreeSlot(mtstate->miinfo, resultRelInfo);
+			ExecCopySlot(batchslot, slot);
+
+			MultiInsertInfoStore(mtstate->miinfo, resultRelInfo, batchslot, 0, 0); // XXX: tuplen/lineno
+
+			if (MultiInsertInfoIsFull(mtstate->miinfo))
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+		}
 		else
 		{
+			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
+			{
+				if (mtstate->bistate)
+					ReleaseBulkInsertStatePin(mtstate->bistate);
+				mtstate->prevResultRelInfo = resultRelInfo;
+			}
+
+			/*
+			 * Flush pending inserts if this partition can't use
+			 * batching, so rows are visible to triggers etc.
+			 */
+			if (mtstate->miinfo)
+				MultiInsertInfoFlush(mtstate->miinfo, resultRelInfo);
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -704,32 +796,36 @@ ExecInsert(ModifyTableState *mtstate,
 	if (canSetTag)
 		(estate->es_processed)++;
 
-	/*
-	 * If this insert is the result of a partition key update that moved the
-	 * tuple to a new partition, put this row into the transition NEW TABLE,
-	 * if there is one. We need to do this separately for DELETE and INSERT
-	 * because they happen on different tables.
-	 */
-	ar_insert_trig_tcs = mtstate->mt_transition_capture;
-	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
-		&& mtstate->mt_transition_capture->tcs_update_new_table)
+	/* Triggers were already run in the batch insert case */
+	if (batchslot == NULL)
 	{
-		ExecARUpdateTriggers(estate, resultRelInfo, NULL,
-							 NULL,
-							 slot,
-							 NULL,
-							 mtstate->mt_transition_capture);
-
 		/*
-		 * We've already captured the NEW TABLE row, so make sure any AR
-		 * INSERT trigger fired below doesn't capture it again.
+		 * If this insert is the result of a partition key update that moved the
+		 * tuple to a new partition, put this row into the transition NEW TABLE,
+		 * if there is one. We need to do this separately for DELETE and INSERT
+		 * because they happen on different tables.
 		 */
-		ar_insert_trig_tcs = NULL;
-	}
+		ar_insert_trig_tcs = mtstate->mt_transition_capture;
+		if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+			&& mtstate->mt_transition_capture->tcs_update_new_table)
+		{
+			ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+								 NULL,
+								 slot,
+								 NULL,
+								 mtstate->mt_transition_capture);
+
+			/*
+			 * We've already captured the NEW TABLE row, so make sure any AR
+			 * INSERT trigger fired below doesn't capture it again.
+			 */
+			ar_insert_trig_tcs = NULL;
+		}
 
-	/* AFTER ROW INSERT Triggers */
-	ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
-						 ar_insert_trig_tcs);
+		/* AFTER ROW INSERT Triggers */
+		ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
+							 ar_insert_trig_tcs);
+	}
 
 	list_free(recheckIndexes);
 
@@ -2372,6 +2468,45 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
 	mtstate->mt_nplans = nplans;
+	mtstate->bistate = NULL;
+
+	/*
+	 * Set miinfo if it can support multi-insert. This is the equivalent of
+	 * CIM_MULTI_* et al in copyfrom.c
+	 */
+
+	if (operation != CMD_INSERT ||
+			node->onConflictAction != ONCONFLICT_NONE)
+		mtstate->miinfo = NULL;
+	else if (mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
+			(mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_before_row ||
+			 // mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_after_row || // XXX or any row level triggers at all?
+			 mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_instead_row))
+		/*
+		 * Can't support multi-inserts when there are any BEFORE/INSTEAD OF
+		 * triggers on the table.
+		 */
+		mtstate->miinfo = NULL;
+	else if (node->rootRelation > 0 &&
+			mtstate->rootResultRelInfo->ri_TrigDesc != NULL &&
+			 mtstate->rootResultRelInfo->ri_TrigDesc->trig_insert_new_table)
+		/*
+		 * For partitioned tables we can't support multi-inserts when there
+		 * are any statement level insert triggers.
+		 */
+		mtstate->miinfo = NULL;
+	else if (mtstate->rootResultRelInfo->ri_FdwRoutine != NULL
+			/* || cstate->volatile_defexprs */ )
+		// XXX contain_volatile_functions_not_nextval((Node *) defexpr);
+		/* Can't support multi-inserts to foreign tables or if there are any */
+		mtstate->miinfo = NULL;
+	else
+	{
+		mtstate->miinfo = calloc(sizeof(*mtstate->miinfo), 1);
+		MultiInsertInfoInit(mtstate->miinfo, mtstate->rootResultRelInfo,
+				mtstate->mt_transition_capture,
+				estate, GetCurrentCommandId(true), 0);
+	}
 
 	/* set up epqstate with dummy subplan data for the moment */
 	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
@@ -2864,6 +2999,19 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
+	}
+
+	if (node->miinfo)
+	{
+		if (!MultiInsertInfoIsEmpty(node->miinfo))
+			 MultiInsertInfoFlush(node->miinfo, node->resultRelInfo); // root ?
+		MultiInsertInfoCleanup(node->miinfo);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 855076b1fd..250bbf3caa 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -54,6 +54,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "executor/nodeModifyTable.h"
 #include "optimizer/geqo.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/paths.h"
@@ -3455,6 +3456,15 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"bulk_insert_ntuples", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Enable bulk insertions after this number of tuples."),
+			gettext_noop("A ring buffer of limited size will be used and updates done in batch"),
+		},
+		&bulk_insert_ntuples,
+		1000, -1, INT_MAX,
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index 705f5b615b..aae0da3200 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -15,6 +15,7 @@
 #define COPYFROM_INTERNAL_H
 
 #include "commands/copy.h"
+#include "executor/nodeModifyTable.h"
 #include "commands/trigger.h"
 
 /*
@@ -91,10 +92,12 @@ typedef struct CopyFromStateData
 
 	/* these are just for error messages, see CopyFromErrorCallback */
 	const char *cur_relname;	/* table name for error messages */
-	uint64		cur_lineno;		/* line number for error messages */
 	const char *cur_attname;	/* current att for error messages */
 	const char *cur_attval;		/* current att value for error messages */
 
+	/* For bulk inserts and for error callback */
+	MultiInsertInfo miinfo;
+
 	/*
 	 * Working state
 	 */
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 83e2965531..30542a542a 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -13,8 +13,12 @@
 #ifndef NODEMODIFYTABLE_H
 #define NODEMODIFYTABLE_H
 
+#include "commands/trigger.h"
+#include "executor/executor.h" // XXX
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT int bulk_insert_ntuples;
+
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
 									   CmdType cmdtype);
@@ -23,4 +27,371 @@ extern ModifyTableState *ExecInitModifyTable(ModifyTable *node, EState *estate,
 extern void ExecEndModifyTable(ModifyTableState *node);
 extern void ExecReScanModifyTable(ModifyTableState *node);
 
+/* Bulk insert stuff which used to live in copy.c */
+
+/*
+ * No more than this many tuples per MultiInsertBuffer
+ *
+ * Caution: Don't make this too big, as we could end up with this many
+ * MultiInsertBuffer items stored in MultiInsertInfo's
+ * multiInsertBuffers list.  Increasing this can cause quadratic growth in
+ * memory requirements during copies into partitioned tables with a large
+ * number of partitions.
+ */
+#define MAX_BUFFERED_TUPLES		1000
+
+/*
+ * Flush buffers if there are >= this many bytes, as counted by the input
+ * size, of tuples stored.
+ */
+#define MAX_BUFFERED_BYTES		65535
+
+/* Trim the list of buffers back down to this number after flushing */
+#define MAX_PARTITION_BUFFERS	32
+
+/* Stores multi-insert data related to a single relation in CopyFrom. */
+typedef struct MultiInsertBuffer
+{
+	TupleTableSlot *slots[MAX_BUFFERED_TUPLES]; /* Array to store tuples */
+	ResultRelInfo *resultRelInfo;	/* ResultRelInfo for 'relid' */
+	BulkInsertState bistate;	/* BulkInsertState for this rel */
+	int			nused;			/* number of 'slots' containing tuples */
+	uint64		linenos[MAX_BUFFERED_TUPLES];	/* Line # of tuple in copy
+												 * stream */
+} MultiInsertBuffer;
+
+/*
+ * Stores one or many MultiInsertBuffers and details about the size and
+ * number of tuples which are stored in them.  This allows multiple buffers to
+ * exist at once when COPY/INSERTing into a partitioned table.
+ */
+typedef struct MultiInsertInfo
+{
+	List	   *multiInsertBuffers; /* List of tracked MultiInsertBuffers */
+	int			bufferedTuples; /* number of tuples buffered over all buffers */
+	int			bufferedBytes;	/* number of bytes from all buffered tuples */
+	TransitionCaptureState	*transition_capture;
+	EState	   *estate;			/* Executor state */
+	CommandId	mycid;			/* Command Id */
+	int			ti_options;		/* table insert options */
+	size_t		ntuples;		/* Number of rows *eligible* for multi-insert */
+
+	/* Line number for errors in copyfrom.c */
+	uint64		cur_lineno;
+	bool		line_buf_valid;
+} MultiInsertInfo;
+
+
+/*
+ * Allocate memory and initialize a new MultiInsertBuffer for this
+ * ResultRelInfo.
+ */
+static MultiInsertBuffer *
+MultiInsertBufferInit(ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = (MultiInsertBuffer *) palloc(sizeof(MultiInsertBuffer));
+	memset(buffer->slots, 0, sizeof(TupleTableSlot *) * MAX_BUFFERED_TUPLES);
+	buffer->resultRelInfo = rri;
+	buffer->bistate = GetBulkInsertState();
+	buffer->nused = 0;
+
+	return buffer;
+}
+
+/*
+ * Make a new buffer for this ResultRelInfo.
+ */
+static inline void
+MultiInsertInfoSetupBuffer(MultiInsertInfo *miinfo,
+							   ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer;
+
+	buffer = MultiInsertBufferInit(rri);
+
+	/* Setup back-link so we can easily find this buffer again */
+	rri->ri_MultiInsertBuffer = buffer;
+	/* Record that we're tracking this buffer */
+	miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+}
+
+/*
+ * Initialize an already allocated MultiInsertInfo.
+ *
+ * If rri is a non-partitioned table then a MultiInsertBuffer is set up
+ * for that table.
+ */
+static inline void
+MultiInsertInfoInit(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						TransitionCaptureState *transition_capture,
+						EState *estate, CommandId mycid, int ti_options)
+{
+	miinfo->multiInsertBuffers = NIL;
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+	miinfo->transition_capture = transition_capture;
+	miinfo->estate = estate;
+	miinfo->mycid = mycid;
+	miinfo->ti_options = ti_options;
+	miinfo->cur_lineno = 0;
+
+	/*
+	 * Only setup the buffer when not dealing with a partitioned table.
+	 * Buffers for partitioned tables will just be setup when we need to send
+	 * tuples their way for the first time.
+	 */
+	if (rri->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		MultiInsertInfoSetupBuffer(miinfo, rri);
+}
+
+/*
+ * Returns true if the buffers are full
+ */
+static inline bool
+MultiInsertInfoIsFull(MultiInsertInfo *miinfo)
+{
+	if (miinfo->bufferedTuples >= MAX_BUFFERED_TUPLES ||
+		miinfo->bufferedBytes >= MAX_BUFFERED_BYTES)
+		return true;
+	return false;
+}
+
+/*
+ * Returns true if we have no buffered tuples
+ */
+static inline bool
+MultiInsertInfoIsEmpty(MultiInsertInfo *miinfo)
+{
+	return miinfo->bufferedTuples == 0;
+}
+
+/*
+ * Write the tuples stored in 'buffer' out to the table.
+ */
+static inline void
+MultiInsertBufferFlush(MultiInsertInfo *miinfo,
+						   MultiInsertBuffer *buffer)
+{
+	MemoryContext oldcontext;
+	int			i;
+	uint64		save_cur_lineno;
+	EState	   *estate = miinfo->estate;
+	CommandId	mycid = miinfo->mycid;
+	int			ti_options = miinfo->ti_options;
+	bool		line_buf_valid = miinfo->line_buf_valid;
+	int			nused = buffer->nused;
+	ResultRelInfo *resultRelInfo = buffer->resultRelInfo;
+	TupleTableSlot **slots = buffer->slots;
+
+	/*
+	 * Print error context information correctly, if one of the operations
+	 * below fail.
+	 */
+	miinfo->line_buf_valid = false;
+	save_cur_lineno = miinfo->cur_lineno;
+
+	/*
+	 * table_multi_insert may leak memory, so switch to short-lived memory
+	 * context before calling it.
+	 */
+	oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); // XXX requires executor.h
+	table_multi_insert(resultRelInfo->ri_RelationDesc,
+					   slots,
+					   nused,
+					   mycid,
+					   ti_options,
+					   buffer->bistate);
+	MemoryContextSwitchTo(oldcontext);
+
+	for (i = 0; i < nused; i++)
+	{
+		/*
+		 * If there are any indexes, update them for all the inserted tuples,
+		 * and run AFTER ROW INSERT triggers.
+		 */
+		if (resultRelInfo->ri_NumIndices > 0)
+		{
+			List	   *recheckIndexes;
+
+			miinfo->cur_lineno = buffer->linenos[i];
+			recheckIndexes =
+				ExecInsertIndexTuples(resultRelInfo,
+									  buffer->slots[i], estate, false, false,
+									  NULL, NIL);
+
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], recheckIndexes,
+								 miinfo->transition_capture);
+			list_free(recheckIndexes);
+		}
+
+		/*
+		 * There's no indexes, but see if we need to run AFTER ROW INSERT
+		 * triggers anyway.
+		 */
+		else if (resultRelInfo->ri_TrigDesc != NULL &&
+				 (resultRelInfo->ri_TrigDesc->trig_insert_after_row ||
+				  resultRelInfo->ri_TrigDesc->trig_insert_new_table))
+		{
+			miinfo->cur_lineno = buffer->linenos[i];
+			ExecARInsertTriggers(estate, resultRelInfo,
+								 slots[i], NIL, miinfo->transition_capture);
+		}
+
+		ExecClearTuple(slots[i]);
+	}
+
+	/* Mark that all slots are free */
+	buffer->nused = 0;
+
+	/* reset cur_lineno and line_buf_valid to what they were */
+	miinfo->line_buf_valid = line_buf_valid;
+	miinfo->cur_lineno = save_cur_lineno;
+}
+
+/*
+ * Drop used slots and free member for this buffer.
+ *
+ * The buffer must be flushed before cleanup.
+ */
+static inline void
+MultiInsertBufferCleanup(MultiInsertInfo *miinfo,
+							 MultiInsertBuffer *buffer)
+{
+	int			i;
+
+	/* Ensure buffer was flushed */
+	Assert(buffer->nused == 0);
+
+	/* Remove back-link to ourself */
+	buffer->resultRelInfo->ri_MultiInsertBuffer = NULL;
+
+	FreeBulkInsertState(buffer->bistate);
+
+	/* Since we only create slots on demand, just drop the non-null ones. */
+	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
+		ExecDropSingleTupleTableSlot(buffer->slots[i]);
+
+	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
+							 miinfo->ti_options);
+
+	pfree(buffer);
+}
+
+/*
+ * Write out all stored tuples in all buffers out to the tables.
+ *
+ * Once flushed we also trim the tracked buffers list down to size by removing
+ * the buffers created earliest first.
+ *
+ * Callers should pass 'curr_rri' is the ResultRelInfo that's currently being
+ * used.  When cleaning up old buffers we'll never remove the one for
+ * 'curr_rri'.
+ */
+static inline void
+MultiInsertInfoFlush(MultiInsertInfo *miinfo, ResultRelInfo *curr_rri)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+	{
+		MultiInsertBuffer *buffer = (MultiInsertBuffer *) lfirst(lc);
+
+		MultiInsertBufferFlush(miinfo, buffer);
+	}
+
+	miinfo->bufferedTuples = 0;
+	miinfo->bufferedBytes = 0;
+
+	/*
+	 * Trim the list of tracked buffers down if it exceeds the limit.  Here we
+	 * remove buffers starting with the ones we created first.  It seems less
+	 * likely that these older ones will be needed than the ones that were
+	 * just created.
+	 */
+	while (list_length(miinfo->multiInsertBuffers) > MAX_PARTITION_BUFFERS)
+	{
+		MultiInsertBuffer *buffer;
+
+		buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+
+		/*
+		 * We never want to remove the buffer that's currently being used, so
+		 * if we happen to find that then move it to the end of the list.
+		 */
+		if (buffer->resultRelInfo == curr_rri)
+		{
+			miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+			miinfo->multiInsertBuffers = lappend(miinfo->multiInsertBuffers, buffer);
+			buffer = (MultiInsertBuffer *) linitial(miinfo->multiInsertBuffers);
+		}
+
+		MultiInsertBufferCleanup(miinfo, buffer);
+		miinfo->multiInsertBuffers = list_delete_first(miinfo->multiInsertBuffers);
+	}
+}
+
+/*
+ * Cleanup allocated buffers and free memory
+ */
+static inline void
+MultiInsertInfoCleanup(MultiInsertInfo *miinfo)
+{
+	ListCell   *lc;
+
+	foreach(lc, miinfo->multiInsertBuffers)
+		MultiInsertBufferCleanup(miinfo, lfirst(lc));
+
+	list_free(miinfo->multiInsertBuffers);
+}
+
+/*
+ * Get the next TupleTableSlot that the next tuple should be stored in.
+ *
+ * Callers must ensure that the buffer is not full.
+ *
+ * Note: 'miinfo' is unused but has been included for consistency with the
+ * other functions in this area.
+ */
+static inline TupleTableSlot *
+MultiInsertInfoNextFreeSlot(MultiInsertInfo *miinfo,
+								ResultRelInfo *rri)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+	int			nused = buffer->nused;
+
+	Assert(buffer != NULL);
+	Assert(nused < MAX_BUFFERED_TUPLES);
+
+	if (buffer->slots[nused] == NULL)
+		buffer->slots[nused] = table_slot_create(rri->ri_RelationDesc, NULL);
+	return buffer->slots[nused];
+}
+
+/*
+ * Record the previously reserved TupleTableSlot that was reserved by
+ * MultiInsertInfoNextFreeSlot as being consumed.
+ */
+static inline void
+MultiInsertInfoStore(MultiInsertInfo *miinfo, ResultRelInfo *rri,
+						 TupleTableSlot *slot, int tuplen, uint64 lineno)
+{
+	MultiInsertBuffer *buffer = rri->ri_MultiInsertBuffer;
+
+	Assert(buffer != NULL);
+	Assert(slot == buffer->slots[buffer->nused]);
+
+	/* Store the line number so we can properly report any errors later */
+	buffer->linenos[buffer->nused] = lineno;
+
+	/* Record this slot as being used */
+	buffer->nused++;
+
+	/* Update how many tuples are stored and their size */
+	miinfo->bufferedTuples++;
+	miinfo->bufferedBytes += tuplen;
+}
+
 #endif							/* NODEMODIFYTABLE_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index e31ad6204e..2aee650dd3 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -32,6 +33,9 @@
 #include "utils/tuplesort.h"
 #include "utils/tuplestore.h"
 
+/* This would be a circular inclusion */
+// #include "executor/nodeModifyTable.h"
+
 struct PlanState;				/* forward references in this file */
 struct ParallelHashJoinState;
 struct ExecRowMark;
@@ -39,8 +43,8 @@ struct ExprState;
 struct ExprContext;
 struct RangeTblEntry;			/* avoid including parsenodes.h here */
 struct ExprEvalStep;			/* avoid including execExpr.h everywhere */
-struct CopyMultiInsertBuffer;
-
+// struct MultiInsertBuffer;
+// struct MultiInsertInfo;
 
 /* ----------------
  *		ExprState node
@@ -511,8 +515,8 @@ typedef struct ResultRelInfo
 	 */
 	TupleConversionMap *ri_ChildToRootMap;
 
-	/* for use by copyfrom.c when performing multi-inserts */
-	struct CopyMultiInsertBuffer *ri_CopyMultiInsertBuffer;
+	/* for use by copyfrom.c/modifyTable when performing multi-inserts */
+	struct MultiInsertBuffer *ri_MultiInsertBuffer;
 } ResultRelInfo;
 
 /* ----------------
@@ -1178,6 +1182,10 @@ typedef struct ModifyTableState
 	List	  **mt_arowmarks;	/* per-subplan ExecAuxRowMark lists */
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT, when miinfo cannot be used */
+	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
+	struct MultiInsertInfo	*miinfo;
+	size_t		ntuples;	/* Number of tuples inserted; */
 
 	/*
 	 * Slot for storing tuples in the root partitioned table's rowtype during
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index da50ee3b67..bc4c1a4fc2 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -462,6 +462,49 @@ Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'),
             part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED,
             part_default DEFAULT, PARTITIONED
 
+-- bulk inserts
+truncate hash_parted;
+begin;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+ a  
+----
+ 12
+ 11
+(2 rows)
+
+-- exercise bulk insert to partitions
+SET client_min_messages=debug1;
+insert into hash_parted select generate_series(1,9999);
+DEBUG:  enabling bulk insert
+DEBUG:  enabling multi insert
+RESET client_min_messages;
+select count(1) from hash_parted;
+ count 
+-------
+ 10001
+(1 row)
+
+commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Index Only Scan using hpart1_a_idx on hpart1 hash_parted
+   Index Cond: (a = 13)
+(2 rows)
+
+select * from hash_parted where a=13;
+ a  
+----
+ 13
+(1 row)
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 963faa1614..a74eb3826a 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -280,6 +280,26 @@ from hash_parted order by part;
 -- partitions
 \d+ list_parted
 
+-- bulk inserts
+truncate hash_parted;
+begin;
+create index on hash_parted(a);
+-- make sure small inserts are flushed
+insert into hash_parted values(11);
+insert into hpart0 values(12);
+select * from hash_parted;
+-- exercise bulk insert to partitions
+SET client_min_messages=debug1;
+insert into hash_parted select generate_series(1,9999);
+RESET client_min_messages;
+select count(1) from hash_parted;
+commit;
+-- test that index was updated
+vacuum analyze hash_parted;
+explain(costs off)
+select * from hash_parted where a=13;
+select * from hash_parted where a=13;
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index e017557e3e..55e69bcd86 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -425,8 +425,6 @@ ConvertRowtypeExpr
 CookedConstraint
 CopyDest
 CopyInsertMethod
-CopyMultiInsertBuffer
-CopyMultiInsertInfo
 CopyState
 CopyStateData
 CopyStmt
@@ -1404,6 +1402,8 @@ ModifyTableState
 MorphOpaque
 MsgType
 MultiAssignRef
+MultiInsertBuffer
+MultiInsertInfo
 MultiSortSupport
 MultiSortSupportData
 MultiXactId

zyu@yugabyte.com

almost 5 years ago

In reply to: Ibrar Ahmed (#23)

Re: should INSERT SELECT use a BulkInsertState?

Hi,
+           mtstate->ntuples > bulk_insert_ntuples &&
+           bulk_insert_ntuples >= 0)

I wonder why bulk_insert_ntuples == 0 is included in the above. It
seems bulk_insert_ntuples having value of 0 should mean not enabling bulk
insertions.

+   }
+   else
+   {

nit: the else should be on the same line as the right brace.

Cheers

On Thu, Mar 18, 2021 at 6:38 AM Ibrar Ahmed <ibrar.ahmad@gmail.com> wrote:

Show quoted text

On Mon, Mar 8, 2021 at 2:18 PM houzj.fnst@fujitsu.com <
houzj.fnst@fujitsu.com> wrote:

I am very interested in this patch, and I plan to do some
experiments with the

patch.

Can you please rebase the patch because it seems can not applied to
the

master now.

Thanks for your interest.

I was sitting on a rebased version since the bulk FDW patch will cause
conflicts, and since this should maybe be built on top of the

table-am patch

(2871).

Have fun :)

Hi,

When I testing with the patch, I found I can not use "\d tablename".
It reports the following error, it this related to the patch?

Sorry, solved by re-initdb.

Best regards,
houzj

One of the patch

(v10-0001-INSERT-SELECT-to-use-BulkInsertState-and-multi_i.patch) from the
patchset does not apply.

http://cfbot.cputube.org/patch_32_2553.log

1 out of 13 hunks FAILED -- saving rejects to file
src/backend/commands/copyfrom.c.rej

It is a minor change, therefore I fixed that to make cfbot happy. Please
take a look if that works for you.

--
Ibrar Ahmed

houzj.fnst@fujitsu.com

houzj.fnst@fujitsu.com

almost 5 years ago

In reply to: Ibrar Ahmed (#23)

RE: should INSERT SELECT use a BulkInsertState?

Hi,

About the 0002-patch [Check for volatile defaults].

I wonder if we can check the volatile default value by traversing "query->targetList" in planner.

IMO, the column default expression was written into the targetList, and the current parallel-safety check
travere the "query->targetList" to determine whether it contains unsafe column default expression.
Like: standard_planner-> query_tree_walker
if (walker((Node *) query->targetList, context))
return true;
May be we can do the similar thing to check the volatile defaults, if so, we do not need to add a field to TargetEntry.

Best regards,
houzj

pryzby@telsasoft.com

over 4 years ago

In reply to: Zhihong Yu (#24)

1 attachment(s)

Re: should INSERT SELECT use a BulkInsertState?

On Mon, May 11, 2020 at 03:19:34PM +0900, Michael Paquier wrote:

On Fri, May 08, 2020 at 02:25:45AM -0500, Justin Pryzby wrote:

Seems to me it should, at least conditionally. At least if there's a function
scan or a relation or ..

I mentioned a bit about our use-case here:
/messages/by-id/20200219173742.GA30939@telsasoft.com
=> I'd prefer our loaders to write their own data rather than dirtying large
fractions of buffer cache and leaving it around for other backends to clean up.

Does it matter in terms of performance and for which cases does it
actually matter?

Every 15min we're inserting 10s of thousands of rows, which dirties a large
number of buffers:

postgres=# CREATE EXTENSION pg_buffercache; DROP TABLE tt; CREATE TABLE tt(i int); INSERT INTO tt SELECT generate_series(1,999999); SELECT usagecount,COUNT(1) FROM pg_buffercache WHERE isdirty GROUP BY 1 ORDER BY 1;
usagecount | count
------------+-------
1 | 1
2 | 1
3 | 2
4 | 2
5 | 4436

With this patch it dirties fewer pages and with lower usage count:

1 | 2052
2 | 1
3 | 3
4 | 2
5 | 10

The goal is to avoid cache churn by using a small ring buffer.
Note that indexes on the target table will themselves use up buffers, and
BulkInsert won't help so much.

On Thu, Mar 18, 2021 at 08:29:50AM -0700, Zhihong Yu wrote:

+           mtstate->ntuples > bulk_insert_ntuples &&
+           bulk_insert_ntuples >= 0)
I wonder why bulk_insert_ntuples == 0 is included in the above. It
seems bulk_insert_ntuples having value of 0 should mean not enabling bulk
insertions.

I think it ought to be possible to enable bulk insertions immediately, which is
what 0 does. -1 is the value defined to mean "do not use bulk insert".
I realize there's no documentation yet.

This patch started out with the goal of using a BulkInsertState for INSERT,
same as for COPY. We use INSERT ON CONFLICT VALUES(),()..., and it'd be nice
if our data loaders could avoid leaving behind dirty buffers.

Simon suggested to also use MultInsertInfo. However that patch is complicated:
it cannot work with volatile default expressions, and probably many other
things that could go in SELECT but not supported by miistate. That may be
better handled by another patch (or in any case by someone else) like this one:
| New Table Access Methods for Multi and Single Inserts
https://commitfest.postgresql.org/31/2871/

I split off the simple part again. If there's no interest in the 0001 patch
alone, then I guess it should be closed in the CF.

However, the "new table AM" patch doesn't address our case, since neither
VALUES nor INSERT SELECT are considered a bulk insert..

--
Justin

Attachments:

v11-0001-ExecInsert-to-use-BulkInsertState.patchtext/x-diff; charset=us-asciiDownload

From 19eea95d1eca89a90f3f1b2992e248e3e3459caa Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Tue, 27 Apr 2021 09:09:13 -0500
Subject: [PATCH v11] ExecInsert to use BulkInsertState

---
 src/backend/executor/nodeModifyTable.c | 34 ++++++++++++++++++++++++--
 src/backend/utils/misc/guc.c           |  9 +++++++
 src/include/executor/nodeModifyTable.h |  2 ++
 src/include/nodes/execnodes.h          |  5 ++++
 src/test/regress/expected/insert.out   | 15 ++++++++++++
 src/test/regress/sql/insert.sql        | 10 ++++++++
 6 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index d328856ae5..9ce81dc922 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -81,6 +81,8 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 											   ResultRelInfo *targetRelInfo,
 											   TupleTableSlot *slot,
 											   ResultRelInfo **partRelInfo);
+/* guc */
+int bulk_insert_ntuples = 1000;
 
 /*
  * Verify that the tuples to be produced by INSERT match the
@@ -625,6 +627,19 @@ ExecInsert(ModifyTableState *mtstate,
 
 	resultRelationDesc = resultRelInfo->ri_RelationDesc;
 
+	/* Use bulk insert after a threshold number of tuples */
+	// XXX: maybe this should only be done if it's not a partitioned table or
+	// if the partitions don't support miinfo, which uses its own bistates
+	mtstate->ntuples++;
+	if (mtstate->bistate == NULL &&
+			mtstate->ntuples > bulk_insert_ntuples &&
+			bulk_insert_ntuples >= 0)
+	{
+		elog(DEBUG1, "enabling bulk insert");
+		mtstate->bistate = GetBulkInsertState();
+	}
+
+
 	/*
 	 * Open the table's indexes, if we have not done so already, so that we
 	 * can add new index entries for the inserted tuple.
@@ -893,7 +908,7 @@ ExecInsert(ModifyTableState *mtstate,
 			table_tuple_insert_speculative(resultRelationDesc, slot,
 										   estate->es_output_cid,
 										   0,
-										   NULL,
+										   mtstate->bistate,
 										   specToken);
 
 			/* insert index entries for tuple */
@@ -930,10 +945,17 @@ ExecInsert(ModifyTableState *mtstate,
 		}
 		else
 		{
+			if (proute && mtstate->prevResultRelInfo != resultRelInfo)
+			{
+				if (mtstate->bistate)
+					ReleaseBulkInsertStatePin(mtstate->bistate);
+				mtstate->prevResultRelInfo = resultRelInfo;
+			}
+
 			/* insert the tuple normally */
 			table_tuple_insert(resultRelationDesc, slot,
 							   estate->es_output_cid,
-							   0, NULL);
+							   0, mtstate->bistate);
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
@@ -2736,6 +2758,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 	mtstate->resultRelInfo = (ResultRelInfo *)
 		palloc(nrels * sizeof(ResultRelInfo));
 
+	mtstate->bistate = NULL;
+
 	/*----------
 	 * Resolve the target relation. This is the same as:
 	 *
@@ -3196,6 +3220,12 @@ ExecEndModifyTable(ModifyTableState *node)
 		}
 	}
 
+	if (node->bistate)
+	{
+		FreeBulkInsertState(node->bistate);
+		table_finish_bulk_insert(node->rootResultRelInfo->ri_RelationDesc, 0);
+	}
+
 	/*
 	 * Close all the partitioned tables, leaf partitions, and their indices
 	 * and release the slot used for tuple routing, if set.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index d2ce4a8450..984e636c5d 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -51,6 +51,7 @@
 #include "commands/vacuum.h"
 #include "commands/variable.h"
 #include "common/string.h"
+#include "executor/nodeModifyTable.h"
 #include "funcapi.h"
 #include "jit/jit.h"
 #include "libpq/auth.h"
@@ -3569,6 +3570,14 @@ static struct config_int ConfigureNamesInt[] =
 		0, 0, INT_MAX,
 		check_client_connection_check_interval, NULL, NULL
 	},
+	{
+		{"bulk_insert_ntuples", PGC_USERSET, CLIENT_CONN_STATEMENT,
+			gettext_noop("Enable bulk insertions after this number of tuples."),
+			gettext_noop("A ring buffer of limited size will be used and updates done in batch"),
+		},
+		&bulk_insert_ntuples,
+		1000, -1, INT_MAX,
+	},
 
 	/* End-of-list marker */
 	{
diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h
index 83e2965531..479124899a 100644
--- a/src/include/executor/nodeModifyTable.h
+++ b/src/include/executor/nodeModifyTable.h
@@ -15,6 +15,8 @@
 
 #include "nodes/execnodes.h"
 
+extern PGDLLIMPORT int bulk_insert_ntuples;
+
 extern void ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
 									   EState *estate, TupleTableSlot *slot,
 									   CmdType cmdtype);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 37cb4f3d59..f2bd880d28 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -14,6 +14,7 @@
 #ifndef EXECNODES_H
 #define EXECNODES_H
 
+#include "access/heapam.h"
 #include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "fmgr.h"
@@ -1202,6 +1203,10 @@ typedef struct ModifyTableState
 	EPQState	mt_epqstate;	/* for evaluating EvalPlanQual rechecks */
 	bool		fireBSTriggers; /* do we need to fire stmt triggers? */
 
+	BulkInsertState	bistate;	/* state for bulk insert like INSERT SELECT, when miinfo cannot be used */
+	ResultRelInfo	*prevResultRelInfo; /* last child inserted with bistate */
+	size_t			ntuples;	/* Number of tuples inserted */
+
 	/*
 	 * These fields are used for inherited UPDATE and DELETE, to track which
 	 * target relation a given tuple is from.  If there are a lot of target
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index 5063a3dc22..ea1b15ff83 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -483,6 +483,21 @@ Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'),
             part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED,
             part_default DEFAULT, PARTITIONED
 
+-- bulk inserts
+truncate hash_parted;
+begin;
+-- exercise bulk insert to partitions
+SET client_min_messages=debug1;
+insert into hash_parted select generate_series(1,9999);
+DEBUG:  enabling bulk insert
+RESET client_min_messages;
+select count(1) from hash_parted;
+ count 
+-------
+  9999
+(1 row)
+
+commit;
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index bfaa8a3b27..1a792084c1 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -302,6 +302,16 @@ from hash_parted order by part;
 -- partitions
 \d+ list_parted
 
+-- bulk inserts
+truncate hash_parted;
+begin;
+-- exercise bulk insert to partitions
+SET client_min_messages=debug1;
+insert into hash_parted select generate_series(1,9999);
+RESET client_min_messages;
+select count(1) from hash_parted;
+commit;
+
 -- cleanup
 drop table range_parted, list_parted;
 drop table hash_parted;
-- 
2.17.0

Daniel Gustafsson

daniel@yesql.se

about 4 years ago

In reply to: Justin Pryzby (#26)

Re: should INSERT SELECT use a BulkInsertState?

On 27 Sep 2021, at 02:08, Justin Pryzby <pryzby@telsasoft.com> wrote:

I split off the simple part again. If there's no interest in the 0001 patch
alone, then I guess it should be closed in the CF.

Since the thread has stalled, maybe that's the best course of action here. Any
objections from anyone on the thread?

--
Daniel Gustafsson https://vmware.com/