From 6b646d96a9c2a310836693452deb2128636d1beb Mon Sep 17 00:00:00 2001
From: Atsushi Torikoshi <torikoshia@oss.nttdata.com>
Date: Mon, 27 Mar 2023 22:15:49 +0900
Subject: [PATCH v5] Add new COPY option IGNORE_DATATYPE_ERRORS

Currently entire COPY fails when there exists unexpected
data regarding data type or range.
In some cases, it would be useful to skip copying such data
and continue copying and IGNORE_DATATYPE_ERRORS does this.

This patch uses the soft error handling infrastructure, which
is introduced by d9f7f5d32f20.

Author: Damir Belyalov, Atsushi Torikoshi
---
 doc/src/sgml/ref/copy.sgml               | 13 ++++++++++
 src/backend/commands/copy.c              | 15 ++++++++++-
 src/backend/commands/copyfrom.c          | 32 ++++++++++++++++++++++++
 src/backend/commands/copyfromparse.c     | 17 ++++++++++---
 src/bin/psql/tab-complete.c              |  3 ++-
 src/include/commands/copy.h              |  1 +
 src/include/commands/copyfrom_internal.h |  3 +++
 src/test/regress/expected/copy2.out      | 22 ++++++++++++++++
 src/test/regress/sql/copy2.sql           | 19 ++++++++++++++
 9 files changed, 119 insertions(+), 6 deletions(-)

diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml
index 5e591ed2e6..cea56d65eb 100644
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -34,6 +34,7 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
 
     FORMAT <replaceable class="parameter">format_name</replaceable>
     FREEZE [ <replaceable class="parameter">boolean</replaceable> ]
+    IGNORE_DATATYPE_ERRORS [ <replaceable class="parameter">boolean</replaceable> ]
     DELIMITER '<replaceable class="parameter">delimiter_character</replaceable>'
     NULL '<replaceable class="parameter">null_string</replaceable>'
     HEADER [ <replaceable class="parameter">boolean</replaceable> | MATCH ]
@@ -234,6 +235,18 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+    <term><literal>IGNORE_DATATYPE_ERRORS</literal></term>
+    <listitem>
+     <para>
+      Drops rows that contain malformed data while copying. These are rows
+      with columns where the data type's input-function raises an error.
+      This option is not allowed when using binary format.  Note that this
+      is only supported in current <command>COPY</command> syntax.
+     </para>
+    </listitem>
+   </varlistentry>
+
    <varlistentry>
     <term><literal>DELIMITER</literal></term>
     <listitem>
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index f14fae3308..aa50cc1ee1 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -419,6 +419,7 @@ ProcessCopyOptions(ParseState *pstate,
 	bool		format_specified = false;
 	bool		freeze_specified = false;
 	bool		header_specified = false;
+	bool		ignore_datatype_errors_specified = false;
 	ListCell   *option;
 
 	/* Support external use for option sanity checking */
@@ -458,6 +459,13 @@ ProcessCopyOptions(ParseState *pstate,
 			freeze_specified = true;
 			opts_out->freeze = defGetBoolean(defel);
 		}
+		else if (strcmp(defel->defname, "ignore_datatype_errors") == 0)
+		{
+			if (ignore_datatype_errors_specified)
+				errorConflictingDefElem(defel, pstate);
+			ignore_datatype_errors_specified = true;
+			opts_out->ignore_datatype_errors = defGetBoolean(defel);
+		}
 		else if (strcmp(defel->defname, "delimiter") == 0)
 		{
 			if (opts_out->delim)
@@ -576,7 +584,7 @@ ProcessCopyOptions(ParseState *pstate,
 	}
 
 	/*
-	 * Check for incompatible options (must do these two before inserting
+	 * Check for incompatible options (must do these before inserting
 	 * defaults)
 	 */
 	if (opts_out->binary && opts_out->delim)
@@ -594,6 +602,11 @@ ProcessCopyOptions(ParseState *pstate,
 				(errcode(ERRCODE_SYNTAX_ERROR),
 				 errmsg("cannot specify DEFAULT in BINARY mode")));
 
+	if (opts_out->binary && opts_out->ignore_datatype_errors)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("cannot specify IGNORE_DATATYPE_ERRORS in BINARY mode")));
+
 	/* Set defaults for omitted options */
 	if (!opts_out->delim)
 		opts_out->delim = opts_out->csv_mode ? "," : "\t";
diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 80bca79cd0..9e23cd45d5 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -652,6 +652,7 @@ CopyFrom(CopyFromState cstate)
 	bool		has_before_insert_row_trig;
 	bool		has_instead_insert_row_trig;
 	bool		leafpart_use_multi_insert = false;
+	ErrorSaveContext escontext = {T_ErrorSaveContext};
 
 	Assert(cstate->rel);
 	Assert(list_length(cstate->range_table) == 1);
@@ -752,6 +753,13 @@ CopyFrom(CopyFromState cstate)
 		ti_options |= TABLE_INSERT_FROZEN;
 	}
 
+	/* Set up soft error handler for IGNORE_DATATYPE_ERRORS */
+	if (cstate->opts.ignore_datatype_errors)
+	{
+		escontext.details_wanted = true;
+		cstate->escontext = escontext;
+	}
+
 	/*
 	 * We need a ResultRelInfo so we can use the regular executor's
 	 * index-entry-making machinery.  (There used to be a huge amount of code
@@ -987,7 +995,31 @@ CopyFrom(CopyFromState cstate)
 
 		/* Directly store the values/nulls array in the slot */
 		if (!NextCopyFrom(cstate, econtext, myslot->tts_values, myslot->tts_isnull))
+		{
+			if (cstate->opts.ignore_datatype_errors && cstate->ignored_errors > 0)
+				ereport(WARNING,
+						errmsg("%zd rows were skipped due to data type incompatibility",
+							cstate->ignored_errors));
 			break;
+		}
+
+		/* Soft error occured, skip this tuple and cleanup the escontext */
+		if (cstate->escontext.error_occurred)
+		{
+			ErrorSaveContext new_escontext = {T_ErrorSaveContext};
+
+			/* adjust elevel so we don't jump out */
+			cstate->escontext.error_data->elevel = WARNING;
+			/* despite the name, this won't raise an error since elevel is WARNING now */
+			ThrowErrorData(cstate->escontext.error_data);
+
+			ExecClearTuple(myslot);
+
+			new_escontext.details_wanted = true;
+			cstate->escontext = new_escontext;
+
+			continue;
+		}
 
 		ExecStoreVirtualTuple(myslot);
 
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 3853902a16..6b0447782d 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -70,6 +70,7 @@
 #include "libpq/pqformat.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
+#include "nodes/miscnodes.h"
 #include "pgstat.h"
 #include "port/pg_bswap.h"
 #include "utils/builtins.h"
@@ -956,10 +957,18 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 				values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
 			}
 			else
-				values[m] = InputFunctionCall(&in_functions[m],
-											  string,
-											  typioparams[m],
-											  att->atttypmod);
+				/* If IGNORE_DATATYPE_ERRORS is enabled skip rows with datatype errors */
+				if (!InputFunctionCallSafe(&in_functions[m],
+										   string,
+										   typioparams[m],
+										   att->atttypmod,
+										   (Node *) &cstate->escontext,
+										   &values[m]))
+				{
+					cstate->ignored_errors++;
+
+					return true;
+				}
 
 			cstate->cur_attname = NULL;
 			cstate->cur_attval = NULL;
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 42e87b9e49..4b443d4ea7 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -2857,7 +2857,8 @@ psql_completion(const char *text, int start, int end)
 	else if (Matches("COPY|\\copy", MatchAny, "FROM|TO", MatchAny, "WITH", "("))
 		COMPLETE_WITH("FORMAT", "FREEZE", "DELIMITER", "NULL",
 					  "HEADER", "QUOTE", "ESCAPE", "FORCE_QUOTE",
-					  "FORCE_NOT_NULL", "FORCE_NULL", "ENCODING", "DEFAULT");
+					  "FORCE_NOT_NULL", "FORCE_NULL", "ENCODING", "DEFAULT",
+					  "IGNORE_DATATYPE_ERRORS");
 
 	/* Complete COPY <sth> FROM|TO filename WITH (FORMAT */
 	else if (Matches("COPY|\\copy", MatchAny, "FROM|TO", MatchAny, "WITH", "(", "FORMAT"))
diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h
index 33175868f6..c2e55ac21f 100644
--- a/src/include/commands/copy.h
+++ b/src/include/commands/copy.h
@@ -42,6 +42,7 @@ typedef struct CopyFormatOptions
 								 * -1 if not specified */
 	bool		binary;			/* binary format? */
 	bool		freeze;			/* freeze rows on loading? */
+	bool		ignore_datatype_errors;  /* ignore rows with datatype errors */
 	bool		csv_mode;		/* Comma Separated Value format? */
 	CopyHeaderChoice header_line;	/* header line? */
 	char	   *null_print;		/* NULL marker string (server encoding!) */
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index ac2c16f8b8..1164c71631 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -16,6 +16,7 @@
 
 #include "commands/copy.h"
 #include "commands/trigger.h"
+#include "nodes/miscnodes.h"
 
 /*
  * Represents the different source cases we need to worry about at
@@ -94,6 +95,8 @@ typedef struct CopyFromStateData
 								 * default value */
 	FmgrInfo   *in_functions;	/* array of input functions for each attrs */
 	Oid		   *typioparams;	/* array of element types for in_functions */
+	ErrorSaveContext escontext; /* soft error trapper during in_functions execution */
+	int64		ignored_errors;	/* total number of ignored errors */
 	int		   *defmap;			/* array of default att numbers related to
 								 * missing att */
 	ExprState **defexprs;		/* array of default att expressions for all
diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out
index 8e33eee719..99f70f44ed 100644
--- a/src/test/regress/expected/copy2.out
+++ b/src/test/regress/expected/copy2.out
@@ -82,6 +82,8 @@ COPY x to stdin (format BINARY, delimiter ',');
 ERROR:  cannot specify DELIMITER in BINARY mode
 COPY x to stdin (format BINARY, null 'x');
 ERROR:  cannot specify NULL in BINARY mode
+COPY x to stdin (format BINARY, ignore_datatype_errors);
+ERROR:  cannot specify IGNORE_DATATYPE_ERRORS in BINARY mode
 COPY x to stdin (format TEXT, force_quote(a));
 ERROR:  COPY force quote available only in CSV mode
 COPY x from stdin (format CSV, force_quote(a));
@@ -666,6 +668,25 @@ SELECT * FROM instead_of_insert_tbl;
 (2 rows)
 
 COMMIT;
+-- tests for IGNORE_DATATYPE_ERRORS option
+CREATE TABLE check_ign_err (n int, m int[], k int);
+COPY check_ign_err FROM STDIN WITH (IGNORE_DATATYPE_ERRORS);
+WARNING:  invalid input syntax for type integer: "a"
+WARNING:  value "3333333333" is out of range for type integer
+WARNING:  invalid input syntax for type integer: "a"
+WARNING:  invalid input syntax for type integer: ""
+WARNING:  4 rows were skipped due to data type incompatibility
+SELECT * FROM check_ign_err;
+ n |  m  | k 
+---+-----+---
+ 1 | {1} | 1
+ 5 | {5} | 5
+(2 rows)
+
+-- test missing data: should fail
+COPY check_ign_err FROM STDIN WITH (IGNORE_DATATYPE_ERRORS);
+ERROR:  missing data for column "k"
+CONTEXT:  COPY check_ign_err, line 1: "1	{1}"
 -- clean up
 DROP TABLE forcetest;
 DROP TABLE vistest;
@@ -680,6 +701,7 @@ DROP TABLE instead_of_insert_tbl;
 DROP VIEW instead_of_insert_tbl_view;
 DROP VIEW instead_of_insert_tbl_view_2;
 DROP FUNCTION fun_instead_of_insert_tbl();
+DROP TABLE check_ign_err;
 --
 -- COPY FROM ... DEFAULT
 --
diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql
index d759635068..e88f27bbfd 100644
--- a/src/test/regress/sql/copy2.sql
+++ b/src/test/regress/sql/copy2.sql
@@ -70,6 +70,7 @@ COPY x from stdin (encoding 'sql_ascii', encoding 'sql_ascii');
 -- incorrect options
 COPY x to stdin (format BINARY, delimiter ',');
 COPY x to stdin (format BINARY, null 'x');
+COPY x to stdin (format BINARY, ignore_datatype_errors);
 COPY x to stdin (format TEXT, force_quote(a));
 COPY x from stdin (format CSV, force_quote(a));
 COPY x to stdout (format TEXT, force_not_null(a));
@@ -464,6 +465,23 @@ test1
 SELECT * FROM instead_of_insert_tbl;
 COMMIT;
 
+-- tests for IGNORE_DATATYPE_ERRORS option
+CREATE TABLE check_ign_err (n int, m int[], k int);
+COPY check_ign_err FROM STDIN WITH (IGNORE_DATATYPE_ERRORS);
+1	{1}	1
+a	{2}	2
+3	{3}	3333333333
+4	{a, 4}	4
+
+5	{5}	5
+\.
+SELECT * FROM check_ign_err;
+
+-- test missing data: should fail
+COPY check_ign_err FROM STDIN WITH (IGNORE_DATATYPE_ERRORS);
+1	{1}
+\.
+
 -- clean up
 DROP TABLE forcetest;
 DROP TABLE vistest;
@@ -478,6 +496,7 @@ DROP TABLE instead_of_insert_tbl;
 DROP VIEW instead_of_insert_tbl_view;
 DROP VIEW instead_of_insert_tbl_view_2;
 DROP FUNCTION fun_instead_of_insert_tbl();
+DROP TABLE check_ign_err;
 
 --
 -- COPY FROM ... DEFAULT

base-commit: 36f40ce2dc66f1a36d6a12f7a0352e1c5bf1063e
-- 
2.25.1

