START/END line number for COPY FROM

Started by Surafel Temesgenabout 7 years ago7 messages
#1Surafel Temesgen
surafel3000@gmail.com
1 attachment(s)

Hi,

Currently we can skip header line on COPY FROM but having the ability to
skip and stop copying at any line can use to divide long copy operation and
enable to copy a subset of the file and skipping footer. Attach is a patch
for it

Regards

Surafel

Attachments:

copy_from_start_stop_line_v1.patchtext/x-patch; charset=US-ASCII; name=copy_from_start_stop_line_v1.patchDownload
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml
index 411941ed31..86f9a6a905 100644
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -43,6 +43,8 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
     FORCE_NOT_NULL ( <replaceable class="parameter">column_name</replaceable> [, ...] )
     FORCE_NULL ( <replaceable class="parameter">column_name</replaceable> [, ...] )
     ENCODING '<replaceable class="parameter">encoding_name</replaceable>'
+    START <replaceable class="parameter">starting_line_number</replaceable>
+    END <replaceable class="parameter">ending_line_number</replaceable>
 </synopsis>
  </refsynopsisdiv>
 
@@ -353,6 +355,24 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+    <term><literal>START</literal></term>
+    <listitem>
+     <para>
+      Specifies the line number to begin copying.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><literal>END</literal></term>
+    <listitem>
+     <para>
+      Specifies the line number to end copying.
+     </para>
+    </listitem>
+   </varlistentry>
+
   </variablelist>
  </refsect1>
 
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 4311e16007..4d07716a9f 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -121,6 +121,8 @@ typedef struct CopyStateData
 	int			file_encoding;	/* file or remote side's character encoding */
 	bool		need_transcoding;	/* file encoding diff from server? */
 	bool		encoding_embeds_ascii;	/* ASCII can be non-first byte? */
+	int		start_postion;	/* copying star line */
+	int		end_postion;	/* copying end line */
 
 	/* parameters from the COPY command */
 	Relation	rel;			/* relation to copy to or from */
@@ -347,6 +349,7 @@ static void CopySendInt32(CopyState cstate, int32 val);
 static bool CopyGetInt32(CopyState cstate, int32 *val);
 static void CopySendInt16(CopyState cstate, int16 val);
 static bool CopyGetInt16(CopyState cstate, int16 *val);
+static void skipLines(CopyState cstate);
 
 
 /*
@@ -1223,6 +1226,34 @@ ProcessCopyOptions(ParseState *pstate,
 								defel->defname),
 						 parser_errposition(pstate, defel->location)));
 		}
+		else if (strcmp(defel->defname, "start") == 0)
+		{
+			if (cstate->start_postion)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("conflicting or redundant options"),
+						 parser_errposition(pstate, defel->location)));
+			cstate->start_postion = defGetInt64(defel);
+			if (cstate->start_postion < 0)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid START line number"),
+						 parser_errposition(pstate, defel->location)));
+		}
+		else if (strcmp(defel->defname, "end") == 0)
+		{
+			if (cstate->end_postion)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("conflicting or redundant options"),
+						 parser_errposition(pstate, defel->location)));
+			cstate->end_postion = defGetInt64(defel);
+			if (cstate->end_postion < 0)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid END line number"),
+						 parser_errposition(pstate, defel->location)));
+		}
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
@@ -1373,6 +1404,13 @@ ProcessCopyOptions(ParseState *pstate,
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("CSV quote character must not appear in the NULL specification")));
+
+	if (cstate->end_postion != 0 &&
+		cstate->start_postion > cstate->end_postion)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("START line number can not be greater then END line number")));
+
 }
 
 /*
@@ -2317,6 +2355,7 @@ CopyFrom(CopyState cstate)
 	uint64		lastPartitionSampleLineNo = 0;
 	uint64		nPartitionChanges = 0;
 	double		avgTuplesPerPartChange = 0;
+	uint64		end_line = 1;
 
 	Assert(cstate->rel);
 
@@ -2619,6 +2658,20 @@ CopyFrom(CopyState cstate)
 	has_instead_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
 								   resultRelInfo->ri_TrigDesc->trig_insert_instead_row);
 
+	if (cstate->start_postion)
+	{
+		end_line = cstate->start_postion;
+		skipLines(cstate);
+	}
+
+	/* throw the header line away means start copying at second line */
+	if (cstate->start_postion == 0 && cstate->header_line)
+	{
+		cstate->start_postion = 2;
+		end_line = cstate->start_postion;
+		skipLines(cstate);
+	}
+
 	/*
 	 * Check BEFORE STATEMENT insertion triggers. It's debatable whether we
 	 * should do this for COPY, since it's not really an "INSERT" statement as
@@ -2644,6 +2697,9 @@ CopyFrom(CopyState cstate)
 		TupleTableSlot *slot;
 		bool		skip_tuple;
 
+		if (cstate->end_postion !=0 && cstate->end_postion < end_line++)
+			break;
+
 		CHECK_FOR_INTERRUPTS();
 
 		if (nBufferedTuples == 0)
@@ -3394,14 +3450,6 @@ NextCopyFromRawFields(CopyState cstate, char ***fields, int *nfields)
 	/* only available for text or csv input */
 	Assert(!cstate->binary);
 
-	/* on input just throw the header line away */
-	if (cstate->cur_lineno == 0 && cstate->header_line)
-	{
-		cstate->cur_lineno++;
-		if (CopyReadLine(cstate))
-			return false;		/* done */
-	}
-
 	cstate->cur_lineno++;
 
 	/* Actually read the line into memory here */
@@ -4086,6 +4134,22 @@ not_end_of_copy:
 	return result;
 }
 
+/*
+ * go to starting postion.
+ */
+static void
+skipLines(CopyState cstate)
+{
+	for (int i=1; i < cstate->start_postion; i++)
+	{
+		if (CopyReadLineText(cstate))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("START line can not be greater than the number of record in the file")));
+
+	}
+}
+
 /*
  *	Return decimal value for a hexadecimal digit
  */
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 2c2208ffb7..a16b91cf11 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -3093,6 +3093,14 @@ copy_opt_item:
 				{
 					$$ = makeDefElem("encoding", (Node *)makeString($2), @1);
 				}
+			| START Iconst
+				{
+					$$ = makeDefElem("start", (Node *)makeInteger($2), @1);
+				}
+			| END_P Iconst
+				{
+					$$ = makeDefElem("end", (Node *)makeInteger($2), @1);
+				}
 		;
 
 /* The following exist for backward compatibility with very old versions */
diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h
index b5cadce3ef..ae2d11226f 100644
--- a/src/include/commands/copy.h
+++ b/src/include/commands/copy.h
@@ -41,4 +41,5 @@ extern uint64 CopyFrom(CopyState cstate);
 
 extern DestReceiver *CreateCopyDestReceiver(void);
 
+
 #endif							/* COPY_H */
diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out
index 19bb538411..d01e3f5cf0 100644
--- a/src/test/regress/expected/copy2.out
+++ b/src/test/regress/expected/copy2.out
@@ -49,6 +49,9 @@ CONTEXT:  COPY x, line 1: "2002	232	40	50	60	70	80"
 COPY x (b, c, d, e) from stdin delimiter ',' null 'x';
 COPY x from stdin WITH DELIMITER AS ';' NULL AS '';
 COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING 'sql_ascii';
+COPY x (a, b, c, d, e) from stdin (START 2);
+COPY x (a, b, c, d, e) from stdin (END 3);
+COPY x (a, b, c, d, e) from stdin (START 2 , END 3);
 -- check results of copy in
 SELECT * FROM x;
    a   | b  |     c      |   d    |          e           
@@ -73,12 +76,21 @@ SELECT * FROM x;
   4006 |  6 | BackslashN | \N     | before trigger fired
   4007 |  7 | XX         | XX     | before trigger fired
   4008 |  8 | Delimiter  | :      | before trigger fired
+ 50002 | 23 | 33         | 43     | before trigger fired
+ 50003 | 24 | 34         | 44     | before trigger fired
+ 50004 | 25 | 35         | 45     | before trigger fired
+ 50005 | 26 | 36         | 46     | before trigger fired
+ 60001 | 22 | 32         | 42     | before trigger fired
+ 60002 | 23 | 33         | 43     | before trigger fired
+ 60003 | 24 | 34         | 44     | before trigger fired
+ 70002 | 23 | 33         | 43     | before trigger fired
+ 70003 | 24 | 34         | 44     | before trigger fired
      1 |  1 | stuff      | test_1 | after trigger fired
      2 |  2 | stuff      | test_2 | after trigger fired
      3 |  3 | stuff      | test_3 | after trigger fired
      4 |  4 | stuff      | test_4 | after trigger fired
      5 |  5 | stuff      | test_5 | after trigger fired
-(25 rows)
+(34 rows)
 
 -- check copy out
 COPY x TO stdout;
@@ -102,6 +114,15 @@ COPY x TO stdout;
 4006	6	BackslashN	\\N	before trigger fired
 4007	7	XX	XX	before trigger fired
 4008	8	Delimiter	:	before trigger fired
+50002	23	33	43	before trigger fired
+50003	24	34	44	before trigger fired
+50004	25	35	45	before trigger fired
+50005	26	36	46	before trigger fired
+60001	22	32	42	before trigger fired
+60002	23	33	43	before trigger fired
+60003	24	34	44	before trigger fired
+70002	23	33	43	before trigger fired
+70003	24	34	44	before trigger fired
 1	1	stuff	test_1	after trigger fired
 2	2	stuff	test_2	after trigger fired
 3	3	stuff	test_3	after trigger fired
@@ -128,6 +149,15 @@ N	before trigger fired
 BackslashN	before trigger fired
 XX	before trigger fired
 Delimiter	before trigger fired
+33	before trigger fired
+34	before trigger fired
+35	before trigger fired
+36	before trigger fired
+32	before trigger fired
+33	before trigger fired
+34	before trigger fired
+33	before trigger fired
+34	before trigger fired
 stuff	after trigger fired
 stuff	after trigger fired
 stuff	after trigger fired
@@ -154,6 +184,15 @@ I'm null	before trigger fired
 6	before trigger fired
 7	before trigger fired
 8	before trigger fired
+23	before trigger fired
+24	before trigger fired
+25	before trigger fired
+26	before trigger fired
+22	before trigger fired
+23	before trigger fired
+24	before trigger fired
+23	before trigger fired
+24	before trigger fired
 1	after trigger fired
 2	after trigger fired
 3	after trigger fired
diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql
index e36df8858e..92e576e0b2 100644
--- a/src/test/regress/sql/copy2.sql
+++ b/src/test/regress/sql/copy2.sql
@@ -95,6 +95,30 @@ COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING 'sql_ascii';
 4008:8:Delimiter:\::\:
 \.
 
+COPY x (a, b, c, d, e) from stdin (START 2);
+50001	22	32	42	52
+50002	23	33	43	53
+50003	24	34	44	54
+50004	25	35	45	55
+50005	26	36	46	56
+\.
+
+COPY x (a, b, c, d, e) from stdin (END 3);
+60001	22	32	42	52
+60002	23	33	43	53
+60003	24	34	44	54
+60004	25	35	45	55
+60005	26	36	46	56
+\.
+
+COPY x (a, b, c, d, e) from stdin (START 2 , END 3);
+70001	22	32	42	52
+70002	23	33	43	53
+70003	24	34	44	54
+70004	25	35	45	55
+70005	26	36	46	56
+\.
+
 -- check results of copy in
 SELECT * FROM x;
 
#2Tom Lane
tgl@sss.pgh.pa.us
In reply to: Surafel Temesgen (#1)
Re: START/END line number for COPY FROM

Surafel Temesgen <surafel3000@gmail.com> writes:

Currently we can skip header line on COPY FROM but having the ability to
skip and stop copying at any line can use to divide long copy operation and
enable to copy a subset of the file and skipping footer. Attach is a patch
for it

I do not think this is a good idea. We have resisted attempts to add
ETL-like features to COPY on the grounds that it would add complexity
and cost performance, and that that's not what COPY is for. This
seems to fall squarely in the domain of something you should be doing
with another tool.

regards, tom lane

#3Peter Eisentraut
peter.eisentraut@2ndquadrant.com
In reply to: Surafel Temesgen (#1)
Re: START/END line number for COPY FROM

On 20/12/2018 14:02, Surafel Temesgen wrote:

Currently we can skip header line on COPY FROM but having the ability to
skip and stop copying at any line can use to divide long copy operation
and enable to copy a subset of the file and skipping footer.

It seems a bit fragile to me if I want to skip a footer and need to
figure out the total line count, subtract one, and then oh, was it zero-
or one-based.

--
Peter Eisentraut http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#4Surafel Temesgen
surafel3000@gmail.com
In reply to: Peter Eisentraut (#3)
Re: START/END line number for COPY FROM

Hi,
On Fri, Jan 4, 2019 at 5:37 PM Peter Eisentraut <
peter.eisentraut@2ndquadrant.com> wrote:

It seems a bit fragile to me if I want to skip a footer and need to
figure out the total line count, subtract one, and then oh, was it zero-
or one-based.

But normally we don't say start copying from line number 0
regards
Surafel

#5David Rowley
david.rowley@2ndquadrant.com
In reply to: Surafel Temesgen (#1)
Re: START/END line number for COPY FROM

On Fri, 21 Dec 2018 at 02:02, Surafel Temesgen <surafel3000@gmail.com> wrote:

Currently we can skip header line on COPY FROM but having the ability to skip and stop copying at any line can use to divide long copy operation and enable to copy a subset of the file and skipping footer. Attach is a patch for it

I'm struggling a bit to see the sense in this. If you really want to
improve the performance of a long copy, then I think it makes more
sense to have performed the backup in multiple pieces in the first
place. Having the database read the input stream and ignore the first
N lines sounds like a bit of a waste of effort, and effort that
wouldn't be required if the COPY TO had been done in multiple pieces.

--
David Rowley http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#6Surafel Temesgen
surafel3000@gmail.com
In reply to: David Rowley (#5)
Re: START/END line number for COPY FROM

Hi,
On Sat, Jan 5, 2019 at 1:10 PM David Rowley <david.rowley@2ndquadrant.com>
wrote:

On Fri, 21 Dec 2018 at 02:02, Surafel Temesgen <surafel3000@gmail.com>
wrote:

Currently we can skip header line on COPY FROM but having the ability to

skip and stop copying at any line can use to divide long copy operation and
enable to copy a subset of the file and skipping footer. Attach is a patch
for it

I'm struggling a bit to see the sense in this. If you really want to
improve the performance of a long copy, then I think it makes more
sense to have performed the backup in multiple pieces in the first
place.

it is not always the case to have in control of the data importing it may
came from
external system

regards
Surafel

#7Peter Eisentraut
peter.eisentraut@2ndquadrant.com
In reply to: Surafel Temesgen (#6)
Re: START/END line number for COPY FROM

On 06/01/2019 12:59, Surafel Temesgen wrote:

it is not always the case to have in control of the data importing it
may came from
external system

But the problem that David described remains: If your data loading
requirement is so complicated that you need to load the file in chunks,
then doing it by line numbers will require you to skip over the leading
lines at every subsequent chunk. That's not going to be good for larger
files.

I think your problem needs a different solution.

--
Peter Eisentraut http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services