Force lookahead in COPY FROM parsing

Started by Heikki Linnakangasalmost 5 years ago6 messages

hlinnaka@iki.fi

almost 5 years ago

1 attachment(s)

I posted this earlier at
/messages/by-id/9ec25819-0a8a-d51a-17dc-4150bb3cca3b@iki.fi,
and that led to removing FE/BE protocol version 2 support. That's been
committed now, so here's COPY FROM patch again, rebased. To recap:

Previously COPY FROM could not look ahead in the COPY stream, because in
the v2 protocol, it had to detect the end-of-copy marker correctly. With
v2 protocol gone, that's no longer an issue, and we can simplify the
parsing slightly. Simpler should also mean faster, but I haven't tried
that measuring that.

- Heikki

Attachments:

v2-0001-Simplify-COPY-FROM-parsing-by-forcing-lookahead.patchtext/x-patch; charset=UTF-8; name=v2-0001-Simplify-COPY-FROM-parsing-by-forcing-lookahead.patchDownload

From 82680ff836fd5b43d1f83ed01e490d831ffa2b09 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Thu, 4 Mar 2021 11:10:39 +0200
Subject: [PATCH v2 1/1] Simplify COPY FROM parsing by forcing lookahead.

Now that we don't support the old-style COPY protocol anymore, we can
force four bytes of lookahead like was suggested in an existing comment,
and simplify the loop in CopyReadLineText().

Discussion: https://www.postgresql.org/message-id/9ec25819-0a8a-d51a-17dc-4150bb3cca3b%40iki.fi
---
 src/backend/commands/copyfromparse.c     | 119 ++++++++---------------
 src/include/commands/copyfrom_internal.h |   3 +-
 2 files changed, 40 insertions(+), 82 deletions(-)

diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index ce24a1528bd..c2efe7e0782 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -46,21 +46,6 @@
  * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
  */
 
-/*
- * This keeps the character read at the top of the loop in the buffer
- * even if there is more than one read-ahead.
- */
-#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
-if (1) \
-{ \
-	if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
-	{ \
-		raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \
-		need_data = true; \
-		continue; \
-	} \
-} else ((void) 0)
-
 /* This consumes the remainder of the buffer and breaks */
 #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
 if (1) \
@@ -118,7 +103,7 @@ static int	CopyGetData(CopyFromState cstate, void *databuf,
 						int minread, int maxread);
 static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
 static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
-static bool CopyLoadRawBuf(CopyFromState cstate);
+static bool CopyLoadRawBuf(CopyFromState cstate, int minread);
 static int	CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
 
 void
@@ -209,7 +194,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not read from COPY file: %m")));
-			if (bytesread == 0)
+			if (bytesread < maxread)
 				cstate->reached_eof = true;
 			break;
 		case COPY_FRONTEND:
@@ -278,6 +263,8 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 			break;
 		case COPY_CALLBACK:
 			bytesread = cstate->data_source_cb(databuf, minread, maxread);
+			if (bytesread < minread)
+				cstate->reached_eof = true;
 			break;
 	}
 
@@ -329,14 +316,13 @@ CopyGetInt16(CopyFromState cstate, int16 *val)
 /*
  * CopyLoadRawBuf loads some more data into raw_buf
  *
- * Returns true if able to obtain at least one more byte, else false.
+ * Returns true if able to obtain at least 'minread' bytes, else false.
  *
  * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
- * of the buffer and then we load more data after that.  This case occurs only
- * when a multibyte character crosses a bufferload boundary.
+ * of the buffer and then we load more data after that.
  */
 static bool
-CopyLoadRawBuf(CopyFromState cstate)
+CopyLoadRawBuf(CopyFromState cstate, int minread)
 {
 	int			nbytes = RAW_BUF_BYTES(cstate);
 	int			inbytes;
@@ -347,14 +333,15 @@ CopyLoadRawBuf(CopyFromState cstate)
 				nbytes);
 
 	inbytes = CopyGetData(cstate, cstate->raw_buf + nbytes,
-						  1, RAW_BUF_SIZE - nbytes);
+						  minread, RAW_BUF_SIZE - nbytes);
 	nbytes += inbytes;
 	cstate->raw_buf[nbytes] = '\0';
 	cstate->raw_buf_index = 0;
 	cstate->raw_buf_len = nbytes;
 	cstate->bytes_processed += inbytes;
 	pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
-	return (inbytes > 0);
+
+	return (inbytes >= minread);
 }
 
 /*
@@ -389,7 +376,7 @@ CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
 			/* Load more data if buffer is empty. */
 			if (RAW_BUF_BYTES(cstate) == 0)
 			{
-				if (!CopyLoadRawBuf(cstate))
+				if (!CopyLoadRawBuf(cstate, 1))
 					break;		/* EOF */
 			}
 
@@ -678,7 +665,7 @@ CopyReadLine(CopyFromState cstate)
 			do
 			{
 				cstate->raw_buf_index = cstate->raw_buf_len;
-			} while (CopyLoadRawBuf(cstate));
+			} while (CopyLoadRawBuf(cstate, 1));
 		}
 	}
 	else
@@ -747,7 +734,6 @@ CopyReadLineText(CopyFromState cstate)
 	char	   *copy_raw_buf;
 	int			raw_buf_ptr;
 	int			copy_buf_len;
-	bool		need_data = false;
 	bool		hit_eof = false;
 	bool		result = false;
 	char		mblen_str[2];
@@ -794,6 +780,7 @@ CopyReadLineText(CopyFromState cstate)
 	copy_raw_buf = cstate->raw_buf;
 	raw_buf_ptr = cstate->raw_buf_index;
 	copy_buf_len = cstate->raw_buf_len;
+	hit_eof = cstate->reached_eof;
 
 	for (;;)
 	{
@@ -801,38 +788,40 @@ CopyReadLineText(CopyFromState cstate)
 		char		c;
 
 		/*
-		 * Load more data if needed.  Ideally we would just force four bytes
-		 * of read-ahead and avoid the many calls to
-		 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol
-		 * does not allow us to read too far ahead or we might read into the
-		 * next data, so we read-ahead only as far we know we can.  One
-		 * optimization would be to read-ahead four byte here if
-		 * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it,
-		 * considering the size of the buffer.
+		 * Load more data if needed.
+		 *
+		 * We need to look ahead max three bytes in one iteration of the loop
+		 * (for the sequence \.<CR><NL>), so make sure we have at least four
+		 * bytes in the buffer.  Note that we always guarantee that there is
+		 * one \0 in the buffer, after last valid byte; the lookaheads below
+		 * rely on that.
 		 */
-		if (raw_buf_ptr >= copy_buf_len || need_data)
+#define COPY_READ_LINE_LOOKAHEAD	4
+		if (raw_buf_ptr + COPY_READ_LINE_LOOKAHEAD >= copy_buf_len)
 		{
-			REFILL_LINEBUF;
+			if (!hit_eof)
+			{
+				REFILL_LINEBUF;
 
-			/*
-			 * Try to read some more data.  This will certainly reset
-			 * raw_buf_index to zero, and raw_buf_ptr must go with it.
-			 */
-			if (!CopyLoadRawBuf(cstate))
-				hit_eof = true;
-			raw_buf_ptr = 0;
-			copy_buf_len = cstate->raw_buf_len;
+				/*
+				 * Try to read some more data.  This will certainly reset
+				 * raw_buf_index to zero, and raw_buf_ptr must go with it.
+				 */
+				if (!CopyLoadRawBuf(cstate, COPY_READ_LINE_LOOKAHEAD))
+					hit_eof = true;
+				raw_buf_ptr = 0;
+				copy_buf_len = cstate->raw_buf_len;
+			}
 
 			/*
 			 * If we are completely out of data, break out of the loop,
 			 * reporting EOF.
 			 */
-			if (copy_buf_len <= 0)
+			if (copy_buf_len - raw_buf_ptr <= 0)
 			{
 				result = true;
 				break;
 			}
-			need_data = false;
 		}
 
 		/* OK to fetch a character */
@@ -841,20 +830,6 @@ CopyReadLineText(CopyFromState cstate)
 
 		if (cstate->opts.csv_mode)
 		{
-			/*
-			 * If character is '\\' or '\r', we may need to look ahead below.
-			 * Force fetch of the next character if we don't already have it.
-			 * We need to do this before changing CSV state, in case one of
-			 * these characters is also the quote or escape character.
-			 *
-			 * Note: old-protocol does not like forced prefetch, but it's OK
-			 * here since we cannot validly be at EOF.
-			 */
-			if (c == '\\' || c == '\r')
-			{
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-			}
-
 			/*
 			 * Dealing with quotes and escapes here is mildly tricky. If the
 			 * quote char is also the escape char, there's no problem - we
@@ -888,14 +863,9 @@ CopyReadLineText(CopyFromState cstate)
 				cstate->eol_type == EOL_CRNL)
 			{
 				/*
-				 * If need more data, go back to loop top to load it.
-				 *
-				 * Note that if we are at EOF, c will wind up as '\0' because
-				 * of the guaranteed pad of raw_buf.
+				 * Look at the next character.  If we're at EOF, c2 will wind
+				 * up as '\0' because of the guaranteed pad of raw_buf.
 				 */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-
-				/* get next char */
 				c = copy_raw_buf[raw_buf_ptr];
 
 				if (c == '\n')
@@ -961,7 +931,6 @@ CopyReadLineText(CopyFromState cstate)
 		{
 			char		c2;
 
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 			IF_NEED_REFILL_AND_EOF_BREAK(0);
 
 			/* -----
@@ -976,16 +945,9 @@ CopyReadLineText(CopyFromState cstate)
 			{
 				raw_buf_ptr++;	/* consume the '.' */
 
-				/*
-				 * Note: if we loop back for more data here, it does not
-				 * matter that the CSV state change checks are re-executed; we
-				 * will come back here with no important state changed.
-				 */
 				if (cstate->eol_type == EOL_CRNL)
 				{
-					/* Get the next character */
-					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-					/* if hit_eof, c2 will become '\0' */
+					/* Get next character.  If hit_eof, c2 will become '\0' */
 					c2 = copy_raw_buf[raw_buf_ptr++];
 
 					if (c2 == '\n')
@@ -1008,9 +970,7 @@ CopyReadLineText(CopyFromState cstate)
 					}
 				}
 
-				/* Get the next character */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-				/* if hit_eof, c2 will become '\0' */
+				/* Get next character.  If hit_eof, c2 will become '\0' */
 				c2 = copy_raw_buf[raw_buf_ptr++];
 
 				if (c2 != '\r' && c2 != '\n')
@@ -1095,7 +1055,6 @@ not_end_of_copy:
 			mblen_str[0] = c;
 			mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
 
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
 			IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
 			raw_buf_ptr += mblen - 1;
 		}
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index 705f5b615be..c088c4facdb 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -70,8 +70,7 @@ typedef struct CopyFromStateData
 	CopySource	copy_src;		/* type of copy source */
 	FILE	   *copy_file;		/* used if copy_src == COPY_FILE */
 	StringInfo	fe_msgbuf;		/* used if copy_src == COPY_NEW_FE */
-	bool		reached_eof;	/* true if we read to end of copy data (not
-								 * all copy_src types maintain this) */
+	bool		reached_eof;	/* true if we read to end of copy data */
 
 	EolType		eol_type;		/* EOL type of input */
 	int			file_encoding;	/* file or remote side's character encoding */
-- 
2.30.1

John Naylor

john.naylor@enterprisedb.com

almost 5 years ago

In reply to: Heikki Linnakangas (#1)

Re: Force lookahead in COPY FROM parsing

On Thu, Mar 4, 2021 at 5:13 AM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

I posted this earlier at

/messages/by-id/9ec25819-0a8a-d51a-17dc-4150bb3cca3b@iki.fi
,

and that led to removing FE/BE protocol version 2 support. That's been
committed now, so here's COPY FROM patch again, rebased.

Looks good to me. Just a couple minor things:

+ * Look at the next character.  If we're at EOF, c2 will wind
+ * up as '\0' because of the guaranteed pad of raw_buf.
  */
- IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-
- /* get next char */
  c = copy_raw_buf[raw_buf_ptr];

The new comment seems copy-pasted from the c2 statements further down.

- if (raw_buf_ptr >= copy_buf_len || need_data)
+#define COPY_READ_LINE_LOOKAHEAD 4
+ if (raw_buf_ptr + COPY_READ_LINE_LOOKAHEAD >= copy_buf_len)

Is this #define deliberately put down here rather than at the top of the
file?

- * of the buffer and then we load more data after that.  This case occurs
only
- * when a multibyte character crosses a bufferload boundary.
+ * of the buffer and then we load more data after that.

Is the removed comment really invalidated by this patch? I figured it was
something not affected until the patch to do the encoding conversion in
larger chunks.

--
John Naylor
EDB: http://www.enterprisedb.com

John Naylor

john.naylor@enterprisedb.com

almost 5 years ago

In reply to: John Naylor (#2)

Re: Force lookahead in COPY FROM parsing

The cfbot thinks this patch no longer applies, but it works for me, so
still set to RFC. It looks like it's because the thread to remove the v2
FE/BE protocol was still attached to the commitfest entry. I've deleted
that, so let's see if that helps.

To answer the side question of whether it makes any difference in
performance, I used the blackhole AM [1]https://github.com/michaelpq/pg_plugins/tree/master/blackhole_am -- John Naylor EDB: http://www.enterprisedb.com to isolate the copy code path as
much as possible. Forcing lookahead seems to not make a noticeable
difference (min of 5 runs):

master:
306ms

force lookahead:
304ms

[1]: https://github.com/michaelpq/pg_plugins/tree/master/blackhole_am -- John Naylor EDB: http://www.enterprisedb.com
--
John Naylor
EDB: http://www.enterprisedb.com

Heikki Linnakangas

hlinnaka@iki.fi

almost 5 years ago

In reply to: John Naylor (#3)

1 attachment(s)

Re: Force lookahead in COPY FROM parsing

On 18/03/2021 17:09, John Naylor wrote:

The cfbot thinks this patch no longer applies, but it works for me, so
still set to RFC. It looks like it's because the thread to remove the v2
FE/BE protocol was still attached to the commitfest entry. I've deleted
that, so let's see if that helps.

It was now truly bitrotted by commit f82de5c46b (the encoding conversion
changes). Rebase attached.

To answer the side question of whether it makes any difference in
performance, I used the blackhole AM [1] to isolate the copy code path
as much as possible. Forcing lookahead seems to not make a noticeable
difference (min of 5 runs):

master:
306ms

force lookahead:
304ms

I forgot to mention the small detail of what the test was:

create extension blackhole_am;
create table blackhole_tab (a text) using blackhole_am ;
copy blackhole_tab from program 'for i in {1..100}; do cat /path/to/UTF-8\ Sampler.htm ; done;' ;

...where the .htm file is at http://kermitproject.org/utf8.html

Ok, I wouldn't expect to see much difference in that test, it gets
drowned in all the other parsing overhead. I tested this now with this:

copy (select repeat('x', 10000) from generate_series(1, 100000)) to
'/tmp/copydata-x.txt'
create table blackhole_tab (a text);

copy blackhole_tab from '/tmp/copydata-x.txt' where false;

I took the min of 5 runs of that COPY FROM statement:

master:
4107 ms

v3-0001-Simplify-COPY-FROM-parsing-by-forcing-lookahead.patch:
3172 ms

I was actually surprised it was so effective on that test, I expected a
small but noticeable gain. But I'll take it.

Replying to your earlier comments (sorry for the delay):

Looks good to me. Just a couple minor things:
+ * Look at the next character.  If we're at EOF, c2 will wind
+ * up as '\0' because of the guaranteed pad of raw_buf.
*/
- IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-
- /* get next char */
c = copy_raw_buf[raw_buf_ptr];
The new comment seems copy-pasted from the c2 statements further down.

Fixed.

- if (raw_buf_ptr >= copy_buf_len || need_data)
+#define COPY_READ_LINE_LOOKAHEAD 4
+ if (raw_buf_ptr + COPY_READ_LINE_LOOKAHEAD >= copy_buf_len)
Is this #define deliberately put down here rather than at the top of the file?

Yeah, it's only used right here locally. Matter of taste, but I'd prefer
to keep it here.

- * of the buffer and then we load more data after that.  This case occurs only
- * when a multibyte character crosses a bufferload boundary.
+ * of the buffer and then we load more data after that.
Is the removed comment really invalidated by this patch? I figured it was something not affected until the patch to do the encoding conversion in larger chunks.

Not sure anymore, but this is moot now, since the other patch was committed.

Thanks for the review and the testing!

- Heikki

Attachments:

v3-0001-Simplify-COPY-FROM-parsing-by-forcing-lookahead.patchtext/x-patch; charset=UTF-8; name=v3-0001-Simplify-COPY-FROM-parsing-by-forcing-lookahead.patchDownload

From 463ac83e337340a78179502a23c2e775a44afcbd Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Thu, 1 Apr 2021 23:22:05 +0300
Subject: [PATCH v3 1/1] Simplify COPY FROM parsing by forcing lookahead.

Now that we don't support the old-style COPY protocol anymore, we can
force four bytes of lookahead like was suggested in an existing comment,
and simplify the loop in CopyReadLineText().

Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/89627a2a-c123-a8aa-267e-5d168feb73dd%40iki.fi
---
 src/backend/commands/copyfromparse.c | 111 +++++++++------------------
 1 file changed, 35 insertions(+), 76 deletions(-)

diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 0813424768..b52abc1520 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -90,21 +90,6 @@
  * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
  */
 
-/*
- * This keeps the character read at the top of the loop in the buffer
- * even if there is more than one read-ahead.
- */
-#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
-if (1) \
-{ \
-	if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
-	{ \
-		input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
-		need_data = true; \
-		continue; \
-	} \
-} else ((void) 0)
-
 /* This consumes the remainder of the buffer and breaks */
 #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
 if (1) \
@@ -159,7 +144,7 @@ static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
 
 /* Low-level communications functions */
 static int	CopyGetData(CopyFromState cstate, void *databuf,
-						int minread, int maxread);
+						int maxread);
 static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
 static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
 static void CopyLoadInputBuf(CopyFromState cstate);
@@ -230,9 +215,8 @@ ReceiveCopyBinaryHeader(CopyFromState cstate)
 /*
  * CopyGetData reads data from the source (file or frontend)
  *
- * We attempt to read at least minread, and at most maxread, bytes from
- * the source.  The actual number of bytes read is returned; if this is
- * less than minread, EOF was detected.
+ * We attempt to read at maxread bytes from the source.  The actual
+ * number of bytes read is returned; if this is 0, EOF was detected.
  *
  * Note: when copying from the frontend, we expect a proper EOF mark per
  * protocol; if the frontend simply drops the connection, we raise error.
@@ -241,7 +225,7 @@ ReceiveCopyBinaryHeader(CopyFromState cstate)
  * NB: no data conversion is applied here.
  */
 static int
-CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
+CopyGetData(CopyFromState cstate, void *databuf, int maxread)
 {
 	int			bytesread = 0;
 
@@ -257,7 +241,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 				cstate->raw_reached_eof = true;
 			break;
 		case COPY_FRONTEND:
-			while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
+			while (maxread > 0 && bytesread == 0 && !cstate->raw_reached_eof)
 			{
 				int			avail;
 
@@ -321,7 +305,9 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 			}
 			break;
 		case COPY_CALLBACK:
-			bytesread = cstate->data_source_cb(databuf, minread, maxread);
+			bytesread = cstate->data_source_cb(databuf, 1, maxread);
+			if (bytesread == 0)
+				cstate->raw_reached_eof = true;
 			break;
 	}
 
@@ -605,7 +591,7 @@ CopyLoadRawBuf(CopyFromState cstate)
 
 	/* Load more data */
 	inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
-						  1, RAW_BUF_SIZE - cstate->raw_buf_len);
+						  RAW_BUF_SIZE - cstate->raw_buf_len);
 	nbytes += inbytes;
 	cstate->raw_buf[nbytes] = '\0';
 	cstate->raw_buf_len = nbytes;
@@ -990,7 +976,7 @@ CopyReadLine(CopyFromState cstate)
 			do
 			{
 				inbytes = CopyGetData(cstate, cstate->input_buf,
-									  1, INPUT_BUF_SIZE);
+									  INPUT_BUF_SIZE);
 			} while (inbytes > 0);
 			cstate->input_buf_index = 0;
 			cstate->input_buf_len = 0;
@@ -1047,8 +1033,7 @@ CopyReadLineText(CopyFromState cstate)
 	char	   *copy_input_buf;
 	int			input_buf_ptr;
 	int			copy_buf_len;
-	bool		need_data = false;
-	bool		hit_eof = false;
+	bool		hit_eof;
 	bool		result = false;
 
 	/* CSV variables */
@@ -1098,6 +1083,7 @@ CopyReadLineText(CopyFromState cstate)
 	copy_input_buf = cstate->input_buf;
 	input_buf_ptr = cstate->input_buf_index;
 	copy_buf_len = cstate->input_buf_len;
+	hit_eof = cstate->input_reached_eof;
 
 	for (;;)
 	{
@@ -1105,35 +1091,37 @@ CopyReadLineText(CopyFromState cstate)
 		char		c;
 
 		/*
-		 * Load more data if needed.  Ideally we would just force four bytes
-		 * of read-ahead and avoid the many calls to
-		 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol
-		 * does not allow us to read too far ahead or we might read into the
-		 * next data, so we read-ahead only as far we know we can.  One
-		 * optimization would be to read-ahead four byte here if
-		 * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it,
-		 * considering the size of the buffer.
+		 * Load more data if needed.
+		 *
+		 * We need to look ahead max three bytes in one iteration of the loop
+		 * (for the sequence \.<CR><NL>), so make sure we have at least four
+		 * bytes in the buffer.  Note that we always guarantee that there is
+		 * one \0 in the buffer, after last valid byte; the lookaheads below
+		 * rely on that.
 		 */
-		if (input_buf_ptr >= copy_buf_len || need_data)
+#define COPY_READ_LINE_LOOKAHEAD	4
+		if (input_buf_ptr + COPY_READ_LINE_LOOKAHEAD >= copy_buf_len)
 		{
-			REFILL_LINEBUF;
+			if (!hit_eof)
+			{
+				REFILL_LINEBUF;
 
-			CopyLoadInputBuf(cstate);
-			/* update our local variables */
-			hit_eof = cstate->input_reached_eof;
-			input_buf_ptr = cstate->input_buf_index;
-			copy_buf_len = cstate->input_buf_len;
+				CopyLoadInputBuf(cstate);
+				/* update our local variables */
+				hit_eof = cstate->input_reached_eof;
+				input_buf_ptr = cstate->input_buf_index;
+				copy_buf_len = cstate->input_buf_len;
+			}
 
 			/*
 			 * If we are completely out of data, break out of the loop,
 			 * reporting EOF.
 			 */
-			if (INPUT_BUF_BYTES(cstate) <= 0)
+			if (copy_buf_len - input_buf_ptr <= 0)
 			{
 				result = true;
 				break;
 			}
-			need_data = false;
 		}
 
 		/* OK to fetch a character */
@@ -1142,20 +1130,6 @@ CopyReadLineText(CopyFromState cstate)
 
 		if (cstate->opts.csv_mode)
 		{
-			/*
-			 * If character is '\\' or '\r', we may need to look ahead below.
-			 * Force fetch of the next character if we don't already have it.
-			 * We need to do this before changing CSV state, in case one of
-			 * these characters is also the quote or escape character.
-			 *
-			 * Note: old-protocol does not like forced prefetch, but it's OK
-			 * here since we cannot validly be at EOF.
-			 */
-			if (c == '\\' || c == '\r')
-			{
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-			}
-
 			/*
 			 * Dealing with quotes and escapes here is mildly tricky. If the
 			 * quote char is also the escape char, there's no problem - we
@@ -1189,14 +1163,9 @@ CopyReadLineText(CopyFromState cstate)
 				cstate->eol_type == EOL_CRNL)
 			{
 				/*
-				 * If need more data, go back to loop top to load it.
-				 *
-				 * Note that if we are at EOF, c will wind up as '\0' because
-				 * of the guaranteed pad of input_buf.
+				 * Look at the next character.  If we're at EOF, c will wind
+				 * up as '\0' because of the guaranteed pad of input_buf.
 				 */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-
-				/* get next char */
 				c = copy_input_buf[input_buf_ptr];
 
 				if (c == '\n')
@@ -1262,7 +1231,6 @@ CopyReadLineText(CopyFromState cstate)
 		{
 			char		c2;
 
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 			IF_NEED_REFILL_AND_EOF_BREAK(0);
 
 			/* -----
@@ -1277,16 +1245,9 @@ CopyReadLineText(CopyFromState cstate)
 			{
 				input_buf_ptr++;	/* consume the '.' */
 
-				/*
-				 * Note: if we loop back for more data here, it does not
-				 * matter that the CSV state change checks are re-executed; we
-				 * will come back here with no important state changed.
-				 */
 				if (cstate->eol_type == EOL_CRNL)
 				{
-					/* Get the next character */
-					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-					/* if hit_eof, c2 will become '\0' */
+					/* Get next character.  If hit_eof, c2 will become '\0' */
 					c2 = copy_input_buf[input_buf_ptr++];
 
 					if (c2 == '\n')
@@ -1309,9 +1270,7 @@ CopyReadLineText(CopyFromState cstate)
 					}
 				}
 
-				/* Get the next character */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-				/* if hit_eof, c2 will become '\0' */
+				/* Get next character.  If hit_eof, c2 will become '\0' */
 				c2 = copy_input_buf[input_buf_ptr++];
 
 				if (c2 != '\r' && c2 != '\n')
-- 
2.30.2

John Naylor

john.naylor@enterprisedb.com

almost 5 years ago

In reply to: Heikki Linnakangas (#4)

Re: Force lookahead in COPY FROM parsing

On Thu, Apr 1, 2021 at 4:47 PM Heikki Linnakangas <hlinnaka@iki.fi> wrote:

Ok, I wouldn't expect to see much difference in that test, it gets
drowned in all the other parsing overhead. I tested this now with this:

copy (select repeat('x', 10000) from generate_series(1, 100000)) to
'/tmp/copydata-x.txt'
create table blackhole_tab (a text);

copy blackhole_tab from '/tmp/copydata-x.txt' where false;

I took the min of 5 runs of that COPY FROM statement:

master:
4107 ms

v3-0001-Simplify-COPY-FROM-parsing-by-forcing-lookahead.patch:
3172 ms

I was actually surprised it was so effective on that test, I expected a
small but noticeable gain. But I'll take it.

Nice! With this test on my laptop I only get 7-8% increase, but that's much
better than what I saw before.

I have nothing further so it's RFC. The patch is pretty simple compared to
the earlier ones, but is worth running the fuzzer again as added insurance?

As an aside, I noticed the URL near the top of copyfromparse.c that
explains a detail of macros has moved from

http://www.cit.gu.edu.au/~anthony/info/C/C.macros

https://antofthy.gitlab.io/info/C/C_macros.txt

--
John Naylor
EDB: http://www.enterprisedb.com

Heikki Linnakangas

hlinnaka@iki.fi

almost 5 years ago

In reply to: John Naylor (#5)

1 attachment(s)

Re: Force lookahead in COPY FROM parsing

On 02/04/2021 20:21, John Naylor wrote:

I have nothing further so it's RFC. The patch is pretty simple compared
to the earlier ones, but is worth running the fuzzer again as added
insurance?

Good idea. I did that, and indeed it revealed bugs. If the client sent
just a single byte in one CopyData message, we only loaded that one byte
into the buffer, instead of the full 4 bytes needed for lookahead.
Attached is a new version that fixes that.

Unfortunately, that's not the end of it. Consider the byte sequence
"\.<NL><some invalid bytes>" appearing at the end of the input. We
should detect the end-of-copy marker \. and stop reading without
complaining about the garbage after the end-of-copy marker. That doesn't
work if we force 4 bytes of lookahead; the invalid byte sequence fits in
the lookahead window, so we will try to convert it.

I'm sure that can be fixed, for example by adding special handling for
the last few bytes of the input. But it needs some more thinking, this
patch isn't quite ready to be committed yet :-(.

- Heikki

Attachments:

v4-0001-Simplify-COPY-FROM-parsing-by-forcing-lookahead.patchtext/x-patch; charset=UTF-8; name=v4-0001-Simplify-COPY-FROM-parsing-by-forcing-lookahead.patchDownload

From e011702cdd2b6ce86687870db06b88fb8daf5d62 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Tue, 6 Apr 2021 20:48:19 +0300
Subject: [PATCH v4 1/1] Simplify COPY FROM parsing by forcing lookahead.

Now that we don't support the old-style COPY protocol anymore, we can
force four bytes of lookahead like was suggested in an existing comment,
and simplify the loop in CopyReadLineText().

FIXME: This fails with sequence "\.<NL><some invalid bytes>". We
should detect the end-of-copy marker \. and stop reading without
complaining about the garbage after the end-of-copy marker. That
doesn't work if we force 4 bytes of lookahead; the invalid byte
sequence fits in the lookahead window, so we will try to convert it.

Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/89627a2a-c123-a8aa-267e-5d168feb73dd%40iki.fi
---
 src/backend/commands/copyfromparse.c | 112 +++++++++------------------
 1 file changed, 36 insertions(+), 76 deletions(-)

diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 0813424768..6b140531ed 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -90,21 +90,6 @@
  * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
  */
 
-/*
- * This keeps the character read at the top of the loop in the buffer
- * even if there is more than one read-ahead.
- */
-#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
-if (1) \
-{ \
-	if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
-	{ \
-		input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
-		need_data = true; \
-		continue; \
-	} \
-} else ((void) 0)
-
 /* This consumes the remainder of the buffer and breaks */
 #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
 if (1) \
@@ -159,7 +144,7 @@ static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
 
 /* Low-level communications functions */
 static int	CopyGetData(CopyFromState cstate, void *databuf,
-						int minread, int maxread);
+						int maxread);
 static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
 static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
 static void CopyLoadInputBuf(CopyFromState cstate);
@@ -230,9 +215,8 @@ ReceiveCopyBinaryHeader(CopyFromState cstate)
 /*
  * CopyGetData reads data from the source (file or frontend)
  *
- * We attempt to read at least minread, and at most maxread, bytes from
- * the source.  The actual number of bytes read is returned; if this is
- * less than minread, EOF was detected.
+ * We attempt to read at most maxread bytes from the source.  The actual
+ * number of bytes read is returned; if this is 0, EOF was detected.
  *
  * Note: when copying from the frontend, we expect a proper EOF mark per
  * protocol; if the frontend simply drops the connection, we raise error.
@@ -241,7 +225,7 @@ ReceiveCopyBinaryHeader(CopyFromState cstate)
  * NB: no data conversion is applied here.
  */
 static int
-CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
+CopyGetData(CopyFromState cstate, void *databuf, int maxread)
 {
 	int			bytesread = 0;
 
@@ -257,7 +241,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 				cstate->raw_reached_eof = true;
 			break;
 		case COPY_FRONTEND:
-			while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
+			while (maxread > 0 && bytesread == 0 && !cstate->raw_reached_eof)
 			{
 				int			avail;
 
@@ -321,7 +305,9 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 			}
 			break;
 		case COPY_CALLBACK:
-			bytesread = cstate->data_source_cb(databuf, minread, maxread);
+			bytesread = cstate->data_source_cb(databuf, 1, maxread);
+			if (bytesread == 0)
+				cstate->raw_reached_eof = true;
 			break;
 	}
 
@@ -605,7 +591,7 @@ CopyLoadRawBuf(CopyFromState cstate)
 
 	/* Load more data */
 	inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
-						  1, RAW_BUF_SIZE - cstate->raw_buf_len);
+						  RAW_BUF_SIZE - cstate->raw_buf_len);
 	nbytes += inbytes;
 	cstate->raw_buf[nbytes] = '\0';
 	cstate->raw_buf_len = nbytes;
@@ -990,7 +976,7 @@ CopyReadLine(CopyFromState cstate)
 			do
 			{
 				inbytes = CopyGetData(cstate, cstate->input_buf,
-									  1, INPUT_BUF_SIZE);
+									  INPUT_BUF_SIZE);
 			} while (inbytes > 0);
 			cstate->input_buf_index = 0;
 			cstate->input_buf_len = 0;
@@ -1047,8 +1033,7 @@ CopyReadLineText(CopyFromState cstate)
 	char	   *copy_input_buf;
 	int			input_buf_ptr;
 	int			copy_buf_len;
-	bool		need_data = false;
-	bool		hit_eof = false;
+	bool		hit_eof;
 	bool		result = false;
 
 	/* CSV variables */
@@ -1098,6 +1083,7 @@ CopyReadLineText(CopyFromState cstate)
 	copy_input_buf = cstate->input_buf;
 	input_buf_ptr = cstate->input_buf_index;
 	copy_buf_len = cstate->input_buf_len;
+	hit_eof = cstate->input_reached_eof;
 
 	for (;;)
 	{
@@ -1105,35 +1091,38 @@ CopyReadLineText(CopyFromState cstate)
 		char		c;
 
 		/*
-		 * Load more data if needed.  Ideally we would just force four bytes
-		 * of read-ahead and avoid the many calls to
-		 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol
-		 * does not allow us to read too far ahead or we might read into the
-		 * next data, so we read-ahead only as far we know we can.  One
-		 * optimization would be to read-ahead four byte here if
-		 * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it,
-		 * considering the size of the buffer.
+		 * Load more data if needed.
+		 *
+		 * We need to look ahead at most three bytes in one iteration of the
+		 * loop (for the sequence \.<CR><NL>), so make sure we have at least
+		 * four bytes in the buffer.  Note that we always guarantee that there
+		 * is one \0 in the buffer, after last valid byte; the lookaheads
+		 * below rely on that.
 		 */
-		if (input_buf_ptr >= copy_buf_len || need_data)
+#define COPY_READ_LINE_LOOKAHEAD	4
+		if (input_buf_ptr + COPY_READ_LINE_LOOKAHEAD >= copy_buf_len)
 		{
-			REFILL_LINEBUF;
+			if (!hit_eof)
+			{
+				REFILL_LINEBUF;
 
-			CopyLoadInputBuf(cstate);
-			/* update our local variables */
-			hit_eof = cstate->input_reached_eof;
-			input_buf_ptr = cstate->input_buf_index;
-			copy_buf_len = cstate->input_buf_len;
+				CopyLoadInputBuf(cstate);
+				/* update our local variables */
+				input_buf_ptr = cstate->input_buf_index;
+				copy_buf_len = cstate->input_buf_len;
+				hit_eof = cstate->input_reached_eof;
+				continue;
+			}
 
 			/*
 			 * If we are completely out of data, break out of the loop,
 			 * reporting EOF.
 			 */
-			if (INPUT_BUF_BYTES(cstate) <= 0)
+			if (copy_buf_len - input_buf_ptr <= 0)
 			{
 				result = true;
 				break;
 			}
-			need_data = false;
 		}
 
 		/* OK to fetch a character */
@@ -1142,20 +1131,6 @@ CopyReadLineText(CopyFromState cstate)
 
 		if (cstate->opts.csv_mode)
 		{
-			/*
-			 * If character is '\\' or '\r', we may need to look ahead below.
-			 * Force fetch of the next character if we don't already have it.
-			 * We need to do this before changing CSV state, in case one of
-			 * these characters is also the quote or escape character.
-			 *
-			 * Note: old-protocol does not like forced prefetch, but it's OK
-			 * here since we cannot validly be at EOF.
-			 */
-			if (c == '\\' || c == '\r')
-			{
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-			}
-
 			/*
 			 * Dealing with quotes and escapes here is mildly tricky. If the
 			 * quote char is also the escape char, there's no problem - we
@@ -1189,14 +1164,9 @@ CopyReadLineText(CopyFromState cstate)
 				cstate->eol_type == EOL_CRNL)
 			{
 				/*
-				 * If need more data, go back to loop top to load it.
-				 *
-				 * Note that if we are at EOF, c will wind up as '\0' because
-				 * of the guaranteed pad of input_buf.
+				 * Look at the next character.  If we're at EOF, c will wind
+				 * up as '\0' because of the guaranteed pad of input_buf.
 				 */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-
-				/* get next char */
 				c = copy_input_buf[input_buf_ptr];
 
 				if (c == '\n')
@@ -1262,7 +1232,6 @@ CopyReadLineText(CopyFromState cstate)
 		{
 			char		c2;
 
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 			IF_NEED_REFILL_AND_EOF_BREAK(0);
 
 			/* -----
@@ -1277,16 +1246,9 @@ CopyReadLineText(CopyFromState cstate)
 			{
 				input_buf_ptr++;	/* consume the '.' */
 
-				/*
-				 * Note: if we loop back for more data here, it does not
-				 * matter that the CSV state change checks are re-executed; we
-				 * will come back here with no important state changed.
-				 */
 				if (cstate->eol_type == EOL_CRNL)
 				{
-					/* Get the next character */
-					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-					/* if hit_eof, c2 will become '\0' */
+					/* Get next character.  If hit_eof, c2 will become '\0' */
 					c2 = copy_input_buf[input_buf_ptr++];
 
 					if (c2 == '\n')
@@ -1309,9 +1271,7 @@ CopyReadLineText(CopyFromState cstate)
 					}
 				}
 
-				/* Get the next character */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-				/* if hit_eof, c2 will become '\0' */
+				/* Get next character.  If hit_eof, c2 will become '\0' */
 				c2 = copy_input_buf[input_buf_ptr++];
 
 				if (c2 != '\r' && c2 != '\n')
-- 
2.30.2