From 2e8e49bc9cd3bd346358ad97bef5bb8cd5bb4a26 Mon Sep 17 00:00:00 2001 From: Joel Jacobson Date: Tue, 15 Oct 2024 03:03:09 +0200 Subject: [PATCH 16/16] Add "raw" COPY format support for unstructured text data. This commit introduces a new format option to the COPY command, enabling the import and export of unstructured text data where each line is treated as a single field without any delimiters. --- doc/src/sgml/ref/copy.sgml | 98 ++++++++- src/backend/commands/copy.c | 45 ++-- src/backend/commands/copyfrom.c | 7 + src/backend/commands/copyfromparse.c | 204 +++++++++++++++++- src/backend/commands/copyto.c | 70 +++++- src/backend/parser/gram.y | 8 +- src/include/commands/copy.h | 1 + src/include/parser/kwlist.h | 1 + src/test/regress/data/newlines_cr.data | 1 + src/test/regress/data/newlines_cr_lr.data | 2 + .../regress/data/newlines_cr_lr_nolast.data | 2 + src/test/regress/data/newlines_cr_nolast.data | 1 + src/test/regress/data/newlines_lr.data | 2 + src/test/regress/data/newlines_lr_nolast.data | 2 + src/test/regress/data/newlines_mixed_1.data | 1 + src/test/regress/data/newlines_mixed_2.data | 2 + src/test/regress/data/newlines_mixed_3.data | 2 + src/test/regress/data/newlines_mixed_4.data | 2 + src/test/regress/data/newlines_mixed_5.data | 2 + src/test/regress/expected/copy.out | 92 ++++++++ src/test/regress/expected/copy2.out | 57 ++++- src/test/regress/sql/copy.sql | 43 ++++ src/test/regress/sql/copy2.sql | 43 +++- 23 files changed, 656 insertions(+), 32 deletions(-) create mode 100644 src/test/regress/data/newlines_cr.data create mode 100644 src/test/regress/data/newlines_cr_lr.data create mode 100644 src/test/regress/data/newlines_cr_lr_nolast.data create mode 100644 src/test/regress/data/newlines_cr_nolast.data create mode 100644 src/test/regress/data/newlines_lr.data create mode 100644 src/test/regress/data/newlines_lr_nolast.data create mode 100644 src/test/regress/data/newlines_mixed_1.data create mode 100644 src/test/regress/data/newlines_mixed_2.data create mode 100644 src/test/regress/data/newlines_mixed_3.data create mode 100644 src/test/regress/data/newlines_mixed_4.data create mode 100644 src/test/regress/data/newlines_mixed_5.data diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 8394402f09..06ca632ee3 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -218,8 +218,9 @@ COPY { table_name [ ( Selects the data format to be read or written: text, - csv (Comma Separated Values), - or binary. + CSV (Comma Separated Values), + binary, + or raw The default is text. See below for details. @@ -257,7 +258,8 @@ COPY { table_name [ ( CSV format. This must be a single one-byte character. - This option is not allowed when using binary format. + This option is allowed only when using text or + CSV format. @@ -271,7 +273,8 @@ COPY { table_name [ ( CSV format. You might prefer an empty string even in text format for cases where you don't want to distinguish nulls from empty strings. - This option is not allowed when using binary format. + This option is allowed only when using text or + CSV format. @@ -294,7 +297,7 @@ COPY { table_name [ ( COPY FROM, and only when - not using binary format. + using text or CSV format. @@ -400,7 +403,8 @@ COPY { table_name [ ( The ignore option is applicable only for COPY FROM - when the FORMAT is text or csv. + when the FORMAT is text, + CSV or raw. A NOTICE message containing the ignored row count is @@ -893,6 +897,88 @@ COPY count + + Raw Format + + + This format option is used for importing and exporting files containing + unstructured text, where each line is treated as a single field. It is + ideal for data that does not conform to a structured, tabular format and + lacks delimiters. + + + + In the raw format, each line of the input or output is + considered a complete value without any field separation. There are no + field delimiters, and all characters are taken literally. There is no + special handling for quotes, backslashes, or escape sequences. All + characters, including whitespace and special characters, are preserved + exactly as they appear in the file. However, it's important to note that + the text is still interpreted according to the specified ENCODING + option or the current client encoding for input, and encoded using the + specified ENCODING or the current client encoding for output. + + + + When using this format, the COPY command must specify + exactly one column. Specifying multiple columns will result in an error. + If the table has multiple columns and no column list is provided, an error + will occur. + + + + The raw format does not distinguish a NULL + value from an empty string. Empty lines are imported as empty strings, not + as NULL values. + + + + Encoding works the same as in the text and CSV formats. + + + + + + Raw Format + + + This format option is used for importing and exporting files containing + unstructured text, where each line is treated as a single field. It is + ideal for data that does not conform to a structured, tabular format and + lacks delimiters. + + + + In the raw format, each line of the input or output is + considered a complete value without any field separation. There are no + field delimiters, and all characters are taken literally. There is no + special handling for quotes, backslashes, or escape sequences. All + characters, including whitespace and special characters, are preserved + exactly as they appear in the file. However, it's important to note that + the text is still interpreted according to the specified ENCODING + option or the current client encoding for input, and encoded using the + specified ENCODING or the current client encoding for output. + + + + When using this format, the COPY command must specify + exactly one column. Specifying multiple columns will result in an error. + If the table has multiple columns and no column list is provided, an error + will occur. + + + + The raw format does not distinguish a NULL + value from an empty string. Empty lines are imported as empty strings, not + as NULL values. + + + + Encoding works the same as in the text and CSV formats. + + + + Binary Format diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index cde46bbe2b..74d6ebb78d 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -516,6 +516,8 @@ ProcessCopyOptions(ParseState *pstate, opts_out->format = COPY_FORMAT_CSV; else if (strcmp(fmt, "binary") == 0) opts_out->format = COPY_FORMAT_BINARY; + else if (strcmp(fmt, "raw") == 0) + opts_out->format = COPY_FORMAT_RAW; else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -688,6 +690,12 @@ ProcessCopyOptions(ParseState *pstate, /*- translator: %s is the name of a COPY option, e.g. ON_ERROR */ errmsg("cannot specify %s in BINARY mode", "DELIMITER"))); + if (opts_out->format == COPY_FORMAT_RAW) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + /*- translator: %s is the name of a COPY option, e.g. ON_ERROR */ + errmsg("cannot specify %s in RAW mode", "DELIMITER"))); + /* Only single-byte delimiter strings are supported. */ if (strlen(opts_out->delim) != 1) ereport(ERROR, @@ -718,11 +726,11 @@ ProcessCopyOptions(ParseState *pstate, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("COPY delimiter cannot be \"%s\"", opts_out->delim))); } - else if (opts_out->format != COPY_FORMAT_BINARY) - { - /* Set default delimiter */ - opts_out->delim = opts_out->format == COPY_FORMAT_CSV ? "," : "\t"; - } + /* Set default delimiter */ + else if (opts_out->format == COPY_FORMAT_CSV) + opts_out->delim = ","; + else if (opts_out->format == COPY_FORMAT_TEXT) + opts_out->delim = "\t"; /* --- NULL option --- */ if (opts_out->null_print) @@ -732,6 +740,11 @@ ProcessCopyOptions(ParseState *pstate, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("cannot specify %s in BINARY mode", "NULL"))); + if (opts_out->format == COPY_FORMAT_RAW) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("cannot specify %s in RAW mode", "NULL"))); + /* Disallow end-of-line characters */ if (strchr(opts_out->null_print, '\r') != NULL || strchr(opts_out->null_print, '\n') != NULL) @@ -739,11 +752,12 @@ ProcessCopyOptions(ParseState *pstate, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("COPY null representation cannot use newline or carriage return"))); } - else if (opts_out->format != COPY_FORMAT_BINARY) - { - /* Set default null_print */ - opts_out->null_print = opts_out->format == COPY_FORMAT_CSV ? "" : "\\N"; - } + /* Set default null_print */ + else if (opts_out->format == COPY_FORMAT_CSV) + opts_out->null_print = ""; + else if (opts_out->format == COPY_FORMAT_TEXT) + opts_out->null_print = "\\N"; + if (opts_out->null_print) opts_out->null_print_len = strlen(opts_out->null_print); @@ -795,6 +809,11 @@ ProcessCopyOptions(ParseState *pstate, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("cannot specify %s in BINARY mode", "DEFAULT"))); + if (opts_out->format == COPY_FORMAT_RAW) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("cannot specify %s in RAW mode", "DEFAULT"))); + /* Assert options have been set (defaults applied if not specified) */ Assert(opts_out->delim); Assert(opts_out->null_print); @@ -941,8 +960,8 @@ ProcessCopyOptions(ParseState *pstate, ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), /*- translator: first and second %s are the names of COPY option, e.g. - * ON_ERROR, third is the value of the COPY option, e.g. IGNORE */ - errmsg("COPY %s requires %s to be set to %s", + * ON_ERROR, third is the value of the COPY option, e.g. IGNORE */ + errmsg("COPY %s requires %s to be set to %s", "REJECT_LIMIT", "ON_ERROR", "IGNORE"))); } @@ -985,7 +1004,7 @@ ProcessCopyOptions(ParseState *pstate, ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), /*- translator: %s is the name of a COPY option, e.g. NULL */ - errmsg("CSV quote character must not appear in the %s specification", + errmsg("CSV quote character must not appear in the %s specification", "NULL"))); } } diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c index f350a4ff97..99dcb00f8a 100644 --- a/src/backend/commands/copyfrom.c +++ b/src/backend/commands/copyfrom.c @@ -1438,6 +1438,13 @@ BeginCopyFrom(ParseState *pstate, /* Generate or convert list of attributes to process */ cstate->attnumlist = CopyGetAttnums(tupDesc, cstate->rel, attnamelist); + /* Enforce single column requirement for RAW format */ + if (cstate->opts.format == COPY_FORMAT_RAW && + list_length(cstate->attnumlist) != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("COPY with format 'raw' must specify exactly one column"))); + num_phys_attrs = tupDesc->natts; /* Convert FORCE_NOT_NULL name list to per-column flags, check validity */ diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 50bb4b7750..2528c6f111 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -7,7 +7,7 @@ * formats. The main entry point is NextCopyFrom(), which parses the * next input line and returns it as Datums. * - * In text/CSV mode, the parsing happens in multiple stages: + * In text/CSV/raw mode, the parsing happens in multiple stages: * * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf * 1. 2. 3. 4. @@ -25,7 +25,7 @@ * is copied into 'line_buf', with quotes and escape characters still * intact. * - * 4. CopyReadAttributesText/CSV() function takes the input line from + * 4. CopyReadAttributesText/CSV/Raw() function takes the input line from * 'line_buf', and splits it into fields, unescaping the data as required. * The fields are stored in 'attribute_buf', and 'raw_fields' array holds * pointers to each field. @@ -143,8 +143,10 @@ static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0"; /* non-export function prototypes */ static bool CopyReadLine(CopyFromState cstate); static bool CopyReadLineText(CopyFromState cstate); +static bool CopyReadLineRawText(CopyFromState cstate); static int CopyReadAttributesText(CopyFromState cstate); static int CopyReadAttributesCSV(CopyFromState cstate); +static int CopyReadAttributesRaw(CopyFromState cstate); static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, Oid typioparam, int32 typmod, bool *isnull); @@ -732,7 +734,7 @@ CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes) } /* - * Read raw fields in the next line for COPY FROM in text or csv mode. + * Read raw fields in the next line for COPY FROM in text, csv, or raw mode. * Return false if no more lines. * * An internal temporary buffer is returned via 'fields'. It is valid until @@ -748,7 +750,7 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields) int fldct; bool done; - /* only available for text or csv input */ + /* only available for text, csv, or raw input */ Assert(cstate->opts.format != COPY_FORMAT_BINARY); /* on input check that the header line is correct if needed */ @@ -768,8 +770,15 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields) if (cstate->opts.format == COPY_FORMAT_CSV) fldct = CopyReadAttributesCSV(cstate); - else + else if (cstate->opts.format == COPY_FORMAT_TEXT) fldct = CopyReadAttributesText(cstate); + else if (cstate->opts.format == COPY_FORMAT_RAW) + fldct = CopyReadAttributesRaw(cstate); + else + { + elog(ERROR, "unexpected COPY format: %d", cstate->opts.format); + pg_unreachable(); + } if (fldct != list_length(cstate->attnumlist)) ereport(ERROR, @@ -823,8 +832,15 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields) /* Parse the line into de-escaped field values */ if (cstate->opts.format == COPY_FORMAT_CSV) fldct = CopyReadAttributesCSV(cstate); - else + else if (cstate->opts.format == COPY_FORMAT_TEXT) fldct = CopyReadAttributesText(cstate); + else if (cstate->opts.format == COPY_FORMAT_RAW) + fldct = CopyReadAttributesRaw(cstate); + else + { + elog(ERROR, "unexpected COPY format: %d", cstate->opts.format); + pg_unreachable(); + } *fields = cstate->raw_fields; *nfields = fldct; @@ -1096,7 +1112,10 @@ CopyReadLine(CopyFromState cstate) cstate->line_buf_valid = false; /* Parse data and transfer into line_buf */ - result = CopyReadLineText(cstate); + if (cstate->opts.format == COPY_FORMAT_RAW) + result = CopyReadLineRawText(cstate); + else + result = CopyReadLineText(cstate); if (result) { @@ -1462,6 +1481,138 @@ CopyReadLineText(CopyFromState cstate) return result; } +/* + * CopyReadLineRawText - inner loop of CopyReadLine for raw text mode + */ +static bool +CopyReadLineRawText(CopyFromState cstate) +{ + char *copy_input_buf; + int input_buf_ptr; + int copy_buf_len; + bool need_data = false; + bool hit_eof = false; + bool result = false; + + /* + * The objective of this loop is to transfer the entire next input line + * into line_buf. We only care for detecting newlines (\r and/or \n). + * All other characters are treated as regular data. + * + * For speed, we try to move data from input_buf to line_buf in chunks + * rather than one character at a time. input_buf_ptr points to the next + * character to examine; any characters from input_buf_index to + * input_buf_ptr have been determined to be part of the line, but not yet + * transferred to line_buf. + * + * For a little extra speed within the loop, we copy input_buf and + * input_buf_len into local variables. + */ + copy_input_buf = cstate->input_buf; + input_buf_ptr = cstate->input_buf_index; + copy_buf_len = cstate->input_buf_len; + + for (;;) + { + int prev_raw_ptr; + char c; + + /* + * Load more data if needed. + */ + if (input_buf_ptr >= copy_buf_len || need_data) + { + REFILL_LINEBUF; + + CopyLoadInputBuf(cstate); + /* update our local variables */ + hit_eof = cstate->input_reached_eof; + input_buf_ptr = cstate->input_buf_index; + copy_buf_len = cstate->input_buf_len; + + /* + * If we are completely out of data, break out of the loop, + * reporting EOF. + */ + if (INPUT_BUF_BYTES(cstate) <= 0) + { + result = true; + break; + } + need_data = false; + } + + /* OK to fetch a character */ + prev_raw_ptr = input_buf_ptr; + c = copy_input_buf[input_buf_ptr++]; + + /* Process \r */ + if (c == '\r') + { + /* Check for \r\n on first line, _and_ handle \r\n. */ + if (cstate->eol_type == EOL_UNKNOWN || + cstate->eol_type == EOL_CRNL) + { + /* + * If need more data, go back to loop top to load it. + * + * Note that if we are at EOF, c will wind up as '\0' because + * of the guaranteed pad of input_buf. + */ + IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); + + /* get next char */ + c = copy_input_buf[input_buf_ptr]; + + if (c == '\n') + { + input_buf_ptr++; /* eat newline */ + cstate->eol_type = EOL_CRNL; /* in case not set yet */ + } + else + { + if (cstate->eol_type == EOL_CRNL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("inconsistent newline style"))); + /* + * if we got here, it is the first line and we didn't find + * \n, so don't consume the peeked character + */ + cstate->eol_type = EOL_CR; + } + } + else if (cstate->eol_type == EOL_NL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("inconsistent newline style"))); + /* If reach here, we have found the line terminator */ + break; + } + + /* Process \n */ + if (c == '\n') + { + if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("inconsistent newline style"))); + cstate->eol_type = EOL_NL; /* in case not set yet */ + /* If reach here, we have found the line terminator */ + break; + } + + /* All other characters are treated as regular data */ + } /* end of outer loop */ + + /* + * Transfer any still-uncopied data to line_buf. + */ + REFILL_LINEBUF; + + return result; +} + /* * Return decimal value for a hexadecimal digit */ @@ -1938,6 +2089,45 @@ endfield: return fieldno; } +/* + * Parse the current line as a single attribute for the "raw" COPY format. + * No parsing, quoting, or escaping is performed. + * Empty lines are treated as empty strings, not NULL. + */ +static int +CopyReadAttributesRaw(CopyFromState cstate) +{ + /* Enforce single column requirement */ + if (cstate->max_fields != 1) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("COPY with format 'raw' requires exactly one column"))); + } + + resetStringInfo(&cstate->attribute_buf); + + /* + * The attribute will certainly not be longer than the input + * data line, so we can just force attribute_buf to be large enough and + * then transfer data without any checks for enough space. We need to do + * it this way because enlarging attribute_buf mid-stream would invalidate + * pointers already stored into cstate->raw_fields[]. + */ + if (cstate->attribute_buf.maxlen <= cstate->line_buf.len) + enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len); + + /* Copy the entire line into attribute_buf */ + memcpy(cstate->attribute_buf.data, cstate->line_buf.data, + cstate->line_buf.len); + cstate->attribute_buf.data[cstate->line_buf.len] = '\0'; + cstate->attribute_buf.len = cstate->line_buf.len; + + /* Assign the single field to raw_fields[0] */ + cstate->raw_fields[0] = cstate->attribute_buf.data; + + return 1; +} /* * Read a binary attribute diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c index 78531ae846..99fd68a483 100644 --- a/src/backend/commands/copyto.c +++ b/src/backend/commands/copyto.c @@ -113,6 +113,7 @@ static void CopyOneRowTo(CopyToState cstate, TupleTableSlot *slot); static void CopyAttributeOutText(CopyToState cstate, const char *string); static void CopyAttributeOutCSV(CopyToState cstate, const char *string, bool use_quote); +static void CopyAttributeOutRaw(CopyToState cstate, const char *string); /* Low-level communications functions */ static void SendCopyBegin(CopyToState cstate); @@ -570,6 +571,13 @@ BeginCopyTo(ParseState *pstate, /* Generate or convert list of attributes to process */ cstate->attnumlist = CopyGetAttnums(tupDesc, cstate->rel, attnamelist); + /* Enforce single column requirement for RAW format */ + if (cstate->opts.format == COPY_FORMAT_RAW && + list_length(cstate->attnumlist) != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("COPY with format 'raw' must specify exactly one column"))); + num_phys_attrs = tupDesc->natts; /* Convert FORCE_QUOTE name list to per-column flags, check validity */ @@ -835,8 +843,10 @@ DoCopyTo(CopyToState cstate) if (cstate->opts.format == COPY_FORMAT_CSV) CopyAttributeOutCSV(cstate, colname, false); - else + else if (cstate->opts.format == COPY_FORMAT_TEXT) CopyAttributeOutText(cstate, colname); + else if (cstate->opts.format == COPY_FORMAT_RAW) + CopyAttributeOutRaw(cstate, colname); } CopySendEndOfRow(cstate); @@ -917,7 +927,8 @@ CopyOneRowTo(CopyToState cstate, TupleTableSlot *slot) /* Make sure the tuple is fully deconstructed */ slot_getallattrs(slot); - if (cstate->opts.format != COPY_FORMAT_BINARY) + if (cstate->opts.format == COPY_FORMAT_TEXT || + cstate->opts.format == COPY_FORMAT_CSV) { bool need_delim = false; @@ -945,7 +956,7 @@ CopyOneRowTo(CopyToState cstate, TupleTableSlot *slot) } } } - else + else if (cstate->opts.format == COPY_FORMAT_BINARY) { foreach_int(attnum, cstate->attnumlist) { @@ -965,6 +976,37 @@ CopyOneRowTo(CopyToState cstate, TupleTableSlot *slot) } } } + else if (cstate->opts.format == COPY_FORMAT_RAW) + { + int attnum; + Datum value; + bool isnull; + + /* Ensure only one column is being copied */ + if (list_length(cstate->attnumlist) != 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("COPY with format 'raw' must specify exactly one column"))); + + attnum = linitial_int(cstate->attnumlist); + value = slot->tts_values[attnum - 1]; + isnull = slot->tts_isnull[attnum - 1]; + + if (!isnull) + { + char *string = OutputFunctionCall(&out_functions[attnum - 1], + value); + CopyAttributeOutRaw(cstate, string); + } + /* For RAW format, we don't send anything for NULL values */ + } + else + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Unsupported COPY format"))); + } + CopySendEndOfRow(cstate); @@ -1219,6 +1261,28 @@ CopyAttributeOutCSV(CopyToState cstate, const char *string, } } +/* + * Send text representation of one attribute for RAW format. + */ +static void +CopyAttributeOutRaw(CopyToState cstate, const char *string) +{ + const char *ptr; + + /* Ensure the format is RAW */ + Assert(cstate->opts.format == COPY_FORMAT_RAW); + + /* Ensure exactly one column is being processed */ + Assert(list_length(cstate->attnumlist) == 1); + + if (cstate->need_transcoding) + ptr = pg_server_to_any(string, strlen(string), cstate->file_encoding); + else + ptr = string; + + CopySendString(cstate, ptr); +} + /* * copy_dest_startup --- executor startup */ diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 4aa8646af7..0d0a3ad7ff 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -768,7 +768,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); QUOTE QUOTES - RANGE READ REAL REASSIGN RECURSIVE REF_P REFERENCES REFERENCING + RANGE RAW READ REAL REASSIGN RECURSIVE REF_P REFERENCES REFERENCING REFRESH REINDEX RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA RESET RESTART RESTRICT RETURN RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROLLUP ROUTINE ROUTINES ROW ROWS RULE @@ -3513,6 +3513,10 @@ copy_opt_item: { $$ = makeDefElem("encoding", (Node *) makeString($2), @1); } + | RAW + { + $$ = makeDefElem("format", (Node *) makeString("raw"), @1); + } ; /* The following exist for backward compatibility with very old versions */ @@ -17771,6 +17775,7 @@ unreserved_keyword: | QUOTE | QUOTES | RANGE + | RAW | READ | REASSIGN | RECURSIVE @@ -18398,6 +18403,7 @@ bare_label_keyword: | QUOTE | QUOTES | RANGE + | RAW | READ | REAL | REASSIGN diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h index e700fd01b5..04f7548ef4 100644 --- a/src/include/commands/copy.h +++ b/src/include/commands/copy.h @@ -59,6 +59,7 @@ typedef enum CopyFormat COPY_FORMAT_TEXT, COPY_FORMAT_BINARY, COPY_FORMAT_CSV, + COPY_FORMAT_RAW, } CopyFormat; /* diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 899d64ad55..02cd28c750 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -360,6 +360,7 @@ PG_KEYWORD("publication", PUBLICATION, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("quote", QUOTE, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("quotes", QUOTES, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("range", RANGE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("raw", RAW, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("read", READ, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("real", REAL, COL_NAME_KEYWORD, BARE_LABEL) PG_KEYWORD("reassign", REASSIGN, UNRESERVED_KEYWORD, BARE_LABEL) diff --git a/src/test/regress/data/newlines_cr.data b/src/test/regress/data/newlines_cr.data new file mode 100644 index 0000000000..5397a14fca --- /dev/null +++ b/src/test/regress/data/newlines_cr.data @@ -0,0 +1 @@ +line1 line2 \ No newline at end of file diff --git a/src/test/regress/data/newlines_cr_lr.data b/src/test/regress/data/newlines_cr_lr.data new file mode 100644 index 0000000000..8561d5d6dc --- /dev/null +++ b/src/test/regress/data/newlines_cr_lr.data @@ -0,0 +1,2 @@ +line1 +line2 diff --git a/src/test/regress/data/newlines_cr_lr_nolast.data b/src/test/regress/data/newlines_cr_lr_nolast.data new file mode 100644 index 0000000000..3a1bd7a527 --- /dev/null +++ b/src/test/regress/data/newlines_cr_lr_nolast.data @@ -0,0 +1,2 @@ +line1 +line2 \ No newline at end of file diff --git a/src/test/regress/data/newlines_cr_nolast.data b/src/test/regress/data/newlines_cr_nolast.data new file mode 100644 index 0000000000..d9dce6c5ea --- /dev/null +++ b/src/test/regress/data/newlines_cr_nolast.data @@ -0,0 +1 @@ +line1 line2 \ No newline at end of file diff --git a/src/test/regress/data/newlines_lr.data b/src/test/regress/data/newlines_lr.data new file mode 100644 index 0000000000..c0d0fb45c3 --- /dev/null +++ b/src/test/regress/data/newlines_lr.data @@ -0,0 +1,2 @@ +line1 +line2 diff --git a/src/test/regress/data/newlines_lr_nolast.data b/src/test/regress/data/newlines_lr_nolast.data new file mode 100644 index 0000000000..f8be7bb828 --- /dev/null +++ b/src/test/regress/data/newlines_lr_nolast.data @@ -0,0 +1,2 @@ +line1 +line2 \ No newline at end of file diff --git a/src/test/regress/data/newlines_mixed_1.data b/src/test/regress/data/newlines_mixed_1.data new file mode 100644 index 0000000000..d20e511549 --- /dev/null +++ b/src/test/regress/data/newlines_mixed_1.data @@ -0,0 +1 @@ +line1 line2 diff --git a/src/test/regress/data/newlines_mixed_2.data b/src/test/regress/data/newlines_mixed_2.data new file mode 100644 index 0000000000..fe03b64cc3 --- /dev/null +++ b/src/test/regress/data/newlines_mixed_2.data @@ -0,0 +1,2 @@ +line1 +line2 diff --git a/src/test/regress/data/newlines_mixed_3.data b/src/test/regress/data/newlines_mixed_3.data new file mode 100644 index 0000000000..d2772944d6 --- /dev/null +++ b/src/test/regress/data/newlines_mixed_3.data @@ -0,0 +1,2 @@ +line1 +line2 \ No newline at end of file diff --git a/src/test/regress/data/newlines_mixed_4.data b/src/test/regress/data/newlines_mixed_4.data new file mode 100644 index 0000000000..7afb2406f0 --- /dev/null +++ b/src/test/regress/data/newlines_mixed_4.data @@ -0,0 +1,2 @@ +line1 +line2 line3 \ No newline at end of file diff --git a/src/test/regress/data/newlines_mixed_5.data b/src/test/regress/data/newlines_mixed_5.data new file mode 100644 index 0000000000..658b3593ea --- /dev/null +++ b/src/test/regress/data/newlines_mixed_5.data @@ -0,0 +1,2 @@ +line1 +line2 diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out index f554d42c84..d7ec9dd736 100644 --- a/src/test/regress/expected/copy.out +++ b/src/test/regress/expected/copy.out @@ -325,3 +325,95 @@ SELECT tableoid::regclass, id % 2 = 0 is_even, count(*) from parted_si GROUP BY (2 rows) DROP TABLE parted_si; +-- Test COPY FORMAT raw +\set filename :abs_builddir '/results/copy_raw_test.data' +CREATE TABLE copy_raw_test (id SERIAL PRIMARY KEY, col text); +INSERT INTO copy_raw_test (col) VALUES +(E'",\\'), (E'\\.'), (NULL), (''), (' '), ('test'); +COPY copy_raw_test (col) TO :'filename' (FORMAT raw); +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + col | ?column? +------+---------- + ",\ | f + \. | f + | f + | f + | f + test | f +(6 rows) + +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' RAW; +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + col | ?column? +------+---------- + ",\ | f + \. | f + | f + | f + | f + test | f +(6 rows) + +\set filename :abs_srcdir '/data/newlines_lr.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + col | ?column? +-------+---------- + line1 | f + line2 | f +(2 rows) + +\set filename :abs_srcdir '/data/newlines_lr_nolast.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + col | ?column? +-------+---------- + line1 | f + line2 | f +(2 rows) + +\set filename :abs_srcdir '/data/newlines_cr_lr.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + col | ?column? +-------+---------- + line1 | f + line2 | f +(2 rows) + +\set filename :abs_srcdir '/data/newlines_cr_lr_nolast.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + col | ?column? +-------+---------- + line1 | f + line2 | f +(2 rows) + +\set filename :abs_srcdir '/data/newlines_cr.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + col | ?column? +-------+---------- + line1 | f + line2 | f +(2 rows) + +\set filename :abs_srcdir '/data/newlines_cr_nolast.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + col | ?column? +-------+---------- + line1 | f + line2 | f +(2 rows) + diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out index 626a437d40..34bf06390b 100644 --- a/src/test/regress/expected/copy2.out +++ b/src/test/regress/expected/copy2.out @@ -88,8 +88,12 @@ LINE 1: COPY x from stdin (log_verbosity default, log_verbosity verb... -- incorrect options COPY x to stdout (format BINARY, delimiter ','); ERROR: cannot specify DELIMITER in BINARY mode +COPY x to stdout (format RAW, delimiter ','); +ERROR: cannot specify DELIMITER in RAW mode COPY x to stdout (format BINARY, null 'x'); ERROR: cannot specify NULL in BINARY mode +COPY x to stdout (format RAW, null 'x'); +ERROR: cannot specify NULL in RAW mode COPY x from stdin (format BINARY, on_error ignore); ERROR: only ON_ERROR STOP is allowed in BINARY mode COPY x from stdin (on_error unsupported); @@ -100,6 +104,10 @@ COPY x to stdout (format TEXT, force_quote(a)); ERROR: COPY FORCE_QUOTE requires CSV mode COPY x to stdout (format TEXT, force_quote *); ERROR: COPY FORCE_QUOTE requires CSV mode +COPY x to stdout (format RAW, force_quote(a)); +ERROR: COPY FORCE_QUOTE requires CSV mode +COPY x to stdout (format RAW, force_quote *); +ERROR: COPY FORCE_QUOTE requires CSV mode COPY x from stdin (format CSV, force_quote(a)); ERROR: COPY FORCE_QUOTE cannot be used with COPY FROM COPY x from stdin (format CSV, force_quote *); @@ -108,6 +116,10 @@ COPY x from stdin (format TEXT, force_not_null(a)); ERROR: COPY FORCE_NOT_NULL requires CSV mode COPY x from stdin (format TEXT, force_not_null *); ERROR: COPY FORCE_NOT_NULL requires CSV mode +COPY x from stdin (format RAW, force_not_null(a)); +ERROR: COPY FORCE_NOT_NULL requires CSV mode +COPY x from stdin (format RAW, force_not_null *); +ERROR: COPY FORCE_NOT_NULL requires CSV mode COPY x to stdout (format CSV, force_not_null(a)); ERROR: COPY FORCE_NOT_NULL cannot be used with COPY TO COPY x to stdout (format CSV, force_not_null *); @@ -116,6 +128,10 @@ COPY x from stdin (format TEXT, force_null(a)); ERROR: COPY FORCE_NULL requires CSV mode COPY x from stdin (format TEXT, force_null *); ERROR: COPY FORCE_NULL requires CSV mode +COPY x from stdin (format RAW, force_null(a)); +ERROR: COPY FORCE_NULL requires CSV mode +COPY x from stdin (format RAW, force_null *); +ERROR: COPY FORCE_NULL requires CSV mode COPY x to stdout (format CSV, force_null(a)); ERROR: COPY FORCE_NULL cannot be used with COPY TO COPY x to stdout (format CSV, force_null *); @@ -858,9 +874,11 @@ select id, text_value, ts_value from copy_default; (2 rows) truncate copy_default; --- DEFAULT cannot be used in binary mode +-- DEFAULT cannot be used in binary or raw mode copy copy_default from stdin with (format binary, default '\D'); ERROR: cannot specify DEFAULT in BINARY mode +copy copy_default from stdin with (format raw, default '\D'); +ERROR: cannot specify DEFAULT in RAW mode -- DEFAULT cannot be new line nor carriage return copy copy_default from stdin with (default E'\n'); ERROR: COPY default representation cannot use newline or carriage return @@ -929,3 +947,40 @@ truncate copy_default; -- DEFAULT cannot be used in COPY TO copy (select 1 as test) TO stdout with (default '\D'); ERROR: COPY DEFAULT cannot be used with COPY TO +-- +-- Test COPY FORMAT errors +-- +\getenv abs_srcdir PG_ABS_SRCDIR +\getenv abs_builddir PG_ABS_BUILDDIR +\set filename :abs_builddir '/results/copy_raw_test_errors.data' +-- Test single column requirement +CREATE TABLE copy_raw_test_errors (col1 text, col2 text); +COPY copy_raw_test_errors TO :'filename' (FORMAT raw); +ERROR: COPY with format 'raw' must specify exactly one column +COPY copy_raw_test_errors (col1, col2) TO :'filename' (FORMAT raw); +ERROR: COPY with format 'raw' must specify exactly one column +COPY copy_raw_test_errors FROM :'filename' (FORMAT raw); +ERROR: COPY with format 'raw' must specify exactly one column +COPY copy_raw_test_errors (col1, col2) FROM :'filename' (FORMAT raw); +ERROR: COPY with format 'raw' must specify exactly one column +-- Test inconsistent newline style +\set filename :abs_srcdir '/data/newlines_mixed_1.data' +COPY copy_raw_test_errors (col1) FROM :'filename' (FORMAT raw); +ERROR: inconsistent newline style +CONTEXT: COPY copy_raw_test_errors, line 2 +\set filename :abs_srcdir '/data/newlines_mixed_2.data' +COPY copy_raw_test_errors (col1) FROM :'filename' (FORMAT raw); +ERROR: inconsistent newline style +CONTEXT: COPY copy_raw_test_errors, line 2 +\set filename :abs_srcdir '/data/newlines_mixed_3.data' +COPY copy_raw_test_errors (col1) FROM :'filename' (FORMAT raw); +ERROR: inconsistent newline style +CONTEXT: COPY copy_raw_test_errors, line 2 +\set filename :abs_srcdir '/data/newlines_mixed_4.data' +COPY copy_raw_test_errors (col1) FROM :'filename' (FORMAT raw); +ERROR: inconsistent newline style +CONTEXT: COPY copy_raw_test_errors, line 2 +\set filename :abs_srcdir '/data/newlines_mixed_5.data' +COPY copy_raw_test_errors (col1) FROM :'filename' (FORMAT raw); +ERROR: inconsistent newline style +CONTEXT: COPY copy_raw_test_errors, line 2 diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql index f1699b66b0..c106bd74ec 100644 --- a/src/test/regress/sql/copy.sql +++ b/src/test/regress/sql/copy.sql @@ -348,3 +348,46 @@ COPY parted_si(id, data) FROM :'filename'; SELECT tableoid::regclass, id % 2 = 0 is_even, count(*) from parted_si GROUP BY 1, 2 ORDER BY 1; DROP TABLE parted_si; + +-- Test COPY FORMAT raw +\set filename :abs_builddir '/results/copy_raw_test.data' +CREATE TABLE copy_raw_test (id SERIAL PRIMARY KEY, col text); +INSERT INTO copy_raw_test (col) VALUES +(E'",\\'), (E'\\.'), (NULL), (''), (' '), ('test'); +COPY copy_raw_test (col) TO :'filename' (FORMAT raw); +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' RAW; +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + +\set filename :abs_srcdir '/data/newlines_lr.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + +\set filename :abs_srcdir '/data/newlines_lr_nolast.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + +\set filename :abs_srcdir '/data/newlines_cr_lr.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + +\set filename :abs_srcdir '/data/newlines_cr_lr_nolast.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + +\set filename :abs_srcdir '/data/newlines_cr.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; + +\set filename :abs_srcdir '/data/newlines_cr_nolast.data' +TRUNCATE copy_raw_test; +COPY copy_raw_test (col) FROM :'filename' (FORMAT raw); +SELECT col, col IS NULL FROM copy_raw_test ORDER BY id; diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql index 3458d287f2..56367234bf 100644 --- a/src/test/regress/sql/copy2.sql +++ b/src/test/regress/sql/copy2.sql @@ -71,19 +71,27 @@ COPY x from stdin (log_verbosity default, log_verbosity verbose); -- incorrect options COPY x to stdout (format BINARY, delimiter ','); +COPY x to stdout (format RAW, delimiter ','); COPY x to stdout (format BINARY, null 'x'); +COPY x to stdout (format RAW, null 'x'); COPY x from stdin (format BINARY, on_error ignore); COPY x from stdin (on_error unsupported); COPY x to stdout (format TEXT, force_quote(a)); COPY x to stdout (format TEXT, force_quote *); +COPY x to stdout (format RAW, force_quote(a)); +COPY x to stdout (format RAW, force_quote *); COPY x from stdin (format CSV, force_quote(a)); COPY x from stdin (format CSV, force_quote *); COPY x from stdin (format TEXT, force_not_null(a)); COPY x from stdin (format TEXT, force_not_null *); +COPY x from stdin (format RAW, force_not_null(a)); +COPY x from stdin (format RAW, force_not_null *); COPY x to stdout (format CSV, force_not_null(a)); COPY x to stdout (format CSV, force_not_null *); COPY x from stdin (format TEXT, force_null(a)); COPY x from stdin (format TEXT, force_null *); +COPY x from stdin (format RAW, force_null(a)); +COPY x from stdin (format RAW, force_null *); COPY x to stdout (format CSV, force_null(a)); COPY x to stdout (format CSV, force_null *); COPY x to stdout (format BINARY, on_error unsupported); @@ -636,8 +644,9 @@ select id, text_value, ts_value from copy_default; truncate copy_default; --- DEFAULT cannot be used in binary mode +-- DEFAULT cannot be used in binary or raw mode copy copy_default from stdin with (format binary, default '\D'); +copy copy_default from stdin with (format raw, default '\D'); -- DEFAULT cannot be new line nor carriage return copy copy_default from stdin with (default E'\n'); @@ -707,3 +716,35 @@ truncate copy_default; -- DEFAULT cannot be used in COPY TO copy (select 1 as test) TO stdout with (default '\D'); + +-- +-- Test COPY FORMAT errors +-- + +\getenv abs_srcdir PG_ABS_SRCDIR +\getenv abs_builddir PG_ABS_BUILDDIR + +\set filename :abs_builddir '/results/copy_raw_test_errors.data' + +-- Test single column requirement +CREATE TABLE copy_raw_test_errors (col1 text, col2 text); +COPY copy_raw_test_errors TO :'filename' (FORMAT raw); +COPY copy_raw_test_errors (col1, col2) TO :'filename' (FORMAT raw); +COPY copy_raw_test_errors FROM :'filename' (FORMAT raw); +COPY copy_raw_test_errors (col1, col2) FROM :'filename' (FORMAT raw); + +-- Test inconsistent newline style +\set filename :abs_srcdir '/data/newlines_mixed_1.data' +COPY copy_raw_test_errors (col1) FROM :'filename' (FORMAT raw); + +\set filename :abs_srcdir '/data/newlines_mixed_2.data' +COPY copy_raw_test_errors (col1) FROM :'filename' (FORMAT raw); + +\set filename :abs_srcdir '/data/newlines_mixed_3.data' +COPY copy_raw_test_errors (col1) FROM :'filename' (FORMAT raw); + +\set filename :abs_srcdir '/data/newlines_mixed_4.data' +COPY copy_raw_test_errors (col1) FROM :'filename' (FORMAT raw); + +\set filename :abs_srcdir '/data/newlines_mixed_5.data' +COPY copy_raw_test_errors (col1) FROM :'filename' (FORMAT raw); -- 2.45.1