diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 38424ad..1f9e4cc 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -40,7 +40,8 @@ COPY { table_name [ ( quote_character' ESCAPE 'escape_character' FORCE_QUOTE { ( column [, ...] ) | * } - FORCE_NOT_NULL ( column [, ...] ) + FORCE_NOT_NULL ( column [, ...] ) | + ENCODING encoding_name @@ -282,6 +283,18 @@ COPY { table_name [ ( + + ENCODING + + + Specifies that the file is encoded in encoding_name. If this option is + omitted, the current client encoding is used. See the Notes below + for more details. + + + + @@ -377,8 +390,9 @@ COPY count - Input data is interpreted according to the current client encoding, - and output data is encoded in the current client encoding, even + Input data is interpreted according to the ENCODING + option or the current client encoding, and output data is encoded + in the ENCODING option's encoding or the current client encoding, even if the data does not pass through the client but is read from or written to a file directly by the server. diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 841bf22..6ea9372 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -96,9 +96,11 @@ typedef struct CopyStateData bool fe_copy; /* true for all FE copy dests */ bool fe_eof; /* true if detected end of copy data */ EolType eol_type; /* EOL type of input */ - int client_encoding; /* remote side's character encoding */ - bool need_transcoding; /* client encoding diff from server? */ + int target_encoding; /* file or remote side's character encoding */ + bool encoding_option; /* has encoding option? */ + bool need_transcoding; /* encoding diff from server? */ bool encoding_embeds_ascii; /* ASCII can be non-first byte? 
*/ + FmgrInfo *conv_proc; /* encoding conversion proc */ uint64 processed; /* # of tuples processed */ /* parameters from the COPY command */ @@ -811,6 +813,22 @@ DoCopy(const CopyStmt *stmt, const char *queryString) errmsg("conflicting or redundant options"))); cstate->escape = defGetString(defel); } + else if (strcmp(defel->defname, "encoding") == 0) + { + if (cstate->encoding_option) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + if (PG_VALID_ENCODING(pg_char_to_encoding(defGetString(defel)))) + cstate->target_encoding = + pg_char_to_encoding(defGetString(defel)); + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("argument to option \"%s\" must be a valid encoding name", + defel->defname))); + cstate->encoding_option = true; + } else if (strcmp(defel->defname, "force_quote") == 0) { if (force_quote || force_quote_all) @@ -1169,16 +1187,22 @@ DoCopy(const CopyStmt *stmt, const char *queryString) cstate->processed = 0; /* - * Set up encoding conversion info. Even if the client and server - * encodings are the same, we must apply pg_client_to_server() to validate - * data in multibyte encodings. + * Set up encoding conversion info. If the ENCODING option was given, + * use it instead of the client_encoding GUC setting. + */ + if (!cstate->encoding_option) + cstate->target_encoding = pg_get_client_encoding(); + + /* + * Even if the target and server encodings are the same, + * we must apply pg_cached_encoding_conversion() to validate data in + * multibyte encodings. 
*/ - cstate->client_encoding = pg_get_client_encoding(); cstate->need_transcoding = - (cstate->client_encoding != GetDatabaseEncoding() || + (cstate->target_encoding != GetDatabaseEncoding() || pg_database_encoding_max_length() > 1); /* See Multibyte encoding comment above */ - cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding); + cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->target_encoding); cstate->copy_dest = COPY_FILE; /* default */ cstate->filename = stmt->filename; @@ -1346,6 +1370,32 @@ CopyTo(CopyState cstate) /* We use fe_msgbuf as a per-row buffer regardless of copy_dest */ cstate->fe_msgbuf = makeStringInfo(); + /* + * A conversion procedure is only needed when the target and server + * encodings differ and neither is SQL_ASCII; in every other case + * pg_cached_encoding_conversion() returns the data unconverted. + */ + if (cstate->need_transcoding && + cstate->target_encoding != GetDatabaseEncoding() && + cstate->target_encoding != PG_SQL_ASCII && + GetDatabaseEncoding() != PG_SQL_ASCII) + { + Oid funcoid; + + funcoid = FindDefaultConversionProc( + GetDatabaseEncoding(), cstate->target_encoding); + if (!OidIsValid(funcoid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist", + pg_encoding_to_char(GetDatabaseEncoding()), + pg_encoding_to_char(cstate->target_encoding)))); + cstate->conv_proc = (FmgrInfo *) palloc(sizeof(FmgrInfo)); + fmgr_info(funcoid, cstate->conv_proc); + } + else + cstate->conv_proc = NULL; + /* Get info about the columns we need to process. */ cstate->out_functions = (FmgrInfo *) palloc(num_phys_attrs * sizeof(FmgrInfo)); foreach(cur, cstate->attnumlist) @@ -1400,8 +1442,13 @@ CopyTo(CopyState cstate) * encoding, because it will be sent directly with CopySendString. 
*/ if (cstate->need_transcoding) - cstate->null_print_client = pg_server_to_client(cstate->null_print, - cstate->null_print_len); + cstate->null_print_client = + pg_cached_encoding_conversion(cstate->null_print, + cstate->null_print_len, + cstate->conv_proc, + GetDatabaseEncoding(), + cstate->target_encoding, + false); /* if a header has been requested send the line */ if (cstate->header_line) @@ -1949,6 +1996,32 @@ CopyFrom(CopyState cstate) cstate->cur_attname = NULL; cstate->cur_attval = NULL; + /* + * A conversion procedure is only needed when the target and server + * encodings differ and neither is SQL_ASCII; in every other case + * pg_cached_encoding_conversion() only verifies the input. + */ + if (cstate->need_transcoding && + cstate->target_encoding != GetDatabaseEncoding() && + cstate->target_encoding != PG_SQL_ASCII && + GetDatabaseEncoding() != PG_SQL_ASCII) + { + Oid funcoid; + + funcoid = FindDefaultConversionProc( + cstate->target_encoding, GetDatabaseEncoding()); + if (!OidIsValid(funcoid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist", + pg_encoding_to_char(cstate->target_encoding), + pg_encoding_to_char(GetDatabaseEncoding())))); + cstate->conv_proc = (FmgrInfo *) palloc(sizeof(FmgrInfo)); + fmgr_info(funcoid, cstate->conv_proc); + } + else + cstate->conv_proc = NULL; + bistate = GetBulkInsertState(); /* Set up callback to identify error line number */ @@ -2351,8 +2416,12 @@ CopyReadLine(CopyState cstate) { char *cvt; - cvt = pg_client_to_server(cstate->line_buf.data, - cstate->line_buf.len); + cvt = pg_cached_encoding_conversion(cstate->line_buf.data, + cstate->line_buf.len, + cstate->conv_proc, + cstate->target_encoding, + GetDatabaseEncoding(), + true); if (cvt != cstate->line_buf.data) { /* transfer converted data back to line_buf */ @@ -2711,7 +2780,7 @@ not_end_of_copy: mblen_str[0] = c; /* All our encodings only read the first byte to get the length */ - mblen = pg_encoding_mblen(cstate->client_encoding, mblen_str); + mblen = pg_encoding_mblen(cstate->target_encoding, mblen_str); IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1); IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1); raw_buf_ptr += mblen - 1; @@ -3210,7 +3279,11 @@ CopyAttributeOutText(CopyState cstate, char *string) char delimc = cstate->delim[0]; if (cstate->need_transcoding) - 
ptr = pg_server_to_client(string, strlen(string)); + ptr = pg_cached_encoding_conversion(string, strlen(string), + cstate->conv_proc, + GetDatabaseEncoding(), + cstate->target_encoding, + false); else ptr = string; @@ -3283,7 +3356,7 @@ CopyAttributeOutText(CopyState cstate, char *string) start = ptr++; /* we include char in next run */ } else if (IS_HIGHBIT_SET(c)) - ptr += pg_encoding_mblen(cstate->client_encoding, ptr); + ptr += pg_encoding_mblen(cstate->target_encoding, ptr); else ptr++; } @@ -3370,7 +3443,11 @@ CopyAttributeOutCSV(CopyState cstate, char *string, use_quote = true; if (cstate->need_transcoding) - ptr = pg_server_to_client(string, strlen(string)); + ptr = pg_cached_encoding_conversion(string, strlen(string), + cstate->conv_proc, + GetDatabaseEncoding(), + cstate->target_encoding, + false); else ptr = string; @@ -3397,7 +3474,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string, break; } if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii) - tptr += pg_encoding_mblen(cstate->client_encoding, tptr); + tptr += pg_encoding_mblen(cstate->target_encoding, tptr); else tptr++; } @@ -3421,7 +3498,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string, start = ptr; /* we include char in next run */ } if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii) - ptr += pg_encoding_mblen(cstate->client_encoding, ptr); + ptr += pg_encoding_mblen(cstate->target_encoding, ptr); else ptr++; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 456db5c..1f698d1 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -2217,6 +2217,10 @@ copy_opt_item: { $$ = makeDefElem("force_not_null", (Node *)$4); } + | ENCODING ColId_or_Sconst + { + $$ = makeDefElem("encoding", (Node *)makeString($2)); + } ; /* The following exist for backward compatibility with very old versions */ diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index a041812..b38b7a0 100644 --- a/src/backend/utils/mb/mbutils.c +++ 
b/src/backend/utils/mb/mbutils.c @@ -71,8 +71,11 @@ static int pending_client_encoding = PG_SQL_ASCII; /* Internal functions */ -static char *perform_default_encoding_conversion(const char *src, - int len, bool is_client_to_server); +static bool need_conversion(const char *src, int len, + int src_encoding, int dest_encoding, bool check_sanity); +static char *perform_cached_encoding_conversion(const char *src, + int len, FmgrInfo *flinfo, + int src_encoding, int dest_encoding); static int cliplen(const char *str, int len, int limit); @@ -500,50 +503,12 @@ pg_client_to_server(const char *s, int len) Assert(DatabaseEncoding); Assert(ClientEncoding); - if (len <= 0) + if (len <= 0 || !need_conversion(s, len, ClientEncoding->encoding, + DatabaseEncoding->encoding, true)) return (char *) s; - if (ClientEncoding->encoding == DatabaseEncoding->encoding || - ClientEncoding->encoding == PG_SQL_ASCII) - { - /* - * No conversion is needed, but we must still validate the data. - */ - (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false); - return (char *) s; - } - - if (DatabaseEncoding->encoding == PG_SQL_ASCII) - { - /* - * No conversion is possible, but we must still validate the data, - * because the client-side code might have done string escaping using - * the selected client_encoding. If the client encoding is ASCII-safe - * then we just do a straight validation under that encoding. For an - * ASCII-unsafe encoding we have a problem: we dare not pass such data - * to the parser but we have no way to convert it. We compromise by - * rejecting the data if it contains any non-ASCII characters. 
- */ - if (PG_VALID_BE_ENCODING(ClientEncoding->encoding)) - (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false); - else - { - int i; - - for (i = 0; i < len; i++) - { - if (s[i] == '\0' || IS_HIGHBIT_SET(s[i])) - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid byte value for encoding \"%s\": 0x%02x", - pg_enc2name_tbl[PG_SQL_ASCII].name, - (unsigned char) s[i]))); - } - } - return (char *) s; - } - - return perform_default_encoding_conversion(s, len, true); + return perform_cached_encoding_conversion(s, len, ToServerConvProc, + ClientEncoding->encoding, DatabaseEncoding->encoding); } /* @@ -555,43 +520,101 @@ pg_server_to_client(const char *s, int len) Assert(DatabaseEncoding); Assert(ClientEncoding); + if (len <= 0 || !need_conversion(s, len, DatabaseEncoding->encoding, + ClientEncoding->encoding, false)) + return (char *) s; + return perform_cached_encoding_conversion(s, len, ToClientConvProc, + DatabaseEncoding->encoding, ClientEncoding->encoding); +} + +/* + * Convert s with the given conversion FmgrInfo; if check_sanity, verify input that is not converted. + */ +char * +pg_cached_encoding_conversion(const char *s, int len, FmgrInfo *convproc, + int src_encoding, int dest_encoding, bool check_sanity) +{ if (len <= 0) return (char *) s; - if (ClientEncoding->encoding == DatabaseEncoding->encoding || - ClientEncoding->encoding == PG_SQL_ASCII || - DatabaseEncoding->encoding == PG_SQL_ASCII) - return (char *) s; /* assume data is valid */ + if (!need_conversion(s, len, src_encoding, dest_encoding, check_sanity)) + return (char *) s; - return perform_default_encoding_conversion(s, len, false); + return perform_cached_encoding_conversion(s, len, convproc, + src_encoding, dest_encoding); } /* + * Returns true if conversion is needed. check_sanity = true means + * the source string must be verified even when no conversion is performed. 
+ */ +static bool +need_conversion(const char *s, int len, int src_encoding, int dest_encoding, + bool check_sanity) +{ + if (check_sanity) + { + if (src_encoding == dest_encoding || src_encoding == PG_SQL_ASCII) + { + /* + * No conversion is needed, but the data must be valid in dest_encoding. + */ + (void) pg_verify_mbstr(dest_encoding, s, len, false); + return false; + } + + if (dest_encoding == PG_SQL_ASCII) + { + /* + * No conversion is possible, but we must still validate the + * data, because the client-side code might have done string + * escaping using the selected client_encoding. If the client + * encoding is ASCII-safe then we just do a straight validation + * under that encoding. For an ASCII-unsafe encoding we have a + * problem: we dare not pass such data to the parser but we have + * no way to convert it. We compromise by rejecting the data + * if it contains any non-ASCII characters. + */ + if (PG_VALID_BE_ENCODING(src_encoding)) + (void) pg_verify_mbstr(src_encoding, s, len, false); + else + { + int i; + + for (i = 0; i < len; i++) + { + if (s[i] == '\0' || IS_HIGHBIT_SET(s[i])) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid byte value for encoding \"%s\": 0x%02x", + pg_enc2name_tbl[PG_SQL_ASCII].name, + (unsigned char) s[i]))); + } + } + return false; + } + } + else + { + if (src_encoding == dest_encoding || + src_encoding == PG_SQL_ASCII || + dest_encoding == PG_SQL_ASCII) + return false; + } + + return true; +} +/* * Perform default encoding conversion using cached FmgrInfo. Since * this function does not access database at all, it is safe to call * outside transactions. If the conversion has not been set up by * SetClientEncoding(), no conversion is performed. 
*/ static char * -perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server) +perform_cached_encoding_conversion(const char *src, int len, + FmgrInfo *flinfo, int src_encoding, int dest_encoding) { char *result; - int src_encoding, - dest_encoding; - FmgrInfo *flinfo; - - if (is_client_to_server) - { - src_encoding = ClientEncoding->encoding; - dest_encoding = DatabaseEncoding->encoding; - flinfo = ToServerConvProc; - } - else - { - src_encoding = DatabaseEncoding->encoding; - dest_encoding = ClientEncoding->encoding; - flinfo = ToClientConvProc; - } if (flinfo == NULL) return (char *) src; diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index f110723..576773e 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -21,6 +21,10 @@ #include +#ifndef FRONTEND +#include +#endif + /* * The pg_wchar type */ @@ -421,6 +425,12 @@ extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len, extern char *pg_client_to_server(const char *s, int len); extern char *pg_server_to_client(const char *s, int len); +#ifndef FRONTEND +extern char *pg_cached_encoding_conversion(const char *s, int len, + FmgrInfo *convproc, int src_encoding, + int dest_encoding, bool check_sanity); +#endif + extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc); extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc); diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out index 15cbe02..88d7d16 100644 --- a/src/test/regress/expected/copy2.out +++ b/src/test/regress/expected/copy2.out @@ -46,10 +46,10 @@ CONTEXT: COPY x, line 1: "2001 231 \N \N" COPY x from stdin; ERROR: extra data after last expected column CONTEXT: COPY x, line 1: "2002 232 40 50 60 70 80" --- various COPY options: delimiters, oids, NULL string +-- various COPY options: delimiters, oids, NULL string, encoding COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x'; COPY x from stdin 
WITH DELIMITER AS ';' NULL AS ''; -COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X'; +COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING sql_ascii; -- check results of copy in SELECT * FROM x; a | b | c | d | e @@ -187,7 +187,7 @@ COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|'; Jackson, Sam|\h It is "perfect".| ''| -COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\'; +COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING sql_ascii; "Jackson, Sam","\\h" "It is \"perfect\"."," " "", diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql index c2e8b03..d2683d1 100644 --- a/src/test/regress/sql/copy2.sql +++ b/src/test/regress/sql/copy2.sql @@ -72,7 +72,7 @@ COPY x from stdin; 2002 232 40 50 60 70 80 \. --- various COPY options: delimiters, oids, NULL string +-- various COPY options: delimiters, oids, NULL string, encoding COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x'; 500000,x,45,80,90 500001,x,\x,\\x,\\\x @@ -83,7 +83,7 @@ COPY x from stdin WITH DELIMITER AS ';' NULL AS ''; 3000;;c;; \. -COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X'; +COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING sql_ascii; 4000:\X:C:\X:\X 4001:1:empty:: 4002:2:null:\X:\X @@ -127,7 +127,7 @@ INSERT INTO y VALUES ('', NULL); COPY y TO stdout WITH CSV; COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|'; -COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\'; +COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING sql_ascii; COPY y TO stdout WITH CSV FORCE QUOTE *; -- Repeat above tests with new 9.0 option syntax