diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml
index 38424ad..1f9e4cc 100644
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -40,7 +40,8 @@ COPY { table_name [ ( quote_character'
ESCAPE 'escape_character'
FORCE_QUOTE { ( column [, ...] ) | * }
- FORCE_NOT_NULL ( column [, ...] )
+ FORCE_NOT_NULL ( column [, ...] ) |
+ ENCODING encoding_name
@@ -282,6 +283,18 @@ COPY { table_name [ (
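+   <varlistentry>
+    <term><literal>ENCODING</literal></term>
+    <listitem>
+     <para>
+      Specifies that the file is encoded in <replaceable class="parameter">encoding_name</replaceable>. If this option is
+      omitted, the current client encoding is used. See the Notes below
+      for more details.
+     </para>
+    </listitem>
+   </varlistentry>
+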
@@ -377,8 +390,9 @@ COPY count
- Input data is interpreted according to the current client encoding,
- and output data is encoded in the current client encoding, even
+ Input data is interpreted according to the ENCODING
+ option or the current client encoding, and output data is encoded
+ in ENCODING or the current client encoding, even
if the data does not pass through the client but is read from or
written to a file directly by the server.
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 841bf22..6ea9372 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -96,9 +96,11 @@ typedef struct CopyStateData
bool fe_copy; /* true for all FE copy dests */
bool fe_eof; /* true if detected end of copy data */
EolType eol_type; /* EOL type of input */
- int client_encoding; /* remote side's character encoding */
- bool need_transcoding; /* client encoding diff from server? */
+ int target_encoding; /* file or remote side's character encoding */
+ bool encoding_option; /* has encoding option? */
+ bool need_transcoding; /* encoding diff from server? */
bool encoding_embeds_ascii; /* ASCII can be non-first byte? */
+ FmgrInfo *conv_proc; /* encoding conversion proc */
uint64 processed; /* # of tuples processed */
/* parameters from the COPY command */
@@ -811,6 +813,22 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
errmsg("conflicting or redundant options")));
cstate->escape = defGetString(defel);
}
+ else if (strcmp(defel->defname, "encoding") == 0)
+ {
+ if (cstate->encoding_option)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("conflicting or redundant options")));
+ if (PG_VALID_ENCODING(pg_char_to_encoding(defGetString(defel))))
+ cstate->target_encoding =
+ pg_char_to_encoding(defGetString(defel));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("argument to option \"%s\" must be a valid encoding name",
+ defel->defname)));
+ cstate->encoding_option = true;
+ }
else if (strcmp(defel->defname, "force_quote") == 0)
{
if (force_quote || force_quote_all)
@@ -1169,16 +1187,22 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
cstate->processed = 0;
/*
- * Set up encoding conversion info. Even if the client and server
- * encodings are the same, we must apply pg_client_to_server() to validate
- * data in multibyte encodings.
+ * Set up encoding conversion info. If the ENCODING option was
+ * specified, use it instead of the client_encoding GUC setting.
+ */
+ if (!cstate->encoding_option)
+ cstate->target_encoding = pg_get_client_encoding();
+
+ /*
+ * Even if the client and server encodings are the same,
+ * we must apply pg_client_to_server() to validate data in
+ * multibyte encodings.
*/
- cstate->client_encoding = pg_get_client_encoding();
cstate->need_transcoding =
- (cstate->client_encoding != GetDatabaseEncoding() ||
+ (cstate->target_encoding != GetDatabaseEncoding() ||
pg_database_encoding_max_length() > 1);
/* See Multibyte encoding comment above */
- cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding);
+ cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->target_encoding);
cstate->copy_dest = COPY_FILE; /* default */
cstate->filename = stmt->filename;
@@ -1346,6 +1370,24 @@ CopyTo(CopyState cstate)
/* We use fe_msgbuf as a per-row buffer regardless of copy_dest */
cstate->fe_msgbuf = makeStringInfo();
+ /*
+ * No conversion procedure is needed when the target and server
+ * encodings match, even though need_transcoding may still be set
+ * in that case (it is also set for multibyte server encodings).
+ */
+ if (cstate->need_transcoding &&
+ cstate->target_encoding != GetDatabaseEncoding())
+ {
+ Oid funcoid;
+
+ funcoid = FindDefaultConversionProc(
+ GetDatabaseEncoding(), cstate->target_encoding);
+ cstate->conv_proc = (FmgrInfo *) palloc(sizeof(FmgrInfo));
+ fmgr_info(funcoid, cstate->conv_proc);
+ }
+ else
+ cstate->conv_proc = NULL;
+
/* Get info about the columns we need to process. */
cstate->out_functions = (FmgrInfo *) palloc(num_phys_attrs * sizeof(FmgrInfo));
foreach(cur, cstate->attnumlist)
@@ -1400,8 +1442,13 @@ CopyTo(CopyState cstate)
* encoding, because it will be sent directly with CopySendString.
*/
if (cstate->need_transcoding)
- cstate->null_print_client = pg_server_to_client(cstate->null_print,
- cstate->null_print_len);
+ cstate->null_print_client =
+ pg_cached_encoding_conversion(cstate->null_print,
+ cstate->null_print_len,
+ cstate->conv_proc,
+ GetDatabaseEncoding(),
+ cstate->target_encoding,
+ false);
/* if a header has been requested send the line */
if (cstate->header_line)
@@ -1949,6 +1996,24 @@ CopyFrom(CopyState cstate)
cstate->cur_attname = NULL;
cstate->cur_attval = NULL;
+ /*
+ * No conversion procedure is needed when the target and server
+ * encodings match, but need_transcoding may still be set because
+ * the input must be verified in multibyte server encodings.
+ */
+ if (cstate->need_transcoding &&
+ cstate->target_encoding != GetDatabaseEncoding())
+ {
+ Oid funcoid;
+
+ funcoid = FindDefaultConversionProc(
+ cstate->target_encoding, GetDatabaseEncoding());
+ cstate->conv_proc = (FmgrInfo *) palloc(sizeof(FmgrInfo));
+ fmgr_info(funcoid, cstate->conv_proc);
+ }
+ else
+ cstate->conv_proc = NULL;
+
bistate = GetBulkInsertState();
/* Set up callback to identify error line number */
@@ -2351,8 +2416,12 @@ CopyReadLine(CopyState cstate)
{
char *cvt;
- cvt = pg_client_to_server(cstate->line_buf.data,
- cstate->line_buf.len);
+ cvt = pg_cached_encoding_conversion(cstate->line_buf.data,
+ cstate->line_buf.len,
+ cstate->conv_proc,
+ cstate->target_encoding,
+ GetDatabaseEncoding(),
+ true);
if (cvt != cstate->line_buf.data)
{
/* transfer converted data back to line_buf */
@@ -2711,7 +2780,7 @@ not_end_of_copy:
mblen_str[0] = c;
/* All our encodings only read the first byte to get the length */
- mblen = pg_encoding_mblen(cstate->client_encoding, mblen_str);
+ mblen = pg_encoding_mblen(cstate->target_encoding, mblen_str);
IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
raw_buf_ptr += mblen - 1;
@@ -3210,7 +3279,11 @@ CopyAttributeOutText(CopyState cstate, char *string)
char delimc = cstate->delim[0];
if (cstate->need_transcoding)
- ptr = pg_server_to_client(string, strlen(string));
+ ptr = pg_cached_encoding_conversion(string, strlen(string),
+ cstate->conv_proc,
+ GetDatabaseEncoding(),
+ cstate->target_encoding,
+ false);
else
ptr = string;
@@ -3283,7 +3356,7 @@ CopyAttributeOutText(CopyState cstate, char *string)
start = ptr++; /* we include char in next run */
}
else if (IS_HIGHBIT_SET(c))
- ptr += pg_encoding_mblen(cstate->client_encoding, ptr);
+ ptr += pg_encoding_mblen(cstate->target_encoding, ptr);
else
ptr++;
}
@@ -3370,7 +3443,11 @@ CopyAttributeOutCSV(CopyState cstate, char *string,
use_quote = true;
if (cstate->need_transcoding)
- ptr = pg_server_to_client(string, strlen(string));
+ ptr = pg_cached_encoding_conversion(string, strlen(string),
+ cstate->conv_proc,
+ GetDatabaseEncoding(),
+ cstate->target_encoding,
+ false);
else
ptr = string;
@@ -3397,7 +3474,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string,
break;
}
if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
- tptr += pg_encoding_mblen(cstate->client_encoding, tptr);
+ tptr += pg_encoding_mblen(cstate->target_encoding, tptr);
else
tptr++;
}
@@ -3421,7 +3498,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string,
start = ptr; /* we include char in next run */
}
if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
- ptr += pg_encoding_mblen(cstate->client_encoding, ptr);
+ ptr += pg_encoding_mblen(cstate->target_encoding, ptr);
else
ptr++;
}
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 456db5c..1f698d1 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -2217,6 +2217,10 @@ copy_opt_item:
{
$$ = makeDefElem("force_not_null", (Node *)$4);
}
+ | ENCODING ColId_or_Sconst
+ {
+ $$ = makeDefElem("encoding", (Node *)makeString($2));
+ }
;
/* The following exist for backward compatibility with very old versions */
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index a041812..b38b7a0 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -71,8 +71,11 @@ static int pending_client_encoding = PG_SQL_ASCII;
/* Internal functions */
-static char *perform_default_encoding_conversion(const char *src,
- int len, bool is_client_to_server);
+static bool need_conversion(const char *src, int len,
+ int src_encoding, int dest_encoding, bool check_sanity);
+static char *perform_cached_encoding_conversion(const char *src,
+ int len, FmgrInfo *flinfo,
+ int src_encoding, int dest_encoding);
static int cliplen(const char *str, int len, int limit);
@@ -500,50 +503,12 @@ pg_client_to_server(const char *s, int len)
Assert(DatabaseEncoding);
Assert(ClientEncoding);
- if (len <= 0)
+ if (!need_conversion(s, len, ClientEncoding->encoding,
+ DatabaseEncoding->encoding, true))
return (char *) s;
- if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
- ClientEncoding->encoding == PG_SQL_ASCII)
- {
- /*
- * No conversion is needed, but we must still validate the data.
- */
- (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
- return (char *) s;
- }
-
- if (DatabaseEncoding->encoding == PG_SQL_ASCII)
- {
- /*
- * No conversion is possible, but we must still validate the data,
- * because the client-side code might have done string escaping using
- * the selected client_encoding. If the client encoding is ASCII-safe
- * then we just do a straight validation under that encoding. For an
- * ASCII-unsafe encoding we have a problem: we dare not pass such data
- * to the parser but we have no way to convert it. We compromise by
- * rejecting the data if it contains any non-ASCII characters.
- */
- if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
- (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
- else
- {
- int i;
-
- for (i = 0; i < len; i++)
- {
- if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
- ereport(ERROR,
- (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- errmsg("invalid byte value for encoding \"%s\": 0x%02x",
- pg_enc2name_tbl[PG_SQL_ASCII].name,
- (unsigned char) s[i])));
- }
- }
- return (char *) s;
- }
-
- return perform_default_encoding_conversion(s, len, true);
+ return perform_cached_encoding_conversion(s, len, ToServerConvProc,
+ ClientEncoding->encoding, DatabaseEncoding->encoding);
}
/*
@@ -555,43 +520,101 @@ pg_server_to_client(const char *s, int len)
Assert(DatabaseEncoding);
Assert(ClientEncoding);
+ if (!need_conversion(s, len, DatabaseEncoding->encoding,
+ ClientEncoding->encoding, false))
+ return (char *) s;
+ return perform_cached_encoding_conversion(s, len, ToClientConvProc,
+ DatabaseEncoding->encoding, ClientEncoding->encoding);
+}
+
+/*
+ * Convert s between the given encodings using the caller-supplied conversion proc; check_sanity requests validation of input that needs no conversion.
+ */
+char *
+pg_cached_encoding_conversion(const char *s, int len, FmgrInfo *convproc,
+ int src_encoding, int dest_encoding, bool check_sanity)
+{
if (len <= 0)
return (char *) s;
- if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
- ClientEncoding->encoding == PG_SQL_ASCII ||
- DatabaseEncoding->encoding == PG_SQL_ASCII)
- return (char *) s; /* assume data is valid */
+ if (!need_conversion(s, len, src_encoding, dest_encoding, check_sanity))
+ return (char *) s;
- return perform_default_encoding_conversion(s, len, false);
+ return perform_cached_encoding_conversion(s, len, convproc,
+ src_encoding, dest_encoding);
}
/*
+ * Returns true if s (len bytes) must be converted from src_encoding to
+ * dest_encoding, false if it can be used as is (empty input, identical
+ * encodings, or SQL_ASCII on either side). With check_sanity = true, s is
+ * also validated on the paths where no conversion is performed.
+ */
+static bool
+need_conversion(const char *s, int len, int src_encoding, int dest_encoding,
+ bool check_sanity)
+{
+ if (len <= 0)
+ return false; /* empty input: nothing to validate or convert */
+ if (check_sanity)
+ {
+ if (src_encoding == dest_encoding || src_encoding == PG_SQL_ASCII)
+ {
+ /* No conversion; data ends up in dest_encoding, so validate as that. */
+ (void) pg_verify_mbstr(dest_encoding, s, len, false);
+ return false;
+ }
+
+ if (dest_encoding == PG_SQL_ASCII)
+ {
+ /*
+ * No conversion is possible, but we must still validate the
+ * data, because the client-side code might have done string
+ * escaping using the selected client_encoding. An ASCII-safe
+ * source encoding is validated as itself. For an ASCII-unsafe
+ * one we dare not pass such data to the parser and cannot
+ * convert it, so reject any non-ASCII (or zero) byte.
+ */
+ if (PG_VALID_BE_ENCODING(src_encoding))
+ (void) pg_verify_mbstr(src_encoding, s, len, false);
+ else
+ {
+ int i;
+
+ for (i = 0; i < len; i++)
+ {
+ if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ errmsg("invalid byte value for encoding \"%s\": 0x%02x",
+ pg_enc2name_tbl[PG_SQL_ASCII].name,
+ (unsigned char) s[i])));
+ }
+ }
+ return false;
+ }
+ }
+ else
+ {
+ if (src_encoding == dest_encoding ||
+ src_encoding == PG_SQL_ASCII ||
+ dest_encoding == PG_SQL_ASCII)
+ return false;
+ }
+
+ return true;
+}
+/*
* Perform default encoding conversion using cached FmgrInfo. Since
* this function does not access database at all, it is safe to call
* outside transactions. If the conversion has not been set up by
* SetClientEncoding(), no conversion is performed.
*/
static char *
-perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
+perform_cached_encoding_conversion(const char *src, int len,
+ FmgrInfo *flinfo, int src_encoding, int dest_encoding)
{
char *result;
- int src_encoding,
- dest_encoding;
- FmgrInfo *flinfo;
-
- if (is_client_to_server)
- {
- src_encoding = ClientEncoding->encoding;
- dest_encoding = DatabaseEncoding->encoding;
- flinfo = ToServerConvProc;
- }
- else
- {
- src_encoding = DatabaseEncoding->encoding;
- dest_encoding = ClientEncoding->encoding;
- flinfo = ToClientConvProc;
- }
if (flinfo == NULL)
return (char *) src;
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index f110723..576773e 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -21,6 +21,10 @@
#include
+#ifndef FRONTEND
+#include
+#endif
+
/*
* The pg_wchar type
*/
@@ -421,6 +425,12 @@ extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
extern char *pg_client_to_server(const char *s, int len);
extern char *pg_server_to_client(const char *s, int len);
+#ifndef FRONTEND
+extern char *pg_cached_encoding_conversion(const char *s, int len,
+ FmgrInfo *convproc, int src_encoding,
+ int dest_encoding, bool check_sanity);
+#endif
+
extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out
index 15cbe02..88d7d16 100644
--- a/src/test/regress/expected/copy2.out
+++ b/src/test/regress/expected/copy2.out
@@ -46,10 +46,10 @@ CONTEXT: COPY x, line 1: "2001 231 \N \N"
COPY x from stdin;
ERROR: extra data after last expected column
CONTEXT: COPY x, line 1: "2002 232 40 50 60 70 80"
--- various COPY options: delimiters, oids, NULL string
+-- various COPY options: delimiters, oids, NULL string, encoding
COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x';
COPY x from stdin WITH DELIMITER AS ';' NULL AS '';
-COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X';
+COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING sql_ascii;
-- check results of copy in
SELECT * FROM x;
a | b | c | d | e
@@ -187,7 +187,7 @@ COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|';
Jackson, Sam|\h
It is "perfect".|
''|
-COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\';
+COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING sql_ascii;
"Jackson, Sam","\\h"
"It is \"perfect\"."," "
"",
diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql
index c2e8b03..d2683d1 100644
--- a/src/test/regress/sql/copy2.sql
+++ b/src/test/regress/sql/copy2.sql
@@ -72,7 +72,7 @@ COPY x from stdin;
2002 232 40 50 60 70 80
\.
--- various COPY options: delimiters, oids, NULL string
+-- various COPY options: delimiters, oids, NULL string, encoding
COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x';
500000,x,45,80,90
500001,x,\x,\\x,\\\x
@@ -83,7 +83,7 @@ COPY x from stdin WITH DELIMITER AS ';' NULL AS '';
3000;;c;;
\.
-COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X';
+COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING sql_ascii;
4000:\X:C:\X:\X
4001:1:empty::
4002:2:null:\X:\X
@@ -127,7 +127,7 @@ INSERT INTO y VALUES ('', NULL);
COPY y TO stdout WITH CSV;
COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|';
-COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\';
+COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING sql_ascii;
COPY y TO stdout WITH CSV FORCE QUOTE *;
-- Repeat above tests with new 9.0 option syntax