From f2d8bfdaab6c1ac815764c658e8144a807e578d3 Mon Sep 17 00:00:00 2001
From: Ayush Tiwari
Date: Mon, 20 Apr 2026 11:16:36 +0530
Subject: [PATCH v3] Apply encoding conversion in COPY TO FORMAT JSON

CopyToJsonOneRow() sent the output of composite_to_json() directly via
CopySendData() without encoding conversion. The text and CSV paths
convert per-attribute via pg_server_to_any() when need_transcoding is
true, but the JSON path skipped this entirely.

This meant COPY ... TO ... WITH (FORMAT json, ENCODING 'LATIN1') on a
UTF-8 server silently produced UTF-8 output, and COPY TO STDOUT with a
non-UTF-8 client_encoding would send unconverted bytes.

Apply pg_server_to_any() to the whole JSON buffer after
composite_to_json() returns, converting to the requested file encoding
when it differs from the server encoding.

Introduced by 7dadd38cda9 (json format for COPY TO).
---
 src/backend/commands/copyto.c              | 20 +++++++++++++++++++-
 src/test/regress/expected/copyencoding.out | 21 +++++++++++++++++++++
 src/test/regress/sql/copyencoding.sql      | 12 ++++++++++++
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c
index 85d15353647..bb28c15339d 100644
--- a/src/backend/commands/copyto.c
+++ b/src/backend/commands/copyto.c
@@ -427,7 +427,25 @@ CopyToJsonOneRow(CopyToState cstate, TupleTableSlot *slot)
 		}
 	}
 
-	CopySendData(cstate, cstate->json_buf->data, cstate->json_buf->len);
+	/*
+	 * Convert the JSON output to the target encoding if needed. Unlike the
+	 * text and CSV paths which convert per-attribute via CopyAttributeOut*,
+	 * composite_to_json() emits the whole row as one buffer, so we transcode
+	 * it here in a single call before sending.
+	 */
+	if (cstate->need_transcoding)
+	{
+		char	   *converted;
+
+		converted = pg_server_to_any(cstate->json_buf->data,
+									 cstate->json_buf->len,
+									 cstate->file_encoding);
+		CopySendData(cstate, converted, strlen(converted));
+		if (converted != cstate->json_buf->data)
+			pfree(converted);
+	}
+	else
+		CopySendData(cstate, cstate->json_buf->data, cstate->json_buf->len);
 
 	CopySendTextLikeEndOfRow(cstate);
 }
diff --git a/src/test/regress/expected/copyencoding.out b/src/test/regress/expected/copyencoding.out
index 76ea0e7cf04..fc6ea54b232 100644
--- a/src/test/regress/expected/copyencoding.out
+++ b/src/test/regress/expected/copyencoding.out
@@ -50,4 +50,25 @@ COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
 ERROR:  invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
 CONTEXT:  COPY copy_encoding_tab, line 1
 RESET client_encoding;
+-- JSON format encoding conversion
+\set json_latin1 :abs_builddir '/results/copyencoding_json_latin1.json'
+COPY (SELECT E'\u00e9' AS c) TO :'json_latin1' WITH (FORMAT json, ENCODING 'LATIN1');
+-- Verify the file contains LATIN1 'é' (single byte 0xe9) and not UTF-8 (0xc3 0xa9).
+-- Done as separate position checks to stay independent of the platform's
+-- end-of-line convention.
+SELECT position('\xe9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_latin1_e9,
+       position('\xc3a9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_utf8_e9;
+ has_latin1_e9 | has_utf8_e9 
+---------------+-------------
+ t             | f
+(1 row)
+
+-- Round-trip back through COPY FROM with the same encoding.
+COPY copy_encoding_tab FROM :'json_latin1' WITH (ENCODING 'LATIN1');
+SELECT * FROM copy_encoding_tab WHERE t LIKE '%é%';
+     t     
+-----------
+ {"c":"é"}
+(1 row)
+
 DROP TABLE copy_encoding_tab;
diff --git a/src/test/regress/sql/copyencoding.sql b/src/test/regress/sql/copyencoding.sql
index 64718245b94..51b16abf42b 100644
--- a/src/test/regress/sql/copyencoding.sql
+++ b/src/test/regress/sql/copyencoding.sql
@@ -57,4 +57,16 @@ SET client_encoding TO EUC_JP;
 COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
 RESET client_encoding;
 
+-- JSON format encoding conversion
+\set json_latin1 :abs_builddir '/results/copyencoding_json_latin1.json'
+COPY (SELECT E'\u00e9' AS c) TO :'json_latin1' WITH (FORMAT json, ENCODING 'LATIN1');
+-- Verify the file contains LATIN1 'é' (single byte 0xe9) and not UTF-8 (0xc3 0xa9).
+-- Done as separate position checks to stay independent of the platform's
+-- end-of-line convention.
+SELECT position('\xe9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_latin1_e9,
+       position('\xc3a9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_utf8_e9;
+-- Round-trip back through COPY FROM with the same encoding.
+COPY copy_encoding_tab FROM :'json_latin1' WITH (ENCODING 'LATIN1');
+SELECT * FROM copy_encoding_tab WHERE t LIKE '%é%';
+
 DROP TABLE copy_encoding_tab;
-- 
2.34.1