zstd compression for pg_dump
This is a draft patch - review is welcome and would help to get this
ready to be considererd for v16, if desired.
I'm going to add this thread to the old CF entry.
https://commitfest.postgresql.org/31/2888/
--
Justin
Attachments:
0001-WIP-pg_dump-zstd-compression.patchtext/x-diff; charset=us-asciiDownload
From 2486417b7c3586e150e806a1fbc3b873c2a4a0f9 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 7 Jan 2023 15:45:06 -0600
Subject: [PATCH 1/3] WIP: pg_dump: zstd compression
Previously proposed at: 20201221194924.GI30237@telsasoft.com
note-to-self: see also: private commit 36ab001fb
---
src/bin/pg_dump/Makefile | 1 +
src/bin/pg_dump/compress_io.c | 54 +--
src/bin/pg_dump/compress_zstd.c | 520 ++++++++++++++++++++++++++
src/bin/pg_dump/compress_zstd.h | 9 +
src/bin/pg_dump/meson.build | 1 +
src/bin/pg_dump/pg_backup_archiver.c | 9 +-
src/bin/pg_dump/pg_backup_directory.c | 2 +
src/bin/pg_dump/pg_dump.c | 13 -
src/bin/pg_dump/t/002_pg_dump.pl | 71 ++++
9 files changed, 640 insertions(+), 40 deletions(-)
create mode 100644 src/bin/pg_dump/compress_zstd.c
create mode 100644 src/bin/pg_dump/compress_zstd.h
diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile
index eb8f59459a1..76574298faf 100644
--- a/src/bin/pg_dump/Makefile
+++ b/src/bin/pg_dump/Makefile
@@ -29,6 +29,7 @@ OBJS = \
compress_io.o \
compress_lz4.o \
compress_none.o \
+ compress_zstd.o \
dumputils.o \
parallel.o \
pg_backup_archiver.o \
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c
index ce06f1eac9c..061e3d9ce1c 100644
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -52,8 +52,8 @@
*
* InitDiscoverCompressFileHandle tries to infer the compression by the
* filename suffix. If the suffix is not yet known then it tries to simply
- * open the file and if it fails, it tries to open the same file with the .gz
- * suffix, and then again with the .lz4 suffix.
+ * open the file and if it fails, it tries to open the same file with
+ * compressed suffixes.
*
* IDENTIFICATION
* src/bin/pg_dump/compress_io.c
@@ -69,6 +69,7 @@
#include "compress_io.h"
#include "compress_lz4.h"
#include "compress_none.h"
+#include "compress_zstd.h"
#include "pg_backup_utils.h"
/*----------------------
@@ -98,6 +99,10 @@ supports_compression(const pg_compress_specification compression_spec)
if (algorithm == PG_COMPRESSION_LZ4)
supported = true;
#endif
+#ifdef USE_ZSTD
+ if (algorithm == PG_COMPRESSION_ZSTD)
+ supported = true;
+#endif
if (!supported)
return psprintf("this build does not support compression with %s",
@@ -130,6 +135,8 @@ AllocateCompressor(const pg_compress_specification compression_spec,
InitCompressorGzip(cs, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressorLZ4(cs, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressorZstd(cs, compression_spec);
return cs;
}
@@ -196,25 +203,36 @@ InitCompressFileHandle(const pg_compress_specification compression_spec)
InitCompressFileHandleGzip(CFH, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressFileHandleLZ4(CFH, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressFileHandleZstd(CFH, compression_spec);
return CFH;
}
+static bool
+check_compressed_file(const char *path, char **fname, char *ext)
+{
+ free_keep_errno(*fname);
+ *fname = psprintf("%s.%s", path, ext);
+ return (access(*fname, F_OK) == 0);
+}
+
/*
* Open a file for reading. 'path' is the file to open, and 'mode' should
* be either "r" or "rb".
*
* If the file at 'path' contains the suffix of a supported compression method,
- * currently this includes ".gz" and ".lz4", then this compression will be used
+ * currently this includes ".gz", ".lz4" and ".zst", then this compression will be used
* throughout. Otherwise the compression will be inferred by iteratively trying
* to open the file at 'path', first as is, then by appending known compression
* suffixes. So if you pass "foo" as 'path', this will open either "foo" or
- * "foo.gz" or "foo.lz4", trying in that order.
+ * "foo.{gz,lz4,zst}", trying in that order.
*
* On failure, return NULL with an error code in errno.
*/
CompressFileHandle *
InitDiscoverCompressFileHandle(const char *path, const char *mode)
+// pg_compress_algorithm alg
{
CompressFileHandle *CFH = NULL;
struct stat st;
@@ -237,28 +255,12 @@ InitDiscoverCompressFileHandle(const char *path, const char *mode)
/* avoid unused warning if it is not built with compression */
if (exists)
compression_spec.algorithm = PG_COMPRESSION_NONE;
-#ifdef HAVE_LIBZ
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.gz", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_GZIP;
- }
-#endif
-#ifdef USE_LZ4
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.lz4", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_LZ4;
- }
-#endif
+ else if (check_compressed_file(path, &fname, "gz")) // alg == PG_COMPRESSION_GZIP &&
+ compression_spec.algorithm = PG_COMPRESSION_GZIP;
+ else if (check_compressed_file(path, &fname, "lz4")) // alg == PG_COMPRESSION_LZ4 &&
+ compression_spec.algorithm = PG_COMPRESSION_LZ4;
+ else if (check_compressed_file(path, &fname, "zst")) // alg == PG_COMPRESSION_ZSTD &&
+ compression_spec.algorithm = PG_COMPRESSION_ZSTD;
}
CFH = InitCompressFileHandle(compression_spec);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
new file mode 100644
index 00000000000..659d08533ae
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -0,0 +1,520 @@
+#include "postgres_fe.h"
+
+#include "pg_backup_utils.h"
+#include "compress_zstd.h"
+
+#ifndef USE_ZSTD
+
+void
+InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+#else
+
+#include <zstd.h>
+
+typedef struct ZstdCompressorState
+{
+ /* This is a normal file to which we read/write compressed data */
+ FILE *fp;
+ /* XXX: use one separate ZSTD_CStream per thread: disable on windows ? */
+ ZSTD_CStream *cstream;
+ ZSTD_DStream *dstream;
+ ZSTD_outBuffer output;
+ ZSTD_inBuffer input;
+} ZstdCompressorState;
+
+static ZSTD_CStream *ZstdCStreamParams(pg_compress_specification compress);
+static void EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs);
+static void WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen);
+static void ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs);
+
+static void
+ZSTD_CCtx_setParam_or_die(ZSTD_CStream * cstream,
+ ZSTD_cParameter param, int value)
+{
+ size_t res;
+
+ res = ZSTD_CCtx_setParameter(cstream, param, value);
+ if (ZSTD_isError(res))
+ pg_fatal("could not set compression parameter: %s",
+ ZSTD_getErrorName(res));
+}
+
+/* Return a compression stream with parameters set per argument */
+static ZSTD_CStream *
+ZstdCStreamParams(pg_compress_specification compress)
+{
+ ZSTD_CStream *cstream;
+
+ cstream = ZSTD_createCStream();
+ if (cstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
+ compress.level);
+
+ if (compress.options & PG_COMPRESSION_OPTION_WORKERS)
+ ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
+ compress.workers);
+
+#if 0
+ if (compress.options & PG_COMPRESSION_OPTION_CHECKSUM)
+ ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_checksumFlag,
+ compress.checksum);
+
+ /* Still marked as experimental */
+ if (compress.options & PG_COMPRESSION_OPTION_RSYNCABLE)
+ ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_rsyncable, 1);
+#endif
+
+ return cstream;
+}
+
+void
+EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ if (cs->writeF == NULL)
+ return;
+
+ for (;;)
+ {
+ size_t res;
+
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output,
+ &zstdcs->input, ZSTD_e_end);
+
+ if (output->pos > 0)
+ cs->writeF(AH, output->dst, output->pos);
+ /* TODO check that we wrote "pos" bytes */
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not close compression stream: %s",
+ ZSTD_getErrorName(res));
+
+ if (res == 0)
+ break;
+ }
+
+ /* XXX: retval */
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ pg_free(zstdcs);
+}
+
+static void
+WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ input->src = data;
+ input->size = dLen;
+ input->pos = 0;
+
+#if 0
+ ZSTD_CCtx_reset(zstdcs->cstream, ZSTD_reset_session_only); // XXX */
+ res = ZSTD_CCtx_setPledgedSrcSize(cs->zstd.cstream, dLen);
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+#endif
+
+ while (input->pos != input->size)
+ {
+ size_t res;
+
+ res = ZSTD_compressStream2(zstdcs->cstream, output,
+ input, ZSTD_e_continue);
+
+ if (output->pos == output->size ||
+ input->pos != input->size)
+ {
+ /*
+ * Extra paranoia: avoid zero-length chunks, since a zero length
+ * chunk is the EOF marker in the custom format. This should never
+ * happen but...
+ */
+ if (output->pos > 0)
+ cs->writeF(AH, output->dst, output->pos);
+
+ output->pos = 0;
+ }
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+ }
+}
+
+/* Read data from a compressed zstd archive */
+static void
+ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZSTD_DStream *dstream;
+ ZSTD_outBuffer output;
+ ZSTD_inBuffer input;
+ size_t res;
+ size_t input_size;
+
+ dstream = ZSTD_createDStream();
+ if (dstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ input_size = ZSTD_DStreamInSize();
+ input.src = pg_malloc(input_size);
+
+ output.size = ZSTD_DStreamOutSize();
+ output.dst = pg_malloc(output.size);
+
+ /* read compressed data */
+ for (;;)
+ {
+ size_t cnt;
+
+ /*
+ * XXX: the buffer can grow, we shouldn't keep resetting it to the
+ * original value..
+ */
+ input.size = input_size;
+
+ cnt = cs->readF(AH, (char **) unconstify(void **, &input.src), &input.size);
+ input.pos = 0;
+ input.size = cnt;
+
+ if (cnt == 0)
+ break;
+
+ while (input.pos < input.size)
+ {
+ /* decompress */
+ output.pos = 0;
+ res = ZSTD_decompressStream(dstream, &output, &input);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ /* write to output handle */
+ ((char *) output.dst)[output.pos] = '\0';
+ ahwrite(output.dst, 1, output.pos, AH);
+ /* if (res == 0) break; */
+ }
+ }
+
+ pg_free(unconstify(void *, input.src));
+ pg_free(output.dst);
+}
+
+/* Public routines that support Zstd compressed data I/O */
+void
+InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec)
+{
+ ZstdCompressorState *zstdcs;
+
+ cs->readData = ReadDataFromArchiveZstd;
+ cs->writeData = WriteDataToArchiveZstd;
+ cs->end = EndCompressorZstd;
+
+ cs->compression_spec = compression_spec;
+
+ cs->private_data = pg_malloc0(sizeof(ZstdCompressorState));
+ zstdcs = cs->private_data;
+ /* XXX: initialize safely like the corresponding zlib "paranoia" */
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size);
+ zstdcs->output.pos = 0;
+ zstdcs->cstream = ZstdCStreamParams(cs->compression_spec);
+}
+
+/*----------------------
+ * Compress File API
+ *----------------------
+ */
+
+static size_t
+Zstd_read(void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t input_size = ZSTD_DStreamInSize();
+
+ /* input_size is the allocated size */
+ size_t res,
+ cnt;
+
+ output->size = size;
+ output->dst = ptr;
+ output->pos = 0;
+
+ for (;;)
+ {
+ Assert(input->pos <= input->size);
+ Assert(input->size <= input_size);
+
+ /* If the input is completely consumed, start back at the beginning */
+ if (input->pos == input->size)
+ {
+ /* input->size is size produced by "fread" */
+ input->size = 0;
+ /* input->pos is position consumed by decompress */
+ input->pos = 0;
+ }
+
+ /* read compressed data if we must produce more input */
+ if (input->pos == input->size)
+ {
+ cnt = fread(unconstify(void *, input->src), 1, input_size, zstdcs->fp);
+ input->size = cnt;
+
+ /* If we have no input to consume, we're done */
+ if (cnt == 0)
+ break;
+ }
+
+ Assert(cnt >= 0);
+ Assert(input->size <= input_size);
+
+ /* Now consume as much as possible */
+ for (; input->pos < input->size;)
+ {
+ /* decompress */
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+ if (output->pos == output->size)
+ break; /* No more room for output */
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ if (output->pos == output->size)
+ break; /* We read all the data that fits */
+ }
+
+ return output->pos;
+}
+
+static size_t
+Zstd_write(const void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t res,
+ cnt;
+
+ input->src = ptr;
+ input->size = size;
+ input->pos = 0;
+
+#if 0
+ ZSTD_CCtx_reset(fp->zstd.cstream, ZSTD_reset_session_only);
+ res = ZSTD_CCtx_setPledgedSrcSize(fp->zstd.cstream, size);
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+#endif
+
+ /* Consume all input, and flush later */
+ while (input->pos != input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_continue);
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ pg_fatal("could not write data: %m");
+ }
+
+ return size;
+}
+
+static int
+Zstd_getc(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int ret;
+
+ if (CFH->read_func(&ret, 1, CFH) != 1)
+ {
+ if (feof(zstdcs->fp))
+ pg_fatal("could not read from input file: end of file");
+ else
+ pg_fatal("could not read from input file: %m");
+ }
+ return ret;
+}
+
+static char *
+Zstd_gets(char *buf, int len, CompressFileHandle *CFH)
+{
+ /*
+ * Read one byte at a time until newline or EOF. This is only used to read
+ * the list of blobs, and the I/O is buffered anyway.
+ */
+ int i,
+ res;
+
+ for (i = 0; i < len - 1; ++i)
+ {
+ res = CFH->read_func(&buf[i], 1, CFH);
+ if (res != 1)
+ break;
+ if (buf[i] == '\n')
+ {
+ ++i;
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return i > 0 ? buf : 0;
+}
+
+static int
+Zstd_close(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int result;
+
+ if (zstdcs->cstream)
+ {
+ size_t res,
+ cnt;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ for (;;)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_end);
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ pg_fatal("could not write data: %m");
+
+ if (res == 0)
+ break;
+ }
+
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ if (zstdcs->dstream)
+ {
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+
+ result = fclose(zstdcs->fp);
+ pg_free(zstdcs);
+ return result;
+}
+
+static int
+Zstd_eof(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ return feof(zstdcs->fp);
+}
+
+static int
+Zstd_open(const char *path, int fd, const char *mode,
+ CompressFileHandle *CFH)
+{
+ FILE *fp;
+ ZstdCompressorState *zstdcs;
+
+ if (fd >= 0)
+ fp = fdopen(fd, mode);
+ else
+ fp = fopen(path, mode);
+
+ if (fp == NULL)
+ {
+ /* XXX zstdcs->errcode = errno; */
+ return 1;
+ }
+
+ CFH->private_data = pg_malloc0(sizeof(ZstdCompressorState));
+ zstdcs = (ZstdCompressorState *) CFH->private_data;
+ zstdcs->fp = fp;
+
+ if (mode[0] == 'w' || mode[0] == 'a')
+ {
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc0(zstdcs->output.size);
+ zstdcs->cstream = ZstdCStreamParams(CFH->compression_spec);
+ }
+ else if (strchr(mode, 'r'))
+ {
+ zstdcs->input.src = pg_malloc0(ZSTD_DStreamInSize());
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ /* XXX else: bad mode */
+
+ return 0;
+}
+
+static int
+Zstd_open_write(const char *path, const char *mode, CompressFileHandle *CFH)
+{
+ char fname[MAXPGPATH];
+
+ sprintf(fname, "%s.zst", path);
+ return CFH->open_func(fname, -1, mode, CFH);
+}
+
+static const char *
+Zstd_get_error(CompressFileHandle *CFH)
+{
+#if 0
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ if (ZSTD_isError(res))
+ return ZSTD_getErrorName(res)
+ else
+#endif
+
+ return strerror(errno);
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec)
+{
+ CFH->open_func = Zstd_open;
+ CFH->open_write_func = Zstd_open_write;
+ CFH->read_func = Zstd_read;
+ CFH->write_func = Zstd_write;
+ CFH->gets_func = Zstd_gets;
+ CFH->getc_func = Zstd_getc;
+ CFH->close_func = Zstd_close;
+ CFH->eof_func = Zstd_eof;
+ CFH->get_error_func = Zstd_get_error;
+
+ CFH->compression_spec = compression_spec;
+
+ CFH->private_data = NULL;
+}
+
+#endif /* USE_ZSTD */
diff --git a/src/bin/pg_dump/compress_zstd.h b/src/bin/pg_dump/compress_zstd.h
new file mode 100644
index 00000000000..f36698b4c26
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.h
@@ -0,0 +1,9 @@
+#ifndef COMPRESS_ZSTD_H
+#define COMPRESS_ZSTD_H
+
+#include "compress_io.h"
+
+extern void InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec);
+extern void InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec);
+
+#endif /* COMPRESS_ZSTD_H */
diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build
index 0da476a4c34..334d449091d 100644
--- a/src/bin/pg_dump/meson.build
+++ b/src/bin/pg_dump/meson.build
@@ -5,6 +5,7 @@ pg_dump_common_sources = files(
'compress_io.c',
'compress_lz4.c',
'compress_none.c',
+ 'compress_zstd.c',
'dumputils.c',
'parallel.c',
'pg_backup_archiver.c',
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 61ebb8fe85d..6e97d1f5894 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -2075,7 +2075,7 @@ _discoverArchiveFormat(ArchiveHandle *AH)
/*
* Check if the specified archive is a directory. If so, check if
- * there's a "toc.dat" (or "toc.dat.{gz,lz4}") file in it.
+ * there's a "toc.dat" (or "toc.dat.{gz,lz4,zst}") file in it.
*/
if (stat(AH->fSpec, &st) == 0 && S_ISDIR(st.st_mode))
{
@@ -2086,10 +2086,17 @@ _discoverArchiveFormat(ArchiveHandle *AH)
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.gz"))
return AH->format;
#endif
+
#ifdef USE_LZ4
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.lz4"))
return AH->format;
#endif
+
+#ifdef USE_ZSTD
+ if (_fileExistsInDirectory(AH->fSpec, "toc.dat.zst"))
+ return AH->format;
+#endif
+
pg_fatal("directory \"%s\" does not appear to be a valid archive (\"toc.dat\" does not exist)",
AH->fSpec);
fh = NULL; /* keep compiler quiet */
diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c
index 41c2b733e3e..29845340859 100644
--- a/src/bin/pg_dump/pg_backup_directory.c
+++ b/src/bin/pg_dump/pg_backup_directory.c
@@ -785,6 +785,8 @@ _PrepParallelRestore(ArchiveHandle *AH)
strlcat(fname, ".gz", sizeof(fname));
else if (AH->compression_spec.algorithm == PG_COMPRESSION_LZ4)
strlcat(fname, ".lz4", sizeof(fname));
+ else if (AH->compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ strlcat(fname, ".zst", sizeof(fname));
if (stat(fname, &st) == 0)
te->dataLength = st.st_size;
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 24ba936332d..72f6126a1fb 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -710,19 +710,6 @@ main(int argc, char **argv)
pg_fatal("invalid compression specification: %s",
error_detail);
- switch (compression_algorithm)
- {
- case PG_COMPRESSION_NONE:
- /* fallthrough */
- case PG_COMPRESSION_GZIP:
- /* fallthrough */
- case PG_COMPRESSION_LZ4:
- break;
- case PG_COMPRESSION_ZSTD:
- pg_fatal("compression with %s is not yet supported", "ZSTD");
- break;
- }
-
/*
* Custom and directory formats are compressed by default with gzip when
* available, not the others.
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 72b19ee6cde..160cd1124a2 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -213,6 +213,77 @@ my %pgdump_runs = (
},
},
+ compression_zstd_custom => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=custom',
+ '--compress=zstd', "--file=$tempdir/compression_zstd_custom.dump",
+ 'postgres',
+ ],
+ restore_cmd => [
+ 'pg_restore',
+ "--file=$tempdir/compression_zstd_custom.sql",
+ "$tempdir/compression_zstd_custom.dump",
+ ],
+ command_like => {
+ command => [
+ 'pg_restore',
+ '-l', "$tempdir/compression_zstd_custom.dump",
+ ],
+ expected => qr/Compression: zstd/,
+ name => 'data content is zstd compressed'
+ },
+ },
+
+ compression_zstd_dir => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--jobs=2',
+ '--format=directory', '--compress=zstd:1',
+ "--file=$tempdir/compression_zstd_dir", 'postgres',
+ ],
+ # Give coverage for manually compressed blob.toc files during
+ # restore.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-z', '-f', '--rm',
+ "$tempdir/compression_zstd_dir/blobs.toc",
+ "$tempdir/compression_zstd_dir/blobs.toc.zst",
+ ],
+ },
+ # Verify that data files were compressed
+ glob_patterns => [
+ "$tempdir/compression_zstd_dir/toc.dat",
+ "$tempdir/compression_zstd_dir/*.dat.zst",
+ ],
+ restore_cmd => [
+ 'pg_restore', '--jobs=2',
+ "--file=$tempdir/compression_zstd_dir.sql",
+ "$tempdir/compression_zstd_dir",
+ ],
+ },
+
+ compression_zstd_plain => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=plain', '--compress=zstd',
+ "--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
+ ],
+ # Decompress the generated file to run through the tests.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-d', '-f',
+ "$tempdir/compression_zstd_plain.sql.zst",
+ "$tempdir/compression_zstd_plain.sql",
+ ],
+ },
+ },
+
clean => {
dump_cmd => [
'pg_dump',
--
2.34.1
0002-TMP-pg_dump-use-Zstd-by-default-for-CI-only.patchtext/x-diff; charset=us-asciiDownload
From 6121c2bf52253c016c1222e56dbff86a2d2bddc5 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Wed, 4 Jan 2023 21:21:53 -0600
Subject: [PATCH 2/3] TMP: pg_dump: use Zstd by default, for CI only
---
src/bin/pg_dump/pg_dump.c | 4 ++--
src/bin/pg_dump/t/002_pg_dump.pl | 15 ++++++++-------
2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 72f6126a1fb..0c61194f960 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -717,8 +717,8 @@ main(int argc, char **argv)
if ((archiveFormat == archCustom || archiveFormat == archDirectory) &&
!user_compression_defined)
{
-#ifdef HAVE_LIBZ
- parse_compress_specification(PG_COMPRESSION_GZIP, NULL,
+#ifdef USE_ZSTD
+ parse_compress_specification(PG_COMPRESSION_ZSTD, NULL,
&compression_spec);
#else
/* Nothing to do in the default case */
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 160cd1124a2..a69c87fca25 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -54,8 +54,9 @@ my $tempdir = PostgreSQL::Test::Utils::tempdir;
# those lines) to validate that part of the process.
my $supports_icu = ($ENV{with_icu} eq 'yes');
-my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
my $supports_gzip = check_pg_config("#define HAVE_LIBZ 1");
+my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
+my $supports_zstd = check_pg_config("#define USE_ZSTD 1");
my %pgdump_runs = (
binary_upgrade => {
@@ -383,10 +384,10 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_custom_format.dump", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
- name => 'data content is gzip-compressed by default if available',
+ name => 'data content is zstd-compressed by default if available',
},
},
@@ -408,8 +409,8 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_dir_format", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
name => 'data content is gzip-compressed by default',
},
@@ -417,7 +418,7 @@ my %pgdump_runs = (
"$tempdir/defaults_dir_format/toc.dat",
"$tempdir/defaults_dir_format/blobs.toc",
$supports_gzip ?
- "$tempdir/defaults_dir_format/*.dat.gz" :
+ "$tempdir/defaults_dir_format/*.dat.zst" :
"$tempdir/defaults_dir_format/*.dat",
],
},
--
2.34.1
0003-zstd-support-long-distance-mode-in-pg_dump-basebacku.patchtext/x-diff; charset=us-asciiDownload
From 854db16b74f527b096cff646b5311c9a84029265 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 27 Mar 2022 11:55:01 -0500
Subject: [PATCH 3/3] zstd: support long distance mode in pg_dump/basebackup
First proposed here:
20220327205020.GM28503@telsasoft.com
ci-os-only: freebsd
---
doc/src/sgml/protocol.sgml | 10 +++-
doc/src/sgml/ref/pg_basebackup.sgml | 4 +-
doc/src/sgml/ref/pg_dump.sgml | 8 ++-
src/backend/backup/basebackup_zstd.c | 12 ++++
src/bin/pg_basebackup/bbstreamer_zstd.c | 13 +++++
src/bin/pg_basebackup/t/010_pg_basebackup.pl | 9 ++-
src/bin/pg_dump/compress_zstd.c | 5 ++
src/bin/pg_dump/t/002_pg_dump.pl | 3 +-
src/bin/pg_verifybackup/t/008_untar.pl | 8 +++
src/bin/pg_verifybackup/t/010_client_untar.pl | 8 +++
src/common/compression.c | 57 ++++++++++++++++++-
src/include/common/compression.h | 2 +
12 files changed, 131 insertions(+), 8 deletions(-)
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index 93fc7167d4a..ee8bb7c2445 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -2747,7 +2747,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
level. Otherwise, it should be a comma-separated list of items,
each of the form <replaceable>keyword</replaceable> or
<replaceable>keyword=value</replaceable>. Currently, the supported
- keywords are <literal>level</literal> and <literal>workers</literal>.
+ keywords are <literal>level</literal>, <literal>long</literal> and
+ <literal>workers</literal>.
</para>
<para>
@@ -2764,6 +2765,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
<literal>3</literal>).
</para>
+ <para>
+ The <literal>long</literal> keyword enables long-distance matching
+ mode, for improved compression ratio, at the expense of higher memory
+ use. Long-distance mode is supported only for
+ <literal>zstd</literal>.
+ </para>
+
<para>
The <literal>workers</literal> keyword sets the number of threads
that should be used for parallel compression. Parallel compression
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index db3ad9cd5eb..79d3e657c32 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -424,8 +424,8 @@ PostgreSQL documentation
level. Otherwise, it should be a comma-separated list of items,
each of the form <literal>keyword</literal> or
<literal>keyword=value</literal>.
- Currently, the supported keywords are <literal>level</literal>
- and <literal>workers</literal>.
+ Currently, the supported keywords are <literal>level</literal>,
+ <literal>long</literal>, and <literal>workers</literal>.
The detail string cannot be used when the compression method
is specified as a plain integer.
</para>
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 49d218905fb..46eb219812f 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -676,8 +676,12 @@ PostgreSQL documentation
individual table-data segments, and the default is to compress using
<literal>gzip</literal> at a moderate level. For plain text output,
setting a nonzero compression level causes the entire output file to be compressed,
- as though it had been fed through <application>gzip</application> or
- <application>lz4</application>; but the default is not to compress.
+ as though it had been fed through <application>gzip</application>, or
+ <application>lz4</application>; or <application>zstd</application>,
+ but the default is not to compress.
+ With zstd compression, <literal>long</literal> mode may allow dumps
+ to be significantly smaller, but it might not reduce the size of
+ custom or directory format dumps, whose fields are separately compressed.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/backend/backup/basebackup_zstd.c b/src/backend/backup/basebackup_zstd.c
index ac6cac178a0..1bb5820c884 100644
--- a/src/backend/backup/basebackup_zstd.c
+++ b/src/backend/backup/basebackup_zstd.c
@@ -118,6 +118,18 @@ bbsink_zstd_begin_backup(bbsink *sink)
compress->workers, ZSTD_getErrorName(ret)));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(mysink->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret)));
+ }
+
/*
* We need our own buffer, because we're going to pass different data to
* the next sink than what gets passed to us.
diff --git a/src/bin/pg_basebackup/bbstreamer_zstd.c b/src/bin/pg_basebackup/bbstreamer_zstd.c
index fe17d6df4ef..fba391e2a0f 100644
--- a/src/bin/pg_basebackup/bbstreamer_zstd.c
+++ b/src/bin/pg_basebackup/bbstreamer_zstd.c
@@ -106,6 +106,19 @@ bbstreamer_zstd_compressor_new(bbstreamer *next, pg_compress_specification *comp
compress->workers, ZSTD_getErrorName(ret));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(streamer->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ {
+ pg_log_error("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret));
+ exit(1);
+ }
+ }
+
/* Initialize the ZSTD output buffer. */
streamer->zstd_outBuf.dst = streamer->base.bbs_buffer.data;
streamer->zstd_outBuf.size = streamer->base.bbs_buffer.maxlen;
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index b60cb78a0d5..4d130a7f944 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -139,7 +139,14 @@ SKIP:
'gzip:workers=3',
'invalid compression specification: compression algorithm "gzip" does not accept a worker count',
'failure on worker count for gzip'
- ],);
+ ],
+ [
+ 'gzip:long',
+ 'invalid compression specification: compression algorithm "gzip" does not support long-distance mode',
+ 'failure on long mode for gzip'
+ ],
+ );
+
for my $cft (@compression_failure_tests)
{
my $cfail = quotemeta($client_fails . $cft->[1]);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index 659d08533ae..e32dd73da64 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -67,6 +67,11 @@ ZstdCStreamParams(pg_compress_specification compress)
ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
compress.workers);
+ if (compress.options & PG_COMPRESSION_OPTION_LONG_DISTANCE)
+ ZSTD_CCtx_setParam_or_die(cstream,
+ ZSTD_c_enableLongDistanceMatching,
+ compress.long_distance);
+
#if 0
if (compress.options & PG_COMPRESSION_OPTION_CHECKSUM)
ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_checksumFlag,
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index a69c87fca25..a09dc205188 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -267,11 +267,12 @@ my %pgdump_runs = (
],
},
+ # Exercise long mode for test coverage
compression_zstd_plain => {
test_key => 'compression',
compile_option => 'zstd',
dump_cmd => [
- 'pg_dump', '--format=plain', '--compress=zstd',
+ 'pg_dump', '--format=plain', '--compress=zstd:long',
"--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
],
# Decompress the generated file to run through the tests.
diff --git a/src/bin/pg_verifybackup/t/008_untar.pl b/src/bin/pg_verifybackup/t/008_untar.pl
index 3007bbe8556..05754bc8ec7 100644
--- a/src/bin/pg_verifybackup/t/008_untar.pl
+++ b/src/bin/pg_verifybackup/t/008_untar.pl
@@ -49,6 +49,14 @@ my @test_configuration = (
'decompress_program' => $ENV{'ZSTD'},
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => [ '--compress', 'server-zstd:level=1,long' ],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => ['-d'],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
});
for my $tc (@test_configuration)
diff --git a/src/bin/pg_verifybackup/t/010_client_untar.pl b/src/bin/pg_verifybackup/t/010_client_untar.pl
index f3aa0f59e29..ac51a174d14 100644
--- a/src/bin/pg_verifybackup/t/010_client_untar.pl
+++ b/src/bin/pg_verifybackup/t/010_client_untar.pl
@@ -50,6 +50,14 @@ my @test_configuration = (
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
},
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => ['--compress', 'client-zstd:level=1,long'],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => [ '-d' ],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
{
'compression_method' => 'parallel zstd',
'backup_flags' => [ '--compress', 'client-zstd:workers=3' ],
diff --git a/src/common/compression.c b/src/common/compression.c
index 2d3e56b4d62..713a77c292d 100644
--- a/src/common/compression.c
+++ b/src/common/compression.c
@@ -12,7 +12,7 @@
* Otherwise, a compression specification is a comma-separated list of items,
* each having the form keyword or keyword=value.
*
- * Currently, the only supported keywords are "level" and "workers".
+ * Currently, the supported keywords are "level", "long", and "workers".
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
*
@@ -38,6 +38,8 @@
static int expect_integer_value(char *keyword, char *value,
pg_compress_specification *result);
+static bool expect_boolean_value(char *keyword, char *value,
+ pg_compress_specification *result);
/*
* Look up a compression algorithm by name. Returns true and sets *algorithm
@@ -232,6 +234,11 @@ parse_compress_specification(pg_compress_algorithm algorithm, char *specificatio
result->workers = expect_integer_value(keyword, value, result);
result->options |= PG_COMPRESSION_OPTION_WORKERS;
}
+ else if (strcmp(keyword, "long") == 0)
+ {
+ result->long_distance = expect_boolean_value(keyword, value, result);
+ result->options |= PG_COMPRESSION_OPTION_LONG_DISTANCE;
+ }
else
result->parse_error =
psprintf(_("unrecognized compression option: \"%s\""), keyword);
@@ -289,6 +296,43 @@ expect_integer_value(char *keyword, char *value, pg_compress_specification *resu
return ivalue;
}
+/*
+ * Parse 'value' as an boolean and return the result.
+ *
+ * If parsing fails, set result->parse_error to an appropriate message
+ * and return -1. The caller must check result->parse_error to determine if
+ * the call was successful.
+ *
+ * Valid values are: yes, no, on, off, 1, 0.
+ *
+ * Inspired by ParseVariableBool().
+ */
+static bool
+expect_boolean_value(char *keyword, char *value, pg_compress_specification *result)
+{
+ if (value == NULL)
+ return true;
+
+ if (pg_strcasecmp(value, "yes") == 0)
+ return true;
+ if (pg_strcasecmp(value, "on") == 0)
+ return true;
+ if (pg_strcasecmp(value, "1") == 0)
+ return true;
+
+ if (pg_strcasecmp(value, "no") == 0)
+ return false;
+ if (pg_strcasecmp(value, "off") == 0)
+ return false;
+ if (pg_strcasecmp(value, "0") == 0)
+ return false;
+
+ result->parse_error =
+ psprintf(_("value for compression option \"%s\" must be a boolean"),
+ keyword);
+ return false;
+}
+
/*
* Returns NULL if the compression specification string was syntactically
* valid and semantically sensible. Otherwise, returns an error message.
@@ -354,6 +398,17 @@ validate_compress_specification(pg_compress_specification *spec)
get_compress_algorithm_name(spec->algorithm));
}
+ /*
+ * Of the compression algorithms that we currently support, only zstd
+ * supports long-distance mode.
+ */
+ if ((spec->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0 &&
+ (spec->algorithm != PG_COMPRESSION_ZSTD))
+ {
+ return psprintf(_("compression algorithm \"%s\" does not support long-distance mode"),
+ get_compress_algorithm_name(spec->algorithm));
+ }
+
return NULL;
}
diff --git a/src/include/common/compression.h b/src/include/common/compression.h
index b48c173022e..6cf8cf396a8 100644
--- a/src/include/common/compression.h
+++ b/src/include/common/compression.h
@@ -27,6 +27,7 @@ typedef enum pg_compress_algorithm
} pg_compress_algorithm;
#define PG_COMPRESSION_OPTION_WORKERS (1 << 0)
+#define PG_COMPRESSION_OPTION_LONG_DISTANCE (1 << 1)
typedef struct pg_compress_specification
{
@@ -34,6 +35,7 @@ typedef struct pg_compress_specification
unsigned options; /* OR of PG_COMPRESSION_OPTION constants */
int level;
int workers;
+ int long_distance;
char *parse_error; /* NULL if parsing was OK, else message */
} pg_compress_specification;
--
2.34.1
On Fri, Feb 24, 2023 at 01:18:40PM -0600, Justin Pryzby wrote:
This is a draft patch - review is welcome and would help to get this
ready to be considererd for v16, if desired.I'm going to add this thread to the old CF entry.
https://commitfest.postgresql.org/31/2888/
Patch 0003 adds support for the --long option of zstd, meaning that it
"enables long distance matching with #windowLog". What's the benefit
of that when it is applied to dumps and base backup contents?
--
Michael
On Sat, Feb 25, 2023 at 01:44:36PM +0900, Michael Paquier wrote:
On Fri, Feb 24, 2023 at 01:18:40PM -0600, Justin Pryzby wrote:
This is a draft patch - review is welcome and would help to get this
ready to be considererd for v16, if desired.I'm going to add this thread to the old CF entry.
https://commitfest.postgresql.org/31/2888/Patch 0003 adds support for the --long option of zstd, meaning that it
"enables long distance matching with #windowLog". What's the benefit
of that when it is applied to dumps and base backup contents?
It (can) makes it smaller.
+ The <literal>long</literal> keyword enables long-distance matching
+ mode, for improved compression ratio, at the expense of higher memory
+ use. Long-distance mode is supported only for
+ With zstd compression, <literal>long</literal> mode may allow dumps
+ to be significantly smaller, but it might not reduce the size of
+ custom or directory format dumps, whose fields are separately compressed.
Note that I included that here as 003, but I also have an pre-existing
patch for adding that just to basebackup.
--
Justin
On 2/24/23 20:18, Justin Pryzby wrote:
This is a draft patch - review is welcome and would help to get this
ready to be considererd for v16, if desired.I'm going to add this thread to the old CF entry.
https://commitfest.postgresql.org/31/2888/
Thanks. Sadly cfbot is unhappy - the windows and cplusplus builds failed
because of some issue in pg_backup_archiver.h. But it's a bit bizarre
because the patch does not modify that file at all ...
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Sat, Feb 25, 2023, at 7:31 AM, Tomas Vondra wrote:
On 2/24/23 20:18, Justin Pryzby wrote:
This is a draft patch - review is welcome and would help to get this
ready to be considererd for v16, if desired.I'm going to add this thread to the old CF entry.
https://commitfest.postgresql.org/31/2888/Thanks. Sadly cfbot is unhappy - the windows and cplusplus builds failed
because of some issue in pg_backup_archiver.h. But it's a bit bizarre
because the patch does not modify that file at all ...
cpluspluscheck says
# pg_dump is not C++-clean because it uses "public" and "namespace"
# as field names, which is unfortunate but we won't change it now.
Hence, the patch should exclude the new header file from it.
--- a/src/tools/pginclude/cpluspluscheck
+++ b/src/tools/pginclude/cpluspluscheck
@@ -153,6 +153,7 @@ do
test "$f" = src/bin/pg_dump/compress_gzip.h && continue
test "$f" = src/bin/pg_dump/compress_io.h && continue
test "$f" = src/bin/pg_dump/compress_lz4.h && continue
+ test "$f" = src/bin/pg_dump/compress_zstd.h && continue
test "$f" = src/bin/pg_dump/compress_none.h && continue
test "$f" = src/bin/pg_dump/parallel.h && continue
test "$f" = src/bin/pg_dump/pg_backup_archiver.h && continue
--
Euler Taveira
EDB https://www.enterprisedb.com/
On Fri, Feb 24, 2023 at 01:18:40PM -0600, Justin Pryzby wrote:
This is a draft patch - review is welcome and would help to get this
ready to be considererd for v16, if desired.I'm going to add this thread to the old CF entry.
https://commitfest.postgresql.org/31/2888/
This resolves cfbot warnings: windows and cppcheck.
And refactors zstd routines.
And updates docs.
And includes some fixes for earlier patches that these patches conflicts
with/depends on.
Attachments:
0001-f-fixes-for-LZ4.patchtext/x-diff; charset=us-asciiDownload
From ea9b67d09fe1b51e7946cf34fca5795e57dd3858 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Fri, 24 Feb 2023 14:07:09 -0600
Subject: [PATCH 1/4] f!fixes for LZ4
---
doc/src/sgml/ref/pg_dump.sgml | 4 ++--
src/bin/pg_dump/compress_gzip.c | 8 --------
src/bin/pg_dump/compress_io.c | 2 +-
src/bin/pg_dump/compress_io.h | 4 ++--
src/bin/pg_dump/compress_lz4.c | 4 ++--
src/bin/pg_dump/pg_backup_archiver.c | 4 ++--
src/bin/pg_dump/pg_dump.c | 2 --
src/bin/pg_dump/t/002_pg_dump.pl | 6 +++---
8 files changed, 12 insertions(+), 22 deletions(-)
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 49d218905fb..6fbe49f7ede 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -331,7 +331,7 @@ PostgreSQL documentation
can read. A directory format archive can be manipulated with
standard Unix tools; for example, files in an uncompressed archive
can be compressed with the <application>gzip</application> or
- <application>lz4</application>tool.
+ <application>lz4</application> tools.
This format is compressed by default using <literal>gzip</literal>
and also supports parallel dumps.
</para>
@@ -655,7 +655,7 @@ PostgreSQL documentation
<para>
Specify the compression method and/or the compression level to use.
The compression method can be set to <literal>gzip</literal> or
- <literal>lz4</literal> or <literal>none</literal> for no compression.
+ <literal>lz4</literal>, or <literal>none</literal> for no compression.
A compression detail string can optionally be specified. If the
detail string is an integer, it specifies the compression level.
Otherwise, it should be a comma-separated list of items, each of the
diff --git a/src/bin/pg_dump/compress_gzip.c b/src/bin/pg_dump/compress_gzip.c
index 0af65afeb4e..52f41c2e58c 100644
--- a/src/bin/pg_dump/compress_gzip.c
+++ b/src/bin/pg_dump/compress_gzip.c
@@ -123,17 +123,9 @@ WriteDataToArchiveGzip(ArchiveHandle *AH, CompressorState *cs,
gzipcs->outbuf = pg_malloc(ZLIB_OUT_SIZE + 1);
gzipcs->outsize = ZLIB_OUT_SIZE;
- /*
- * A level of zero simply copies the input one block at the time. This
- * is probably not what the user wanted when calling this interface.
- */
- if (cs->compression_spec.level == 0)
- pg_fatal("requested to compress the archive yet no level was specified");
-
if (deflateInit(zp, cs->compression_spec.level) != Z_OK)
pg_fatal("could not initialize compression library: %s", zp->msg);
- /* Just be paranoid - maybe End is called after Start, with no Write */
zp->next_out = gzipcs->outbuf;
zp->avail_out = gzipcs->outsize;
}
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c
index ce06f1eac9c..9239dbb2755 100644
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -83,7 +83,7 @@
* used by the caller in an error message.
*/
char *
-supports_compression(const pg_compress_specification compression_spec)
+pgdump_supports_compression(const pg_compress_specification compression_spec)
{
const pg_compress_algorithm algorithm = compression_spec.algorithm;
bool supported = false;
diff --git a/src/bin/pg_dump/compress_io.h b/src/bin/pg_dump/compress_io.h
index bbde2693915..46815fa2ebe 100644
--- a/src/bin/pg_dump/compress_io.h
+++ b/src/bin/pg_dump/compress_io.h
@@ -21,7 +21,7 @@
#define ZLIB_OUT_SIZE 4096
#define ZLIB_IN_SIZE 4096
-extern char *supports_compression(const pg_compress_specification compression_spec);
+extern char *pgdump_supports_compression(const pg_compress_specification compression_spec);
/*
* Prototype for callback function used in writeData()
@@ -172,7 +172,7 @@ struct CompressFileHandle
extern CompressFileHandle *InitCompressFileHandle(const pg_compress_specification compression_spec);
/*
- * Initialize a compress file stream. Deffer the compression algorithm
+ * Initialize a compress file stream. Infer the compression algorithm
* from 'path', either by examining its suffix or by appending the supported
* suffixes in 'path'.
*/
diff --git a/src/bin/pg_dump/compress_lz4.c b/src/bin/pg_dump/compress_lz4.c
index fe1014e6e77..63e794cdc68 100644
--- a/src/bin/pg_dump/compress_lz4.c
+++ b/src/bin/pg_dump/compress_lz4.c
@@ -161,8 +161,8 @@ typedef struct LZ4File
} LZ4File;
/*
- * LZ4 equivalent to feof() or gzeof(). The end of file is reached if there
- * is no decompressed output in the overflow buffer and the end of the file
+ * LZ4 equivalent to feof() or gzeof(). Return true iff there is no
+ * decompressed output in the overflow buffer and the end of the backing file
* is reached.
*/
static int
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 61ebb8fe85d..2063d6f239d 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -388,7 +388,7 @@ RestoreArchive(Archive *AHX)
{
if (te->hadDumper && (te->reqs & REQ_DATA) != 0)
{
- char *errmsg = supports_compression(AH->compression_spec);
+ char *errmsg = pgdump_supports_compression(AH->compression_spec);
if (errmsg)
pg_fatal("cannot restore from compressed archive (%s)",
errmsg);
@@ -3745,7 +3745,7 @@ ReadHead(ArchiveHandle *AH)
else
AH->compression_spec.algorithm = PG_COMPRESSION_GZIP;
- errmsg = supports_compression(AH->compression_spec);
+ errmsg = pgdump_supports_compression(AH->compression_spec);
if (errmsg)
{
pg_log_warning("archive is compressed, but this installation does not support compression (%s) -- no data will be available",
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 24ba936332d..ce2242195f3 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -733,8 +733,6 @@ main(int argc, char **argv)
#ifdef HAVE_LIBZ
parse_compress_specification(PG_COMPRESSION_GZIP, NULL,
&compression_spec);
-#else
- /* Nothing to do in the default case */
#endif
}
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 72b19ee6cde..ad7bc5c194b 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -4248,10 +4248,10 @@ foreach my $run (sort keys %pgdump_runs)
my $test_key = $run;
my $run_db = 'postgres';
- # Skip command-level tests for gzip if there is no support for it.
+ # Skip command-level tests for gzip/lz4 if they're not supported.
if ($pgdump_runs{$run}->{compile_option} &&
- ($pgdump_runs{$run}->{compile_option} eq 'gzip' && !$supports_gzip) ||
- ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4))
+ (($pgdump_runs{$run}->{compile_option} eq 'gzip' && !$supports_gzip) ||
+ ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4)))
{
note "$run: skipped due to no $pgdump_runs{$run}->{compile_option} support";
next;
--
2.34.1
0002-pg_dump-zstd-compression.patchtext/x-diff; charset=us-asciiDownload
From d9eca9692f11650be1bfb5635ae833e9ba94a43a Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 7 Jan 2023 15:45:06 -0600
Subject: [PATCH 2/4] pg_dump: zstd compression
Previously proposed at: 20201221194924.GI30237@telsasoft.com
---
doc/src/sgml/ref/pg_dump.sgml | 10 +-
src/bin/pg_dump/Makefile | 2 +
src/bin/pg_dump/compress_io.c | 53 +--
src/bin/pg_dump/compress_zstd.c | 505 ++++++++++++++++++++++++++
src/bin/pg_dump/compress_zstd.h | 9 +
src/bin/pg_dump/meson.build | 1 +
src/bin/pg_dump/pg_backup_archiver.c | 9 +-
src/bin/pg_dump/pg_backup_directory.c | 2 +
src/bin/pg_dump/pg_dump.c | 13 -
src/bin/pg_dump/t/002_pg_dump.pl | 79 +++-
src/tools/pginclude/cpluspluscheck | 1 +
11 files changed, 637 insertions(+), 47 deletions(-)
create mode 100644 src/bin/pg_dump/compress_zstd.c
create mode 100644 src/bin/pg_dump/compress_zstd.h
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 6fbe49f7ede..0df86a245f0 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -330,8 +330,9 @@ PostgreSQL documentation
machine-readable format that <application>pg_restore</application>
can read. A directory format archive can be manipulated with
standard Unix tools; for example, files in an uncompressed archive
- can be compressed with the <application>gzip</application> or
- <application>lz4</application> tools.
+ can be compressed with the <application>gzip</application>,
+ <application>lz4</application>, or
+ <application>zstd</application> tools.
This format is compressed by default using <literal>gzip</literal>
and also supports parallel dumps.
</para>
@@ -654,8 +655,9 @@ PostgreSQL documentation
<listitem>
<para>
Specify the compression method and/or the compression level to use.
- The compression method can be set to <literal>gzip</literal> or
- <literal>lz4</literal>, or <literal>none</literal> for no compression.
+ The compression method can be set to <literal>gzip</literal>,
+ <literal>lz4</literal> or <literal>zstd</literal>,
+ or <literal>none</literal> for no compression.
A compression detail string can optionally be specified. If the
detail string is an integer, it specifies the compression level.
Otherwise, it should be a comma-separated list of items, each of the
diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile
index eb8f59459a1..bf540fee2ba 100644
--- a/src/bin/pg_dump/Makefile
+++ b/src/bin/pg_dump/Makefile
@@ -18,6 +18,7 @@ include $(top_builddir)/src/Makefile.global
export GZIP_PROGRAM=$(GZIP)
export LZ4
+#XXX export ZSTD
export with_icu
override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
@@ -29,6 +30,7 @@ OBJS = \
compress_io.o \
compress_lz4.o \
compress_none.o \
+ compress_zstd.o \
dumputils.o \
parallel.o \
pg_backup_archiver.o \
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c
index 9239dbb2755..121c69ed48f 100644
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -52,8 +52,8 @@
*
* InitDiscoverCompressFileHandle tries to infer the compression by the
* filename suffix. If the suffix is not yet known then it tries to simply
- * open the file and if it fails, it tries to open the same file with the .gz
- * suffix, and then again with the .lz4 suffix.
+ * open the file and if it fails, it tries to open the same file with
+ * compressed suffixes.
*
* IDENTIFICATION
* src/bin/pg_dump/compress_io.c
@@ -69,6 +69,7 @@
#include "compress_io.h"
#include "compress_lz4.h"
#include "compress_none.h"
+#include "compress_zstd.h"
#include "pg_backup_utils.h"
/*----------------------
@@ -98,6 +99,10 @@ pgdump_supports_compression(const pg_compress_specification compression_spec)
if (algorithm == PG_COMPRESSION_LZ4)
supported = true;
#endif
+#ifdef USE_ZSTD
+ if (algorithm == PG_COMPRESSION_ZSTD)
+ supported = true;
+#endif
if (!supported)
return psprintf("this build does not support compression with %s",
@@ -130,6 +135,8 @@ AllocateCompressor(const pg_compress_specification compression_spec,
InitCompressorGzip(cs, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressorLZ4(cs, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressorZstd(cs, compression_spec);
return cs;
}
@@ -196,20 +203,30 @@ InitCompressFileHandle(const pg_compress_specification compression_spec)
InitCompressFileHandleGzip(CFH, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressFileHandleLZ4(CFH, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressFileHandleZstd(CFH, compression_spec);
return CFH;
}
+static bool
+check_compressed_file(const char *path, char **fname, char *ext)
+{
+ free_keep_errno(*fname);
+ *fname = psprintf("%s.%s", path, ext);
+ return (access(*fname, F_OK) == 0);
+}
+
/*
* Open a file for reading. 'path' is the file to open, and 'mode' should
* be either "r" or "rb".
*
* If the file at 'path' contains the suffix of a supported compression method,
- * currently this includes ".gz" and ".lz4", then this compression will be used
+ * currently this includes ".gz", ".lz4" and ".zst", then this compression will be used
* throughout. Otherwise the compression will be inferred by iteratively trying
* to open the file at 'path', first as is, then by appending known compression
* suffixes. So if you pass "foo" as 'path', this will open either "foo" or
- * "foo.gz" or "foo.lz4", trying in that order.
+ * "foo.{gz,lz4,zst}", trying in that order.
*
* On failure, return NULL with an error code in errno.
*/
@@ -237,28 +254,12 @@ InitDiscoverCompressFileHandle(const char *path, const char *mode)
/* avoid unused warning if it is not built with compression */
if (exists)
compression_spec.algorithm = PG_COMPRESSION_NONE;
-#ifdef HAVE_LIBZ
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.gz", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_GZIP;
- }
-#endif
-#ifdef USE_LZ4
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.lz4", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_LZ4;
- }
-#endif
+ else if (check_compressed_file(path, &fname, "gz"))
+ compression_spec.algorithm = PG_COMPRESSION_GZIP;
+ else if (check_compressed_file(path, &fname, "lz4"))
+ compression_spec.algorithm = PG_COMPRESSION_LZ4;
+ else if (check_compressed_file(path, &fname, "zst"))
+ compression_spec.algorithm = PG_COMPRESSION_ZSTD;
}
CFH = InitCompressFileHandle(compression_spec);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
new file mode 100644
index 00000000000..5d2f2bf2a2d
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -0,0 +1,505 @@
+#include "postgres_fe.h"
+
+#include "pg_backup_utils.h"
+#include "compress_zstd.h"
+
+#ifndef USE_ZSTD
+
+void
+InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+#else
+
+#include <zstd.h>
+
+typedef struct ZstdCompressorState
+{
+ /* This is a normal file to which we read/write compressed data */
+ FILE *fp;
+ /* XXX: use one separate ZSTD_CStream per thread: disable on windows ? */
+ ZSTD_CStream *cstream;
+ ZSTD_DStream *dstream;
+ ZSTD_outBuffer output;
+ ZSTD_inBuffer input;
+} ZstdCompressorState;
+
+static ZSTD_CStream *ZstdCStreamParams(pg_compress_specification compress);
+static void EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs);
+static void WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen);
+static void ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs);
+
+static void
+ZSTD_CCtx_setParam_or_die(ZSTD_CStream * cstream,
+ ZSTD_cParameter param, int value)
+{
+ size_t res;
+
+ res = ZSTD_CCtx_setParameter(cstream, param, value);
+ if (ZSTD_isError(res))
+ pg_fatal("could not set compression parameter: %s",
+ ZSTD_getErrorName(res));
+}
+
+/* Return a compression stream with parameters set per argument */
+static ZSTD_CStream *
+ZstdCStreamParams(pg_compress_specification compress)
+{
+ ZSTD_CStream *cstream;
+
+ cstream = ZSTD_createCStream();
+ if (cstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
+ compress.level);
+
+ if (compress.options & PG_COMPRESSION_OPTION_WORKERS)
+ ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
+ compress.workers);
+
+#if 0
+ if (compress.options & PG_COMPRESSION_OPTION_CHECKSUM)
+ ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_checksumFlag,
+ compress.checksum);
+
+ /* Still marked as experimental */
+ if (compress.options & PG_COMPRESSION_OPTION_RSYNCABLE)
+ ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_rsyncable, 1);
+#endif
+
+ return cstream;
+}
+
+/* Helper function for EndCompressorZstd and WriteDataToArchiveZstd */
+static void
+ZstdWriteCommon(ArchiveHandle *AH, CompressorState *cs, bool flush)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop while there's any input or until flushed */
+ while (input->pos != input->size || flush)
+ {
+ size_t res;
+
+ res = ZSTD_compressStream2(zstdcs->cstream, output,
+ input, flush ? ZSTD_e_end : ZSTD_e_continue);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+
+ /*
+ * Extra paranoia: avoid zero-length chunks, since a zero length
+ * chunk is the EOF marker in the custom format. This should never
+ * happen but...
+ */
+ if (output->pos > 0)
+ cs->writeF(AH, output->dst, output->pos);
+
+ output->pos = 0;
+
+ if (res == 0)
+ break;
+ }
+}
+
+void
+EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ if (zstdcs->cstream)
+ {
+ ZstdWriteCommon(AH, cs, true);
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ zstdcs->cstream = NULL;
+ }
+ pg_free(zstdcs);
+}
+
+static void
+WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ if (zstdcs->cstream == NULL)
+ {
+ zstdcs->cstream = ZstdCStreamParams(cs->compression_spec);
+
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size);
+ zstdcs->output.pos = 0;
+ }
+
+ zstdcs->input.src = data;
+ zstdcs->input.size = dLen;
+ zstdcs->input.pos = 0;
+
+ ZstdWriteCommon(AH, cs, false);
+}
+
+/* Read data from a compressed zstd archive */
+static void
+ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ size_t res;
+
+ if (zstdcs->dstream == NULL)
+ {
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ input->size = ZSTD_DStreamInSize();
+ input->src = pg_malloc(input->size);
+
+ output->size = ZSTD_DStreamOutSize();
+ output->dst = pg_malloc(output->size);
+ }
+
+ /* read compressed data */
+ for (;;)
+ {
+ size_t cnt;
+
+ /*
+ * XXX: the buffer can grow, we shouldn't keep resetting it to the
+ * original value..
+ */
+ input->size = ZSTD_DStreamInSize();
+
+ cnt = cs->readF(AH, (char **) unconstify(void **, &input->src), &input->size);
+ input->pos = 0;
+ input->size = cnt;
+
+ if (cnt == 0)
+ break;
+
+ /* Now consume as much as possible */
+ while (input->pos < input->size)
+ {
+ /* decompress */
+ output->pos = 0;
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ /* write to output handle */
+ ((char *) output->dst)[output->pos] = '\0';
+ ahwrite(output->dst, 1, output->pos, AH);
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+ }
+
+ pg_free(unconstify(void *, input->src));
+ pg_free(output->dst);
+}
+
+/* Public routines that support Zstd compressed data I/O */
+void
+InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec)
+{
+ cs->readData = ReadDataFromArchiveZstd;
+ cs->writeData = WriteDataToArchiveZstd;
+ cs->end = EndCompressorZstd;
+
+ cs->compression_spec = compression_spec;
+
+ cs->private_data = pg_malloc0(sizeof(ZstdCompressorState));
+}
+
+/*----------------------
+ * Compressed stream API
+ *----------------------
+ */
+
+static size_t
+Zstd_read(void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ /* input_size is the allocated size */
+ size_t input_size = ZSTD_DStreamInSize();
+ size_t res,
+ cnt;
+
+ output->size = size;
+ output->dst = ptr;
+ output->pos = 0;
+
+ for (;;)
+ {
+ Assert(input->pos <= input->size);
+ Assert(input->size <= input_size);
+
+ /* If the input is completely consumed, start back at the beginning */
+ if (input->pos == input->size)
+ {
+ /* input->size is size produced by "fread" */
+ input->size = 0;
+ /* input->pos is position consumed by decompress */
+ input->pos = 0;
+ }
+
+ /* read compressed data if we must produce more input */
+ if (input->pos == input->size)
+ {
+ cnt = fread(unconstify(void *, input->src), 1, input_size, zstdcs->fp);
+ input->size = cnt;
+
+ /* If we have no input to consume, we're done */
+ if (cnt == 0)
+ break;
+ }
+
+ Assert(cnt >= 0);
+ Assert(input->size <= input_size);
+
+ while (input->pos < input->size)
+ {
+ /* decompress */
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ if (output->pos == output->size)
+ break; /* No more room for output */
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ if (output->pos == output->size)
+ break; /* We read all the data that fits */
+ }
+
+ return output->pos;
+}
+
+static size_t
+Zstd_write(const void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t res,
+ cnt;
+
+ input->src = ptr;
+ input->size = size;
+ input->pos = 0;
+
+#if 0
+ ZSTD_CCtx_reset(fp->zstd.cstream, ZSTD_reset_session_only);
+ res = ZSTD_CCtx_setPledgedSrcSize(fp->zstd.cstream, size);
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+#endif
+
+ /* Consume all input, and flush later */
+ while (input->pos != input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_continue);
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ pg_fatal("could not write data: %m");
+ }
+
+ return size;
+}
+
+static int
+Zstd_getc(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int ret;
+
+ if (CFH->read_func(&ret, 1, CFH) != 1)
+ {
+ if (feof(zstdcs->fp))
+ pg_fatal("could not read from input file: end of file");
+ else
+ pg_fatal("could not read from input file: %m");
+ }
+ return ret;
+}
+
+static char *
+Zstd_gets(char *buf, int len, CompressFileHandle *CFH)
+{
+ int i,
+ res;
+
+ /*
+ * Read one byte at a time until newline or EOF. This is only used to read
+ * the list of LOs, and the I/O is buffered anyway.
+ */
+ for (i = 0; i < len - 1; ++i)
+ {
+ res = CFH->read_func(&buf[i], 1, CFH);
+ if (res != 1)
+ break;
+ if (buf[i] == '\n')
+ {
+ ++i;
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return i > 0 ? buf : NULL;
+}
+
+static int
+Zstd_close(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int result;
+
+ if (zstdcs->cstream)
+ {
+ size_t res,
+ cnt;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ for (;;)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_end);
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ pg_fatal("could not write data: %m");
+
+ if (res == 0)
+ break;
+ }
+
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ if (zstdcs->dstream)
+ {
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+
+ result = fclose(zstdcs->fp);
+ pg_free(zstdcs);
+ return result;
+}
+
+static int
+Zstd_eof(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ return feof(zstdcs->fp);
+}
+
+static int
+Zstd_open(const char *path, int fd, const char *mode,
+ CompressFileHandle *CFH)
+{
+ FILE *fp;
+ ZstdCompressorState *zstdcs;
+
+ if (fd >= 0)
+ fp = fdopen(fd, mode);
+ else
+ fp = fopen(path, mode);
+
+ if (fp == NULL)
+ return 1;
+
+ CFH->private_data = pg_malloc0(sizeof(ZstdCompressorState));
+ zstdcs = (ZstdCompressorState *) CFH->private_data;
+ zstdcs->fp = fp;
+
+ if (mode[0] == 'w' || mode[0] == 'a')
+ {
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc0(zstdcs->output.size);
+ zstdcs->cstream = ZstdCStreamParams(CFH->compression_spec);
+ }
+ else if (strchr(mode, 'r'))
+ {
+ zstdcs->input.src = pg_malloc0(ZSTD_DStreamInSize());
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ /* XXX else: bad mode */
+
+ return 0;
+}
+
+static int
+Zstd_open_write(const char *path, const char *mode, CompressFileHandle *CFH)
+{
+ char fname[MAXPGPATH];
+
+ sprintf(fname, "%s.zst", path);
+ return CFH->open_func(fname, -1, mode, CFH);
+}
+
+static const char *
+Zstd_get_error(CompressFileHandle *CFH)
+{
+#if 0
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ if (ZSTD_isError(res))
+ return ZSTD_getErrorName(res)
+ else
+#endif
+
+ return strerror(errno);
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec)
+{
+ CFH->open_func = Zstd_open;
+ CFH->open_write_func = Zstd_open_write;
+ CFH->read_func = Zstd_read;
+ CFH->write_func = Zstd_write;
+ CFH->gets_func = Zstd_gets;
+ CFH->getc_func = Zstd_getc;
+ CFH->close_func = Zstd_close;
+ CFH->eof_func = Zstd_eof;
+ CFH->get_error_func = Zstd_get_error;
+
+ CFH->compression_spec = compression_spec;
+
+ CFH->private_data = NULL;
+}
+
+#endif /* USE_ZSTD */
diff --git a/src/bin/pg_dump/compress_zstd.h b/src/bin/pg_dump/compress_zstd.h
new file mode 100644
index 00000000000..f36698b4c26
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.h
@@ -0,0 +1,9 @@
+#ifndef COMPRESS_ZSTD_H
+#define COMPRESS_ZSTD_H
+
+#include "compress_io.h"
+
+extern void InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec);
+extern void InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec);
+
+#endif /* COMPRESS_ZSTD_H */
diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build
index 0da476a4c34..334d449091d 100644
--- a/src/bin/pg_dump/meson.build
+++ b/src/bin/pg_dump/meson.build
@@ -5,6 +5,7 @@ pg_dump_common_sources = files(
'compress_io.c',
'compress_lz4.c',
'compress_none.c',
+ 'compress_zstd.c',
'dumputils.c',
'parallel.c',
'pg_backup_archiver.c',
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 2063d6f239d..fc739db4355 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -2075,7 +2075,7 @@ _discoverArchiveFormat(ArchiveHandle *AH)
/*
* Check if the specified archive is a directory. If so, check if
- * there's a "toc.dat" (or "toc.dat.{gz,lz4}") file in it.
+ * there's a "toc.dat" (or "toc.dat.{gz,lz4,zst}") file in it.
*/
if (stat(AH->fSpec, &st) == 0 && S_ISDIR(st.st_mode))
{
@@ -2086,10 +2086,17 @@ _discoverArchiveFormat(ArchiveHandle *AH)
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.gz"))
return AH->format;
#endif
+
#ifdef USE_LZ4
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.lz4"))
return AH->format;
#endif
+
+#ifdef USE_ZSTD
+ if (_fileExistsInDirectory(AH->fSpec, "toc.dat.zst"))
+ return AH->format;
+#endif
+
pg_fatal("directory \"%s\" does not appear to be a valid archive (\"toc.dat\" does not exist)",
AH->fSpec);
fh = NULL; /* keep compiler quiet */
diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c
index 41c2b733e3e..29845340859 100644
--- a/src/bin/pg_dump/pg_backup_directory.c
+++ b/src/bin/pg_dump/pg_backup_directory.c
@@ -785,6 +785,8 @@ _PrepParallelRestore(ArchiveHandle *AH)
strlcat(fname, ".gz", sizeof(fname));
else if (AH->compression_spec.algorithm == PG_COMPRESSION_LZ4)
strlcat(fname, ".lz4", sizeof(fname));
+ else if (AH->compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ strlcat(fname, ".zst", sizeof(fname));
if (stat(fname, &st) == 0)
te->dataLength = st.st_size;
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index ce2242195f3..6495a88ff99 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -710,19 +710,6 @@ main(int argc, char **argv)
pg_fatal("invalid compression specification: %s",
error_detail);
- switch (compression_algorithm)
- {
- case PG_COMPRESSION_NONE:
- /* fallthrough */
- case PG_COMPRESSION_GZIP:
- /* fallthrough */
- case PG_COMPRESSION_LZ4:
- break;
- case PG_COMPRESSION_ZSTD:
- pg_fatal("compression with %s is not yet supported", "ZSTD");
- break;
- }
-
/*
* Custom and directory formats are compressed by default with gzip when
* available, not the others.
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index ad7bc5c194b..eb5e09f55c8 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -54,8 +54,9 @@ my $tempdir = PostgreSQL::Test::Utils::tempdir;
# those lines) to validate that part of the process.
my $supports_icu = ($ENV{with_icu} eq 'yes');
-my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
my $supports_gzip = check_pg_config("#define HAVE_LIBZ 1");
+my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
+my $supports_zstd = check_pg_config("#define USE_ZSTD 1");
my %pgdump_runs = (
binary_upgrade => {
@@ -213,6 +214,77 @@ my %pgdump_runs = (
},
},
+ compression_zstd_custom => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=custom',
+ '--compress=zstd', "--file=$tempdir/compression_zstd_custom.dump",
+ 'postgres',
+ ],
+ restore_cmd => [
+ 'pg_restore',
+ "--file=$tempdir/compression_zstd_custom.sql",
+ "$tempdir/compression_zstd_custom.dump",
+ ],
+ command_like => {
+ command => [
+ 'pg_restore',
+ '-l', "$tempdir/compression_zstd_custom.dump",
+ ],
+ expected => qr/Compression: zstd/,
+ name => 'data content is zstd compressed'
+ },
+ },
+
+ compression_zstd_dir => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--jobs=2',
+ '--format=directory', '--compress=zstd:1',
+ "--file=$tempdir/compression_zstd_dir", 'postgres',
+ ],
+ # Give coverage for manually compressed blob.toc files during
+ # restore.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-z', '-f', '--rm',
+ "$tempdir/compression_zstd_dir/blobs.toc",
+ "$tempdir/compression_zstd_dir/blobs.toc.zst",
+ ],
+ },
+ # Verify that data files were compressed
+ glob_patterns => [
+ "$tempdir/compression_zstd_dir/toc.dat",
+ "$tempdir/compression_zstd_dir/*.dat.zst",
+ ],
+ restore_cmd => [
+ 'pg_restore', '--jobs=2',
+ "--file=$tempdir/compression_zstd_dir.sql",
+ "$tempdir/compression_zstd_dir",
+ ],
+ },
+
+ compression_zstd_plain => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=plain', '--compress=zstd',
+ "--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
+ ],
+ # Decompress the generated file to run through the tests.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-d', '-f',
+ "$tempdir/compression_zstd_plain.sql.zst",
+ "$tempdir/compression_zstd_plain.sql",
+ ],
+ },
+ },
+
clean => {
dump_cmd => [
'pg_dump',
@@ -4248,10 +4320,11 @@ foreach my $run (sort keys %pgdump_runs)
my $test_key = $run;
my $run_db = 'postgres';
- # Skip command-level tests for gzip/lz4 if they're not supported.
+ # Skip command-level tests for gzip/lz4/zstd if they're not supported.
if ($pgdump_runs{$run}->{compile_option} &&
(($pgdump_runs{$run}->{compile_option} eq 'gzip' && !$supports_gzip) ||
- ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4)))
+ ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4) ||
+ ($pgdump_runs{$run}->{compile_option} eq 'zstd' && !$supports_zstd)))
{
note "$run: skipped due to no $pgdump_runs{$run}->{compile_option} support";
next;
diff --git a/src/tools/pginclude/cpluspluscheck b/src/tools/pginclude/cpluspluscheck
index 135633fb19b..71d19683817 100755
--- a/src/tools/pginclude/cpluspluscheck
+++ b/src/tools/pginclude/cpluspluscheck
@@ -154,6 +154,7 @@ do
test "$f" = src/bin/pg_dump/compress_io.h && continue
test "$f" = src/bin/pg_dump/compress_lz4.h && continue
test "$f" = src/bin/pg_dump/compress_none.h && continue
+ test "$f" = src/bin/pg_dump/compress_zstd.h && continue
test "$f" = src/bin/pg_dump/parallel.h && continue
test "$f" = src/bin/pg_dump/pg_backup_archiver.h && continue
test "$f" = src/bin/pg_dump/pg_dump.h && continue
--
2.34.1
0003-TMP-pg_dump-use-Zstd-by-default-for-CI-only.patchtext/x-diff; charset=us-asciiDownload
From 7b9bff6e458afad750e026a20a7ebf96c7767dde Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Wed, 4 Jan 2023 21:21:53 -0600
Subject: [PATCH 3/4] TMP: pg_dump: use Zstd by default, for CI only
ci-os-only: warnings
---
src/bin/pg_dump/pg_dump.c | 4 ++--
src/bin/pg_dump/t/002_pg_dump.pl | 14 +++++++-------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 6495a88ff99..27eb9f4e63d 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -717,8 +717,8 @@ main(int argc, char **argv)
if ((archiveFormat == archCustom || archiveFormat == archDirectory) &&
!user_compression_defined)
{
-#ifdef HAVE_LIBZ
- parse_compress_specification(PG_COMPRESSION_GZIP, NULL,
+#ifdef USE_ZSTD
+ parse_compress_specification(PG_COMPRESSION_ZSTD, NULL,
&compression_spec);
#endif
}
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index eb5e09f55c8..8b58507703d 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -384,10 +384,10 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_custom_format.dump", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
- name => 'data content is gzip-compressed by default if available',
+ name => 'data content is zstd-compressed by default if available',
},
},
@@ -409,16 +409,16 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_dir_format", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
name => 'data content is gzip-compressed by default',
},
glob_patterns => [
"$tempdir/defaults_dir_format/toc.dat",
"$tempdir/defaults_dir_format/blobs.toc",
- $supports_gzip ?
- "$tempdir/defaults_dir_format/*.dat.gz" :
+ $supports_zstd ?
+ "$tempdir/defaults_dir_format/*.dat.zst" :
"$tempdir/defaults_dir_format/*.dat",
],
},
--
2.34.1
0004-zstd-support-long-distance-mode-in-pg_dump-basebacku.patchtext/x-diff; charset=us-asciiDownload
From 2fe26e632f68d7ffdaa977b6110739a85ee33a60 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 27 Mar 2022 11:55:01 -0500
Subject: [PATCH 4/4] zstd: support long distance mode in pg_dump/basebackup
First proposed here:
20220327205020.GM28503@telsasoft.com
//-os-only: freebsd
---
doc/src/sgml/protocol.sgml | 10 +++-
doc/src/sgml/ref/pg_basebackup.sgml | 4 +-
doc/src/sgml/ref/pg_dump.sgml | 8 ++-
src/backend/backup/basebackup_zstd.c | 12 ++++
src/bin/pg_basebackup/bbstreamer_zstd.c | 13 +++++
src/bin/pg_basebackup/t/010_pg_basebackup.pl | 9 ++-
src/bin/pg_dump/compress_zstd.c | 5 ++
src/bin/pg_dump/t/002_pg_dump.pl | 3 +-
src/bin/pg_verifybackup/t/008_untar.pl | 8 +++
src/bin/pg_verifybackup/t/010_client_untar.pl | 8 +++
src/common/compression.c | 57 ++++++++++++++++++-
src/include/common/compression.h | 2 +
12 files changed, 131 insertions(+), 8 deletions(-)
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index 93fc7167d4a..ee8bb7c2445 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -2747,7 +2747,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
level. Otherwise, it should be a comma-separated list of items,
each of the form <replaceable>keyword</replaceable> or
<replaceable>keyword=value</replaceable>. Currently, the supported
- keywords are <literal>level</literal> and <literal>workers</literal>.
+ keywords are <literal>level</literal>, <literal>long</literal> and
+ <literal>workers</literal>.
</para>
<para>
@@ -2764,6 +2765,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
<literal>3</literal>).
</para>
+ <para>
+ The <literal>long</literal> keyword enables long-distance matching
+ mode, for improved compression ratio, at the expense of higher memory
+ use. Long-distance mode is supported only for
+ <literal>zstd</literal>.
+ </para>
+
<para>
The <literal>workers</literal> keyword sets the number of threads
that should be used for parallel compression. Parallel compression
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index db3ad9cd5eb..79d3e657c32 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -424,8 +424,8 @@ PostgreSQL documentation
level. Otherwise, it should be a comma-separated list of items,
each of the form <literal>keyword</literal> or
<literal>keyword=value</literal>.
- Currently, the supported keywords are <literal>level</literal>
- and <literal>workers</literal>.
+ Currently, the supported keywords are <literal>level</literal>,
+ <literal>long</literal>, and <literal>workers</literal>.
The detail string cannot be used when the compression method
is specified as a plain integer.
</para>
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 0df86a245f0..253518b7a41 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -678,8 +678,12 @@ PostgreSQL documentation
individual table-data segments, and the default is to compress using
<literal>gzip</literal> at a moderate level. For plain text output,
setting a nonzero compression level causes the entire output file to be compressed,
- as though it had been fed through <application>gzip</application> or
- <application>lz4</application>; but the default is not to compress.
+ as though it had been fed through <application>gzip</application>, or
+ <application>lz4</application>; or <application>zstd</application>,
+ but the default is not to compress.
+ With zstd compression, <literal>long</literal> mode may allow dumps
+ to be significantly smaller, but it might not reduce the size of
+ custom or directory format dumps, whose fields are separately compressed.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/backend/backup/basebackup_zstd.c b/src/backend/backup/basebackup_zstd.c
index ac6cac178a0..1bb5820c884 100644
--- a/src/backend/backup/basebackup_zstd.c
+++ b/src/backend/backup/basebackup_zstd.c
@@ -118,6 +118,18 @@ bbsink_zstd_begin_backup(bbsink *sink)
compress->workers, ZSTD_getErrorName(ret)));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(mysink->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret)));
+ }
+
/*
* We need our own buffer, because we're going to pass different data to
* the next sink than what gets passed to us.
diff --git a/src/bin/pg_basebackup/bbstreamer_zstd.c b/src/bin/pg_basebackup/bbstreamer_zstd.c
index fe17d6df4ef..fba391e2a0f 100644
--- a/src/bin/pg_basebackup/bbstreamer_zstd.c
+++ b/src/bin/pg_basebackup/bbstreamer_zstd.c
@@ -106,6 +106,19 @@ bbstreamer_zstd_compressor_new(bbstreamer *next, pg_compress_specification *comp
compress->workers, ZSTD_getErrorName(ret));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(streamer->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ {
+ pg_log_error("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret));
+ exit(1);
+ }
+ }
+
/* Initialize the ZSTD output buffer. */
streamer->zstd_outBuf.dst = streamer->base.bbs_buffer.data;
streamer->zstd_outBuf.size = streamer->base.bbs_buffer.maxlen;
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index b60cb78a0d5..4d130a7f944 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -139,7 +139,14 @@ SKIP:
'gzip:workers=3',
'invalid compression specification: compression algorithm "gzip" does not accept a worker count',
'failure on worker count for gzip'
- ],);
+ ],
+ [
+ 'gzip:long',
+ 'invalid compression specification: compression algorithm "gzip" does not support long-distance mode',
+ 'failure on long mode for gzip'
+ ],
+ );
+
for my $cft (@compression_failure_tests)
{
my $cfail = quotemeta($client_fails . $cft->[1]);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index 5d2f2bf2a2d..65c14a7cb67 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -67,6 +67,11 @@ ZstdCStreamParams(pg_compress_specification compress)
ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
compress.workers);
+ if (compress.options & PG_COMPRESSION_OPTION_LONG_DISTANCE)
+ ZSTD_CCtx_setParam_or_die(cstream,
+ ZSTD_c_enableLongDistanceMatching,
+ compress.long_distance);
+
#if 0
if (compress.options & PG_COMPRESSION_OPTION_CHECKSUM)
ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_checksumFlag,
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 8b58507703d..15e8f606a86 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -267,11 +267,12 @@ my %pgdump_runs = (
],
},
+ # Exercise long mode for test coverage
compression_zstd_plain => {
test_key => 'compression',
compile_option => 'zstd',
dump_cmd => [
- 'pg_dump', '--format=plain', '--compress=zstd',
+ 'pg_dump', '--format=plain', '--compress=zstd:long',
"--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
],
# Decompress the generated file to run through the tests.
diff --git a/src/bin/pg_verifybackup/t/008_untar.pl b/src/bin/pg_verifybackup/t/008_untar.pl
index 3007bbe8556..05754bc8ec7 100644
--- a/src/bin/pg_verifybackup/t/008_untar.pl
+++ b/src/bin/pg_verifybackup/t/008_untar.pl
@@ -49,6 +49,14 @@ my @test_configuration = (
'decompress_program' => $ENV{'ZSTD'},
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => [ '--compress', 'server-zstd:level=1,long' ],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => ['-d'],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
});
for my $tc (@test_configuration)
diff --git a/src/bin/pg_verifybackup/t/010_client_untar.pl b/src/bin/pg_verifybackup/t/010_client_untar.pl
index f3aa0f59e29..ac51a174d14 100644
--- a/src/bin/pg_verifybackup/t/010_client_untar.pl
+++ b/src/bin/pg_verifybackup/t/010_client_untar.pl
@@ -50,6 +50,14 @@ my @test_configuration = (
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
},
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => ['--compress', 'client-zstd:level=1,long'],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => [ '-d' ],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
{
'compression_method' => 'parallel zstd',
'backup_flags' => [ '--compress', 'client-zstd:workers=3' ],
diff --git a/src/common/compression.c b/src/common/compression.c
index 2d3e56b4d62..713a77c292d 100644
--- a/src/common/compression.c
+++ b/src/common/compression.c
@@ -12,7 +12,7 @@
* Otherwise, a compression specification is a comma-separated list of items,
* each having the form keyword or keyword=value.
*
- * Currently, the only supported keywords are "level" and "workers".
+ * Currently, the supported keywords are "level", "long", and "workers".
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
*
@@ -38,6 +38,8 @@
static int expect_integer_value(char *keyword, char *value,
pg_compress_specification *result);
+static bool expect_boolean_value(char *keyword, char *value,
+ pg_compress_specification *result);
/*
* Look up a compression algorithm by name. Returns true and sets *algorithm
@@ -232,6 +234,11 @@ parse_compress_specification(pg_compress_algorithm algorithm, char *specificatio
result->workers = expect_integer_value(keyword, value, result);
result->options |= PG_COMPRESSION_OPTION_WORKERS;
}
+ else if (strcmp(keyword, "long") == 0)
+ {
+ result->long_distance = expect_boolean_value(keyword, value, result);
+ result->options |= PG_COMPRESSION_OPTION_LONG_DISTANCE;
+ }
else
result->parse_error =
psprintf(_("unrecognized compression option: \"%s\""), keyword);
@@ -289,6 +296,43 @@ expect_integer_value(char *keyword, char *value, pg_compress_specification *resu
return ivalue;
}
+/*
+ * Parse 'value' as an boolean and return the result.
+ *
+ * If parsing fails, set result->parse_error to an appropriate message
+ * and return -1. The caller must check result->parse_error to determine if
+ * the call was successful.
+ *
+ * Valid values are: yes, no, on, off, 1, 0.
+ *
+ * Inspired by ParseVariableBool().
+ */
+static bool
+expect_boolean_value(char *keyword, char *value, pg_compress_specification *result)
+{
+ if (value == NULL)
+ return true;
+
+ if (pg_strcasecmp(value, "yes") == 0)
+ return true;
+ if (pg_strcasecmp(value, "on") == 0)
+ return true;
+ if (pg_strcasecmp(value, "1") == 0)
+ return true;
+
+ if (pg_strcasecmp(value, "no") == 0)
+ return false;
+ if (pg_strcasecmp(value, "off") == 0)
+ return false;
+ if (pg_strcasecmp(value, "0") == 0)
+ return false;
+
+ result->parse_error =
+ psprintf(_("value for compression option \"%s\" must be a boolean"),
+ keyword);
+ return false;
+}
+
/*
* Returns NULL if the compression specification string was syntactically
* valid and semantically sensible. Otherwise, returns an error message.
@@ -354,6 +398,17 @@ validate_compress_specification(pg_compress_specification *spec)
get_compress_algorithm_name(spec->algorithm));
}
+ /*
+ * Of the compression algorithms that we currently support, only zstd
+ * supports long-distance mode.
+ */
+ if ((spec->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0 &&
+ (spec->algorithm != PG_COMPRESSION_ZSTD))
+ {
+ return psprintf(_("compression algorithm \"%s\" does not support long-distance mode"),
+ get_compress_algorithm_name(spec->algorithm));
+ }
+
return NULL;
}
diff --git a/src/include/common/compression.h b/src/include/common/compression.h
index b48c173022e..6cf8cf396a8 100644
--- a/src/include/common/compression.h
+++ b/src/include/common/compression.h
@@ -27,6 +27,7 @@ typedef enum pg_compress_algorithm
} pg_compress_algorithm;
#define PG_COMPRESSION_OPTION_WORKERS (1 << 0)
+#define PG_COMPRESSION_OPTION_LONG_DISTANCE (1 << 1)
typedef struct pg_compress_specification
{
@@ -34,6 +35,7 @@ typedef struct pg_compress_specification
unsigned options; /* OR of PG_COMPRESSION_OPTION constants */
int level;
int workers;
+ int long_distance;
char *parse_error; /* NULL if parsing was OK, else message */
} pg_compress_specification;
--
2.34.1
On Sat, Feb 25, 2023 at 5:22 PM Justin Pryzby <pryzby@telsasoft.com> wrote:
This resolves cfbot warnings: windows and cppcheck.
And refactors zstd routines.
And updates docs.
And includes some fixes for earlier patches that these patches conflicts
with/depends on.
This'll need a rebase (cfbot took a while to catch up). The patchset
includes basebackup modifications, which are part of a different CF
entry; was that intended?
I tried this on a local, 3.5GB, mostly-text table (from the UK Price
Paid dataset [1]https://landregistry.data.gov.uk/) and the comparison against the other methods was
impressive. (I'm no good at constructing compression benchmarks, so
this is a super naive setup. Client's on the same laptop as the
server.)
$ time ./src/bin/pg_dump/pg_dump -d postgres -t pp_complete -Z
zstd > /tmp/zstd.dump
real 1m17.632s
user 0m35.521s
sys 0m2.683s
$ time ./\src/bin/pg_dump/pg_dump -d postgres -t pp_complete -Z
lz4 > /tmp/lz4.dump
real 1m13.125s
user 0m19.795s
sys 0m3.370s
$ time ./\src/bin/pg_dump/pg_dump -d postgres -t pp_complete -Z
gzip > /tmp/gzip.dump
real 2m24.523s
user 2m22.114s
sys 0m1.848s
$ ls -l /tmp/*.dump
-rw-rw-r-- 1 jacob jacob 1331493925 Mar 3 09:45 /tmp/gzip.dump
-rw-rw-r-- 1 jacob jacob 2125998939 Mar 3 09:42 /tmp/lz4.dump
-rw-rw-r-- 1 jacob jacob 1215834718 Mar 3 09:40 /tmp/zstd.dump
Default gzip was the only method that bottlenecked on pg_dump rather
than the server, and default zstd outcompressed it at a fraction of
the CPU time. So, naively, this looks really good.
With this particular dataset, I don't see much improvement with
zstd:long. (At nearly double the CPU time, I get a <1% improvement in
compression size.) I assume it's heavily data dependent, but from the
notes on --long [2]https://github.com/facebook/zstd/releases/tag/v1.3.2 it seems like they expect you to play around with
the window size to further tailor it to your data. Does it make sense
to provide the long option without the windowLog parameter?
Thanks,
--Jacob
[1]: https://landregistry.data.gov.uk/
[2]: https://github.com/facebook/zstd/releases/tag/v1.3.2
On Fri, Mar 03, 2023 at 10:32:53AM -0800, Jacob Champion wrote:
On Sat, Feb 25, 2023 at 5:22 PM Justin Pryzby <pryzby@telsasoft.com> wrote:
This resolves cfbot warnings: windows and cppcheck.
And refactors zstd routines.
And updates docs.
And includes some fixes for earlier patches that these patches conflicts
with/depends on.This'll need a rebase (cfbot took a while to catch up).
Soon.
The patchset includes basebackup modifications, which are part of a
different CF entry; was that intended?
Yes, it's intentional - if zstd:long mode were to be merged first, then
this patch should include long mode from the start.
Or, if pgdump+zstd were merged first, then long mode could be added to
both places.
I tried this on a local, 3.5GB, mostly-text table (from the UK Price
Thanks for looking. If your zstd library is compiled with thread
support, could you also try with :workers=N ? I believe this is working
correctly, but I'm going to ask for help verifying that...
It'd be especially useful to test under windows, where pgdump/restore
use threads instead of forking... If you have a windows environment but
not set up for development, I think it's possible to get cirrusci to
compile a patch for you and then retrieve the binaries provided as an
"artifact" (credit/blame for this idea should be directed to Thomas
Munro).
With this particular dataset, I don't see much improvement with
zstd:long.
Yeah. I this could be because either 1) you already got very good
comprssion without looking at more data; and/or 2) the neighboring data
is already very similar, maybe equally or more similar, than the further
data, from which there's nothing to gain.
(At nearly double the CPU time, I get a <1% improvement in
compression size.) I assume it's heavily data dependent, but from the
notes on --long [2] it seems like they expect you to play around with
the window size to further tailor it to your data. Does it make sense
to provide the long option without the windowLog parameter?
I don't want to start exposing lots of fine-granined parameters at this
point. In the immediate case, it looks like it may require more than
just adding another parameter:
Note: If windowLog is set to larger than 27,
--long=windowLog or --memory=windowSize needs to be passed to the
decompressor.
--
Justin
On Fri, Mar 3, 2023 at 10:55 AM Justin Pryzby <pryzby@telsasoft.com> wrote:
Thanks for looking. If your zstd library is compiled with thread
support, could you also try with :workers=N ? I believe this is working
correctly, but I'm going to ask for help verifying that...
Unfortunately not (Ubuntu 20.04):
pg_dump: error: could not set compression parameter: Unsupported parameter
But that lets me review the error! I think these error messages should
say which options caused them.
It'd be especially useful to test under windows, where pgdump/restore
use threads instead of forking... If you have a windows environment but
not set up for development, I think it's possible to get cirrusci to
compile a patch for you and then retrieve the binaries provided as an
"artifact" (credit/blame for this idea should be directed to Thomas
Munro).
I should be able to do that next week.
With this particular dataset, I don't see much improvement with
zstd:long.Yeah. I this could be because either 1) you already got very good
comprssion without looking at more data; and/or 2) the neighboring data
is already very similar, maybe equally or more similar, than the further
data, from which there's nothing to gain.
What kinds of improvements do you see with your setup? I'm wondering
when we would suggest that people use it.
I don't want to start exposing lots of fine-granined parameters at this
point. In the immediate case, it looks like it may require more than
just adding another parameter:Note: If windowLog is set to larger than 27,
--long=windowLog or --memory=windowSize needs to be passed to the
decompressor.
Hm. That would complicate things.
Thanks,
--Jacob
On Fri, Mar 03, 2023 at 01:38:05PM -0800, Jacob Champion wrote:
With this particular dataset, I don't see much improvement with
zstd:long.Yeah. I this could be because either 1) you already got very good
comprssion without looking at more data; and/or 2) the neighboring data
is already very similar, maybe equally or more similar, than the further
data, from which there's nothing to gain.What kinds of improvements do you see with your setup? I'm wondering
when we would suggest that people use it.
On customer data, I see small improvements - below 10%.
But on my first two tries, I made synthetic data sets where it's a lot:
$ ./src/bin/pg_dump/pg_dump -d pryzbyj -Fp -Z zstd:long |wc -c
286107
$ ./src/bin/pg_dump/pg_dump -d pryzbyj -Fp -Z zstd:long=0 |wc -c
1709695
That's just 6 identical tables like:
pryzbyj=# CREATE TABLE t1 AS SELECT generate_series(1,999999);
In this case, "custom" format doesn't see that benefit, because the
greatest similarity is across tables, which don't share compressor
state. But I think the note that I wrote in the docs about that should
be removed - custom format could see a big benefit, as long as the table
is big enough, and there's more similarity/repetition at longer
distances.
Here's one where custom format *does* benefit, due to long-distance
repetition within a single table. The data is contrived, but the schema
of ID => data is not. What's notable isn't how compressible the data
is, but how much *more* compressible it is with long-distance matching.
pryzbyj=# CREATE TABLE t1 AS SELECT i,array_agg(j) FROM generate_series(1,444)i,generate_series(1,99999)j GROUP BY 1;
$ ./src/bin/pg_dump/pg_dump -d pryzbyj -Fc -Z zstd:long=1 |wc -c
82023
$ ./src/bin/pg_dump/pg_dump -d pryzbyj -Fc -Z zstd:long=0 |wc -c
1048267
--
Justin
On Sat, Feb 25, 2023 at 07:22:27PM -0600, Justin Pryzby wrote:
On Fri, Feb 24, 2023 at 01:18:40PM -0600, Justin Pryzby wrote:
This is a draft patch - review is welcome and would help to get this
ready to be considererd for v16, if desired.I'm going to add this thread to the old CF entry.
https://commitfest.postgresql.org/31/2888/This resolves cfbot warnings: windows and cppcheck.
And refactors zstd routines.
And updates docs.
And includes some fixes for earlier patches that these patches conflicts
with/depends on.
This rebases over the TAP and doc fixes to LZ4.
And adds necessary ENV to makefile and meson.
And adds an annoying boilerplate header.
And removes supports_compression(), which is what I think Tomas meant
when referring to "annoying unsupported cases".
And updates zstd.c: fix an off-by-one, allocate in init depending on
readF/writeF, do not reset the input buffer on each iteration, and show
parameter name in errors.
I'd appreciate help checking that this is doing the right things and
works correctly with zstd threaded workers. The zstd API says: "use one
different context per thread for parallel execution" and "For parallel
execution, use one separate ZSTD_CStream per thread".
https://github.com/facebook/zstd/blob/dev/lib/zstd.h
I understand that to mean that, if pg_dump *itself* were using threads,
then each thread would need to call ZSTD_createCStream(). pg_dump isn't
threaded, so there's nothing special needed, right?
Except that, under windows, pg_dump -Fd -j actually uses threads instead
of forking. I *think* that's still safe, since the pgdump threads are
created *before* calling zstd functions (see _PrintTocData and
_StartData of the custom and directory formats), so it happens naturally
that there's a separate zstd stream for each thread of pgdump.
--
Justin
Attachments:
0002-TMP-pg_dump-use-Zstd-by-default-for-CI-only.patchtext/x-diff; charset=us-asciiDownload
From 002cf327f7309c7d2b94919626b6a972ca875ed4 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Wed, 4 Jan 2023 21:21:53 -0600
Subject: [PATCH 2/3] TMP: pg_dump: use Zstd by default, for CI only
//-os-only: warnings
---
src/bin/pg_dump/pg_dump.c | 4 ++--
src/bin/pg_dump/t/002_pg_dump.pl | 14 +++++++-------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 08edeef2e3d..2f35b059a75 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -717,8 +717,8 @@ main(int argc, char **argv)
if ((archiveFormat == archCustom || archiveFormat == archDirectory) &&
!user_compression_defined)
{
-#ifdef HAVE_LIBZ
- parse_compress_specification(PG_COMPRESSION_GZIP, NULL,
+#ifdef USE_ZSTD
+ parse_compress_specification(PG_COMPRESSION_ZSTD, NULL,
&compression_spec);
#else
/* Nothing to do in the default case */
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 85dcc144f83..11a28e77f54 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -384,10 +384,10 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_custom_format.dump", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
- name => 'data content is gzip-compressed by default if available',
+ name => 'data content is zstd-compressed by default if available',
},
},
@@ -409,16 +409,16 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_dir_format", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
name => 'data content is gzip-compressed by default',
},
glob_patterns => [
"$tempdir/defaults_dir_format/toc.dat",
"$tempdir/defaults_dir_format/blobs.toc",
- $supports_gzip ?
- "$tempdir/defaults_dir_format/*.dat.gz" :
+ $supports_zstd ?
+ "$tempdir/defaults_dir_format/*.dat.zst" :
"$tempdir/defaults_dir_format/*.dat",
],
},
--
2.34.1
0003-zstd-support-long-distance-mode-in-pg_dump-basebacku.patchtext/x-diff; charset=us-asciiDownload
From b60bb3ce2cc7e6babc6f8b88a7a5c1f481b79345 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 27 Mar 2022 11:55:01 -0500
Subject: [PATCH 3/3] zstd: support long distance mode in pg_dump/basebackup
First proposed here:
20220327205020.GM28503@telsasoft.com
//-os-only: freebsd
---
doc/src/sgml/protocol.sgml | 10 +++-
doc/src/sgml/ref/pg_basebackup.sgml | 4 +-
doc/src/sgml/ref/pg_dump.sgml | 7 ++-
src/backend/backup/basebackup_zstd.c | 12 ++++
src/bin/pg_basebackup/bbstreamer_zstd.c | 13 +++++
src/bin/pg_basebackup/t/010_pg_basebackup.pl | 9 ++-
src/bin/pg_dump/compress_zstd.c | 5 ++
src/bin/pg_dump/t/002_pg_dump.pl | 3 +-
src/bin/pg_verifybackup/t/008_untar.pl | 8 +++
src/bin/pg_verifybackup/t/010_client_untar.pl | 8 +++
src/common/compression.c | 57 ++++++++++++++++++-
src/include/common/compression.h | 2 +
12 files changed, 130 insertions(+), 8 deletions(-)
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index 73b7f4432f3..05a887cd092 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -2747,7 +2747,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
level. Otherwise, it should be a comma-separated list of items,
each of the form <replaceable>keyword</replaceable> or
<replaceable>keyword=value</replaceable>. Currently, the supported
- keywords are <literal>level</literal> and <literal>workers</literal>.
+ keywords are <literal>level</literal>, <literal>long</literal> and
+ <literal>workers</literal>.
</para>
<para>
@@ -2764,6 +2765,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
<literal>3</literal>).
</para>
+ <para>
+ The <literal>long</literal> keyword enables long-distance matching
+ mode, for improved compression ratio, at the expense of higher memory
+ use. Long-distance mode is supported only for
+ <literal>zstd</literal>.
+ </para>
+
<para>
The <literal>workers</literal> keyword sets the number of threads
that should be used for parallel compression. Parallel compression
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index db3ad9cd5eb..79d3e657c32 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -424,8 +424,8 @@ PostgreSQL documentation
level. Otherwise, it should be a comma-separated list of items,
each of the form <literal>keyword</literal> or
<literal>keyword=value</literal>.
- Currently, the supported keywords are <literal>level</literal>
- and <literal>workers</literal>.
+ Currently, the supported keywords are <literal>level</literal>,
+ <literal>long</literal>, and <literal>workers</literal>.
The detail string cannot be used when the compression method
is specified as a plain integer.
</para>
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 1fb66be1818..261c18e14cc 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -678,8 +678,11 @@ PostgreSQL documentation
individual table-data segments, and the default is to compress using
<literal>gzip</literal> at a moderate level. For plain text output,
setting a nonzero compression level causes the entire output file to be compressed,
- as though it had been fed through <application>gzip</application> or
- <application>lz4</application>; but the default is not to compress.
+ as though it had been fed through <application>gzip</application>, or
+ <application>lz4</application>; or <application>zstd</application>,
+ but the default is not to compress.
+ With zstd compression, <literal>long</literal> mode may improve the
+ compression ratio, at the cost of increased memory use.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/backend/backup/basebackup_zstd.c b/src/backend/backup/basebackup_zstd.c
index ac6cac178a0..1bb5820c884 100644
--- a/src/backend/backup/basebackup_zstd.c
+++ b/src/backend/backup/basebackup_zstd.c
@@ -118,6 +118,18 @@ bbsink_zstd_begin_backup(bbsink *sink)
compress->workers, ZSTD_getErrorName(ret)));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(mysink->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret)));
+ }
+
/*
* We need our own buffer, because we're going to pass different data to
* the next sink than what gets passed to us.
diff --git a/src/bin/pg_basebackup/bbstreamer_zstd.c b/src/bin/pg_basebackup/bbstreamer_zstd.c
index fe17d6df4ef..fba391e2a0f 100644
--- a/src/bin/pg_basebackup/bbstreamer_zstd.c
+++ b/src/bin/pg_basebackup/bbstreamer_zstd.c
@@ -106,6 +106,19 @@ bbstreamer_zstd_compressor_new(bbstreamer *next, pg_compress_specification *comp
compress->workers, ZSTD_getErrorName(ret));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(streamer->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ {
+ pg_log_error("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret));
+ exit(1);
+ }
+ }
+
/* Initialize the ZSTD output buffer. */
streamer->zstd_outBuf.dst = streamer->base.bbs_buffer.data;
streamer->zstd_outBuf.size = streamer->base.bbs_buffer.maxlen;
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index b60cb78a0d5..4d130a7f944 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -139,7 +139,14 @@ SKIP:
'gzip:workers=3',
'invalid compression specification: compression algorithm "gzip" does not accept a worker count',
'failure on worker count for gzip'
- ],);
+ ],
+ [
+ 'gzip:long',
+ 'invalid compression specification: compression algorithm "gzip" does not support long-distance mode',
+ 'failure on long mode for gzip'
+ ],
+ );
+
for my $cft (@compression_failure_tests)
{
my $cfail = quotemeta($client_fails . $cft->[1]);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index d4c54d6a1dd..09378f62939 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -84,6 +84,11 @@ ZstdCStreamParams(pg_compress_specification compress)
ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
compress.workers, "workers");
+ if (compress.options & PG_COMPRESSION_OPTION_LONG_DISTANCE)
+ ZSTD_CCtx_setParam_or_die(cstream,
+ ZSTD_c_enableLongDistanceMatching,
+ compress.long_distance, "long");
+
return cstream;
}
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 11a28e77f54..634acf39840 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -267,11 +267,12 @@ my %pgdump_runs = (
],
},
+ # Exercise long mode for test coverage
compression_zstd_plain => {
test_key => 'compression',
compile_option => 'zstd',
dump_cmd => [
- 'pg_dump', '--format=plain', '--compress=zstd',
+ 'pg_dump', '--format=plain', '--compress=zstd:long',
"--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
],
# Decompress the generated file to run through the tests.
diff --git a/src/bin/pg_verifybackup/t/008_untar.pl b/src/bin/pg_verifybackup/t/008_untar.pl
index 3007bbe8556..05754bc8ec7 100644
--- a/src/bin/pg_verifybackup/t/008_untar.pl
+++ b/src/bin/pg_verifybackup/t/008_untar.pl
@@ -49,6 +49,14 @@ my @test_configuration = (
'decompress_program' => $ENV{'ZSTD'},
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => [ '--compress', 'server-zstd:level=1,long' ],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => ['-d'],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
});
for my $tc (@test_configuration)
diff --git a/src/bin/pg_verifybackup/t/010_client_untar.pl b/src/bin/pg_verifybackup/t/010_client_untar.pl
index f3aa0f59e29..ac51a174d14 100644
--- a/src/bin/pg_verifybackup/t/010_client_untar.pl
+++ b/src/bin/pg_verifybackup/t/010_client_untar.pl
@@ -50,6 +50,14 @@ my @test_configuration = (
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
},
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => ['--compress', 'client-zstd:level=1,long'],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => [ '-d' ],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
{
'compression_method' => 'parallel zstd',
'backup_flags' => [ '--compress', 'client-zstd:workers=3' ],
diff --git a/src/common/compression.c b/src/common/compression.c
index 2d3e56b4d62..713a77c292d 100644
--- a/src/common/compression.c
+++ b/src/common/compression.c
@@ -12,7 +12,7 @@
* Otherwise, a compression specification is a comma-separated list of items,
* each having the form keyword or keyword=value.
*
- * Currently, the only supported keywords are "level" and "workers".
+ * Currently, the supported keywords are "level", "long", and "workers".
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
*
@@ -38,6 +38,8 @@
static int expect_integer_value(char *keyword, char *value,
pg_compress_specification *result);
+static bool expect_boolean_value(char *keyword, char *value,
+ pg_compress_specification *result);
/*
* Look up a compression algorithm by name. Returns true and sets *algorithm
@@ -232,6 +234,11 @@ parse_compress_specification(pg_compress_algorithm algorithm, char *specificatio
result->workers = expect_integer_value(keyword, value, result);
result->options |= PG_COMPRESSION_OPTION_WORKERS;
}
+ else if (strcmp(keyword, "long") == 0)
+ {
+ result->long_distance = expect_boolean_value(keyword, value, result);
+ result->options |= PG_COMPRESSION_OPTION_LONG_DISTANCE;
+ }
else
result->parse_error =
psprintf(_("unrecognized compression option: \"%s\""), keyword);
@@ -289,6 +296,43 @@ expect_integer_value(char *keyword, char *value, pg_compress_specification *resu
return ivalue;
}
+/*
+ * Parse 'value' as an boolean and return the result.
+ *
+ * If parsing fails, set result->parse_error to an appropriate message
+ * and return -1. The caller must check result->parse_error to determine if
+ * the call was successful.
+ *
+ * Valid values are: yes, no, on, off, 1, 0.
+ *
+ * Inspired by ParseVariableBool().
+ */
+static bool
+expect_boolean_value(char *keyword, char *value, pg_compress_specification *result)
+{
+ if (value == NULL)
+ return true;
+
+ if (pg_strcasecmp(value, "yes") == 0)
+ return true;
+ if (pg_strcasecmp(value, "on") == 0)
+ return true;
+ if (pg_strcasecmp(value, "1") == 0)
+ return true;
+
+ if (pg_strcasecmp(value, "no") == 0)
+ return false;
+ if (pg_strcasecmp(value, "off") == 0)
+ return false;
+ if (pg_strcasecmp(value, "0") == 0)
+ return false;
+
+ result->parse_error =
+ psprintf(_("value for compression option \"%s\" must be a boolean"),
+ keyword);
+ return false;
+}
+
/*
* Returns NULL if the compression specification string was syntactically
* valid and semantically sensible. Otherwise, returns an error message.
@@ -354,6 +398,17 @@ validate_compress_specification(pg_compress_specification *spec)
get_compress_algorithm_name(spec->algorithm));
}
+ /*
+ * Of the compression algorithms that we currently support, only zstd
+ * supports long-distance mode.
+ */
+ if ((spec->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0 &&
+ (spec->algorithm != PG_COMPRESSION_ZSTD))
+ {
+ return psprintf(_("compression algorithm \"%s\" does not support long-distance mode"),
+ get_compress_algorithm_name(spec->algorithm));
+ }
+
return NULL;
}
diff --git a/src/include/common/compression.h b/src/include/common/compression.h
index b48c173022e..6cf8cf396a8 100644
--- a/src/include/common/compression.h
+++ b/src/include/common/compression.h
@@ -27,6 +27,7 @@ typedef enum pg_compress_algorithm
} pg_compress_algorithm;
#define PG_COMPRESSION_OPTION_WORKERS (1 << 0)
+#define PG_COMPRESSION_OPTION_LONG_DISTANCE (1 << 1)
typedef struct pg_compress_specification
{
@@ -34,6 +35,7 @@ typedef struct pg_compress_specification
unsigned options; /* OR of PG_COMPRESSION_OPTION constants */
int level;
int workers;
+ int long_distance;
char *parse_error; /* NULL if parsing was OK, else message */
} pg_compress_specification;
--
2.34.1
0001-pg_dump-zstd-compression.patchtext/x-diff; charset=us-asciiDownload
From 6a8b88a3dd37d24ebfdaa6a96505476b8a1efe92 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 7 Jan 2023 15:45:06 -0600
Subject: [PATCH 1/3] pg_dump: zstd compression
Previously proposed at: 20201221194924.GI30237@telsasoft.com
---
doc/src/sgml/ref/pg_dump.sgml | 8 +-
src/bin/pg_dump/Makefile | 2 +
src/bin/pg_dump/compress_io.c | 79 ++--
src/bin/pg_dump/compress_zstd.c | 515 ++++++++++++++++++++++++++
src/bin/pg_dump/compress_zstd.h | 9 +
src/bin/pg_dump/meson.build | 2 +
src/bin/pg_dump/pg_backup_archiver.c | 28 +-
src/bin/pg_dump/pg_backup_directory.c | 2 +
src/bin/pg_dump/pg_dump.c | 13 -
src/bin/pg_dump/t/002_pg_dump.pl | 79 +++-
src/tools/pginclude/cpluspluscheck | 1 +
11 files changed, 654 insertions(+), 84 deletions(-)
create mode 100644 src/bin/pg_dump/compress_zstd.c
create mode 100644 src/bin/pg_dump/compress_zstd.h
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 334e4b7fd14..1fb66be1818 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -330,8 +330,9 @@ PostgreSQL documentation
machine-readable format that <application>pg_restore</application>
can read. A directory format archive can be manipulated with
standard Unix tools; for example, files in an uncompressed archive
- can be compressed with the <application>gzip</application> or
- <application>lz4</application> tools.
+ can be compressed with the <application>gzip</application>,
+ <application>lz4</application>, or
+ <application>zstd</application> tools.
This format is compressed by default using <literal>gzip</literal>
and also supports parallel dumps.
</para>
@@ -655,7 +656,8 @@ PostgreSQL documentation
<para>
Specify the compression method and/or the compression level to use.
The compression method can be set to <literal>gzip</literal>,
- <literal>lz4</literal>, or <literal>none</literal> for no compression.
+ <literal>lz4</literal>, <literal>zstd</literal>,
+ or <literal>none</literal> for no compression.
A compression detail string can optionally be specified. If the
detail string is an integer, it specifies the compression level.
Otherwise, it should be a comma-separated list of items, each of the
diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile
index eb8f59459a1..24de7593a6a 100644
--- a/src/bin/pg_dump/Makefile
+++ b/src/bin/pg_dump/Makefile
@@ -18,6 +18,7 @@ include $(top_builddir)/src/Makefile.global
export GZIP_PROGRAM=$(GZIP)
export LZ4
+export ZSTD
export with_icu
override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
@@ -29,6 +30,7 @@ OBJS = \
compress_io.o \
compress_lz4.o \
compress_none.o \
+ compress_zstd.o \
dumputils.o \
parallel.o \
pg_backup_archiver.o \
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c
index ce06f1eac9c..a3c2f36bb67 100644
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -52,8 +52,8 @@
*
* InitDiscoverCompressFileHandle tries to infer the compression by the
* filename suffix. If the suffix is not yet known then it tries to simply
- * open the file and if it fails, it tries to open the same file with the .gz
- * suffix, and then again with the .lz4 suffix.
+ * open the file and if it fails, it tries to open the same file with
+ * compressed suffixes.
*
* IDENTIFICATION
* src/bin/pg_dump/compress_io.c
@@ -69,6 +69,7 @@
#include "compress_io.h"
#include "compress_lz4.h"
#include "compress_none.h"
+#include "compress_zstd.h"
#include "pg_backup_utils.h"
/*----------------------
@@ -76,36 +77,6 @@
*----------------------
*/
-/*
- * Checks whether a compression algorithm is supported.
- *
- * On success returns NULL, otherwise returns a malloc'ed string which can be
- * used by the caller in an error message.
- */
-char *
-supports_compression(const pg_compress_specification compression_spec)
-{
- const pg_compress_algorithm algorithm = compression_spec.algorithm;
- bool supported = false;
-
- if (algorithm == PG_COMPRESSION_NONE)
- supported = true;
-#ifdef HAVE_LIBZ
- if (algorithm == PG_COMPRESSION_GZIP)
- supported = true;
-#endif
-#ifdef USE_LZ4
- if (algorithm == PG_COMPRESSION_LZ4)
- supported = true;
-#endif
-
- if (!supported)
- return psprintf("this build does not support compression with %s",
- get_compress_algorithm_name(algorithm));
-
- return NULL;
-}
-
/*----------------------
* Compressor API
*----------------------
@@ -130,6 +101,8 @@ AllocateCompressor(const pg_compress_specification compression_spec,
InitCompressorGzip(cs, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressorLZ4(cs, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressorZstd(cs, compression_spec);
return cs;
}
@@ -196,20 +169,30 @@ InitCompressFileHandle(const pg_compress_specification compression_spec)
InitCompressFileHandleGzip(CFH, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressFileHandleLZ4(CFH, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressFileHandleZstd(CFH, compression_spec);
return CFH;
}
+static bool
+check_compressed_file(const char *path, char **fname, char *ext)
+{
+ free_keep_errno(*fname);
+ *fname = psprintf("%s.%s", path, ext);
+ return (access(*fname, F_OK) == 0);
+}
+
/*
* Open a file for reading. 'path' is the file to open, and 'mode' should
* be either "r" or "rb".
*
* If the file at 'path' contains the suffix of a supported compression method,
- * currently this includes ".gz" and ".lz4", then this compression will be used
+ * currently this includes ".gz", ".lz4" and ".zst", then this compression will be used
* throughout. Otherwise the compression will be inferred by iteratively trying
* to open the file at 'path', first as is, then by appending known compression
* suffixes. So if you pass "foo" as 'path', this will open either "foo" or
- * "foo.gz" or "foo.lz4", trying in that order.
+ * "foo.{gz,lz4,zst}", trying in that order.
*
* On failure, return NULL with an error code in errno.
*/
@@ -237,28 +220,12 @@ InitDiscoverCompressFileHandle(const char *path, const char *mode)
/* avoid unused warning if it is not built with compression */
if (exists)
compression_spec.algorithm = PG_COMPRESSION_NONE;
-#ifdef HAVE_LIBZ
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.gz", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_GZIP;
- }
-#endif
-#ifdef USE_LZ4
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.lz4", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_LZ4;
- }
-#endif
+ else if (check_compressed_file(path, &fname, "gz"))
+ compression_spec.algorithm = PG_COMPRESSION_GZIP;
+ else if (check_compressed_file(path, &fname, "lz4"))
+ compression_spec.algorithm = PG_COMPRESSION_LZ4;
+ else if (check_compressed_file(path, &fname, "zst"))
+ compression_spec.algorithm = PG_COMPRESSION_ZSTD;
}
CFH = InitCompressFileHandle(compression_spec);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
new file mode 100644
index 00000000000..d4c54d6a1dd
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -0,0 +1,515 @@
+/*-------------------------------------------------------------------------
+ *
+ * compress_zstd.c
+ * Routines for archivers to write a Zstd compressed data stream.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/bin/pg_dump/compress_zstd.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+#include "pg_backup_utils.h"
+#include "compress_zstd.h"
+
+#ifndef USE_ZSTD
+
+void
+InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+#else
+
+#include <zstd.h>
+
+typedef struct ZstdCompressorState
+{
+ /* This is a normal file to which we read/write compressed data */
+ FILE *fp;
+
+ ZSTD_CStream *cstream;
+ ZSTD_DStream *dstream;
+ ZSTD_outBuffer output;
+ ZSTD_inBuffer input;
+
+ /* pointer to a static string like from strerror(), for Zstd_write() */
+ const char *zstderror;
+} ZstdCompressorState;
+
+static ZSTD_CStream *ZstdCStreamParams(pg_compress_specification compress);
+static void EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs);
+static void WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen);
+static void ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs);
+
+static void
+ZSTD_CCtx_setParam_or_die(ZSTD_CStream *cstream,
+ ZSTD_cParameter param, int value, char *paramname)
+{
+ size_t res;
+
+ res = ZSTD_CCtx_setParameter(cstream, param, value);
+ if (ZSTD_isError(res))
+ pg_fatal("could not set compression parameter: \"%s\": %s",
+ paramname, ZSTD_getErrorName(res));
+}
+
+/* Return a compression stream with parameters set per argument */
+static ZSTD_CStream *
+ZstdCStreamParams(pg_compress_specification compress)
+{
+ ZSTD_CStream *cstream;
+
+ cstream = ZSTD_createCStream();
+ if (cstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
+ compress.level, "level");
+
+ if (compress.options & PG_COMPRESSION_OPTION_WORKERS)
+ ZSTD_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
+ compress.workers, "workers");
+
+ return cstream;
+}
+
+/* Helper function for EndCompressorZstd and WriteDataToArchiveZstd */
+static void
+ZstdWriteCommon(ArchiveHandle *AH, CompressorState *cs, bool flush)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop while there's any input or until flushed */
+ while (input->pos != input->size || flush)
+ {
+ size_t res;
+
+ res = ZSTD_compressStream2(zstdcs->cstream, output,
+ input, flush ? ZSTD_e_end : ZSTD_e_continue);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+
+ /*
+ * Extra paranoia: avoid zero-length chunks, since a zero length chunk
+ * is the EOF marker in the custom format. This should never happen
+ * but...
+ */
+ if (output->pos > 0)
+ cs->writeF(AH, output->dst, output->pos);
+
+ output->pos = 0;
+
+ if (res == 0)
+ break;
+ }
+}
+
+void
+EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ if (zstdcs->cstream != NULL)
+ {
+ ZstdWriteCommon(AH, cs, true);
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ if (zstdcs->dstream != NULL)
+ {
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ pg_free(zstdcs->output.dst);
+ }
+
+ pg_free(zstdcs);
+}
+
+static void
+WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ zstdcs->input.src = data;
+ zstdcs->input.size = dLen;
+ zstdcs->input.pos = 0;
+
+ ZstdWriteCommon(AH, cs, false);
+}
+
+/* Read data from a compressed zstd archive */
+static void
+ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ size_t input_allocated_size = ZSTD_DStreamInSize();
+ size_t res;
+
+ for (;;)
+ {
+ size_t cnt;
+
+ /*
+ * Read compressed data. Note that readF can resize the buffer; the
+ * new size is tracked and used for future loops.
+ */
+ input->size = input_allocated_size;
+ cnt = cs->readF(AH, (char **) unconstify(void **, &input->src), &input->size);
+ input_allocated_size = input->size;
+ input->size = cnt;
+ input->pos = 0;
+
+ if (cnt == 0)
+ break;
+
+ /* Now decompress */
+ while (input->pos < input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ /* write to output handle */
+ ((char *) output->dst)[output->pos] = '\0';
+ ahwrite(output->dst, 1, output->pos, AH);
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+ }
+}
+
+/* Public routines that support Zstd compressed data I/O */
+void
+InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec)
+{
+ ZstdCompressorState *zstdcs;
+
+ cs->readData = ReadDataFromArchiveZstd;
+ cs->writeData = WriteDataToArchiveZstd;
+ cs->end = EndCompressorZstd;
+
+ cs->compression_spec = compression_spec;
+
+ cs->private_data = pg_malloc0(sizeof(ZstdCompressorState));
+ zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ if (cs->readF != NULL)
+ {
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ zstdcs->input.size = ZSTD_DStreamInSize();
+ zstdcs->input.src = pg_malloc(zstdcs->input.size);
+
+ zstdcs->output.size = ZSTD_DStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size + 1);
+ }
+
+ if (cs->writeF != NULL)
+ {
+ zstdcs->cstream = ZstdCStreamParams(cs->compression_spec);
+
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size);
+ zstdcs->output.pos = 0;
+ }
+}
+
+/*
+ * Compressed stream API
+ */
+
+static size_t
+Zstd_read(void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t input_allocated_size = ZSTD_DStreamInSize();
+ size_t res,
+ cnt;
+
+ output->size = size;
+ output->dst = ptr;
+ output->pos = 0;
+
+ for (;;)
+ {
+ Assert(input->pos <= input->size);
+ Assert(input->size <= input_allocated_size);
+
+ /* If the input is completely consumed, start back at the beginning */
+ if (input->pos == input->size)
+ {
+ /* input->size is size produced by "fread" */
+ input->size = 0;
+ /* input->pos is position consumed by decompress */
+ input->pos = 0;
+ }
+
+ /* read compressed data if we must produce more input */
+ if (input->pos == input->size)
+ {
+ cnt = fread(unconstify(void *, input->src), 1, input_allocated_size, zstdcs->fp);
+ input->size = cnt;
+
+ Assert(cnt >= 0);
+ Assert(input->size <= input_allocated_size);
+
+ /* If we have no input to consume, we're done */
+ if (cnt == 0)
+ break;
+ }
+
+ while (input->pos < input->size)
+ {
+ /* decompress */
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ if (output->pos == output->size)
+ break; /* No more room for output */
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ if (output->pos == output->size)
+ break; /* We read all the data that fits */
+ }
+
+ return output->pos;
+}
+
+static size_t
+Zstd_write(const void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t res,
+ cnt;
+
+ input->src = ptr;
+ input->size = size;
+ input->pos = 0;
+
+ /* Consume all input, to be flushed later */
+ while (input->pos != input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_continue);
+ if (ZSTD_isError(res))
+ {
+ zstdcs->zstderror = ZSTD_getErrorName(res);
+ return -1;
+ }
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ {
+ zstdcs->zstderror = strerror(errno);
+ return -1;
+ }
+ }
+
+ return size;
+}
+
+static int
+Zstd_getc(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int ret;
+
+ if (CFH->read_func(&ret, 1, CFH) != 1)
+ {
+ if (feof(zstdcs->fp))
+ pg_fatal("could not read from input file: end of file");
+ else
+ pg_fatal("could not read from input file: %m");
+ }
+ return ret;
+}
+
+static char *
+Zstd_gets(char *buf, int len, CompressFileHandle *CFH)
+{
+ int i,
+ res;
+
+ /*
+ * Read one byte at a time until newline or EOF. This is only used to read
+ * the list of LOs, and the I/O is buffered anyway.
+ */
+ for (i = 0; i < len - 1; ++i)
+ {
+ res = CFH->read_func(&buf[i], 1, CFH);
+ if (res != 1)
+ break;
+ if (buf[i] == '\n')
+ {
+ ++i;
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return i > 0 ? buf : NULL;
+}
+
+static int
+Zstd_close(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int result;
+
+ if (zstdcs->cstream)
+ {
+ size_t res,
+ cnt;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop until the compression buffers are fully consumed */
+ for (;;)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_end);
+ if (ZSTD_isError(res))
+ {
+ zstdcs->zstderror = ZSTD_getErrorName(res);
+ return -1;
+ }
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ {
+ zstdcs->zstderror = strerror(errno);
+ return -1;
+ }
+
+ if (res == 0)
+ break;
+ }
+
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ if (zstdcs->dstream)
+ {
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+
+ result = fclose(zstdcs->fp);
+ pg_free(zstdcs);
+ return result;
+}
+
+static int
+Zstd_eof(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ return feof(zstdcs->fp);
+}
+
+static int
+Zstd_open(const char *path, int fd, const char *mode,
+ CompressFileHandle *CFH)
+{
+ FILE *fp;
+ ZstdCompressorState *zstdcs;
+
+ if (fd >= 0)
+ fp = fdopen(fd, mode);
+ else
+ fp = fopen(path, mode);
+
+ if (fp == NULL)
+ return 1;
+
+ CFH->private_data = pg_malloc0(sizeof(ZstdCompressorState));
+ zstdcs = (ZstdCompressorState *) CFH->private_data;
+ zstdcs->fp = fp;
+
+ if (mode[0] == 'r')
+ {
+ zstdcs->input.src = pg_malloc0(ZSTD_DStreamInSize());
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ else if (mode[0] == 'w' || mode[0] == 'a')
+ {
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc0(zstdcs->output.size);
+ zstdcs->cstream = ZstdCStreamParams(CFH->compression_spec);
+ }
+ else
+ pg_fatal("unhandled mode");
+
+ return 0;
+}
+
+static int
+Zstd_open_write(const char *path, const char *mode, CompressFileHandle *CFH)
+{
+ char fname[MAXPGPATH];
+
+ sprintf(fname, "%s.zst", path);
+ return CFH->open_func(fname, -1, mode, CFH);
+}
+
+static const char *
+Zstd_get_error(CompressFileHandle *CFH)
+{
+ return strerror(errno);
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec)
+{
+ CFH->open_func = Zstd_open;
+ CFH->open_write_func = Zstd_open_write;
+ CFH->read_func = Zstd_read;
+ CFH->write_func = Zstd_write;
+ CFH->gets_func = Zstd_gets;
+ CFH->getc_func = Zstd_getc;
+ CFH->close_func = Zstd_close;
+ CFH->eof_func = Zstd_eof;
+ CFH->get_error_func = Zstd_get_error;
+
+ CFH->compression_spec = compression_spec;
+
+ CFH->private_data = NULL;
+}
+
+#endif /* USE_ZSTD */
diff --git a/src/bin/pg_dump/compress_zstd.h b/src/bin/pg_dump/compress_zstd.h
new file mode 100644
index 00000000000..f36698b4c26
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.h
@@ -0,0 +1,9 @@
+#ifndef COMPRESS_ZSTD_H
+#define COMPRESS_ZSTD_H
+
+#include "compress_io.h"
+
+extern void InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec);
+extern void InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec);
+
+#endif /* COMPRESS_ZSTD_H */
diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build
index 0da476a4c34..c332ef87787 100644
--- a/src/bin/pg_dump/meson.build
+++ b/src/bin/pg_dump/meson.build
@@ -5,6 +5,7 @@ pg_dump_common_sources = files(
'compress_io.c',
'compress_lz4.c',
'compress_none.c',
+ 'compress_zstd.c',
'dumputils.c',
'parallel.c',
'pg_backup_archiver.c',
@@ -90,6 +91,7 @@ tests += {
'env': {
'GZIP_PROGRAM': gzip.path(),
'LZ4': program_lz4.found() ? program_lz4.path() : '',
+ 'ZSTD': program_zstd.found() ? program_zstd.path() : '',
},
'tests': [
't/001_basic.pl',
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 61ebb8fe85d..6fda84d8774 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -388,10 +388,12 @@ RestoreArchive(Archive *AHX)
{
if (te->hadDumper && (te->reqs & REQ_DATA) != 0)
{
- char *errmsg = supports_compression(AH->compression_spec);
- if (errmsg)
+ pg_compress_specification compress_spec;
+ parse_compress_specification(AH->compression_spec.algorithm,
+ NULL, &compress_spec);
+ if (compress_spec.parse_error != NULL)
pg_fatal("cannot restore from compressed archive (%s)",
- errmsg);
+ compress_spec.parse_error);
else
break;
}
@@ -2075,7 +2077,7 @@ _discoverArchiveFormat(ArchiveHandle *AH)
/*
* Check if the specified archive is a directory. If so, check if
- * there's a "toc.dat" (or "toc.dat.{gz,lz4}") file in it.
+ * there's a "toc.dat" (or "toc.dat.{gz,lz4,zst}") file in it.
*/
if (stat(AH->fSpec, &st) == 0 && S_ISDIR(st.st_mode))
{
@@ -2086,10 +2088,17 @@ _discoverArchiveFormat(ArchiveHandle *AH)
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.gz"))
return AH->format;
#endif
+
#ifdef USE_LZ4
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.lz4"))
return AH->format;
#endif
+
+#ifdef USE_ZSTD
+ if (_fileExistsInDirectory(AH->fSpec, "toc.dat.zst"))
+ return AH->format;
+#endif
+
pg_fatal("directory \"%s\" does not appear to be a valid archive (\"toc.dat\" does not exist)",
AH->fSpec);
fh = NULL; /* keep compiler quiet */
@@ -3674,7 +3683,7 @@ WriteHead(ArchiveHandle *AH)
void
ReadHead(ArchiveHandle *AH)
{
- char *errmsg;
+ pg_compress_specification compress_spec;
char vmaj,
vmin,
vrev;
@@ -3745,12 +3754,13 @@ ReadHead(ArchiveHandle *AH)
else
AH->compression_spec.algorithm = PG_COMPRESSION_GZIP;
- errmsg = supports_compression(AH->compression_spec);
- if (errmsg)
+ parse_compress_specification(AH->compression_spec.algorithm,
+ NULL, &compress_spec);
+ if (compress_spec.parse_error != NULL)
{
pg_log_warning("archive is compressed, but this installation does not support compression (%s) -- no data will be available",
- errmsg);
- pg_free(errmsg);
+ compress_spec.parse_error);
+ pg_free(compress_spec.parse_error);
}
if (AH->version >= K_VERS_1_4)
diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c
index 41c2b733e3e..29845340859 100644
--- a/src/bin/pg_dump/pg_backup_directory.c
+++ b/src/bin/pg_dump/pg_backup_directory.c
@@ -785,6 +785,8 @@ _PrepParallelRestore(ArchiveHandle *AH)
strlcat(fname, ".gz", sizeof(fname));
else if (AH->compression_spec.algorithm == PG_COMPRESSION_LZ4)
strlcat(fname, ".lz4", sizeof(fname));
+ else if (AH->compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ strlcat(fname, ".zst", sizeof(fname));
if (stat(fname, &st) == 0)
te->dataLength = st.st_size;
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 74d806c77ba..08edeef2e3d 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -710,19 +710,6 @@ main(int argc, char **argv)
pg_fatal("invalid compression specification: %s",
error_detail);
- switch (compression_algorithm)
- {
- case PG_COMPRESSION_NONE:
- /* fallthrough */
- case PG_COMPRESSION_GZIP:
- /* fallthrough */
- case PG_COMPRESSION_LZ4:
- break;
- case PG_COMPRESSION_ZSTD:
- pg_fatal("compression with %s is not yet supported", "ZSTD");
- break;
- }
-
/*
* Custom and directory formats are compressed by default with gzip when
* available, not the others.
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 14cd0d2d503..85dcc144f83 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -54,8 +54,9 @@ my $tempdir = PostgreSQL::Test::Utils::tempdir;
# those lines) to validate that part of the process.
my $supports_icu = ($ENV{with_icu} eq 'yes');
-my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
my $supports_gzip = check_pg_config("#define HAVE_LIBZ 1");
+my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
+my $supports_zstd = check_pg_config("#define USE_ZSTD 1");
my %pgdump_runs = (
binary_upgrade => {
@@ -213,6 +214,77 @@ my %pgdump_runs = (
},
},
+ compression_zstd_custom => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=custom',
+ '--compress=zstd', "--file=$tempdir/compression_zstd_custom.dump",
+ 'postgres',
+ ],
+ restore_cmd => [
+ 'pg_restore',
+ "--file=$tempdir/compression_zstd_custom.sql",
+ "$tempdir/compression_zstd_custom.dump",
+ ],
+ command_like => {
+ command => [
+ 'pg_restore',
+ '-l', "$tempdir/compression_zstd_custom.dump",
+ ],
+ expected => qr/Compression: zstd/,
+ name => 'data content is zstd compressed'
+ },
+ },
+
+ compression_zstd_dir => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--jobs=2',
+ '--format=directory', '--compress=zstd:1',
+ "--file=$tempdir/compression_zstd_dir", 'postgres',
+ ],
+ # Give coverage for manually compressed blob.toc files during
+ # restore.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-z', '-f', '--rm',
+ "$tempdir/compression_zstd_dir/blobs.toc",
+ "-o", "$tempdir/compression_zstd_dir/blobs.toc.zst",
+ ],
+ },
+ # Verify that data files were compressed
+ glob_patterns => [
+ "$tempdir/compression_zstd_dir/toc.dat",
+ "$tempdir/compression_zstd_dir/*.dat.zst",
+ ],
+ restore_cmd => [
+ 'pg_restore', '--jobs=2',
+ "--file=$tempdir/compression_zstd_dir.sql",
+ "$tempdir/compression_zstd_dir",
+ ],
+ },
+
+ compression_zstd_plain => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=plain', '--compress=zstd',
+ "--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
+ ],
+ # Decompress the generated file to run through the tests.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-d', '-f',
+ "$tempdir/compression_zstd_plain.sql.zst",
+ "-o", "$tempdir/compression_zstd_plain.sql",
+ ],
+ },
+ },
+
clean => {
dump_cmd => [
'pg_dump',
@@ -4271,10 +4343,11 @@ foreach my $run (sort keys %pgdump_runs)
my $test_key = $run;
my $run_db = 'postgres';
- # Skip command-level tests for gzip/lz4 if there is no support for it.
+ # Skip command-level tests for gzip/lz4/zstd if the tool is not supported
if ($pgdump_runs{$run}->{compile_option} &&
(($pgdump_runs{$run}->{compile_option} eq 'gzip' && !$supports_gzip) ||
- ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4)))
+ ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4) ||
+ ($pgdump_runs{$run}->{compile_option} eq 'zstd' && !$supports_zstd)))
{
note "$run: skipped due to no $pgdump_runs{$run}->{compile_option} support";
next;
diff --git a/src/tools/pginclude/cpluspluscheck b/src/tools/pginclude/cpluspluscheck
index 58039934756..10fb51585c9 100755
--- a/src/tools/pginclude/cpluspluscheck
+++ b/src/tools/pginclude/cpluspluscheck
@@ -154,6 +154,7 @@ do
test "$f" = src/bin/pg_dump/compress_io.h && continue
test "$f" = src/bin/pg_dump/compress_lz4.h && continue
test "$f" = src/bin/pg_dump/compress_none.h && continue
+ test "$f" = src/bin/pg_dump/compress_zstd.h && continue
test "$f" = src/bin/pg_dump/parallel.h && continue
test "$f" = src/bin/pg_dump/pg_backup_archiver.h && continue
test "$f" = src/bin/pg_dump/pg_dump.h && continue
--
2.34.1
On Sat, Mar 4, 2023 at 8:57 AM Justin Pryzby <pryzby@telsasoft.com> wrote:
pryzbyj=# CREATE TABLE t1 AS SELECT i,array_agg(j) FROM generate_series(1,444)i,generate_series(1,99999)j GROUP BY 1;
$ ./src/bin/pg_dump/pg_dump -d pryzbyj -Fc -Z zstd:long=1 |wc -c
82023
$ ./src/bin/pg_dump/pg_dump -d pryzbyj -Fc -Z zstd:long=0 |wc -c
1048267
Nice!
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?
Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned. And I still wasn't able to test :workers, since
it looks like the official libzstd for Windows isn't built for
multithreading. That'll be another day's project.
--Jacob
Hi,
This'll need another rebase over the meson ICU changes.
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com>
wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?
A meson wrap made this much easier! It looks like pg_dump's meson.build
is missing dependencies on zstd (meson couldn't find the headers in the
subproject without them).
Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned.
Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.
And I still wasn't able to test :workers, since
it looks like the official libzstd for Windows isn't built for
multithreading. That'll be another day's project.
The wrapped installation enabled threading too, so I was able to try
with :workers=8. Everything seems to work, but I didn't have a dataset
that showed speed improvements at the time. It did seem to affect the
overall compressibility negatively -- which makes sense, I think,
assuming each thread is looking at a separate and/or smaller window.
On to code (not a complete review):
if (hasSuffix(fname, ".gz"))
compression_spec.algorithm = PG_COMPRESSION_GZIP;
else
{
bool exists;exists = (stat(path, &st) == 0); /* avoid unused warning if it is not built with compression */ if (exists) compression_spec.algorithm = PG_COMPRESSION_NONE; -#ifdef HAVE_LIBZ - if (!exists) - { - free_keep_errno(fname); - fname = psprintf("%s.gz", path); - exists = (stat(fname, &st) == 0); - - if (exists) - compression_spec.algorithm = PG_COMPRESSION_GZIP; - } -#endif -#ifdef USE_LZ4 - if (!exists) - { - free_keep_errno(fname); - fname = psprintf("%s.lz4", path); - exists = (stat(fname, &st) == 0); - - if (exists) - compression_spec.algorithm = PG_COMPRESSION_LZ4; - } -#endif + else if (check_compressed_file(path, &fname, "gz")) + compression_spec.algorithm = PG_COMPRESSION_GZIP; + else if (check_compressed_file(path, &fname, "lz4")) + compression_spec.algorithm = PG_COMPRESSION_LZ4; + else if (check_compressed_file(path, &fname, "zst")) + compression_spec.algorithm = PG_COMPRESSION_ZSTD; }
This function lost some coherence, I think. Should there be a hasSuffix
check at the top for ".zstd" (and, for that matter, ".lz4")? And the
comment references an unused warning, which is only possible with the
#ifdef blocks that were removed.
I'm a little suspicious of the replacement of supports_compression()
with parse_compress_specification(). For example:
- errmsg = supports_compression(AH->compression_spec); - if (errmsg) + parse_compress_specification(AH->compression_spec.algorithm, + NULL, &compress_spec); + if (compress_spec.parse_error != NULL) { pg_log_warning("archive is compressed, but this installation does not support compression (%s - errmsg); - pg_free(errmsg); + compress_spec.parse_error); + pg_free(compress_spec.parse_error); }
The top-level error here is "does not support compression", but wouldn't
a bad specification option with a supported compression method trip this
path too?
+static void +ZSTD_CCtx_setParam_or_die(ZSTD_CStream *cstream, + ZSTD_cParameter param, int value, char *paramname)
IMO we should avoid stepping on the ZSTD_ namespace with our own
internal function names.
+ if (cs->readF != NULL) + { + zstdcs->dstream = ZSTD_createDStream(); + if (zstdcs->dstream == NULL) + pg_fatal("could not initialize compression library"); + + zstdcs->input.size = ZSTD_DStreamInSize(); + zstdcs->input.src = pg_malloc(zstdcs->input.size); + + zstdcs->output.size = ZSTD_DStreamOutSize(); + zstdcs->output.dst = pg_malloc(zstdcs->output.size + 1); + } + + if (cs->writeF != NULL) + { + zstdcs->cstream = ZstdCStreamParams(cs->compression_spec); + + zstdcs->output.size = ZSTD_CStreamOutSize(); + zstdcs->output.dst = pg_malloc(zstdcs->output.size); + zstdcs->output.pos = 0; + }
This seems to suggest that both cs->readF and cs->writeF could be set,
but in that case, the output buffer gets reallocated.
I was curious about the extra byte allocated in the decompression case.
I see that ReadDataFromArchiveZstd() is null-terminating the buffer
before handing it to ahwrite(), but why does it need to do that?
+static const char * +Zstd_get_error(CompressFileHandle *CFH) +{ + return strerror(errno); +}
Seems like this should be using the zstderror stored in the handle?
In ReadDataFromArchiveZstd():
+ size_t input_allocated_size = ZSTD_DStreamInSize(); + size_t res; + + for (;;) + { + size_t cnt; + + /* + * Read compressed data. Note that readF can resize the buffer; the + * new size is tracked and used for future loops. + */ + input->size = input_allocated_size; + cnt = cs->readF(AH, (char **) unconstify(void **, &input->src), &input->size); + input_allocated_size = input->size; + input->size = cnt;
This is pretty complex for what it's doing. I'm a little worried that we
let the reallocated buffer escape to the caller while losing track of
how big it is. I think that works today, since there's only ever one
call per handle, but any future refactoring that allowed cs->readData()
to be called more than once would subtly break this code.
In ZstdWriteCommon():
+ /* + * Extra paranoia: avoid zero-length chunks, since a zero length chunk + * is the EOF marker in the custom format. This should never happen + * but... + */ + if (output->pos > 0) + cs->writeF(AH, output->dst, output->pos); + + output->pos = 0;
Elsewhere, output->pos is set to zero before compressing, but here we do
it after, which I think leads to subtle differences in the function
preconditions. If that's an intentional difference, can the reason be
called out in a comment?
--Jacob
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?It looks like pg_dump's meson.build is missing dependencies on zstd
(meson couldn't find the headers in the subproject without them).
I saw that this was added for LZ4, but I hadn't added it for zstd since
I didn't run into an issue without it. Could you check that what I've
added works for your case ?
Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned.Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.
My line of reasoning for unix is that pg_dump forks before any calls to
zstd. Nothing zstd does ought to affect the pg_dump layer. But that
doesn't apply to pg_dump under windows. This is an opened question. If
there's no solid answer, I could disable/ignore the option (maybe only
under windows).
On to code (not a complete review):
if (hasSuffix(fname, ".gz"))
compression_spec.algorithm = PG_COMPRESSION_GZIP;
else
{
bool exists;exists = (stat(path, &st) == 0); /* avoid unused warning if it is not built with compression */ if (exists) compression_spec.algorithm = PG_COMPRESSION_NONE; -#ifdef HAVE_LIBZ - if (!exists) - { - free_keep_errno(fname); - fname = psprintf("%s.gz", path); - exists = (stat(fname, &st) == 0); - - if (exists) - compression_spec.algorithm = PG_COMPRESSION_GZIP; - } -#endif -#ifdef USE_LZ4 - if (!exists) - { - free_keep_errno(fname); - fname = psprintf("%s.lz4", path); - exists = (stat(fname, &st) == 0); - - if (exists) - compression_spec.algorithm = PG_COMPRESSION_LZ4; - } -#endif + else if (check_compressed_file(path, &fname, "gz")) + compression_spec.algorithm = PG_COMPRESSION_GZIP; + else if (check_compressed_file(path, &fname, "lz4")) + compression_spec.algorithm = PG_COMPRESSION_LZ4; + else if (check_compressed_file(path, &fname, "zst")) + compression_spec.algorithm = PG_COMPRESSION_ZSTD; }This function lost some coherence, I think. Should there be a hasSuffix
check at the top for ".zstd" (and, for that matter, ".lz4")?
The function is first checking if it was passed a filename which already
has a suffix. And if not, it searches through a list of suffixes,
testing for an existing file with each suffix. The search with stat()
doesn't happen if it has a suffix. I'm having trouble seeing how the
hasSuffix() branch isn't dead code. Another opened question.
I'm a little suspicious of the replacement of supports_compression()
with parse_compress_specification(). For example:- errmsg = supports_compression(AH->compression_spec); - if (errmsg) + parse_compress_specification(AH->compression_spec.algorithm, + NULL, &compress_spec); + if (compress_spec.parse_error != NULL) { pg_log_warning("archive is compressed, but this installation does not support compression (%s - errmsg); - pg_free(errmsg); + compress_spec.parse_error); + pg_free(compress_spec.parse_error); }The top-level error here is "does not support compression", but wouldn't
a bad specification option with a supported compression method trip this
path too?
No - since the 2nd argument is passed as NULL, it just checks whether
the compression is supported. Maybe there ought to be a more
direct/clean way to do it. But up to now evidently nobody needed to do
that.
+static void +ZSTD_CCtx_setParam_or_die(ZSTD_CStream *cstream, + ZSTD_cParameter param, int value, char *paramname)IMO we should avoid stepping on the ZSTD_ namespace with our own
internal function names.
done
+ if (cs->readF != NULL) + + if (cs->writeF != NULL)This seems to suggest that both cs->readF and cs->writeF could be set,
but in that case, the output buffer gets reallocated.
I put back an assertion that exactly one of them was set, since that's
true of how it currently works.
I was curious about the extra byte allocated in the decompression case.
I see that ReadDataFromArchiveZstd() is null-terminating the buffer
before handing it to ahwrite(), but why does it need to do that?
I was trying to figure that out, too. I think the unterminated case
might be for ExecuteSqlCommandBuf(), and that may only (have) been
needed to allow pg_restore to handle ancient/development versions of
pg_dump... It's not currently hit.
https://coverage.postgresql.org/src/bin/pg_dump/pg_backup_db.c.gcov.html#470
I found that the terminator was added for the uncompressed case was
added at e8f69be05 and removed in bf9aa490d.
+Zstd_get_error(CompressFileHandle *CFH)
Seems like this should be using the zstderror stored in the handle?
Yes - I'd already addressed that locally.
In ReadDataFromArchiveZstd():
+ * Read compressed data. Note that readF can resize the buffer; the + * new size is tracked and used for future loops.This is pretty complex for what it's doing. I'm a little worried that we
let the reallocated buffer escape to the caller while losing track of
how big it is. I think that works today, since there's only ever one
call per handle, but any future refactoring that allowed cs->readData()
to be called more than once would subtly break this code.
Note that nothing bad happens if we lose track of how big it is (well,
assuming that readF doesn't *shrink* the buffer).
The previous patch version didn't keep track of its new size, and the only
consequence is that readF() might re-resize it again on a future iteration,
even if it was already sufficiently large.
When I originally wrote it (and up until that patch version), I left
this as an XXX comment about reusing the resized buffer. But it seemed
easy enough to fix so I did.
In ZstdWriteCommon():
Elsewhere, output->pos is set to zero before compressing, but here we do
it after, which I think leads to subtle differences in the function
preconditions. If that's an intentional difference, can the reason be
called out in a comment?
It's not deliberate. I think it had no effect, but changed - thanks.
--
Justin
Attachments:
0001-pg_dump-zstd-compression.patchtext/x-diff; charset=us-asciiDownload
From 605bfb6974503bb71bdc09c9539313ef92d19ff3 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 7 Jan 2023 15:45:06 -0600
Subject: [PATCH 1/3] pg_dump: zstd compression
Previously proposed at: 20201221194924.GI30237@telsasoft.com
---
doc/src/sgml/ref/pg_dump.sgml | 8 +-
src/bin/pg_dump/Makefile | 2 +
src/bin/pg_dump/compress_io.c | 85 ++---
src/bin/pg_dump/compress_zstd.c | 523 ++++++++++++++++++++++++++
src/bin/pg_dump/compress_zstd.h | 25 ++
src/bin/pg_dump/meson.build | 10 +-
src/bin/pg_dump/pg_backup_archiver.c | 28 +-
src/bin/pg_dump/pg_backup_directory.c | 2 +
src/bin/pg_dump/pg_dump.c | 13 -
src/bin/pg_dump/t/002_pg_dump.pl | 79 +++-
src/tools/pginclude/cpluspluscheck | 1 +
11 files changed, 683 insertions(+), 93 deletions(-)
create mode 100644 src/bin/pg_dump/compress_zstd.c
create mode 100644 src/bin/pg_dump/compress_zstd.h
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index e6b003bf104..4652087144f 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -330,8 +330,9 @@ PostgreSQL documentation
machine-readable format that <application>pg_restore</application>
can read. A directory format archive can be manipulated with
standard Unix tools; for example, files in an uncompressed archive
- can be compressed with the <application>gzip</application> or
- <application>lz4</application> tools.
+ can be compressed with the <application>gzip</application>,
+ <application>lz4</application>, or
+ <application>zstd</application> tools.
This format is compressed by default using <literal>gzip</literal>
and also supports parallel dumps.
</para>
@@ -655,7 +656,8 @@ PostgreSQL documentation
<para>
Specify the compression method and/or the compression level to use.
The compression method can be set to <literal>gzip</literal>,
- <literal>lz4</literal>, or <literal>none</literal> for no compression.
+ <literal>lz4</literal>, <literal>zstd</literal>,
+ or <literal>none</literal> for no compression.
A compression detail string can optionally be specified. If the
detail string is an integer, it specifies the compression level.
Otherwise, it should be a comma-separated list of items, each of the
diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile
index eb8f59459a1..24de7593a6a 100644
--- a/src/bin/pg_dump/Makefile
+++ b/src/bin/pg_dump/Makefile
@@ -18,6 +18,7 @@ include $(top_builddir)/src/Makefile.global
export GZIP_PROGRAM=$(GZIP)
export LZ4
+export ZSTD
export with_icu
override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
@@ -29,6 +30,7 @@ OBJS = \
compress_io.o \
compress_lz4.o \
compress_none.o \
+ compress_zstd.o \
dumputils.o \
parallel.o \
pg_backup_archiver.o \
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c
index ce06f1eac9c..21972933d63 100644
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -52,8 +52,8 @@
*
* InitDiscoverCompressFileHandle tries to infer the compression by the
* filename suffix. If the suffix is not yet known then it tries to simply
- * open the file and if it fails, it tries to open the same file with the .gz
- * suffix, and then again with the .lz4 suffix.
+ * open the file and if it fails, it tries to open the same file with
+ * compressed suffixes.
*
* IDENTIFICATION
* src/bin/pg_dump/compress_io.c
@@ -69,6 +69,7 @@
#include "compress_io.h"
#include "compress_lz4.h"
#include "compress_none.h"
+#include "compress_zstd.h"
#include "pg_backup_utils.h"
/*----------------------
@@ -76,36 +77,6 @@
*----------------------
*/
-/*
- * Checks whether a compression algorithm is supported.
- *
- * On success returns NULL, otherwise returns a malloc'ed string which can be
- * used by the caller in an error message.
- */
-char *
-supports_compression(const pg_compress_specification compression_spec)
-{
- const pg_compress_algorithm algorithm = compression_spec.algorithm;
- bool supported = false;
-
- if (algorithm == PG_COMPRESSION_NONE)
- supported = true;
-#ifdef HAVE_LIBZ
- if (algorithm == PG_COMPRESSION_GZIP)
- supported = true;
-#endif
-#ifdef USE_LZ4
- if (algorithm == PG_COMPRESSION_LZ4)
- supported = true;
-#endif
-
- if (!supported)
- return psprintf("this build does not support compression with %s",
- get_compress_algorithm_name(algorithm));
-
- return NULL;
-}
-
/*----------------------
* Compressor API
*----------------------
@@ -130,6 +101,8 @@ AllocateCompressor(const pg_compress_specification compression_spec,
InitCompressorGzip(cs, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressorLZ4(cs, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressorZstd(cs, compression_spec);
return cs;
}
@@ -196,20 +169,30 @@ InitCompressFileHandle(const pg_compress_specification compression_spec)
InitCompressFileHandleGzip(CFH, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressFileHandleLZ4(CFH, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressFileHandleZstd(CFH, compression_spec);
return CFH;
}
+static bool
+check_compressed_file(const char *path, char **fname, char *ext)
+{
+ free_keep_errno(*fname);
+ *fname = psprintf("%s.%s", path, ext);
+ return (access(*fname, F_OK) == 0);
+}
+
/*
* Open a file for reading. 'path' is the file to open, and 'mode' should
* be either "r" or "rb".
*
* If the file at 'path' contains the suffix of a supported compression method,
- * currently this includes ".gz" and ".lz4", then this compression will be used
+ * currently this includes ".gz", ".lz4" and ".zst", then this compression will be used
* throughout. Otherwise the compression will be inferred by iteratively trying
* to open the file at 'path', first as is, then by appending known compression
* suffixes. So if you pass "foo" as 'path', this will open either "foo" or
- * "foo.gz" or "foo.lz4", trying in that order.
+ * "foo.{gz,lz4,zst}", trying in that order.
*
* On failure, return NULL with an error code in errno.
*/
@@ -231,34 +214,14 @@ InitDiscoverCompressFileHandle(const char *path, const char *mode)
compression_spec.algorithm = PG_COMPRESSION_GZIP;
else
{
- bool exists;
-
- exists = (stat(path, &st) == 0);
- /* avoid unused warning if it is not built with compression */
- if (exists)
+ if (stat(path, &st) == 0)
compression_spec.algorithm = PG_COMPRESSION_NONE;
-#ifdef HAVE_LIBZ
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.gz", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_GZIP;
- }
-#endif
-#ifdef USE_LZ4
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.lz4", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_LZ4;
- }
-#endif
+ else if (check_compressed_file(path, &fname, "gz"))
+ compression_spec.algorithm = PG_COMPRESSION_GZIP;
+ else if (check_compressed_file(path, &fname, "lz4"))
+ compression_spec.algorithm = PG_COMPRESSION_LZ4;
+ else if (check_compressed_file(path, &fname, "zst"))
+ compression_spec.algorithm = PG_COMPRESSION_ZSTD;
}
CFH = InitCompressFileHandle(compression_spec);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
new file mode 100644
index 00000000000..dc610a64de5
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -0,0 +1,523 @@
+/*-------------------------------------------------------------------------
+ *
+ * compress_zstd.c
+ * Routines for archivers to write a Zstd compressed data stream.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/bin/pg_dump/compress_zstd.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+#include "pg_backup_utils.h"
+#include "compress_zstd.h"
+
+#ifndef USE_ZSTD
+
+void
+InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+#else
+
+#include <zstd.h>
+
+typedef struct ZstdCompressorState
+{
+ /* This is a normal file to which we read/write compressed data */
+ FILE *fp;
+
+ ZSTD_CStream *cstream;
+ ZSTD_DStream *dstream;
+ ZSTD_outBuffer output;
+ ZSTD_inBuffer input;
+
+ /* pointer to a static string like from strerror(), for Zstd_write() */
+ const char *zstderror;
+} ZstdCompressorState;
+
+static ZSTD_CStream *ZstdCStreamParams(pg_compress_specification compress);
+static void EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs);
+static void WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen);
+static void ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs);
+
+static void
+Zstd_CCtx_setParam_or_die(ZSTD_CStream *cstream,
+ ZSTD_cParameter param, int value, char *paramname)
+{
+ size_t res;
+
+ res = ZSTD_CCtx_setParameter(cstream, param, value);
+ if (ZSTD_isError(res))
+ pg_fatal("could not set compression parameter: \"%s\": %s",
+ paramname, ZSTD_getErrorName(res));
+}
+
+/* Return a compression stream with parameters set per argument */
+static ZSTD_CStream *
+ZstdCStreamParams(pg_compress_specification compress)
+{
+ ZSTD_CStream *cstream;
+
+ cstream = ZSTD_createCStream();
+ if (cstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
+ compress.level, "level");
+
+ if (compress.options & PG_COMPRESSION_OPTION_WORKERS)
+ Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
+ compress.workers, "workers");
+
+ return cstream;
+}
+
+/* Helper function for WriteDataToArchiveZstd and EndCompressorZstd */
+static void
+ZstdWriteCommon(ArchiveHandle *AH, CompressorState *cs, bool flush)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop while there's any input or until flushed */
+ while (input->pos != input->size || flush)
+ {
+ size_t res;
+
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output,
+ input, flush ? ZSTD_e_end : ZSTD_e_continue);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+
+ /*
+ * Extra paranoia: avoid zero-length chunks, since a zero length chunk
+ * is the EOF marker in the custom format. This should never happen
+ * but...
+ */
+ if (output->pos > 0)
+ cs->writeF(AH, output->dst, output->pos);
+
+ if (res == 0)
+ break;
+ }
+}
+
+void
+EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ /* We expect that exactly one of readF/writeF is specified */
+ Assert((cs->readF == NULL) != (cs->writeF == NULL));
+
+ if (cs->readF != NULL)
+ {
+ Assert(zstdcs->cstream == NULL);
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+ else if (cs->writeF != NULL)
+ {
+ Assert(zstdcs->dstream == NULL);
+ ZstdWriteCommon(AH, cs, true);
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ pg_free(zstdcs);
+}
+
+static void
+WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ zstdcs->input.src = data;
+ zstdcs->input.size = dLen;
+ zstdcs->input.pos = 0;
+
+ ZstdWriteCommon(AH, cs, false);
+}
+
+static void
+ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ size_t input_allocated_size = ZSTD_DStreamInSize();
+ size_t res;
+
+ for (;;)
+ {
+ size_t cnt;
+
+ /*
+ * Read compressed data. Note that readF can resize the buffer; the
+ * new size is tracked and used for future loops.
+ */
+ input->size = input_allocated_size;
+ cnt = cs->readF(AH, (char **) unconstify(void **, &input->src), &input->size);
+ input_allocated_size = input->size;
+ input->size = cnt;
+ input->pos = 0;
+
+ if (cnt == 0)
+ break;
+
+ /* Now decompress */
+ while (input->pos < input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ /* then write the decompressed data to the output handle */
+ ((char *) output->dst)[output->pos] = '\0';
+ ahwrite(output->dst, 1, output->pos, AH);
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+ }
+}
+
+/* Public routine that supports Zstd compressed data I/O */
+void
+InitCompressorZstd(CompressorState *cs,
+ const pg_compress_specification compression_spec)
+{
+ ZstdCompressorState *zstdcs;
+
+ cs->readData = ReadDataFromArchiveZstd;
+ cs->writeData = WriteDataToArchiveZstd;
+ cs->end = EndCompressorZstd;
+
+ cs->compression_spec = compression_spec;
+
+ zstdcs = (ZstdCompressorState *) pg_malloc0(sizeof(*zstdcs));
+ cs->private_data = zstdcs;
+
+ /* We expect that exactly one of readF/writeF is specified */
+ Assert((cs->readF == NULL) != (cs->writeF == NULL));
+
+ if (cs->readF != NULL)
+ {
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ zstdcs->input.size = ZSTD_DStreamInSize();
+ zstdcs->input.src = pg_malloc(zstdcs->input.size);
+
+ zstdcs->output.size = ZSTD_DStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size + 1);
+ }
+ else if (cs->writeF != NULL)
+ {
+ zstdcs->cstream = ZstdCStreamParams(cs->compression_spec);
+
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size);
+ zstdcs->output.pos = 0;
+ }
+}
+
+/*
+ * Compressed stream API
+ */
+
+static size_t
+Zstd_read(void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t input_allocated_size = ZSTD_DStreamInSize();
+ size_t res,
+ cnt;
+
+ output->size = size;
+ output->dst = ptr;
+ output->pos = 0;
+
+ for (;;)
+ {
+ Assert(input->pos <= input->size);
+ Assert(input->size <= input_allocated_size);
+
+ /* If the input is completely consumed, start back at the beginning */
+ if (input->pos == input->size)
+ {
+ /* input->size is size produced by "fread" */
+ input->size = 0;
+ /* input->pos is position consumed by decompress */
+ input->pos = 0;
+ }
+
+ /* read compressed data if we must produce more input */
+ if (input->pos == input->size)
+ {
+ cnt = fread(unconstify(void *, input->src), 1, input_allocated_size, zstdcs->fp);
+ input->size = cnt;
+
+ Assert(cnt >= 0);
+ Assert(cnt <= input_allocated_size);
+
+ /* If we have no more input to consume, we're done */
+ if (cnt == 0)
+ break;
+ }
+
+ while (input->pos < input->size)
+ {
+ /* now decompress */
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ if (output->pos == output->size)
+ break; /* No more room for output */
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ if (output->pos == output->size)
+ break; /* We read all the data that fits */
+ }
+
+ return output->pos;
+}
+
+static size_t
+Zstd_write(const void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t res,
+ cnt;
+
+ input->src = ptr;
+ input->size = size;
+ input->pos = 0;
+
+ /* Consume all input, to be flushed later */
+ while (input->pos != input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_continue);
+ if (ZSTD_isError(res))
+ {
+ zstdcs->zstderror = ZSTD_getErrorName(res);
+ return -1;
+ }
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ {
+ zstdcs->zstderror = strerror(errno);
+ return -1;
+ }
+ }
+
+ return size;
+}
+
+static int
+Zstd_getc(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int ret;
+
+ if (CFH->read_func(&ret, 1, CFH) != 1)
+ {
+ if (feof(zstdcs->fp))
+ pg_fatal("could not read from input file: end of file");
+ else
+ pg_fatal("could not read from input file: %m");
+ }
+ return ret;
+}
+
+static char *
+Zstd_gets(char *buf, int len, CompressFileHandle *CFH)
+{
+ int i,
+ res;
+
+ /*
+ * Read one byte at a time until newline or EOF. This is only used to read
+ * the list of LOs, and the I/O is buffered anyway.
+ */
+ for (i = 0; i < len - 1; ++i)
+ {
+ res = CFH->read_func(&buf[i], 1, CFH);
+ if (res != 1)
+ break;
+ if (buf[i] == '\n')
+ {
+ ++i;
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return i > 0 ? buf : NULL;
+}
+
+static int
+Zstd_close(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int result;
+
+ if (zstdcs->cstream)
+ {
+ size_t res,
+ cnt;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop until the compression buffers are fully consumed */
+ for (;;)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_end);
+ if (ZSTD_isError(res))
+ {
+ zstdcs->zstderror = ZSTD_getErrorName(res);
+ return -1;
+ }
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ {
+ zstdcs->zstderror = strerror(errno);
+ return -1;
+ }
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ if (zstdcs->dstream)
+ {
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+
+ result = fclose(zstdcs->fp);
+ pg_free(zstdcs);
+ return result;
+}
+
+static int
+Zstd_eof(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ return feof(zstdcs->fp);
+}
+
+static int
+Zstd_open(const char *path, int fd, const char *mode,
+ CompressFileHandle *CFH)
+{
+ FILE *fp;
+ ZstdCompressorState *zstdcs;
+
+ if (fd >= 0)
+ fp = fdopen(fd, mode);
+ else
+ fp = fopen(path, mode);
+
+ if (fp == NULL)
+ return 1;
+
+ zstdcs = (ZstdCompressorState *) pg_malloc0(sizeof(*zstdcs));
+ CFH->private_data = zstdcs;
+ zstdcs->fp = fp;
+
+ if (mode[0] == 'r')
+ {
+ zstdcs->input.src = pg_malloc0(ZSTD_DStreamInSize());
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ else if (mode[0] == 'w' || mode[0] == 'a')
+ {
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc0(zstdcs->output.size);
+ zstdcs->cstream = ZstdCStreamParams(CFH->compression_spec);
+ if (zstdcs->cstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ else
+ pg_fatal("unhandled mode");
+
+ return 0;
+}
+
+static int
+Zstd_open_write(const char *path, const char *mode, CompressFileHandle *CFH)
+{
+ char fname[MAXPGPATH];
+
+ sprintf(fname, "%s.zst", path);
+ return CFH->open_func(fname, -1, mode, CFH);
+}
+
+static const char *
+Zstd_get_error(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ return zstdcs->zstderror;
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH,
+ const pg_compress_specification compression_spec)
+{
+ CFH->open_func = Zstd_open;
+ CFH->open_write_func = Zstd_open_write;
+ CFH->read_func = Zstd_read;
+ CFH->write_func = Zstd_write;
+ CFH->gets_func = Zstd_gets;
+ CFH->getc_func = Zstd_getc;
+ CFH->close_func = Zstd_close;
+ CFH->eof_func = Zstd_eof;
+ CFH->get_error_func = Zstd_get_error;
+
+ CFH->compression_spec = compression_spec;
+
+ CFH->private_data = NULL;
+}
+
+#endif /* USE_ZSTD */
diff --git a/src/bin/pg_dump/compress_zstd.h b/src/bin/pg_dump/compress_zstd.h
new file mode 100644
index 00000000000..2aaa6b100b1
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.h
@@ -0,0 +1,25 @@
+/*-------------------------------------------------------------------------
+ *
+ * compress_zstd.h
+ * Zstd interface to compress_io.c routines
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/bin/pg_dump/compress_zstd.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef COMPRESS_ZSTD_H
+#define COMPRESS_ZSTD_H
+
+#include "compress_io.h"
+
+extern void InitCompressorZstd(CompressorState *cs,
+ const pg_compress_specification compression_spec);
+extern void InitCompressFileHandleZstd(CompressFileHandle *CFH,
+ const pg_compress_specification compression_spec);
+
+#endif /* COMPRESS_ZSTD_H */
diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build
index ab4c25c7811..2c5006c61f7 100644
--- a/src/bin/pg_dump/meson.build
+++ b/src/bin/pg_dump/meson.build
@@ -5,6 +5,7 @@ pg_dump_common_sources = files(
'compress_io.c',
'compress_lz4.c',
'compress_none.c',
+ 'compress_zstd.c',
'dumputils.c',
'parallel.c',
'pg_backup_archiver.c',
@@ -19,7 +20,7 @@ pg_dump_common_sources = files(
pg_dump_common = static_library('libpgdump_common',
pg_dump_common_sources,
c_pch: pch_postgres_fe_h,
- dependencies: [frontend_code, libpq, lz4, zlib],
+ dependencies: [frontend_code, libpq, lz4, zlib, zstd],
kwargs: internal_lib_args,
)
@@ -39,7 +40,7 @@ endif
pg_dump = executable('pg_dump',
pg_dump_sources,
link_with: [pg_dump_common],
- dependencies: [frontend_code, libpq, zlib],
+ dependencies: [frontend_code, libpq, zlib, zstd],
kwargs: default_bin_args,
)
bin_targets += pg_dump
@@ -58,7 +59,7 @@ endif
pg_dumpall = executable('pg_dumpall',
pg_dumpall_sources,
link_with: [pg_dump_common],
- dependencies: [frontend_code, libpq, zlib],
+ dependencies: [frontend_code, libpq, zlib, zstd],
kwargs: default_bin_args,
)
bin_targets += pg_dumpall
@@ -77,7 +78,7 @@ endif
pg_restore = executable('pg_restore',
pg_restore_sources,
link_with: [pg_dump_common],
- dependencies: [frontend_code, libpq, zlib],
+ dependencies: [frontend_code, libpq, zlib, zstd],
kwargs: default_bin_args,
)
bin_targets += pg_restore
@@ -90,6 +91,7 @@ tests += {
'env': {
'GZIP_PROGRAM': gzip.path(),
'LZ4': program_lz4.found() ? program_lz4.path() : '',
+ 'ZSTD': program_zstd.found() ? program_zstd.path() : '',
'with_icu': icu.found() ? 'yes' : 'no',
},
'tests': [
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 61ebb8fe85d..6fda84d8774 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -388,10 +388,12 @@ RestoreArchive(Archive *AHX)
{
if (te->hadDumper && (te->reqs & REQ_DATA) != 0)
{
- char *errmsg = supports_compression(AH->compression_spec);
- if (errmsg)
+ pg_compress_specification compress_spec;
+ parse_compress_specification(AH->compression_spec.algorithm,
+ NULL, &compress_spec);
+ if (compress_spec.parse_error != NULL)
pg_fatal("cannot restore from compressed archive (%s)",
- errmsg);
+ compress_spec.parse_error);
else
break;
}
@@ -2075,7 +2077,7 @@ _discoverArchiveFormat(ArchiveHandle *AH)
/*
* Check if the specified archive is a directory. If so, check if
- * there's a "toc.dat" (or "toc.dat.{gz,lz4}") file in it.
+ * there's a "toc.dat" (or "toc.dat.{gz,lz4,zst}") file in it.
*/
if (stat(AH->fSpec, &st) == 0 && S_ISDIR(st.st_mode))
{
@@ -2086,10 +2088,17 @@ _discoverArchiveFormat(ArchiveHandle *AH)
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.gz"))
return AH->format;
#endif
+
#ifdef USE_LZ4
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.lz4"))
return AH->format;
#endif
+
+#ifdef USE_ZSTD
+ if (_fileExistsInDirectory(AH->fSpec, "toc.dat.zst"))
+ return AH->format;
+#endif
+
pg_fatal("directory \"%s\" does not appear to be a valid archive (\"toc.dat\" does not exist)",
AH->fSpec);
fh = NULL; /* keep compiler quiet */
@@ -3674,7 +3683,7 @@ WriteHead(ArchiveHandle *AH)
void
ReadHead(ArchiveHandle *AH)
{
- char *errmsg;
+ pg_compress_specification compress_spec;
char vmaj,
vmin,
vrev;
@@ -3745,12 +3754,13 @@ ReadHead(ArchiveHandle *AH)
else
AH->compression_spec.algorithm = PG_COMPRESSION_GZIP;
- errmsg = supports_compression(AH->compression_spec);
- if (errmsg)
+ parse_compress_specification(AH->compression_spec.algorithm,
+ NULL, &compress_spec);
+ if (compress_spec.parse_error != NULL)
{
pg_log_warning("archive is compressed, but this installation does not support compression (%s) -- no data will be available",
- errmsg);
- pg_free(errmsg);
+ compress_spec.parse_error);
+ pg_free(compress_spec.parse_error);
}
if (AH->version >= K_VERS_1_4)
diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c
index 41c2b733e3e..29845340859 100644
--- a/src/bin/pg_dump/pg_backup_directory.c
+++ b/src/bin/pg_dump/pg_backup_directory.c
@@ -785,6 +785,8 @@ _PrepParallelRestore(ArchiveHandle *AH)
strlcat(fname, ".gz", sizeof(fname));
else if (AH->compression_spec.algorithm == PG_COMPRESSION_LZ4)
strlcat(fname, ".lz4", sizeof(fname));
+ else if (AH->compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ strlcat(fname, ".zst", sizeof(fname));
if (stat(fname, &st) == 0)
te->dataLength = st.st_size;
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 2e068c6620e..398a7a7bb3d 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -734,19 +734,6 @@ main(int argc, char **argv)
pg_fatal("invalid compression specification: %s",
error_detail);
- switch (compression_algorithm)
- {
- case PG_COMPRESSION_NONE:
- /* fallthrough */
- case PG_COMPRESSION_GZIP:
- /* fallthrough */
- case PG_COMPRESSION_LZ4:
- break;
- case PG_COMPRESSION_ZSTD:
- pg_fatal("compression with %s is not yet supported", "ZSTD");
- break;
- }
-
/*
* Custom and directory formats are compressed by default with gzip when
* available, not the others.
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 42215f82f7a..74f23ae7f74 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -54,8 +54,9 @@ my $tempdir = PostgreSQL::Test::Utils::tempdir;
# those lines) to validate that part of the process.
my $supports_icu = ($ENV{with_icu} eq 'yes');
-my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
my $supports_gzip = check_pg_config("#define HAVE_LIBZ 1");
+my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
+my $supports_zstd = check_pg_config("#define USE_ZSTD 1");
my %pgdump_runs = (
binary_upgrade => {
@@ -213,6 +214,77 @@ my %pgdump_runs = (
},
},
+ compression_zstd_custom => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=custom',
+ '--compress=zstd', "--file=$tempdir/compression_zstd_custom.dump",
+ 'postgres',
+ ],
+ restore_cmd => [
+ 'pg_restore',
+ "--file=$tempdir/compression_zstd_custom.sql",
+ "$tempdir/compression_zstd_custom.dump",
+ ],
+ command_like => {
+ command => [
+ 'pg_restore',
+ '-l', "$tempdir/compression_zstd_custom.dump",
+ ],
+ expected => qr/Compression: zstd/,
+ name => 'data content is zstd compressed'
+ },
+ },
+
+ compression_zstd_dir => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--jobs=2',
+ '--format=directory', '--compress=zstd:1',
+ "--file=$tempdir/compression_zstd_dir", 'postgres',
+ ],
+ # Give coverage for manually compressed blob.toc files during
+ # restore.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-z', '-f', '--rm',
+ "$tempdir/compression_zstd_dir/blobs.toc",
+ "-o", "$tempdir/compression_zstd_dir/blobs.toc.zst",
+ ],
+ },
+ # Verify that data files were compressed
+ glob_patterns => [
+ "$tempdir/compression_zstd_dir/toc.dat",
+ "$tempdir/compression_zstd_dir/*.dat.zst",
+ ],
+ restore_cmd => [
+ 'pg_restore', '--jobs=2',
+ "--file=$tempdir/compression_zstd_dir.sql",
+ "$tempdir/compression_zstd_dir",
+ ],
+ },
+
+ compression_zstd_plain => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=plain', '--compress=zstd',
+ "--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
+ ],
+ # Decompress the generated file to run through the tests.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-d', '-f',
+ "$tempdir/compression_zstd_plain.sql.zst",
+ "-o", "$tempdir/compression_zstd_plain.sql",
+ ],
+ },
+ },
+
clean => {
dump_cmd => [
'pg_dump',
@@ -4648,10 +4720,11 @@ foreach my $run (sort keys %pgdump_runs)
my $test_key = $run;
my $run_db = 'postgres';
- # Skip command-level tests for gzip/lz4 if there is no support for it.
+ # Skip command-level tests for gzip/lz4/zstd if the tool is not supported
if ($pgdump_runs{$run}->{compile_option} &&
(($pgdump_runs{$run}->{compile_option} eq 'gzip' && !$supports_gzip) ||
- ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4)))
+ ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4) ||
+ ($pgdump_runs{$run}->{compile_option} eq 'zstd' && !$supports_zstd)))
{
note "$run: skipped due to no $pgdump_runs{$run}->{compile_option} support";
next;
diff --git a/src/tools/pginclude/cpluspluscheck b/src/tools/pginclude/cpluspluscheck
index 58039934756..10fb51585c9 100755
--- a/src/tools/pginclude/cpluspluscheck
+++ b/src/tools/pginclude/cpluspluscheck
@@ -154,6 +154,7 @@ do
test "$f" = src/bin/pg_dump/compress_io.h && continue
test "$f" = src/bin/pg_dump/compress_lz4.h && continue
test "$f" = src/bin/pg_dump/compress_none.h && continue
+ test "$f" = src/bin/pg_dump/compress_zstd.h && continue
test "$f" = src/bin/pg_dump/parallel.h && continue
test "$f" = src/bin/pg_dump/pg_backup_archiver.h && continue
test "$f" = src/bin/pg_dump/pg_dump.h && continue
--
2.34.1
0002-TMP-pg_dump-use-Zstd-by-default-for-CI-only.patchtext/x-diff; charset=us-asciiDownload
From 12f839a8b1e601849304bff8b3fc12579d6d4ecc Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Wed, 4 Jan 2023 21:21:53 -0600
Subject: [PATCH 2/3] TMP: pg_dump: use Zstd by default, for CI only
//-os-only: warnings
---
src/bin/pg_dump/pg_dump.c | 4 ++--
src/bin/pg_dump/t/002_pg_dump.pl | 14 +++++++-------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 398a7a7bb3d..cca52d2cd0d 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -741,8 +741,8 @@ main(int argc, char **argv)
if ((archiveFormat == archCustom || archiveFormat == archDirectory) &&
!user_compression_defined)
{
-#ifdef HAVE_LIBZ
- parse_compress_specification(PG_COMPRESSION_GZIP, NULL,
+#ifdef USE_ZSTD
+ parse_compress_specification(PG_COMPRESSION_ZSTD, NULL,
&compression_spec);
#else
/* Nothing to do in the default case */
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 74f23ae7f74..87f1d2c692d 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -384,10 +384,10 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_custom_format.dump", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
- name => 'data content is gzip-compressed by default if available',
+ name => 'data content is zstd-compressed by default if available',
},
},
@@ -409,16 +409,16 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_dir_format", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
name => 'data content is gzip-compressed by default',
},
glob_patterns => [
"$tempdir/defaults_dir_format/toc.dat",
"$tempdir/defaults_dir_format/blobs.toc",
- $supports_gzip ?
- "$tempdir/defaults_dir_format/*.dat.gz" :
+ $supports_zstd ?
+ "$tempdir/defaults_dir_format/*.dat.zst" :
"$tempdir/defaults_dir_format/*.dat",
],
},
--
2.34.1
0003-zstd-support-long-distance-mode-in-pg_dump-basebacku.patchtext/x-diff; charset=us-asciiDownload
From e818c386dd511871a65350e2b2fe3951f78e1cd6 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 27 Mar 2022 11:55:01 -0500
Subject: [PATCH 3/3] zstd: support long distance mode in pg_dump/basebackup
First proposed here:
20220327205020.GM28503@telsasoft.com
//-os-only: freebsd
---
doc/src/sgml/protocol.sgml | 10 +++-
doc/src/sgml/ref/pg_basebackup.sgml | 4 +-
doc/src/sgml/ref/pg_dump.sgml | 7 ++-
src/backend/backup/basebackup_zstd.c | 12 ++++
src/bin/pg_basebackup/bbstreamer_zstd.c | 13 +++++
src/bin/pg_basebackup/t/010_pg_basebackup.pl | 9 ++-
src/bin/pg_dump/compress_zstd.c | 5 ++
src/bin/pg_dump/t/002_pg_dump.pl | 3 +-
src/bin/pg_verifybackup/t/008_untar.pl | 8 +++
src/bin/pg_verifybackup/t/010_client_untar.pl | 8 +++
src/common/compression.c | 57 ++++++++++++++++++-
src/include/common/compression.h | 2 +
12 files changed, 130 insertions(+), 8 deletions(-)
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index 73b7f4432f3..05a887cd092 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -2747,7 +2747,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
level. Otherwise, it should be a comma-separated list of items,
each of the form <replaceable>keyword</replaceable> or
<replaceable>keyword=value</replaceable>. Currently, the supported
- keywords are <literal>level</literal> and <literal>workers</literal>.
+ keywords are <literal>level</literal>, <literal>long</literal> and
+ <literal>workers</literal>.
</para>
<para>
@@ -2764,6 +2765,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
<literal>3</literal>).
</para>
+ <para>
+ The <literal>long</literal> keyword enables long-distance matching
+ mode, for improved compression ratio, at the expense of higher memory
+ use. Long-distance mode is supported only for
+ <literal>zstd</literal>.
+ </para>
+
<para>
The <literal>workers</literal> keyword sets the number of threads
that should be used for parallel compression. Parallel compression
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index db3ad9cd5eb..79d3e657c32 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -424,8 +424,8 @@ PostgreSQL documentation
level. Otherwise, it should be a comma-separated list of items,
each of the form <literal>keyword</literal> or
<literal>keyword=value</literal>.
- Currently, the supported keywords are <literal>level</literal>
- and <literal>workers</literal>.
+ Currently, the supported keywords are <literal>level</literal>,
+ <literal>long</literal>, and <literal>workers</literal>.
The detail string cannot be used when the compression method
is specified as a plain integer.
</para>
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 4652087144f..9379b98e7f9 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -678,8 +678,11 @@ PostgreSQL documentation
individual table-data segments, and the default is to compress using
<literal>gzip</literal> at a moderate level. For plain text output,
setting a nonzero compression level causes the entire output file to be compressed,
- as though it had been fed through <application>gzip</application> or
- <application>lz4</application>; but the default is not to compress.
+ as though it had been fed through <application>gzip</application>, or
+ <application>lz4</application>; or <application>zstd</application>,
+ but the default is not to compress.
+ With zstd compression, <literal>long</literal> mode may improve the
+ compression ratio, at the cost of increased memory use.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/backend/backup/basebackup_zstd.c b/src/backend/backup/basebackup_zstd.c
index ac6cac178a0..1bb5820c884 100644
--- a/src/backend/backup/basebackup_zstd.c
+++ b/src/backend/backup/basebackup_zstd.c
@@ -118,6 +118,18 @@ bbsink_zstd_begin_backup(bbsink *sink)
compress->workers, ZSTD_getErrorName(ret)));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(mysink->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret)));
+ }
+
/*
* We need our own buffer, because we're going to pass different data to
* the next sink than what gets passed to us.
diff --git a/src/bin/pg_basebackup/bbstreamer_zstd.c b/src/bin/pg_basebackup/bbstreamer_zstd.c
index fe17d6df4ef..fba391e2a0f 100644
--- a/src/bin/pg_basebackup/bbstreamer_zstd.c
+++ b/src/bin/pg_basebackup/bbstreamer_zstd.c
@@ -106,6 +106,19 @@ bbstreamer_zstd_compressor_new(bbstreamer *next, pg_compress_specification *comp
compress->workers, ZSTD_getErrorName(ret));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(streamer->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ {
+ pg_log_error("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret));
+ exit(1);
+ }
+ }
+
/* Initialize the ZSTD output buffer. */
streamer->zstd_outBuf.dst = streamer->base.bbs_buffer.data;
streamer->zstd_outBuf.size = streamer->base.bbs_buffer.maxlen;
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index b60cb78a0d5..4d130a7f944 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -139,7 +139,14 @@ SKIP:
'gzip:workers=3',
'invalid compression specification: compression algorithm "gzip" does not accept a worker count',
'failure on worker count for gzip'
- ],);
+ ],
+ [
+ 'gzip:long',
+ 'invalid compression specification: compression algorithm "gzip" does not support long-distance mode',
+ 'failure on long mode for gzip'
+ ],
+ );
+
for my $cft (@compression_failure_tests)
{
my $cfail = quotemeta($client_fails . $cft->[1]);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index dc610a64de5..ee9513cd555 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -84,6 +84,11 @@ ZstdCStreamParams(pg_compress_specification compress)
Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
compress.workers, "workers");
+ if (compress.options & PG_COMPRESSION_OPTION_LONG_DISTANCE)
+ Zstd_CCtx_setParam_or_die(cstream,
+ ZSTD_c_enableLongDistanceMatching,
+ compress.long_distance, "long");
+
return cstream;
}
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 87f1d2c692d..0a635ae9fc3 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -267,11 +267,12 @@ my %pgdump_runs = (
],
},
+ # Exercise long mode for test coverage
compression_zstd_plain => {
test_key => 'compression',
compile_option => 'zstd',
dump_cmd => [
- 'pg_dump', '--format=plain', '--compress=zstd',
+ 'pg_dump', '--format=plain', '--compress=zstd:long',
"--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
],
# Decompress the generated file to run through the tests.
diff --git a/src/bin/pg_verifybackup/t/008_untar.pl b/src/bin/pg_verifybackup/t/008_untar.pl
index 3007bbe8556..05754bc8ec7 100644
--- a/src/bin/pg_verifybackup/t/008_untar.pl
+++ b/src/bin/pg_verifybackup/t/008_untar.pl
@@ -49,6 +49,14 @@ my @test_configuration = (
'decompress_program' => $ENV{'ZSTD'},
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => [ '--compress', 'server-zstd:level=1,long' ],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => ['-d'],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
});
for my $tc (@test_configuration)
diff --git a/src/bin/pg_verifybackup/t/010_client_untar.pl b/src/bin/pg_verifybackup/t/010_client_untar.pl
index f3aa0f59e29..ac51a174d14 100644
--- a/src/bin/pg_verifybackup/t/010_client_untar.pl
+++ b/src/bin/pg_verifybackup/t/010_client_untar.pl
@@ -50,6 +50,14 @@ my @test_configuration = (
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
},
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => ['--compress', 'client-zstd:level=1,long'],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => [ '-d' ],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
{
'compression_method' => 'parallel zstd',
'backup_flags' => [ '--compress', 'client-zstd:workers=3' ],
diff --git a/src/common/compression.c b/src/common/compression.c
index 2d3e56b4d62..713a77c292d 100644
--- a/src/common/compression.c
+++ b/src/common/compression.c
@@ -12,7 +12,7 @@
* Otherwise, a compression specification is a comma-separated list of items,
* each having the form keyword or keyword=value.
*
- * Currently, the only supported keywords are "level" and "workers".
+ * Currently, the supported keywords are "level", "long", and "workers".
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
*
@@ -38,6 +38,8 @@
static int expect_integer_value(char *keyword, char *value,
pg_compress_specification *result);
+static bool expect_boolean_value(char *keyword, char *value,
+ pg_compress_specification *result);
/*
* Look up a compression algorithm by name. Returns true and sets *algorithm
@@ -232,6 +234,11 @@ parse_compress_specification(pg_compress_algorithm algorithm, char *specificatio
result->workers = expect_integer_value(keyword, value, result);
result->options |= PG_COMPRESSION_OPTION_WORKERS;
}
+ else if (strcmp(keyword, "long") == 0)
+ {
+ result->long_distance = expect_boolean_value(keyword, value, result);
+ result->options |= PG_COMPRESSION_OPTION_LONG_DISTANCE;
+ }
else
result->parse_error =
psprintf(_("unrecognized compression option: \"%s\""), keyword);
@@ -289,6 +296,43 @@ expect_integer_value(char *keyword, char *value, pg_compress_specification *resu
return ivalue;
}
+/*
+ * Parse 'value' as an boolean and return the result.
+ *
+ * If parsing fails, set result->parse_error to an appropriate message
+ * and return -1. The caller must check result->parse_error to determine if
+ * the call was successful.
+ *
+ * Valid values are: yes, no, on, off, 1, 0.
+ *
+ * Inspired by ParseVariableBool().
+ */
+static bool
+expect_boolean_value(char *keyword, char *value, pg_compress_specification *result)
+{
+ if (value == NULL)
+ return true;
+
+ if (pg_strcasecmp(value, "yes") == 0)
+ return true;
+ if (pg_strcasecmp(value, "on") == 0)
+ return true;
+ if (pg_strcasecmp(value, "1") == 0)
+ return true;
+
+ if (pg_strcasecmp(value, "no") == 0)
+ return false;
+ if (pg_strcasecmp(value, "off") == 0)
+ return false;
+ if (pg_strcasecmp(value, "0") == 0)
+ return false;
+
+ result->parse_error =
+ psprintf(_("value for compression option \"%s\" must be a boolean"),
+ keyword);
+ return false;
+}
+
/*
* Returns NULL if the compression specification string was syntactically
* valid and semantically sensible. Otherwise, returns an error message.
@@ -354,6 +398,17 @@ validate_compress_specification(pg_compress_specification *spec)
get_compress_algorithm_name(spec->algorithm));
}
+ /*
+ * Of the compression algorithms that we currently support, only zstd
+ * supports long-distance mode.
+ */
+ if ((spec->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0 &&
+ (spec->algorithm != PG_COMPRESSION_ZSTD))
+ {
+ return psprintf(_("compression algorithm \"%s\" does not support long-distance mode"),
+ get_compress_algorithm_name(spec->algorithm));
+ }
+
return NULL;
}
diff --git a/src/include/common/compression.h b/src/include/common/compression.h
index b48c173022e..6cf8cf396a8 100644
--- a/src/include/common/compression.h
+++ b/src/include/common/compression.h
@@ -27,6 +27,7 @@ typedef enum pg_compress_algorithm
} pg_compress_algorithm;
#define PG_COMPRESSION_OPTION_WORKERS (1 << 0)
+#define PG_COMPRESSION_OPTION_LONG_DISTANCE (1 << 1)
typedef struct pg_compress_specification
{
@@ -34,6 +35,7 @@ typedef struct pg_compress_specification
unsigned options; /* OR of PG_COMPRESSION_OPTION constants */
int level;
int workers;
+ int long_distance;
char *parse_error; /* NULL if parsing was OK, else message */
} pg_compress_specification;
--
2.34.1
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?It looks like pg_dump's meson.build is missing dependencies on zstd
(meson couldn't find the headers in the subproject without them).I saw that this was added for LZ4, but I hadn't added it for zstd since
I didn't run into an issue without it. Could you check that what I've
added works for your case ?Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned.Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.My line of reasoning for unix is that pg_dump forks before any calls to
zstd. Nothing zstd does ought to affect the pg_dump layer. But that
doesn't apply to pg_dump under windows. This is an opened question. If
there's no solid answer, I could disable/ignore the option (maybe only
under windows).
I may be missing something, but why would the patch affect this? Why
would it even affect safety of the parallel dump? And I don't see any
changes to the clone stuff ...
On to code (not a complete review):
if (hasSuffix(fname, ".gz"))
compression_spec.algorithm = PG_COMPRESSION_GZIP;
else
{
bool exists;exists = (stat(path, &st) == 0); /* avoid unused warning if it is not built with compression */ if (exists) compression_spec.algorithm = PG_COMPRESSION_NONE; -#ifdef HAVE_LIBZ - if (!exists) - { - free_keep_errno(fname); - fname = psprintf("%s.gz", path); - exists = (stat(fname, &st) == 0); - - if (exists) - compression_spec.algorithm = PG_COMPRESSION_GZIP; - } -#endif -#ifdef USE_LZ4 - if (!exists) - { - free_keep_errno(fname); - fname = psprintf("%s.lz4", path); - exists = (stat(fname, &st) == 0); - - if (exists) - compression_spec.algorithm = PG_COMPRESSION_LZ4; - } -#endif + else if (check_compressed_file(path, &fname, "gz")) + compression_spec.algorithm = PG_COMPRESSION_GZIP; + else if (check_compressed_file(path, &fname, "lz4")) + compression_spec.algorithm = PG_COMPRESSION_LZ4; + else if (check_compressed_file(path, &fname, "zst")) + compression_spec.algorithm = PG_COMPRESSION_ZSTD; }This function lost some coherence, I think. Should there be a hasSuffix
check at the top for ".zstd" (and, for that matter, ".lz4")?
This was discussed in the lz4 thread a couple days, and I think there
should be hasSuffix() cases for lz4/zstd too, not just for .gz.
The function is first checking if it was passed a filename which already
has a suffix. And if not, it searches through a list of suffixes,
testing for an existing file with each suffix. The search with stat()
doesn't happen if it has a suffix. I'm having trouble seeing how the
hasSuffix() branch isn't dead code. Another opened question.
AFAICS it's done this way because of this comment in pg_backup_directory
* ...
* ".gz" suffix is added to the filenames. The TOC files are never
* compressed by pg_dump, however they are accepted with the .gz suffix
* too, in case the user has manually compressed them with 'gzip'.
I haven't tried, but I believe that if you manually compress the
directory, it may hit this branch. And IMO if we support that for gzip,
the other compression methods should do that too for consistency.
In any case, it's a tiny amount of code and I don't feel like ripping
that out when it might break some currently supported use case.
I'm a little suspicious of the replacement of supports_compression()
with parse_compress_specification(). For example:- errmsg = supports_compression(AH->compression_spec); - if (errmsg) + parse_compress_specification(AH->compression_spec.algorithm, + NULL, &compress_spec); + if (compress_spec.parse_error != NULL) { pg_log_warning("archive is compressed, but this installation does not support compression (%s - errmsg); - pg_free(errmsg); + compress_spec.parse_error); + pg_free(compress_spec.parse_error); }The top-level error here is "does not support compression", but wouldn't
a bad specification option with a supported compression method trip this
path too?No - since the 2nd argument is passed as NULL, it just checks whether
the compression is supported. Maybe there ought to be a more
direct/clean way to do it. But up to now evidently nobody needed to do
that.
I don't think the patch can use parse_compress_specification() instead
of replace supports_compression(). The parsing simply determines if the
build has the library, it doesn't say if a particular tool was modified
to support the algorithm. I might build --with-zstd and yet pg_dump does
not support that algorithm yet.
Even after we add zstd to pg_dump, it's quite likely other compression
algorithms may not be supported by pg_dump from day 1.
I haven't looked at / tested the patch yet, but I wonder if you have any
thoughts regarding the size_t / int tweaks. I don't know what types zstd
library uses, how it reports errors etc.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Hi,
On 3/17/23 03:43, Tomas Vondra wrote:
...
I'm a little suspicious of the replacement of supports_compression()
with parse_compress_specification(). For example:- errmsg = supports_compression(AH->compression_spec); - if (errmsg) + parse_compress_specification(AH->compression_spec.algorithm, + NULL, &compress_spec); + if (compress_spec.parse_error != NULL) { pg_log_warning("archive is compressed, but this installation does not support compression (%s - errmsg); - pg_free(errmsg); + compress_spec.parse_error); + pg_free(compress_spec.parse_error); }The top-level error here is "does not support compression", but wouldn't
a bad specification option with a supported compression method trip this
path too?No - since the 2nd argument is passed as NULL, it just checks whether
the compression is supported. Maybe there ought to be a more
direct/clean way to do it. But up to now evidently nobody needed to do
that.I don't think the patch can use parse_compress_specification() instead
of replace supports_compression(). The parsing simply determines if the
build has the library, it doesn't say if a particular tool was modified
to support the algorithm. I might build --with-zstd and yet pg_dump does
not support that algorithm yet.Even after we add zstd to pg_dump, it's quite likely other compression
algorithms may not be supported by pg_dump from day 1.I haven't looked at / tested the patch yet, but I wonder if you have any
thoughts regarding the size_t / int tweaks. I don't know what types zstd
library uses, how it reports errors etc.
Any thoughts regarding my comments on removing supports_compression()?
Also, this patch needs a rebase to adopt it to the API changes from last
week. The sooner the better, considering we're getting fairly close to
the end of the CF and code freeze.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Fri, Mar 17, 2023 at 03:43:31AM +0100, Tomas Vondra wrote:
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?It looks like pg_dump's meson.build is missing dependencies on zstd
(meson couldn't find the headers in the subproject without them).I saw that this was added for LZ4, but I hadn't added it for zstd since
I didn't run into an issue without it. Could you check that what I've
added works for your case ?Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned.Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.My line of reasoning for unix is that pg_dump forks before any calls to
zstd. Nothing zstd does ought to affect the pg_dump layer. But that
doesn't apply to pg_dump under windows. This is an opened question. If
there's no solid answer, I could disable/ignore the option (maybe only
under windows).I may be missing something, but why would the patch affect this? Why
would it even affect safety of the parallel dump? And I don't see any
changes to the clone stuff ...
zstd supports using threads during compression, with -Z zstd:workers=N.
When unix forks, the child processes can't do anything to mess up the
state of the parent processes.
But windows pg_dump uses threads instead of forking, so it seems
possible that the pg_dump -j threads that then spawn zstd threads could
"leak threads" and break the main thread. I suspect there's no issue,
but we still ought to verify that before declaring it safe.
The function is first checking if it was passed a filename which already
has a suffix. And if not, it searches through a list of suffixes,
testing for an existing file with each suffix. The search with stat()
doesn't happen if it has a suffix. I'm having trouble seeing how the
hasSuffix() branch isn't dead code. Another opened question.AFAICS it's done this way because of this comment in pg_backup_directory
* ...
* ".gz" suffix is added to the filenames. The TOC files are never
* compressed by pg_dump, however they are accepted with the .gz suffix
* too, in case the user has manually compressed them with 'gzip'.I haven't tried, but I believe that if you manually compress the
directory, it may hit this branch.
That would make sense, but when I tried, it didn't work like that.
The filenames were all uncompressed names. Maybe it worked differently
in an older release. Or maybe it changed during development of the
parallel-directory-dump patch and it's actually dead code.
This is rebased over the updated compression API.
It seems like I misunderstood something you said before, so now I put
back "supports_compression()".
--
Justin
Attachments:
0001-pg_dump-zstd-compression.patchtext/x-diff; charset=us-asciiDownload
From b4cf9df2f8672b012301d58ff83f2b0344de9e96 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 7 Jan 2023 15:45:06 -0600
Subject: [PATCH 1/3] pg_dump: zstd compression
Previously proposed at: 20201221194924.GI30237@telsasoft.com
---
doc/src/sgml/ref/pg_dump.sgml | 13 +-
src/bin/pg_dump/Makefile | 2 +
src/bin/pg_dump/compress_io.c | 62 ++-
src/bin/pg_dump/compress_zstd.c | 535 ++++++++++++++++++++++++++
src/bin/pg_dump/compress_zstd.h | 25 ++
src/bin/pg_dump/meson.build | 4 +-
src/bin/pg_dump/pg_backup_archiver.c | 9 +-
src/bin/pg_dump/pg_backup_directory.c | 2 +
src/bin/pg_dump/pg_dump.c | 16 +-
src/bin/pg_dump/t/002_pg_dump.pl | 79 +++-
src/tools/pginclude/cpluspluscheck | 1 +
11 files changed, 694 insertions(+), 54 deletions(-)
create mode 100644 src/bin/pg_dump/compress_zstd.c
create mode 100644 src/bin/pg_dump/compress_zstd.h
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index d6b1faa8042..62b3ed2dad6 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -330,8 +330,9 @@ PostgreSQL documentation
machine-readable format that <application>pg_restore</application>
can read. A directory format archive can be manipulated with
standard Unix tools; for example, files in an uncompressed archive
- can be compressed with the <application>gzip</application> or
- <application>lz4</application> tools.
+ can be compressed with the <application>gzip</application>,
+ <application>lz4</application>, or
+ <application>zstd</application> tools.
This format is compressed by default using <literal>gzip</literal>
and also supports parallel dumps.
</para>
@@ -655,7 +656,8 @@ PostgreSQL documentation
<para>
Specify the compression method and/or the compression level to use.
The compression method can be set to <literal>gzip</literal>,
- <literal>lz4</literal>, or <literal>none</literal> for no compression.
+ <literal>lz4</literal>, <literal>zstd</literal>,
+ or <literal>none</literal> for no compression.
A compression detail string can optionally be specified. If the
detail string is an integer, it specifies the compression level.
Otherwise, it should be a comma-separated list of items, each of the
@@ -676,8 +678,9 @@ PostgreSQL documentation
individual table-data segments, and the default is to compress using
<literal>gzip</literal> at a moderate level. For plain text output,
setting a nonzero compression level causes the entire output file to be compressed,
- as though it had been fed through <application>gzip</application> or
- <application>lz4</application>; but the default is not to compress.
+ as though it had been fed through <application>gzip</application>,
+ <application>lz4</application>, or <application>zstd</application>;
+ but the default is not to compress.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile
index eb8f59459a1..24de7593a6a 100644
--- a/src/bin/pg_dump/Makefile
+++ b/src/bin/pg_dump/Makefile
@@ -18,6 +18,7 @@ include $(top_builddir)/src/Makefile.global
export GZIP_PROGRAM=$(GZIP)
export LZ4
+export ZSTD
export with_icu
override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
@@ -29,6 +30,7 @@ OBJS = \
compress_io.o \
compress_lz4.o \
compress_none.o \
+ compress_zstd.o \
dumputils.o \
parallel.o \
pg_backup_archiver.o \
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c
index 0972a4f934a..b0f59c069c7 100644
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -52,8 +52,8 @@
*
* InitDiscoverCompressFileHandle tries to infer the compression by the
* filename suffix. If the suffix is not yet known then it tries to simply
- * open the file and if it fails, it tries to open the same file with the .gz
- * suffix, and then again with the .lz4 suffix.
+ * open the file and if it fails, it tries to open the same file with
+ * compressed suffixes.
*
* IDENTIFICATION
* src/bin/pg_dump/compress_io.c
@@ -69,6 +69,7 @@
#include "compress_io.h"
#include "compress_lz4.h"
#include "compress_none.h"
+#include "compress_zstd.h"
#include "pg_backup_utils.h"
/*----------------------
@@ -77,7 +78,8 @@
*/
/*
- * Checks whether a compression algorithm is supported.
+ * Checks whether support for a compression algorithm is implemented in
+ * pg_dump/restore.
*
* On success returns NULL, otherwise returns a malloc'ed string which can be
* used by the caller in an error message.
@@ -98,6 +100,10 @@ supports_compression(const pg_compress_specification compression_spec)
if (algorithm == PG_COMPRESSION_LZ4)
supported = true;
#endif
+#ifdef USE_ZSTD
+ if (algorithm == PG_COMPRESSION_ZSTD)
+ supported = true;
+#endif
if (!supported)
return psprintf("this build does not support compression with %s",
@@ -130,6 +136,8 @@ AllocateCompressor(const pg_compress_specification compression_spec,
InitCompressorGzip(cs, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressorLZ4(cs, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressorZstd(cs, compression_spec);
return cs;
}
@@ -196,20 +204,30 @@ InitCompressFileHandle(const pg_compress_specification compression_spec)
InitCompressFileHandleGzip(CFH, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressFileHandleLZ4(CFH, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressFileHandleZstd(CFH, compression_spec);
return CFH;
}
+static bool
+check_compressed_file(const char *path, char **fname, char *ext)
+{
+ free_keep_errno(*fname);
+ *fname = psprintf("%s.%s", path, ext);
+ return (access(*fname, F_OK) == 0);
+}
+
/*
* Open a file for reading. 'path' is the file to open, and 'mode' should
* be either "r" or "rb".
*
* If the file at 'path' contains the suffix of a supported compression method,
- * currently this includes ".gz" and ".lz4", then this compression will be used
+ * currently this includes ".gz", ".lz4" and ".zst", then this compression will be used
* throughout. Otherwise the compression will be inferred by iteratively trying
* to open the file at 'path', first as is, then by appending known compression
* suffixes. So if you pass "foo" as 'path', this will open either "foo" or
- * "foo.gz" or "foo.lz4", trying in that order.
+ * "foo.{gz,lz4,zst}", trying in that order.
*
* On failure, return NULL with an error code in errno.
*/
@@ -231,34 +249,14 @@ InitDiscoverCompressFileHandle(const char *path, const char *mode)
compression_spec.algorithm = PG_COMPRESSION_GZIP;
else
{
- bool exists;
-
- exists = (stat(path, &st) == 0);
- /* avoid unused warning if it is not built with compression */
- if (exists)
+ if (stat(path, &st) == 0)
compression_spec.algorithm = PG_COMPRESSION_NONE;
-#ifdef HAVE_LIBZ
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.gz", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_GZIP;
- }
-#endif
-#ifdef USE_LZ4
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.lz4", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_LZ4;
- }
-#endif
+ else if (check_compressed_file(path, &fname, "gz"))
+ compression_spec.algorithm = PG_COMPRESSION_GZIP;
+ else if (check_compressed_file(path, &fname, "lz4"))
+ compression_spec.algorithm = PG_COMPRESSION_LZ4;
+ else if (check_compressed_file(path, &fname, "zst"))
+ compression_spec.algorithm = PG_COMPRESSION_ZSTD;
}
CFH = InitCompressFileHandle(compression_spec);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
new file mode 100644
index 00000000000..944e786320a
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -0,0 +1,535 @@
+/*-------------------------------------------------------------------------
+ *
+ * compress_zstd.c
+ * Routines for archivers to write a Zstd compressed data stream.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/bin/pg_dump/compress_zstd.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+#include "pg_backup_utils.h"
+#include "compress_zstd.h"
+
+#ifndef USE_ZSTD
+
+void
+InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+#else
+
+#include <zstd.h>
+
+typedef struct ZstdCompressorState
+{
+ /* This is a normal file to which we read/write compressed data */
+ FILE *fp;
+
+ ZSTD_CStream *cstream;
+ ZSTD_DStream *dstream;
+ ZSTD_outBuffer output;
+ ZSTD_inBuffer input;
+
+ /* pointer to a static string like from strerror(), for Zstd_write() */
+ const char *zstderror;
+} ZstdCompressorState;
+
+static ZSTD_CStream *_ZstdCStreamParams(pg_compress_specification compress);
+static void EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs);
+static void WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen);
+static void ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs);
+
+static void
+_Zstd_CCtx_setParam_or_die(ZSTD_CStream *cstream,
+ ZSTD_cParameter param, int value, char *paramname)
+{
+ size_t res;
+
+ res = ZSTD_CCtx_setParameter(cstream, param, value);
+ if (ZSTD_isError(res))
+ pg_fatal("could not set compression parameter: \"%s\": %s",
+ paramname, ZSTD_getErrorName(res));
+}
+
+/* Return a compression stream with parameters set per argument */
+static ZSTD_CStream *
+_ZstdCStreamParams(pg_compress_specification compress)
+{
+ ZSTD_CStream *cstream;
+
+ cstream = ZSTD_createCStream();
+ if (cstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ _Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
+ compress.level, "level");
+
+ if (compress.options & PG_COMPRESSION_OPTION_WORKERS)
+ _Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
+ compress.workers, "workers");
+
+ return cstream;
+}
+
+/* Helper function for WriteDataToArchiveZstd and EndCompressorZstd */
+static void
+_ZstdWriteCommon(ArchiveHandle *AH, CompressorState *cs, bool flush)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop while there's any input or until flushed */
+ while (input->pos != input->size || flush)
+ {
+ size_t res;
+
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output,
+ input, flush ? ZSTD_e_end : ZSTD_e_continue);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+
+ /*
+ * Extra paranoia: avoid zero-length chunks, since a zero length chunk
+ * is the EOF marker in the custom format. This should never happen
+ * but...
+ */
+ if (output->pos > 0)
+ cs->writeF(AH, output->dst, output->pos);
+
+ if (res == 0)
+ break;
+ }
+}
+
+static void
+EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ /* We expect that exactly one of readF/writeF is specified */
+ Assert((cs->readF == NULL) != (cs->writeF == NULL));
+
+ if (cs->readF != NULL)
+ {
+ Assert(zstdcs->cstream == NULL);
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+ else if (cs->writeF != NULL)
+ {
+ Assert(zstdcs->dstream == NULL);
+ _ZstdWriteCommon(AH, cs, true);
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ pg_free(zstdcs);
+}
+
+static void
+WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ zstdcs->input.src = data;
+ zstdcs->input.size = dLen;
+ zstdcs->input.pos = 0;
+
+ _ZstdWriteCommon(AH, cs, false);
+}
+
+static void
+ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ size_t input_allocated_size = ZSTD_DStreamInSize();
+ size_t res;
+
+ for (;;)
+ {
+ size_t cnt;
+
+ /*
+ * Read compressed data. Note that readF can resize the buffer; the
+ * new size is tracked and used for future loops.
+ */
+ input->size = input_allocated_size;
+ cnt = cs->readF(AH, (char **) unconstify(void **, &input->src), &input->size);
+
+ /* ensure that readF didn't *shrink* the buffer */
+ Assert(input->size >= input_allocated_size);
+ input_allocated_size = input->size;
+ input->size = cnt;
+ input->pos = 0;
+
+ if (cnt == 0)
+ break;
+
+ /* Now decompress */
+ while (input->pos < input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ /* then write the decompressed data to the output handle */
+ ((char *) output->dst)[output->pos] = '\0';
+ ahwrite(output->dst, 1, output->pos, AH);
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+ }
+}
+
+/* Public routine that supports Zstd compressed data I/O */
+void
+InitCompressorZstd(CompressorState *cs,
+ const pg_compress_specification compression_spec)
+{
+ ZstdCompressorState *zstdcs;
+
+ cs->readData = ReadDataFromArchiveZstd;
+ cs->writeData = WriteDataToArchiveZstd;
+ cs->end = EndCompressorZstd;
+
+ cs->compression_spec = compression_spec;
+
+ zstdcs = (ZstdCompressorState *) pg_malloc0(sizeof(*zstdcs));
+ cs->private_data = zstdcs;
+
+ /* We expect that exactly one of readF/writeF is specified */
+ Assert((cs->readF == NULL) != (cs->writeF == NULL));
+
+ if (cs->readF != NULL)
+ {
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ zstdcs->input.size = ZSTD_DStreamInSize();
+ zstdcs->input.src = pg_malloc(zstdcs->input.size);
+
+ zstdcs->output.size = ZSTD_DStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size + 1);
+ }
+ else if (cs->writeF != NULL)
+ {
+ zstdcs->cstream = _ZstdCStreamParams(cs->compression_spec);
+
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size);
+ zstdcs->output.pos = 0;
+ }
+}
+
+/*
+ * Compressed stream API
+ */
+
+static bool
+Zstd_read(void *ptr, size_t size, size_t *rdsize, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t input_allocated_size = ZSTD_DStreamInSize();
+ size_t res,
+ cnt;
+
+ output->size = size;
+ output->dst = ptr;
+ output->pos = 0;
+
+ for (;;)
+ {
+ Assert(input->pos <= input->size);
+ Assert(input->size <= input_allocated_size);
+
+ /* If the input is completely consumed, start back at the beginning */
+ if (input->pos == input->size)
+ {
+ /* input->size is size produced by "fread" */
+ input->size = 0;
+ /* input->pos is position consumed by decompress */
+ input->pos = 0;
+ }
+
+ /* read compressed data if we must produce more input */
+ if (input->pos == input->size)
+ {
+ cnt = fread(unconstify(void *, input->src), 1, input_allocated_size, zstdcs->fp);
+ input->size = cnt;
+
+ Assert(cnt >= 0);
+ Assert(cnt <= input_allocated_size);
+
+ /* If we have no more input to consume, we're done */
+ if (cnt == 0)
+ break;
+ }
+
+ while (input->pos < input->size)
+ {
+ /* now decompress */
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ if (output->pos == output->size)
+ break; /* No more room for output */
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ if (output->pos == output->size)
+ break; /* We read all the data that fits */
+ }
+
+ if (rdsize != NULL)
+ *rdsize = output->pos;
+
+ return true;
+}
+
+static bool
+Zstd_write(const void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t res,
+ cnt;
+
+ input->src = ptr;
+ input->size = size;
+ input->pos = 0;
+
+ /* Consume all input, to be flushed later */
+ while (input->pos != input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_continue);
+ if (ZSTD_isError(res))
+ {
+ zstdcs->zstderror = ZSTD_getErrorName(res);
+ return false;
+ }
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ {
+ zstdcs->zstderror = strerror(errno);
+ return false;
+ }
+ }
+
+ return size;
+}
+
+static int
+Zstd_getc(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int ret;
+
+ if (CFH->read_func(&ret, 1, NULL, CFH) != 1)
+ {
+ if (feof(zstdcs->fp))
+ pg_fatal("could not read from input file: end of file");
+ else
+ pg_fatal("could not read from input file: %m");
+ }
+ return ret;
+}
+
+static char *
+Zstd_gets(char *buf, int len, CompressFileHandle *CFH)
+{
+ int i;
+
+ Assert(len > 0);
+
+ /*
+ * Read one byte at a time until newline or EOF. This is only used to read
+ * the list of LOs, and the I/O is buffered anyway.
+ */
+ for (i = 0; i < len - 1; ++i)
+ {
+ size_t readsz;
+
+ if (!CFH->read_func(&buf[i], 1, &readsz, CFH))
+ break;
+ if (readsz != 1)
+ break;
+ if (buf[i] == '\n')
+ {
+ ++i;
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return i > 0 ? buf : NULL;
+}
+
+static bool
+Zstd_close(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ if (zstdcs->cstream)
+ {
+ size_t res,
+ cnt;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop until the compression buffers are fully consumed */
+ for (;;)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_end);
+ if (ZSTD_isError(res))
+ {
+ zstdcs->zstderror = ZSTD_getErrorName(res);
+ return false;
+ }
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ {
+ zstdcs->zstderror = strerror(errno);
+ return false;
+ }
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ if (zstdcs->dstream)
+ {
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+
+ if (fclose(zstdcs->fp) != 0)
+ return false;
+
+ pg_free(zstdcs);
+ return true;
+}
+
+static bool
+Zstd_eof(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ return feof(zstdcs->fp);
+}
+
+static bool
+Zstd_open(const char *path, int fd, const char *mode,
+ CompressFileHandle *CFH)
+{
+ FILE *fp;
+ ZstdCompressorState *zstdcs;
+
+ if (fd >= 0)
+ fp = fdopen(fd, mode);
+ else
+ fp = fopen(path, mode);
+
+ if (fp == NULL)
+ return false;
+
+ zstdcs = (ZstdCompressorState *) pg_malloc0(sizeof(*zstdcs));
+ CFH->private_data = zstdcs;
+ zstdcs->fp = fp;
+
+ if (mode[0] == 'r')
+ {
+ zstdcs->input.src = pg_malloc0(ZSTD_DStreamInSize());
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ else if (mode[0] == 'w' || mode[0] == 'a')
+ {
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc0(zstdcs->output.size);
+ zstdcs->cstream = _ZstdCStreamParams(CFH->compression_spec);
+ if (zstdcs->cstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ else
+ pg_fatal("unhandled mode");
+
+ return true;
+}
+
+static bool
+Zstd_open_write(const char *path, const char *mode, CompressFileHandle *CFH)
+{
+ char fname[MAXPGPATH];
+
+ sprintf(fname, "%s.zst", path);
+ return CFH->open_func(fname, -1, mode, CFH);
+}
+
+static const char *
+Zstd_get_error(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ return zstdcs->zstderror;
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH,
+ const pg_compress_specification compression_spec)
+{
+ CFH->open_func = Zstd_open;
+ CFH->open_write_func = Zstd_open_write;
+ CFH->read_func = Zstd_read;
+ CFH->write_func = Zstd_write;
+ CFH->gets_func = Zstd_gets;
+ CFH->getc_func = Zstd_getc;
+ CFH->close_func = Zstd_close;
+ CFH->eof_func = Zstd_eof;
+ CFH->get_error_func = Zstd_get_error;
+
+ CFH->compression_spec = compression_spec;
+
+ CFH->private_data = NULL;
+}
+
+#endif /* USE_ZSTD */
diff --git a/src/bin/pg_dump/compress_zstd.h b/src/bin/pg_dump/compress_zstd.h
new file mode 100644
index 00000000000..2aaa6b100b1
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.h
@@ -0,0 +1,25 @@
+/*-------------------------------------------------------------------------
+ *
+ * compress_zstd.h
+ * Zstd interface to compress_io.c routines
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/bin/pg_dump/compress_zstd.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef COMPRESS_ZSTD_H
+#define COMPRESS_ZSTD_H
+
+#include "compress_io.h"
+
+extern void InitCompressorZstd(CompressorState *cs,
+ const pg_compress_specification compression_spec);
+extern void InitCompressFileHandleZstd(CompressFileHandle *CFH,
+ const pg_compress_specification compression_spec);
+
+#endif /* COMPRESS_ZSTD_H */
diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build
index b2fb7ac77fd..9d59a106f36 100644
--- a/src/bin/pg_dump/meson.build
+++ b/src/bin/pg_dump/meson.build
@@ -5,6 +5,7 @@ pg_dump_common_sources = files(
'compress_io.c',
'compress_lz4.c',
'compress_none.c',
+ 'compress_zstd.c',
'dumputils.c',
'parallel.c',
'pg_backup_archiver.c',
@@ -19,7 +20,7 @@ pg_dump_common_sources = files(
pg_dump_common = static_library('libpgdump_common',
pg_dump_common_sources,
c_pch: pch_postgres_fe_h,
- dependencies: [frontend_code, libpq, lz4, zlib],
+ dependencies: [frontend_code, libpq, lz4, zlib, zstd],
kwargs: internal_lib_args,
)
@@ -90,6 +91,7 @@ tests += {
'env': {
'GZIP_PROGRAM': gzip.path(),
'LZ4': program_lz4.found() ? program_lz4.path() : '',
+ 'ZSTD': program_zstd.found() ? program_zstd.path() : '',
'with_icu': icu.found() ? 'yes' : 'no',
},
'tests': [
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index ab77e373e91..e8ee6b1ad86 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -2120,7 +2120,7 @@ _discoverArchiveFormat(ArchiveHandle *AH)
/*
* Check if the specified archive is a directory. If so, check if
- * there's a "toc.dat" (or "toc.dat.{gz,lz4}") file in it.
+ * there's a "toc.dat" (or "toc.dat.{gz,lz4,zst}") file in it.
*/
if (stat(AH->fSpec, &st) == 0 && S_ISDIR(st.st_mode))
{
@@ -2131,10 +2131,17 @@ _discoverArchiveFormat(ArchiveHandle *AH)
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.gz"))
return AH->format;
#endif
+
#ifdef USE_LZ4
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.lz4"))
return AH->format;
#endif
+
+#ifdef USE_ZSTD
+ if (_fileExistsInDirectory(AH->fSpec, "toc.dat.zst"))
+ return AH->format;
+#endif
+
pg_fatal("directory \"%s\" does not appear to be a valid archive (\"toc.dat\" does not exist)",
AH->fSpec);
fh = NULL; /* keep compiler quiet */
diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c
index abaaa3b10e3..2177d5ff425 100644
--- a/src/bin/pg_dump/pg_backup_directory.c
+++ b/src/bin/pg_dump/pg_backup_directory.c
@@ -785,6 +785,8 @@ _PrepParallelRestore(ArchiveHandle *AH)
strlcat(fname, ".gz", sizeof(fname));
else if (AH->compression_spec.algorithm == PG_COMPRESSION_LZ4)
strlcat(fname, ".lz4", sizeof(fname));
+ else if (AH->compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ strlcat(fname, ".zst", sizeof(fname));
if (stat(fname, &st) == 0)
te->dataLength = st.st_size;
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index d62780a0880..1844100be8a 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -61,6 +61,7 @@
#include "fe_utils/string_utils.h"
#include "getopt_long.h"
#include "libpq/libpq-fs.h"
+#include "compress_io.h"
#include "parallel.h"
#include "pg_backup_db.h"
#include "pg_backup_utils.h"
@@ -735,18 +736,9 @@ main(int argc, char **argv)
pg_fatal("invalid compression specification: %s",
error_detail);
- switch (compression_algorithm)
- {
- case PG_COMPRESSION_NONE:
- /* fallthrough */
- case PG_COMPRESSION_GZIP:
- /* fallthrough */
- case PG_COMPRESSION_LZ4:
- break;
- case PG_COMPRESSION_ZSTD:
- pg_fatal("compression with %s is not yet supported", "ZSTD");
- break;
- }
+ error_detail = supports_compression(compression_spec);
+ if (error_detail != NULL)
+ pg_fatal("%s", error_detail);
/*
* Custom and directory formats are compressed by default with gzip when
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index a22f27f300f..33435a0a421 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -54,8 +54,9 @@ my $tempdir = PostgreSQL::Test::Utils::tempdir;
# those lines) to validate that part of the process.
my $supports_icu = ($ENV{with_icu} eq 'yes');
-my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
my $supports_gzip = check_pg_config("#define HAVE_LIBZ 1");
+my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
+my $supports_zstd = check_pg_config("#define USE_ZSTD 1");
my %pgdump_runs = (
binary_upgrade => {
@@ -213,6 +214,77 @@ my %pgdump_runs = (
},
},
+ compression_zstd_custom => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=custom',
+ '--compress=zstd', "--file=$tempdir/compression_zstd_custom.dump",
+ 'postgres',
+ ],
+ restore_cmd => [
+ 'pg_restore',
+ "--file=$tempdir/compression_zstd_custom.sql",
+ "$tempdir/compression_zstd_custom.dump",
+ ],
+ command_like => {
+ command => [
+ 'pg_restore',
+ '-l', "$tempdir/compression_zstd_custom.dump",
+ ],
+ expected => qr/Compression: zstd/,
+ name => 'data content is zstd compressed'
+ },
+ },
+
+ compression_zstd_dir => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--jobs=2',
+ '--format=directory', '--compress=zstd:1',
+ "--file=$tempdir/compression_zstd_dir", 'postgres',
+ ],
+ # Give coverage for manually compressed blob.toc files during
+ # restore.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-z', '-f', '--rm',
+ "$tempdir/compression_zstd_dir/blobs.toc",
+ "-o", "$tempdir/compression_zstd_dir/blobs.toc.zst",
+ ],
+ },
+ # Verify that data files were compressed
+ glob_patterns => [
+ "$tempdir/compression_zstd_dir/toc.dat",
+ "$tempdir/compression_zstd_dir/*.dat.zst",
+ ],
+ restore_cmd => [
+ 'pg_restore', '--jobs=2',
+ "--file=$tempdir/compression_zstd_dir.sql",
+ "$tempdir/compression_zstd_dir",
+ ],
+ },
+
+ compression_zstd_plain => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=plain', '--compress=zstd',
+ "--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
+ ],
+ # Decompress the generated file to run through the tests.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-d', '-f',
+ "$tempdir/compression_zstd_plain.sql.zst",
+ "-o", "$tempdir/compression_zstd_plain.sql",
+ ],
+ },
+ },
+
clean => {
dump_cmd => [
'pg_dump',
@@ -4625,10 +4697,11 @@ foreach my $run (sort keys %pgdump_runs)
my $test_key = $run;
my $run_db = 'postgres';
- # Skip command-level tests for gzip/lz4 if there is no support for it.
+ # Skip command-level tests for gzip/lz4/zstd if the tool is not supported
if ($pgdump_runs{$run}->{compile_option} &&
(($pgdump_runs{$run}->{compile_option} eq 'gzip' && !$supports_gzip) ||
- ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4)))
+ ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4) ||
+ ($pgdump_runs{$run}->{compile_option} eq 'zstd' && !$supports_zstd)))
{
note "$run: skipped due to no $pgdump_runs{$run}->{compile_option} support";
next;
diff --git a/src/tools/pginclude/cpluspluscheck b/src/tools/pginclude/cpluspluscheck
index 58039934756..10fb51585c9 100755
--- a/src/tools/pginclude/cpluspluscheck
+++ b/src/tools/pginclude/cpluspluscheck
@@ -154,6 +154,7 @@ do
test "$f" = src/bin/pg_dump/compress_io.h && continue
test "$f" = src/bin/pg_dump/compress_lz4.h && continue
test "$f" = src/bin/pg_dump/compress_none.h && continue
+ test "$f" = src/bin/pg_dump/compress_zstd.h && continue
test "$f" = src/bin/pg_dump/parallel.h && continue
test "$f" = src/bin/pg_dump/pg_backup_archiver.h && continue
test "$f" = src/bin/pg_dump/pg_dump.h && continue
--
2.34.1
0002-TMP-pg_dump-use-Zstd-by-default-for-CI-only.patchtext/x-diff; charset=us-asciiDownload
From c59f35a4557e3f404732d9c1c04621cae9c92217 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Wed, 4 Jan 2023 21:21:53 -0600
Subject: [PATCH 2/3] TMP: pg_dump: use Zstd by default, for CI only
//-os-only: warnings
---
src/bin/pg_dump/pg_dump.c | 4 ++--
src/bin/pg_dump/t/002_pg_dump.pl | 14 +++++++-------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 1844100be8a..fe5c7ec461b 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -747,8 +747,8 @@ main(int argc, char **argv)
if ((archiveFormat == archCustom || archiveFormat == archDirectory) &&
!user_compression_defined)
{
-#ifdef HAVE_LIBZ
- parse_compress_specification(PG_COMPRESSION_GZIP, NULL,
+#ifdef USE_ZSTD
+ parse_compress_specification(PG_COMPRESSION_ZSTD, NULL,
&compression_spec);
#else
/* Nothing to do in the default case */
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 33435a0a421..7a566db6614 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -384,10 +384,10 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_custom_format.dump", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
- name => 'data content is gzip-compressed by default if available',
+ name => 'data content is zstd-compressed by default if available',
},
},
@@ -409,16 +409,16 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_dir_format", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
name => 'data content is gzip-compressed by default',
},
glob_patterns => [
"$tempdir/defaults_dir_format/toc.dat",
"$tempdir/defaults_dir_format/blobs.toc",
- $supports_gzip ?
- "$tempdir/defaults_dir_format/*.dat.gz" :
+ $supports_zstd ?
+ "$tempdir/defaults_dir_format/*.dat.zst" :
"$tempdir/defaults_dir_format/*.dat",
],
},
--
2.34.1
0003-zstd-support-long-distance-mode-in-pg_dump-basebacku.patchtext/x-diff; charset=us-asciiDownload
From 7dd22d93ee24584060c061f7a682f371f1a1627a Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 27 Mar 2022 11:55:01 -0500
Subject: [PATCH 3/3] zstd: support long distance mode in pg_dump/basebackup
First proposed here:
20220327205020.GM28503@telsasoft.com
---
doc/src/sgml/protocol.sgml | 10 +++-
doc/src/sgml/ref/pg_basebackup.sgml | 4 +-
doc/src/sgml/ref/pg_dump.sgml | 2 +
src/backend/backup/basebackup_zstd.c | 12 ++++
src/bin/pg_basebackup/bbstreamer_zstd.c | 13 +++++
src/bin/pg_basebackup/t/010_pg_basebackup.pl | 9 ++-
src/bin/pg_dump/compress_zstd.c | 5 ++
src/bin/pg_dump/t/002_pg_dump.pl | 3 +-
src/bin/pg_verifybackup/t/008_untar.pl | 8 +++
src/bin/pg_verifybackup/t/010_client_untar.pl | 8 +++
src/common/compression.c | 57 ++++++++++++++++++-
src/include/common/compression.h | 2 +
12 files changed, 127 insertions(+), 6 deletions(-)
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index 8b5e7b1ad7f..b11d9a6ba35 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -2729,7 +2729,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
level. Otherwise, it should be a comma-separated list of items,
each of the form <replaceable>keyword</replaceable> or
<replaceable>keyword=value</replaceable>. Currently, the supported
- keywords are <literal>level</literal> and <literal>workers</literal>.
+ keywords are <literal>level</literal>, <literal>long</literal> and
+ <literal>workers</literal>.
</para>
<para>
@@ -2746,6 +2747,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
<literal>3</literal>).
</para>
+ <para>
+ The <literal>long</literal> keyword enables long-distance matching
+ mode, for improved compression ratio, at the expense of higher memory
+ use. Long-distance mode is supported only for
+ <literal>zstd</literal>.
+ </para>
+
<para>
The <literal>workers</literal> keyword sets the number of threads
that should be used for parallel compression. Parallel compression
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index db3ad9cd5eb..79d3e657c32 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -424,8 +424,8 @@ PostgreSQL documentation
level. Otherwise, it should be a comma-separated list of items,
each of the form <literal>keyword</literal> or
<literal>keyword=value</literal>.
- Currently, the supported keywords are <literal>level</literal>
- and <literal>workers</literal>.
+ Currently, the supported keywords are <literal>level</literal>,
+ <literal>long</literal>, and <literal>workers</literal>.
The detail string cannot be used when the compression method
is specified as a plain integer.
</para>
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 62b3ed2dad6..35147acfca6 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -681,6 +681,8 @@ PostgreSQL documentation
as though it had been fed through <application>gzip</application>,
<application>lz4</application>, or <application>zstd</application>;
but the default is not to compress.
+ With zstd compression, <literal>long</literal> mode may improve the
+ compression ratio, at the cost of increased memory use.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/backend/backup/basebackup_zstd.c b/src/backend/backup/basebackup_zstd.c
index ac6cac178a0..1bb5820c884 100644
--- a/src/backend/backup/basebackup_zstd.c
+++ b/src/backend/backup/basebackup_zstd.c
@@ -118,6 +118,18 @@ bbsink_zstd_begin_backup(bbsink *sink)
compress->workers, ZSTD_getErrorName(ret)));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(mysink->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret)));
+ }
+
/*
* We need our own buffer, because we're going to pass different data to
* the next sink than what gets passed to us.
diff --git a/src/bin/pg_basebackup/bbstreamer_zstd.c b/src/bin/pg_basebackup/bbstreamer_zstd.c
index fe17d6df4ef..fba391e2a0f 100644
--- a/src/bin/pg_basebackup/bbstreamer_zstd.c
+++ b/src/bin/pg_basebackup/bbstreamer_zstd.c
@@ -106,6 +106,19 @@ bbstreamer_zstd_compressor_new(bbstreamer *next, pg_compress_specification *comp
compress->workers, ZSTD_getErrorName(ret));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(streamer->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ {
+ pg_log_error("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret));
+ exit(1);
+ }
+ }
+
/* Initialize the ZSTD output buffer. */
streamer->zstd_outBuf.dst = streamer->base.bbs_buffer.data;
streamer->zstd_outBuf.size = streamer->base.bbs_buffer.maxlen;
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index b60cb78a0d5..4d130a7f944 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -139,7 +139,14 @@ SKIP:
'gzip:workers=3',
'invalid compression specification: compression algorithm "gzip" does not accept a worker count',
'failure on worker count for gzip'
- ],);
+ ],
+ [
+ 'gzip:long',
+ 'invalid compression specification: compression algorithm "gzip" does not support long-distance mode',
+ 'failure on long mode for gzip'
+ ],
+ );
+
for my $cft (@compression_failure_tests)
{
my $cfail = quotemeta($client_fails . $cft->[1]);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index 944e786320a..c2cb45a43a1 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -84,6 +84,11 @@ _ZstdCStreamParams(pg_compress_specification compress)
_Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
compress.workers, "workers");
+ if (compress.options & PG_COMPRESSION_OPTION_LONG_DISTANCE)
+ _Zstd_CCtx_setParam_or_die(cstream,
+ ZSTD_c_enableLongDistanceMatching,
+ compress.long_distance, "long");
+
return cstream;
}
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 7a566db6614..77c655c6b44 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -267,11 +267,12 @@ my %pgdump_runs = (
],
},
+ # Exercise long mode for test coverage
compression_zstd_plain => {
test_key => 'compression',
compile_option => 'zstd',
dump_cmd => [
- 'pg_dump', '--format=plain', '--compress=zstd',
+ 'pg_dump', '--format=plain', '--compress=zstd:long',
"--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
],
# Decompress the generated file to run through the tests.
diff --git a/src/bin/pg_verifybackup/t/008_untar.pl b/src/bin/pg_verifybackup/t/008_untar.pl
index 3007bbe8556..05754bc8ec7 100644
--- a/src/bin/pg_verifybackup/t/008_untar.pl
+++ b/src/bin/pg_verifybackup/t/008_untar.pl
@@ -49,6 +49,14 @@ my @test_configuration = (
'decompress_program' => $ENV{'ZSTD'},
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => [ '--compress', 'server-zstd:level=1,long' ],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => ['-d'],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
});
for my $tc (@test_configuration)
diff --git a/src/bin/pg_verifybackup/t/010_client_untar.pl b/src/bin/pg_verifybackup/t/010_client_untar.pl
index f3aa0f59e29..ac51a174d14 100644
--- a/src/bin/pg_verifybackup/t/010_client_untar.pl
+++ b/src/bin/pg_verifybackup/t/010_client_untar.pl
@@ -50,6 +50,14 @@ my @test_configuration = (
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
},
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => ['--compress', 'client-zstd:level=1,long'],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => [ '-d' ],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
{
'compression_method' => 'parallel zstd',
'backup_flags' => [ '--compress', 'client-zstd:workers=3' ],
diff --git a/src/common/compression.c b/src/common/compression.c
index 2d3e56b4d62..713a77c292d 100644
--- a/src/common/compression.c
+++ b/src/common/compression.c
@@ -12,7 +12,7 @@
* Otherwise, a compression specification is a comma-separated list of items,
* each having the form keyword or keyword=value.
*
- * Currently, the only supported keywords are "level" and "workers".
+ * Currently, the supported keywords are "level", "long", and "workers".
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
*
@@ -38,6 +38,8 @@
static int expect_integer_value(char *keyword, char *value,
pg_compress_specification *result);
+static bool expect_boolean_value(char *keyword, char *value,
+ pg_compress_specification *result);
/*
* Look up a compression algorithm by name. Returns true and sets *algorithm
@@ -232,6 +234,11 @@ parse_compress_specification(pg_compress_algorithm algorithm, char *specificatio
result->workers = expect_integer_value(keyword, value, result);
result->options |= PG_COMPRESSION_OPTION_WORKERS;
}
+ else if (strcmp(keyword, "long") == 0)
+ {
+ result->long_distance = expect_boolean_value(keyword, value, result);
+ result->options |= PG_COMPRESSION_OPTION_LONG_DISTANCE;
+ }
else
result->parse_error =
psprintf(_("unrecognized compression option: \"%s\""), keyword);
@@ -289,6 +296,43 @@ expect_integer_value(char *keyword, char *value, pg_compress_specification *resu
return ivalue;
}
+/*
+ * Parse 'value' as an boolean and return the result.
+ *
+ * If parsing fails, set result->parse_error to an appropriate message
+ * and return -1. The caller must check result->parse_error to determine if
+ * the call was successful.
+ *
+ * Valid values are: yes, no, on, off, 1, 0.
+ *
+ * Inspired by ParseVariableBool().
+ */
+static bool
+expect_boolean_value(char *keyword, char *value, pg_compress_specification *result)
+{
+ if (value == NULL)
+ return true;
+
+ if (pg_strcasecmp(value, "yes") == 0)
+ return true;
+ if (pg_strcasecmp(value, "on") == 0)
+ return true;
+ if (pg_strcasecmp(value, "1") == 0)
+ return true;
+
+ if (pg_strcasecmp(value, "no") == 0)
+ return false;
+ if (pg_strcasecmp(value, "off") == 0)
+ return false;
+ if (pg_strcasecmp(value, "0") == 0)
+ return false;
+
+ result->parse_error =
+ psprintf(_("value for compression option \"%s\" must be a boolean"),
+ keyword);
+ return false;
+}
+
/*
* Returns NULL if the compression specification string was syntactically
* valid and semantically sensible. Otherwise, returns an error message.
@@ -354,6 +398,17 @@ validate_compress_specification(pg_compress_specification *spec)
get_compress_algorithm_name(spec->algorithm));
}
+ /*
+ * Of the compression algorithms that we currently support, only zstd
+ * supports long-distance mode.
+ */
+ if ((spec->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0 &&
+ (spec->algorithm != PG_COMPRESSION_ZSTD))
+ {
+ return psprintf(_("compression algorithm \"%s\" does not support long-distance mode"),
+ get_compress_algorithm_name(spec->algorithm));
+ }
+
return NULL;
}
diff --git a/src/include/common/compression.h b/src/include/common/compression.h
index b48c173022e..6cf8cf396a8 100644
--- a/src/include/common/compression.h
+++ b/src/include/common/compression.h
@@ -27,6 +27,7 @@ typedef enum pg_compress_algorithm
} pg_compress_algorithm;
#define PG_COMPRESSION_OPTION_WORKERS (1 << 0)
+#define PG_COMPRESSION_OPTION_LONG_DISTANCE (1 << 1)
typedef struct pg_compress_specification
{
@@ -34,6 +35,7 @@ typedef struct pg_compress_specification
unsigned options; /* OR of PG_COMPRESSION_OPTION constants */
int level;
int workers;
+ int long_distance;
char *parse_error; /* NULL if parsing was OK, else message */
} pg_compress_specification;
--
2.34.1
On 3/27/23 19:28, Justin Pryzby wrote:
On Fri, Mar 17, 2023 at 03:43:31AM +0100, Tomas Vondra wrote:
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?It looks like pg_dump's meson.build is missing dependencies on zstd
(meson couldn't find the headers in the subproject without them).I saw that this was added for LZ4, but I hadn't added it for zstd since
I didn't run into an issue without it. Could you check that what I've
added works for your case ?Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned.Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.My line of reasoning for unix is that pg_dump forks before any calls to
zstd. Nothing zstd does ought to affect the pg_dump layer. But that
doesn't apply to pg_dump under windows. This is an opened question. If
there's no solid answer, I could disable/ignore the option (maybe only
under windows).I may be missing something, but why would the patch affect this? Why
would it even affect safety of the parallel dump? And I don't see any
changes to the clone stuff ...zstd supports using threads during compression, with -Z zstd:workers=N.
When unix forks, the child processes can't do anything to mess up the
state of the parent processes.But windows pg_dump uses threads instead of forking, so it seems
possible that the pg_dump -j threads that then spawn zstd threads could
"leak threads" and break the main thread. I suspect there's no issue,
but we still ought to verify that before declaring it safe.
OK. I don't have access to a Windows machine so I can't test that. Is it
possible to disable the zstd threading, until we figure this out?
The function is first checking if it was passed a filename which already
has a suffix. And if not, it searches through a list of suffixes,
testing for an existing file with each suffix. The search with stat()
doesn't happen if it has a suffix. I'm having trouble seeing how the
hasSuffix() branch isn't dead code. Another opened question.AFAICS it's done this way because of this comment in pg_backup_directory
* ...
* ".gz" suffix is added to the filenames. The TOC files are never
* compressed by pg_dump, however they are accepted with the .gz suffix
* too, in case the user has manually compressed them with 'gzip'.I haven't tried, but I believe that if you manually compress the
directory, it may hit this branch.That would make sense, but when I tried, it didn't work like that.
The filenames were all uncompressed names. Maybe it worked differently
in an older release. Or maybe it changed during development of the
parallel-directory-dump patch and it's actually dead code.
Interesting. Would be good to find out. I wonder if a little bit of
git-log digging could tell us more. Anyway, until we confirm it's dead
code, we should probably do what .gz does and have the same check for
.lz4 and .zst files.
This is rebased over the updated compression API.
It seems like I misunderstood something you said before, so now I put
back "supports_compression()".
Thanks! I need to do a bit more testing and review, but it seems pretty
much RFC to me, assuming we can figure out what to do about threading.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Tue, Mar 28, 2023 at 12:23 PM Tomas Vondra <tomas.vondra@enterprisedb.com>
wrote:
On 3/27/23 19:28, Justin Pryzby wrote:
On Fri, Mar 17, 2023 at 03:43:31AM +0100, Tomas Vondra wrote:
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <
jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
...
OK. I don't have access to a Windows machine so I can't test that. Is it
possible to disable the zstd threading, until we figure this out?Thomas since I appear to be one of the few windows users (I use both), can
I help?
I can test pg_dump... for you, easy to do. I do about 5-10 pg_dumps a day
on windows while developing.
Also, I have an AWS instance I created to build PG/Win with readline back
in November.
I could give you access to that... (you are not the only person who has
made this statement here).
I've made such instances available for other Open Source developers, to
support them.
Obvi I would share connection credentials privately.
Regards, Kirk
On 3/28/23 20:03, Kirk Wolak wrote:
On Tue, Mar 28, 2023 at 12:23 PM Tomas Vondra
<tomas.vondra@enterprisedb.com <mailto:tomas.vondra@enterprisedb.com>>
wrote:On 3/27/23 19:28, Justin Pryzby wrote:
On Fri, Mar 17, 2023 at 03:43:31AM +0100, Tomas Vondra wrote:
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion
<jchampion@timescale.com <mailto:jchampion@timescale.com>> wrote:
I did some smoke testing against zstd's GitHub release on
Windows. To
...
OK. I don't have access to a Windows machine so I can't test that. Is it
possible to disable the zstd threading, until we figure this out?Thomas since I appear to be one of the few windows users (I use both),
can I help?
I can test pg_dump... for you, easy to do. I do about 5-10 pg_dumps a
day on windows while developing.
Perhaps. But I'll leave the details up to Justin - it's his patch, and
I'm not sure how to verify the threading is OK.
I'd try applying this patch, build with --with-zstd and then run the
pg_dump TAP tests, and perhaps do some manual tests.
And perhaps do the same for --with-lz4 - there's a thread [1] suggesting
we don't detect lz4 stuff on Windows, so the TAP tests do nothing.
/messages/by-id/ZAjL96N9ZW84U59p@msg.df7cb.de
Also, I have an AWS instance I created to build PG/Win with readline
back in November.
I could give you access to that... (you are not the only person who has
made this statement here).
I've made such instances available for other Open Source developers, to
support them.Obvi I would share connection credentials privately.
I'd rather leave the Windows stuff up to someone with more experience
with that platform. I have plenty of other stuff on my plate atm.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Wed, Mar 15, 2023 at 9:50 PM Justin Pryzby <pryzby@telsasoft.com> wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
It looks like pg_dump's meson.build is missing dependencies on zstd
(meson couldn't find the headers in the subproject without them).I saw that this was added for LZ4, but I hadn't added it for zstd since
I didn't run into an issue without it. Could you check that what I've
added works for your case ?
I thought I replied to this, sorry -- your newest patchset builds
correctly with subprojects, so the new dependency looks good to me.
Thanks!
Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.My line of reasoning for unix is that pg_dump forks before any calls to
zstd. Nothing zstd does ought to affect the pg_dump layer. But that
doesn't apply to pg_dump under windows. This is an opened question. If
there's no solid answer, I could disable/ignore the option (maybe only
under windows).
To (maybe?) move this forward a bit, note that pg_backup_custom's
_Clone() function makes sure that there is no active compressor state
at the beginning of the new thread. pg_backup_directory's
implementation has no such provision. And I don't think it can,
because the parent thread might have concurrently set one up -- see
the directory-specific implementation of _CloseArchive(). Perhaps we
should just NULL out those fields after the copy, instead?
To illustrate why I think this is tough to characterize: if I've read
the code correctly, the _Clone() and CloneArchive() implementations
are running concurrently with code that is actively modifying the
ArchiveHandle and the lclContext. So safety is only ensured to the
extent that we keep track of which fields threads are allowed to
touch, and I don't have that mental model.
--Jacob
On Tue, Mar 28, 2023 at 02:03:49PM -0400, Kirk Wolak wrote:
On Tue, Mar 28, 2023 at 12:23 PM Tomas Vondra <tomas.vondra@enterprisedb.com> wrote:
On 3/27/23 19:28, Justin Pryzby wrote:
On Fri, Mar 17, 2023 at 03:43:31AM +0100, Tomas Vondra wrote:
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
...
OK. I don't have access to a Windows machine so I can't test that. Is it
possible to disable the zstd threading, until we figure this out?Thomas since I appear to be one of the few windows users (I use both), can I help?
I can test pg_dump... for you, easy to do. I do about 5-10 pg_dumps a day
on windows while developing.
It'd be great if you'd exercise this and other changes to
pg_dump/restore. Tomas just pushed a bugfix, so be sure to "git pull"
before testing, or else you might rediscover the bug.
If you have a zstd library with thread support, you could test with
-Z zstd:workers=3. But I think threads aren't enabled in the common
libzstd packages. Jacob figured out how to compile libzstd easily using
"meson wraps" - but I don't know the details.
--
Justin
On Wed, Mar 29, 2023 at 6:35 AM Justin Pryzby <pryzby@telsasoft.com> wrote:
If you have a zstd library with thread support, you could test with
-Z zstd:workers=3. But I think threads aren't enabled in the common
libzstd packages. Jacob figured out how to compile libzstd easily using
"meson wraps" - but I don't know the details.
From the source root,
$ mkdir subprojects
$ meson wrap install zstd
From then on, Meson was pretty automagical about it during the ninja
build. The subproject's settings are themselves inspectable and
settable via `meson configure`:
$ meson configure -Dzstd:<option>=<value>
--Jacob
On Tue, Mar 28, 2023 at 06:23:26PM +0200, Tomas Vondra wrote:
On 3/27/23 19:28, Justin Pryzby wrote:
On Fri, Mar 17, 2023 at 03:43:31AM +0100, Tomas Vondra wrote:
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?It looks like pg_dump's meson.build is missing dependencies on zstd
(meson couldn't find the headers in the subproject without them).I saw that this was added for LZ4, but I hadn't added it for zstd since
I didn't run into an issue without it. Could you check that what I've
added works for your case ?Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned.Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.My line of reasoning for unix is that pg_dump forks before any calls to
zstd. Nothing zstd does ought to affect the pg_dump layer. But that
doesn't apply to pg_dump under windows. This is an opened question. If
there's no solid answer, I could disable/ignore the option (maybe only
under windows).I may be missing something, but why would the patch affect this? Why
would it even affect safety of the parallel dump? And I don't see any
changes to the clone stuff ...zstd supports using threads during compression, with -Z zstd:workers=N.
When unix forks, the child processes can't do anything to mess up the
state of the parent processes.But windows pg_dump uses threads instead of forking, so it seems
possible that the pg_dump -j threads that then spawn zstd threads could
"leak threads" and break the main thread. I suspect there's no issue,
but we still ought to verify that before declaring it safe.OK. I don't have access to a Windows machine so I can't test that. Is it
possible to disable the zstd threading, until we figure this out?
I think that's what's best. I made it issue a warning if "workers" was
specified. It could also be an error, or just ignored.
I considered disabling workers only for windows, but realized that I
haven't tested with threads myself - my local zstd package is compiled
without threading, and I remember having some issue recompiling it with
threading. Jacob's recipe for using meson wraps works well, but it
still seems better to leave it as a future feature. I used that recipe
to enabled zstd with threading on CI (except for linux/autoconf).
The function is first checking if it was passed a filename which already
has a suffix. And if not, it searches through a list of suffixes,
testing for an existing file with each suffix. The search with stat()
doesn't happen if it has a suffix. I'm having trouble seeing how the
hasSuffix() branch isn't dead code. Another opened question.AFAICS it's done this way because of this comment in pg_backup_directory
* ...
* ".gz" suffix is added to the filenames. The TOC files are never
* compressed by pg_dump, however they are accepted with the .gz suffix
* too, in case the user has manually compressed them with 'gzip'.I haven't tried, but I believe that if you manually compress the
directory, it may hit this branch.That would make sense, but when I tried, it didn't work like that.
The filenames were all uncompressed names. Maybe it worked differently
in an older release. Or maybe it changed during development of the
parallel-directory-dump patch and it's actually dead code.Interesting. Would be good to find out. I wonder if a little bit of
git-log digging could tell us more. Anyway, until we confirm it's dead
code, we should probably do what .gz does and have the same check for
.lz4 and .zst files.
I found that hasSuffix() and cfopen() originated in the refactored patch
Heikki's sent here; there's no history beyond that.
/messages/by-id/4D3954C7.9060503@enterprisedb.com
The patch published there appends the .gz within cfopen(), and the
caller writes into the TOC the filename without ".gz". It seems like
maybe a few hours prior, Heikki may have been appending the ".gz" suffix
in the caller, and then writing the TOC with filename.gz.
The only way I've been able to get a "filename.gz" passed to hasSuffix
is to write a directory-format dump, with LOs, and without compression,
and then compress the blobs with "gzip", and *also* edit the blobs.toc
file to say ".gz" (which isn't necessary since, if the original file
isn't found, the restore would search for files with compressed
suffixes).
So .. it's not *technically* unreachable, but I can't see why it'd be
useful to support editing the *content* of the blob TOC (other than
compressing it). I might give some weight to the idea if it were also
possible to edit the non-blob TOC; but, it's a binary file, so no.
For now, I made the change to make zstd and lz4 to behave the same here
as .gz, unless Heikki has a memory or a git reflog going back far enough
to further support the idea that the code path isn't useful.
I'm going to set the patch as RFC as a hint to anyone who would want to
make a final review.
--
Justin
Attachments:
0001-pg_dump-zstd-compression.patchtext/x-diff; charset=us-asciiDownload
From a4d2f22d98c16e16c718733f30d71dfb0e3adfe2 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 7 Jan 2023 15:45:06 -0600
Subject: [PATCH 1/4] pg_dump: zstd compression
Previously proposed at: 20201221194924.GI30237@telsasoft.com
---
doc/src/sgml/ref/pg_dump.sgml | 13 +-
src/bin/pg_dump/Makefile | 2 +
src/bin/pg_dump/compress_io.c | 66 ++--
src/bin/pg_dump/compress_zstd.c | 530 ++++++++++++++++++++++++++
src/bin/pg_dump/compress_zstd.h | 25 ++
src/bin/pg_dump/meson.build | 4 +-
src/bin/pg_dump/pg_backup_archiver.c | 9 +-
src/bin/pg_dump/pg_backup_directory.c | 2 +
src/bin/pg_dump/pg_dump.c | 20 +-
src/bin/pg_dump/t/002_pg_dump.pl | 79 +++-
src/tools/pginclude/cpluspluscheck | 1 +
11 files changed, 697 insertions(+), 54 deletions(-)
create mode 100644 src/bin/pg_dump/compress_zstd.c
create mode 100644 src/bin/pg_dump/compress_zstd.h
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 77299878e02..8de38e0fd0d 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -330,8 +330,9 @@ PostgreSQL documentation
machine-readable format that <application>pg_restore</application>
can read. A directory format archive can be manipulated with
standard Unix tools; for example, files in an uncompressed archive
- can be compressed with the <application>gzip</application> or
- <application>lz4</application> tools.
+ can be compressed with the <application>gzip</application>,
+ <application>lz4</application>, or
+ <application>zstd</application> tools.
This format is compressed by default using <literal>gzip</literal>
and also supports parallel dumps.
</para>
@@ -655,7 +656,8 @@ PostgreSQL documentation
<para>
Specify the compression method and/or the compression level to use.
The compression method can be set to <literal>gzip</literal>,
- <literal>lz4</literal>, or <literal>none</literal> for no compression.
+ <literal>lz4</literal>, <literal>zstd</literal>,
+ or <literal>none</literal> for no compression.
A compression detail string can optionally be specified. If the
detail string is an integer, it specifies the compression level.
Otherwise, it should be a comma-separated list of items, each of the
@@ -676,8 +678,9 @@ PostgreSQL documentation
individual table-data segments, and the default is to compress using
<literal>gzip</literal> at a moderate level. For plain text output,
setting a nonzero compression level causes the entire output file to be compressed,
- as though it had been fed through <application>gzip</application> or
- <application>lz4</application>; but the default is not to compress.
+ as though it had been fed through <application>gzip</application>,
+ <application>lz4</application>, or <application>zstd</application>;
+ but the default is not to compress.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile
index eb8f59459a1..24de7593a6a 100644
--- a/src/bin/pg_dump/Makefile
+++ b/src/bin/pg_dump/Makefile
@@ -18,6 +18,7 @@ include $(top_builddir)/src/Makefile.global
export GZIP_PROGRAM=$(GZIP)
export LZ4
+export ZSTD
export with_icu
override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
@@ -29,6 +30,7 @@ OBJS = \
compress_io.o \
compress_lz4.o \
compress_none.o \
+ compress_zstd.o \
dumputils.o \
parallel.o \
pg_backup_archiver.o \
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c
index 0972a4f934a..4f06bb024f9 100644
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -52,8 +52,8 @@
*
* InitDiscoverCompressFileHandle tries to infer the compression by the
* filename suffix. If the suffix is not yet known then it tries to simply
- * open the file and if it fails, it tries to open the same file with the .gz
- * suffix, and then again with the .lz4 suffix.
+ * open the file and if it fails, it tries to open the same file with
+ * compressed suffixes.
*
* IDENTIFICATION
* src/bin/pg_dump/compress_io.c
@@ -69,6 +69,7 @@
#include "compress_io.h"
#include "compress_lz4.h"
#include "compress_none.h"
+#include "compress_zstd.h"
#include "pg_backup_utils.h"
/*----------------------
@@ -77,7 +78,8 @@
*/
/*
- * Checks whether a compression algorithm is supported.
+ * Checks whether support for a compression algorithm is implemented in
+ * pg_dump/restore.
*
* On success returns NULL, otherwise returns a malloc'ed string which can be
* used by the caller in an error message.
@@ -98,6 +100,10 @@ supports_compression(const pg_compress_specification compression_spec)
if (algorithm == PG_COMPRESSION_LZ4)
supported = true;
#endif
+#ifdef USE_ZSTD
+ if (algorithm == PG_COMPRESSION_ZSTD)
+ supported = true;
+#endif
if (!supported)
return psprintf("this build does not support compression with %s",
@@ -130,6 +136,8 @@ AllocateCompressor(const pg_compress_specification compression_spec,
InitCompressorGzip(cs, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressorLZ4(cs, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressorZstd(cs, compression_spec);
return cs;
}
@@ -196,20 +204,30 @@ InitCompressFileHandle(const pg_compress_specification compression_spec)
InitCompressFileHandleGzip(CFH, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressFileHandleLZ4(CFH, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressFileHandleZstd(CFH, compression_spec);
return CFH;
}
+static bool
+check_compressed_file(const char *path, char **fname, char *ext)
+{
+ free_keep_errno(*fname);
+ *fname = psprintf("%s.%s", path, ext);
+ return (access(*fname, F_OK) == 0);
+}
+
/*
* Open a file for reading. 'path' is the file to open, and 'mode' should
* be either "r" or "rb".
*
* If the file at 'path' contains the suffix of a supported compression method,
- * currently this includes ".gz" and ".lz4", then this compression will be used
+ * currently this includes ".gz", ".lz4" and ".zst", then this compression will be used
* throughout. Otherwise the compression will be inferred by iteratively trying
* to open the file at 'path', first as is, then by appending known compression
* suffixes. So if you pass "foo" as 'path', this will open either "foo" or
- * "foo.gz" or "foo.lz4", trying in that order.
+ * "foo.{gz,lz4,zst}", trying in that order.
*
* On failure, return NULL with an error code in errno.
*/
@@ -229,36 +247,20 @@ InitDiscoverCompressFileHandle(const char *path, const char *mode)
if (hasSuffix(fname, ".gz"))
compression_spec.algorithm = PG_COMPRESSION_GZIP;
+ else if (hasSuffix(fname, ".lz4"))
+ compression_spec.algorithm = PG_COMPRESSION_LZ4;
+ else if (hasSuffix(fname, ".zst"))
+ compression_spec.algorithm = PG_COMPRESSION_ZSTD;
else
{
- bool exists;
-
- exists = (stat(path, &st) == 0);
- /* avoid unused warning if it is not built with compression */
- if (exists)
+ if (stat(path, &st) == 0)
compression_spec.algorithm = PG_COMPRESSION_NONE;
-#ifdef HAVE_LIBZ
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.gz", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_GZIP;
- }
-#endif
-#ifdef USE_LZ4
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.lz4", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_LZ4;
- }
-#endif
+ else if (check_compressed_file(path, &fname, "gz"))
+ compression_spec.algorithm = PG_COMPRESSION_GZIP;
+ else if (check_compressed_file(path, &fname, "lz4"))
+ compression_spec.algorithm = PG_COMPRESSION_LZ4;
+ else if (check_compressed_file(path, &fname, "zst"))
+ compression_spec.algorithm = PG_COMPRESSION_ZSTD;
}
CFH = InitCompressFileHandle(compression_spec);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
new file mode 100644
index 00000000000..c19d5262943
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -0,0 +1,530 @@
+/*-------------------------------------------------------------------------
+ *
+ * compress_zstd.c
+ * Routines for archivers to write a Zstd compressed data stream.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/bin/pg_dump/compress_zstd.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+#include "pg_backup_utils.h"
+#include "compress_zstd.h"
+
+#ifndef USE_ZSTD
+
+void
+InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+#else
+
+#include <zstd.h>
+
+typedef struct ZstdCompressorState
+{
+ /* This is a normal file to which we read/write compressed data */
+ FILE *fp;
+
+ ZSTD_CStream *cstream;
+ ZSTD_DStream *dstream;
+ ZSTD_outBuffer output;
+ ZSTD_inBuffer input;
+
+ /* pointer to a static string like from strerror(), for Zstd_write() */
+ const char *zstderror;
+} ZstdCompressorState;
+
+static ZSTD_CStream *_ZstdCStreamParams(pg_compress_specification compress);
+static void EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs);
+static void WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen);
+static void ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs);
+
+static void
+_Zstd_CCtx_setParam_or_die(ZSTD_CStream *cstream,
+ ZSTD_cParameter param, int value, char *paramname)
+{
+ size_t res;
+
+ res = ZSTD_CCtx_setParameter(cstream, param, value);
+ if (ZSTD_isError(res))
+ pg_fatal("could not set compression parameter: \"%s\": %s",
+ paramname, ZSTD_getErrorName(res));
+}
+
+/* Return a compression stream with parameters set per argument */
+static ZSTD_CStream *
+_ZstdCStreamParams(pg_compress_specification compress)
+{
+ ZSTD_CStream *cstream;
+
+ cstream = ZSTD_createCStream();
+ if (cstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ _Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
+ compress.level, "level");
+
+ return cstream;
+}
+
+/* Helper function for WriteDataToArchiveZstd and EndCompressorZstd */
+static void
+_ZstdWriteCommon(ArchiveHandle *AH, CompressorState *cs, bool flush)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop while there's any input or until flushed */
+ while (input->pos != input->size || flush)
+ {
+ size_t res;
+
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output,
+ input, flush ? ZSTD_e_end : ZSTD_e_continue);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+
+ /*
+ * Extra paranoia: avoid zero-length chunks, since a zero length chunk
+ * is the EOF marker in the custom format. This should never happen
+ * but...
+ */
+ if (output->pos > 0)
+ cs->writeF(AH, output->dst, output->pos);
+
+ if (res == 0)
+ break;
+ }
+}
+
+static void
+EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ /* We expect that exactly one of readF/writeF is specified */
+ Assert((cs->readF == NULL) != (cs->writeF == NULL));
+
+ if (cs->readF != NULL)
+ {
+ Assert(zstdcs->cstream == NULL);
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+ else if (cs->writeF != NULL)
+ {
+ Assert(zstdcs->dstream == NULL);
+ _ZstdWriteCommon(AH, cs, true);
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ pg_free(zstdcs);
+}
+
+static void
+WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ zstdcs->input.src = data;
+ zstdcs->input.size = dLen;
+ zstdcs->input.pos = 0;
+
+ _ZstdWriteCommon(AH, cs, false);
+}
+
+static void
+ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ size_t input_allocated_size = ZSTD_DStreamInSize();
+ size_t res;
+
+ for (;;)
+ {
+ size_t cnt;
+
+ /*
+ * Read compressed data. Note that readF can resize the buffer; the
+ * new size is tracked and used for future loops.
+ */
+ input->size = input_allocated_size;
+ cnt = cs->readF(AH, (char **) unconstify(void **, &input->src), &input->size);
+
+ /* ensure that readF didn't *shrink* the buffer */
+ Assert(input->size >= input_allocated_size);
+ input_allocated_size = input->size;
+ input->size = cnt;
+ input->pos = 0;
+
+ if (cnt == 0)
+ break;
+
+ /* Now decompress */
+ while (input->pos < input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ /* then write the decompressed data to the output handle */
+ ((char *) output->dst)[output->pos] = '\0';
+ ahwrite(output->dst, 1, output->pos, AH);
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+ }
+}
+
+/* Public routine that supports Zstd compressed data I/O */
+void
+InitCompressorZstd(CompressorState *cs,
+ const pg_compress_specification compression_spec)
+{
+ ZstdCompressorState *zstdcs;
+
+ cs->readData = ReadDataFromArchiveZstd;
+ cs->writeData = WriteDataToArchiveZstd;
+ cs->end = EndCompressorZstd;
+
+ cs->compression_spec = compression_spec;
+
+ zstdcs = (ZstdCompressorState *) pg_malloc0(sizeof(*zstdcs));
+ cs->private_data = zstdcs;
+
+ /* We expect that exactly one of readF/writeF is specified */
+ Assert((cs->readF == NULL) != (cs->writeF == NULL));
+
+ if (cs->readF != NULL)
+ {
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ zstdcs->input.size = ZSTD_DStreamInSize();
+ zstdcs->input.src = pg_malloc(zstdcs->input.size);
+
+ zstdcs->output.size = ZSTD_DStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size + 1);
+ }
+ else if (cs->writeF != NULL)
+ {
+ zstdcs->cstream = _ZstdCStreamParams(cs->compression_spec);
+
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size);
+ zstdcs->output.pos = 0;
+ }
+}
+
+/*
+ * Compressed stream API
+ */
+
+static bool
+Zstd_read(void *ptr, size_t size, size_t *rdsize, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t input_allocated_size = ZSTD_DStreamInSize();
+ size_t res,
+ cnt;
+
+ output->size = size;
+ output->dst = ptr;
+ output->pos = 0;
+
+ for (;;)
+ {
+ Assert(input->pos <= input->size);
+ Assert(input->size <= input_allocated_size);
+
+ /* If the input is completely consumed, start back at the beginning */
+ if (input->pos == input->size)
+ {
+ /* input->size is size produced by "fread" */
+ input->size = 0;
+ /* input->pos is position consumed by decompress */
+ input->pos = 0;
+ }
+
+ /* read compressed data if we must produce more input */
+ if (input->pos == input->size)
+ {
+ cnt = fread(unconstify(void *, input->src), 1, input_allocated_size, zstdcs->fp);
+ input->size = cnt;
+
+ Assert(cnt <= input_allocated_size);
+
+ /* If we have no more input to consume, we're done */
+ if (cnt == 0)
+ break;
+ }
+
+ while (input->pos < input->size)
+ {
+ /* now decompress */
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ if (output->pos == output->size)
+ break; /* No more room for output */
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ if (output->pos == output->size)
+ break; /* We read all the data that fits */
+ }
+
+ if (rdsize != NULL)
+ *rdsize = output->pos;
+
+ return true;
+}
+
+static bool
+Zstd_write(const void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t res,
+ cnt;
+
+ input->src = ptr;
+ input->size = size;
+ input->pos = 0;
+
+ /* Consume all input, to be flushed later */
+ while (input->pos != input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_continue);
+ if (ZSTD_isError(res))
+ {
+ zstdcs->zstderror = ZSTD_getErrorName(res);
+ return false;
+ }
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ {
+ zstdcs->zstderror = strerror(errno);
+ return false;
+ }
+ }
+
+ return size;
+}
+
+static int
+Zstd_getc(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int ret;
+
+ if (CFH->read_func(&ret, 1, NULL, CFH) != 1)
+ {
+ if (feof(zstdcs->fp))
+ pg_fatal("could not read from input file: end of file");
+ else
+ pg_fatal("could not read from input file: %m");
+ }
+ return ret;
+}
+
+static char *
+Zstd_gets(char *buf, int len, CompressFileHandle *CFH)
+{
+ int i;
+
+ Assert(len > 0);
+
+ /*
+ * Read one byte at a time until newline or EOF. This is only used to read
+ * the list of LOs, and the I/O is buffered anyway.
+ */
+ for (i = 0; i < len - 1; ++i)
+ {
+ size_t readsz;
+
+ if (!CFH->read_func(&buf[i], 1, &readsz, CFH))
+ break;
+ if (readsz != 1)
+ break;
+ if (buf[i] == '\n')
+ {
+ ++i;
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return i > 0 ? buf : NULL;
+}
+
+static bool
+Zstd_close(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ if (zstdcs->cstream)
+ {
+ size_t res,
+ cnt;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop until the compression buffers are fully consumed */
+ for (;;)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_end);
+ if (ZSTD_isError(res))
+ {
+ zstdcs->zstderror = ZSTD_getErrorName(res);
+ return false;
+ }
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ {
+ zstdcs->zstderror = strerror(errno);
+ return false;
+ }
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ if (zstdcs->dstream)
+ {
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+
+ if (fclose(zstdcs->fp) != 0)
+ return false;
+
+ pg_free(zstdcs);
+ return true;
+}
+
+static bool
+Zstd_eof(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ return feof(zstdcs->fp);
+}
+
+static bool
+Zstd_open(const char *path, int fd, const char *mode,
+ CompressFileHandle *CFH)
+{
+ FILE *fp;
+ ZstdCompressorState *zstdcs;
+
+ if (fd >= 0)
+ fp = fdopen(fd, mode);
+ else
+ fp = fopen(path, mode);
+
+ if (fp == NULL)
+ return false;
+
+ zstdcs = (ZstdCompressorState *) pg_malloc0(sizeof(*zstdcs));
+ CFH->private_data = zstdcs;
+ zstdcs->fp = fp;
+
+ if (mode[0] == 'r')
+ {
+ zstdcs->input.src = pg_malloc0(ZSTD_DStreamInSize());
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ else if (mode[0] == 'w' || mode[0] == 'a')
+ {
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc0(zstdcs->output.size);
+ zstdcs->cstream = _ZstdCStreamParams(CFH->compression_spec);
+ if (zstdcs->cstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ else
+ pg_fatal("unhandled mode");
+
+ return true;
+}
+
+static bool
+Zstd_open_write(const char *path, const char *mode, CompressFileHandle *CFH)
+{
+ char fname[MAXPGPATH];
+
+ sprintf(fname, "%s.zst", path);
+ return CFH->open_func(fname, -1, mode, CFH);
+}
+
+static const char *
+Zstd_get_error(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ return zstdcs->zstderror;
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH,
+ const pg_compress_specification compression_spec)
+{
+ CFH->open_func = Zstd_open;
+ CFH->open_write_func = Zstd_open_write;
+ CFH->read_func = Zstd_read;
+ CFH->write_func = Zstd_write;
+ CFH->gets_func = Zstd_gets;
+ CFH->getc_func = Zstd_getc;
+ CFH->close_func = Zstd_close;
+ CFH->eof_func = Zstd_eof;
+ CFH->get_error_func = Zstd_get_error;
+
+ CFH->compression_spec = compression_spec;
+
+ CFH->private_data = NULL;
+}
+
+#endif /* USE_ZSTD */
diff --git a/src/bin/pg_dump/compress_zstd.h b/src/bin/pg_dump/compress_zstd.h
new file mode 100644
index 00000000000..2aaa6b100b1
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.h
@@ -0,0 +1,25 @@
+/*-------------------------------------------------------------------------
+ *
+ * compress_zstd.h
+ * Zstd interface to compress_io.c routines
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/bin/pg_dump/compress_zstd.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef COMPRESS_ZSTD_H
+#define COMPRESS_ZSTD_H
+
+#include "compress_io.h"
+
+extern void InitCompressorZstd(CompressorState *cs,
+ const pg_compress_specification compression_spec);
+extern void InitCompressFileHandleZstd(CompressFileHandle *CFH,
+ const pg_compress_specification compression_spec);
+
+#endif /* COMPRESS_ZSTD_H */
diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build
index b2fb7ac77fd..9d59a106f36 100644
--- a/src/bin/pg_dump/meson.build
+++ b/src/bin/pg_dump/meson.build
@@ -5,6 +5,7 @@ pg_dump_common_sources = files(
'compress_io.c',
'compress_lz4.c',
'compress_none.c',
+ 'compress_zstd.c',
'dumputils.c',
'parallel.c',
'pg_backup_archiver.c',
@@ -19,7 +20,7 @@ pg_dump_common_sources = files(
pg_dump_common = static_library('libpgdump_common',
pg_dump_common_sources,
c_pch: pch_postgres_fe_h,
- dependencies: [frontend_code, libpq, lz4, zlib],
+ dependencies: [frontend_code, libpq, lz4, zlib, zstd],
kwargs: internal_lib_args,
)
@@ -90,6 +91,7 @@ tests += {
'env': {
'GZIP_PROGRAM': gzip.path(),
'LZ4': program_lz4.found() ? program_lz4.path() : '',
+ 'ZSTD': program_zstd.found() ? program_zstd.path() : '',
'with_icu': icu.found() ? 'yes' : 'no',
},
'tests': [
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index ab77e373e91..e8ee6b1ad86 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -2120,7 +2120,7 @@ _discoverArchiveFormat(ArchiveHandle *AH)
/*
* Check if the specified archive is a directory. If so, check if
- * there's a "toc.dat" (or "toc.dat.{gz,lz4}") file in it.
+ * there's a "toc.dat" (or "toc.dat.{gz,lz4,zst}") file in it.
*/
if (stat(AH->fSpec, &st) == 0 && S_ISDIR(st.st_mode))
{
@@ -2131,10 +2131,17 @@ _discoverArchiveFormat(ArchiveHandle *AH)
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.gz"))
return AH->format;
#endif
+
#ifdef USE_LZ4
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.lz4"))
return AH->format;
#endif
+
+#ifdef USE_ZSTD
+ if (_fileExistsInDirectory(AH->fSpec, "toc.dat.zst"))
+ return AH->format;
+#endif
+
pg_fatal("directory \"%s\" does not appear to be a valid archive (\"toc.dat\" does not exist)",
AH->fSpec);
fh = NULL; /* keep compiler quiet */
diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c
index abaaa3b10e3..2177d5ff425 100644
--- a/src/bin/pg_dump/pg_backup_directory.c
+++ b/src/bin/pg_dump/pg_backup_directory.c
@@ -785,6 +785,8 @@ _PrepParallelRestore(ArchiveHandle *AH)
strlcat(fname, ".gz", sizeof(fname));
else if (AH->compression_spec.algorithm == PG_COMPRESSION_LZ4)
strlcat(fname, ".lz4", sizeof(fname));
+ else if (AH->compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ strlcat(fname, ".zst", sizeof(fname));
if (stat(fname, &st) == 0)
te->dataLength = st.st_size;
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 6abbcff6834..05833a48460 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -61,6 +61,7 @@
#include "fe_utils/string_utils.h"
#include "getopt_long.h"
#include "libpq/libpq-fs.h"
+#include "compress_io.h"
#include "parallel.h"
#include "pg_backup_db.h"
#include "pg_backup_utils.h"
@@ -735,18 +736,13 @@ main(int argc, char **argv)
pg_fatal("invalid compression specification: %s",
error_detail);
- switch (compression_algorithm)
- {
- case PG_COMPRESSION_NONE:
- /* fallthrough */
- case PG_COMPRESSION_GZIP:
- /* fallthrough */
- case PG_COMPRESSION_LZ4:
- break;
- case PG_COMPRESSION_ZSTD:
- pg_fatal("compression with %s is not yet supported", "ZSTD");
- break;
- }
+ error_detail = supports_compression(compression_spec);
+ if (error_detail != NULL)
+ pg_fatal("%s", error_detail);
+
+ if (compression_spec.options & PG_COMPRESSION_OPTION_WORKERS)
+ pg_log_warning("compression option is not currently supported: \"%s\"",
+ "workers");
/*
* Custom and directory formats are compressed by default with gzip when
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 42215f82f7a..74f23ae7f74 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -54,8 +54,9 @@ my $tempdir = PostgreSQL::Test::Utils::tempdir;
# those lines) to validate that part of the process.
my $supports_icu = ($ENV{with_icu} eq 'yes');
-my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
my $supports_gzip = check_pg_config("#define HAVE_LIBZ 1");
+my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
+my $supports_zstd = check_pg_config("#define USE_ZSTD 1");
my %pgdump_runs = (
binary_upgrade => {
@@ -213,6 +214,77 @@ my %pgdump_runs = (
},
},
+ compression_zstd_custom => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=custom',
+ '--compress=zstd', "--file=$tempdir/compression_zstd_custom.dump",
+ 'postgres',
+ ],
+ restore_cmd => [
+ 'pg_restore',
+ "--file=$tempdir/compression_zstd_custom.sql",
+ "$tempdir/compression_zstd_custom.dump",
+ ],
+ command_like => {
+ command => [
+ 'pg_restore',
+ '-l', "$tempdir/compression_zstd_custom.dump",
+ ],
+ expected => qr/Compression: zstd/,
+ name => 'data content is zstd compressed'
+ },
+ },
+
+ compression_zstd_dir => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--jobs=2',
+ '--format=directory', '--compress=zstd:1',
+ "--file=$tempdir/compression_zstd_dir", 'postgres',
+ ],
+ # Give coverage for manually compressed blob.toc files during
+ # restore.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-z', '-f', '--rm',
+ "$tempdir/compression_zstd_dir/blobs.toc",
+ "-o", "$tempdir/compression_zstd_dir/blobs.toc.zst",
+ ],
+ },
+ # Verify that data files were compressed
+ glob_patterns => [
+ "$tempdir/compression_zstd_dir/toc.dat",
+ "$tempdir/compression_zstd_dir/*.dat.zst",
+ ],
+ restore_cmd => [
+ 'pg_restore', '--jobs=2',
+ "--file=$tempdir/compression_zstd_dir.sql",
+ "$tempdir/compression_zstd_dir",
+ ],
+ },
+
+ compression_zstd_plain => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=plain', '--compress=zstd',
+ "--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
+ ],
+ # Decompress the generated file to run through the tests.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-d', '-f',
+ "$tempdir/compression_zstd_plain.sql.zst",
+ "-o", "$tempdir/compression_zstd_plain.sql",
+ ],
+ },
+ },
+
clean => {
dump_cmd => [
'pg_dump',
@@ -4648,10 +4720,11 @@ foreach my $run (sort keys %pgdump_runs)
my $test_key = $run;
my $run_db = 'postgres';
- # Skip command-level tests for gzip/lz4 if there is no support for it.
+ # Skip command-level tests for gzip/lz4/zstd if the tool is not supported
if ($pgdump_runs{$run}->{compile_option} &&
(($pgdump_runs{$run}->{compile_option} eq 'gzip' && !$supports_gzip) ||
- ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4)))
+ ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4) ||
+ ($pgdump_runs{$run}->{compile_option} eq 'zstd' && !$supports_zstd)))
{
note "$run: skipped due to no $pgdump_runs{$run}->{compile_option} support";
next;
diff --git a/src/tools/pginclude/cpluspluscheck b/src/tools/pginclude/cpluspluscheck
index 58039934756..10fb51585c9 100755
--- a/src/tools/pginclude/cpluspluscheck
+++ b/src/tools/pginclude/cpluspluscheck
@@ -154,6 +154,7 @@ do
test "$f" = src/bin/pg_dump/compress_io.h && continue
test "$f" = src/bin/pg_dump/compress_lz4.h && continue
test "$f" = src/bin/pg_dump/compress_none.h && continue
+ test "$f" = src/bin/pg_dump/compress_zstd.h && continue
test "$f" = src/bin/pg_dump/parallel.h && continue
test "$f" = src/bin/pg_dump/pg_backup_archiver.h && continue
test "$f" = src/bin/pg_dump/pg_dump.h && continue
--
2.34.1
0002-zstd-support-long-distance-mode-in-pg_dump-basebacku.patchtext/x-diff; charset=us-asciiDownload
From cead77201ffc76d9f6ea9b853467d846f1618a8a Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 27 Mar 2022 11:55:01 -0500
Subject: [PATCH 2/4] zstd: support long distance mode in pg_dump/basebackup
First proposed here:
20220327205020.GM28503@telsasoft.com
---
doc/src/sgml/protocol.sgml | 10 +++-
doc/src/sgml/ref/pg_basebackup.sgml | 4 +-
doc/src/sgml/ref/pg_dump.sgml | 2 +
src/backend/backup/basebackup_zstd.c | 12 ++++
src/bin/pg_basebackup/bbstreamer_zstd.c | 13 +++++
src/bin/pg_basebackup/t/010_pg_basebackup.pl | 9 ++-
src/bin/pg_dump/compress_zstd.c | 5 ++
src/bin/pg_dump/t/002_pg_dump.pl | 3 +-
src/bin/pg_verifybackup/t/008_untar.pl | 8 +++
src/bin/pg_verifybackup/t/010_client_untar.pl | 8 +++
src/common/compression.c | 57 ++++++++++++++++++-
src/include/common/compression.h | 2 +
12 files changed, 127 insertions(+), 6 deletions(-)
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index 8b5e7b1ad7f..b11d9a6ba35 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -2729,7 +2729,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
level. Otherwise, it should be a comma-separated list of items,
each of the form <replaceable>keyword</replaceable> or
<replaceable>keyword=value</replaceable>. Currently, the supported
- keywords are <literal>level</literal> and <literal>workers</literal>.
+ keywords are <literal>level</literal>, <literal>long</literal> and
+ <literal>workers</literal>.
</para>
<para>
@@ -2746,6 +2747,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
<literal>3</literal>).
</para>
+ <para>
+ The <literal>long</literal> keyword enables long-distance matching
+ mode, for improved compression ratio, at the expense of higher memory
+ use. Long-distance mode is supported only for
+ <literal>zstd</literal>.
+ </para>
+
<para>
The <literal>workers</literal> keyword sets the number of threads
that should be used for parallel compression. Parallel compression
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index db3ad9cd5eb..79d3e657c32 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -424,8 +424,8 @@ PostgreSQL documentation
level. Otherwise, it should be a comma-separated list of items,
each of the form <literal>keyword</literal> or
<literal>keyword=value</literal>.
- Currently, the supported keywords are <literal>level</literal>
- and <literal>workers</literal>.
+ Currently, the supported keywords are <literal>level</literal>,
+ <literal>long</literal>, and <literal>workers</literal>.
The detail string cannot be used when the compression method
is specified as a plain integer.
</para>
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 8de38e0fd0d..e81e35c13b3 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -681,6 +681,8 @@ PostgreSQL documentation
as though it had been fed through <application>gzip</application>,
<application>lz4</application>, or <application>zstd</application>;
but the default is not to compress.
+ With zstd compression, <literal>long</literal> mode may improve the
+ compression ratio, at the cost of increased memory use.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/backend/backup/basebackup_zstd.c b/src/backend/backup/basebackup_zstd.c
index ac6cac178a0..1bb5820c884 100644
--- a/src/backend/backup/basebackup_zstd.c
+++ b/src/backend/backup/basebackup_zstd.c
@@ -118,6 +118,18 @@ bbsink_zstd_begin_backup(bbsink *sink)
compress->workers, ZSTD_getErrorName(ret)));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(mysink->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret)));
+ }
+
/*
* We need our own buffer, because we're going to pass different data to
* the next sink than what gets passed to us.
diff --git a/src/bin/pg_basebackup/bbstreamer_zstd.c b/src/bin/pg_basebackup/bbstreamer_zstd.c
index fe17d6df4ef..fba391e2a0f 100644
--- a/src/bin/pg_basebackup/bbstreamer_zstd.c
+++ b/src/bin/pg_basebackup/bbstreamer_zstd.c
@@ -106,6 +106,19 @@ bbstreamer_zstd_compressor_new(bbstreamer *next, pg_compress_specification *comp
compress->workers, ZSTD_getErrorName(ret));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(streamer->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ {
+ pg_log_error("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret));
+ exit(1);
+ }
+ }
+
/* Initialize the ZSTD output buffer. */
streamer->zstd_outBuf.dst = streamer->base.bbs_buffer.data;
streamer->zstd_outBuf.size = streamer->base.bbs_buffer.maxlen;
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index b60cb78a0d5..4d130a7f944 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -139,7 +139,14 @@ SKIP:
'gzip:workers=3',
'invalid compression specification: compression algorithm "gzip" does not accept a worker count',
'failure on worker count for gzip'
- ],);
+ ],
+ [
+ 'gzip:long',
+ 'invalid compression specification: compression algorithm "gzip" does not support long-distance mode',
+ 'failure on long mode for gzip'
+ ],
+ );
+
for my $cft (@compression_failure_tests)
{
my $cfail = quotemeta($client_fails . $cft->[1]);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index c19d5262943..c7229ec2922 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -80,6 +80,11 @@ _ZstdCStreamParams(pg_compress_specification compress)
_Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
compress.level, "level");
+ if (compress.options & PG_COMPRESSION_OPTION_LONG_DISTANCE)
+ _Zstd_CCtx_setParam_or_die(cstream,
+ ZSTD_c_enableLongDistanceMatching,
+ compress.long_distance, "long");
+
return cstream;
}
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 74f23ae7f74..bb898b06bb4 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -267,11 +267,12 @@ my %pgdump_runs = (
],
},
+ # Exercise long mode for test coverage
compression_zstd_plain => {
test_key => 'compression',
compile_option => 'zstd',
dump_cmd => [
- 'pg_dump', '--format=plain', '--compress=zstd',
+ 'pg_dump', '--format=plain', '--compress=zstd:long',
"--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
],
# Decompress the generated file to run through the tests.
diff --git a/src/bin/pg_verifybackup/t/008_untar.pl b/src/bin/pg_verifybackup/t/008_untar.pl
index 3007bbe8556..05754bc8ec7 100644
--- a/src/bin/pg_verifybackup/t/008_untar.pl
+++ b/src/bin/pg_verifybackup/t/008_untar.pl
@@ -49,6 +49,14 @@ my @test_configuration = (
'decompress_program' => $ENV{'ZSTD'},
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => [ '--compress', 'server-zstd:level=1,long' ],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => ['-d'],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
});
for my $tc (@test_configuration)
diff --git a/src/bin/pg_verifybackup/t/010_client_untar.pl b/src/bin/pg_verifybackup/t/010_client_untar.pl
index f3aa0f59e29..ac51a174d14 100644
--- a/src/bin/pg_verifybackup/t/010_client_untar.pl
+++ b/src/bin/pg_verifybackup/t/010_client_untar.pl
@@ -50,6 +50,14 @@ my @test_configuration = (
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
},
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => ['--compress', 'client-zstd:level=1,long'],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => [ '-d' ],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
{
'compression_method' => 'parallel zstd',
'backup_flags' => [ '--compress', 'client-zstd:workers=3' ],
diff --git a/src/common/compression.c b/src/common/compression.c
index 2d3e56b4d62..713a77c292d 100644
--- a/src/common/compression.c
+++ b/src/common/compression.c
@@ -12,7 +12,7 @@
* Otherwise, a compression specification is a comma-separated list of items,
* each having the form keyword or keyword=value.
*
- * Currently, the only supported keywords are "level" and "workers".
+ * Currently, the supported keywords are "level", "long", and "workers".
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
*
@@ -38,6 +38,8 @@
static int expect_integer_value(char *keyword, char *value,
pg_compress_specification *result);
+static bool expect_boolean_value(char *keyword, char *value,
+ pg_compress_specification *result);
/*
* Look up a compression algorithm by name. Returns true and sets *algorithm
@@ -232,6 +234,11 @@ parse_compress_specification(pg_compress_algorithm algorithm, char *specificatio
result->workers = expect_integer_value(keyword, value, result);
result->options |= PG_COMPRESSION_OPTION_WORKERS;
}
+ else if (strcmp(keyword, "long") == 0)
+ {
+ result->long_distance = expect_boolean_value(keyword, value, result);
+ result->options |= PG_COMPRESSION_OPTION_LONG_DISTANCE;
+ }
else
result->parse_error =
psprintf(_("unrecognized compression option: \"%s\""), keyword);
@@ -289,6 +296,43 @@ expect_integer_value(char *keyword, char *value, pg_compress_specification *resu
return ivalue;
}
+/*
+ * Parse 'value' as an boolean and return the result.
+ *
+ * If parsing fails, set result->parse_error to an appropriate message
+ * and return -1. The caller must check result->parse_error to determine if
+ * the call was successful.
+ *
+ * Valid values are: yes, no, on, off, 1, 0.
+ *
+ * Inspired by ParseVariableBool().
+ */
+static bool
+expect_boolean_value(char *keyword, char *value, pg_compress_specification *result)
+{
+ if (value == NULL)
+ return true;
+
+ if (pg_strcasecmp(value, "yes") == 0)
+ return true;
+ if (pg_strcasecmp(value, "on") == 0)
+ return true;
+ if (pg_strcasecmp(value, "1") == 0)
+ return true;
+
+ if (pg_strcasecmp(value, "no") == 0)
+ return false;
+ if (pg_strcasecmp(value, "off") == 0)
+ return false;
+ if (pg_strcasecmp(value, "0") == 0)
+ return false;
+
+ result->parse_error =
+ psprintf(_("value for compression option \"%s\" must be a boolean"),
+ keyword);
+ return false;
+}
+
/*
* Returns NULL if the compression specification string was syntactically
* valid and semantically sensible. Otherwise, returns an error message.
@@ -354,6 +398,17 @@ validate_compress_specification(pg_compress_specification *spec)
get_compress_algorithm_name(spec->algorithm));
}
+ /*
+ * Of the compression algorithms that we currently support, only zstd
+ * supports long-distance mode.
+ */
+ if ((spec->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0 &&
+ (spec->algorithm != PG_COMPRESSION_ZSTD))
+ {
+ return psprintf(_("compression algorithm \"%s\" does not support long-distance mode"),
+ get_compress_algorithm_name(spec->algorithm));
+ }
+
return NULL;
}
diff --git a/src/include/common/compression.h b/src/include/common/compression.h
index b48c173022e..6cf8cf396a8 100644
--- a/src/include/common/compression.h
+++ b/src/include/common/compression.h
@@ -27,6 +27,7 @@ typedef enum pg_compress_algorithm
} pg_compress_algorithm;
#define PG_COMPRESSION_OPTION_WORKERS (1 << 0)
+#define PG_COMPRESSION_OPTION_LONG_DISTANCE (1 << 1)
typedef struct pg_compress_specification
{
@@ -34,6 +35,7 @@ typedef struct pg_compress_specification
unsigned options; /* OR of PG_COMPRESSION_OPTION constants */
int level;
int workers;
+ int long_distance;
char *parse_error; /* NULL if parsing was OK, else message */
} pg_compress_specification;
--
2.34.1
0003-WIP-pg_dump-support-zstd-workers.patchtext/x-diff; charset=us-asciiDownload
From 39b023ef6626dcd303aeee8e9c2418f107244f0f Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Thu, 30 Mar 2023 17:48:57 -0500
Subject: [PATCH 3/4] WIP: pg_dump: support zstd workers
This is a separate commit since it's not essential; the zstd library is
frequently compiled without threading support, so the functionality
isn't very well-tested, and because use of zstd threads might
conceivably play poorly with pg_dump's use of threads under Windows.
Targetting postgres v17.
---
doc/src/sgml/ref/pg_dump.sgml | 8 ++++++--
src/bin/pg_dump/compress_zstd.c | 4 ++++
src/bin/pg_dump/pg_dump.c | 4 ----
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index e81e35c13b3..1d55ce05b21 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -681,8 +681,12 @@ PostgreSQL documentation
as though it had been fed through <application>gzip</application>,
<application>lz4</application>, or <application>zstd</application>;
but the default is not to compress.
- With zstd compression, <literal>long</literal> mode may improve the
- compression ratio, at the cost of increased memory use.
+ With zstd compression, <literal>long</literal> and
+ <literal>workers</literal> options may be specified to enable long-distance
+ matching and threaded workers, respectively.
+ Long distance mode may improve the compression ratio, at the cost of
+ increased memory use.
+ Threaded workers allow leveraging multiple CPUs during compression.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index c7229ec2922..1b821f8ecb1 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -85,6 +85,10 @@ _ZstdCStreamParams(pg_compress_specification compress)
ZSTD_c_enableLongDistanceMatching,
compress.long_distance, "long");
+ if (compress.options & PG_COMPRESSION_OPTION_WORKERS)
+ _Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
+ compress.workers, "workers");
+
return cstream;
}
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 05833a48460..c0c165c2940 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -740,10 +740,6 @@ main(int argc, char **argv)
if (error_detail != NULL)
pg_fatal("%s", error_detail);
- if (compression_spec.options & PG_COMPRESSION_OPTION_WORKERS)
- pg_log_warning("compression option is not currently supported: \"%s\"",
- "workers");
-
/*
* Custom and directory formats are compressed by default with gzip when
* available, not the others.
--
2.34.1
0004-TMP-pg_dump-use-Zstd-by-default-for-CI-only.patchtext/x-diff; charset=us-asciiDownload
From a353ce7ecce1804170bbdf8c56313fab92376561 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Wed, 4 Jan 2023 21:21:53 -0600
Subject: [PATCH 4/4] TMP: pg_dump: use Zstd by default, for CI only
//-os-only: linux-meson
---
.cirrus.yml | 9 ++++++++-
src/bin/pg_dump/compress_zstd.c | 9 +++++++++
src/bin/pg_dump/pg_dump.c | 4 ++--
src/bin/pg_dump/t/002_pg_dump.pl | 14 +++++++-------
4 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/.cirrus.yml b/.cirrus.yml
index 5b1747522f9..14402a0ad5c 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -267,6 +267,7 @@ LINUX_CONFIGURE_FEATURES: &LINUX_CONFIGURE_FEATURES >-
LINUX_MESON_FEATURES: &LINUX_MESON_FEATURES >-
-Dllvm=enabled
-Duuid=e2fs
+ -Dzstd=enabled
# Linux, both 32bit and 64bit
@@ -389,6 +390,9 @@ task:
configure_script: |
su postgres <<-EOF
+ mkdir subprojects
+ meson wrap install zstd
+ meson configure -D zstd:multithread=enabled --force-fallback-for=zstd
meson setup \
--buildtype=debug \
-Dcassert=true \
@@ -616,7 +620,10 @@ task:
# Use /DEBUG:FASTLINK to avoid high memory usage during linking
configure_script: |
vcvarsall x64
- meson setup --backend ninja --buildtype debug -Dc_link_args=/DEBUG:FASTLINK -Dcassert=true -Db_pch=true -Dextra_lib_dirs=c:\openssl\1.1\lib -Dextra_include_dirs=c:\openssl\1.1\include -DTAR=%TAR% -DPG_TEST_EXTRA="%PG_TEST_EXTRA%" build
+ mkdir subprojects
+ meson wrap install zstd
+ meson configure -D zstd:multithread=enabled --force-fallback-for=zstd
+ meson setup --backend ninja --buildtype debug -Dc_link_args=/DEBUG:FASTLINK -Dcassert=true -Db_pch=true -Dextra_lib_dirs=c:\openssl\1.1\lib -Dextra_include_dirs=c:\openssl\1.1\include -DTAR=%TAR% -DPG_TEST_EXTRA="%PG_TEST_EXTRA%" -D zstd=enabled build
build_script: |
vcvarsall x64
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index 1b821f8ecb1..94dc16cff49 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -88,6 +88,15 @@ _ZstdCStreamParams(pg_compress_specification compress)
if (compress.options & PG_COMPRESSION_OPTION_WORKERS)
_Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
compress.workers, "workers");
+ else
+ {
+ size_t res;
+
+ res = ZSTD_CCtx_setParameter(cstream, ZSTD_c_nbWorkers, 3);
+ if (ZSTD_isError(res))
+ pg_log_warning("could not set compression parameter: \"%s\": %s",
+ "workers", ZSTD_getErrorName(res));
+ }
return cstream;
}
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index c0c165c2940..f146a29aeff 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -747,8 +747,8 @@ main(int argc, char **argv)
if ((archiveFormat == archCustom || archiveFormat == archDirectory) &&
!user_compression_defined)
{
-#ifdef HAVE_LIBZ
- parse_compress_specification(PG_COMPRESSION_GZIP, NULL,
+#ifdef USE_ZSTD
+ parse_compress_specification(PG_COMPRESSION_ZSTD, NULL,
&compression_spec);
#else
/* Nothing to do in the default case */
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index bb898b06bb4..0a635ae9fc3 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -385,10 +385,10 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_custom_format.dump", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
- name => 'data content is gzip-compressed by default if available',
+ name => 'data content is zstd-compressed by default if available',
},
},
@@ -410,16 +410,16 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_dir_format", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
name => 'data content is gzip-compressed by default',
},
glob_patterns => [
"$tempdir/defaults_dir_format/toc.dat",
"$tempdir/defaults_dir_format/blobs.toc",
- $supports_gzip ?
- "$tempdir/defaults_dir_format/*.dat.gz" :
+ $supports_zstd ?
+ "$tempdir/defaults_dir_format/*.dat.zst" :
"$tempdir/defaults_dir_format/*.dat",
],
},
--
2.34.1
On 4/1/23 01:16, Justin Pryzby wrote:
On Tue, Mar 28, 2023 at 06:23:26PM +0200, Tomas Vondra wrote:
On 3/27/23 19:28, Justin Pryzby wrote:
On Fri, Mar 17, 2023 at 03:43:31AM +0100, Tomas Vondra wrote:
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?It looks like pg_dump's meson.build is missing dependencies on zstd
(meson couldn't find the headers in the subproject without them).I saw that this was added for LZ4, but I hadn't added it for zstd since
I didn't run into an issue without it. Could you check that what I've
added works for your case ?Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned.Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.My line of reasoning for unix is that pg_dump forks before any calls to
zstd. Nothing zstd does ought to affect the pg_dump layer. But that
doesn't apply to pg_dump under windows. This is an opened question. If
there's no solid answer, I could disable/ignore the option (maybe only
under windows).I may be missing something, but why would the patch affect this? Why
would it even affect safety of the parallel dump? And I don't see any
changes to the clone stuff ...zstd supports using threads during compression, with -Z zstd:workers=N.
When unix forks, the child processes can't do anything to mess up the
state of the parent processes.But windows pg_dump uses threads instead of forking, so it seems
possible that the pg_dump -j threads that then spawn zstd threads could
"leak threads" and break the main thread. I suspect there's no issue,
but we still ought to verify that before declaring it safe.OK. I don't have access to a Windows machine so I can't test that. Is it
possible to disable the zstd threading, until we figure this out?I think that's what's best. I made it issue a warning if "workers" was
specified. It could also be an error, or just ignored.I considered disabling workers only for windows, but realized that I
haven't tested with threads myself - my local zstd package is compiled
without threading, and I remember having some issue recompiling it with
threading. Jacob's recipe for using meson wraps works well, but it
still seems better to leave it as a future feature. I used that recipe
to enabled zstd with threading on CI (except for linux/autoconf).
+1 to disable this if we're unsure it works correctly. I agree it's
better to just error out if workers are requested - I rather dislike
when a tool just ignores an explicit parameter. And AFAICS it's what
zstd does too, when someone requests workers on incompatible build.
FWIW I've been thinking about this a bit more and I don't quite see why
would the threading cause issues (except for Windows). I forgot
pg_basebackup already supports zstd, including the worker threading, so
why would it work there and not in pg_dump? Sure, pg_basebackup is not
parallel, but with separate pg_dump processes that shouldn't be an issue
(although I'm not sure when zstd creates threads).
The one thing I'm wondering about is at which point are the worker
threads spawned - but presumably not before the pg_dump processes fork.
I'll try building zstd with threading enabled, and do some tests over
the weekend.
The function is first checking if it was passed a filename which already
has a suffix. And if not, it searches through a list of suffixes,
testing for an existing file with each suffix. The search with stat()
doesn't happen if it has a suffix. I'm having trouble seeing how the
hasSuffix() branch isn't dead code. Another opened question.AFAICS it's done this way because of this comment in pg_backup_directory
* ...
* ".gz" suffix is added to the filenames. The TOC files are never
* compressed by pg_dump, however they are accepted with the .gz suffix
* too, in case the user has manually compressed them with 'gzip'.I haven't tried, but I believe that if you manually compress the
directory, it may hit this branch.That would make sense, but when I tried, it didn't work like that.
The filenames were all uncompressed names. Maybe it worked differently
in an older release. Or maybe it changed during development of the
parallel-directory-dump patch and it's actually dead code.Interesting. Would be good to find out. I wonder if a little bit of
git-log digging could tell us more. Anyway, until we confirm it's dead
code, we should probably do what .gz does and have the same check for
.lz4 and .zst files.I found that hasSuffix() and cfopen() originated in the refactored patch
Heikki's sent here; there's no history beyond that./messages/by-id/4D3954C7.9060503@enterprisedb.com
The patch published there appends the .gz within cfopen(), and the
caller writes into the TOC the filename without ".gz". It seems like
maybe a few hours prior, Heikki may have been appending the ".gz" suffix
in the caller, and then writing the TOC with filename.gz.The only way I've been able to get a "filename.gz" passed to hasSuffix
is to write a directory-format dump, with LOs, and without compression,
and then compress the blobs with "gzip", and *also* edit the blobs.toc
file to say ".gz" (which isn't necessary since, if the original file
isn't found, the restore would search for files with compressed
suffixes).So .. it's not *technically* unreachable, but I can't see why it'd be
useful to support editing the *content* of the blob TOC (other than
compressing it). I might give some weight to the idea if it were also
possible to edit the non-blob TOC; but, it's a binary file, so no.For now, I made the change to make zstd and lz4 to behave the same here
as .gz, unless Heikki has a memory or a git reflog going back far enough
to further support the idea that the code path isn't useful.
Makes sense. Let's keep the same behavior for all compression methods,
and if it's unreachable we shall remove it from all. It's a trivial
amount of code, we can live with that.
I'm going to set the patch as RFC as a hint to anyone who would want to
make a final review.
OK.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Sat, Apr 01, 2023 at 02:11:12AM +0200, Tomas Vondra wrote:
On 4/1/23 01:16, Justin Pryzby wrote:
On Tue, Mar 28, 2023 at 06:23:26PM +0200, Tomas Vondra wrote:
On 3/27/23 19:28, Justin Pryzby wrote:
On Fri, Mar 17, 2023 at 03:43:31AM +0100, Tomas Vondra wrote:
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?It looks like pg_dump's meson.build is missing dependencies on zstd
(meson couldn't find the headers in the subproject without them).I saw that this was added for LZ4, but I hadn't added it for zstd since
I didn't run into an issue without it. Could you check that what I've
added works for your case ?Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned.Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.My line of reasoning for unix is that pg_dump forks before any calls to
zstd. Nothing zstd does ought to affect the pg_dump layer. But that
doesn't apply to pg_dump under windows. This is an opened question. If
there's no solid answer, I could disable/ignore the option (maybe only
under windows).I may be missing something, but why would the patch affect this? Why
would it even affect safety of the parallel dump? And I don't see any
changes to the clone stuff ...zstd supports using threads during compression, with -Z zstd:workers=N.
When unix forks, the child processes can't do anything to mess up the
state of the parent processes.But windows pg_dump uses threads instead of forking, so it seems
possible that the pg_dump -j threads that then spawn zstd threads could
"leak threads" and break the main thread. I suspect there's no issue,
but we still ought to verify that before declaring it safe.OK. I don't have access to a Windows machine so I can't test that. Is it
possible to disable the zstd threading, until we figure this out?I think that's what's best. I made it issue a warning if "workers" was
specified. It could also be an error, or just ignored.I considered disabling workers only for windows, but realized that I
haven't tested with threads myself - my local zstd package is compiled
without threading, and I remember having some issue recompiling it with
threading. Jacob's recipe for using meson wraps works well, but it
still seems better to leave it as a future feature. I used that recipe
to enabled zstd with threading on CI (except for linux/autoconf).+1 to disable this if we're unsure it works correctly. I agree it's
better to just error out if workers are requested - I rather dislike
when a tool just ignores an explicit parameter. And AFAICS it's what
zstd does too, when someone requests workers on incompatible build.FWIW I've been thinking about this a bit more and I don't quite see why
would the threading cause issues (except for Windows). I forgot
pg_basebackup already supports zstd, including the worker threading, so
why would it work there and not in pg_dump? Sure, pg_basebackup is not
parallel, but with separate pg_dump processes that shouldn't be an issue
(although I'm not sure when zstd creates threads).
There's no concern at all except under windows (because on windows
pg_dump -j is implemented using threads rather than forking).
Especially since zstd:workers is already allowed in the basebackup
backend process.
I'll try building zstd with threading enabled, and do some tests over
the weekend.
Feel free to wait until v17 :)
I used "meson wraps" to get a local version with threading. Note that
if you want to use a zstd subproject, you may have to specify -D
zstd=enabled, or else meson may not enable the library at all.
Also, in order to introspect its settings, I had to do like this:
mkdir subprojects
meson wrap install zstd
meson subprojects download
mkdir build.meson
meson setup -C build.meson --force-fallback-for=zstd
--
Justin
On 4/1/23 02:28, Justin Pryzby wrote:
On Sat, Apr 01, 2023 at 02:11:12AM +0200, Tomas Vondra wrote:
On 4/1/23 01:16, Justin Pryzby wrote:
On Tue, Mar 28, 2023 at 06:23:26PM +0200, Tomas Vondra wrote:
On 3/27/23 19:28, Justin Pryzby wrote:
On Fri, Mar 17, 2023 at 03:43:31AM +0100, Tomas Vondra wrote:
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?It looks like pg_dump's meson.build is missing dependencies on zstd
(meson couldn't find the headers in the subproject without them).I saw that this was added for LZ4, but I hadn't added it for zstd since
I didn't run into an issue without it. Could you check that what I've
added works for your case ?Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned.Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.My line of reasoning for unix is that pg_dump forks before any calls to
zstd. Nothing zstd does ought to affect the pg_dump layer. But that
doesn't apply to pg_dump under windows. This is an opened question. If
there's no solid answer, I could disable/ignore the option (maybe only
under windows).I may be missing something, but why would the patch affect this? Why
would it even affect safety of the parallel dump? And I don't see any
changes to the clone stuff ...zstd supports using threads during compression, with -Z zstd:workers=N.
When unix forks, the child processes can't do anything to mess up the
state of the parent processes.But windows pg_dump uses threads instead of forking, so it seems
possible that the pg_dump -j threads that then spawn zstd threads could
"leak threads" and break the main thread. I suspect there's no issue,
but we still ought to verify that before declaring it safe.OK. I don't have access to a Windows machine so I can't test that. Is it
possible to disable the zstd threading, until we figure this out?I think that's what's best. I made it issue a warning if "workers" was
specified. It could also be an error, or just ignored.I considered disabling workers only for windows, but realized that I
haven't tested with threads myself - my local zstd package is compiled
without threading, and I remember having some issue recompiling it with
threading. Jacob's recipe for using meson wraps works well, but it
still seems better to leave it as a future feature. I used that recipe
to enabled zstd with threading on CI (except for linux/autoconf).+1 to disable this if we're unsure it works correctly. I agree it's
better to just error out if workers are requested - I rather dislike
when a tool just ignores an explicit parameter. And AFAICS it's what
zstd does too, when someone requests workers on incompatible build.FWIW I've been thinking about this a bit more and I don't quite see why
would the threading cause issues (except for Windows). I forgot
pg_basebackup already supports zstd, including the worker threading, so
why would it work there and not in pg_dump? Sure, pg_basebackup is not
parallel, but with separate pg_dump processes that shouldn't be an issue
(although I'm not sure when zstd creates threads).There's no concern at all except under windows (because on windows
pg_dump -j is implemented using threads rather than forking).
Especially since zstd:workers is already allowed in the basebackup
backend process.
If there are no concerns, why disable it outside Windows? I don't have a
good idea how beneficial the multi-threaded compression is, so I can't
quite judge the risk/benefits tradeoff.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Sat, Apr 01, 2023 at 02:49:44PM +0200, Tomas Vondra wrote:
On 4/1/23 02:28, Justin Pryzby wrote:
On Sat, Apr 01, 2023 at 02:11:12AM +0200, Tomas Vondra wrote:
On 4/1/23 01:16, Justin Pryzby wrote:
On Tue, Mar 28, 2023 at 06:23:26PM +0200, Tomas Vondra wrote:
On 3/27/23 19:28, Justin Pryzby wrote:
On Fri, Mar 17, 2023 at 03:43:31AM +0100, Tomas Vondra wrote:
On 3/16/23 05:50, Justin Pryzby wrote:
On Fri, Mar 10, 2023 at 12:48:13PM -0800, Jacob Champion wrote:
On Wed, Mar 8, 2023 at 10:59 AM Jacob Champion <jchampion@timescale.com> wrote:
I did some smoke testing against zstd's GitHub release on Windows. To
build against it, I had to construct an import library, and put that
and the DLL into the `lib` folder expected by the MSVC scripts...
which makes me wonder if I've chosen a harder way than necessary?It looks like pg_dump's meson.build is missing dependencies on zstd
(meson couldn't find the headers in the subproject without them).I saw that this was added for LZ4, but I hadn't added it for zstd since
I didn't run into an issue without it. Could you check that what I've
added works for your case ?Parallel zstd dumps seem to work as expected, in that the resulting
pg_restore output is identical to uncompressed dumps and nothing
explodes. I haven't inspected the threading implementation for safety
yet, as you mentioned.Hm. Best I can tell, the CloneArchive() machinery is supposed to be
handling safety for this, by isolating each thread's state. I don't feel
comfortable pronouncing this new addition safe or not, because I'm not
sure I understand what the comments in the format-specific _Clone()
callbacks are saying yet.My line of reasoning for unix is that pg_dump forks before any calls to
zstd. Nothing zstd does ought to affect the pg_dump layer. But that
doesn't apply to pg_dump under windows. This is an opened question. If
there's no solid answer, I could disable/ignore the option (maybe only
under windows).I may be missing something, but why would the patch affect this? Why
would it even affect safety of the parallel dump? And I don't see any
changes to the clone stuff ...zstd supports using threads during compression, with -Z zstd:workers=N.
When unix forks, the child processes can't do anything to mess up the
state of the parent processes.But windows pg_dump uses threads instead of forking, so it seems
possible that the pg_dump -j threads that then spawn zstd threads could
"leak threads" and break the main thread. I suspect there's no issue,
but we still ought to verify that before declaring it safe.OK. I don't have access to a Windows machine so I can't test that. Is it
possible to disable the zstd threading, until we figure this out?I think that's what's best. I made it issue a warning if "workers" was
specified. It could also be an error, or just ignored.I considered disabling workers only for windows, but realized that I
haven't tested with threads myself - my local zstd package is compiled
without threading, and I remember having some issue recompiling it with
threading. Jacob's recipe for using meson wraps works well, but it
still seems better to leave it as a future feature. I used that recipe
to enabled zstd with threading on CI (except for linux/autoconf).+1 to disable this if we're unsure it works correctly. I agree it's
better to just error out if workers are requested - I rather dislike
when a tool just ignores an explicit parameter. And AFAICS it's what
zstd does too, when someone requests workers on incompatible build.FWIW I've been thinking about this a bit more and I don't quite see why
would the threading cause issues (except for Windows). I forgot
pg_basebackup already supports zstd, including the worker threading, so
why would it work there and not in pg_dump? Sure, pg_basebackup is not
parallel, but with separate pg_dump processes that shouldn't be an issue
(although I'm not sure when zstd creates threads).There's no concern at all except under windows (because on windows
pg_dump -j is implemented using threads rather than forking).
Especially since zstd:workers is already allowed in the basebackup
backend process.If there are no concerns, why disable it outside Windows? I don't have a
good idea how beneficial the multi-threaded compression is, so I can't
quite judge the risk/benefits tradeoff.
Because it's a minor/fringe feature, and it's annoying to have platform
differences (would we plan on relaxing the restriction in v17, or is it
more likely we'd forget ?).
I realized how little I've tested with zstd workers myself. And I think
on cirrusci, the macos and freebsd tasks have zstd libraries with
threading support, but it wasn't being exercised (because using :workers
would cause the patch to fail unless it's supported everywhere). So I
updated the "for CI only" patch to 1) use meson wraps to compile zstd
library with threading on linux and windows; and, 2) use zstd:workers=3
"opportunistically" (but avoid failing if threads are not supported,
since the autoconf task still doesn't have access to a library with
thread support). That's a great step, but it still seems bad that the
thread stuff has been little exercised until now. (Also, the windows
task failed; I think that's due to a transient network issue).
Feel free to mess around with threads (but I'd much rather see the patch
progress for zstd:long).
--
Justin
On 4/1/23 15:36, Justin Pryzby wrote:
...
If there are no concerns, why disable it outside Windows? I don't have a
good idea how beneficial the multi-threaded compression is, so I can't
quite judge the risk/benefits tradeoff.Because it's a minor/fringe feature, and it's annoying to have platform
differences (would we plan on relaxing the restriction in v17, or is it
more likely we'd forget ?).I realized how little I've tested with zstd workers myself. And I think
on cirrusci, the macos and freebsd tasks have zstd libraries with
threading support, but it wasn't being exercised (because using :workers
would cause the patch to fail unless it's supported everywhere). So I
updated the "for CI only" patch to 1) use meson wraps to compile zstd
library with threading on linux and windows; and, 2) use zstd:workers=3
"opportunistically" (but avoid failing if threads are not supported,
since the autoconf task still doesn't have access to a library with
thread support). That's a great step, but it still seems bad that the
thread stuff has been little exercised until now. (Also, the windows
task failed; I think that's due to a transient network issue).
Agreed, let's leave the threading for PG17, depending on how beneficial
it turns out to be for pg_dump.
Feel free to mess around with threads (but I'd much rather see the patch
progress for zstd:long).
OK, understood. The long mode patch is pretty simple. IIUC it does not
change the format, i.e. in the worst case we could leave it for PG17
too. Correct?
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Sat, Apr 01, 2023 at 10:26:01PM +0200, Tomas Vondra wrote:
Feel free to mess around with threads (but I'd much rather see the patch
progress for zstd:long).OK, understood. The long mode patch is pretty simple. IIUC it does not
change the format, i.e. in the worst case we could leave it for PG17
too. Correct?
Right, libzstd only has one "format", which is the same as what's used
by the commandline tool. zstd:long doesn't change the format of the
output: the library just uses a larger memory buffer to allow better
compression. There's no format change for zstd:workers, either.
--
Justin
On 4/3/23 21:17, Justin Pryzby wrote:
On Sat, Apr 01, 2023 at 10:26:01PM +0200, Tomas Vondra wrote:
Feel free to mess around with threads (but I'd much rather see the patch
progress for zstd:long).OK, understood. The long mode patch is pretty simple. IIUC it does not
change the format, i.e. in the worst case we could leave it for PG17
too. Correct?Right, libzstd only has one "format", which is the same as what's used
by the commandline tool. zstd:long doesn't change the format of the
output: the library just uses a larger memory buffer to allow better
compression. There's no format change for zstd:workers, either.
OK. I plan to do a bit more review/testing on this, and get it committed
over the next day or two, likely including the long mode. One thing I
noticed today is that maybe long_distance should be a bool, not int.
Yes, ZSTD_c_enableLongDistanceMatching() accepts int, but it'd be
cleaner to cast the value during a call and keep it bool otherwise.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Mon, Apr 03, 2023 at 11:26:09PM +0200, Tomas Vondra wrote:
On 4/3/23 21:17, Justin Pryzby wrote:
On Sat, Apr 01, 2023 at 10:26:01PM +0200, Tomas Vondra wrote:
Feel free to mess around with threads (but I'd much rather see the patch
progress for zstd:long).OK, understood. The long mode patch is pretty simple. IIUC it does not
change the format, i.e. in the worst case we could leave it for PG17
too. Correct?Right, libzstd only has one "format", which is the same as what's used
by the commandline tool. zstd:long doesn't change the format of the
output: the library just uses a larger memory buffer to allow better
compression. There's no format change for zstd:workers, either.OK. I plan to do a bit more review/testing on this, and get it committed
over the next day or two, likely including the long mode. One thing I
noticed today is that maybe long_distance should be a bool, not int.
Yes, ZSTD_c_enableLongDistanceMatching() accepts int, but it'd be
cleaner to cast the value during a call and keep it bool otherwise.
Thanks for noticing. Evidently I wrote it using "int" to get the
feature working, and then later wrote the bool parsing bits but never
changed the data structure.
This also updates a few comments, indentation, removes a useless
assertion, and updates the warning about zstd:workers.
--
Justin
Attachments:
0001-pg_dump-zstd-compression.patchtext/x-diff; charset=us-asciiDownload
From df0eb4d3c4799f24e58f1e5b0a9470e5af355ad6 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 7 Jan 2023 15:45:06 -0600
Subject: [PATCH 1/4] pg_dump: zstd compression
Previously proposed at: 20201221194924.GI30237@telsasoft.com
---
doc/src/sgml/ref/pg_dump.sgml | 13 +-
src/bin/pg_dump/Makefile | 2 +
src/bin/pg_dump/compress_io.c | 66 ++--
src/bin/pg_dump/compress_zstd.c | 537 ++++++++++++++++++++++++++
src/bin/pg_dump/compress_zstd.h | 25 ++
src/bin/pg_dump/meson.build | 4 +-
src/bin/pg_dump/pg_backup_archiver.c | 9 +-
src/bin/pg_dump/pg_backup_directory.c | 2 +
src/bin/pg_dump/pg_dump.c | 20 +-
src/bin/pg_dump/t/002_pg_dump.pl | 79 +++-
src/tools/pginclude/cpluspluscheck | 1 +
src/tools/pgindent/typedefs.list | 1 +
12 files changed, 705 insertions(+), 54 deletions(-)
create mode 100644 src/bin/pg_dump/compress_zstd.c
create mode 100644 src/bin/pg_dump/compress_zstd.h
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 77299878e02..8de38e0fd0d 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -330,8 +330,9 @@ PostgreSQL documentation
machine-readable format that <application>pg_restore</application>
can read. A directory format archive can be manipulated with
standard Unix tools; for example, files in an uncompressed archive
- can be compressed with the <application>gzip</application> or
- <application>lz4</application> tools.
+ can be compressed with the <application>gzip</application>,
+ <application>lz4</application>, or
+ <application>zstd</application> tools.
This format is compressed by default using <literal>gzip</literal>
and also supports parallel dumps.
</para>
@@ -655,7 +656,8 @@ PostgreSQL documentation
<para>
Specify the compression method and/or the compression level to use.
The compression method can be set to <literal>gzip</literal>,
- <literal>lz4</literal>, or <literal>none</literal> for no compression.
+ <literal>lz4</literal>, <literal>zstd</literal>,
+ or <literal>none</literal> for no compression.
A compression detail string can optionally be specified. If the
detail string is an integer, it specifies the compression level.
Otherwise, it should be a comma-separated list of items, each of the
@@ -676,8 +678,9 @@ PostgreSQL documentation
individual table-data segments, and the default is to compress using
<literal>gzip</literal> at a moderate level. For plain text output,
setting a nonzero compression level causes the entire output file to be compressed,
- as though it had been fed through <application>gzip</application> or
- <application>lz4</application>; but the default is not to compress.
+ as though it had been fed through <application>gzip</application>,
+ <application>lz4</application>, or <application>zstd</application>;
+ but the default is not to compress.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile
index eb8f59459a1..24de7593a6a 100644
--- a/src/bin/pg_dump/Makefile
+++ b/src/bin/pg_dump/Makefile
@@ -18,6 +18,7 @@ include $(top_builddir)/src/Makefile.global
export GZIP_PROGRAM=$(GZIP)
export LZ4
+export ZSTD
export with_icu
override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
@@ -29,6 +30,7 @@ OBJS = \
compress_io.o \
compress_lz4.o \
compress_none.o \
+ compress_zstd.o \
dumputils.o \
parallel.o \
pg_backup_archiver.o \
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c
index 0972a4f934a..4f06bb024f9 100644
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -52,8 +52,8 @@
*
* InitDiscoverCompressFileHandle tries to infer the compression by the
* filename suffix. If the suffix is not yet known then it tries to simply
- * open the file and if it fails, it tries to open the same file with the .gz
- * suffix, and then again with the .lz4 suffix.
+ * open the file and if it fails, it tries to open the same file with
+ * compressed suffixes.
*
* IDENTIFICATION
* src/bin/pg_dump/compress_io.c
@@ -69,6 +69,7 @@
#include "compress_io.h"
#include "compress_lz4.h"
#include "compress_none.h"
+#include "compress_zstd.h"
#include "pg_backup_utils.h"
/*----------------------
@@ -77,7 +78,8 @@
*/
/*
- * Checks whether a compression algorithm is supported.
+ * Checks whether support for a compression algorithm is implemented in
+ * pg_dump/restore.
*
* On success returns NULL, otherwise returns a malloc'ed string which can be
* used by the caller in an error message.
@@ -98,6 +100,10 @@ supports_compression(const pg_compress_specification compression_spec)
if (algorithm == PG_COMPRESSION_LZ4)
supported = true;
#endif
+#ifdef USE_ZSTD
+ if (algorithm == PG_COMPRESSION_ZSTD)
+ supported = true;
+#endif
if (!supported)
return psprintf("this build does not support compression with %s",
@@ -130,6 +136,8 @@ AllocateCompressor(const pg_compress_specification compression_spec,
InitCompressorGzip(cs, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressorLZ4(cs, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressorZstd(cs, compression_spec);
return cs;
}
@@ -196,20 +204,30 @@ InitCompressFileHandle(const pg_compress_specification compression_spec)
InitCompressFileHandleGzip(CFH, compression_spec);
else if (compression_spec.algorithm == PG_COMPRESSION_LZ4)
InitCompressFileHandleLZ4(CFH, compression_spec);
+ else if (compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ InitCompressFileHandleZstd(CFH, compression_spec);
return CFH;
}
+static bool
+check_compressed_file(const char *path, char **fname, char *ext)
+{
+ free_keep_errno(*fname);
+ *fname = psprintf("%s.%s", path, ext);
+ return (access(*fname, F_OK) == 0);
+}
+
/*
* Open a file for reading. 'path' is the file to open, and 'mode' should
* be either "r" or "rb".
*
* If the file at 'path' contains the suffix of a supported compression method,
- * currently this includes ".gz" and ".lz4", then this compression will be used
+ * currently this includes ".gz", ".lz4" and ".zst", then this compression will be used
* throughout. Otherwise the compression will be inferred by iteratively trying
* to open the file at 'path', first as is, then by appending known compression
* suffixes. So if you pass "foo" as 'path', this will open either "foo" or
- * "foo.gz" or "foo.lz4", trying in that order.
+ * "foo.{gz,lz4,zst}", trying in that order.
*
* On failure, return NULL with an error code in errno.
*/
@@ -229,36 +247,20 @@ InitDiscoverCompressFileHandle(const char *path, const char *mode)
if (hasSuffix(fname, ".gz"))
compression_spec.algorithm = PG_COMPRESSION_GZIP;
+ else if (hasSuffix(fname, ".lz4"))
+ compression_spec.algorithm = PG_COMPRESSION_LZ4;
+ else if (hasSuffix(fname, ".zst"))
+ compression_spec.algorithm = PG_COMPRESSION_ZSTD;
else
{
- bool exists;
-
- exists = (stat(path, &st) == 0);
- /* avoid unused warning if it is not built with compression */
- if (exists)
+ if (stat(path, &st) == 0)
compression_spec.algorithm = PG_COMPRESSION_NONE;
-#ifdef HAVE_LIBZ
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.gz", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_GZIP;
- }
-#endif
-#ifdef USE_LZ4
- if (!exists)
- {
- free_keep_errno(fname);
- fname = psprintf("%s.lz4", path);
- exists = (stat(fname, &st) == 0);
-
- if (exists)
- compression_spec.algorithm = PG_COMPRESSION_LZ4;
- }
-#endif
+ else if (check_compressed_file(path, &fname, "gz"))
+ compression_spec.algorithm = PG_COMPRESSION_GZIP;
+ else if (check_compressed_file(path, &fname, "lz4"))
+ compression_spec.algorithm = PG_COMPRESSION_LZ4;
+ else if (check_compressed_file(path, &fname, "zst"))
+ compression_spec.algorithm = PG_COMPRESSION_ZSTD;
}
CFH = InitCompressFileHandle(compression_spec);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
new file mode 100644
index 00000000000..cf85c3a4c93
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -0,0 +1,537 @@
+/*-------------------------------------------------------------------------
+ *
+ * compress_zstd.c
+ * Routines for archivers to write a Zstd compressed data stream.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/bin/pg_dump/compress_zstd.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+#include "pg_backup_utils.h"
+#include "compress_zstd.h"
+
+#ifndef USE_ZSTD
+
+void
+InitCompressorZstd(CompressorState *cs, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH, const pg_compress_specification compression_spec)
+{
+ pg_fatal("this build does not support compression with %s", "ZSTD");
+}
+
+#else
+
+#include <zstd.h>
+
+typedef struct ZstdCompressorState
+{
+ /* This is a normal file to which we read/write compressed data */
+ FILE *fp;
+
+ ZSTD_CStream *cstream;
+ ZSTD_DStream *dstream;
+ ZSTD_outBuffer output;
+ ZSTD_inBuffer input;
+
+ /* pointer to a static string like from strerror(), for Zstd_write() */
+ const char *zstderror;
+} ZstdCompressorState;
+
+static ZSTD_CStream *_ZstdCStreamParams(pg_compress_specification compress);
+static void EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs);
+static void WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen);
+static void ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs);
+
+static void
+_Zstd_CCtx_setParam_or_die(ZSTD_CStream *cstream,
+ ZSTD_cParameter param, int value, char *paramname)
+{
+ size_t res;
+
+ res = ZSTD_CCtx_setParameter(cstream, param, value);
+ if (ZSTD_isError(res))
+ pg_fatal("could not set compression parameter: \"%s\": %s",
+ paramname, ZSTD_getErrorName(res));
+}
+
+/* Return a compression stream with parameters set per argument */
+static ZSTD_CStream *
+_ZstdCStreamParams(pg_compress_specification compress)
+{
+ ZSTD_CStream *cstream;
+
+ cstream = ZSTD_createCStream();
+ if (cstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ _Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
+ compress.level, "level");
+
+ return cstream;
+}
+
+/* Helper function for WriteDataToArchiveZstd and EndCompressorZstd */
+static void
+_ZstdWriteCommon(ArchiveHandle *AH, CompressorState *cs, bool flush)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop while there's any input or until flushed */
+ while (input->pos != input->size || flush)
+ {
+ size_t res;
+
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output,
+ input, flush ? ZSTD_e_end : ZSTD_e_continue);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not compress data: %s", ZSTD_getErrorName(res));
+
+ /*
+ * Extra paranoia: avoid zero-length chunks, since a zero length chunk
+ * is the EOF marker in the custom format. This should never happen
+ * but...
+ */
+ if (output->pos > 0)
+ cs->writeF(AH, output->dst, output->pos);
+
+ if (res == 0)
+ break; /* End of frame or all input consumed */
+ }
+}
+
+static void
+EndCompressorZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ if (cs->readF != NULL)
+ {
+ Assert(zstdcs->cstream == NULL);
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+ else if (cs->writeF != NULL)
+ {
+ Assert(zstdcs->dstream == NULL);
+ _ZstdWriteCommon(AH, cs, true);
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ pg_free(zstdcs);
+}
+
+static void
+WriteDataToArchiveZstd(ArchiveHandle *AH, CompressorState *cs,
+ const void *data, size_t dLen)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+
+ zstdcs->input.src = data;
+ zstdcs->input.size = dLen;
+ zstdcs->input.pos = 0;
+
+ _ZstdWriteCommon(AH, cs, false);
+}
+
+static void
+ReadDataFromArchiveZstd(ArchiveHandle *AH, CompressorState *cs)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) cs->private_data;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ size_t input_allocated_size = ZSTD_DStreamInSize();
+ size_t res;
+
+ for (;;)
+ {
+ size_t cnt;
+
+ /*
+ * Read compressed data. Note that readF can resize the buffer; the
+ * new size is tracked and used for future loops.
+ */
+ input->size = input_allocated_size;
+ cnt = cs->readF(AH, (char **) unconstify(void **, &input->src), &input->size);
+
+ /* ensure that readF didn't *shrink* the buffer */
+ Assert(input->size >= input_allocated_size);
+ input_allocated_size = input->size;
+ input->size = cnt;
+ input->pos = 0;
+
+ if (cnt == 0)
+ break;
+
+ /* Now decompress */
+ while (input->pos < input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ /*
+ * then write the decompressed data to the output handle
+ */
+ ((char *) output->dst)[output->pos] = '\0';
+ ahwrite(output->dst, 1, output->pos, AH);
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+ }
+}
+
+/* Public routine that supports Zstd compressed data I/O */
+void
+InitCompressorZstd(CompressorState *cs,
+ const pg_compress_specification compression_spec)
+{
+ ZstdCompressorState *zstdcs;
+
+ cs->readData = ReadDataFromArchiveZstd;
+ cs->writeData = WriteDataToArchiveZstd;
+ cs->end = EndCompressorZstd;
+
+ cs->compression_spec = compression_spec;
+
+ zstdcs = (ZstdCompressorState *) pg_malloc0(sizeof(*zstdcs));
+ cs->private_data = zstdcs;
+
+ /* We expect that exactly one of readF/writeF is specified */
+ Assert((cs->readF == NULL) != (cs->writeF == NULL));
+
+ if (cs->readF != NULL)
+ {
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+
+ zstdcs->input.size = ZSTD_DStreamInSize();
+ zstdcs->input.src = pg_malloc(zstdcs->input.size);
+
+ /*
+ * output.size is the buffer size we tell zstd it can output to.
+ * Allocate an additional byte such that ReadDataFromArchiveZstd() can
+ * call ahwrite() with a null-terminated string, which is an optimized
+ * case in ExecuteSqlCommandBuf().
+ */
+ zstdcs->output.size = ZSTD_DStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size + 1);
+ }
+ else if (cs->writeF != NULL)
+ {
+ zstdcs->cstream = _ZstdCStreamParams(cs->compression_spec);
+
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc(zstdcs->output.size);
+ zstdcs->output.pos = 0;
+ }
+}
+
+/*
+ * Compressed stream API
+ */
+
+static bool
+Zstd_read(void *ptr, size_t size, size_t *rdsize, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t input_allocated_size = ZSTD_DStreamInSize();
+ size_t res,
+ cnt;
+
+ output->size = size;
+ output->dst = ptr;
+ output->pos = 0;
+
+ for (;;)
+ {
+ Assert(input->pos <= input->size);
+ Assert(input->size <= input_allocated_size);
+
+ /*
+ * If the input is completely consumed, start back at the beginning
+ */
+ if (input->pos == input->size)
+ {
+ /* input->size is size produced by "fread" */
+ input->size = 0;
+ /* input->pos is position consumed by decompress */
+ input->pos = 0;
+ }
+
+ /* read compressed data if we must produce more input */
+ if (input->pos == input->size)
+ {
+ cnt = fread(unconstify(void *, input->src), 1, input_allocated_size, zstdcs->fp);
+ input->size = cnt;
+
+ Assert(cnt <= input_allocated_size);
+
+ /* If we have no more input to consume, we're done */
+ if (cnt == 0)
+ break;
+ }
+
+ while (input->pos < input->size)
+ {
+ /* now decompress */
+ res = ZSTD_decompressStream(zstdcs->dstream, output, input);
+
+ if (ZSTD_isError(res))
+ pg_fatal("could not decompress data: %s", ZSTD_getErrorName(res));
+
+ if (output->pos == output->size)
+ break; /* No more room for output */
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ if (output->pos == output->size)
+ break; /* We read all the data that fits */
+ }
+
+ if (rdsize != NULL)
+ *rdsize = output->pos;
+
+ return true;
+}
+
+static bool
+Zstd_write(const void *ptr, size_t size, CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+ size_t res,
+ cnt;
+
+ input->src = ptr;
+ input->size = size;
+ input->pos = 0;
+
+ /* Consume all input, to be flushed later */
+ while (input->pos != input->size)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_continue);
+ if (ZSTD_isError(res))
+ {
+ zstdcs->zstderror = ZSTD_getErrorName(res);
+ return false;
+ }
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ {
+ zstdcs->zstderror = strerror(errno);
+ return false;
+ }
+ }
+
+ return size;
+}
+
+static int
+Zstd_getc(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+ int ret;
+
+ if (CFH->read_func(&ret, 1, NULL, CFH) != 1)
+ {
+ if (feof(zstdcs->fp))
+ pg_fatal("could not read from input file: end of file");
+ else
+ pg_fatal("could not read from input file: %m");
+ }
+ return ret;
+}
+
+static char *
+Zstd_gets(char *buf, int len, CompressFileHandle *CFH)
+{
+ int i;
+
+ Assert(len > 0);
+
+ /*
+ * Read one byte at a time until newline or EOF. This is only used to read
+ * the list of LOs, and the I/O is buffered anyway.
+ */
+ for (i = 0; i < len - 1; ++i)
+ {
+ size_t readsz;
+
+ if (!CFH->read_func(&buf[i], 1, &readsz, CFH))
+ break;
+ if (readsz != 1)
+ break;
+ if (buf[i] == '\n')
+ {
+ ++i;
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return i > 0 ? buf : NULL;
+}
+
+static bool
+Zstd_close(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ if (zstdcs->cstream)
+ {
+ size_t res,
+ cnt;
+ ZSTD_inBuffer *input = &zstdcs->input;
+ ZSTD_outBuffer *output = &zstdcs->output;
+
+ /* Loop until the compression buffers are fully consumed */
+ for (;;)
+ {
+ output->pos = 0;
+ res = ZSTD_compressStream2(zstdcs->cstream, output, input, ZSTD_e_end);
+ if (ZSTD_isError(res))
+ {
+ zstdcs->zstderror = ZSTD_getErrorName(res);
+ return false;
+ }
+
+ cnt = fwrite(output->dst, 1, output->pos, zstdcs->fp);
+ if (cnt != output->pos)
+ {
+ zstdcs->zstderror = strerror(errno);
+ return false;
+ }
+
+ if (res == 0)
+ break; /* End of frame */
+ }
+
+ ZSTD_freeCStream(zstdcs->cstream);
+ pg_free(zstdcs->output.dst);
+ }
+
+ if (zstdcs->dstream)
+ {
+ ZSTD_freeDStream(zstdcs->dstream);
+ pg_free(unconstify(void *, zstdcs->input.src));
+ }
+
+ if (fclose(zstdcs->fp) != 0)
+ return false;
+
+ pg_free(zstdcs);
+ return true;
+}
+
+static bool
+Zstd_eof(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ return feof(zstdcs->fp);
+}
+
+static bool
+Zstd_open(const char *path, int fd, const char *mode,
+ CompressFileHandle *CFH)
+{
+ FILE *fp;
+ ZstdCompressorState *zstdcs;
+
+ if (fd >= 0)
+ fp = fdopen(fd, mode);
+ else
+ fp = fopen(path, mode);
+
+ if (fp == NULL)
+ return false;
+
+ zstdcs = (ZstdCompressorState *) pg_malloc0(sizeof(*zstdcs));
+ CFH->private_data = zstdcs;
+ zstdcs->fp = fp;
+
+ if (mode[0] == 'r')
+ {
+ zstdcs->input.src = pg_malloc0(ZSTD_DStreamInSize());
+ zstdcs->dstream = ZSTD_createDStream();
+ if (zstdcs->dstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ else if (mode[0] == 'w' || mode[0] == 'a')
+ {
+ zstdcs->output.size = ZSTD_CStreamOutSize();
+ zstdcs->output.dst = pg_malloc0(zstdcs->output.size);
+ zstdcs->cstream = _ZstdCStreamParams(CFH->compression_spec);
+ if (zstdcs->cstream == NULL)
+ pg_fatal("could not initialize compression library");
+ }
+ else
+ pg_fatal("unhandled mode");
+
+ return true;
+}
+
+static bool
+Zstd_open_write(const char *path, const char *mode, CompressFileHandle *CFH)
+{
+ char fname[MAXPGPATH];
+
+ sprintf(fname, "%s.zst", path);
+ return CFH->open_func(fname, -1, mode, CFH);
+}
+
+static const char *
+Zstd_get_error(CompressFileHandle *CFH)
+{
+ ZstdCompressorState *zstdcs = (ZstdCompressorState *) CFH->private_data;
+
+ return zstdcs->zstderror;
+}
+
+void
+InitCompressFileHandleZstd(CompressFileHandle *CFH,
+ const pg_compress_specification compression_spec)
+{
+ CFH->open_func = Zstd_open;
+ CFH->open_write_func = Zstd_open_write;
+ CFH->read_func = Zstd_read;
+ CFH->write_func = Zstd_write;
+ CFH->gets_func = Zstd_gets;
+ CFH->getc_func = Zstd_getc;
+ CFH->close_func = Zstd_close;
+ CFH->eof_func = Zstd_eof;
+ CFH->get_error_func = Zstd_get_error;
+
+ CFH->compression_spec = compression_spec;
+
+ CFH->private_data = NULL;
+}
+
+#endif /* USE_ZSTD */
diff --git a/src/bin/pg_dump/compress_zstd.h b/src/bin/pg_dump/compress_zstd.h
new file mode 100644
index 00000000000..2aaa6b100b1
--- /dev/null
+++ b/src/bin/pg_dump/compress_zstd.h
@@ -0,0 +1,25 @@
+/*-------------------------------------------------------------------------
+ *
+ * compress_zstd.h
+ * Zstd interface to compress_io.c routines
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/bin/pg_dump/compress_zstd.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef COMPRESS_ZSTD_H
+#define COMPRESS_ZSTD_H
+
+#include "compress_io.h"
+
+extern void InitCompressorZstd(CompressorState *cs,
+ const pg_compress_specification compression_spec);
+extern void InitCompressFileHandleZstd(CompressFileHandle *CFH,
+ const pg_compress_specification compression_spec);
+
+#endif /* COMPRESS_ZSTD_H */
diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build
index b2fb7ac77fd..9d59a106f36 100644
--- a/src/bin/pg_dump/meson.build
+++ b/src/bin/pg_dump/meson.build
@@ -5,6 +5,7 @@ pg_dump_common_sources = files(
'compress_io.c',
'compress_lz4.c',
'compress_none.c',
+ 'compress_zstd.c',
'dumputils.c',
'parallel.c',
'pg_backup_archiver.c',
@@ -19,7 +20,7 @@ pg_dump_common_sources = files(
pg_dump_common = static_library('libpgdump_common',
pg_dump_common_sources,
c_pch: pch_postgres_fe_h,
- dependencies: [frontend_code, libpq, lz4, zlib],
+ dependencies: [frontend_code, libpq, lz4, zlib, zstd],
kwargs: internal_lib_args,
)
@@ -90,6 +91,7 @@ tests += {
'env': {
'GZIP_PROGRAM': gzip.path(),
'LZ4': program_lz4.found() ? program_lz4.path() : '',
+ 'ZSTD': program_zstd.found() ? program_zstd.path() : '',
'with_icu': icu.found() ? 'yes' : 'no',
},
'tests': [
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index ab77e373e91..e8ee6b1ad86 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -2120,7 +2120,7 @@ _discoverArchiveFormat(ArchiveHandle *AH)
/*
* Check if the specified archive is a directory. If so, check if
- * there's a "toc.dat" (or "toc.dat.{gz,lz4}") file in it.
+ * there's a "toc.dat" (or "toc.dat.{gz,lz4,zst}") file in it.
*/
if (stat(AH->fSpec, &st) == 0 && S_ISDIR(st.st_mode))
{
@@ -2131,10 +2131,17 @@ _discoverArchiveFormat(ArchiveHandle *AH)
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.gz"))
return AH->format;
#endif
+
#ifdef USE_LZ4
if (_fileExistsInDirectory(AH->fSpec, "toc.dat.lz4"))
return AH->format;
#endif
+
+#ifdef USE_ZSTD
+ if (_fileExistsInDirectory(AH->fSpec, "toc.dat.zst"))
+ return AH->format;
+#endif
+
pg_fatal("directory \"%s\" does not appear to be a valid archive (\"toc.dat\" does not exist)",
AH->fSpec);
fh = NULL; /* keep compiler quiet */
diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c
index abaaa3b10e3..2177d5ff425 100644
--- a/src/bin/pg_dump/pg_backup_directory.c
+++ b/src/bin/pg_dump/pg_backup_directory.c
@@ -785,6 +785,8 @@ _PrepParallelRestore(ArchiveHandle *AH)
strlcat(fname, ".gz", sizeof(fname));
else if (AH->compression_spec.algorithm == PG_COMPRESSION_LZ4)
strlcat(fname, ".lz4", sizeof(fname));
+ else if (AH->compression_spec.algorithm == PG_COMPRESSION_ZSTD)
+ strlcat(fname, ".zst", sizeof(fname));
if (stat(fname, &st) == 0)
te->dataLength = st.st_size;
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 6abbcff6834..a426984046b 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -56,6 +56,7 @@
#include "catalog/pg_type_d.h"
#include "common/connect.h"
#include "common/relpath.h"
+#include "compress_io.h"
#include "dumputils.h"
#include "fe_utils/option_utils.h"
#include "fe_utils/string_utils.h"
@@ -735,18 +736,13 @@ main(int argc, char **argv)
pg_fatal("invalid compression specification: %s",
error_detail);
- switch (compression_algorithm)
- {
- case PG_COMPRESSION_NONE:
- /* fallthrough */
- case PG_COMPRESSION_GZIP:
- /* fallthrough */
- case PG_COMPRESSION_LZ4:
- break;
- case PG_COMPRESSION_ZSTD:
- pg_fatal("compression with %s is not yet supported", "ZSTD");
- break;
- }
+ error_detail = supports_compression(compression_spec);
+ if (error_detail != NULL)
+ pg_fatal("%s", error_detail);
+
+ if (compression_spec.options & PG_COMPRESSION_OPTION_WORKERS)
+ pg_log_warning("compression option \"%s\" is not currently supported by pg_dump",
+ "workers");
/*
* Custom and directory formats are compressed by default with gzip when
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 42215f82f7a..74f23ae7f74 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -54,8 +54,9 @@ my $tempdir = PostgreSQL::Test::Utils::tempdir;
# those lines) to validate that part of the process.
my $supports_icu = ($ENV{with_icu} eq 'yes');
-my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
my $supports_gzip = check_pg_config("#define HAVE_LIBZ 1");
+my $supports_lz4 = check_pg_config("#define USE_LZ4 1");
+my $supports_zstd = check_pg_config("#define USE_ZSTD 1");
my %pgdump_runs = (
binary_upgrade => {
@@ -213,6 +214,77 @@ my %pgdump_runs = (
},
},
+ compression_zstd_custom => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=custom',
+ '--compress=zstd', "--file=$tempdir/compression_zstd_custom.dump",
+ 'postgres',
+ ],
+ restore_cmd => [
+ 'pg_restore',
+ "--file=$tempdir/compression_zstd_custom.sql",
+ "$tempdir/compression_zstd_custom.dump",
+ ],
+ command_like => {
+ command => [
+ 'pg_restore',
+ '-l', "$tempdir/compression_zstd_custom.dump",
+ ],
+ expected => qr/Compression: zstd/,
+ name => 'data content is zstd compressed'
+ },
+ },
+
+ compression_zstd_dir => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--jobs=2',
+ '--format=directory', '--compress=zstd:1',
+ "--file=$tempdir/compression_zstd_dir", 'postgres',
+ ],
+ # Give coverage for manually compressed blob.toc files during
+ # restore.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-z', '-f', '--rm',
+ "$tempdir/compression_zstd_dir/blobs.toc",
+ "-o", "$tempdir/compression_zstd_dir/blobs.toc.zst",
+ ],
+ },
+ # Verify that data files were compressed
+ glob_patterns => [
+ "$tempdir/compression_zstd_dir/toc.dat",
+ "$tempdir/compression_zstd_dir/*.dat.zst",
+ ],
+ restore_cmd => [
+ 'pg_restore', '--jobs=2',
+ "--file=$tempdir/compression_zstd_dir.sql",
+ "$tempdir/compression_zstd_dir",
+ ],
+ },
+
+ compression_zstd_plain => {
+ test_key => 'compression',
+ compile_option => 'zstd',
+ dump_cmd => [
+ 'pg_dump', '--format=plain', '--compress=zstd',
+ "--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
+ ],
+ # Decompress the generated file to run through the tests.
+ compress_cmd => {
+ program => $ENV{'ZSTD'},
+ args => [
+ '-d', '-f',
+ "$tempdir/compression_zstd_plain.sql.zst",
+ "-o", "$tempdir/compression_zstd_plain.sql",
+ ],
+ },
+ },
+
clean => {
dump_cmd => [
'pg_dump',
@@ -4648,10 +4720,11 @@ foreach my $run (sort keys %pgdump_runs)
my $test_key = $run;
my $run_db = 'postgres';
- # Skip command-level tests for gzip/lz4 if there is no support for it.
+ # Skip command-level tests for gzip/lz4/zstd if the tool is not supported
if ($pgdump_runs{$run}->{compile_option} &&
(($pgdump_runs{$run}->{compile_option} eq 'gzip' && !$supports_gzip) ||
- ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4)))
+ ($pgdump_runs{$run}->{compile_option} eq 'lz4' && !$supports_lz4) ||
+ ($pgdump_runs{$run}->{compile_option} eq 'zstd' && !$supports_zstd)))
{
note "$run: skipped due to no $pgdump_runs{$run}->{compile_option} support";
next;
diff --git a/src/tools/pginclude/cpluspluscheck b/src/tools/pginclude/cpluspluscheck
index 58039934756..10fb51585c9 100755
--- a/src/tools/pginclude/cpluspluscheck
+++ b/src/tools/pginclude/cpluspluscheck
@@ -154,6 +154,7 @@ do
test "$f" = src/bin/pg_dump/compress_io.h && continue
test "$f" = src/bin/pg_dump/compress_lz4.h && continue
test "$f" = src/bin/pg_dump/compress_none.h && continue
+ test "$f" = src/bin/pg_dump/compress_zstd.h && continue
test "$f" = src/bin/pg_dump/parallel.h && continue
test "$f" = src/bin/pg_dump/pg_backup_archiver.h && continue
test "$f" = src/bin/pg_dump/pg_dump.h && continue
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 5c0410869f7..065acb6f50b 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3937,3 +3937,4 @@ yyscan_t
z_stream
z_streamp
zic_t
+ZSTD_CStream
--
2.34.1
0002-zstd-support-long-distance-mode-in-pg_dump-basebacku.patchtext/x-diff; charset=us-asciiDownload
From c82621cb1b1d5ab70dee2245be06ca29cabf8c35 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 27 Mar 2022 11:55:01 -0500
Subject: [PATCH 2/4] zstd: support long distance mode in pg_dump/basebackup
First proposed here:
20220327205020.GM28503@telsasoft.com
---
doc/src/sgml/protocol.sgml | 10 +++-
doc/src/sgml/ref/pg_basebackup.sgml | 4 +-
doc/src/sgml/ref/pg_dump.sgml | 2 +
src/backend/backup/basebackup_zstd.c | 12 ++++
src/bin/pg_basebackup/bbstreamer_zstd.c | 13 +++++
src/bin/pg_basebackup/t/010_pg_basebackup.pl | 9 ++-
src/bin/pg_dump/compress_zstd.c | 5 ++
src/bin/pg_dump/t/002_pg_dump.pl | 3 +-
src/bin/pg_verifybackup/t/008_untar.pl | 8 +++
src/bin/pg_verifybackup/t/010_client_untar.pl | 8 +++
src/common/compression.c | 57 ++++++++++++++++++-
src/include/common/compression.h | 2 +
12 files changed, 127 insertions(+), 6 deletions(-)
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index 8b5e7b1ad7f..b11d9a6ba35 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -2729,7 +2729,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
level. Otherwise, it should be a comma-separated list of items,
each of the form <replaceable>keyword</replaceable> or
<replaceable>keyword=value</replaceable>. Currently, the supported
- keywords are <literal>level</literal> and <literal>workers</literal>.
+ keywords are <literal>level</literal>, <literal>long</literal> and
+ <literal>workers</literal>.
</para>
<para>
@@ -2746,6 +2747,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
<literal>3</literal>).
</para>
+ <para>
+ The <literal>long</literal> keyword enables long-distance matching
+ mode, for improved compression ratio, at the expense of higher memory
+ use. Long-distance mode is supported only for
+ <literal>zstd</literal>.
+ </para>
+
<para>
The <literal>workers</literal> keyword sets the number of threads
that should be used for parallel compression. Parallel compression
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index db3ad9cd5eb..79d3e657c32 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -424,8 +424,8 @@ PostgreSQL documentation
level. Otherwise, it should be a comma-separated list of items,
each of the form <literal>keyword</literal> or
<literal>keyword=value</literal>.
- Currently, the supported keywords are <literal>level</literal>
- and <literal>workers</literal>.
+ Currently, the supported keywords are <literal>level</literal>,
+ <literal>long</literal>, and <literal>workers</literal>.
The detail string cannot be used when the compression method
is specified as a plain integer.
</para>
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 8de38e0fd0d..e81e35c13b3 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -681,6 +681,8 @@ PostgreSQL documentation
as though it had been fed through <application>gzip</application>,
<application>lz4</application>, or <application>zstd</application>;
but the default is not to compress.
+ With zstd compression, <literal>long</literal> mode may improve the
+ compression ratio, at the cost of increased memory use.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/backend/backup/basebackup_zstd.c b/src/backend/backup/basebackup_zstd.c
index ac6cac178a0..1bb5820c884 100644
--- a/src/backend/backup/basebackup_zstd.c
+++ b/src/backend/backup/basebackup_zstd.c
@@ -118,6 +118,18 @@ bbsink_zstd_begin_backup(bbsink *sink)
compress->workers, ZSTD_getErrorName(ret)));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(mysink->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret)));
+ }
+
/*
* We need our own buffer, because we're going to pass different data to
* the next sink than what gets passed to us.
diff --git a/src/bin/pg_basebackup/bbstreamer_zstd.c b/src/bin/pg_basebackup/bbstreamer_zstd.c
index fe17d6df4ef..fba391e2a0f 100644
--- a/src/bin/pg_basebackup/bbstreamer_zstd.c
+++ b/src/bin/pg_basebackup/bbstreamer_zstd.c
@@ -106,6 +106,19 @@ bbstreamer_zstd_compressor_new(bbstreamer *next, pg_compress_specification *comp
compress->workers, ZSTD_getErrorName(ret));
}
+ if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+ {
+ ret = ZSTD_CCtx_setParameter(streamer->cctx,
+ ZSTD_c_enableLongDistanceMatching,
+ compress->long_distance);
+ if (ZSTD_isError(ret))
+ {
+ pg_log_error("could not set compression flag for %s: %s",
+ "long", ZSTD_getErrorName(ret));
+ exit(1);
+ }
+ }
+
/* Initialize the ZSTD output buffer. */
streamer->zstd_outBuf.dst = streamer->base.bbs_buffer.data;
streamer->zstd_outBuf.size = streamer->base.bbs_buffer.maxlen;
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index b60cb78a0d5..4d130a7f944 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -139,7 +139,14 @@ SKIP:
'gzip:workers=3',
'invalid compression specification: compression algorithm "gzip" does not accept a worker count',
'failure on worker count for gzip'
- ],);
+ ],
+ [
+ 'gzip:long',
+ 'invalid compression specification: compression algorithm "gzip" does not support long-distance mode',
+ 'failure on long mode for gzip'
+ ],
+ );
+
for my $cft (@compression_failure_tests)
{
my $cfail = quotemeta($client_fails . $cft->[1]);
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index cf85c3a4c93..49a877ce010 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -80,6 +80,11 @@ _ZstdCStreamParams(pg_compress_specification compress)
_Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
compress.level, "level");
+ if (compress.options & PG_COMPRESSION_OPTION_LONG_DISTANCE)
+ _Zstd_CCtx_setParam_or_die(cstream,
+ ZSTD_c_enableLongDistanceMatching,
+ compress.long_distance, "long");
+
return cstream;
}
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 74f23ae7f74..bb898b06bb4 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -267,11 +267,12 @@ my %pgdump_runs = (
],
},
+ # Exercise long mode for test coverage
compression_zstd_plain => {
test_key => 'compression',
compile_option => 'zstd',
dump_cmd => [
- 'pg_dump', '--format=plain', '--compress=zstd',
+ 'pg_dump', '--format=plain', '--compress=zstd:long',
"--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
],
# Decompress the generated file to run through the tests.
diff --git a/src/bin/pg_verifybackup/t/008_untar.pl b/src/bin/pg_verifybackup/t/008_untar.pl
index 3007bbe8556..05754bc8ec7 100644
--- a/src/bin/pg_verifybackup/t/008_untar.pl
+++ b/src/bin/pg_verifybackup/t/008_untar.pl
@@ -49,6 +49,14 @@ my @test_configuration = (
'decompress_program' => $ENV{'ZSTD'},
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => [ '--compress', 'server-zstd:level=1,long' ],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => ['-d'],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
});
for my $tc (@test_configuration)
diff --git a/src/bin/pg_verifybackup/t/010_client_untar.pl b/src/bin/pg_verifybackup/t/010_client_untar.pl
index f3aa0f59e29..ac51a174d14 100644
--- a/src/bin/pg_verifybackup/t/010_client_untar.pl
+++ b/src/bin/pg_verifybackup/t/010_client_untar.pl
@@ -50,6 +50,14 @@ my @test_configuration = (
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
},
+ {
+ 'compression_method' => 'zstd',
+ 'backup_flags' => ['--compress', 'client-zstd:level=1,long'],
+ 'backup_archive' => 'base.tar.zst',
+ 'decompress_program' => $ENV{'ZSTD'},
+ 'decompress_flags' => [ '-d' ],
+ 'enabled' => check_pg_config("#define USE_ZSTD 1")
+ },
{
'compression_method' => 'parallel zstd',
'backup_flags' => [ '--compress', 'client-zstd:workers=3' ],
diff --git a/src/common/compression.c b/src/common/compression.c
index 2d3e56b4d62..35a7cade645 100644
--- a/src/common/compression.c
+++ b/src/common/compression.c
@@ -12,7 +12,7 @@
* Otherwise, a compression specification is a comma-separated list of items,
* each having the form keyword or keyword=value.
*
- * Currently, the only supported keywords are "level" and "workers".
+ * Currently, the supported keywords are "level", "long", and "workers".
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
*
@@ -38,6 +38,8 @@
static int expect_integer_value(char *keyword, char *value,
pg_compress_specification *result);
+static bool expect_boolean_value(char *keyword, char *value,
+ pg_compress_specification *result);
/*
* Look up a compression algorithm by name. Returns true and sets *algorithm
@@ -232,6 +234,11 @@ parse_compress_specification(pg_compress_algorithm algorithm, char *specificatio
result->workers = expect_integer_value(keyword, value, result);
result->options |= PG_COMPRESSION_OPTION_WORKERS;
}
+ else if (strcmp(keyword, "long") == 0)
+ {
+ result->long_distance = expect_boolean_value(keyword, value, result);
+ result->options |= PG_COMPRESSION_OPTION_LONG_DISTANCE;
+ }
else
result->parse_error =
psprintf(_("unrecognized compression option: \"%s\""), keyword);
@@ -289,6 +296,43 @@ expect_integer_value(char *keyword, char *value, pg_compress_specification *resu
return ivalue;
}
+/*
+ * Parse 'value' as a boolean and return the result.
+ *
+ * If parsing fails, set result->parse_error to an appropriate message
+ * and return -1. The caller must check result->parse_error to determine if
+ * the call was successful.
+ *
+ * Valid values are: yes, no, on, off, 1, 0.
+ *
+ * Inspired by ParseVariableBool().
+ */
+static bool
+expect_boolean_value(char *keyword, char *value, pg_compress_specification *result)
+{
+ if (value == NULL)
+ return true;
+
+ if (pg_strcasecmp(value, "yes") == 0)
+ return true;
+ if (pg_strcasecmp(value, "on") == 0)
+ return true;
+ if (pg_strcasecmp(value, "1") == 0)
+ return true;
+
+ if (pg_strcasecmp(value, "no") == 0)
+ return false;
+ if (pg_strcasecmp(value, "off") == 0)
+ return false;
+ if (pg_strcasecmp(value, "0") == 0)
+ return false;
+
+ result->parse_error =
+ psprintf(_("value for compression option \"%s\" must be a boolean"),
+ keyword);
+ return false;
+}
+
/*
* Returns NULL if the compression specification string was syntactically
* valid and semantically sensible. Otherwise, returns an error message.
@@ -354,6 +398,17 @@ validate_compress_specification(pg_compress_specification *spec)
get_compress_algorithm_name(spec->algorithm));
}
+ /*
+ * Of the compression algorithms that we currently support, only zstd
+ * supports long-distance mode.
+ */
+ if ((spec->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0 &&
+ (spec->algorithm != PG_COMPRESSION_ZSTD))
+ {
+ return psprintf(_("compression algorithm \"%s\" does not support long-distance mode"),
+ get_compress_algorithm_name(spec->algorithm));
+ }
+
return NULL;
}
diff --git a/src/include/common/compression.h b/src/include/common/compression.h
index b48c173022e..38aae9dd873 100644
--- a/src/include/common/compression.h
+++ b/src/include/common/compression.h
@@ -27,6 +27,7 @@ typedef enum pg_compress_algorithm
} pg_compress_algorithm;
#define PG_COMPRESSION_OPTION_WORKERS (1 << 0)
+#define PG_COMPRESSION_OPTION_LONG_DISTANCE (1 << 1)
typedef struct pg_compress_specification
{
@@ -34,6 +35,7 @@ typedef struct pg_compress_specification
unsigned options; /* OR of PG_COMPRESSION_OPTION constants */
int level;
int workers;
+ bool long_distance;
char *parse_error; /* NULL if parsing was OK, else message */
} pg_compress_specification;
--
2.34.1
0003-WIP-pg_dump-support-zstd-workers.patchtext/x-diff; charset=us-asciiDownload
From 1b78e26b5901de39c24677362c78391cb7c39b6a Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Thu, 30 Mar 2023 17:48:57 -0500
Subject: [PATCH 3/4] WIP: pg_dump: support zstd workers
This is a separate commit since it's not essential; the zstd library is
frequently compiled without threading support, so the functionality
isn't very well-tested, and because use of zstd threads might
conceivably play poorly with pg_dump's use of threads under Windows.
Targetting postgres v17.
---
doc/src/sgml/ref/pg_dump.sgml | 8 ++++++--
src/bin/pg_dump/compress_zstd.c | 4 ++++
src/bin/pg_dump/pg_dump.c | 4 ----
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index e81e35c13b3..1d55ce05b21 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -681,8 +681,12 @@ PostgreSQL documentation
as though it had been fed through <application>gzip</application>,
<application>lz4</application>, or <application>zstd</application>;
but the default is not to compress.
- With zstd compression, <literal>long</literal> mode may improve the
- compression ratio, at the cost of increased memory use.
+ With zstd compression, <literal>long</literal> and
+ <literal>workers</literal> options may be specified to enable long-distance
+ matching and threaded workers, respectively.
+ Long distance mode may improve the compression ratio, at the cost of
+ increased memory use.
+ Threaded workers allow leveraging multiple CPUs during compression.
</para>
<para>
The tar archive format currently does not support compression at all.
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index 49a877ce010..f1f84ad69c4 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -85,6 +85,10 @@ _ZstdCStreamParams(pg_compress_specification compress)
ZSTD_c_enableLongDistanceMatching,
compress.long_distance, "long");
+ if (compress.options & PG_COMPRESSION_OPTION_WORKERS)
+ _Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
+ compress.workers, "workers");
+
return cstream;
}
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index a426984046b..240dcdb0223 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -740,10 +740,6 @@ main(int argc, char **argv)
if (error_detail != NULL)
pg_fatal("%s", error_detail);
- if (compression_spec.options & PG_COMPRESSION_OPTION_WORKERS)
- pg_log_warning("compression option \"%s\" is not currently supported by pg_dump",
- "workers");
-
/*
* Custom and directory formats are compressed by default with gzip when
* available, not the others.
--
2.34.1
0004-TMP-pg_dump-use-Zstd-by-default-for-CI-only.patchtext/x-diff; charset=us-asciiDownload
From 6d8d1d6474b4a02689e21e59240188d4e621ef2a Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Wed, 4 Jan 2023 21:21:53 -0600
Subject: [PATCH 4/4] TMP: pg_dump: use Zstd by default, for CI only
//-os-only: linux-meson
---
.cirrus.yml | 9 ++++++++-
src/bin/pg_dump/compress_zstd.c | 9 +++++++++
src/bin/pg_dump/pg_dump.c | 4 ++--
src/bin/pg_dump/t/002_pg_dump.pl | 14 +++++++-------
4 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/.cirrus.yml b/.cirrus.yml
index 5b1747522f9..14402a0ad5c 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -267,6 +267,7 @@ LINUX_CONFIGURE_FEATURES: &LINUX_CONFIGURE_FEATURES >-
LINUX_MESON_FEATURES: &LINUX_MESON_FEATURES >-
-Dllvm=enabled
-Duuid=e2fs
+ -Dzstd=enabled
# Linux, both 32bit and 64bit
@@ -389,6 +390,9 @@ task:
configure_script: |
su postgres <<-EOF
+ mkdir subprojects
+ meson wrap install zstd
+ meson configure -D zstd:multithread=enabled --force-fallback-for=zstd
meson setup \
--buildtype=debug \
-Dcassert=true \
@@ -616,7 +620,10 @@ task:
# Use /DEBUG:FASTLINK to avoid high memory usage during linking
configure_script: |
vcvarsall x64
- meson setup --backend ninja --buildtype debug -Dc_link_args=/DEBUG:FASTLINK -Dcassert=true -Db_pch=true -Dextra_lib_dirs=c:\openssl\1.1\lib -Dextra_include_dirs=c:\openssl\1.1\include -DTAR=%TAR% -DPG_TEST_EXTRA="%PG_TEST_EXTRA%" build
+ mkdir subprojects
+ meson wrap install zstd
+ meson configure -D zstd:multithread=enabled --force-fallback-for=zstd
+ meson setup --backend ninja --buildtype debug -Dc_link_args=/DEBUG:FASTLINK -Dcassert=true -Db_pch=true -Dextra_lib_dirs=c:\openssl\1.1\lib -Dextra_include_dirs=c:\openssl\1.1\include -DTAR=%TAR% -DPG_TEST_EXTRA="%PG_TEST_EXTRA%" -D zstd=enabled build
build_script: |
vcvarsall x64
diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c
index f1f84ad69c4..c7be670b4a3 100644
--- a/src/bin/pg_dump/compress_zstd.c
+++ b/src/bin/pg_dump/compress_zstd.c
@@ -88,6 +88,15 @@ _ZstdCStreamParams(pg_compress_specification compress)
if (compress.options & PG_COMPRESSION_OPTION_WORKERS)
_Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_nbWorkers,
compress.workers, "workers");
+ else
+ {
+ size_t res;
+
+ res = ZSTD_CCtx_setParameter(cstream, ZSTD_c_nbWorkers, 3);
+ if (ZSTD_isError(res))
+ pg_log_warning("could not set compression parameter: \"%s\": %s",
+ "workers", ZSTD_getErrorName(res));
+ }
return cstream;
}
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 240dcdb0223..b90a1087f30 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -747,8 +747,8 @@ main(int argc, char **argv)
if ((archiveFormat == archCustom || archiveFormat == archDirectory) &&
!user_compression_defined)
{
-#ifdef HAVE_LIBZ
- parse_compress_specification(PG_COMPRESSION_GZIP, NULL,
+#ifdef USE_ZSTD
+ parse_compress_specification(PG_COMPRESSION_ZSTD, NULL,
&compression_spec);
#else
/* Nothing to do in the default case */
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index bb898b06bb4..0a635ae9fc3 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -385,10 +385,10 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_custom_format.dump", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
- name => 'data content is gzip-compressed by default if available',
+ name => 'data content is zstd-compressed by default if available',
},
},
@@ -410,16 +410,16 @@ my %pgdump_runs = (
command_like => {
command =>
[ 'pg_restore', '-l', "$tempdir/defaults_dir_format", ],
- expected => $supports_gzip ?
- qr/Compression: gzip/ :
+ expected => $supports_zstd ?
+ qr/Compression: zstd/ :
qr/Compression: none/,
name => 'data content is gzip-compressed by default',
},
glob_patterns => [
"$tempdir/defaults_dir_format/toc.dat",
"$tempdir/defaults_dir_format/blobs.toc",
- $supports_gzip ?
- "$tempdir/defaults_dir_format/*.dat.gz" :
+ $supports_zstd ?
+ "$tempdir/defaults_dir_format/*.dat.zst" :
"$tempdir/defaults_dir_format/*.dat",
],
},
--
2.34.1
On 4/4/23 05:04, Justin Pryzby wrote:
On Mon, Apr 03, 2023 at 11:26:09PM +0200, Tomas Vondra wrote:
On 4/3/23 21:17, Justin Pryzby wrote:
On Sat, Apr 01, 2023 at 10:26:01PM +0200, Tomas Vondra wrote:
Feel free to mess around with threads (but I'd much rather see the patch
progress for zstd:long).OK, understood. The long mode patch is pretty simple. IIUC it does not
change the format, i.e. in the worst case we could leave it for PG17
too. Correct?Right, libzstd only has one "format", which is the same as what's used
by the commandline tool. zstd:long doesn't change the format of the
output: the library just uses a larger memory buffer to allow better
compression. There's no format change for zstd:workers, either.OK. I plan to do a bit more review/testing on this, and get it committed
over the next day or two, likely including the long mode. One thing I
noticed today is that maybe long_distance should be a bool, not int.
Yes, ZSTD_c_enableLongDistanceMatching() accepts int, but it'd be
cleaner to cast the value during a call and keep it bool otherwise.Thanks for noticing. Evidently I wrote it using "int" to get the
feature working, and then later wrote the bool parsing bits but never
changed the data structure.This also updates a few comments, indentation, removes a useless
assertion, and updates the warning about zstd:workers.
Thanks. I've cleaned up the 0001 a little bit (a couple comment
improvements), updated the commit message and pushed it. I plan to take
care of the 0002 (long distance mode) tomorrow, and that'll be it for
PG16 I think.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 4/5/23 21:42, Tomas Vondra wrote:
On 4/4/23 05:04, Justin Pryzby wrote:
On Mon, Apr 03, 2023 at 11:26:09PM +0200, Tomas Vondra wrote:
On 4/3/23 21:17, Justin Pryzby wrote:
On Sat, Apr 01, 2023 at 10:26:01PM +0200, Tomas Vondra wrote:
Feel free to mess around with threads (but I'd much rather see the patch
progress for zstd:long).OK, understood. The long mode patch is pretty simple. IIUC it does not
change the format, i.e. in the worst case we could leave it for PG17
too. Correct?Right, libzstd only has one "format", which is the same as what's used
by the commandline tool. zstd:long doesn't change the format of the
output: the library just uses a larger memory buffer to allow better
compression. There's no format change for zstd:workers, either.OK. I plan to do a bit more review/testing on this, and get it committed
over the next day or two, likely including the long mode. One thing I
noticed today is that maybe long_distance should be a bool, not int.
Yes, ZSTD_c_enableLongDistanceMatching() accepts int, but it'd be
cleaner to cast the value during a call and keep it bool otherwise.Thanks for noticing. Evidently I wrote it using "int" to get the
feature working, and then later wrote the bool parsing bits but never
changed the data structure.This also updates a few comments, indentation, removes a useless
assertion, and updates the warning about zstd:workers.Thanks. I've cleaned up the 0001 a little bit (a couple comment
improvements), updated the commit message and pushed it. I plan to take
care of the 0002 (long distance mode) tomorrow, and that'll be it for
PG16 I think.
I looked at the long mode patch again, updated the commit message and
pushed it. I was wondering if long_mode should really be bool -
logically it is, but ZSTD_CCtx_setParameter() expects int. But I think
that's fine.
I think that's all for PG16 in this patch series. If there's more we
want to do, it'll have to wait for PG17 - Justin, can you update and
submit the patches that you think are relevant for the next CF?
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Thu, Apr 06, 2023 at 05:34:30PM +0200, Tomas Vondra wrote:
I looked at the long mode patch again, updated the commit message and
pushed it. I was wondering if long_mode should really be bool -
logically it is, but ZSTD_CCtx_setParameter() expects int. But I think
that's fine.
Thanks!
I think that's all for PG16 in this patch series. If there's more we want to
do, it'll have to wait for PG17 -
Yes
Justin, can you update and submit the patches that you think are relevant for
the next CF?
Yeah.
It sounds like a shiny new feature, but it's not totally clear if it's safe
here or even how useful it is. (It might be like my patch for
wal_compression=zstd:level, and Michael's for toast_compression=zstd, neither
of which saw any support).
Last year's basebackup thread had some interesting comments about safety of
threads, although pg_dump's considerations may be different.
The patch itself is trivial, so it'd be fine to wait until PG16 is released to
get some experience. If someone else wanted to do that, it'd be fine with me.
--
Justin
Justin Pryzby <pryzby@telsasoft.com> writes:
On Thu, Apr 06, 2023 at 05:34:30PM +0200, Tomas Vondra wrote:
I think that's all for PG16 in this patch series. If there's more we want to
do, it'll have to wait for PG17 -
Yes
Shouldn't the CF entry be closed as committed? It's certainly
making the cfbot unhappy because the patch-of-record doesn't apply.
regards, tom lane