pluggable compression support
Hi,
While hacking on the indirect toast support I felt the need to find out
how to make compression formats pluggable.
In
http://archives.postgresql.org/message-id/20130605150144.GD28067%40alap2.anarazel.de
I submitted an initial patch that showed some promising results.
Here's a more cleaned up version which isn't intermingled with indirect
toast tuple support anymore.
It still contains a GUC, as described in the message linked above, to control
the algorithm used for compressing new tuples, but I think we should remove
that GUC after testing.
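
For illustration, here is a minimal standalone sketch of the header-word
encoding the 0002 patch relies on: the snappy branch of toast_compress_datum()
stores "1 << 30 | rawsize" in the first four bytes of the compressed payload,
while pglz's own header leaves the two high bits zero, so those bits double as
an algorithm id (0 = pglz, 1 = snappy). The helper and type names below are
made up for this sketch, they are not part of the patch.

/*
 * Sketch of the rawsize-word encoding used by the 0002 patch.
 * The first uint32 of the compressed payload carries the uncompressed
 * size; its two high bits select the algorithm (0 = pglz, 1 = snappy).
 * Names are illustrative only.
 */
#include <stdint.h>

typedef enum
{
	TOAST_COMPRESS_PGLZ = 0,
	TOAST_COMPRESS_SNAPPY = 1
} toast_compression_id;

/* Build the word the snappy branch writes in toast_compress_datum(). */
static uint32_t
toast_tag_rawsize(toast_compression_id algo, uint32_t rawsize)
{
	return ((uint32_t) algo << 30) | rawsize;
}

/* Split it again, as toast_uncompressed_length() does before decompressing. */
static void
toast_untag_rawsize(uint32_t word, toast_compression_id *algo, uint32_t *rawsize)
{
	*algo = (toast_compression_id) (word >> 30);
	*rawsize = word & 0x3fffffff;
}

With the algorithm id stored in the datum itself, decompression never needs to
consult the GUC; it dispatches purely on those two bits.
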
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
Attachments:
0001-Add-snappy-compression-algorithm-to-contrib.patch (text/x-patch; charset=us-ascii)
From 80bc1e45cd25b2aaa84c7fed66c305ade910a69d Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Wed, 5 Jun 2013 15:29:47 +0200
Subject: [PATCH 1/2] Add snappy compression algorithm to /contrib
Copied (with minimal modifications) from https://github.com/andikleen/snappy-c
Code is licensed under a 3-clause BSD license and copyrighted by Google.
---
src/common/Makefile | 4 +-
src/common/snappy-compat.h | 57 ++
src/common/snappy-int.h | 71 ++
src/common/snappy.c | 1536 ++++++++++++++++++++++++++++++++++++
src/include/common/snappy/snappy.h | 34 +
5 files changed, 1701 insertions(+), 1 deletion(-)
create mode 100644 src/common/snappy-compat.h
create mode 100644 src/common/snappy-int.h
create mode 100644 src/common/snappy.c
create mode 100644 src/include/common/snappy/snappy.h
diff --git a/src/common/Makefile b/src/common/Makefile
index cd97980..fe1979a 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -20,10 +20,12 @@ subdir = src/common
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
+# XXX: override for snappy
+override CPPFLAGS := -Wno-declaration-after-statement $(CPPFLAGS)
override CPPFLAGS := -DFRONTEND $(CPPFLAGS)
LIBS += $(PTHREAD_LIBS)
-OBJS_COMMON = relpath.o
+OBJS_COMMON = $(SUBDIROBJS) relpath.o snappy.o
OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o
diff --git a/src/common/snappy-compat.h b/src/common/snappy-compat.h
new file mode 100644
index 0000000..69d1735
--- /dev/null
+++ b/src/common/snappy-compat.h
@@ -0,0 +1,57 @@
+#ifdef __FreeBSD__
+# include <sys/endian.h>
+#elif defined(__APPLE_CC_) || defined(__MACH__) /* MacOS/X support */
+# include <machine/endian.h>
+
+#if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN
+# define htole16(x) (x)
+# define le32toh(x) (x)
+#elif __DARWIN_BYTE_ORDER == __DARWIN_BIG_ENDIAN
+# define htole16(x) __DARWIN_OSSwapInt16(x)
+# define le32toh(x) __DARWIN_OSSwapInt32(x)
+#else
+# error "Endianness is undefined"
+#endif
+
+
+#else
+# include <endian.h>
+#endif
+
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/uio.h>
+
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned u32;
+typedef unsigned long long u64;
+
+#define BUG_ON(x) assert(!(x))
+
+#define get_unaligned(x) (*(x))
+#define get_unaligned_le32(x) (le32toh(*(u32 *)(x)))
+#define put_unaligned(v,x) (*(x) = (v))
+#define put_unaligned_le16(v,x) (*(u16 *)(x) = htole16(v))
+
+#define vmalloc(x) malloc(x)
+#define vfree(x) free(x)
+
+#define EXPORT_SYMBOL(x)
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define likely(x) __builtin_expect((x), 1)
+#define unlikely(x) __builtin_expect((x), 0)
+
+#define min_t(t,x,y) ((x) < (y) ? (x) : (y))
+#define max_t(t,x,y) ((x) > (y) ? (x) : (y))
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define __LITTLE_ENDIAN__ 1
+#endif
+
+#define BITS_PER_LONG (__SIZEOF_LONG__ * 8)
diff --git a/src/common/snappy-int.h b/src/common/snappy-int.h
new file mode 100644
index 0000000..ad31b54
--- /dev/null
+++ b/src/common/snappy-int.h
@@ -0,0 +1,71 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Various stubs for the open-source version of Snappy.
+
+#define likely(x) __builtin_expect(x, 1)
+#define unlikely(x) __builtin_expect(x, 0)
+
+#define CRASH_UNLESS(x) assert(x)
+#define CHECK(cond) assert(cond)
+#define CHECK_LE(a, b) CRASH_UNLESS((a) <= (b))
+#define CHECK_GE(a, b) CRASH_UNLESS((a) >= (b))
+#define CHECK_EQ(a, b) CRASH_UNLESS((a) == (b))
+#define CHECK_NE(a, b) CRASH_UNLESS((a) != (b))
+#define CHECK_LT(a, b) CRASH_UNLESS((a) < (b))
+#define CHECK_GT(a, b) CRASH_UNLESS((a) > (b))
+
+#define UNALIGNED_LOAD16(_p) (*(const uint16 *)(_p))
+#define UNALIGNED_LOAD32(_p) (*(const uint32 *)(_p))
+#define UNALIGNED_LOAD64(_p) (*(const uint64 *)(_p))
+
+#define UNALIGNED_STORE16(_p, _val) (*(uint16 *)(_p) = (_val))
+#define UNALIGNED_STORE32(_p, _val) (*(uint32 *)(_p) = (_val))
+#define UNALIGNED_STORE64(_p, _val) (*(uint64 *)(_p) = (_val))
+
+#ifdef NDEBUG
+
+#define DCHECK(cond) CRASH_UNLESS(true)
+#define DCHECK_LE(a, b) CRASH_UNLESS(true)
+#define DCHECK_GE(a, b) CRASH_UNLESS(true)
+#define DCHECK_EQ(a, b) CRASH_UNLESS(true)
+#define DCHECK_NE(a, b) CRASH_UNLESS(true)
+#define DCHECK_LT(a, b) CRASH_UNLESS(true)
+#define DCHECK_GT(a, b) CRASH_UNLESS(true)
+
+#else
+
+#define DCHECK(cond) CHECK(cond)
+#define DCHECK_LE(a, b) CHECK_LE(a, b)
+#define DCHECK_GE(a, b) CHECK_GE(a, b)
+#define DCHECK_EQ(a, b) CHECK_EQ(a, b)
+#define DCHECK_NE(a, b) CHECK_NE(a, b)
+#define DCHECK_LT(a, b) CHECK_LT(a, b)
+#define DCHECK_GT(a, b) CHECK_GT(a, b)
+
+#endif
diff --git a/src/common/snappy.c b/src/common/snappy.c
new file mode 100644
index 0000000..b61ed03
--- /dev/null
+++ b/src/common/snappy.c
@@ -0,0 +1,1536 @@
+/*
+ * C port of the snappy compressor from Google.
+ * This is a very fast compressor with comparable compression to lzo.
+ * Works best on 64bit little-endian, but should be good on others too.
+ * Ported by Andi Kleen.
+ * Based on snappy 1.0.3 plus some selected changes from SVN.
+ */
+
+/*
+ * Copyright 2005 Google Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "postgres.h"
+
+#include "common/snappy/snappy.h"
+#include "snappy-compat.h"
+
+#include <endian.h>
+
+#define CRASH_UNLESS(x) BUG_ON(!(x))
+#define CHECK(cond) CRASH_UNLESS(cond)
+#define CHECK_LE(a, b) CRASH_UNLESS((a) <= (b))
+#define CHECK_GE(a, b) CRASH_UNLESS((a) >= (b))
+#define CHECK_EQ(a, b) CRASH_UNLESS((a) == (b))
+#define CHECK_NE(a, b) CRASH_UNLESS((a) != (b))
+#define CHECK_LT(a, b) CRASH_UNLESS((a) < (b))
+#define CHECK_GT(a, b) CRASH_UNLESS((a) > (b))
+
+#define UNALIGNED_LOAD16(_p) get_unaligned((u16 *)(_p))
+#define UNALIGNED_LOAD32(_p) get_unaligned((u32 *)(_p))
+#define UNALIGNED_LOAD64(_p) get_unaligned((u64 *)(_p))
+
+#define UNALIGNED_STORE16(_p, _val) put_unaligned(_val, (u16 *)(_p))
+#define UNALIGNED_STORE32(_p, _val) put_unaligned(_val, (u32 *)(_p))
+#define UNALIGNED_STORE64(_p, _val) put_unaligned(_val, (u64 *)(_p))
+
+#ifdef NDEBUG
+
+#define DCHECK(cond) do {} while(0)
+#define DCHECK_LE(a, b) do {} while(0)
+#define DCHECK_GE(a, b) do {} while(0)
+#define DCHECK_EQ(a, b) do {} while(0)
+#define DCHECK_NE(a, b) do {} while(0)
+#define DCHECK_LT(a, b) do {} while(0)
+#define DCHECK_GT(a, b) do {} while(0)
+
+#else
+
+#define DCHECK(cond) CHECK(cond)
+#define DCHECK_LE(a, b) CHECK_LE(a, b)
+#define DCHECK_GE(a, b) CHECK_GE(a, b)
+#define DCHECK_EQ(a, b) CHECK_EQ(a, b)
+#define DCHECK_NE(a, b) CHECK_NE(a, b)
+#define DCHECK_LT(a, b) CHECK_LT(a, b)
+#define DCHECK_GT(a, b) CHECK_GT(a, b)
+
+#endif
+
+static inline bool is_little_endian(void)
+{
+#ifdef __LITTLE_ENDIAN__
+ return true;
+#endif
+ return false;
+}
+
+static inline int log2_floor(u32 n)
+{
+ return n == 0 ? -1 : 31 ^ __builtin_clz(n);
+}
+
+static inline int find_lsb_set_non_zero(u32 n)
+{
+ return __builtin_ctz(n);
+}
+
+static inline int find_lsb_set_non_zero64(u64 n)
+{
+ return __builtin_ctzll(n);
+}
+
+#define kmax32 5
+
+/*
+ * Attempts to parse a varint32 from a prefix of the bytes in [ptr,limit-1].
+ * Never reads a character at or beyond limit. If a valid/terminated varint32
+ * was found in the range, stores it in *OUTPUT and returns a pointer just
+ * past the last byte of the varint32. Else returns NULL. On success,
+ * "result <= limit".
+ */
+static inline const char *varint_parse32_with_limit(const char *p,
+ const char *l,
+ u32 * OUTPUT)
+{
+ const unsigned char *ptr = (const unsigned char *)(p);
+ const unsigned char *limit = (const unsigned char *)(l);
+ u32 b, result;
+
+ if (ptr >= limit)
+ return NULL;
+ b = *(ptr++);
+ result = b & 127;
+ if (b < 128)
+ goto done;
+ if (ptr >= limit)
+ return NULL;
+ b = *(ptr++);
+ result |= (b & 127) << 7;
+ if (b < 128)
+ goto done;
+ if (ptr >= limit)
+ return NULL;
+ b = *(ptr++);
+ result |= (b & 127) << 14;
+ if (b < 128)
+ goto done;
+ if (ptr >= limit)
+ return NULL;
+ b = *(ptr++);
+ result |= (b & 127) << 21;
+ if (b < 128)
+ goto done;
+ if (ptr >= limit)
+ return NULL;
+ b = *(ptr++);
+ result |= (b & 127) << 28;
+ if (b < 16)
+ goto done;
+ return NULL; /* Value is too long to be a varint32 */
+done:
+ *OUTPUT = result;
+ return (const char *)(ptr);
+}
+
+/*
+ * REQUIRES "ptr" points to a buffer of length sufficient to hold "v".
+ * EFFECTS Encodes "v" into "ptr" and returns a pointer to the
+ * byte just past the last encoded byte.
+ */
+static inline char *varint_encode32(char *sptr, u32 v)
+{
+ /* Operate on characters as unsigneds */
+ unsigned char *ptr = (unsigned char *)(sptr);
+ static const int B = 128;
+
+ if (v < (1 << 7)) {
+ *(ptr++) = v;
+ } else if (v < (1 << 14)) {
+ *(ptr++) = v | B;
+ *(ptr++) = v >> 7;
+ } else if (v < (1 << 21)) {
+ *(ptr++) = v | B;
+ *(ptr++) = (v >> 7) | B;
+ *(ptr++) = v >> 14;
+ } else if (v < (1 << 28)) {
+ *(ptr++) = v | B;
+ *(ptr++) = (v >> 7) | B;
+ *(ptr++) = (v >> 14) | B;
+ *(ptr++) = v >> 21;
+ } else {
+ *(ptr++) = v | B;
+ *(ptr++) = (v >> 7) | B;
+ *(ptr++) = (v >> 14) | B;
+ *(ptr++) = (v >> 21) | B;
+ *(ptr++) = v >> 28;
+ }
+ return (char *)(ptr);
+}
+
+#ifdef SG
+
+struct source {
+ struct iovec *iov;
+ int iovlen;
+ int curvec;
+ int curoff;
+ size_t total;
+};
+
+/* Only valid at beginning when nothing is consumed */
+static inline int available(struct source *s)
+{
+ return s->total;
+}
+
+static inline const char *peek(struct source *s, size_t *len)
+{
+ if (likely(s->curvec < s->iovlen)) {
+ struct iovec *iv = &s->iov[s->curvec];
+ if (s->curoff < iv->iov_len) {
+ *len = iv->iov_len - s->curoff;
+ return iv->iov_base + s->curoff;
+ }
+ }
+ *len = 0;
+ return NULL;
+}
+
+static inline void skip(struct source *s, size_t n)
+{
+ struct iovec *iv = &s->iov[s->curvec];
+ s->curoff += n;
+ DCHECK_LE(s->curoff, iv->iov_len);
+ if (s->curoff >= iv->iov_len && s->curvec + 1 < s->iovlen) {
+ s->curoff = 0;
+ s->curvec++;
+ }
+}
+
+struct sink {
+ struct iovec *iov;
+ int iovlen;
+ unsigned curvec;
+ unsigned curoff;
+ unsigned written;
+};
+
+static inline void append(struct sink *s, const char *data, size_t n)
+{
+ struct iovec *iov = &s->iov[s->curvec];
+ char *dst = iov->iov_base + s->curoff;
+ size_t nlen = min_t(size_t, iov->iov_len - s->curoff, n);
+ if (data != dst)
+ memcpy(dst, data, nlen);
+ s->written += n;
+ s->curoff += nlen;
+ while ((n -= nlen) > 0) {
+ data += nlen;
+ s->curvec++;
+ DCHECK_LT(s->curvec, s->iovlen);
+ iov++;
+ nlen = min_t(size_t, iov->iov_len, n);
+ memcpy(iov->iov_base, data, nlen);
+ s->curoff = nlen;
+ }
+}
+
+static inline void *sink_peek(struct sink *s, size_t n)
+{
+ struct iovec *iov = &s->iov[s->curvec];
+ if (s->curvec < iov->iov_len && iov->iov_len - s->curoff >= n)
+ return iov->iov_base + s->curoff;
+ return NULL;
+}
+
+#else
+
+struct source {
+ const char *ptr;
+ size_t left;
+};
+
+static inline int available(struct source *s)
+{
+ return s->left;
+}
+
+static inline const char *peek(struct source *s, size_t * len)
+{
+ *len = s->left;
+ return s->ptr;
+}
+
+static inline void skip(struct source *s, size_t n)
+{
+ s->left -= n;
+ s->ptr += n;
+}
+
+struct sink {
+ char *dest;
+};
+
+static inline void append(struct sink *s, const char *data, size_t n)
+{
+ if (data != s->dest)
+ memcpy(s->dest, data, n);
+ s->dest += n;
+}
+
+#define sink_peek(s, n) sink_peek_no_sg(s)
+
+static inline void *sink_peek_no_sg(const struct sink *s)
+{
+ return s->dest;
+}
+
+#endif
+
+struct writer {
+ char *base;
+ char *op;
+ char *op_limit;
+};
+
+/* Called before decompression */
+static inline void writer_set_expected_length(struct writer *w, size_t len)
+{
+ w->op_limit = w->op + len;
+}
+
+/* Called after decompression */
+static inline bool writer_check_length(struct writer *w)
+{
+ return w->op == w->op_limit;
+}
+
+/*
+ * Copy "len" bytes from "src" to "op", one byte at a time. Used for
+ * handling COPY operations where the input and output regions may
+ * overlap. For example, suppose:
+ * src == "ab"
+ * op == src + 2
+ * len == 20
+ * After IncrementalCopy(src, op, len), the result will have
+ * eleven copies of "ab"
+ * ababababababababababab
+ * Note that this does not match the semantics of either memcpy()
+ * or memmove().
+ */
+static inline void incremental_copy(const char *src, char *op, int len)
+{
+ DCHECK_GT(len, 0);
+ do {
+ *op++ = *src++;
+ } while (--len > 0);
+}
+
+/*
+ * Equivalent to IncrementalCopy except that it can write up to ten extra
+ * bytes after the end of the copy, and that it is faster.
+ *
+ * The main part of this loop is a simple copy of eight bytes at a time until
+ * we've copied (at least) the requested amount of bytes. However, if op and
+ * src are less than eight bytes apart (indicating a repeating pattern of
+ * length < 8), we first need to expand the pattern in order to get the correct
+ * results. For instance, if the buffer looks like this, with the eight-byte
+ * <src> and <op> patterns marked as intervals:
+ *
+ * abxxxxxxxxxxxx
+ * [------] src
+ * [------] op
+ *
+ * a single eight-byte copy from <src> to <op> will repeat the pattern once,
+ * after which we can move <op> two bytes without moving <src>:
+ *
+ * ababxxxxxxxxxx
+ * [------] src
+ * [------] op
+ *
+ * and repeat the exercise until the two no longer overlap.
+ *
+ * This allows us to do very well in the special case of one single byte
+ * repeated many times, without taking a big hit for more general cases.
+ *
+ * The worst case of extra writing past the end of the match occurs when
+ * op - src == 1 and len == 1; the last copy will read from byte positions
+ * [0..7] and write to [4..11], whereas it was only supposed to write to
+ * position 1. Thus, ten excess bytes.
+ */
+
+#define kmax_increment_copy_overflow 10
+
+static inline void incremental_copy_fast_path(const char *src, char *op,
+ int len)
+{
+ while (op - src < 8) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+ len -= op - src;
+ op += op - src;
+ }
+ while (len > 0) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+ src += 8;
+ op += 8;
+ len -= 8;
+ }
+}
+
+static inline bool writer_append_from_self(struct writer *w, u32 offset,
+ u32 len)
+{
+ char *const op = w->op;
+ CHECK_LE(op, w->op_limit);
+ const u32 space_left = w->op_limit - op;
+
+ if (op - w->base <= offset - 1u) /* -1u catches offset==0 */
+ return false;
+ if (len <= 16 && offset >= 8 && space_left >= 16) {
+ /* Fast path, used for the majority (70-80%) of dynamic
+ * invocations. */
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(op - offset));
+ UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(op - offset + 8));
+ } else {
+ if (space_left >= len + kmax_increment_copy_overflow) {
+ incremental_copy_fast_path(op - offset, op, len);
+ } else {
+ if (space_left < len) {
+ return false;
+ }
+ incremental_copy(op - offset, op, len);
+ }
+ }
+
+ w->op = op + len;
+ return true;
+}
+
+static inline bool writer_append(struct writer *w, const char *ip, u32 len)
+{
+ char *const op = w->op;
+ CHECK_LE(op, w->op_limit);
+ const u32 space_left = w->op_limit - op;
+ if (space_left < len)
+ return false;
+ memcpy(op, ip, len);
+ w->op = op + len;
+ return true;
+}
+
+static inline bool writer_try_fast_append(struct writer *w, const char *ip,
+ u32 available_bytes, u32 len)
+{
+ char *const op = w->op;
+ const int space_left = w->op_limit - op;
+ if (len <= 16 && available_bytes >= 16 && space_left >= 16) {
+ /* Fast path, used for the majority (~95%) of invocations */
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
+ UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+ w->op = op + len;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Any hash function will produce a valid compressed bitstream, but a good
+ * hash function reduces the number of collisions and thus yields better
+ * compression for compressible input, and more speed for incompressible
+ * input. Of course, it doesn't hurt if the hash function is reasonably fast
+ * either, as it gets called a lot.
+ */
+static inline u32 hash_bytes(u32 bytes, int shift)
+{
+ u32 kmul = 0x1e35a7bd;
+ return (bytes * kmul) >> shift;
+}
+
+static inline u32 hash(const char *p, int shift)
+{
+ return hash_bytes(UNALIGNED_LOAD32(p), shift);
+}
+
+/*
+ * Compressed data can be defined as:
+ * compressed := item* literal*
+ * item := literal* copy
+ *
+ * The trailing literal sequence has a space blowup of at most 62/60
+ * since a literal of length 60 needs one tag byte + one extra byte
+ * for length information.
+ *
+ * Item blowup is trickier to measure. Suppose the "copy" op copies
+ * 4 bytes of data. Because of a special check in the encoding code,
+ * we produce a 4-byte copy only if the offset is < 65536. Therefore
+ * the copy op takes 3 bytes to encode, and this type of item leads
+ * to at most the 62/60 blowup for representing literals.
+ *
+ * Suppose the "copy" op copies 5 bytes of data. If the offset is big
+ * enough, it will take 5 bytes to encode the copy op. Therefore the
+ * worst case here is a one-byte literal followed by a five-byte copy.
+ * I.e., 6 bytes of input turn into 7 bytes of "compressed" data.
+ *
+ * This last factor dominates the blowup, so the final estimate is:
+ */
+size_t snappy_max_compressed_length(size_t source_len)
+{
+ return 32 + source_len + source_len / 6;
+}
+EXPORT_SYMBOL(snappy_max_compressed_length);
+
+enum {
+ LITERAL = 0,
+ COPY_1_BYTE_OFFSET = 1, /* 3 bit length + 3 bits of offset in opcode */
+ COPY_2_BYTE_OFFSET = 2,
+ COPY_4_BYTE_OFFSET = 3
+};
+
+static inline char *emit_literal(char *op,
+ const char *literal,
+ int len, bool allow_fast_path)
+{
+ int n = len - 1; /* Zero-length literals are disallowed */
+
+ if (n < 60) {
+ /* Fits in tag byte */
+ *op++ = LITERAL | (n << 2);
+
+/*
+ * The vast majority of copies are below 16 bytes, for which a
+ * call to memcpy is overkill. This fast path can sometimes
+ * copy up to 15 bytes too much, but that is okay in the
+ * main loop, since we have a bit to go on for both sides:
+ *
+ * - The input will always have kInputMarginBytes = 15 extra
+ * available bytes, as long as we're in the main loop, and
+ * if not, allow_fast_path = false.
+ * - The output will always have 32 spare bytes (see
+ * MaxCompressedLength).
+ */
+ if (allow_fast_path && len <= 16) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(literal));
+ UNALIGNED_STORE64(op + 8,
+ UNALIGNED_LOAD64(literal + 8));
+ return op + len;
+ }
+ } else {
+ /* Encode in upcoming bytes */
+ char *base = op;
+ int count = 0;
+ op++;
+ while (n > 0) {
+ *op++ = n & 0xff;
+ n >>= 8;
+ count++;
+ }
+ DCHECK(count >= 1);
+ DCHECK(count <= 4);
+ *base = LITERAL | ((59 + count) << 2);
+ }
+ memcpy(op, literal, len);
+ return op + len;
+}
+
+static inline char *emit_copy_less_than64(char *op, int offset, int len)
+{
+ DCHECK_LE(len, 64);
+ DCHECK_GE(len, 4);
+ DCHECK_LT(offset, 65536);
+
+ if ((len < 12) && (offset < 2048)) {
+ int len_minus_4 = len - 4;
+ DCHECK(len_minus_4 < 8); /* Must fit in 3 bits */
+ *op++ =
+ COPY_1_BYTE_OFFSET | ((len_minus_4) << 2) | ((offset >> 8)
+ << 5);
+ *op++ = offset & 0xff;
+ } else {
+ *op++ = COPY_2_BYTE_OFFSET | ((len - 1) << 2);
+ put_unaligned_le16(offset, op);
+ op += 2;
+ }
+ return op;
+}
+
+static inline char *emit_copy(char *op, int offset, int len)
+{
+ /*
+ * Emit 64 byte copies but make sure to keep at least four bytes
+ * reserved
+ */
+ while (len >= 68) {
+ op = emit_copy_less_than64(op, offset, 64);
+ len -= 64;
+ }
+
+ /*
+ * Emit an extra 60 byte copy if have too much data to fit in
+ * one copy
+ */
+ if (len > 64) {
+ op = emit_copy_less_than64(op, offset, 60);
+ len -= 60;
+ }
+
+ /* Emit remainder */
+ op = emit_copy_less_than64(op, offset, len);
+ return op;
+}
+
+/**
+ * snappy_uncompressed_length - return length of uncompressed output.
+ * @start: compressed buffer
+ * @n: length of compressed buffer.
+ * @result: Write the length of the uncompressed output here.
+ *
+ * Returns true when successful, otherwise false.
+ */
+bool snappy_uncompressed_length(const char *start, size_t n, size_t * result)
+{
+ u32 v = 0;
+ const char *limit = start + n;
+ if (varint_parse32_with_limit(start, limit, &v) != NULL) {
+ *result = v;
+ return true;
+ } else {
+ return false;
+ }
+}
+EXPORT_SYMBOL(snappy_uncompressed_length);
+
+#define kblock_log 15
+#define kblock_size (1 << kblock_log)
+
+/*
+ * This value could be halved or quartered to save memory
+ * at the cost of slightly worse compression.
+ */
+#define kmax_hash_table_bits 14
+#define kmax_hash_table_size (1U << kmax_hash_table_bits)
+
+/*
+ * Use smaller hash table when input.size() is smaller, since we
+ * fill the table, incurring O(hash table size) overhead for
+ * compression, and if the input is short, we won't need that
+ * many hash table entries anyway.
+ */
+static u16 *get_hash_table(struct snappy_env *env, size_t input_size,
+ int *table_size)
+{
+ unsigned htsize = 256;
+
+ DCHECK(kmax_hash_table_size >= 256);
+ while (htsize < kmax_hash_table_size && htsize < input_size)
+ htsize <<= 1;
+ CHECK_EQ(0, htsize & (htsize - 1));
+ CHECK_LE(htsize, kmax_hash_table_size);
+
+ u16 *table;
+ table = env->hash_table;
+
+ *table_size = htsize;
+ memset(table, 0, htsize * sizeof(*table));
+ return table;
+}
+
+/*
+ * Return the largest n such that
+ *
+ * s1[0,n-1] == s2[0,n-1]
+ * and n <= (s2_limit - s2).
+ *
+ * Does not read *s2_limit or beyond.
+ * Does not read *(s1 + (s2_limit - s2)) or beyond.
+ * Requires that s2_limit >= s2.
+ *
+ * Separate implementation for x86_64, for speed. Uses the fact that
+ * x86_64 is little endian.
+ */
+#if defined(__LITTLE_ENDIAN__) && BITS_PER_LONG == 64
+static inline int find_match_length(const char *s1,
+ const char *s2, const char *s2_limit)
+{
+ int matched = 0;
+
+ DCHECK_GE(s2_limit, s2);
+ /*
+ * Find out how long the match is. We loop over the data 64 bits at a
+ * time until we find a 64-bit block that doesn't match; then we find
+ * the first non-matching bit and use that to calculate the total
+ * length of the match.
+ */
+ while (likely(s2 <= s2_limit - 8)) {
+ if (unlikely
+ (UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched))) {
+ s2 += 8;
+ matched += 8;
+ } else {
+ /*
+ * On current (mid-2008) Opteron models there
+ * is a 3% more efficient code sequence to
+ * find the first non-matching byte. However,
+ * what follows is ~10% better on Intel Core 2
+ * and newer, and we expect AMD's bsf
+ * instruction to improve.
+ */
+ u64 x =
+ UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 +
+ matched);
+ int matching_bits = find_lsb_set_non_zero64(x);
+ matched += matching_bits >> 3;
+ return matched;
+ }
+ }
+ while (likely(s2 < s2_limit)) {
+ if (likely(s1[matched] == *s2)) {
+ ++s2;
+ ++matched;
+ } else {
+ return matched;
+ }
+ }
+ return matched;
+}
+#else
+static inline int find_match_length(const char *s1,
+ const char *s2, const char *s2_limit)
+{
+ /* Implementation based on the x86-64 version, above. */
+ DCHECK_GE(s2_limit, s2);
+ int matched = 0;
+
+ while (s2 <= s2_limit - 4 &&
+ UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
+ s2 += 4;
+ matched += 4;
+ }
+ if (is_little_endian() && s2 <= s2_limit - 4) {
+ u32 x =
+ UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
+ int matching_bits = find_lsb_set_non_zero(x);
+ matched += matching_bits >> 3;
+ } else {
+ while ((s2 < s2_limit) && (s1[matched] == *s2)) {
+ ++s2;
+ ++matched;
+ }
+ }
+ return matched;
+}
+#endif
+
+/*
+ * For 0 <= offset <= 4, GetU32AtOffset(UNALIGNED_LOAD64(p), offset) will
+ * equal UNALIGNED_LOAD32(p + offset). Motivation: On x86-64 hardware we have
+ * empirically found that overlapping loads such as
+ * UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
+ * are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to u32.
+ */
+static inline u32 get_u32_at_offset(u64 v, int offset)
+{
+ DCHECK(0 <= offset && offset <= 4);
+ return v >> (is_little_endian()? 8 * offset : 32 - 8 * offset);
+}
+
+/*
+ * Flat array compression that does not emit the "uncompressed length"
+ * prefix. Compresses "input" string to the "*op" buffer.
+ *
+ * REQUIRES: "input" is at most "kBlockSize" bytes long.
+ * REQUIRES: "op" points to an array of memory that is at least
+ * "MaxCompressedLength(input.size())" in size.
+ * REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
+ * REQUIRES: "table_size" is a power of two
+ *
+ * Returns an "end" pointer into "op" buffer.
+ * "end - op" is the compressed size of "input".
+ */
+
+static char *compress_fragment(const char *const input,
+ const size_t input_size,
+ char *op, u16 * table, const unsigned table_size)
+{
+ /* "ip" is the input pointer, and "op" is the output pointer. */
+ const char *ip = input;
+ CHECK_LE(input_size, kblock_size);
+ CHECK_EQ(table_size & (table_size - 1), 0);
+ const int shift = 32 - log2_floor(table_size);
+ DCHECK_EQ(UINT_MAX >> shift, table_size - 1);
+ const char *ip_end = input + input_size;
+ const char *baseip = ip;
+ /*
+ * Bytes in [next_emit, ip) will be emitted as literal bytes. Or
+ * [next_emit, ip_end) after the main loop.
+ */
+ const char *next_emit = ip;
+
+ const unsigned kinput_margin_bytes = 15;
+
+ if (likely(input_size >= kinput_margin_bytes)) {
+ const char *const ip_limit = input + input_size -
+ kinput_margin_bytes;
+
+ u32 next_hash;
+ for (next_hash = hash(++ip, shift);;) {
+ DCHECK_LT(next_emit, ip);
+/*
+ * The body of this loop calls EmitLiteral once and then EmitCopy one or
+ * more times. (The exception is that when we're close to exhausting
+ * the input we goto emit_remainder.)
+ *
+ * In the first iteration of this loop we're just starting, so
+ * there's nothing to copy, so calling EmitLiteral once is
+ * necessary. And we only start a new iteration when the
+ * current iteration has determined that a call to EmitLiteral will
+ * precede the next call to EmitCopy (if any).
+ *
+ * Step 1: Scan forward in the input looking for a 4-byte-long match.
+ * If we get close to exhausting the input then goto emit_remainder.
+ *
+ * Heuristic match skipping: If 32 bytes are scanned with no matches
+ * found, start looking only at every other byte. If 32 more bytes are
+ * scanned, look at every third byte, etc.. When a match is found,
+ * immediately go back to looking at every byte. This is a small loss
+ * (~5% performance, ~0.1% density) for compressible data due to more
+ * bookkeeping, but for non-compressible data (such as JPEG) it's a huge
+ * win since the compressor quickly "realizes" the data is incompressible
+ * and doesn't bother looking for matches everywhere.
+ *
+ * The "skip" variable keeps track of how many bytes there are since the
+ * last match; dividing it by 32 (ie. right-shifting by five) gives the
+ * number of bytes to move ahead for each iteration.
+ */
+ u32 skip_bytes = 32;
+
+ const char *next_ip = ip;
+ const char *candidate;
+ do {
+ ip = next_ip;
+ u32 hval = next_hash;
+ DCHECK_EQ(hval, hash(ip, shift));
+ u32 bytes_between_hash_lookups = skip_bytes++ >> 5;
+ next_ip = ip + bytes_between_hash_lookups;
+ if (unlikely(next_ip > ip_limit)) {
+ goto emit_remainder;
+ }
+ next_hash = hash(next_ip, shift);
+ candidate = baseip + table[hval];
+ DCHECK_GE(candidate, baseip);
+ DCHECK_LT(candidate, ip);
+
+ table[hval] = ip - baseip;
+ } while (likely(UNALIGNED_LOAD32(ip) !=
+ UNALIGNED_LOAD32(candidate)));
+
+/*
+ * Step 2: A 4-byte match has been found. We'll later see if more
+ * than 4 bytes match. But, prior to the match, input
+ * bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
+ */
+ DCHECK_LE(next_emit + 16, ip_end);
+ op = emit_literal(op, next_emit, ip - next_emit, true);
+
+/*
+ * Step 3: Call EmitCopy, and then see if another EmitCopy could
+ * be our next move. Repeat until we find no match for the
+ * input immediately after what was consumed by the last EmitCopy call.
+ *
+ * If we exit this loop normally then we need to call EmitLiteral next,
+ * though we don't yet know how big the literal will be. We handle that
+ * by proceeding to the next iteration of the main loop. We also can exit
+ * this loop via goto if we get close to exhausting the input.
+ */
+ u64 input_bytes = 0;
+ u32 candidate_bytes = 0;
+
+ do {
+/*
+ * We have a 4-byte match at ip, and no need to emit any
+ * "literal bytes" prior to ip.
+ */
+ const char *base = ip;
+ int matched = 4 +
+ find_match_length(candidate + 4, ip + 4,
+ ip_end);
+ ip += matched;
+ int offset = base - candidate;
+ DCHECK_EQ(0, memcmp(base, candidate, matched));
+ op = emit_copy(op, offset, matched);
+/*
+ * We could immediately start working at ip now, but to improve
+ * compression we first update table[Hash(ip - 1, ...)].
+ */
+ const char *insert_tail = ip - 1;
+ next_emit = ip;
+ if (unlikely(ip >= ip_limit)) {
+ goto emit_remainder;
+ }
+ input_bytes = UNALIGNED_LOAD64(insert_tail);
+ u32 prev_hash =
+ hash_bytes(get_u32_at_offset
+ (input_bytes, 0), shift);
+ table[prev_hash] = ip - baseip - 1;
+ u32 cur_hash =
+ hash_bytes(get_u32_at_offset
+ (input_bytes, 1), shift);
+ candidate = baseip + table[cur_hash];
+ candidate_bytes = UNALIGNED_LOAD32(candidate);
+ table[cur_hash] = ip - baseip;
+ } while (get_u32_at_offset(input_bytes, 1) ==
+ candidate_bytes);
+
+ next_hash =
+ hash_bytes(get_u32_at_offset(input_bytes, 2),
+ shift);
+ ++ip;
+ }
+ }
+
+emit_remainder:
+ /* Emit the remaining bytes as a literal */
+ if (next_emit < ip_end)
+ op = emit_literal(op, next_emit, ip_end - next_emit, false);
+
+ return op;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Lookup table for decompression code. Generated by ComputeTable() below.
+ * -----------------------------------------------------------------------
+ */
+
+/* Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits */
+static const u32 wordmask[] = {
+ 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
+};
+
+/*
+ * Data stored per entry in lookup table:
+ * Range Bits-used Description
+ * ------------------------------------
+ * 1..64 0..7 Literal/copy length encoded in opcode byte
+ * 0..7 8..10 Copy offset encoded in opcode byte / 256
+ * 0..4 11..13 Extra bytes after opcode
+ *
+ * We use eight bits for the length even though 7 would have sufficed
+ * because of efficiency reasons:
+ * (1) Extracting a byte is faster than a bit-field
+ * (2) It properly aligns copy offset so we do not need a <<8
+ */
+static const u16 char_table[256] = {
+ 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
+ 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
+ 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
+ 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
+ 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
+ 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
+ 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
+ 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
+ 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
+ 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
+ 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
+ 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
+ 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
+ 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
+ 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
+ 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
+ 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
+ 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
+ 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
+ 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
+ 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
+ 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
+ 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
+ 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
+ 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
+ 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
+ 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
+ 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
+ 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
+ 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
+ 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
+ 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+};
+
+struct snappy_decompressor {
+ struct source *reader; /* Underlying source of bytes to decompress */
+ const char *ip; /* Points to next buffered byte */
+ const char *ip_limit; /* Points just past buffered bytes */
+ u32 peeked; /* Bytes peeked from reader (need to skip) */
+ bool eof; /* Hit end of input without an error? */
+ char scratch[5]; /* Temporary buffer for peekfast boundaries */
+};
+
+static void
+init_snappy_decompressor(struct snappy_decompressor *d, struct source *reader)
+{
+ d->reader = reader;
+ d->ip = NULL;
+ d->ip_limit = NULL;
+ d->peeked = 0;
+ d->eof = false;
+}
+
+static void exit_snappy_decompressor(struct snappy_decompressor *d)
+{
+ skip(d->reader, d->peeked);
+}
+
+/*
+ * Read the uncompressed length stored at the start of the compressed data.
+ * On success, stores the length in *result and returns true.
+ * On failure, returns false.
+ */
+static bool read_uncompressed_length(struct snappy_decompressor *d,
+ u32 * result)
+{
+ DCHECK(d->ip == NULL); /*
+ * Must not have read anything yet
+ * Length is encoded in 1..5 bytes
+ */
+ *result = 0;
+ u32 shift = 0;
+ while (true) {
+ if (shift >= 32)
+ return false;
+ size_t n;
+ const char *ip = peek(d->reader, &n);
+ if (n == 0)
+ return false;
+ const unsigned char c = *(const unsigned char *)(ip);
+ skip(d->reader, 1);
+ *result |= (u32) (c & 0x7f) << shift;
+ if (c < 128) {
+ break;
+ }
+ shift += 7;
+ }
+ return true;
+}
+
+static bool refill_tag(struct snappy_decompressor *d);
+
+/*
+ * Process the next item found in the input.
+ * Returns true if successful, false on error or end of input.
+ */
+static void decompress_all_tags(struct snappy_decompressor *d,
+ struct writer *writer)
+{
+ const char *ip = d->ip;
+
+ /*
+ * We could have put this refill fragment only at the beginning of the loop.
+ * However, duplicating it at the end of each branch gives the compiler more
+ * scope to optimize the <ip_limit_ - ip> expression based on the local
+ * context, which overall increases speed.
+ */
+#define MAYBE_REFILL() \
+ if (d->ip_limit - ip < 5) { \
+ d->ip = ip; \
+ if (!refill_tag(d)) return; \
+ ip = d->ip; \
+ }
+
+
+ MAYBE_REFILL();
+ for (;;) {
+ if (d->ip_limit - ip < 5) {
+ d->ip = ip;
+ if (!refill_tag(d))
+ return;
+ ip = d->ip;
+ }
+
+ const unsigned char c = *(const unsigned char *)(ip++);
+
+ if ((c & 0x3) == LITERAL) {
+ u32 literal_length = (c >> 2) + 1;
+ if (writer_try_fast_append(writer, ip, d->ip_limit - ip,
+ literal_length)) {
+ DCHECK_LT(literal_length, 61);
+ ip += literal_length;
+ MAYBE_REFILL();
+ continue;
+ }
+ if (unlikely(literal_length >= 61)) {
+ /* Long literal */
+ const u32 literal_ll = literal_length - 60;
+ literal_length = (get_unaligned_le32(ip) &
+ wordmask[literal_ll]) + 1;
+ ip += literal_ll;
+ }
+
+ u32 avail = d->ip_limit - ip;
+ while (avail < literal_length) {
+ if (!writer_append(writer, ip, avail))
+ return;
+ literal_length -= avail;
+ skip(d->reader, d->peeked);
+ size_t n;
+ ip = peek(d->reader, &n);
+ avail = n;
+ d->peeked = avail;
+ if (avail == 0)
+ return; /* Premature end of input */
+ d->ip_limit = ip + avail;
+ }
+ if (!writer_append(writer, ip, literal_length))
+ return;
+ ip += literal_length;
+ MAYBE_REFILL();
+ } else {
+ const u32 entry = char_table[c];
+ const u32 trailer = get_unaligned_le32(ip) &
+ wordmask[entry >> 11];
+ const u32 length = entry & 0xff;
+ ip += entry >> 11;
+
+ /*
+ * copy_offset/256 is encoded in bits 8..10.
+ * By just fetching those bits, we get
+ * copy_offset (since the bit-field starts at
+ * bit 8).
+ */
+ const u32 copy_offset = entry & 0x700;
+ if (!writer_append_from_self(writer,
+ copy_offset + trailer,
+ length))
+ return;
+ MAYBE_REFILL();
+ }
+ }
+}
+
+#undef MAYBE_REFILL
+
+static bool refill_tag(struct snappy_decompressor *d)
+{
+ const char *ip = d->ip;
+
+ if (ip == d->ip_limit) {
+ size_t n;
+ /* Fetch a new fragment from the reader */
+ skip(d->reader, d->peeked); /* All peeked bytes are used up */
+ ip = peek(d->reader, &n);
+ d->peeked = n;
+ if (n == 0) {
+ d->eof = true;
+ return false;
+ }
+ d->ip_limit = ip + n;
+ }
+
+ /* Read the tag character */
+ DCHECK_LT(ip, d->ip_limit);
+ const unsigned char c = *(const unsigned char *)(ip);
+ const u32 entry = char_table[c];
+ const u32 needed = (entry >> 11) + 1; /* +1 byte for 'c' */
+ DCHECK_LE(needed, sizeof(d->scratch));
+
+ /* Read more bytes from reader if needed */
+ u32 nbuf = d->ip_limit - ip;
+
+ if (nbuf < needed) {
+ /*
+ * Stitch together bytes from ip and reader to form the word
+ * contents. We store the needed bytes in "scratch". They
+ * will be consumed immediately by the caller since we do not
+ * read more than we need.
+ */
+ memmove(d->scratch, ip, nbuf);
+ skip(d->reader, d->peeked); /* All peeked bytes are used up */
+ d->peeked = 0;
+ while (nbuf < needed) {
+ size_t length;
+ const char *src = peek(d->reader, &length);
+ if (length == 0)
+ return false;
+ u32 to_add = min_t(u32, needed - nbuf, length);
+ memcpy(d->scratch + nbuf, src, to_add);
+ nbuf += to_add;
+ skip(d->reader, to_add);
+ }
+ DCHECK_EQ(nbuf, needed);
+ d->ip = d->scratch;
+ d->ip_limit = d->scratch + needed;
+ } else if (nbuf < 5) {
+ /*
+ * Have enough bytes, but move into scratch so that we do not
+ * read past end of input
+ */
+ memmove(d->scratch, ip, nbuf);
+ skip(d->reader, d->peeked); /* All peeked bytes are used up */
+ d->peeked = 0;
+ d->ip = d->scratch;
+ d->ip_limit = d->scratch + nbuf;
+ } else {
+ /* Pass pointer to buffer returned by reader. */
+ d->ip = ip;
+ }
+ return true;
+}
+
+static int internal_uncompress(struct source *r,
+ struct writer *writer, u32 max_len)
+{
+ struct snappy_decompressor decompressor;
+ u32 uncompressed_len = 0;
+
+ init_snappy_decompressor(&decompressor, r);
+
+ if (!read_uncompressed_length(&decompressor, &uncompressed_len))
+ return -EIO;
+ /* Protect against possible DoS attack */
+ if ((u64) (uncompressed_len) > max_len)
+ return -EIO;
+
+ writer_set_expected_length(writer, uncompressed_len);
+
+ /* Process the entire input */
+ decompress_all_tags(&decompressor, writer);
+
+ exit_snappy_decompressor(&decompressor);
+ return (decompressor.eof && writer_check_length(writer)) ? 0 : -EIO;
+}
+
+static inline int compress(struct snappy_env *env, struct source *reader,
+ struct sink *writer)
+{
+ int err;
+ size_t written = 0;
+ int N = available(reader);
+ char ulength[kmax32];
+ char *p = varint_encode32(ulength, N);
+
+ append(writer, ulength, p - ulength);
+ written += (p - ulength);
+
+ while (N > 0) {
+ /* Get next block to compress (without copying if possible) */
+ size_t fragment_size;
+ const char *fragment = peek(reader, &fragment_size);
+ if (fragment_size == 0) {
+ err = -EIO;
+ goto out;
+ }
+ const unsigned num_to_read = min_t(int, N, kblock_size);
+ size_t bytes_read = fragment_size;
+
+ int pending_advance = 0;
+ if (bytes_read >= num_to_read) {
+ /* Buffer returned by reader is large enough */
+ pending_advance = num_to_read;
+ fragment_size = num_to_read;
+ }
+ else {
+ memcpy(env->scratch, fragment, bytes_read);
+ skip(reader, bytes_read);
+
+ while (bytes_read < num_to_read) {
+ fragment = peek(reader, &fragment_size);
+ size_t n =
+ min_t(size_t, fragment_size,
+ num_to_read - bytes_read);
+ memcpy((char *)(env->scratch) + bytes_read, fragment, n);
+ bytes_read += n;
+ skip(reader, n);
+ }
+ DCHECK_EQ(bytes_read, num_to_read);
+ fragment = env->scratch;
+ fragment_size = num_to_read;
+ }
+ if (fragment_size < num_to_read)
+ return -EIO;
+
+ /* Get encoding table for compression */
+ int table_size;
+ u16 *table = get_hash_table(env, num_to_read, &table_size);
+
+ /* Compress input_fragment and append to dest */
+ char *dest;
+ dest = sink_peek(writer, snappy_max_compressed_length(num_to_read));
+ if (!dest) {
+ /*
+ * Need a scratch buffer for the output,
+ * because the byte sink doesn't have enough
+ * in one piece.
+ */
+ dest = env->scratch_output;
+ }
+ char *end = compress_fragment(fragment, fragment_size,
+ dest, table, table_size);
+ append(writer, dest, end - dest);
+ written += (end - dest);
+
+ N -= num_to_read;
+ skip(reader, pending_advance);
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+#ifdef SG
+
+int snappy_compress_iov(struct snappy_env *env,
+ struct iovec *iov_in,
+ int iov_in_len,
+ size_t input_length,
+ struct iovec *iov_out,
+ int *iov_out_len,
+ size_t *compressed_length)
+{
+ struct source reader = {
+ .iov = iov_in,
+ .iovlen = iov_in_len,
+ .total = input_length
+ };
+ struct sink writer = {
+ .iov = iov_out,
+ .iovlen = *iov_out_len,
+ };
+ int err = compress(env, &reader, &writer);
+
+ *iov_out_len = writer.curvec + 1;
+
+ /* Compute how many bytes were added */
+ *compressed_length = writer.written;
+ return err;
+}
+EXPORT_SYMBOL(snappy_compress_iov);
+
+/**
+ * snappy_compress - Compress a buffer using the snappy compressor.
+ * @env: Preallocated environment
+ * @input: Input buffer
+ * @input_length: Length of input_buffer
+ * @compressed: Output buffer for compressed data
+ * @compressed_length: The real length of the output written here.
+ *
+ * Return 0 on success, otherwise a negative error code.
+ *
+ * The output buffer must be at least
+ * snappy_max_compressed_length(input_length) bytes long.
+ *
+ * Requires a preallocated environment from snappy_init_env.
+ * The environment does not keep state over individual calls
+ * of this function, just preallocates the memory.
+ */
+int snappy_compress(struct snappy_env *env,
+ const char *input,
+ size_t input_length,
+ char *compressed, size_t *compressed_length)
+{
+ struct iovec iov_in = {
+ .iov_base = (char *)input,
+ .iov_len = input_length,
+ };
+ struct iovec iov_out = {
+ .iov_base = compressed,
+ .iov_len = 0xffffffff,
+ };
+ int out = 1;
+ return snappy_compress_iov(env,
+ &iov_in, 1, input_length,
+ &iov_out, &out, compressed_length);
+}
+EXPORT_SYMBOL(snappy_compress);
+
+int snappy_uncompress_iov(struct iovec *iov_in, int iov_in_len,
+ size_t input_len, char *uncompressed)
+{
+ struct source reader = {
+ .iov = iov_in,
+ .iovlen = iov_in_len,
+ .total = input_len
+ };
+ struct writer output = {
+ .base = uncompressed,
+ .op = uncompressed
+ };
+ return internal_uncompress(&reader, &output, 0xffffffff);
+}
+EXPORT_SYMBOL(snappy_uncompress_iov);
+
+/**
+ * snappy_uncompress - Uncompress a snappy compressed buffer
+ * @compressed: Input buffer with compressed data
+ * @n: length of compressed buffer
+ * @uncompressed: buffer for uncompressed data
+ *
+ * The uncompressed data buffer must be at least
+ * snappy_uncompressed_length(compressed) bytes long.
+ *
+ * Return 0 on success, otherwise a negative error code.
+ */
+int snappy_uncompress(const char *compressed, size_t n, char *uncompressed)
+{
+ struct iovec iov = {
+ .iov_base = (char *)compressed,
+ .iov_len = n
+ };
+ return snappy_uncompress_iov(&iov, 1, n, uncompressed);
+}
+EXPORT_SYMBOL(snappy_uncompress);
+
+#else
+/**
+ * snappy_compress - Compress a buffer using the snappy compressor.
+ * @env: Preallocated environment
+ * @input: Input buffer
+ * @input_length: Length of input_buffer
+ * @compressed: Output buffer for compressed data
+ * @compressed_length: The real length of the output written here.
+ *
+ * Return 0 on success, otherwise a negative error code.
+ *
+ * The output buffer must be at least
+ * snappy_max_compressed_length(input_length) bytes long.
+ *
+ * Requires a preallocated environment from snappy_init_env.
+ * The environment does not keep state over individual calls
+ * of this function, just preallocates the memory.
+ */
+int snappy_compress(struct snappy_env *env,
+ const char *input,
+ size_t input_length,
+ char *compressed, size_t *compressed_length)
+{
+ struct source reader = {
+ .ptr = input,
+ .left = input_length
+ };
+ struct sink writer = {
+ .dest = compressed,
+ };
+ int err = compress(env, &reader, &writer);
+
+ /* Compute how many bytes were added */
+ *compressed_length = (writer.dest - compressed);
+ return err;
+}
+EXPORT_SYMBOL(snappy_compress);
+
+/**
+ * snappy_uncompress - Uncompress a snappy compressed buffer
+ * @compressed: Input buffer with compressed data
+ * @n: length of compressed buffer
+ * @uncompressed: buffer for uncompressed data
+ *
+ * The uncompressed data buffer must be at least
+ * snappy_uncompressed_length(compressed) bytes long.
+ *
+ * Return 0 on success, otherwise a negative error code.
+ */
+int snappy_uncompress(const char *compressed, size_t n, char *uncompressed)
+{
+ struct source reader = {
+ .ptr = compressed,
+ .left = n
+ };
+ struct writer output = {
+ .base = uncompressed,
+ .op = uncompressed
+ };
+ return internal_uncompress(&reader, &output, 0xffffffff);
+}
+EXPORT_SYMBOL(snappy_uncompress);
+#endif
+
+#ifdef SG
+/**
+ * snappy_init_env_sg - Allocate snappy compression environment
+ * @env: Environment to preallocate
+ * @sg: Whether the environment will be used with scatter/gather (iovec) input
+ *
+ * If false is passed to sg then multiple entries in an iovec
+ * are not legal.
+ * Returns 0 on success, otherwise negative errno.
+ * Must run in process context.
+ */
+int snappy_init_env_sg(struct snappy_env *env, bool sg)
+{
+ env->hash_table = vmalloc(sizeof(u16) * kmax_hash_table_size);
+ if (!env->hash_table)
+ goto error;
+ if (sg) {
+ env->scratch = vmalloc(kblock_size);
+ if (!env->scratch)
+ goto error;
+ env->scratch_output =
+ vmalloc(snappy_max_compressed_length(kblock_size));
+ if (!env->scratch_output)
+ goto error;
+ }
+ return 0;
+error:
+ snappy_free_env(env);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(snappy_init_env_sg);
+#endif
+
+/**
+ * snappy_init_env - Allocate snappy compression environment
+ * @env: Environment to preallocate
+ *
+ * Passing multiple entries in an iovec is not allowed
+ * on the environment allocated here.
+ * Returns 0 on success, otherwise negative errno.
+ * Must run in process context.
+ */
+int snappy_init_env(struct snappy_env *env)
+{
+ env->hash_table = vmalloc(sizeof(u16) * kmax_hash_table_size);
+ if (!env->hash_table)
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL(snappy_init_env);
+
+/**
+ * snappy_free_env - Free a snappy compression environment
+ * @env: Environment to free.
+ *
+ * Must run in process context.
+ */
+void snappy_free_env(struct snappy_env *env)
+{
+ vfree(env->hash_table);
+#ifdef SG
+ vfree(env->scratch);
+ vfree(env->scratch_output);
+#endif
+ memset(env, 0, sizeof(struct snappy_env));
+}
+EXPORT_SYMBOL(snappy_free_env);
diff --git a/src/include/common/snappy/snappy.h b/src/include/common/snappy/snappy.h
new file mode 100644
index 0000000..f650f03
--- /dev/null
+++ b/src/include/common/snappy/snappy.h
@@ -0,0 +1,34 @@
+#ifndef _LINUX_SNAPPY_H
+#define _LINUX_SNAPPY_H 1
+
+/* Only needed for compression. This preallocates the worst case */
+struct snappy_env {
+ unsigned short *hash_table;
+ void *scratch;
+ void *scratch_output;
+};
+
+struct iovec;
+int snappy_init_env(struct snappy_env *env);
+int snappy_init_env_sg(struct snappy_env *env, bool sg);
+void snappy_free_env(struct snappy_env *env);
+int snappy_uncompress_iov(struct iovec *iov_in, int iov_in_len,
+ size_t input_len, char *uncompressed);
+int snappy_uncompress(const char *compressed, size_t n, char *uncompressed);
+int snappy_compress(struct snappy_env *env,
+ const char *input,
+ size_t input_length,
+ char *compressed,
+ size_t *compressed_length);
+int snappy_compress_iov(struct snappy_env *env,
+ struct iovec *iov_in,
+ int iov_in_len,
+ size_t input_length,
+ struct iovec *iov_out,
+ int *iov_out_len,
+ size_t *compressed_length);
+bool snappy_uncompressed_length(const char *buf, size_t len, size_t *result);
+size_t snappy_max_compressed_length(size_t source_len);
+
+
+#endif
--
1.8.2.rc2.4.g7799588.dirty
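
For reference, a round trip through the API that this first patch exports from
src/common might look like the frontend-style sketch below. It only shows the
call sequence (init, worst-case-length query, compress, length probe,
uncompress, free); the use of postgres_fe.h, malloc(), and the sample payload
are my own assumptions, not anything the patch ships.

/*
 * Sketch of a round trip through the snappy API added in patch 0001.
 * Assumes a frontend build with "common/snappy/snappy.h" on the include
 * path; error handling is deliberately crude.
 */
#include "postgres_fe.h"

#include <stdio.h>
#include <stdlib.h>

#include "common/snappy/snappy.h"

int
main(void)
{
	const char payload[] =
		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
	size_t		srclen = sizeof(payload) - 1;
	struct snappy_env env;
	char	   *compressed;
	char	   *restored;
	size_t		complen = 0;
	size_t		rawlen = 0;

	if (snappy_init_env(&env) != 0)
		return 1;				/* out of memory */

	/* the output buffer must cover the documented worst case */
	compressed = malloc(snappy_max_compressed_length(srclen));

	/* returns 0 on success, a negative error code otherwise */
	if (snappy_compress(&env, payload, srclen, compressed, &complen) != 0)
		return 1;

	/* the uncompressed size is stored as a varint at the buffer start */
	if (!snappy_uncompressed_length(compressed, complen, &rawlen))
		return 1;

	restored = malloc(rawlen);
	if (snappy_uncompress(compressed, complen, restored) != 0)
		return 1;

	printf("%zu bytes -> %zu compressed -> %zu restored\n",
		   srclen, complen, rawlen);

	free(restored);
	free(compressed);
	snappy_free_env(&env);
	return 0;
}

Patch 0002 below drives the same entry points from toast_compress_datum(),
with palloc'd buffers and a single long-lived snappy_env per backend.
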
0002-pluggable-compression.patch (text/x-patch; charset=us-ascii)
From dda7b9aba8a6314e3beb773f140bb733ebcdaa53 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 11 Jun 2013 12:23:51 +0200
Subject: [PATCH 2/2] pluggable compression
---
src/backend/access/heap/tuptoaster.c | 231 +++++++++++++++++++++++++++++------
src/backend/utils/misc/guc.c | 11 ++
src/include/access/tuptoaster.h | 1 +
src/include/postgres.h | 36 ++----
src/include/utils/pg_lzcompress.h | 4 +-
5 files changed, 220 insertions(+), 63 deletions(-)
diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c
index fc37ceb..12e3659 100644
--- a/src/backend/access/heap/tuptoaster.c
+++ b/src/backend/access/heap/tuptoaster.c
@@ -34,6 +34,7 @@
#include "access/heapam.h"
#include "access/tuptoaster.h"
#include "access/xact.h"
+#include "common/snappy/snappy.h"
#include "catalog/catalog.h"
#include "utils/fmgroids.h"
#include "utils/pg_lzcompress.h"
@@ -72,6 +73,7 @@ do { \
memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \
} while (0)
+int toast_compression_algo = 0;
static void toast_delete_datum(Relation rel, Datum value);
static Datum toast_save_datum(Relation rel, Datum value,
@@ -81,7 +83,8 @@ static bool toastid_valueid_exists(Oid toastrelid, Oid valueid);
static struct varlena *toast_fetch_datum(struct varlena * attr);
static struct varlena *toast_fetch_datum_slice(struct varlena * attr,
int32 sliceoffset, int32 length);
-
+static Datum toast_uncompress_datum(Datum attr);
+static Size toast_uncompressed_length(struct varlena *attr);
/* ----------
* heap_tuple_fetch_attr -
@@ -137,11 +140,11 @@ heap_tuple_untoast_attr(struct varlena * attr)
/* If it's compressed, decompress it */
if (VARATT_IS_COMPRESSED(attr))
{
- PGLZ_Header *tmp = (PGLZ_Header *) attr;
+ struct varlena *tmp = attr;
+
+ attr = (struct varlena *) DatumGetPointer(
+ toast_uncompress_datum(PointerGetDatum(attr)));
- attr = (struct varlena *) palloc(PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
- SET_VARSIZE(attr, PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
- pglz_decompress(tmp, VARDATA(attr));
pfree(tmp);
}
}
@@ -150,11 +153,8 @@ heap_tuple_untoast_attr(struct varlena * attr)
/*
* This is a compressed value inside of the main tuple
*/
- PGLZ_Header *tmp = (PGLZ_Header *) attr;
-
- attr = (struct varlena *) palloc(PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
- SET_VARSIZE(attr, PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
- pglz_decompress(tmp, VARDATA(attr));
+ attr = (struct varlena *) DatumGetPointer(
+ toast_uncompress_datum(PointerGetDatum(attr)));
}
else if (VARATT_IS_SHORT(attr))
{
@@ -209,15 +209,8 @@ heap_tuple_untoast_attr_slice(struct varlena * attr,
if (VARATT_IS_COMPRESSED(preslice))
{
- PGLZ_Header *tmp = (PGLZ_Header *) preslice;
- Size size = PGLZ_RAW_SIZE(tmp) + VARHDRSZ;
-
- preslice = (struct varlena *) palloc(size);
- SET_VARSIZE(preslice, size);
- pglz_decompress(tmp, VARDATA(preslice));
-
- if (tmp != (PGLZ_Header *) attr)
- pfree(tmp);
+ preslice = (struct varlena *) DatumGetPointer(
+ toast_uncompress_datum(PointerGetDatum(preslice)));
}
if (VARATT_IS_SHORT(preslice))
@@ -277,8 +270,7 @@ toast_raw_datum_size(Datum value)
}
else if (VARATT_IS_COMPRESSED(attr))
{
- /* here, va_rawsize is just the payload size */
- result = VARRAWSIZE_4B_C(attr) + VARHDRSZ;
+ result = toast_uncompressed_length(attr);
}
else if (VARATT_IS_SHORT(attr))
{
@@ -1179,8 +1171,11 @@ toast_flatten_tuple_attribute(Datum value,
Datum
toast_compress_datum(Datum value)
{
- struct varlena *tmp;
+ struct varlena *tmp = NULL;
int32 valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
+ int32 compressed_size;
+ Size buffer_size;
+ int ret;
Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value)));
Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value)));
@@ -1188,36 +1183,195 @@ toast_compress_datum(Datum value)
/*
* No point in wasting a palloc cycle if value size is out of the allowed
* range for compression
+ *
+ * XXX: Generalize concept, without referring to PGLZ
*/
if (valsize < PGLZ_strategy_default->min_input_size ||
valsize > PGLZ_strategy_default->max_input_size)
return PointerGetDatum(NULL);
- tmp = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(valsize));
+ /*
+ * choose compressor based on toast_compression_algo GUC.
+ * XXX: We probably rather want a storage attribute for that.
+ */
+ /* compress with pglz */
+ if (toast_compression_algo == 0)
+ {
+ tmp = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(valsize));
+
+ if (!pglz_compress(VARDATA_ANY(DatumGetPointer(value)), valsize,
+ (PGLZ_Header *) tmp, PGLZ_strategy_default))
+ goto incompressible;
+ /* pglz_compress sets rawsize internally */
+ }
+ /* compress with snappy */
+ else if (toast_compression_algo == 1)
+ {
+ static struct snappy_env *snappy_env = NULL;
+ if (snappy_env == NULL)
+ {
+ snappy_env = malloc(sizeof(struct snappy_env));
+ snappy_init_env(snappy_env);
+ }
+ /* ask compressor about buffer size */
+ buffer_size = snappy_max_compressed_length(valsize);
+ tmp = (struct varlena *) palloc(buffer_size + VARHDRSZ + 4);
+
+ ret = snappy_compress(snappy_env, VARDATA_ANY(DatumGetPointer(value)),
+ (size_t)valsize, ((char *)VARDATA(tmp)) + 4,
+ &buffer_size);
+ /* EIO is returned for incompressible data */
+ if (ret == EIO)
+ goto incompressible;
+ else if (ret != 0)
+ elog(ERROR, "compression failed: %d", ret);
+
+ /* encode compression algorithm in size */
+ *((uint32 *)VARDATA(tmp)) = 1 << 30 | valsize;
+ SET_VARSIZE_COMPRESSED(tmp, buffer_size + VARHDRSZ + 4);
+ }
+ else
+ elog(ERROR, "huh? There's not much between 1 and zero");
+
+ compressed_size = VARSIZE(tmp);
/*
- * We recheck the actual size even if pglz_compress() reports success,
- * because it might be satisfied with having saved as little as one byte
- * in the compressed data --- which could turn into a net loss once you
- * consider header and alignment padding. Worst case, the compressed
- * format might require three padding bytes (plus header, which is
- * included in VARSIZE(tmp)), whereas the uncompressed format would take
- * only one header byte and no padding if the value is short enough. So
- * we insist on a savings of more than 2 bytes to ensure we have a gain.
+ * Check whether the compression was sufficiently effective. Some of the
+ * compression methods check for blowing up to a larger amount of data than
+ * the source, some don't. Even if they do, like pglz_compress(), they
+ * might report success, having saved as little as one byte in the
+ * compressed data --- which could turn into a net loss once you consider
+ * header and alignment padding. Worst case, the compressed format might
+ * require three padding bytes (plus header, which is included in
+ * VARSIZE(tmp)), whereas the uncompressed format would take only one
+ * header byte and no padding if the value is short enough. So we insist
+ * on a savings of more than 2 bytes to ensure we have a gain.
*/
- if (pglz_compress(VARDATA_ANY(DatumGetPointer(value)), valsize,
- (PGLZ_Header *) tmp, PGLZ_strategy_default) &&
- VARSIZE(tmp) < valsize - 2)
+ if (compressed_size < valsize - 2)
{
/* successful compression */
return PointerGetDatum(tmp);
}
+
+ /* incompressible data */
+incompressible:
+ if (tmp != NULL)
+ pfree(tmp);
+ return PointerGetDatum(NULL);
+}
+
+static Size
+toast_uncompressed_length(struct varlena *attr)
+{
+#ifdef USE_ASSERT_CHECKING
+ uint8 compression_type;
+#endif
+ Size result;
+
+ Assert(VARATT_IS_COMPRESSED(attr));
+
+ /*
+ * Some compression methods have internal knowledge about the datum
+ * length. Crosscheck.
+ */
+#ifdef USE_ASSERT_CHECKING
+ compression_type = (*(uint32 *) (VARDATA(attr))) >> 30;
+
+ /* pglz stores the size as uint32 at the beginning */
+ if (compression_type == 0)
+ {
+ /* here, rawsize is just the payload size */
+ result = PGLZ_RAW_SIZE((PGLZ_Header *)attr);
+ }
+
+ /* snappy encodes the length as a varint */
+ else if (compression_type == 1)
+ {
+ if (!snappy_uncompressed_length(((char *)VARDATA(attr)) + 4,
+ VARSIZE_ANY_EXHDR(attr) - 4,
+ &result))
+ elog(ERROR, "could not read uncompressed size");
+ }
else
{
- /* incompressible data */
- pfree(tmp);
- return PointerGetDatum(NULL);
+ elog(ERROR, "unknown compression method %u", (uint32)compression_type);
+ }
+ Assert(((*(uint32 *) (VARDATA(attr))) & 0x3fffffff) == result);
+#endif
+
+ result = (*(uint32 *) (VARDATA(attr))) & 0x3fffffff;
+
+ /* varlena overhead */
+ result += VARHDRSZ;
+ return result;
+}
+
+static Datum
+toast_uncompress_datum(Datum value)
+{
+ struct varlena *attr = (struct varlena *) DatumGetPointer(value);
+ uint8 compression_type;
+
+ Assert(VARATT_IS_4B_C(value));
+
+ /* ----
+ * Disambiguate between compression strategies:
+ *
+ * In PGLZ - the formerly only compression method - the first 4 bytes are
+ * used to store the raw size of the datum as a signed integer. Since that
+ * cannot be more than 1GB due to toast limitations we have the 2 high bits
+ * to disambiguate whether its pglz or something more modern. We cannot
+ * change the meaning of Datums with the first 2 bits unset since we need
+ * to support the old ondisk format.
+ *
+ * If it's not pglz we store 1 byte of 1's and then 1 byte determining the
+ * compression method. We could just use the two bytes to store 3 other
+ * compression methods, but we'd better not paint ourselves into a
+ * corner again.
+ * ----
+ */
+ compression_type = (*(uint32 *) VARDATA(value)) >> 30;
+
+ if (compression_type == 0)
+ {
+ PGLZ_Header *tmp = (PGLZ_Header *) DatumGetPointer(value);
+ attr = (struct varlena *) palloc(PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
+ SET_VARSIZE(attr, PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
+ pglz_decompress(tmp, VARDATA(attr));
+ }
+ else if (compression_type == 1)
+ {
+ void *compressed_data;
+ Size compressed_length;
+ int ret;
+ Size uncompressed_length;
+
+ compressed_data = ((char *)VARDATA(attr)) + 4;
+ compressed_length = VARSIZE_ANY_EXHDR(attr) - 4;
+
+ ret = snappy_uncompressed_length(compressed_data,
+ compressed_length,
+ &uncompressed_length);
+ if (!ret)
+ elog(ERROR, "failed to determine compression length");
+ if (uncompressed_length != ((*(uint32 *) (VARDATA(attr))) & 0x3ffffff))
+ elog(ERROR, "compression size mismatch");
+
+ attr = (struct varlena *) palloc(uncompressed_length + VARHDRSZ);
+ SET_VARSIZE(attr, uncompressed_length + VARHDRSZ);
+
+ ret = snappy_uncompress(compressed_data,
+ compressed_length,
+ VARDATA(attr));
+ if (ret != 0)
+ elog(ERROR, "decompression failed: %d", ret);
+ }
+ else
+ {
+ elog(ERROR, "unknown extended compression method %c",
+ compression_type);
}
+ return PointerGetDatum(attr);
}
@@ -1284,10 +1438,11 @@ toast_save_datum(Relation rel, Datum value,
}
else if (VARATT_IS_COMPRESSED(dval))
{
+ struct varlena *dval_a = (struct varlena *) dval;
data_p = VARDATA(dval);
data_todo = VARSIZE(dval) - VARHDRSZ;
/* rawsize in a compressed datum is just the size of the payload */
- toast_pointer.va_rawsize = VARRAWSIZE_4B_C(dval) + VARHDRSZ;
+ toast_pointer.va_rawsize = toast_uncompressed_length(dval_a);
toast_pointer.va_extsize = data_todo;
/* Assert that the numbers look like it's compressed */
Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ea16c64..5f3b4f5 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -28,6 +28,7 @@
#include "access/gin.h"
#include "access/transam.h"
+#include "access/tuptoaster.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "catalog/namespace.h"
@@ -1889,6 +1890,16 @@ static struct config_int ConfigureNamesInt[] =
},
{
+ {"toast_compression_algo", PGC_USERSET, CLIENT_CONN_STATEMENT,
+ gettext_noop("chooses the compression algo: 0 pglz, 1: snappy."),
+ NULL
+ },
+ &toast_compression_algo,
+ 0, 0, 1,
+ NULL, NULL, NULL
+ },
+
+ {
{"vacuum_freeze_min_age", PGC_USERSET, CLIENT_CONN_STATEMENT,
gettext_noop("Minimum age at which VACUUM should freeze a table row."),
NULL
diff --git a/src/include/access/tuptoaster.h b/src/include/access/tuptoaster.h
index 6f4fc45..97334fd 100644
--- a/src/include/access/tuptoaster.h
+++ b/src/include/access/tuptoaster.h
@@ -94,6 +94,7 @@
sizeof(int32) - \
VARHDRSZ)
+extern int toast_compression_algo;
/* ----------
* toast_insert_or_update -
diff --git a/src/include/postgres.h b/src/include/postgres.h
index 30e1dee..6fd70c6 100644
--- a/src/include/postgres.h
+++ b/src/include/postgres.h
@@ -81,21 +81,15 @@ struct varatt_external
* compiler might otherwise think it could generate code that assumes
* alignment while touching fields of a 1-byte-header varlena.
*/
-typedef union
+
+/* Normal and inline compressed varlena (4-byte length) */
+typedef struct
{
- struct /* Normal varlena (4-byte length) */
- {
- uint32 va_header;
- char va_data[1];
- } va_4byte;
- struct /* Compressed-in-line format */
- {
- uint32 va_header;
- uint32 va_rawsize; /* Original data size (excludes header) */
- char va_data[1]; /* Compressed data */
- } va_compressed;
+ uint32 va_header;
+ char va_data[1];
} varattrib_4b;
+/* short inline uncompressed varlena (1-byte length) */
typedef struct
{
uint8 va_header;
@@ -158,16 +152,16 @@ typedef struct
/* VARSIZE_4B() should only be used on known-aligned data */
#define VARSIZE_4B(PTR) \
- (((varattrib_4b *) (PTR))->va_4byte.va_header & 0x3FFFFFFF)
+ (((varattrib_4b *) (PTR))->va_header & 0x3FFFFFFF)
#define VARSIZE_1B(PTR) \
(((varattrib_1b *) (PTR))->va_header & 0x7F)
#define VARSIZE_1B_E(PTR) \
(((varattrib_1b_e *) (PTR))->va_len_1be)
#define SET_VARSIZE_4B(PTR,len) \
- (((varattrib_4b *) (PTR))->va_4byte.va_header = (len) & 0x3FFFFFFF)
+ (((varattrib_4b *) (PTR))->va_header = (len) & 0x3FFFFFFF)
#define SET_VARSIZE_4B_C(PTR,len) \
- (((varattrib_4b *) (PTR))->va_4byte.va_header = ((len) & 0x3FFFFFFF) | 0x40000000)
+ (((varattrib_4b *) (PTR))->va_header = ((len) & 0x3FFFFFFF) | 0x40000000)
#define SET_VARSIZE_1B(PTR,len) \
(((varattrib_1b *) (PTR))->va_header = (len) | 0x80)
#define SET_VARSIZE_1B_E(PTR,len) \
@@ -190,16 +184,16 @@ typedef struct
/* VARSIZE_4B() should only be used on known-aligned data */
#define VARSIZE_4B(PTR) \
- ((((varattrib_4b *) (PTR))->va_4byte.va_header >> 2) & 0x3FFFFFFF)
+ ((((varattrib_4b *) (PTR))->va_header >> 2) & 0x3FFFFFFF)
#define VARSIZE_1B(PTR) \
((((varattrib_1b *) (PTR))->va_header >> 1) & 0x7F)
#define VARSIZE_1B_E(PTR) \
(((varattrib_1b_e *) (PTR))->va_len_1be)
#define SET_VARSIZE_4B(PTR,len) \
- (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2))
+ (((varattrib_4b *) (PTR))->va_header = (((uint32) (len)) << 2))
#define SET_VARSIZE_4B_C(PTR,len) \
- (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2) | 0x02)
+ (((varattrib_4b *) (PTR))->va_header = (((uint32) (len)) << 2) | 0x02)
#define SET_VARSIZE_1B(PTR,len) \
(((varattrib_1b *) (PTR))->va_header = (((uint8) (len)) << 1) | 0x01)
#define SET_VARSIZE_1B_E(PTR,len) \
@@ -217,14 +211,10 @@ typedef struct
#define VARHDRSZ_EXTERNAL 2
-#define VARDATA_4B(PTR) (((varattrib_4b *) (PTR))->va_4byte.va_data)
-#define VARDATA_4B_C(PTR) (((varattrib_4b *) (PTR))->va_compressed.va_data)
+#define VARDATA_4B(PTR) (((varattrib_4b *) (PTR))->va_data)
#define VARDATA_1B(PTR) (((varattrib_1b *) (PTR))->va_data)
#define VARDATA_1B_E(PTR) (((varattrib_1b_e *) (PTR))->va_data)
-#define VARRAWSIZE_4B_C(PTR) \
- (((varattrib_4b *) (PTR))->va_compressed.va_rawsize)
-
/* Externally visible macros */
/*
diff --git a/src/include/utils/pg_lzcompress.h b/src/include/utils/pg_lzcompress.h
index 4af24a3..4ee7308 100644
--- a/src/include/utils/pg_lzcompress.h
+++ b/src/include/utils/pg_lzcompress.h
@@ -19,8 +19,8 @@
*/
typedef struct PGLZ_Header
{
- int32 vl_len_; /* varlena header (do not touch directly!) */
- int32 rawsize;
+ uint32 vl_len_; /* varlena header (do not touch directly!) */
+ uint32 rawsize;
} PGLZ_Header;
--
1.8.2.rc2.4.g7799588.dirty
On 06/14/2013 04:01 PM, Andres Freund wrote:
It still contains a guc as described in the above message to control the
algorithm used for compressing new tuples but I think we should remove
that guc after testing.
Did you add the storage attribute?
--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com
On 2013-06-14 17:12:01 -0700, Josh Berkus wrote:
On 06/14/2013 04:01 PM, Andres Freund wrote:
It still contains a guc as described in the above message to control the
algorithm used for compressing new tuples but I think we should remove
that guc after testing.
Did you add the storage attribute?
No. I think as long as we only have pglz and one new algorithm (even if
that is lz4 instead of the current snappy) we should just always use the
new algorithm. Unless I missed it nobody seemed to have voiced a
contrary position?
For testing/evaluation the guc seems to be sufficient.
If we want to make it configurable on a per column basis I think the way
to go is to add a new column to pg_attribute and split compression
related things out of attstorage into attcompression.
That's a fair amount of work and it includes a minor compatibility break
in the catalog format, so I'd prefer not to do it until there's a good
reason to do so.
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
No. I think as long as we only have pglz and one new algorithm (even if
that is lz4 instead of the current snappy) we should just always use the
new algorithm. Unless I missed it nobody seemed to have voiced a
contrary position?
For testing/evaluation the guc seems to be sufficient.
Then it's not "pluggable", is it? It's "upgradable compression
support", if anything. Which is fine, but let's not confuse people.
--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com
On 2013-06-14 17:35:02 -0700, Josh Berkus wrote:
No. I think as long as we only have pglz and one new algorithm (even if
that is lz4 instead of the current snappy) we should just always use the
new algorithm. Unless I missed it nobody seemed to have voiced a
contrary position?
For testing/evaluation the guc seems to be sufficient.
Then it's not "pluggable", is it? It's "upgradable compression
support", if anything. Which is fine, but let's not confuse people.
The point is that it's pluggable on the storage level in the sense of
that several different algorithms can coexist and new ones can
relatively easily added.
That part is what seems to have blocked progress for quite a while
now. So fixing that seems to be the interesting thing.
I am happy enough to do the work of making it configurable if we want it
to be... But I have zap interest of doing it and throw it away in the
end because we decide we don't need it.
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
On Fri, Jun 14, 2013 at 8:45 PM, Andres Freund <andres@2ndquadrant.com> wrote:
On 2013-06-14 17:35:02 -0700, Josh Berkus wrote:
No. I think as long as we only have pglz and one new algorithm (even if
that is lz4 instead of the current snappy) we should just always use the
new algorithm. Unless I missed it nobody seemed to have voiced a
contrary position?
For testing/evaluation the guc seems to be sufficient.
Then it's not "pluggable", is it? It's "upgradable compression
support", if anything. Which is fine, but let's not confuse people.
The point is that it's pluggable on the storage level in the sense of
that several different algorithms can coexist and new ones can
relatively easily added.
That part is what seems to have blocked progress for quite a while
now. So fixing that seems to be the interesting thing.
I am happy enough to do the work of making it configurable if we want it
to be... But I have zap interest of doing it and throw it away in the
end because we decide we don't need it.
I don't think we need it. I think what we need to decide is which
algorithm is legally OK to use. And then put it in.
In the past, we've had a great deal of speculation about that legal
question from people who are not lawyers. Maybe it would be valuable
to get some opinions from people who ARE lawyers. Tom and Heikki both
work for real big companies which, I'm guessing, have substantial
legal departments; perhaps they could pursue getting the algorithms of
possible interest vetted. Or, I could try to find out whether it's
possible do something similar through EnterpriseDB.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 06/14/2013 06:56 PM, Robert Haas wrote:
On Fri, Jun 14, 2013 at 8:45 PM, Andres Freund <andres@2ndquadrant.com> wrote:
On 2013-06-14 17:35:02 -0700, Josh Berkus wrote:
No. I think as long as we only have pglz and one new algorithm (even if
that is lz4 instead of the current snappy) we should just always use the
new algorithm. Unless I missed it nobody seemed to have voiced a
contrary position?
For testing/evaluation the guc seems to be sufficient.
Then it's not "pluggable", is it? It's "upgradable compression
support", if anything. Which is fine, but let's not confuse people.
The point is that it's pluggable on the storage level in the sense of
that several different algorithms can coexist and new ones can
relatively easily added.
That part is what seems to have blocked progress for quite a while
now. So fixing that seems to be the interesting thing.
I am happy enough to do the work of making it configurable if we want it
to be... But I have zap interest of doing it and throw it away in the
end because we decide we don't need it.
I don't think we need it. I think what we need to decide is which
algorithm is legally OK to use. And then put it in.
In the past, we've had a great deal of speculation about that legal
question from people who are not lawyers. Maybe it would be valuable
to get some opinions from people who ARE lawyers. Tom and Heikki both
work for real big companies which, I'm guessing, have substantial
legal departments; perhaps they could pursue getting the algorithms of
possible interest vetted. Or, I could try to find out whether it's
possible do something similar through EnterpriseDB.
We have IP legal representation through Software in the Public interest
who pretty much specializes in this type of thing.
Should I follow up? If so, I need a summary of the exact question
including licenses etc.
JD
--
Command Prompt, Inc. - http://www.commandprompt.com/ 509-416-6579
PostgreSQL Support, Training, Professional Services and Development
High Availability, Oracle Conversion, Postgres-XC, @cmdpromptinc
For my dreams of your image that blossoms
a rose in the deeps of my heart. - W.B. Yeats
On 2013-06-14 21:56:52 -0400, Robert Haas wrote:
I don't think we need it. I think what we need to decide is which
algorithm is legally OK to use. And then put it in.
In the past, we've had a great deal of speculation about that legal
question from people who are not lawyers. Maybe it would be valuable
to get some opinions from people who ARE lawyers. Tom and Heikki both
work for real big companies which, I'm guessing, have substantial
legal departments; perhaps they could pursue getting the algorithms of
possible interest vetted. Or, I could try to find out whether it's
possible do something similar through EnterpriseDB.
I personally don't think the legal argument holds all that much water
for snappy and lz4. But then the opinion of a European non-lawyer doesn't
hold much either.
Both are widely used by a large number open and closed projects, some of
which have patent grant clauses in their licenses. E.g. hadoop,
cassandra use lz4, and I'd be surprised if the companies behind those
have opened themselves to litigation.
I think we should preliminarily decide which algorithm to use before we
get lawyers involved. I'd be surprised if they can make such an analysis
faster than we can rule out one of them via benchmarks.
Will post an updated patch that includes lz4 as well.
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
On 06/15/2013 03:56 AM, Robert Haas wrote:
On Fri, Jun 14, 2013 at 8:45 PM, Andres Freund <andres@2ndquadrant.com> wrote:
On 2013-06-14 17:35:02 -0700, Josh Berkus wrote:
No. I think as long as we only have pglz and one new algorithm (even if
that is lz4 instead of the current snappy) we should just always use the
new algorithm. Unless I missed it nobody seemed to have voiced a
contrary position?
For testing/evaluation the guc seems to be sufficient.
Then it's not "pluggable", is it? It's "upgradable compression
support", if anything. Which is fine, but let's not confuse people.
The point is that it's pluggable on the storage level in the sense of
that several different algorithms can coexist and new ones can
relatively easily added.
That part is what seems to have blocked progress for quite a while
now. So fixing that seems to be the interesting thing.
I am happy enough to do the work of making it configurable if we want it
to be... But I have zap interest of doing it and throw it away in the
end because we decide we don't need it.
I don't think we need it. I think what we need to decide is which
algorithm is legally OK to use. And then put it in.
If it were truly pluggable - that is just load a .dll, set a GUC and start
using it - then the selection of algorithms would be much
wider as several slow-but-high-compression ones under GPL could be
used as well, similar to how currently PostGIS works.
gzip and bzip2 are the first two that come to mind, but I am sure there
are more.
In the past, we've had a great deal of speculation about that legal
question from people who are not lawyers. Maybe it would be valuable
to get some opinions from people who ARE lawyers.
Making a truly pluggable compression API delegates this question
to end users.
Delegation is good, as it lets you get done more :)
--
Hannu Krosing
PostgreSQL Consultant
Performance, Scalability and High Availability
2ndQuadrant Nordic OÜ
On 06/15/2013 02:20 AM, Andres Freund wrote:
On 2013-06-14 17:12:01 -0700, Josh Berkus wrote:
On 06/14/2013 04:01 PM, Andres Freund wrote:
It still contains a guc as described in the above message to control the
algorithm used for compressing new tuples but I think we should remove
that guc after testing.
Did you add the storage attribute?
No. I think as long as we only have pglz and one new algorithm (even if
that is lz4 instead of the current snappy) we should just always use the
new algorithm. Unless I missed it nobody seemed to have voiced a
contrary position?
For testing/evaluation the guc seems to be sufficient.
If not significantly harder than what you currently do, I'd prefer a
true pluggable compression support which is
a) dynamically configurable, say by using a GUC
and
b) self-describing, that is, the compressed data should have enough
info to determine how to decompress it.
additionally it *could* have the property Simon proposed earlier
of *uncompressed* pages having some predetermined size, so we
could retain optimisations of substring() even on compressed TOAST
values.
the latter of course could also be achieved by adding offset
column to toast tables as well.
One more idea - if we are already changing toast table structure, we
could introduce a notion of "compress block", which could run over
several storage pages for much improved compression compared
to compressing only a single page at a time.
--
Hannu Krosing
PostgreSQL Consultant
Performance, Scalability and High Availability
2ndQuadrant Nordic OÜ
On 2013-06-15 13:25:49 +0200, Hannu Krosing wrote:
On 06/15/2013 02:20 AM, Andres Freund wrote:
On 2013-06-14 17:12:01 -0700, Josh Berkus wrote:
On 06/14/2013 04:01 PM, Andres Freund wrote:
It still contains a guc as described in the above message to control the
algorithm used for compressing new tuples but I think we should remove
that guc after testing.
Did you add the storage attribute?
No. I think as long as we only have pglz and one new algorithm (even if
that is lz4 instead of the current snappy) we should just always use the
new algorithm. Unless I missed it nobody seemed to have voiced a
contrary position?
For testing/evaluation the guc seems to be sufficient.
If not significantly harder than what you currently do, I'd prefer a
true pluggable compression support which is
a) dynamically configurable, say by using a GUC
and
b) self-describing, that is, the compressed data should have enough
info to determine how to decompress it.
Could you perhaps actually read the description and the discussion
before making wild suggestions? Possibly even the patch.
Compressed toast datums now *do* have an identifier of the compression
algorithm used. That's how we can discern between pglz and whatever
algorithm we come up with.
But those identifiers should be *small* (since they are added to all
Datums) and they need to be stable, even across pg_upgrade. So I think
making this user configurable would be a grave error at this point.
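To make the layout concrete: the patch above reuses the first 32-bit word that follows the varlena header of an inline-compressed datum, with the two high bits selecting the algorithm and the low 30 bits carrying the raw payload size (TOAST datums cannot exceed 1GB, so 30 bits are enough). The standalone sketch below only illustrates that bit split; the constant and function names are invented for the example and are not taken from the patch.

/*
 * Minimal sketch of the 2-bit-id / 30-bit-rawsize split used above.
 * Names here are hypothetical, not identifiers from the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define TOAST_COMPRESS_PGLZ   0   /* legacy on-disk format, high bits unset */
#define TOAST_COMPRESS_SNAPPY 1   /* algorithm added by the patch */

static unsigned
compression_algorithm(uint32_t first_word)
{
    return first_word >> 30;            /* 2 high bits: algorithm id */
}

static uint32_t
uncompressed_size(uint32_t first_word)
{
    return first_word & 0x3FFFFFFF;     /* low 30 bits: raw payload size */
}

int
main(void)
{
    /* what the compressor would write for a 1000-byte snappy datum */
    uint32_t word = (1u << 30) | 1000;

    printf("algo=%u rawsize=%u\n",
           compression_algorithm(word),
           (unsigned) uncompressed_size(word));
    return 0;
}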
additionally it *could* have the property Simon proposed earlier
of *uncompressed* pages having some predetermined size, so we
could retain optimisations of substring() even on compressed TOAST
values.
We are not changing the toast format here, so I don't think that's
applicable. That's a completely separate feature.
the latter of course could also be achieved by adding offset
column to toast tables as well.
One more idea - if we are already changing toast table structure, we
could introduce a notion of "compress block", which could run over
several storage pages for much improved compression compared
to compressing only a single page at a time.
We aren't changing the toast table structure. And we can't easily do so,
think of pg_upgrade.
Besides, toast always has compressed datums over several chunks. What
would be beneficial would be to compress in a way that you can compress
several datums together, but that's several magnitudes more complex and
is unrelated to this feature.
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
On 2013-06-15 13:11:47 +0200, Hannu Krosing wrote:
If it were truly pluggable - that is just load a .dll, set a GUC and start
using it
Ok. I officially rechristen the patchset to 'extensible compression
support'.
- then the selection of algorithms would be much
wider as several slow-but-high-compression ones under GPL could be
used as well, similar to how currently PostGIS works.
gzip and bzip2 are the first two that come to mind, but I am sure there
are more.
gzip barely has a higher compression ratio than lz4 and is a magnitude
slower decompressing, so I don't think it's interesting.
I personally think bzip2 is too slow to be useful, even for
decompression. What might be useful is something like lzma, but its
implementation is so complex that I don't really want to touch it.
In the past, we've had a great deal of speculation about that legal
question from people who are not lawyers. Maybe it would be valuable
to get some opinions from people who ARE lawyers.
Making a truly pluggable compression API delegates this question
to end users.
Delegation is good, as it lets you get done more :)
No. It increases the feature's complexity by a magnitude. That's not
good. And it means that about nobody but a few expert users will benefit
from it, so I am pretty strongly opposed.
You suddenly need to solve the question of how the identifiers for
compression formats are allocated and preserved across pg_upgrade and
across machines.
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
On 06/15/2013 01:56 PM, Andres Freund wrote:
On 2013-06-15 13:25:49 +0200, Hannu Krosing wrote:
On 06/15/2013 02:20 AM, Andres Freund wrote:
On 2013-06-14 17:12:01 -0700, Josh Berkus wrote:
On 06/14/2013 04:01 PM, Andres Freund wrote:
It still contains a guc as described in the above message to control the
algorithm used for compressing new tuples but I think we should remove
that guc after testing.
Did you add the storage attribute?
No. I think as long as we only have pglz and one new algorithm (even if
that is lz4 instead of the current snappy) we should just always use the
new algorithm. Unless I missed it nobody seemed to have voiced a
contrary position?
For testing/evaluation the guc seems to be sufficient.
If not significantly harder than what you currently do, I'd prefer a
true pluggable compression support which is
a) dynamically configurable, say by using a GUC
and
b) self-describing, that is, the compressed data should have enough
info to determine how to decompress it.
Could you perhaps actually read the description and the discussion
before making wild suggestions? Possibly even the patch.
Compressed toast datums now *do* have an identifier of the compression
algorithm used.
That's how we can discern between pglz and whatever
algorithm we come up with.
Claiming that the algorithm will be one of only two (current and
"whatever algorithm we come up with ") suggests that it is
only one bit, which is undoubtedly too little for having a "pluggable"
compression API :)
But those identifiers should be *small* (since they are added to all
Datums)
if there will be any alignment at all between the datums, then
one byte will be lost in the noise ("remember: nobody will need
more than 256 compression algorithms")
OTOH, if you plan to put these format markers in the compressed
stream and change the compression algorithm while reading it, I am lost.
and they need to be stable, even across pg_upgrade. So I think
making this user configurable would be a grave error at this point.
"at this point" in what sense?
additionally it *could* have the property Simon proposed earlier
of *uncompressed* pages having some predetermined size, so we
could retain optimisations of substring() even on compressed TOAST
values.
We are not changing the toast format here, so I don't think that's
applicable. That's a completely separate feature.
the latter of course could also be achieved by adding offset
column to toast tables as well.
One more idea - if we are already changing toast table structure, we
could introduce a notion of "compress block", which could run over
several storage pages for much improved compression compared
to compressing only a single page at a time.
We aren't changing the toast table structure. And we can't easily do so,
think of pg_upgrade.
Where was the page for "features rejected because of pg_upgrade" ;)
Besides, toast always has compressed datums over several chunks. What
would be beneficial would be to compress in a way that you can compress
several datums together, but that's several magnitudes more complex and
is unrelated to this feature.
Greetings,
Andres Freund
--
Hannu Krosing
PostgreSQL Consultant
Performance, Scalability and High Availability
2ndQuadrant Nordic OÜ
On 06/15/2013 02:02 PM, Andres Freund wrote:
On 2013-06-15 13:11:47 +0200, Hannu Krosing wrote:
If it were truly pluggable - that is just load a .dll, set a GUC and start
using it
Ok. I officially rechristen the patchset to 'extensible compression
support'.
- then the selection of algorithms would be much
wider as several slow-but-high-compression ones under GPL could be
used as well, similar to how currently PostGIS works.
gzip and bzip2 are the first two that come to mind, but I am sure there
are more.
gzip barely has a higher compression ratio than lz4 and is a magnitude
slower decompressing, so I don't think it's interesting.
I personally think bzip2 is too slow to be useful, even for
decompression.
with low compression settings gzip and bzip2 seem to decompress at the
same speed :
http://pokecraft.first-world.info/wiki/Quick_Benchmark:_Gzip_vs_Bzip2_vs_LZMA_vs_XZ_vs_LZ4_vs_LZO
(an interesting thing there is memory usage, but I guess it is just an
artefact of outer layers around the algorithm)
and whether better compression translates to more speed depends heavily on
disk speeds:
http://www.citusdata.com/blog/64-zfs-compression claims quite big
performance increases from using gzip, even with its slow decompression.
What might be useful is something like lzma, but its
implementation is so complex that I don't really want to touch it.
In the past, we've had a great deal of speculation about that legal
question from people who are not lawyers. Maybe it would be valuable
to get some opinions from people who ARE lawyers.
Making a truly pluggable compression API delegates this question
to end users.
Delegation is good, as it lets you get done more :)
No. It increases the feature's complexity by a magnitude. That's not
good. And it means that about nobody but a few expert users will benefit
from it, so I am pretty strongly opposed.
You suddenly need to solve the question of how the identifiers for
compression formats are allocated and preserved across pg_upgrade and
across machines.
This is similar to something we already need to do for any non-builtin type
OID.
Greetings,
Andres Freund
--
Hannu Krosing
PostgreSQL Consultant
Performance, Scalability and High Availability
2ndQuadrant Nordic OÜ
On 2013-06-15 14:11:54 +0200, Hannu Krosing wrote:
Could you perhaps actually read the description and the discussion
before making wild suggestions? Possibly even the patch.
Compressed toast datums now *do* have an identifier of the compression
algorithm used.
That's how we can discern between pglz and whatever
algorithm we come up with.
Claiming that the algorithm will be one of only two (current and
"whatever algorithm we come up with ") suggests that it is
only one bit, which is undoubtedly too little for having a "pluggable"
compression API :)
No, I am thinking 127 + 2 possible algorithms for now. If we need more
the space used for the algorithm can be extended transparently at that
point.
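As a rough illustration of how such an identifier space could be extended without burdening the common case, here is a hypothetical encoder: ids that fit into the two spare header bits cost nothing extra, anything else stores an escape value in those bits plus one additional byte holding the real id. This is only a sketch of the idea; the exact escape layout in the posted patches differs and went through revisions.

/*
 * Hypothetical header encoder; names and the escape value are made up
 * for illustration and are not taken from the patch.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ALGO_EXTENDED_MARKER 3u   /* escape value in the 2 spare bits */

/* Writes a 4- or 5-byte header into buf and returns the bytes used. */
static size_t
write_compression_header(uint8_t *buf, unsigned algo, uint32_t rawsize)
{
    uint32_t word;

    if (algo < ALGO_EXTENDED_MARKER)
    {
        /* common case: the id fits into the two spare header bits */
        word = ((uint32_t) algo << 30) | (rawsize & 0x3FFFFFFFu);
        memcpy(buf, &word, sizeof(word));
        return 4;
    }

    /* rare case: escape marker in the spare bits, real id in one extra byte */
    word = (ALGO_EXTENDED_MARKER << 30) | (rawsize & 0x3FFFFFFFu);
    memcpy(buf, &word, sizeof(word));
    buf[4] = (uint8_t) algo;      /* ids above 255 would need yet more room */
    return 5;
}

int
main(void)
{
    uint8_t hdr[5];

    printf("id 0: %zu header bytes\n", write_compression_header(hdr, 0, 1000));
    printf("id 42: %zu header bytes\n", write_compression_header(hdr, 42, 1000));
    return 0;
}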
But those identifiers should be *small* (since they are added to all
Datums)
if there will be any alignment at all between the datums, then
one byte will be lost in the noise ("remember: nobody will need
more than 256 compression algorithms")
No. There's no additional alignment involved here.
OTOH, if you plan to put these format markers in the compressed
stream and change the compression algorithm while reading it, I am
lost.
The marker *needs* to be in the compressed stream. When decompressing a
datum we only get passed the varlena.
and they need to be stable, even across pg_upgrade. So I think
making this user configurable would be a grave error at this point.
"at this point" in what sense?
If we add another algorithm with different tradeoffs we can have a column
attribute for choosing the algorithm. If there proves to be a need to
add more configurability, we can do that at that point.
Greetings,
Andres Freund
On Saturday, June 15, 2013 3:50 PM Andres Freund wrote:
On 2013-06-14 21:56:52 -0400, Robert Haas wrote:
I don't think we need it. I think what we need to decide is which
algorithm is legally OK to use. And then put it in.
In the past, we've had a great deal of speculation about that legal
question from people who are not lawyers. Maybe it would be valuable
to get some opinions from people who ARE lawyers. Tom and Heikki both
work for real big companies which, I'm guessing, have substantial
legal departments; perhaps they could pursue getting the algorithms of
possible interest vetted. Or, I could try to find out whether it's
possible do something similar through EnterpriseDB.
I personally don't think the legal argument holds all that much water
for snappy and lz4. But then the opinion of a European non-lawyer doesn't
hold much either.
Both are widely used by a large number open and closed projects, some of
which have patent grant clauses in their licenses. E.g. hadoop,
cassandra use lz4, and I'd be surprised if the companies behind those
have opened themselves to litigation.
I think we should preliminarily decide which algorithm to use before we
get lawyers involved. I'd be surprised if they can make such an analysis
faster than we can rule out one of them via benchmarks.
I have also tried to use snappy for patch "Performance Improvement by reducing WAL for Update Operation".
It has shown very good results and performed very well for all the tests asked by Heikki.
Results are at the link below:
/messages/by-id/009001ce2c6e$9bea4790$d3bed6b0$@kapila@huawei.com
I think if we can get snappy into core, it can be used for more things.
I wanted to try it for FPW as well.
With Regards,
Amit Kapila.
On Sat, Jun 15, 2013 at 8:11 AM, Hannu Krosing <hannu@2ndquadrant.com> wrote:
Claiming that the algorithm will be one of only two (current and
"whatever algorithm we come up with ") suggests that it is
only one bit, which is undoubtedly too little for having a "pluggable"
compression API :)
See /messages/by-id/20130607143053.GJ29964@alap2.anarazel.de
But those identifiers should be *small* (since they are added to all
Datums)
if there will be any alignment at all between the datums, then
one byte will be lost in the noise ("remember: nobody will need
more than 256 compression algorithms")
OTOH, if you plan to put these format markers in the compressed
stream and change the compression algorithm while reading it, I am lost.
The above-linked email addresses this point as well: there are bits
available in the toast pointer. But there aren't MANY bits without
increasing the storage footprint, so trying to do something that's
more general than we really need is going to cost us in terms of
on-disk footprint. Is that really worth it? And if so, why? I don't
find the idea of a trade-off between compression/decompression speed
and compression ratio to be very exciting. As Andres says, bzip2 is
impractically slow for ... almost everything. If there's a good
BSD-licensed algorithm available, let's just use it and be done. Our
current algorithm has lasted us a very long time; I see no reason to
think we'll want to change this again for another 10 years, and by
that time, we may have redesigned the storage format altogether,
making the limited extensibility of our current TOAST pointer format
moot.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Sat, Jun 15, 2013 at 8:22 AM, Hannu Krosing <hannu@2ndquadrant.com> wrote:
You suddently need to solve the question of how the identifiers for
compression formats are allocated and preserved across pg_upgrade and
across machines.
This is similar to something we already need to do for any non-builtin type
OID.
That's true, but that code has already been written. And it's not
trivial. The code involved is CREATE/ALTER/DROP TYPE plus all the
corresponding pg_dump mechanism. To do what you're proposing here,
we'd need CREATE/ALTER/DROP COMPRESSION METHOD, and associated pg_dump
--binary-upgrade support. I think Andres is entirely right to be
skeptical about that. It will make this project about 4 times as hard
for almost no benefit.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 06/16/2013 03:50 AM, Robert Haas wrote:
On Sat, Jun 15, 2013 at 8:11 AM, Hannu Krosing <hannu@2ndquadrant.com> wrote:
Claiming that the algorithm will be one of only two (current and
"whatever algorithm we come up with ") suggests that it is
only one bit, which is undoubtedly too little for having a "pluggable"
compression API :)
See /messages/by-id/20130607143053.GJ29964@alap2.anarazel.de
But those identifiers should be *small* (since they are added to all
Datums)
if there will be any alignment at all between the datums, then
one byte will be lost in the noise ("remember: nobody will need
more than 256 compression algorithms")
OTOH, if you plan to put these format markers in the compressed
stream and change the compression algorithm while reading it, I am lost.
The above-linked email addresses this point as well: there are bits
available in the toast pointer. But there aren't MANY bits without
increasing the storage footprint, so trying to do something that's
more general than we really need is going to cost us in terms of
on-disk footprint. Is that really worth it? And if so, why? I don't
find the idea of a trade-off between compression/decompression speed
and compression ratio to be very exciting. As Andres says, bzip2 is
impractically slow for ... almost everything. If there's a good
BSD-licensed algorithm available, let's just use it and be done. Our
current algorithm has lasted us a very long time;
My scepticism about the current algorithm comes from a brief test
(which may have been flawed) which showed almost no compression
for plain XML fields.
It may very well be that I was doing something stupid and got
wrong results though, as the functions to ask for toast internals
like "is this field compressed" or "what is the compressed
length of this field" are well hidden - if available at all - in our
documentation.
I see no reason to
think we'll want to change this again for another 10 years, and by
that time, we may have redesigned the storage format altogether,
making the limited extensibility of our current TOAST pointer format
moot.
Agreed.
I just hoped that "pluggable compression support" would
be something that enables people not directly interested in
hacking the core to experiment with compression and thereby
possibly coming up with something that changes your "not
useful in next 10 years" prediction :)
Seeing that the scope of this patch is actually much narrower,
I have no objections of doing it as proposed by Andres.
--
Hannu Krosing
PostgreSQL Consultant
Performance, Scalability and High Availability
2ndQuadrant Nordic OÜ
On 15 June 2013 13:02, Andres Freund <andres@2ndquadrant.com> wrote:
On 2013-06-15 13:11:47 +0200, Hannu Krosing wrote:
If it were truly pluggable - that is just load a .dll, set a GUC and start
using it
Ok. I officially rechristen the patchset to 'extensible compression
support'.
+1
(I confess I was confused also.)
--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
On 15 June 2013 12:25, Hannu Krosing <hannu@2ndquadrant.com> wrote:
additionally it *could* have the property Simon proposed earlier
of *uncompressed* pages having some predetermined size, so we
could retain optimisations of substring() even on compressed TOAST
values.
That wasn't about having fixed size pages, it was about having a
function to determine the split points. Reason being we want to use
some intelligence to split up JSON and XML documents.
I would also like to be able to do other types of processing on fields
before they are stored.
That is probably best done as a sequence of processing transforms,
rather than a single compression module.
--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
Simon Riggs wrote:
On 15 June 2013 12:25, Hannu Krosing <hannu@2ndquadrant.com> wrote:
additionally it *could* have the property Simon proposed earlier
of *uncompressed* pages having some predetermined size, so we
could retain optimisations of substring() even on compressed TOAST
values.
That wasn't about having fixed size pages, it was about having a
function to determine the split points. Reason being we want to use
some intelligence to split up JSON and XML documents.
Maybe indexed compressed text would be useful:
http://pizzachili.dcc.uchile.cl/
--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
On 2013-06-15 12:20:28 +0200, Andres Freund wrote:
On 2013-06-14 21:56:52 -0400, Robert Haas wrote:
I don't think we need it. I think what we need to decide is which
algorithm is legally OK to use. And then put it in.
In the past, we've had a great deal of speculation about that legal
question from people who are not lawyers. Maybe it would be valuable
to get some opinions from people who ARE lawyers. Tom and Heikki both
work for real big companies which, I'm guessing, have substantial
legal departments; perhaps they could pursue getting the algorithms of
possible interest vetted. Or, I could try to find out whether it's
possible do something similar through EnterpriseDB.
I personally don't think the legal argument holds all that much water
for snappy and lz4. But then the opinion of a European non-lawyer doesn't
hold much either.
Both are widely used by a large number open and closed projects, some of
which have patent grant clauses in their licenses. E.g. hadoop,
cassandra use lz4, and I'd be surprised if the companies behind those
have opened themselves to litigation.
I think we should preliminarily decide which algorithm to use before we
get lawyers involved. I'd be surprised if they can make such an analysis
faster than we can rule out one of them via benchmarks.
Will post an updated patch that includes lz4 as well.
Attached.
Changes:
* add lz4 compression algorithm (2 clause bsd)
* move compression algorithms into own subdirectory
* clean up compression/decompression functions
* allow 258 compression algorithms, uses 1 byte extra for any but the
first three
* don't pass a varlena to pg_lzcompress.c anymore, but data directly
* add pglz_long as a fourth, test-only compression method that uses the +1
byte encoding
* use postgres' endian detection in snappy for compatibility with OS X
Based on the benchmarks I think we should go with lz4 only for now. The
patch provides the infrastructure should somebody else want to add more
or even proper configurability.
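For readers who want a picture of what that infrastructure amounts to, here is a hypothetical sketch of a per-algorithm dispatch table that works on raw buffers rather than varlenas, in the spirit of the cleanup listed above. All names are invented for the example, and the "copy" routines merely stand in for real compressors such as pglz or lz4.

/*
 * Sketch only: one stable id per algorithm, mapping to compress/decompress
 * callbacks that never see a varlena. Not code from the patch.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct CompressionRoutine
{
    const char *name;
    bool    (*compress) (const char *src, size_t srclen,
                         char *dst, size_t *dstlen);
    bool    (*decompress) (const char *src, size_t srclen,
                           char *dst, size_t rawlen);
} CompressionRoutine;

/* placeholder: a real entry would wrap pglz_compress(), lz4, etc. */
static bool
copy_compress(const char *src, size_t srclen, char *dst, size_t *dstlen)
{
    memcpy(dst, src, srclen);
    *dstlen = srclen;
    return true;                /* "success", although nothing was saved */
}

static bool
copy_decompress(const char *src, size_t srclen, char *dst, size_t rawlen)
{
    (void) rawlen;
    memcpy(dst, src, srclen);
    return true;
}

/* the array index doubles as the small, stable on-disk algorithm id */
static const CompressionRoutine compression_routines[] = {
    {"copy-as-pglz-stand-in", copy_compress, copy_decompress},
    {"copy-as-lz4-stand-in", copy_compress, copy_decompress},
};

int
main(void)
{
    const char src[] = "hello toast";
    char    comp[64], back[64];
    size_t  complen;
    int     algo = 1;           /* pretend the datum header said "1" */

    compression_routines[algo].compress(src, sizeof(src), comp, &complen);
    compression_routines[algo].decompress(comp, complen, back, sizeof(src));
    printf("%s via %s\n", back, compression_routines[algo].name);
    return 0;
}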
Todo:
* windows build support
* remove toast_compression_algo guc
* remove either snappy or lz4 support
* remove pglz_long support (just there for testing)
New benchmarks:
Table size:
List of relations
Schema | Name | Type | Owner | Size | Description
--------+--------------------+-------+--------+--------+-------------
public | messages_pglz | table | andres | 526 MB |
public | messages_snappy | table | andres | 523 MB |
public | messages_lz4 | table | andres | 522 MB |
public | messages_pglz_long | table | andres | 527 MB |
(4 rows)
Workstation (2xE5520, enough s_b for everything):
Data load:
pglz: 36643.384 ms
snappy: 24626.894 ms
lz4: 23871.421 ms
pglz_long: 37097.681 ms
COPY messages_* TO '/dev/null' WITH BINARY;
pglz: 3116.083 ms
snappy: 2524.388 ms
lz4: 2349.396 ms
pglz_long: 3104.134 ms
COPY (SELECT rawtxt FROM messages_*) TO '/dev/null' WITH BINARY;
pglz: 1609.969 ms
snappy: 1031.696 ms
lz4: 886.782 ms
pglz_long: 1606.803 ms
On my elderly laptop (core 2 duo), with shared_buffers too small to hold everything:
Data load:
pglz: 39968.381 ms
snappy: 26952.330 ms
lz4: 29225.472 ms
pglz_long: 39929.568 ms
COPY messages_* TO '/dev/null' WITH BINARY;
pglz: 3920.588 ms
snappy: 3421.938 ms
lz4: 3311.540 ms
pglz_long: 3885.920 ms
COPY (SELECT rawtxt FROM messages_*) TO '/dev/null' WITH BINARY;
pglz: 2238.145 ms
snappy: 1753.403 ms
lz4: 1638.092 ms
pglz_long: 2227.804 ms
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
Attachments:
0001-Add-snappy-compression-algorithm-to-contrib.patch (text/x-patch; charset=us-ascii)
>From 94a7e7415f3dc657bea7dffbc2bbf79d946f2e56 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Wed, 5 Jun 2013 15:29:47 +0200
Subject: [PATCH 1/3] Add snappy compression algorithm to /contrib
Copied (with minimal modifications) from https://github.com/andikleen/snappy-c
Code is licensed under a 3-clause BSD license and copyrighted by google.
Todo:
* fix or document inclusion of src/backend/common.mk
---
src/common/Makefile | 26 +-
src/common/snappy/Makefile | 29 +
src/common/snappy/snappy-compat.h | 57 ++
src/common/snappy/snappy-int.h | 71 ++
src/common/snappy/snappy.c | 1534 ++++++++++++++++++++++++++++++++++++
src/include/common/snappy/snappy.h | 34 +
6 files changed, 1741 insertions(+), 10 deletions(-)
create mode 100644 src/common/snappy/Makefile
create mode 100644 src/common/snappy/snappy-compat.h
create mode 100644 src/common/snappy/snappy-int.h
create mode 100644 src/common/snappy/snappy.c
create mode 100644 src/include/common/snappy/snappy.h
diff --git a/src/common/Makefile b/src/common/Makefile
index cd97980..b53e4df 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -23,11 +23,17 @@ include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -DFRONTEND $(CPPFLAGS)
LIBS += $(PTHREAD_LIBS)
-OBJS_COMMON = relpath.o
+SUBDIRS = snappy
-OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o
+include $(top_srcdir)/src/backend/common.mk
-OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
+LOCAL_OBJS_COMMON = relpath.o
+LOCAL_OBJS_FRONTEND = $(LOCAL_OBJS_COMMON) fe_memutils.o
+LOCAL_OBJS_SRV = $(LOCAL_OBJS_COMMON:%.o=%_srv.o)
+
+SUBDIROBJS_EX = $(call expand_subsys,$(SUBDIROBJS))
+OBJS_FRONTEND = $(LOCAL_OBJS_FRONTEND) $(SUBDIROBJS_EX)
+OBJS_SRV = $(LOCAL_OBJS_SRV) $(SUBDIROBJS_EX:%.o=%_srv.o)
all: libpgcommon.a libpgcommon_srv.a
@@ -41,15 +47,14 @@ installdirs:
uninstall:
rm -f '$(DESTDIR)$(libdir)/libpgcommon.a'
-libpgcommon.a: $(OBJS_FRONTEND)
- $(AR) $(AROPT) $@ $^
+libpgcommon.a: $(LOCAL_OBJS_FRONTEND) $(SUBDIROBJS)
+ $(AR) $(AROPT) $@ $(OBJS_FRONTEND)
#
# Server versions of object files
#
-
-libpgcommon_srv.a: $(OBJS_SRV)
- $(AR) $(AROPT) $@ $^
+libpgcommon_srv.a: $(LOCAL_OBJS_SRV) $(SUBDIROBJS)
+ $(AR) $(AROPT) $@ $(OBJS_SRV)
# Because this uses its own compilation rule, it doesn't use the
# dependency tracking logic from Makefile.global. To make sure that
@@ -60,7 +65,8 @@ libpgcommon_srv.a: $(OBJS_SRV)
%_srv.o: %.c %.o
$(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@
-$(OBJS_SRV): | submake-errcodes
+$(LOCAL_OBJS): | submake-errcodes
+$(SUBDIROBJS): | submake-errcodes
.PHONY: submake-errcodes
@@ -68,4 +74,4 @@ submake-errcodes:
$(MAKE) -C ../backend submake-errcodes
clean distclean maintainer-clean:
- rm -f libpgcommon.a libpgcommon_srv.a $(OBJS_FRONTEND) $(OBJS_SRV)
+ rm -f libpgcommon.a libpgcommon_srv.a $(LOCAL_OBJS_FRONTEND) $(LOCAL_OBJS_SRV)
diff --git a/src/common/snappy/Makefile b/src/common/snappy/Makefile
new file mode 100644
index 0000000..2f9e439
--- /dev/null
+++ b/src/common/snappy/Makefile
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for common/snappy
+#
+# IDENTIFICATION
+# src/common/snappy/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/common/snappy
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+override CPPFLAGS := -Wno-declaration-after-statement $(CPPFLAGS)
+override CPPFLAGS := -DFRONTEND $(CPPFLAGS)
+
+OBJS = snappy.o
+OBJS_SRV = $(OBJS:%.o=%_srv.o)
+
+include $(top_srcdir)/src/backend/common.mk
+
+%_srv.o: %.c %.o
+ $(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@
+
+all: $(OBJS_SRV)
+
+clean distclean maintainer-clean:
+ rm -f $(OBJS_SRV)
diff --git a/src/common/snappy/snappy-compat.h b/src/common/snappy/snappy-compat.h
new file mode 100644
index 0000000..69d1735
--- /dev/null
+++ b/src/common/snappy/snappy-compat.h
@@ -0,0 +1,57 @@
+#ifdef __FreeBSD__
+# include <sys/endian.h>
+#elif defined(__APPLE_CC_) || defined(__MACH__) /* MacOS/X support */
+# include <machine/endian.h>
+
+#if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN
+# define htole16(x) (x)
+# define le32toh(x) (x)
+#elif __DARWIN_BYTE_ORDER == __DARWIN_BIG_ENDIAN
+# define htole16(x) __DARWIN_OSSwapInt16(x)
+# define le32toh(x) __DARWIN_OSSwapInt32(x)
+#else
+# error "Endianness is undefined"
+#endif
+
+
+#else
+# include <endian.h>
+#endif
+
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/uio.h>
+
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned u32;
+typedef unsigned long long u64;
+
+#define BUG_ON(x) assert(!(x))
+
+#define get_unaligned(x) (*(x))
+#define get_unaligned_le32(x) (le32toh(*(u32 *)(x)))
+#define put_unaligned(v,x) (*(x) = (v))
+#define put_unaligned_le16(v,x) (*(u16 *)(x) = htole16(v))
+
+#define vmalloc(x) malloc(x)
+#define vfree(x) free(x)
+
+#define EXPORT_SYMBOL(x)
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define likely(x) __builtin_expect((x), 1)
+#define unlikely(x) __builtin_expect((x), 0)
+
+#define min_t(t,x,y) ((x) < (y) ? (x) : (y))
+#define max_t(t,x,y) ((x) > (y) ? (x) : (y))
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define __LITTLE_ENDIAN__ 1
+#endif
+
+#define BITS_PER_LONG (__SIZEOF_LONG__ * 8)
diff --git a/src/common/snappy/snappy-int.h b/src/common/snappy/snappy-int.h
new file mode 100644
index 0000000..ad31b54
--- /dev/null
+++ b/src/common/snappy/snappy-int.h
@@ -0,0 +1,71 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Various stubs for the open-source version of Snappy.
+
+#define likely(x) __builtin_expect(x, 1)
+#define unlikely(x) __builtin_expect(x, 0)
+
+#define CRASH_UNLESS(x) assert(x)
+#define CHECK(cond) assert(cond)
+#define CHECK_LE(a, b) CRASH_UNLESS((a) <= (b))
+#define CHECK_GE(a, b) CRASH_UNLESS((a) >= (b))
+#define CHECK_EQ(a, b) CRASH_UNLESS((a) == (b))
+#define CHECK_NE(a, b) CRASH_UNLESS((a) != (b))
+#define CHECK_LT(a, b) CRASH_UNLESS((a) < (b))
+#define CHECK_GT(a, b) CRASH_UNLESS((a) > (b))
+
+#define UNALIGNED_LOAD16(_p) (*(const uint16 *)(_p))
+#define UNALIGNED_LOAD32(_p) (*(const uint32 *)(_p))
+#define UNALIGNED_LOAD64(_p) (*(const uint64 *)(_p))
+
+#define UNALIGNED_STORE16(_p, _val) (*(uint16 *)(_p) = (_val))
+#define UNALIGNED_STORE32(_p, _val) (*(uint32 *)(_p) = (_val))
+#define UNALIGNED_STORE64(_p, _val) (*(uint64 *)(_p) = (_val))
+
+#ifdef NDEBUG
+
+#define DCHECK(cond) CRASH_UNLESS(true)
+#define DCHECK_LE(a, b) CRASH_UNLESS(true)
+#define DCHECK_GE(a, b) CRASH_UNLESS(true)
+#define DCHECK_EQ(a, b) CRASH_UNLESS(true)
+#define DCHECK_NE(a, b) CRASH_UNLESS(true)
+#define DCHECK_LT(a, b) CRASH_UNLESS(true)
+#define DCHECK_GT(a, b) CRASH_UNLESS(true)
+
+#else
+
+#define DCHECK(cond) CHECK(cond)
+#define DCHECK_LE(a, b) CHECK_LE(a, b)
+#define DCHECK_GE(a, b) CHECK_GE(a, b)
+#define DCHECK_EQ(a, b) CHECK_EQ(a, b)
+#define DCHECK_NE(a, b) CHECK_NE(a, b)
+#define DCHECK_LT(a, b) CHECK_LT(a, b)
+#define DCHECK_GT(a, b) CHECK_GT(a, b)
+
+#endif
diff --git a/src/common/snappy/snappy.c b/src/common/snappy/snappy.c
new file mode 100644
index 0000000..3f244fb
--- /dev/null
+++ b/src/common/snappy/snappy.c
@@ -0,0 +1,1534 @@
+/*
+ * C port of the snappy compressor from Google.
+ * This is a very fast compressor with comparable compression to lzo.
+ * Works best on 64bit little-endian, but should be good on others too.
+ * Ported by Andi Kleen.
+ * Based on snappy 1.0.3 plus some selected changes from SVN.
+ */
+
+/*
+ * Copyright 2005 Google Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "postgres.h"
+
+#include "common/snappy/snappy.h"
+#include "snappy-compat.h"
+
+#define CRASH_UNLESS(x) BUG_ON(!(x))
+#define CHECK(cond) CRASH_UNLESS(cond)
+#define CHECK_LE(a, b) CRASH_UNLESS((a) <= (b))
+#define CHECK_GE(a, b) CRASH_UNLESS((a) >= (b))
+#define CHECK_EQ(a, b) CRASH_UNLESS((a) == (b))
+#define CHECK_NE(a, b) CRASH_UNLESS((a) != (b))
+#define CHECK_LT(a, b) CRASH_UNLESS((a) < (b))
+#define CHECK_GT(a, b) CRASH_UNLESS((a) > (b))
+
+#define UNALIGNED_LOAD16(_p) get_unaligned((u16 *)(_p))
+#define UNALIGNED_LOAD32(_p) get_unaligned((u32 *)(_p))
+#define UNALIGNED_LOAD64(_p) get_unaligned((u64 *)(_p))
+
+#define UNALIGNED_STORE16(_p, _val) put_unaligned(_val, (u16 *)(_p))
+#define UNALIGNED_STORE32(_p, _val) put_unaligned(_val, (u32 *)(_p))
+#define UNALIGNED_STORE64(_p, _val) put_unaligned(_val, (u64 *)(_p))
+
+#ifdef NDEBUG
+
+#define DCHECK(cond) do {} while(0)
+#define DCHECK_LE(a, b) do {} while(0)
+#define DCHECK_GE(a, b) do {} while(0)
+#define DCHECK_EQ(a, b) do {} while(0)
+#define DCHECK_NE(a, b) do {} while(0)
+#define DCHECK_LT(a, b) do {} while(0)
+#define DCHECK_GT(a, b) do {} while(0)
+
+#else
+
+#define DCHECK(cond) CHECK(cond)
+#define DCHECK_LE(a, b) CHECK_LE(a, b)
+#define DCHECK_GE(a, b) CHECK_GE(a, b)
+#define DCHECK_EQ(a, b) CHECK_EQ(a, b)
+#define DCHECK_NE(a, b) CHECK_NE(a, b)
+#define DCHECK_LT(a, b) CHECK_LT(a, b)
+#define DCHECK_GT(a, b) CHECK_GT(a, b)
+
+#endif
+
+static inline bool is_little_endian(void)
+{
+#ifndef WORDS_BIGENDIAN
+ return true;
+#endif
+ return false;
+}
+
+static inline int log2_floor(u32 n)
+{
+ return n == 0 ? -1 : 31 ^ __builtin_clz(n);
+}
+
+static inline int find_lsb_set_non_zero(u32 n)
+{
+ return __builtin_ctz(n);
+}
+
+static inline int find_lsb_set_non_zero64(u64 n)
+{
+ return __builtin_ctzll(n);
+}
+
+#define kmax32 5
+
+/*
+ * Attempts to parse a varint32 from a prefix of the bytes in [ptr,limit-1].
+ * Never reads a character at or beyond limit. If a valid/terminated varint32
+ * was found in the range, stores it in *OUTPUT and returns a pointer just
+ * past the last byte of the varint32. Else returns NULL. On success,
+ * "result <= limit".
+ */
+static inline const char *varint_parse32_with_limit(const char *p,
+ const char *l,
+ u32 * OUTPUT)
+{
+ const unsigned char *ptr = (const unsigned char *)(p);
+ const unsigned char *limit = (const unsigned char *)(l);
+ u32 b, result;
+
+ if (ptr >= limit)
+ return NULL;
+ b = *(ptr++);
+ result = b & 127;
+ if (b < 128)
+ goto done;
+ if (ptr >= limit)
+ return NULL;
+ b = *(ptr++);
+ result |= (b & 127) << 7;
+ if (b < 128)
+ goto done;
+ if (ptr >= limit)
+ return NULL;
+ b = *(ptr++);
+ result |= (b & 127) << 14;
+ if (b < 128)
+ goto done;
+ if (ptr >= limit)
+ return NULL;
+ b = *(ptr++);
+ result |= (b & 127) << 21;
+ if (b < 128)
+ goto done;
+ if (ptr >= limit)
+ return NULL;
+ b = *(ptr++);
+ result |= (b & 127) << 28;
+ if (b < 16)
+ goto done;
+ return NULL; /* Value is too long to be a varint32 */
+done:
+ *OUTPUT = result;
+ return (const char *)(ptr);
+}
+
+/*
+ * REQUIRES "ptr" points to a buffer of length sufficient to hold "v".
+ * EFFECTS Encodes "v" into "ptr" and returns a pointer to the
+ * byte just past the last encoded byte.
+ */
+static inline char *varint_encode32(char *sptr, u32 v)
+{
+ /* Operate on characters as unsigneds */
+ unsigned char *ptr = (unsigned char *)(sptr);
+ static const int B = 128;
+
+ if (v < (1 << 7)) {
+ *(ptr++) = v;
+ } else if (v < (1 << 14)) {
+ *(ptr++) = v | B;
+ *(ptr++) = v >> 7;
+ } else if (v < (1 << 21)) {
+ *(ptr++) = v | B;
+ *(ptr++) = (v >> 7) | B;
+ *(ptr++) = v >> 14;
+ } else if (v < (1 << 28)) {
+ *(ptr++) = v | B;
+ *(ptr++) = (v >> 7) | B;
+ *(ptr++) = (v >> 14) | B;
+ *(ptr++) = v >> 21;
+ } else {
+ *(ptr++) = v | B;
+ *(ptr++) = (v >> 7) | B;
+ *(ptr++) = (v >> 14) | B;
+ *(ptr++) = (v >> 21) | B;
+ *(ptr++) = v >> 28;
+ }
+ return (char *)(ptr);
+}
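/*
 * Illustrative only: a minimal round trip through the two varint helpers
 * above, assuming it is compiled in this translation unit; the function
 * name is made up for the example.  kmax32 (5) is the worst-case encoded
 * size of a u32.
 */
static inline bool varint32_roundtrip_example(u32 value)
{
	char buf[kmax32];
	u32 decoded = 0;
	/* returns a pointer just past the last byte written */
	char *end = varint_encode32(buf, value);

	/* never reads at or past "end"; returns NULL on truncation */
	return varint_parse32_with_limit(buf, end, &decoded) == end &&
	       decoded == value;
}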
+
+#ifdef SG
+
+struct source {
+ struct iovec *iov;
+ int iovlen;
+ int curvec;
+ int curoff;
+ size_t total;
+};
+
+/* Only valid at beginning when nothing is consumed */
+static inline int available(struct source *s)
+{
+ return s->total;
+}
+
+static inline const char *peek(struct source *s, size_t *len)
+{
+ if (likely(s->curvec < s->iovlen)) {
+ struct iovec *iv = &s->iov[s->curvec];
+ if (s->curoff < iv->iov_len) {
+ *len = iv->iov_len - s->curoff;
+ return iv->iov_base + s->curoff;
+ }
+ }
+ *len = 0;
+ return NULL;
+}
+
+static inline void skip(struct source *s, size_t n)
+{
+ struct iovec *iv = &s->iov[s->curvec];
+ s->curoff += n;
+ DCHECK_LE(s->curoff, iv->iov_len);
+ if (s->curoff >= iv->iov_len && s->curvec + 1 < s->iovlen) {
+ s->curoff = 0;
+ s->curvec++;
+ }
+}
+
+struct sink {
+ struct iovec *iov;
+ int iovlen;
+ unsigned curvec;
+ unsigned curoff;
+ unsigned written;
+};
+
+static inline void append(struct sink *s, const char *data, size_t n)
+{
+ struct iovec *iov = &s->iov[s->curvec];
+ char *dst = iov->iov_base + s->curoff;
+ size_t nlen = min_t(size_t, iov->iov_len - s->curoff, n);
+ if (data != dst)
+ memcpy(dst, data, nlen);
+ s->written += n;
+ s->curoff += nlen;
+ while ((n -= nlen) > 0) {
+ data += nlen;
+ s->curvec++;
+ DCHECK_LT(s->curvec, s->iovlen);
+ iov++;
+ nlen = min_t(size_t, iov->iov_len, n);
+ memcpy(iov->iov_base, data, nlen);
+ s->curoff = nlen;
+ }
+}
+
+static inline void *sink_peek(struct sink *s, size_t n)
+{
+ struct iovec *iov = &s->iov[s->curvec];
+ if (s->curvec < iov->iov_len && iov->iov_len - s->curoff >= n)
+ return iov->iov_base + s->curoff;
+ return NULL;
+}
+
+#else
+
+struct source {
+ const char *ptr;
+ size_t left;
+};
+
+static inline int available(struct source *s)
+{
+ return s->left;
+}
+
+static inline const char *peek(struct source *s, size_t * len)
+{
+ *len = s->left;
+ return s->ptr;
+}
+
+static inline void skip(struct source *s, size_t n)
+{
+ s->left -= n;
+ s->ptr += n;
+}
+
+struct sink {
+ char *dest;
+};
+
+static inline void append(struct sink *s, const char *data, size_t n)
+{
+ if (data != s->dest)
+ memcpy(s->dest, data, n);
+ s->dest += n;
+}
+
+#define sink_peek(s, n) sink_peek_no_sg(s)
+
+static inline void *sink_peek_no_sg(const struct sink *s)
+{
+ return s->dest;
+}
+
+#endif
+
+struct writer {
+ char *base;
+ char *op;
+ char *op_limit;
+};
+
+/* Called before decompression */
+static inline void writer_set_expected_length(struct writer *w, size_t len)
+{
+ w->op_limit = w->op + len;
+}
+
+/* Called after decompression */
+static inline bool writer_check_length(struct writer *w)
+{
+ return w->op == w->op_limit;
+}
+
+/*
+ * Copy "len" bytes from "src" to "op", one byte at a time. Used for
+ * handling COPY operations where the input and output regions may
+ * overlap. For example, suppose:
+ * src == "ab"
+ * op == src + 2
+ * len == 20
+ * After IncrementalCopy(src, op, len), the result will have
+ * eleven copies of "ab"
+ * ababababababababababab
+ * Note that this does not match the semantics of either memcpy()
+ * or memmove().
+ */
+static inline void incremental_copy(const char *src, char *op, int len)
+{
+ DCHECK_GT(len, 0);
+ do {
+ *op++ = *src++;
+ } while (--len > 0);
+}
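/*
 * Illustrative only: the overlapping-copy behaviour described above, using
 * the "ab" example from the comment; the function name is made up.
 */
static inline void incremental_copy_example(void)
{
	char buf[32] = "ab";	/* remaining bytes are zero-filled */
	int i;

	/* src == buf, op == buf + 2, len == 20 */
	incremental_copy(buf, buf + 2, 20);

	/* buf now starts with eleven copies of "ab" */
	for (i = 0; i < 22; i++)
		assert(buf[i] == "ab"[i & 1]);
}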
+
+/*
+ * Equivalent to IncrementalCopy except that it can write up to ten extra
+ * bytes after the end of the copy, and that it is faster.
+ *
+ * The main part of this loop is a simple copy of eight bytes at a time until
+ * we've copied (at least) the requested amount of bytes. However, if op and
+ * src are less than eight bytes apart (indicating a repeating pattern of
+ * length < 8), we first need to expand the pattern in order to get the correct
+ * results. For instance, if the buffer looks like this, with the eight-byte
+ * <src> and <op> patterns marked as intervals:
+ *
+ * abxxxxxxxxxxxx
+ * [------] src
+ * [------] op
+ *
+ * a single eight-byte copy from <src> to <op> will repeat the pattern once,
+ * after which we can move <op> two bytes without moving <src>:
+ *
+ * ababxxxxxxxxxx
+ * [------] src
+ * [------] op
+ *
+ * and repeat the exercise until the two no longer overlap.
+ *
+ * This allows us to do very well in the special case of one single byte
+ * repeated many times, without taking a big hit for more general cases.
+ *
+ * The worst case of extra writing past the end of the match occurs when
+ * op - src == 1 and len == 1; the last copy will read from byte positions
+ * [0..7] and write to [4..11], whereas it was only supposed to write to
+ * position 1. Thus, ten excess bytes.
+ */
+
+#define kmax_increment_copy_overflow 10
+
+static inline void incremental_copy_fast_path(const char *src, char *op,
+ int len)
+{
+ while (op - src < 8) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+ len -= op - src;
+ op += op - src;
+ }
+ while (len > 0) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+ src += 8;
+ op += 8;
+ len -= 8;
+ }
+}
+
+static inline bool writer_append_from_self(struct writer *w, u32 offset,
+ u32 len)
+{
+ char *const op = w->op;
+ CHECK_LE(op, w->op_limit);
+ const u32 space_left = w->op_limit - op;
+
+ if (op - w->base <= offset - 1u) /* -1u catches offset==0 */
+ return false;
+ if (len <= 16 && offset >= 8 && space_left >= 16) {
+ /* Fast path, used for the majority (70-80%) of dynamic
+ * invocations. */
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(op - offset));
+ UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(op - offset + 8));
+ } else {
+ if (space_left >= len + kmax_increment_copy_overflow) {
+ incremental_copy_fast_path(op - offset, op, len);
+ } else {
+ if (space_left < len) {
+ return false;
+ }
+ incremental_copy(op - offset, op, len);
+ }
+ }
+
+ w->op = op + len;
+ return true;
+}
+
+static inline bool writer_append(struct writer *w, const char *ip, u32 len)
+{
+ char *const op = w->op;
+ CHECK_LE(op, w->op_limit);
+ const u32 space_left = w->op_limit - op;
+ if (space_left < len)
+ return false;
+ memcpy(op, ip, len);
+ w->op = op + len;
+ return true;
+}
+
+static inline bool writer_try_fast_append(struct writer *w, const char *ip,
+ u32 available_bytes, u32 len)
+{
+ char *const op = w->op;
+ const int space_left = w->op_limit - op;
+ if (len <= 16 && available_bytes >= 16 && space_left >= 16) {
+ /* Fast path, used for the majority (~95%) of invocations */
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
+ UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+ w->op = op + len;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Any hash function will produce a valid compressed bitstream, but a good
+ * hash function reduces the number of collisions and thus yields better
+ * compression for compressible input, and more speed for incompressible
+ * input. Of course, it doesn't hurt if the hash function is reasonably fast
+ * either, as it gets called a lot.
+ */
+static inline u32 hash_bytes(u32 bytes, int shift)
+{
+ u32 kmul = 0x1e35a7bd;
+ return (bytes * kmul) >> shift;
+}
+
+static inline u32 hash(const char *p, int shift)
+{
+ return hash_bytes(UNALIGNED_LOAD32(p), shift);
+}
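/*
 * Illustrative only: with a table of 1 << 14 entries the shift is 32 - 14,
 * so the multiplicative hash always yields a valid table index; the value
 * below is what UNALIGNED_LOAD32("abcd") would produce on a little-endian
 * machine.
 */
static inline void hash_example(void)
{
	const int shift = 32 - 14;

	assert(hash_bytes(0x64636261, shift) < (1U << 14));
}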
+
+/*
+ * Compressed data can be defined as:
+ * compressed := item* literal*
+ * item := literal* copy
+ *
+ * The trailing literal sequence has a space blowup of at most 62/60
+ * since a literal of length 60 needs one tag byte + one extra byte
+ * for length information.
+ *
+ * Item blowup is trickier to measure. Suppose the "copy" op copies
+ * 4 bytes of data. Because of a special check in the encoding code,
+ * we produce a 4-byte copy only if the offset is < 65536. Therefore
+ * the copy op takes 3 bytes to encode, and this type of item leads
+ * to at most the 62/60 blowup for representing literals.
+ *
+ * Suppose the "copy" op copies 5 bytes of data. If the offset is big
+ * enough, it will take 5 bytes to encode the copy op. Therefore the
+ * worst case here is a one-byte literal followed by a five-byte copy.
+ * I.e., 6 bytes of input turn into 7 bytes of "compressed" data.
+ *
+ * This last factor dominates the blowup, so the final estimate is:
+ */
+size_t snappy_max_compressed_length(size_t source_len)
+{
+ return 32 + source_len + source_len / 6;
+}
+EXPORT_SYMBOL(snappy_max_compressed_length);
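/*
 * Illustrative only: the bound above worked through for a typical 8 kB
 * input.  8192 / 6 truncates to 1365, so the worst case is
 * 32 + 8192 + 1365 = 9589 bytes, i.e. roughly 17% expansion plus a small
 * constant for the length prefix and per-block overhead.
 */
static inline void max_compressed_length_example(void)
{
	assert(snappy_max_compressed_length(8192) == 9589);
}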
+
+enum {
+ LITERAL = 0,
+ COPY_1_BYTE_OFFSET = 1, /* 3 bit length + 3 bits of offset in opcode */
+ COPY_2_BYTE_OFFSET = 2,
+ COPY_4_BYTE_OFFSET = 3
+};
+
+static inline char *emit_literal(char *op,
+ const char *literal,
+ int len, bool allow_fast_path)
+{
+ int n = len - 1; /* Zero-length literals are disallowed */
+
+ if (n < 60) {
+ /* Fits in tag byte */
+ *op++ = LITERAL | (n << 2);
+
+/*
+ * The vast majority of copies are below 16 bytes, for which a
+ * call to memcpy is overkill. This fast path can sometimes
+ * copy up to 15 bytes too much, but that is okay in the
+ * main loop, since we have a bit to go on for both sides:
+ *
+ * - The input will always have kInputMarginBytes = 15 extra
+ * available bytes, as long as we're in the main loop, and
+ * if not, allow_fast_path = false.
+ * - The output will always have 32 spare bytes (see
+ * MaxCompressedLength).
+ */
+ if (allow_fast_path && len <= 16) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(literal));
+ UNALIGNED_STORE64(op + 8,
+ UNALIGNED_LOAD64(literal + 8));
+ return op + len;
+ }
+ } else {
+ /* Encode in upcoming bytes */
+ char *base = op;
+ int count = 0;
+ op++;
+ while (n > 0) {
+ *op++ = n & 0xff;
+ n >>= 8;
+ count++;
+ }
+ DCHECK(count >= 1);
+ DCHECK(count <= 4);
+ *base = LITERAL | ((59 + count) << 2);
+ }
+ memcpy(op, literal, len);
+ return op + len;
+}
+
+static inline char *emit_copy_less_than64(char *op, int offset, int len)
+{
+ DCHECK_LE(len, 64);
+ DCHECK_GE(len, 4);
+ DCHECK_LT(offset, 65536);
+
+ if ((len < 12) && (offset < 2048)) {
+ int len_minus_4 = len - 4;
+ DCHECK(len_minus_4 < 8); /* Must fit in 3 bits */
+ *op++ =
+ COPY_1_BYTE_OFFSET | ((len_minus_4) << 2) | ((offset >> 8)
+ << 5);
+ *op++ = offset & 0xff;
+ } else {
+ *op++ = COPY_2_BYTE_OFFSET | ((len - 1) << 2);
+ put_unaligned_le16(offset, op);
+ op += 2;
+ }
+ return op;
+}
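/*
 * Illustrative only: what emit_copy_less_than64() produces for a short,
 * nearby copy (len 5, offset 10); the function name below is made up.
 * The tag byte packs the opcode, the length minus four and the high bits
 * of the offset; the low eight offset bits follow in the next byte.
 */
static inline void emit_copy_example(void)
{
	char buf[4];
	char *end = emit_copy_less_than64(buf, 10, 5);

	assert(end - buf == 2);
	/* COPY_1_BYTE_OFFSET | ((5 - 4) << 2) | ((10 >> 8) << 5) */
	assert((unsigned char) buf[0] == 0x05);
	/* offset & 0xff */
	assert((unsigned char) buf[1] == 0x0a);
}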
+
+static inline char *emit_copy(char *op, int offset, int len)
+{
+ /*
+ * Emit 64 byte copies but make sure to keep at least four bytes
+ * reserved
+ */
+ while (len >= 68) {
+ op = emit_copy_less_than64(op, offset, 64);
+ len -= 64;
+ }
+
+ /*
+ * Emit an extra 60 byte copy if we have too much data to fit in
+ * one copy
+ */
+ if (len > 64) {
+ op = emit_copy_less_than64(op, offset, 60);
+ len -= 60;
+ }
+
+ /* Emit remainder */
+ op = emit_copy_less_than64(op, offset, len);
+ return op;
+}
+
+/**
+ * snappy_uncompressed_length - return length of uncompressed output.
+ * @start: compressed buffer
+ * @n: length of compressed buffer.
+ * @result: Write the length of the uncompressed output here.
+ *
+ * Returns true when successful, otherwise false.
+ */
+bool snappy_uncompressed_length(const char *start, size_t n, size_t * result)
+{
+ u32 v = 0;
+ const char *limit = start + n;
+ if (varint_parse32_with_limit(start, limit, &v) != NULL) {
+ *result = v;
+ return true;
+ } else {
+ return false;
+ }
+}
+EXPORT_SYMBOL(snappy_uncompressed_length);
+
+#define kblock_log 15
+#define kblock_size (1 << kblock_log)
+
+/*
+ * This value could be halved or quartered to save memory
+ * at the cost of slightly worse compression.
+ */
+#define kmax_hash_table_bits 14
+#define kmax_hash_table_size (1U << kmax_hash_table_bits)
+
+/*
+ * Use smaller hash table when input.size() is smaller, since we
+ * fill the table, incurring O(hash table size) overhead for
+ * compression, and if the input is short, we won't need that
+ * many hash table entries anyway.
+ */
+static u16 *get_hash_table(struct snappy_env *env, size_t input_size,
+ int *table_size)
+{
+ unsigned htsize = 256;
+
+ DCHECK(kmax_hash_table_size >= 256);
+ while (htsize < kmax_hash_table_size && htsize < input_size)
+ htsize <<= 1;
+ CHECK_EQ(0, htsize & (htsize - 1));
+ CHECK_LE(htsize, kmax_hash_table_size);
+
+ u16 *table;
+ table = env->hash_table;
+
+ *table_size = htsize;
+ memset(table, 0, htsize * sizeof(*table));
+ return table;
+}
+
+/*
+ * Return the largest n such that
+ *
+ * s1[0,n-1] == s2[0,n-1]
+ * and n <= (s2_limit - s2).
+ *
+ * Does not read *s2_limit or beyond.
+ * Does not read *(s1 + (s2_limit - s2)) or beyond.
+ * Requires that s2_limit >= s2.
+ *
+ * Separate implementation for x86_64, for speed. Uses the fact that
+ * x86_64 is little endian.
+ */
+#if !defined(WORDS_BIGENDIAN) && BITS_PER_LONG == 64
+static inline int find_match_length(const char *s1,
+ const char *s2, const char *s2_limit)
+{
+ int matched = 0;
+
+ DCHECK_GE(s2_limit, s2);
+ /*
+ * Find out how long the match is. We loop over the data 64 bits at a
+ * time until we find a 64-bit block that doesn't match; then we find
+ * the first non-matching bit and use that to calculate the total
+ * length of the match.
+ */
+ while (likely(s2 <= s2_limit - 8)) {
+ if (unlikely
+ (UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched))) {
+ s2 += 8;
+ matched += 8;
+ } else {
+ /*
+ * On current (mid-2008) Opteron models there
+ * is a 3% more efficient code sequence to
+ * find the first non-matching byte. However,
+ * what follows is ~10% better on Intel Core 2
+ * and newer, and we expect AMD's bsf
+ * instruction to improve.
+ */
+ u64 x =
+ UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 +
+ matched);
+ int matching_bits = find_lsb_set_non_zero64(x);
+ matched += matching_bits >> 3;
+ return matched;
+ }
+ }
+ while (likely(s2 < s2_limit)) {
+ if (likely(s1[matched] == *s2)) {
+ ++s2;
+ ++matched;
+ } else {
+ return matched;
+ }
+ }
+ return matched;
+}
+#else
+static inline int find_match_length(const char *s1,
+ const char *s2, const char *s2_limit)
+{
+ /* Implementation based on the x86-64 version, above. */
+ DCHECK_GE(s2_limit, s2);
+ int matched = 0;
+
+ while (s2 <= s2_limit - 4 &&
+ UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
+ s2 += 4;
+ matched += 4;
+ }
+ if (is_little_endian() && s2 <= s2_limit - 4) {
+ u32 x =
+ UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
+ int matching_bits = find_lsb_set_non_zero(x);
+ matched += matching_bits >> 3;
+ } else {
+ while ((s2 < s2_limit) && (s1[matched] == *s2)) {
+ ++s2;
+ ++matched;
+ }
+ }
+ return matched;
+}
+#endif
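/*
 * Illustrative only: find_match_length() reports how many leading bytes of
 * the two sequences agree, never looking at or past s2_limit; the function
 * name below is made up for the example.
 */
static inline void find_match_length_example(void)
{
	const char a[] = "abcdefgh12345678";
	const char b[] = "abcdefgh1234XXXX";

	/* the first twelve bytes match, the thirteenth differs */
	assert(find_match_length(a, b, b + sizeof(b) - 1) == 12);
}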
+
+/*
+ * For 0 <= offset <= 4, GetU32AtOffset(UNALIGNED_LOAD64(p), offset) will
+ * equal UNALIGNED_LOAD32(p + offset). Motivation: On x86-64 hardware we have
+ * empirically found that overlapping loads such as
+ * UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
+ * are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to u32.
+ */
+static inline u32 get_u32_at_offset(u64 v, int offset)
+{
+ DCHECK(0 <= offset && offset <= 4);
+ return v >> (is_little_endian()? 8 * offset : 32 - 8 * offset);
+}
+
+/*
+ * Flat array compression that does not emit the "uncompressed length"
+ * prefix. Compresses "input" string to the "*op" buffer.
+ *
+ * REQUIRES: "input" is at most "kBlockSize" bytes long.
+ * REQUIRES: "op" points to an array of memory that is at least
+ * "MaxCompressedLength(input.size())" in size.
+ * REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
+ * REQUIRES: "table_size" is a power of two
+ *
+ * Returns an "end" pointer into "op" buffer.
+ * "end - op" is the compressed size of "input".
+ */
+
+static char *compress_fragment(const char *const input,
+ const size_t input_size,
+ char *op, u16 * table, const unsigned table_size)
+{
+ /* "ip" is the input pointer, and "op" is the output pointer. */
+ const char *ip = input;
+ CHECK_LE(input_size, kblock_size);
+ CHECK_EQ(table_size & (table_size - 1), 0);
+ const int shift = 32 - log2_floor(table_size);
+ DCHECK_EQ(UINT_MAX >> shift, table_size - 1);
+ const char *ip_end = input + input_size;
+ const char *baseip = ip;
+ /*
+ * Bytes in [next_emit, ip) will be emitted as literal bytes. Or
+ * [next_emit, ip_end) after the main loop.
+ */
+ const char *next_emit = ip;
+
+ const unsigned kinput_margin_bytes = 15;
+
+ if (likely(input_size >= kinput_margin_bytes)) {
+ const char *const ip_limit = input + input_size -
+ kinput_margin_bytes;
+
+ u32 next_hash;
+ for (next_hash = hash(++ip, shift);;) {
+ DCHECK_LT(next_emit, ip);
+/*
+ * The body of this loop calls EmitLiteral once and then EmitCopy one or
+ * more times. (The exception is that when we're close to exhausting
+ * the input we goto emit_remainder.)
+ *
+ * In the first iteration of this loop we're just starting, so
+ * there's nothing to copy, so calling EmitLiteral once is
+ * necessary. And we only start a new iteration when the
+ * current iteration has determined that a call to EmitLiteral will
+ * precede the next call to EmitCopy (if any).
+ *
+ * Step 1: Scan forward in the input looking for a 4-byte-long match.
+ * If we get close to exhausting the input then goto emit_remainder.
+ *
+ * Heuristic match skipping: If 32 bytes are scanned with no matches
+ * found, start looking only at every other byte. If 32 more bytes are
+ * scanned, look at every third byte, etc.. When a match is found,
+ * immediately go back to looking at every byte. This is a small loss
+ * (~5% performance, ~0.1% density) for compressible data due to more
+ * bookkeeping, but for non-compressible data (such as JPEG) it's a huge
+ * win since the compressor quickly "realizes" the data is incompressible
+ * and doesn't bother looking for matches everywhere.
+ *
+ * The "skip" variable keeps track of how many bytes there are since the
+ * last match; dividing it by 32 (ie. right-shifting by five) gives the
+ * number of bytes to move ahead for each iteration.
+ */
+ u32 skip_bytes = 32;
+
+ const char *next_ip = ip;
+ const char *candidate;
+ do {
+ ip = next_ip;
+ u32 hval = next_hash;
+ DCHECK_EQ(hval, hash(ip, shift));
+ u32 bytes_between_hash_lookups = skip_bytes++ >> 5;
+ next_ip = ip + bytes_between_hash_lookups;
+ if (unlikely(next_ip > ip_limit)) {
+ goto emit_remainder;
+ }
+ next_hash = hash(next_ip, shift);
+ candidate = baseip + table[hval];
+ DCHECK_GE(candidate, baseip);
+ DCHECK_LT(candidate, ip);
+
+ table[hval] = ip - baseip;
+ } while (likely(UNALIGNED_LOAD32(ip) !=
+ UNALIGNED_LOAD32(candidate)));
+
+/*
+ * Step 2: A 4-byte match has been found. We'll later see if more
+ * than 4 bytes match. But, prior to the match, input
+ * bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
+ */
+ DCHECK_LE(next_emit + 16, ip_end);
+ op = emit_literal(op, next_emit, ip - next_emit, true);
+
+/*
+ * Step 3: Call EmitCopy, and then see if another EmitCopy could
+ * be our next move. Repeat until we find no match for the
+ * input immediately after what was consumed by the last EmitCopy call.
+ *
+ * If we exit this loop normally then we need to call EmitLiteral next,
+ * though we don't yet know how big the literal will be. We handle that
+ * by proceeding to the next iteration of the main loop. We also can exit
+ * this loop via goto if we get close to exhausting the input.
+ */
+ u64 input_bytes = 0;
+ u32 candidate_bytes = 0;
+
+ do {
+/*
+ * We have a 4-byte match at ip, and no need to emit any
+ * "literal bytes" prior to ip.
+ */
+ const char *base = ip;
+ int matched = 4 +
+ find_match_length(candidate + 4, ip + 4,
+ ip_end);
+ ip += matched;
+ int offset = base - candidate;
+ DCHECK_EQ(0, memcmp(base, candidate, matched));
+ op = emit_copy(op, offset, matched);
+/*
+ * We could immediately start working at ip now, but to improve
+ * compression we first update table[Hash(ip - 1, ...)].
+ */
+ const char *insert_tail = ip - 1;
+ next_emit = ip;
+ if (unlikely(ip >= ip_limit)) {
+ goto emit_remainder;
+ }
+ input_bytes = UNALIGNED_LOAD64(insert_tail);
+ u32 prev_hash =
+ hash_bytes(get_u32_at_offset
+ (input_bytes, 0), shift);
+ table[prev_hash] = ip - baseip - 1;
+ u32 cur_hash =
+ hash_bytes(get_u32_at_offset
+ (input_bytes, 1), shift);
+ candidate = baseip + table[cur_hash];
+ candidate_bytes = UNALIGNED_LOAD32(candidate);
+ table[cur_hash] = ip - baseip;
+ } while (get_u32_at_offset(input_bytes, 1) ==
+ candidate_bytes);
+
+ next_hash =
+ hash_bytes(get_u32_at_offset(input_bytes, 2),
+ shift);
+ ++ip;
+ }
+ }
+
+emit_remainder:
+ /* Emit the remaining bytes as a literal */
+ if (next_emit < ip_end)
+ op = emit_literal(op, next_emit, ip_end - next_emit, false);
+
+ return op;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Lookup table for decompression code. Generated by ComputeTable() below.
+ * -----------------------------------------------------------------------
+ */
+
+/* Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits */
+static const u32 wordmask[] = {
+ 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
+};
+
+/*
+ * Data stored per entry in lookup table:
+ * Range Bits-used Description
+ * ------------------------------------
+ * 1..64 0..7 Literal/copy length encoded in opcode byte
+ * 0..7 8..10 Copy offset encoded in opcode byte / 256
+ * 0..4 11..13 Extra bytes after opcode
+ *
+ * We use eight bits for the length even though 7 would have sufficed
+ * because of efficiency reasons:
+ * (1) Extracting a byte is faster than a bit-field
+ * (2) It properly aligns copy offset so we do not need a <<8
+ */
+static const u16 char_table[256] = {
+ 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
+ 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
+ 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
+ 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
+ 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
+ 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
+ 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
+ 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
+ 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
+ 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
+ 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
+ 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
+ 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
+ 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
+ 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
+ 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
+ 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
+ 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
+ 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
+ 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
+ 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
+ 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
+ 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
+ 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
+ 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
+ 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
+ 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
+ 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
+ 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
+ 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
+ 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
+ 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+};
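/*
 * Illustrative only: unpacking one char_table entry the way
 * decompress_all_tags() below does.  Tag byte 0x01 is a one-byte-offset
 * copy of length 4, stored as 0x0804; the function name is made up.
 */
static inline void char_table_example(void)
{
	const u32 entry = char_table[0x01];

	assert((entry & 0xff) == 4);	/* copy length */
	assert((entry & 0x700) == 0);	/* copy offset / 256, pre-shifted */
	assert((entry >> 11) == 1);	/* extra bytes following the opcode */
}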
+
+struct snappy_decompressor {
+ struct source *reader; /* Underlying source of bytes to decompress */
+ const char *ip; /* Points to next buffered byte */
+ const char *ip_limit; /* Points just past buffered bytes */
+ u32 peeked; /* Bytes peeked from reader (need to skip) */
+ bool eof; /* Hit end of input without an error? */
+ char scratch[5]; /* Temporary buffer for peekfast boundaries */
+};
+
+static void
+init_snappy_decompressor(struct snappy_decompressor *d, struct source *reader)
+{
+ d->reader = reader;
+ d->ip = NULL;
+ d->ip_limit = NULL;
+ d->peeked = 0;
+ d->eof = false;
+}
+
+static void exit_snappy_decompressor(struct snappy_decompressor *d)
+{
+ skip(d->reader, d->peeked);
+}
+
+/*
+ * Read the uncompressed length stored at the start of the compressed data.
+ * On success, stores the length in *result and returns true.
+ * On failure, returns false.
+ */
+static bool read_uncompressed_length(struct snappy_decompressor *d,
+ u32 * result)
+{
+ DCHECK(d->ip == NULL); /*
+ * Must not have read anything yet
+ * Length is encoded in 1..5 bytes
+ */
+ *result = 0;
+ u32 shift = 0;
+ while (true) {
+ if (shift >= 32)
+ return false;
+ size_t n;
+ const char *ip = peek(d->reader, &n);
+ if (n == 0)
+ return false;
+ const unsigned char c = *(const unsigned char *)(ip);
+ skip(d->reader, 1);
+ *result |= (u32) (c & 0x7f) << shift;
+ if (c < 128) {
+ break;
+ }
+ shift += 7;
+ }
+ return true;
+}
+
+static bool refill_tag(struct snappy_decompressor *d);
+
+/*
+ * Process the next item found in the input.
+ * Returns true if successful, false on error or end of input.
+ */
+static void decompress_all_tags(struct snappy_decompressor *d,
+ struct writer *writer)
+{
+ const char *ip = d->ip;
+
+ /*
+ * We could have put this refill fragment only at the beginning of the loop.
+ * However, duplicating it at the end of each branch gives the compiler more
+ * scope to optimize the <ip_limit_ - ip> expression based on the local
+ * context, which overall increases speed.
+ */
+#define MAYBE_REFILL() \
+ if (d->ip_limit - ip < 5) { \
+ d->ip = ip; \
+ if (!refill_tag(d)) return; \
+ ip = d->ip; \
+ }
+
+
+ MAYBE_REFILL();
+ for (;;) {
+ if (d->ip_limit - ip < 5) {
+ d->ip = ip;
+ if (!refill_tag(d))
+ return;
+ ip = d->ip;
+ }
+
+ const unsigned char c = *(const unsigned char *)(ip++);
+
+ if ((c & 0x3) == LITERAL) {
+ u32 literal_length = (c >> 2) + 1;
+ if (writer_try_fast_append(writer, ip, d->ip_limit - ip,
+ literal_length)) {
+ DCHECK_LT(literal_length, 61);
+ ip += literal_length;
+ MAYBE_REFILL();
+ continue;
+ }
+ if (unlikely(literal_length >= 61)) {
+ /* Long literal */
+ const u32 literal_ll = literal_length - 60;
+ literal_length = (get_unaligned_le32(ip) &
+ wordmask[literal_ll]) + 1;
+ ip += literal_ll;
+ }
+
+ u32 avail = d->ip_limit - ip;
+ while (avail < literal_length) {
+ if (!writer_append(writer, ip, avail))
+ return;
+ literal_length -= avail;
+ skip(d->reader, d->peeked);
+ size_t n;
+ ip = peek(d->reader, &n);
+ avail = n;
+ d->peeked = avail;
+ if (avail == 0)
+ return; /* Premature end of input */
+ d->ip_limit = ip + avail;
+ }
+ if (!writer_append(writer, ip, literal_length))
+ return;
+ ip += literal_length;
+ MAYBE_REFILL();
+ } else {
+ const u32 entry = char_table[c];
+ const u32 trailer = get_unaligned_le32(ip) &
+ wordmask[entry >> 11];
+ const u32 length = entry & 0xff;
+ ip += entry >> 11;
+
+ /*
+ * copy_offset/256 is encoded in bits 8..10.
+ * By just fetching those bits, we get
+ * copy_offset (since the bit-field starts at
+ * bit 8).
+ */
+ const u32 copy_offset = entry & 0x700;
+ if (!writer_append_from_self(writer,
+ copy_offset + trailer,
+ length))
+ return;
+ MAYBE_REFILL();
+ }
+ }
+}
+
+#undef MAYBE_REFILL
+
+static bool refill_tag(struct snappy_decompressor *d)
+{
+ const char *ip = d->ip;
+
+ if (ip == d->ip_limit) {
+ size_t n;
+ /* Fetch a new fragment from the reader */
+ skip(d->reader, d->peeked); /* All peeked bytes are used up */
+ ip = peek(d->reader, &n);
+ d->peeked = n;
+ if (n == 0) {
+ d->eof = true;
+ return false;
+ }
+ d->ip_limit = ip + n;
+ }
+
+ /* Read the tag character */
+ DCHECK_LT(ip, d->ip_limit);
+ const unsigned char c = *(const unsigned char *)(ip);
+ const u32 entry = char_table[c];
+ const u32 needed = (entry >> 11) + 1; /* +1 byte for 'c' */
+ DCHECK_LE(needed, sizeof(d->scratch));
+
+ /* Read more bytes from reader if needed */
+ u32 nbuf = d->ip_limit - ip;
+
+ if (nbuf < needed) {
+ /*
+ * Stitch together bytes from ip and reader to form the word
+ * contents. We store the needed bytes in "scratch". They
+ * will be consumed immediately by the caller since we do not
+ * read more than we need.
+ */
+ memmove(d->scratch, ip, nbuf);
+ skip(d->reader, d->peeked); /* All peeked bytes are used up */
+ d->peeked = 0;
+ while (nbuf < needed) {
+ size_t length;
+ const char *src = peek(d->reader, &length);
+ if (length == 0)
+ return false;
+ u32 to_add = min_t(u32, needed - nbuf, length);
+ memcpy(d->scratch + nbuf, src, to_add);
+ nbuf += to_add;
+ skip(d->reader, to_add);
+ }
+ DCHECK_EQ(nbuf, needed);
+ d->ip = d->scratch;
+ d->ip_limit = d->scratch + needed;
+ } else if (nbuf < 5) {
+ /*
+ * Have enough bytes, but move into scratch so that we do not
+ * read past end of input
+ */
+ memmove(d->scratch, ip, nbuf);
+ skip(d->reader, d->peeked); /* All peeked bytes are used up */
+ d->peeked = 0;
+ d->ip = d->scratch;
+ d->ip_limit = d->scratch + nbuf;
+ } else {
+ /* Pass pointer to buffer returned by reader. */
+ d->ip = ip;
+ }
+ return true;
+}
+
+static int internal_uncompress(struct source *r,
+ struct writer *writer, u32 max_len)
+{
+ struct snappy_decompressor decompressor;
+ u32 uncompressed_len = 0;
+
+ init_snappy_decompressor(&decompressor, r);
+
+ if (!read_uncompressed_length(&decompressor, &uncompressed_len))
+ return -EIO;
+ /* Protect against possible DoS attack */
+ if ((u64) (uncompressed_len) > max_len)
+ return -EIO;
+
+ writer_set_expected_length(writer, uncompressed_len);
+
+ /* Process the entire input */
+ decompress_all_tags(&decompressor, writer);
+
+ exit_snappy_decompressor(&decompressor);
+ return (decompressor.eof && writer_check_length(writer)) ? 0 : -EIO;
+}
+
+static inline int compress(struct snappy_env *env, struct source *reader,
+ struct sink *writer)
+{
+ int err;
+ size_t written = 0;
+ int N = available(reader);
+ char ulength[kmax32];
+ char *p = varint_encode32(ulength, N);
+
+ append(writer, ulength, p - ulength);
+ written += (p - ulength);
+
+ while (N > 0) {
+ /* Get next block to compress (without copying if possible) */
+ size_t fragment_size;
+ const char *fragment = peek(reader, &fragment_size);
+ if (fragment_size == 0) {
+ err = -EIO;
+ goto out;
+ }
+ const unsigned num_to_read = min_t(int, N, kblock_size);
+ size_t bytes_read = fragment_size;
+
+ int pending_advance = 0;
+ if (bytes_read >= num_to_read) {
+ /* Buffer returned by reader is large enough */
+ pending_advance = num_to_read;
+ fragment_size = num_to_read;
+ }
+ else {
+ memcpy(env->scratch, fragment, bytes_read);
+ skip(reader, bytes_read);
+
+ while (bytes_read < num_to_read) {
+ fragment = peek(reader, &fragment_size);
+ size_t n =
+ min_t(size_t, fragment_size,
+ num_to_read - bytes_read);
+ memcpy((char *)(env->scratch) + bytes_read, fragment, n);
+ bytes_read += n;
+ skip(reader, n);
+ }
+ DCHECK_EQ(bytes_read, num_to_read);
+ fragment = env->scratch;
+ fragment_size = num_to_read;
+ }
+ if (fragment_size < num_to_read)
+ return -EIO;
+
+ /* Get encoding table for compression */
+ int table_size;
+ u16 *table = get_hash_table(env, num_to_read, &table_size);
+
+ /* Compress input_fragment and append to dest */
+ char *dest;
+ dest = sink_peek(writer, snappy_max_compressed_length(num_to_read));
+ if (!dest) {
+ /*
+ * Need a scratch buffer for the output,
+ * because the byte sink doesn't have enough
+ * in one piece.
+ */
+ dest = env->scratch_output;
+ }
+ char *end = compress_fragment(fragment, fragment_size,
+ dest, table, table_size);
+ append(writer, dest, end - dest);
+ written += (end - dest);
+
+ N -= num_to_read;
+ skip(reader, pending_advance);
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+#ifdef SG
+
+int snappy_compress_iov(struct snappy_env *env,
+ struct iovec *iov_in,
+ int iov_in_len,
+ size_t input_length,
+ struct iovec *iov_out,
+ int *iov_out_len,
+ size_t *compressed_length)
+{
+ struct source reader = {
+ .iov = iov_in,
+ .iovlen = iov_in_len,
+ .total = input_length
+ };
+ struct sink writer = {
+ .iov = iov_out,
+ .iovlen = *iov_out_len,
+ };
+ int err = compress(env, &reader, &writer);
+
+ *iov_out_len = writer.curvec + 1;
+
+ /* Compute how many bytes were added */
+ *compressed_length = writer.written;
+ return err;
+}
+EXPORT_SYMBOL(snappy_compress_iov);
+
+/**
+ * snappy_compress - Compress a buffer using the snappy compressor.
+ * @env: Preallocated environment
+ * @input: Input buffer
+ * @input_length: Length of input_buffer
+ * @compressed: Output buffer for compressed data
+ * @compressed_length: The real length of the output written here.
+ *
+ * Return 0 on success, otherwise a negative error code.
+ *
+ * The output buffer must be at least
+ * snappy_max_compressed_length(input_length) bytes long.
+ *
+ * Requires a preallocated environment from snappy_init_env.
+ * The environment does not keep state over individual calls
+ * of this function, just preallocates the memory.
+ */
+int snappy_compress(struct snappy_env *env,
+ const char *input,
+ size_t input_length,
+ char *compressed, size_t *compressed_length)
+{
+ struct iovec iov_in = {
+ .iov_base = (char *)input,
+ .iov_len = input_length,
+ };
+ struct iovec iov_out = {
+ .iov_base = compressed,
+ .iov_len = 0xffffffff,
+ };
+ int out = 1;
+ return snappy_compress_iov(env,
+ &iov_in, 1, input_length,
+ &iov_out, &out, compressed_length);
+}
+EXPORT_SYMBOL(snappy_compress);
+
+int snappy_uncompress_iov(struct iovec *iov_in, int iov_in_len,
+ size_t input_len, char *uncompressed)
+{
+ struct source reader = {
+ .iov = iov_in,
+ .iovlen = iov_in_len,
+ .total = input_len
+ };
+ struct writer output = {
+ .base = uncompressed,
+ .op = uncompressed
+ };
+ return internal_uncompress(&reader, &output, 0xffffffff);
+}
+EXPORT_SYMBOL(snappy_uncompress_iov);
+
+/**
+ * snappy_uncompress - Uncompress a snappy compressed buffer
+ * @compressed: Input buffer with compressed data
+ * @n: length of compressed buffer
+ * @uncompressed: buffer for uncompressed data
+ *
+ * The uncompressed data buffer must be at least
+ * snappy_uncompressed_length(compressed) bytes long.
+ *
+ * Return 0 on success, otherwise a negative error code.
+ */
+int snappy_uncompress(const char *compressed, size_t n, char *uncompressed)
+{
+ struct iovec iov = {
+ .iov_base = (char *)compressed,
+ .iov_len = n
+ };
+ return snappy_uncompress_iov(&iov, 1, n, uncompressed);
+}
+EXPORT_SYMBOL(snappy_uncompress);
+
+#else
+/**
+ * snappy_compress - Compress a buffer using the snappy compressor.
+ * @env: Preallocated environment
+ * @input: Input buffer
+ * @input_length: Length of input_buffer
+ * @compressed: Output buffer for compressed data
+ * @compressed_length: The real length of the output written here.
+ *
+ * Return 0 on success, otherwise a negative error code.
+ *
+ * The output buffer must be at least
+ * snappy_max_compressed_length(input_length) bytes long.
+ *
+ * Requires a preallocated environment from snappy_init_env.
+ * The environment does not keep state over individual calls
+ * of this function, just preallocates the memory.
+ */
+int snappy_compress(struct snappy_env *env,
+ const char *input,
+ size_t input_length,
+ char *compressed, size_t *compressed_length)
+{
+ struct source reader = {
+ .ptr = input,
+ .left = input_length
+ };
+ struct sink writer = {
+ .dest = compressed,
+ };
+ int err = compress(env, &reader, &writer);
+
+ /* Compute how many bytes were added */
+ *compressed_length = (writer.dest - compressed);
+ return err;
+}
+EXPORT_SYMBOL(snappy_compress);
+
+/**
+ * snappy_uncompress - Uncompress a snappy compressed buffer
+ * @compressed: Input buffer with compressed data
+ * @n: length of compressed buffer
+ * @uncompressed: buffer for uncompressed data
+ *
+ * The uncompressed data buffer must be at least
+ * snappy_uncompressed_length(compressed) bytes long.
+ *
+ * Return 0 on success, otherwise a negative error code.
+ */
+int snappy_uncompress(const char *compressed, size_t n, char *uncompressed)
+{
+ struct source reader = {
+ .ptr = compressed,
+ .left = n
+ };
+ struct writer output = {
+ .base = uncompressed,
+ .op = uncompressed
+ };
+ return internal_uncompress(&reader, &output, 0xffffffff);
+}
+EXPORT_SYMBOL(snappy_uncompress);
+#endif
+
+#ifdef SG
+/**
+ * snappy_init_env_sg - Allocate snappy compression environment
+ * @env: Environment to preallocate
+ * @sg: Whether the environment will ever be used for scatter/gather input
+ *
+ * If false is passed to sg then multiple entries in an iovec
+ * are not legal.
+ * Returns 0 on success, otherwise negative errno.
+ * Must run in process context.
+ */
+int snappy_init_env_sg(struct snappy_env *env, bool sg)
+{
+ env->hash_table = vmalloc(sizeof(u16) * kmax_hash_table_size);
+ if (!env->hash_table)
+ goto error;
+ if (sg) {
+ env->scratch = vmalloc(kblock_size);
+ if (!env->scratch)
+ goto error;
+ env->scratch_output =
+ vmalloc(snappy_max_compressed_length(kblock_size));
+ if (!env->scratch_output)
+ goto error;
+ }
+ return 0;
+error:
+ snappy_free_env(env);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(snappy_init_env_sg);
+#endif
+
+/**
+ * snappy_init_env - Allocate snappy compression environment
+ * @env: Environment to preallocate
+ *
+ * Passing multiple entries in an iovec is not allowed
+ * on the environment allocated here.
+ * Returns 0 on success, otherwise negative errno.
+ * Must run in process context.
+ */
+int snappy_init_env(struct snappy_env *env)
+{
+ env->hash_table = vmalloc(sizeof(u16) * kmax_hash_table_size);
+ if (!env->hash_table)
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL(snappy_init_env);
+
+/**
+ * snappy_free_env - Free a snappy compression environment
+ * @env: Environment to free.
+ *
+ * Must run in process context.
+ */
+void snappy_free_env(struct snappy_env *env)
+{
+ vfree(env->hash_table);
+#ifdef SG
+ vfree(env->scratch);
+ vfree(env->scratch_output);
+#endif
+ memset(env, 0, sizeof(struct snappy_env));
+}
+EXPORT_SYMBOL(snappy_free_env);
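/*
 * Illustrative only: how a caller is expected to string the public entry
 * points together; the function name and the use of plain malloc/free are
 * choices made for the example, not part of the API.  Buffer sizes follow
 * the documented requirements above.
 */
static int snappy_roundtrip_example(const char *input, size_t input_len)
{
	struct snappy_env env;
	size_t compressed_len = 0;
	size_t uncompressed_len = 0;
	char *compressed;
	int err = -1;

	if (snappy_init_env(&env) < 0)
		return -1;

	compressed = malloc(snappy_max_compressed_length(input_len));
	if (compressed &&
	    snappy_compress(&env, input, input_len,
			    compressed, &compressed_len) == 0 &&
	    snappy_uncompressed_length(compressed, compressed_len,
				       &uncompressed_len) &&
	    uncompressed_len == input_len)
	{
		char *restored = malloc(uncompressed_len);

		if (restored &&
		    snappy_uncompress(compressed, compressed_len,
				      restored) == 0)
			err = memcmp(restored, input, input_len) == 0 ? 0 : -1;
		free(restored);
	}

	free(compressed);
	snappy_free_env(&env);
	return err;
}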
diff --git a/src/include/common/snappy/snappy.h b/src/include/common/snappy/snappy.h
new file mode 100644
index 0000000..f650f03
--- /dev/null
+++ b/src/include/common/snappy/snappy.h
@@ -0,0 +1,34 @@
+#ifndef _LINUX_SNAPPY_H
+#define _LINUX_SNAPPY_H 1
+
+/* Only needed for compression. This preallocates the worst case */
+struct snappy_env {
+ unsigned short *hash_table;
+ void *scratch;
+ void *scratch_output;
+};
+
+struct iovec;
+int snappy_init_env(struct snappy_env *env);
+int snappy_init_env_sg(struct snappy_env *env, bool sg);
+void snappy_free_env(struct snappy_env *env);
+int snappy_uncompress_iov(struct iovec *iov_in, int iov_in_len,
+ size_t input_len, char *uncompressed);
+int snappy_uncompress(const char *compressed, size_t n, char *uncompressed);
+int snappy_compress(struct snappy_env *env,
+ const char *input,
+ size_t input_length,
+ char *compressed,
+ size_t *compressed_length);
+int snappy_compress_iov(struct snappy_env *env,
+ struct iovec *iov_in,
+ int iov_in_len,
+ size_t input_length,
+ struct iovec *iov_out,
+ int *iov_out_len,
+ size_t *compressed_length);
+bool snappy_uncompressed_length(const char *buf, size_t len, size_t *result);
+size_t snappy_max_compressed_length(size_t source_len);
+
+
+#endif
--
1.8.3.1
0002-Add-lz4-compression-algorithm-to-contrib.patch (text/x-patch; charset=us-ascii)
>From 57af3886bc31fbaf82e6e868f44b1b54be18b39b Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 21 Jun 2013 00:12:39 +0200
Subject: [PATCH 2/3] Add lz4 compression algorithm to /contrib
Copied from http://code.google.com/p/lz4/
Code is licensed under a 2-clause BSD license and copyrighted by Yann Collet.
---
src/common/Makefile | 2 +-
src/common/lz4/Makefile | 29 ++
src/common/lz4/lz4.c | 689 +++++++++++++++++++++++++++++++++++++++++++
src/common/lz4/lz4_encoder.h | 258 ++++++++++++++++
src/common/lz4/xxhash.c | 477 ++++++++++++++++++++++++++++++
src/common/lz4/xxhash.h | 164 ++++++++++
src/include/common/lz4/lz4.h | 165 +++++++++++
7 files changed, 1783 insertions(+), 1 deletion(-)
create mode 100644 src/common/lz4/Makefile
create mode 100644 src/common/lz4/lz4.c
create mode 100644 src/common/lz4/lz4_encoder.h
create mode 100644 src/common/lz4/xxhash.c
create mode 100644 src/common/lz4/xxhash.h
create mode 100644 src/include/common/lz4/lz4.h
diff --git a/src/common/Makefile b/src/common/Makefile
index b53e4df..606a34c 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -23,7 +23,7 @@ include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -DFRONTEND $(CPPFLAGS)
LIBS += $(PTHREAD_LIBS)
-SUBDIRS = snappy
+SUBDIRS = snappy lz4
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/common/lz4/Makefile b/src/common/lz4/Makefile
new file mode 100644
index 0000000..9fe9552
--- /dev/null
+++ b/src/common/lz4/Makefile
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for common/lz4
+#
+# IDENTIFICATION
+# src/common/lz4/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/common/lz4
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+override CPPFLAGS := -Wno-missing-declarations -Wno-missing-prototypes $(CPPFLAGS)
+override CPPFLAGS := -DFRONTEND $(CPPFLAGS)
+
+OBJS = lz4.o xxhash.o
+OBJS_SRV = $(OBJS:%.o=%_srv.o)
+
+include $(top_srcdir)/src/backend/common.mk
+
+%_srv.o: %.c %.o
+ $(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@
+
+clean distclean maintainer-clean:
+ rm -f $(OBJS_SRV)
+
+all: $(OBJS_SRV)
diff --git a/src/common/lz4/lz4.c b/src/common/lz4/lz4.c
new file mode 100644
index 0000000..ce36239
--- /dev/null
+++ b/src/common/lz4/lz4.c
@@ -0,0 +1,689 @@
+/*
+ LZ4 - Fast LZ compression algorithm
+ Copyright (C) 2011-2013, Yann Collet.
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ - LZ4 source repository : http://code.google.com/p/lz4/
+*/
+
+/*
+Note : this source file requires "lz4_encoder.h"
+*/
+
+//**************************************
+// Tuning parameters
+//**************************************
+// MEMORY_USAGE :
+// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+// Increasing memory usage improves compression ratio
+// Reduced memory usage can improve speed, due to cache effect
+// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
+#define MEMORY_USAGE 14
+
+// HEAPMODE :
+// Select how default compression function will allocate memory for its hash table,
+// in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)).
+// Default allocation strategy is to use stack (HEAPMODE 0)
+// Note : explicit functions *_stack* and *_heap* are unaffected by this setting
+#define HEAPMODE 0
+
+// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE :
+// This will provide a small boost to performance for big endian cpu, but the resulting compressed stream will be incompatible with little-endian CPU.
+// You can set this option to 1 in situations where data will remain within closed environment
+// This option is useless on Little_Endian CPU (such as x86)
+//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1
+
+
+
+//**************************************
+// CPU Feature Detection
+//**************************************
+// 32 or 64 bits ?
+#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \
+ || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \
+ || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \
+ || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) // Detects 64 bits mode
+# define LZ4_ARCH64 1
+#else
+# define LZ4_ARCH64 0
+#endif
+
+// Little Endian or Big Endian ?
+// Overwrite the #define below if you know your architecture endianess
+#if defined (__GLIBC__)
+# include <endian.h>
+# if (__BYTE_ORDER == __BIG_ENDIAN)
+# define LZ4_BIG_ENDIAN 1
+# endif
+#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
+# define LZ4_BIG_ENDIAN 1
+#elif defined(__sparc) || defined(__sparc__) \
+ || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
+ || defined(__hpux) || defined(__hppa) \
+ || defined(_MIPSEB) || defined(__s390__)
+# define LZ4_BIG_ENDIAN 1
+#else
+// Little Endian assumed. PDP Endian and other very rare endian format are unsupported.
+#endif
+
+// Unaligned memory access is automatically enabled for "common" CPU, such as x86.
+// For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected
+// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance
+#if defined(__ARM_FEATURE_UNALIGNED)
+# define LZ4_FORCE_UNALIGNED_ACCESS 1
+#endif
+
+// Define this parameter if your target system or compiler does not support hardware bit count
+#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count
+# define LZ4_FORCE_SW_BITCOUNT
+#endif
+
+
+//**************************************
+// Compiler Options
+//**************************************
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99
+/* "restrict" is a known keyword */
+#else
+# define restrict // Disable restrict
+#endif
+
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifdef _MSC_VER // Visual Studio
+# include <intrin.h> // For Visual 2005
+# if LZ4_ARCH64 // 64-bit
+# pragma intrinsic(_BitScanForward64) // For Visual 2005
+# pragma intrinsic(_BitScanReverse64) // For Visual 2005
+# else
+# pragma intrinsic(_BitScanForward) // For Visual 2005
+# pragma intrinsic(_BitScanReverse) // For Visual 2005
+# endif
+# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant
+#endif
+
+#ifdef _MSC_VER
+# define lz4_bswap16(x) _byteswap_ushort(x)
+#else
+# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8)))
+#endif
+
+#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
+# define expect(expr,value) (__builtin_expect ((expr),(value)) )
+#else
+# define expect(expr,value) (expr)
+#endif
+
+#define likely(expr) expect((expr) != 0, 1)
+#define unlikely(expr) expect((expr) != 0, 0)
+
+
+//**************************************
+// Includes
+//**************************************
+#include <stdlib.h> // for malloc
+#include <string.h> // for memset
+#include "common/lz4/lz4.h"
+
+
+//**************************************
+// Basic Types
+//**************************************
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99
+# include <stdint.h>
+ typedef uint8_t BYTE;
+ typedef uint16_t U16;
+ typedef uint32_t U32;
+ typedef int32_t S32;
+ typedef uint64_t U64;
+#else
+ typedef unsigned char BYTE;
+ typedef unsigned short U16;
+ typedef unsigned int U32;
+ typedef signed int S32;
+ typedef unsigned long long U64;
+#endif
+
+#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS)
+# define _PACKED __attribute__ ((packed))
+#else
+# define _PACKED
+#endif
+
+#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+# pragma pack(push, 1)
+#endif
+
+typedef struct _U16_S { U16 v; } _PACKED U16_S;
+typedef struct _U32_S { U32 v; } _PACKED U32_S;
+typedef struct _U64_S { U64 v; } _PACKED U64_S;
+
+#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+# pragma pack(pop)
+#endif
+
+#define A64(x) (((U64_S *)(x))->v)
+#define A32(x) (((U32_S *)(x))->v)
+#define A16(x) (((U16_S *)(x))->v)
+
+
+//**************************************
+// Constants
+//**************************************
+#define HASHTABLESIZE (1 << MEMORY_USAGE)
+
+#define MINMATCH 4
+
+#define COPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH+MINMATCH)
+#define MINLENGTH (MFLIMIT+1)
+
+#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT-1))
+#define SKIPSTRENGTH 6 // Increasing this value will make the compression run slower on incompressible data
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define ML_BITS 4
+#define ML_MASK ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+
+//**************************************
+// Architecture-specific macros
+//**************************************
+#if LZ4_ARCH64 // 64-bit
+# define STEPSIZE 8
+# define UARCH U64
+# define AARCH A64
+# define LZ4_COPYSTEP(s,d) A64(d) = A64(s); d+=8; s+=8;
+# define LZ4_COPYPACKET(s,d) LZ4_COPYSTEP(s,d)
+# define LZ4_SECURECOPY(s,d,e) if (d<e) LZ4_WILDCOPY(s,d,e)
+# define HTYPE U32
+# define INITBASE(base) const BYTE* const base = ip
+#else // 32-bit
+# define STEPSIZE 4
+# define UARCH U32
+# define AARCH A32
+# define LZ4_COPYSTEP(s,d) A32(d) = A32(s); d+=4; s+=4;
+# define LZ4_COPYPACKET(s,d) LZ4_COPYSTEP(s,d); LZ4_COPYSTEP(s,d);
+# define LZ4_SECURECOPY LZ4_WILDCOPY
+# define HTYPE const BYTE*
+# define INITBASE(base) const int base = 0
+#endif
+
+#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
+# define LZ4_READ_LITTLEENDIAN_16(d,s,p) { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
+# define LZ4_WRITE_LITTLEENDIAN_16(p,i) { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p+=2; }
+#else // Little Endian
+# define LZ4_READ_LITTLEENDIAN_16(d,s,p) { d = (s) - A16(p); }
+# define LZ4_WRITE_LITTLEENDIAN_16(p,v) { A16(p) = v; p+=2; }
+#endif
+
+
+//**************************************
+// Macros
+//**************************************
+#define LZ4_WILDCOPY(s,d,e) do { LZ4_COPYPACKET(s,d) } while (d<e);
+#define LZ4_BLINDCOPY(s,d,l) { BYTE* e=(d)+(l); LZ4_WILDCOPY(s,d,e); d=e; }
+
+
+//****************************
+// Private functions
+//****************************
+#if LZ4_ARCH64
+
+static inline int LZ4_NbCommonBytes (register U64 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+ #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r = 0;
+ _BitScanReverse64( &r, val );
+ return (int)(r>>3);
+ #elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clzll(val) >> 3);
+ #else
+ int r;
+ if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
+ if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+ r += (!val);
+ return r;
+ #endif
+#else
+ #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r = 0;
+ _BitScanForward64( &r, val );
+ return (int)(r>>3);
+ #elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctzll(val) >> 3);
+ #else
+ static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+ return DeBruijnBytePos[((U64)((val & -val) * 0x0218A392CDABBD3F)) >> 58];
+ #endif
+#endif
+}
+
+#else
+
+static inline int LZ4_NbCommonBytes (register U32 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r = 0;
+ _BitScanReverse( &r, val );
+ return (int)(r>>3);
+# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clz(val) >> 3);
+# else
+ int r;
+ if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+ r += (!val);
+ return r;
+# endif
+#else
+# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r;
+ _BitScanForward( &r, val );
+ return (int)(r>>3);
+# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctz(val) >> 3);
+# else
+ static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+ return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+# endif
+#endif
+}
+
+#endif
+
+
+
+//******************************
+// Compression functions
+//******************************
+
+/*
+int LZ4_compress_stack(
+ const char* source,
+ char* dest,
+ int inputSize)
+
+Compress 'inputSize' bytes from 'source' into an output buffer 'dest'.
+Destination buffer must be already allocated, and sized at a minimum of LZ4_compressBound(inputSize).
+return : the number of bytes written in buffer 'dest'
+*/
+#define FUNCTION_NAME LZ4_compress_stack
+#include "lz4_encoder.h"
+
+
+/*
+int LZ4_compress_stack_limitedOutput(
+ const char* source,
+ char* dest,
+ int inputSize,
+ int maxOutputSize)
+
+Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'.
+If it cannot achieve it, compression will stop, and result of the function will be zero.
+return : the number of bytes written in buffer 'dest', or 0 if the compression fails
+*/
+#define FUNCTION_NAME LZ4_compress_stack_limitedOutput
+#define LIMITED_OUTPUT
+#include "lz4_encoder.h"
+
+
+/*
+int LZ4_compress64k_stack(
+ const char* source,
+ char* dest,
+ int inputSize)
+
+Compress 'inputSize' bytes from 'source' into an output buffer 'dest'.
+This function compresses better than LZ4_compress_stack(), on the condition that
+'inputSize' is less than LZ4_64KLIMIT; otherwise the function will fail.
+Destination buffer must be already allocated, and sized at a minimum of LZ4_compressBound(inputSize).
+return : the number of bytes written in buffer 'dest', or 0 if compression fails
+*/
+#define FUNCTION_NAME LZ4_compress64k_stack
+#define COMPRESS_64K
+#include "lz4_encoder.h"
+
+
+/*
+int LZ4_compress64k_stack_limitedOutput(
+ const char* source,
+ char* dest,
+ int inputSize,
+ int maxOutputSize)
+
+Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'.
+This function compresses better than LZ4_compress_stack_limitedOutput(), on the condition that
+'inputSize' is less than LZ4_64KLIMIT; otherwise the function will fail.
+If it cannot achieve it, compression will stop, and result of the function will be zero.
+return : the number of bytes written in buffer 'dest', or 0 if the compression fails
+*/
+#define FUNCTION_NAME LZ4_compress64k_stack_limitedOutput
+#define COMPRESS_64K
+#define LIMITED_OUTPUT
+#include "lz4_encoder.h"
+
+
+/*
+void* LZ4_createHeapMemory();
+int LZ4_freeHeapMemory(void* ctx);
+
+Used to allocate and free hashTable memory
+to be used by the LZ4_compress_heap* family of functions.
+LZ4_createHeapMemory() returns NULL if memory allocation fails.
+*/
+void* LZ4_create() { return malloc(HASHTABLESIZE); }
+int LZ4_free(void* ctx) { free(ctx); return 0; }
+
+
+/*
+int LZ4_compress_heap(
+ void* ctx,
+ const char* source,
+ char* dest,
+ int inputSize)
+
+Compress 'inputSize' bytes from 'source' into an output buffer 'dest'.
+The memory used for compression must be created by LZ4_createHeapMemory() and provided by pointer 'ctx'.
+Destination buffer must be already allocated, and sized at a minimum of LZ4_compressBound(inputSize).
+return : the number of bytes written in buffer 'dest'
+*/
+#define FUNCTION_NAME LZ4_compress_heap
+#define USE_HEAPMEMORY
+#include "lz4_encoder.h"
+
+
+/*
+int LZ4_compress_heap_limitedOutput(
+ void* ctx,
+ const char* source,
+ char* dest,
+ int inputSize,
+ int maxOutputSize)
+
+Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'.
+If it cannot achieve it, compression will stop, and result of the function will be zero.
+The memory used for compression must be created by LZ4_createHeapMemory() and provided by pointer 'ctx'.
+return : the number of bytes written in buffer 'dest', or 0 if the compression fails
+*/
+#define FUNCTION_NAME LZ4_compress_heap_limitedOutput
+#define LIMITED_OUTPUT
+#define USE_HEAPMEMORY
+#include "lz4_encoder.h"
+
+
+/*
+int LZ4_compress64k_heap(
+ void* ctx,
+ const char* source,
+ char* dest,
+ int inputSize)
+
+Compress 'inputSize' bytes from 'source' into an output buffer 'dest'.
+The memory used for compression must be created by LZ4_createHeapMemory() and provided by pointer 'ctx'.
+'inputSize' must be less than LZ4_64KLIMIT, or the function will fail.
+Destination buffer must be already allocated, and sized at a minimum of LZ4_compressBound(inputSize).
+return : the number of bytes written in buffer 'dest'
+*/
+#define FUNCTION_NAME LZ4_compress64k_heap
+#define COMPRESS_64K
+#define USE_HEAPMEMORY
+#include "lz4_encoder.h"
+
+
+/*
+int LZ4_compress64k_heap_limitedOutput(
+ void* ctx,
+ const char* source,
+ char* dest,
+ int inputSize,
+ int maxOutputSize)
+
+Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'.
+If it cannot achieve it, compression will stop, and result of the function will be zero.
+The memory used for compression must be created by LZ4_createHeapMemory() and provided by pointer 'ctx'.
+'inputSize' must be less than LZ4_64KLIMIT, or the function will fail.
+return : the number of bytes written in buffer 'dest', or 0 if the compression fails
+*/
+#define FUNCTION_NAME LZ4_compress64k_heap_limitedOutput
+#define COMPRESS_64K
+#define LIMITED_OUTPUT
+#define USE_HEAPMEMORY
+#include "lz4_encoder.h"
+
+
+int LZ4_compress(const char* source, char* dest, int inputSize)
+{
+#if HEAPMODE
+ void* ctx = LZ4_create();
+ int result;
+ if (ctx == NULL) return 0; // Failed allocation => compression not done
+ if (inputSize < LZ4_64KLIMIT)
+ result = LZ4_compress64k_heap(ctx, source, dest, inputSize);
+ else result = LZ4_compress_heap(ctx, source, dest, inputSize);
+ LZ4_free(ctx);
+ return result;
+#else
+ if (inputSize < (int)LZ4_64KLIMIT) return LZ4_compress64k_stack(source, dest, inputSize);
+ return LZ4_compress_stack(source, dest, inputSize);
+#endif
+}
+
+
+int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+#if HEAPMODE
+ void* ctx = LZ4_create();
+ int result;
+ if (ctx == NULL) return 0; // Failed allocation => compression not done
+ if (inputSize < LZ4_64KLIMIT)
+ result = LZ4_compress64k_heap_limitedOutput(ctx, source, dest, inputSize, maxOutputSize);
+ else result = LZ4_compress_heap_limitedOutput(ctx, source, dest, inputSize, maxOutputSize);
+ LZ4_free(ctx);
+ return result;
+#else
+ if (inputSize < (int)LZ4_64KLIMIT) return LZ4_compress64k_stack_limitedOutput(source, dest, inputSize, maxOutputSize);
+ return LZ4_compress_stack_limitedOutput(source, dest, inputSize, maxOutputSize);
+#endif
+}
+
+
+//****************************
+// Decompression functions
+//****************************
+
+typedef enum { noPrefix = 0, withPrefix = 1 } prefix64k_directive;
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } end_directive;
+typedef enum { full = 0, partial = 1 } exit_directive;
+
+
+// This generic decompression function covers all use cases.
+// It shall be instantiated several times, using different sets of directives
+// Note that it is essential this generic function is really inlined,
+// in order to remove useless branches during compilation optimisation.
+static inline int LZ4_decompress_generic(
+ const char* source,
+ char* dest,
+ int inputSize, //
+ int outputSize, // OutputSize must be != 0; if endOnInput==endOnInputSize, this value is the max size of Output Buffer.
+
+ int endOnInput, // endOnOutputSize, endOnInputSize
+ int prefix64k, // noPrefix, withPrefix
+ int partialDecoding, // full, partial
+ int targetOutputSize // only used if partialDecoding==partial
+ )
+{
+ // Local Variables
+ const BYTE* restrict ip = (const BYTE*) source;
+ const BYTE* ref;
+ const BYTE* const iend = ip + inputSize;
+
+ BYTE* op = (BYTE*) dest;
+ BYTE* const oend = op + outputSize;
+ BYTE* cpy;
+ BYTE* oexit = op + targetOutputSize;
+
+ size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+ size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+#endif
+
+
+ // Special case
+ if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; // targetOutputSize too large, better decode everything
+ if unlikely(outputSize==0) goto _output_error; // Empty output buffer
+
+
+ // Main Loop
+ while (1)
+ {
+ unsigned token;
+ size_t length;
+
+ // get runlength
+ token = *ip++;
+ if ((length=(token>>ML_BITS)) == RUN_MASK)
+ {
+ unsigned s=255;
+ while (((endOnInput)?ip<iend:1) && (s==255))
+ {
+ s = *ip++;
+ length += s;
+ }
+ }
+
+ // copy literals
+ cpy = op+length;
+ if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
+ || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
+ {
+ if (partialDecoding)
+ {
+ if (cpy > oend) goto _output_error; // Error : write attempt beyond end of output buffer
+ if ((endOnInput) && (ip+length > iend)) goto _output_error; // Error : read attempt beyond end of input buffer
+ }
+ else
+ {
+ if ((!endOnInput) && (cpy != oend)) goto _output_error; // Error : block decoding must stop exactly there, due to parsing restrictions
+ if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; // Error : not enough place for another match (min 4) + 5 literals
+ }
+ memcpy(op, ip, length);
+ ip += length;
+ op += length;
+ break; // Necessarily EOF, due to parsing restrictions
+ }
+ LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy;
+
+ // get offset
+ LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2;
+ if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset outside destination buffer
+
+ // get matchlength
+ if ((length=(token&ML_MASK)) == ML_MASK)
+ {
+ while (endOnInput ? ip<iend-(LASTLITERALS+1) : 1) // A minimum nb of input bytes must remain for LASTLITERALS + token
+ {
+ unsigned s = *ip++;
+ length += s;
+ if (s==255) continue;
+ break;
+ }
+ }
+
+ // copy repeated sequence
+ if unlikely((op-ref)<STEPSIZE)
+ {
+#if LZ4_ARCH64
+ size_t dec64 = dec64table[op-ref];
+#else
+ const size_t dec64 = 0;
+#endif
+ op[0] = ref[0];
+ op[1] = ref[1];
+ op[2] = ref[2];
+ op[3] = ref[3];
+ op += 4, ref += 4; ref -= dec32table[op-ref];
+ A32(op) = A32(ref);
+ op += STEPSIZE-4; ref -= dec64;
+ } else { LZ4_COPYSTEP(ref,op); }
+ cpy = op + length - (STEPSIZE-4);
+
+ if unlikely(cpy>oend-(COPYLENGTH)-(STEPSIZE-4))
+ {
+ if (cpy > oend-LASTLITERALS) goto _output_error; // Error : last 5 bytes must be literals
+ LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH));
+ while(op<cpy) *op++=*ref++;
+ op=cpy;
+ continue;
+ }
+ LZ4_WILDCOPY(ref, op, cpy);
+ op=cpy; // correction
+ }
+
+ // end of decoding
+ if (endOnInput)
+ return (int) (((char*)op)-dest); // Nb of output bytes decoded
+ else
+ return (int) (((char*)ip)-source); // Nb of input bytes read
+
+ // Overflow error detected
+_output_error:
+ return (int) (-(((char*)ip)-source))-1;
+}
+
+
+int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+ return LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, noPrefix, full, 0);
+}
+
+int LZ4_decompress_fast(const char* source, char* dest, int outputSize)
+{
+ return LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, noPrefix, full, 0);
+}
+
+int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+ return LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, withPrefix, full, 0);
+}
+
+int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int outputSize)
+{
+ return LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, withPrefix, full, 0);
+}
+
+int LZ4_decompress_safe_partial(const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize)
+{
+ return LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, noPrefix, partial, targetOutputSize);
+}
diff --git a/src/common/lz4/lz4_encoder.h b/src/common/lz4/lz4_encoder.h
new file mode 100644
index 0000000..94ebda1
--- /dev/null
+++ b/src/common/lz4/lz4_encoder.h
@@ -0,0 +1,258 @@
+/*
+ LZ4 Encoder - Part of LZ4 compression algorithm
+ Copyright (C) 2011-2013, Yann Collet.
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ - LZ4 source repository : http://code.google.com/p/lz4/
+*/
+
+/* lz4_encoder.h must be included into lz4.c
+ The objective of this file is to create a single LZ4 compression function source
+ which will be instantiated multiple times with minor variations
+ depending on a set of #define.
+*/
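As an illustration of this template mechanism (the real instantiations follow in lz4.c), a hypothetical extra variant would be generated like the sketch below; the name LZ4_compress_example is made up:

/* define the knobs, then include the template; lz4_encoder.h
   #undef's FUNCTION_NAME and the optional switches at its end */
#define FUNCTION_NAME LZ4_compress_example
#define LIMITED_OUTPUT
#define USE_HEAPMEMORY
#include "lz4_encoder.h"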
+
+
+
+//****************************
+// Check required defines
+//****************************
+
+#ifndef FUNCTION_NAME
+# error "FUNTION_NAME is not defined"
+#endif
+
+
+//****************************
+// Local definitions
+//****************************
+
+#ifdef COMPRESS_64K
+# define HASHLOG (MEMORY_USAGE-1)
+# define CURRENT_H_TYPE U16
+# define CURRENTBASE(base) const BYTE* const base = ip
+#else
+# define HASHLOG (MEMORY_USAGE-2)
+# define CURRENT_H_TYPE HTYPE
+# define CURRENTBASE(base) INITBASE(base)
+#endif
+
+#define HASHTABLE_NBCELLS (1U<<HASHLOG)
+#define LZ4_HASH(i) (((i) * 2654435761U) >> ((MINMATCH*8)-HASHLOG))
+#define LZ4_HASHVALUE(p) LZ4_HASH(A32(p))
+
+
+
+//****************************
+// Function code
+//****************************
+
+int FUNCTION_NAME(
+#ifdef USE_HEAPMEMORY
+ void* ctx,
+#endif
+ const char* source,
+ char* dest,
+ int inputSize
+#ifdef LIMITED_OUTPUT
+ ,int maxOutputSize
+#endif
+ )
+{
+#ifdef USE_HEAPMEMORY
+ CURRENT_H_TYPE* HashTable = (CURRENT_H_TYPE*)ctx;
+#else
+ CURRENT_H_TYPE HashTable[HASHTABLE_NBCELLS] = {0};
+#endif
+
+ const BYTE* ip = (BYTE*) source;
+ CURRENTBASE(base);
+ const BYTE* anchor = ip;
+ const BYTE* const iend = ip + inputSize;
+ const BYTE* const mflimit = iend - MFLIMIT;
+#define matchlimit (iend - LASTLITERALS)
+
+ BYTE* op = (BYTE*) dest;
+#ifdef LIMITED_OUTPUT
+ BYTE* const oend = op + maxOutputSize;
+#endif
+
+ int length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+
+ // Init
+ if (inputSize<MINLENGTH) goto _last_literals;
+#ifdef COMPRESS_64K
+ if (inputSize>LZ4_64KLIMIT) return 0; // Size too large (not within 64K limit)
+#endif
+#ifdef USE_HEAPMEMORY
+ memset((void*)HashTable, 0, HASHTABLESIZE);
+#endif
+
+ // First Byte
+ HashTable[LZ4_HASHVALUE(ip)] = (CURRENT_H_TYPE)(ip - base);
+ ip++; forwardH = LZ4_HASHVALUE(ip);
+
+ // Main Loop
+ for ( ; ; )
+ {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE* forwardIp = ip;
+ const BYTE* ref;
+ BYTE* token;
+
+ // Find a match
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if unlikely(forwardIp > mflimit) { goto _last_literals; }
+
+ forwardH = LZ4_HASHVALUE(forwardIp);
+ ref = base + HashTable[h];
+ HashTable[h] = (CURRENT_H_TYPE)(ip - base);
+
+ } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
+
+ // Catch up
+ while ((ip>anchor) && (ref>(BYTE*)source) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; }
+
+ // Encode Literal length
+ length = (int)(ip - anchor);
+ token = op++;
+#ifdef LIMITED_OUTPUT
+ if unlikely(op + length + (2 + 1 + LASTLITERALS) + (length>>8) > oend) return 0; // Check output limit
+#endif
+ if (length>=(int)RUN_MASK)
+ {
+ int len = length-RUN_MASK;
+ *token=(RUN_MASK<<ML_BITS);
+ for(; len >= 255 ; len-=255) *op++ = 255;
+ *op++ = (BYTE)len;
+ }
+ else *token = (BYTE)(length<<ML_BITS);
+
+ // Copy Literals
+ LZ4_BLINDCOPY(anchor, op, length);
+
+_next_match:
+ // Encode Offset
+ LZ4_WRITE_LITTLEENDIAN_16(op,(U16)(ip-ref));
+
+ // Start Counting
+ ip+=MINMATCH; ref+=MINMATCH; // MinMatch already verified
+ anchor = ip;
+ while likely(ip<matchlimit-(STEPSIZE-1))
+ {
+ UARCH diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) { ip+=STEPSIZE; ref+=STEPSIZE; continue; }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+ if (LZ4_ARCH64) if ((ip<(matchlimit-3)) && (A32(ref) == A32(ip))) { ip+=4; ref+=4; }
+ if ((ip<(matchlimit-1)) && (A16(ref) == A16(ip))) { ip+=2; ref+=2; }
+ if ((ip<matchlimit) && (*ref == *ip)) ip++;
+_endCount:
+
+ // Encode MatchLength
+ length = (int)(ip - anchor);
+#ifdef LIMITED_OUTPUT
+ if unlikely(op + (1 + LASTLITERALS) + (length>>8) > oend) return 0; // Check output limit
+#endif
+ if (length>=(int)ML_MASK)
+ {
+ *token += ML_MASK;
+ length -= ML_MASK;
+ for (; length > 509 ; length-=510) { *op++ = 255; *op++ = 255; }
+ if (length >= 255) { length-=255; *op++ = 255; }
+ *op++ = (BYTE)length;
+ }
+ else *token += (BYTE)length;
+
+ // Test end of chunk
+ if (ip > mflimit) { anchor = ip; break; }
+
+ // Fill table
+ HashTable[LZ4_HASHVALUE(ip-2)] = (CURRENT_H_TYPE)(ip - 2 - base);
+
+ // Test next position
+ ref = base + HashTable[LZ4_HASHVALUE(ip)];
+ HashTable[LZ4_HASHVALUE(ip)] = (CURRENT_H_TYPE)(ip - base);
+ if ((ref >= ip - MAX_DISTANCE) && (A32(ref) == A32(ip))) { token = op++; *token=0; goto _next_match; }
+
+ // Prepare next loop
+ anchor = ip++;
+ forwardH = LZ4_HASHVALUE(ip);
+ }
+
+_last_literals:
+ // Encode Last Literals
+ {
+ int lastRun = (int)(iend - anchor);
+#ifdef LIMITED_OUTPUT
+ if (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize) return 0; // Check output limit
+#endif
+ if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<<ML_BITS); lastRun-=RUN_MASK; for(; lastRun >= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; }
+ else *op++ = (BYTE)(lastRun<<ML_BITS);
+ memcpy(op, anchor, iend - anchor);
+ op += iend-anchor;
+ }
+
+ // End
+ return (int) (((char*)op)-dest);
+}
+
+
+
+//****************************
+// Clean defines
+//****************************
+
+// Required defines
+#undef FUNCTION_NAME
+
+// Locally Generated
+#undef HASHLOG
+#undef HASHTABLE_NBCELLS
+#undef LZ4_HASH
+#undef LZ4_HASHVALUE
+#undef CURRENT_H_TYPE
+#undef CURRENTBASE
+
+// Optional defines
+#ifdef LIMITED_OUTPUT
+#undef LIMITED_OUTPUT
+#endif
+
+#ifdef USE_HEAPMEMORY
+#undef USE_HEAPMEMORY
+#endif
diff --git a/src/common/lz4/xxhash.c b/src/common/lz4/xxhash.c
new file mode 100644
index 0000000..c9dc1fb
--- /dev/null
+++ b/src/common/lz4/xxhash.c
@@ -0,0 +1,477 @@
+/*
+xxHash - Fast Hash algorithm
+Copyright (C) 2012-2013, Yann Collet.
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+
+
+//**************************************
+// Tuning parameters
+//**************************************
+// Unaligned memory access is automatically enabled for "common" CPU, such as x86.
+// For other CPUs, the compiler will be more cautious, and insert extra code to ensure aligned access is respected.
+// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance.
+// You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32).
+#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# define XXH_USE_UNALIGNED_ACCESS 1
+#endif
+
+// XXH_ACCEPT_NULL_INPUT_POINTER :
+// If the input pointer is a null pointer, xxHash default behavior is to crash, since it is a bad input.
+// If this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+// This option has a very small performance cost (only measurable on small inputs).
+// By default, this option is disabled. To enable it, uncomment below define :
+//#define XXH_ACCEPT_NULL_INPUT_POINTER 1
+
+// XXH_FORCE_NATIVE_FORMAT :
+// By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
+// Results are therefore identical for little-endian and big-endian CPU.
+// This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+// Should endian-independence be of no importance for your application, you may uncomment the #define below.
+// It will improve speed for Big-endian CPU.
+// This option has no impact on Little_Endian CPU.
+//#define XXH_FORCE_NATIVE_FORMAT 1
+
+
+//**************************************
+// Compiler Options
+//**************************************
+#if defined(_MSC_VER) && !defined(__cplusplus) // Visual Studio
+# define inline __inline // Visual C is not C99, but supports some kind of inline
+#endif
+
+
+//**************************************
+// Includes & Memory related functions
+//**************************************
+#include "xxhash.h"
+// Modify the local functions below should you wish to use some other memory related routines
+// for malloc(), free()
+#include <stdlib.h>
+static inline void* XXH_malloc(size_t s) { return malloc(s); }
+static inline void XXH_free (void* p) { free(p); }
+// for memcpy()
+#include <string.h>
+static inline void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+
+//**************************************
+// CPU Feature Detection
+//**************************************
+// Little Endian or Big Endian ?
+// You can overwrite the #define below if you know your architecture endianness
+#if defined(XXH_FORCE_NATIVE_FORMAT) && (XXH_FORCE_NATIVE_FORMAT==1)
+// Force native format. The result will be endian dependant.
+# define XXH_BIG_ENDIAN 0
+#elif defined (__GLIBC__)
+# include <endian.h>
+# if (__BYTE_ORDER == __BIG_ENDIAN)
+# define XXH_BIG_ENDIAN 1
+# endif
+#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
+# define XXH_BIG_ENDIAN 1
+#elif defined(__sparc) || defined(__sparc__) \
+ || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
+ || defined(__hpux) || defined(__hppa) \
+ || defined(_MIPSEB) || defined(__s390__)
+# define XXH_BIG_ENDIAN 1
+#endif
+
+#if !defined(XXH_BIG_ENDIAN)
+// Little Endian assumed. PDP Endian and other very rare endian format are unsupported.
+# define XXH_BIG_ENDIAN 0
+#endif
+
+
+//**************************************
+// Basic Types
+//**************************************
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99
+# include <stdint.h>
+ typedef uint8_t BYTE;
+ typedef uint16_t U16;
+ typedef uint32_t U32;
+ typedef int32_t S32;
+ typedef uint64_t U64;
+#else
+ typedef unsigned char BYTE;
+ typedef unsigned short U16;
+ typedef unsigned int U32;
+ typedef signed int S32;
+ typedef unsigned long long U64;
+#endif
+
+#if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS)
+# define _PACKED __attribute__ ((packed))
+#else
+# define _PACKED
+#endif
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+# pragma pack(push, 1)
+#endif
+
+typedef struct _U32_S { U32 v; } _PACKED U32_S;
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+# pragma pack(pop)
+#endif
+
+#define A32(x) (((U32_S *)(x))->v)
+
+
+//***************************************
+// Compiler-specific Functions and Macros
+//***************************************
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+// Note : although _rotl exists for minGW (GCC under windows), performance seems poor
+#if defined(_MSC_VER)
+# define XXH_rotl32(x,r) _rotl(x,r)
+#else
+# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#endif
+
+#if defined(_MSC_VER) // Visual Studio
+# define XXH_swap32 _byteswap_ulong
+#elif GCC_VERSION >= 403
+# define XXH_swap32 __builtin_bswap32
+#else
+static inline U32 XXH_swap32 (U32 x) {
+ return ((x << 24) & 0xff000000 ) |
+ ((x << 8) & 0x00ff0000 ) |
+ ((x >> 8) & 0x0000ff00 ) |
+ ((x >> 24) & 0x000000ff );}
+#endif
+
+
+//**************************************
+// Constants
+//**************************************
+#define PRIME32_1 2654435761U
+#define PRIME32_2 2246822519U
+#define PRIME32_3 3266489917U
+#define PRIME32_4 668265263U
+#define PRIME32_5 374761393U
+
+
+//**************************************
+// Macros
+//**************************************
+#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations
+#define XXH_LE32(p) (XXH_BIG_ENDIAN ? XXH_swap32(A32(p)) : A32(p))
+#define XXH_alignedLE32(p) (XXH_BIG_ENDIAN ? XXH_swap32(*(U32*)(p)) : *(U32*)(p))
+
+
+
+//****************************
+// Simple Hash Functions
+//****************************
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS)
+// Specific version, for aligned 32-bits input. Useless for CPU supporting unaligned access.
+static U32 XXH32_alignedInput(const void* input, int len, U32 seed)
+{
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* const bEnd = p + len;
+ U32 h32;
+
+ if (len>=16)
+ {
+ const BYTE* const limit = bEnd - 16;
+ U32 v1 = seed + PRIME32_1 + PRIME32_2;
+ U32 v2 = seed + PRIME32_2;
+ U32 v3 = seed + 0;
+ U32 v4 = seed - PRIME32_1;
+ do
+ {
+ v1 += XXH_alignedLE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+ v2 += XXH_alignedLE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+ v3 += XXH_alignedLE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+ v4 += XXH_alignedLE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+ } while (p<=limit);
+ h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+ }
+ else { h32 = seed + PRIME32_5; }
+ h32 += (U32) len;
+ while (p<=bEnd-4)
+ {
+ h32 += XXH_alignedLE32(p) * PRIME32_3;
+ h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
+ p+=4;
+ }
+ while (p<bEnd)
+ {
+ h32 += (*p) * PRIME32_5;
+ h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+ p++;
+ }
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+ return h32;
+}
+#endif
+
+U32 XXH32(const void* input, int len, U32 seed)
+{
+#if 0
+ // Simple version, good for code maintenance, but unfortunately slow for small inputs
+ void* state = XXH32_init(seed);
+ XXH32_update(state, input, len);
+ return XXH32_digest(state);
+#else
+
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* const bEnd = p + len;
+ U32 h32;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+ if (p==NULL) { len=0; p=(const BYTE*)16; }
+#endif
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS)
+ if ((((U32)p) & 3) == 0) return XXH32_alignedInput(input, len, seed); // Input is aligned, let's leverage the speed advantage
+#endif
+
+ if (len>=16)
+ {
+ const BYTE* const limit = bEnd - 16;
+ U32 v1 = seed + PRIME32_1 + PRIME32_2;
+ U32 v2 = seed + PRIME32_2;
+ U32 v3 = seed + 0;
+ U32 v4 = seed - PRIME32_1;
+
+ do
+ {
+ v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+ v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+ v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+ v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+ } while (p<=limit);
+
+ h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+ }
+ else
+ {
+ h32 = seed + PRIME32_5;
+ }
+
+ h32 += (U32) len;
+
+ while (p<=bEnd-4)
+ {
+ h32 += XXH_LE32(p) * PRIME32_3;
+ h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
+ p+=4;
+ }
+
+ while (p<bEnd)
+ {
+ h32 += (*p) * PRIME32_5;
+ h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+ p++;
+ }
+
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+
+ return h32;
+
+#endif
+}
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+struct XXH_state32_t
+{
+ U64 total_len;
+ U32 seed;
+ U32 v1;
+ U32 v2;
+ U32 v3;
+ U32 v4;
+ int memsize;
+ char memory[16];
+};
+
+
+int XXH32_sizeofState()
+{
+ XXH_STATIC_ASSERT(XXH32_SIZEOFSTATE >= sizeof(struct XXH_state32_t)); // A compilation error here means XXH32_SIZEOFSTATE is not large enough
+ return sizeof(struct XXH_state32_t);
+}
+
+
+XXH_errorcode XXH32_resetState(void* state_in, U32 seed)
+{
+ struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+ state->seed = seed;
+ state->v1 = seed + PRIME32_1 + PRIME32_2;
+ state->v2 = seed + PRIME32_2;
+ state->v3 = seed + 0;
+ state->v4 = seed - PRIME32_1;
+ state->total_len = 0;
+ state->memsize = 0;
+ return XXH_OK;
+}
+
+
+void* XXH32_init (U32 seed)
+{
+ void* state = XXH_malloc (sizeof(struct XXH_state32_t));
+ XXH32_resetState(state, seed);
+ return state;
+}
+
+
+XXH_errorcode XXH32_update (void* state_in, const void* input, int len)
+{
+ struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+ if (input==NULL) return XXH_ERROR;
+#endif
+
+ state->total_len += len;
+
+ if (state->memsize + len < 16) // fill in tmp buffer
+ {
+ XXH_memcpy(state->memory + state->memsize, input, len);
+ state->memsize += len;
+ return XXH_OK;
+ }
+
+ if (state->memsize) // some data left from previous update
+ {
+ XXH_memcpy(state->memory + state->memsize, input, 16-state->memsize);
+ {
+ const U32* p32 = (const U32*)state->memory;
+ state->v1 += XXH_LE32(p32) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++;
+ state->v2 += XXH_LE32(p32) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++;
+ state->v3 += XXH_LE32(p32) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++;
+ state->v4 += XXH_LE32(p32) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++;
+ }
+ p += 16-state->memsize;
+ state->memsize = 0;
+ }
+
+ if (p <= bEnd-16)
+ {
+ const BYTE* const limit = bEnd - 16;
+ U32 v1 = state->v1;
+ U32 v2 = state->v2;
+ U32 v3 = state->v3;
+ U32 v4 = state->v4;
+
+ do
+ {
+ v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+ v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+ v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+ v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+ } while (p<=limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < bEnd)
+ {
+ XXH_memcpy(state->memory, p, bEnd-p);
+ state->memsize = (int)(bEnd-p);
+ }
+
+ return XXH_OK;
+}
+
+
+U32 XXH32_intermediateDigest (void* state_in)
+{
+ struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+ BYTE * p = (BYTE*)state->memory;
+ BYTE* bEnd = (BYTE*)state->memory + state->memsize;
+ U32 h32;
+
+ if (state->total_len >= 16)
+ {
+ h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+ }
+ else
+ {
+ h32 = state->seed + PRIME32_5;
+ }
+
+ h32 += (U32) state->total_len;
+
+ while (p<=bEnd-4)
+ {
+ h32 += XXH_LE32(p) * PRIME32_3;
+ h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+ p+=4;
+ }
+
+ while (p<bEnd)
+ {
+ h32 += (*p) * PRIME32_5;
+ h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+ p++;
+ }
+
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+
+ return h32;
+}
+
+
+U32 XXH32_digest (void* state_in)
+{
+ U32 h32 = XXH32_intermediateDigest(state_in);
+
+ XXH_free(state_in);
+
+ return h32;
+}
diff --git a/src/common/lz4/xxhash.h b/src/common/lz4/xxhash.h
new file mode 100644
index 0000000..fbc0880
--- /dev/null
+++ b/src/common/lz4/xxhash.h
@@ -0,0 +1,164 @@
+/*
+ xxHash - Fast Hash algorithm
+ Header File
+ Copyright (C) 2012-2013, Yann Collet.
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name Speed Q.Score Author
+xxHash 5.4 GB/s 10
+CrapWow 3.2 GB/s 2 Andrew
+MurmurHash 3a 2.7 GB/s 10 Austin Appleby
+SpookyHash 2.0 GB/s 10 Bob Jenkins
+SBox 1.4 GB/s 9 Bret Mulvey
+Lookup3 1.2 GB/s 9 Bob Jenkins
+SuperFastHash 1.2 GB/s 1 Paul Hsieh
+CityHash64 1.05 GB/s 10 Pike & Alakuijala
+FNV 0.55 GB/s 5 Fowler, Noll, Vo
+CRC32 0.43 GB/s 9
+MD5-32 0.33 GB/s 10 Ronald L. Rivest
+SHA1-32 0.28 GB/s 10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+*/
+
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+//****************************
+// Type
+//****************************
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+
+//****************************
+// Simple Hash Functions
+//****************************
+
+unsigned int XXH32 (const void* input, int len, unsigned int seed);
+
+/*
+XXH32() :
+ Calculate the 32-bits hash of sequence of length "len" stored at memory address "input".
+ The memory between input & input+len must be valid (allocated and read-accessible).
+ "seed" can be used to alter the result predictably.
+ This function successfully passes all SMHasher tests.
+ Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+ Note that "len" is type "int", which means it is limited to 2^31-1.
+ If your data is larger, use the advanced functions below.
+*/
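A minimal one-shot usage sketch, assuming only the declaration above (the input string is arbitrary):

#include <stdio.h>
#include <string.h>
#include "xxhash.h"

int main(void)
{
    const char *msg = "pluggable compression";
    /* hash strlen(msg) bytes with seed 0 */
    unsigned int h = XXH32(msg, (int) strlen(msg), 0);

    printf("XXH32 = %08x\n", h);
    return 0;
}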
+
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+void* XXH32_init (unsigned int seed);
+XXH_errorcode XXH32_update (void* state, const void* input, int len);
+unsigned int XXH32_digest (void* state);
+
+/*
+These functions calculate the xxhash of an input provided in several small packets,
+as opposed to an input provided as a single block.
+
+It must be started with :
+void* XXH32_init()
+The function returns a pointer which holds the state of calculation.
+
+This pointer must be provided as "void* state" parameter for XXH32_update().
+XXH32_update() can be called as many times as necessary.
+The user must provide a valid (allocated) input.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+Note that "len" is type "int", which means it is limited to 2^31-1.
+If your data is larger, it is recommended to chunk your data into blocks
+of size for example 2^30 (1GB) to avoid any "int" overflow issue.
+
+Finally, you can end the calculation anytime, by using XXH32_digest().
+This function returns the final 32-bits hash.
+You must provide the same "void* state" parameter created by XXH32_init().
+Memory will be freed by XXH32_digest().
+*/
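A sketch of the streaming interface just described, hashing two chunks as if they were one contiguous input; hash_two_chunks is a made-up helper:

#include "xxhash.h"

static unsigned int
hash_two_chunks(const void *a, int alen, const void *b, int blen,
                unsigned int seed)
{
    void *state = XXH32_init(seed);

    XXH32_update(state, a, alen);
    XXH32_update(state, b, blen);
    /* XXH32_digest() returns the final hash and frees the state */
    return XXH32_digest(state);
}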
+
+
+int XXH32_sizeofState();
+XXH_errorcode XXH32_resetState(void* state, unsigned int seed);
+
+#define XXH32_SIZEOFSTATE 48
+typedef struct { long long ll[(XXH32_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH32_stateSpace_t;
+/*
+These functions allow user application to make its own allocation for state.
+
+XXH32_sizeofState() is used to know how much space must be allocated for the xxHash 32-bits state.
+Note that the state must be aligned to access 'long long' fields. Memory must be allocated and referenced by a pointer.
+This pointer must then be provided as 'state' into XXH32_resetState(), which initializes the state.
+
+For static allocation purposes (such as allocation on stack, or freestanding systems without malloc()),
+use the structure XXH32_stateSpace_t, which will ensure that memory space is large enough and correctly aligned to access 'long long' fields.
+*/
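The same calculation without any heap allocation, using the state space structure described above (hash_on_stack is a made-up helper):

#include "xxhash.h"

static unsigned int
hash_on_stack(const void *data, int len, unsigned int seed)
{
    XXH32_stateSpace_t space;   /* large and aligned enough for the state */

    XXH32_resetState(&space, seed);
    XXH32_update(&space, data, len);
    /* intermediateDigest leaves the state untouched, so nothing to free */
    return XXH32_intermediateDigest(&space);
}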
+
+
+unsigned int XXH32_intermediateDigest (void* state);
+/*
+This function does the same as XXH32_digest(), generating a 32-bit hash,
+but preserve memory context.
+This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_update().
+To free memory context, use XXH32_digest(), or free().
+*/
+
+
+
+//****************************
+// Deprecated function names
+//****************************
+// The following translations are provided to ease code transition
+// You are encouraged to no longer use these function names
+#define XXH32_feed XXH32_update
+#define XXH32_result XXH32_digest
+#define XXH32_getIntermediateResult XXH32_intermediateDigest
+
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/src/include/common/lz4/lz4.h b/src/include/common/lz4/lz4.h
new file mode 100644
index 0000000..a73813e
--- /dev/null
+++ b/src/include/common/lz4/lz4.h
@@ -0,0 +1,165 @@
+/*
+ LZ4 - Fast LZ compression algorithm
+ Header File
+ Copyright (C) 2011-2013, Yann Collet.
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ - LZ4 source repository : http://code.google.com/p/lz4/
+*/
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+//**************************************
+// Compiler Options
+//**************************************
+#if defined(_MSC_VER) && !defined(__cplusplus) // Visual Studio
+# define inline __forceinline // Visual C is not C99, but supports some kind of inline. Note : we *do* want to force inline
+#endif
+
+
+//****************************
+// Simple Functions
+//****************************
+
+int LZ4_compress (const char* source, char* dest, int inputSize);
+int LZ4_decompress_safe (const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/*
+LZ4_compress() :
+ Compresses 'inputSize' bytes from 'source' into 'dest'.
+ Destination buffer must be already allocated,
+ and must be sized to handle worst-case situations (input data not compressible)
+ Worst case size evaluation is provided by function LZ4_compressBound()
+ inputSize : Max supported value is ~1.9GB
+ return : the number of bytes written in buffer dest
+ or 0 if the compression fails
+
+LZ4_decompress_safe() :
+ maxOutputSize : is the size of the destination buffer (which must be already allocated)
+ return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize)
+ If the source stream is malformed or too large, the function will stop decoding and return a negative result.
+ This function is protected against any kind of buffer overflow attempts (never writes outside of output buffer, and never reads outside of input buffer). It is therefore protected against malicious data packets
+*/
+
+
+//****************************
+// Advanced Functions
+//****************************
+
+static inline int LZ4_compressBound(int isize) { return ((isize) + ((isize)/255) + 16); }
+#define LZ4_COMPRESSBOUND( isize) ((isize) + ((isize)/255) + 16)
+
+/*
+LZ4_compressBound() :
+ Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible)
+ primarily useful for memory allocation of output buffer.
+ inline function is recommended for the general case,
+ macro is also provided when result needs to be evaluated at compile time (such as table size allocation).
+
+ isize : is the input size. Max supported value is ~1.9GB
+ return : maximum output size in a "worst case" scenario
+ note : this function is limited by "int" range (2^31-1)
+*/
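To make the sizing rule concrete, here is a round-trip sketch using the functions declared above (illustrative only, not part of the patch; the include path assumes the header placement used here, and error handling is kept minimal):

#include <stdlib.h>
#include <string.h>
#include "common/lz4/lz4.h"

static int
lz4_roundtrip_ok(const char *src, int srcSize)
{
    int   bound = LZ4_compressBound(srcSize);
    char *compressed = malloc(bound);
    char *restored = malloc(srcSize);
    int   csize, dsize, ok = 0;

    if (compressed != NULL && restored != NULL)
    {
        /* cannot overflow: dest is LZ4_compressBound(srcSize) bytes */
        csize = LZ4_compress(src, compressed, srcSize);
        dsize = LZ4_decompress_safe(compressed, restored, csize, srcSize);
        ok = (dsize == srcSize) && (memcmp(src, restored, srcSize) == 0);
    }
    free(compressed);
    free(restored);
    return ok;
}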
+
+
+int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/*
+LZ4_compress_limitedOutput() :
+ Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'.
+ If it cannot achieve it, compression will stop, and result of the function will be zero.
+ This function never writes outside of provided output buffer.
+
+ inputSize : Max supported value is ~1.9GB
+ maxOutputSize : is the size of the destination buffer (which must be already allocated)
+ return : the number of bytes written in buffer 'dest'
+ or 0 if the compression fails
+*/
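Illustrative use of the limited-output variant, roughly in the spirit of a toaster-style caller that only keeps the compressed form when it fits a budget and actually saves space (try_compress and its policy are invented for this sketch):

/* Returns the compressed size, or 0 if the value should be stored raw. */
static int
try_compress(const char *val, int valSize, char *out, int budget)
{
    int csize = LZ4_compress_limitedOutput(val, out, valSize, budget);

    /* 0 means the output limit was hit; a barely-smaller result is a loss too */
    if (csize == 0 || csize >= valSize)
        return 0;
    return csize;
}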
+
+
+int LZ4_decompress_fast (const char* source, char* dest, int outputSize);
+
+/*
+LZ4_decompress_fast() :
+ outputSize : is the original (uncompressed) size
+ return : the number of bytes read from the source buffer (in other words, the compressed size)
+ If the source stream is malformed, the function will stop decoding and return a negative result.
+ note : This function is a bit faster than LZ4_decompress_safe()
+ This function never writes outside of output buffers, and never reads before the input buffer, but may read beyond the input buffer (since it doesn't know its size) in case of a malicious data packet.
+ Use this function preferably into a trusted environment (data to decode comes from a trusted source).
+ Destination buffer must be already allocated. Its size must be a minimum of 'outputSize' bytes.
+*/
+
+int LZ4_decompress_safe_partial (const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize);
+
+/*
+LZ4_decompress_safe_partial() :
+ This function decompresses a compressed block of size 'inputSize' at position 'source'
+ into output buffer 'dest' of size 'maxOutputSize'.
+ The function stops decompressing operation as soon as 'targetOutputSize' has been reached,
+ reducing decompression time.
+ return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize)
+ Note : this number might be < 'targetOutputSize' if the compressed block does not contain enough data to decode that many bytes.
+ Always control how many bytes were decoded.
+ If the source stream is malformed, the function will stop decoding and return a negative result.
+ This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
+*/
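Partial decoding is what makes slice fetches cheap; a sketch of the call, with invented buffer names:

/* Decode roughly the first 'want' bytes of a compressed block. */
static int
decode_prefix(const char *comp, int compSize,
              char *out, int outCapacity, int want)
{
    int got = LZ4_decompress_safe_partial(comp, out, compSize,
                                          want, outCapacity);

    /* 'got' may still be smaller than 'want'; a negative value means corrupt input */
    return got;
}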
+
+
+int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int inputSize, int maxOutputSize);
+int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outputSize);
+
+/*
+*_withPrefix64k() :
+ These decoding functions work the same as their "normal name" versions,
+ but will potentially use up to 64KB of data in front of 'char* dest'.
+ These functions are used for decoding inter-dependent blocks.
+*/
+
+
+//****************************
+// Obsolete Functions
+//****************************
+
+static inline int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); }
+static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
+
+/*
+These functions are deprecated and should no longer be used.
+They are provided here for compatibility with existing user programs.
+*/
+
+
+
+#if defined (__cplusplus)
+}
+#endif
--
1.8.3.1
0003-Introduce-pluggable-compression.patchtext/x-patch; charset=us-asciiDownload
>From c20cbbe6e39ee618dc4cc4b37eda43fb2d2bc813 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 21 Jun 2013 00:17:23 +0200
Subject: [PATCH 3/3] Introduce pluggable compression
Todo:
* windows build support
* remove toast_compression_algo guc
* remove either snappy or lz4 support
* remove pglz_long support (just there for testing)
---
src/backend/access/heap/tuptoaster.c | 384 ++++++++++++++++++++++++++++++----
src/backend/utils/adt/pg_lzcompress.c | 81 ++-----
src/backend/utils/misc/guc.c | 11 +
src/include/access/tuptoaster.h | 1 +
src/include/postgres.h | 36 ++--
src/include/utils/pg_lzcompress.h | 38 +---
6 files changed, 393 insertions(+), 158 deletions(-)
diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c
index fc37ceb..5d36772 100644
--- a/src/backend/access/heap/tuptoaster.c
+++ b/src/backend/access/heap/tuptoaster.c
@@ -34,6 +34,8 @@
#include "access/heapam.h"
#include "access/tuptoaster.h"
#include "access/xact.h"
+#include "common/snappy/snappy.h"
+#include "common/lz4/lz4.h"
#include "catalog/catalog.h"
#include "utils/fmgroids.h"
#include "utils/pg_lzcompress.h"
@@ -72,6 +74,20 @@ do { \
memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \
} while (0)
+/*
+ * Available compression algorithms
+ */
+typedef enum ToastCompressionAlgo
+{
+ TOAST_COMPRESS_PGLZ = 0,
+ TOAST_COMPRESS_SNAPPY,
+ TOAST_COMPRESS_LZ4,
+ TOAST_COMPRESS_PGLZ_LONG /* for testing of +1 byte storage */
+} ToastCompressionAlgo;
+#define LAST_COMPRESSION_ALGO TOAST_COMPRESS_PGLZ_LONG
+
+/* guc for the default compression algorithm, just for testing */
+int toast_compression_algo = 0;
static void toast_delete_datum(Relation rel, Datum value);
static Datum toast_save_datum(Relation rel, Datum value,
@@ -81,7 +97,12 @@ static bool toastid_valueid_exists(Oid toastrelid, Oid valueid);
static struct varlena *toast_fetch_datum(struct varlena * attr);
static struct varlena *toast_fetch_datum_slice(struct varlena * attr,
int32 sliceoffset, int32 length);
+static Datum toast_uncompress_datum(Datum attr);
+static Size toast_uncompressed_length(struct varlena *attr);
+static void toast_get_compress_size_algo(Datum value, ToastCompressionAlgo *algo,
+ Size *compressed_length, Size *uncompressed_length,
+ void **compressed_data);
/* ----------
* heap_tuple_fetch_attr -
@@ -137,11 +158,11 @@ heap_tuple_untoast_attr(struct varlena * attr)
/* If it's compressed, decompress it */
if (VARATT_IS_COMPRESSED(attr))
{
- PGLZ_Header *tmp = (PGLZ_Header *) attr;
+ struct varlena *tmp = attr;
+
+ attr = (struct varlena *) DatumGetPointer(
+ toast_uncompress_datum(PointerGetDatum(attr)));
- attr = (struct varlena *) palloc(PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
- SET_VARSIZE(attr, PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
- pglz_decompress(tmp, VARDATA(attr));
pfree(tmp);
}
}
@@ -150,11 +171,8 @@ heap_tuple_untoast_attr(struct varlena * attr)
/*
* This is a compressed value inside of the main tuple
*/
- PGLZ_Header *tmp = (PGLZ_Header *) attr;
-
- attr = (struct varlena *) palloc(PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
- SET_VARSIZE(attr, PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
- pglz_decompress(tmp, VARDATA(attr));
+ attr = (struct varlena *) DatumGetPointer(
+ toast_uncompress_datum(PointerGetDatum(attr)));
}
else if (VARATT_IS_SHORT(attr))
{
@@ -209,15 +227,8 @@ heap_tuple_untoast_attr_slice(struct varlena * attr,
if (VARATT_IS_COMPRESSED(preslice))
{
- PGLZ_Header *tmp = (PGLZ_Header *) preslice;
- Size size = PGLZ_RAW_SIZE(tmp) + VARHDRSZ;
-
- preslice = (struct varlena *) palloc(size);
- SET_VARSIZE(preslice, size);
- pglz_decompress(tmp, VARDATA(preslice));
-
- if (tmp != (PGLZ_Header *) attr)
- pfree(tmp);
+ preslice = (struct varlena *) DatumGetPointer(
+ toast_uncompress_datum(PointerGetDatum(preslice)));
}
if (VARATT_IS_SHORT(preslice))
@@ -277,8 +288,7 @@ toast_raw_datum_size(Datum value)
}
else if (VARATT_IS_COMPRESSED(attr))
{
- /* here, va_rawsize is just the payload size */
- result = VARRAWSIZE_4B_C(attr) + VARHDRSZ;
+ result = toast_uncompressed_length(attr);
}
else if (VARATT_IS_SHORT(attr))
{
@@ -1162,6 +1172,75 @@ toast_flatten_tuple_attribute(Datum value,
return PointerGetDatum(new_data);
}
+/* ----
+ * toast_get_compress_size_algo -
+ *
+ * get metadata about compressed Datum
+ *
+ *
+ * How to disambiguate between compression strategies:
+ *
+ * For historical reasons the first 4 bytes of a compressed Datum are used to
+ * store the raw size of the datum as an unsigned integer. Since the length
+ * cannot be more than 1GB due to general toast limitations we have the 2 high
+ * bits to disambiguate whether the Datum has been compressed with the legacy
+ * pglz or something else. We cannot change the meaning of Datums with the
+ * first 2 bits unset since we need to support the old ondisk format.
+ *
+ * Since previously our only compression format was pglz, which always stored
+ * two zero bits there, we can use those two bits to discern different formats.
+ * If the compression format we use has a numeric value higher than 2 we store
+ * b11/3 in the high bits and store the numeric id - 2 in an extra byte.
+ *
+ * So storage looks like:
+ * 1) [4 byte varlena header]
+ * 2) [4 byte uncompressed length, 2 high bits for algorithm]
+ * 3) [1 optional byte of algorithm id - 2]
+ * 4) [compressed data ...]
+ *
+ * On little endian the storage for 2) looks like:
+ * [1st length byte][2nd length byte][3rd length byte][6 bit length][2 bit algorithm]
+ *
+ * Due to the 2 high bits being in the 4th byte we cannot store the
+ * algorithm in a convenient contiguous format if we need more than two bits
+ * to represent it.
+ * ----
+ */
+static void
+toast_get_compress_size_algo(Datum value, ToastCompressionAlgo *algo,
+ Size *compressed_length,
+ Size *uncompressed_length,
+ void **compressed_data)
+{
+ struct varlena *attr = (struct varlena *) DatumGetPointer(value);
+ int32 compression_type;
+
+ /*
+ * read the two highest bits of the size; the compression algorithm is
+ * (possibly partially) stored there.
+ */
+ compression_type = (*(uint32 *) VARDATA(value)) >> 30;
+ *compressed_data = ((char *) VARDATA(value)) + sizeof(uint32);
+
+ /* mask the two highest bits away to get the real size */
+ *uncompressed_length = ((*(uint32 *) (VARDATA(attr))) & 0x3fffffff);
+ *compressed_length = VARSIZE_ANY_EXHDR(attr) - sizeof(uint32);
+
+ /* algorithm is also stored in extra byte */
+ if (compression_type == 3)
+ {
+ /* add extra byte of compression algorithm metadata */
+ compression_type = 2 + *((uint8 *) *compressed_data);
+ /* and deal with the extra byte */
+ *compressed_data = ((char *) *compressed_data) + sizeof(uint8);
+ *compressed_length -= sizeof(uint8);
+ }
+
+ if (compression_type > LAST_COMPRESSION_ALGO)
+ elog(ERROR, "unknown compression algorithm %d", compression_type);
+
+ *algo = (ToastCompressionAlgo) compression_type;
+}
/* ----------
* toast_compress_datum -
@@ -1174,13 +1253,23 @@ toast_flatten_tuple_attribute(Datum value,
*
* We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without
* copying them. But we can't handle external or compressed datums.
+ *
+ * NB: check toast_get_compress_size_algo for details of the encoding of
+ * compression algorithms.
* ----------
*/
Datum
toast_compress_datum(Datum value)
{
- struct varlena *tmp;
- int32 valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
+ struct varlena *compressed_datum = NULL;
+ void *compressed;
+ int32 uncompressed_size = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
+ void *uncompressed_data = VARDATA_ANY(DatumGetPointer(value));
+ int32 raw_compressed_size; /* size returned by compressor */
+ int32 compressed_size; /* including varlena & uncompressed size */
+ Size compression_overhead;
+ Size buffer_size;
+ int ret;
Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value)));
Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value)));
@@ -1188,38 +1277,248 @@ toast_compress_datum(Datum value)
/*
* No point in wasting a palloc cycle if value size is out of the allowed
* range for compression
+ *
+ * XXX: define magic numbers somewhere else
*/
- if (valsize < PGLZ_strategy_default->min_input_size ||
- valsize > PGLZ_strategy_default->max_input_size)
+ if (uncompressed_size < 32)
return PointerGetDatum(NULL);
- tmp = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(valsize));
+ compression_overhead = sizeof(uint32);
+ /* use extra byte to store additional algorithms */
+ if (toast_compression_algo > 2)
+ compression_overhead += sizeof(uint8);
/*
- * We recheck the actual size even if pglz_compress() reports success,
- * because it might be satisfied with having saved as little as one byte
- * in the compressed data --- which could turn into a net loss once you
- * consider header and alignment padding. Worst case, the compressed
- * format might require three padding bytes (plus header, which is
- * included in VARSIZE(tmp)), whereas the uncompressed format would take
- * only one header byte and no padding if the value is short enough. So
- * we insist on a savings of more than 2 bytes to ensure we have a gain.
+ * compute compression algorithm agnostic part of the buffer size needed
+ * for compression.
*/
- if (pglz_compress(VARDATA_ANY(DatumGetPointer(value)), valsize,
- (PGLZ_Header *) tmp, PGLZ_strategy_default) &&
- VARSIZE(tmp) < valsize - 2)
+ buffer_size = VARHDRSZ + compression_overhead;
+
+ /* and the compression specific part */
+ switch ((ToastCompressionAlgo) toast_compression_algo)
{
- /* successful compression */
- return PointerGetDatum(tmp);
+ case TOAST_COMPRESS_PGLZ:
+ case TOAST_COMPRESS_PGLZ_LONG:
+ buffer_size += PGLZ_MAX_OUTPUT(uncompressed_size);
+ break;
+ case TOAST_COMPRESS_SNAPPY:
+ buffer_size += snappy_max_compressed_length(uncompressed_size);
+ break;
+ case TOAST_COMPRESS_LZ4:
+ buffer_size += LZ4_compressBound(uncompressed_size);
+ break;
+ }
+
+ /* allocate to-be-compressed Datum */
+ compressed_datum = (struct varlena *) palloc(buffer_size);
+ compressed = ((char *) VARDATA(compressed_datum)) + compression_overhead;
+
+ /* compress data */
+ switch ((ToastCompressionAlgo) toast_compression_algo)
+ {
+ case TOAST_COMPRESS_PGLZ:
+ case TOAST_COMPRESS_PGLZ_LONG:
+ {
+ ret = pglz_compress(uncompressed_data, uncompressed_size,
+ compressed, PGLZ_strategy_default);
+ if (!ret)
+ goto incompressible;
+ raw_compressed_size = ret;
+ break;
+ }
+ case TOAST_COMPRESS_SNAPPY:
+ {
+ static struct snappy_env *snappy_env = NULL;
+ if (snappy_env == NULL)
+ {
+ snappy_env = malloc(sizeof(struct snappy_env));
+ snappy_init_env(snappy_env);
+ }
+
+ ret = snappy_compress(snappy_env,
+ uncompressed_data,
+ (size_t)uncompressed_size,
+ compressed,
+ &buffer_size);
+ /* EIO is returned for incompressible data */
+ if (ret == EIO)
+ goto incompressible;
+ else if (ret != 0)
+ elog(ERROR, "snappy: compression failed: %d", ret);
+
+ raw_compressed_size = buffer_size;
+ break;
+ }
+ case TOAST_COMPRESS_LZ4:
+ {
+ ret = LZ4_compress(uncompressed_data,
+ compressed,
+ uncompressed_size);
+ if (ret == 0)
+ elog(ERROR, "lz4: compression failed");
+
+ raw_compressed_size = ret;
+ break;
+ }
+ default:
+ elog(ERROR, "invalid compression algorithm");
+ }
+
+ /* encode compression algorithm in size if it fits there */
+ if (toast_compression_algo <= 2)
+ {
+ *((uint32 *) VARDATA(compressed_datum)) =
+ ((uint32) toast_compression_algo) << 30 | uncompressed_size;
}
else
{
- /* incompressible data */
- pfree(tmp);
- return PointerGetDatum(NULL);
+ /* set marker for separate algorithm byte */
+ *((uint32 *) VARDATA(compressed_datum)) =
+ ((uint32) 3) << 30 | uncompressed_size;
+
+ /*
+ * Store algorithm in extra byte. Algorithms 0, 1 and 2 are handled in
+ * the above case.
+ */
+ *((char *) VARDATA(compressed_datum) + sizeof(uint32)) =
+ toast_compression_algo - 2;
+ }
+
+ /* set size and mark varlena as being of type 4B_C/compressed */
+ SET_VARSIZE_COMPRESSED(compressed_datum,
+ raw_compressed_size + compression_overhead + VARHDRSZ);
+
+ compressed_size = VARSIZE(compressed_datum);
+
+ /*
+ * Check whether the compression was sufficiently effective. Some of the
+ * compression methods check for blowing up to a larger amount of data than
+ * the source, some don't. Even if they do, like pglz_compress(), they
+ * might report success, having saved as little as one byte in the
+ * compressed data --- which could turn into a net loss once you consider
+ * header and alignment padding. Worst case, the compressed format might
+ * require three padding bytes (plus header, which is included in
+ * VARSIZE(compressed_datum)), whereas the uncompressed format would take only one
+ * header byte and no padding if the value is short enough. So we insist
+ * on a savings of more than 2 bytes to ensure we have a gain.
+ */
+ if (compressed_size < uncompressed_size - 2)
+ {
+ /* successful compression */
+ return PointerGetDatum(compressed_datum);
+ }
+
+ /* incompressible data */
+incompressible:
+ if (compressed_datum != NULL)
+ pfree(compressed_datum);
+ return PointerGetDatum(NULL);
+}
+
+/*
+ * toast_uncompress_datum -
+ *
+ * Uncompress compressed datum using the appropriate compression
+ * algorithm. Will return a 4B Datum.
+ */
+static Datum
+toast_uncompress_datum(Datum value)
+{
+ struct varlena *attr = (struct varlena *) DatumGetPointer(value);
+ ToastCompressionAlgo compression_type;
+ Size uncompressed_length;
+ void *uncompressed_data;
+ Size compressed_length;
+ void *compressed_data;
+
+ Assert(VARATT_IS_COMPRESSED(attr));
+
+ /* get meta information about the compressed datum */
+ toast_get_compress_size_algo(value,
+ &compression_type,
+ &compressed_length,
+ &uncompressed_length,
+ &compressed_data);
+
+ attr = (struct varlena *) palloc(uncompressed_length + VARHDRSZ);
+ SET_VARSIZE(attr, uncompressed_length + VARHDRSZ);
+ uncompressed_data = VARDATA(attr);
+
+ switch (compression_type)
+ {
+ case TOAST_COMPRESS_PGLZ:
+ case TOAST_COMPRESS_PGLZ_LONG:
+ {
+ pglz_decompress(compressed_data, compressed_length,
+ uncompressed_data, uncompressed_length);
+ break;
+ }
+ case TOAST_COMPRESS_SNAPPY:
+ {
+ int ret;
+ Size s_uncompressed_length;
+
+ ret = snappy_uncompressed_length(compressed_data,
+ compressed_length,
+ &s_uncompressed_length);
+ if (!ret)
+ elog(ERROR, "snappy: failed to determine compression length");
+ if (uncompressed_length != s_uncompressed_length)
+ elog(ERROR, "snappy: compression size mismatch %zu != %zu",
+ uncompressed_length, s_uncompressed_length);
+
+ ret = snappy_uncompress(compressed_data,
+ compressed_length,
+ uncompressed_data);
+ if (ret != 0)
+ elog(ERROR, "snappy: decompression failed: %d", ret);
+ break;
+ }
+ case TOAST_COMPRESS_LZ4:
+ {
+ int ret;
+
+ ret = LZ4_decompress_fast(compressed_data, uncompressed_data,
+ uncompressed_length);
+ if (ret != compressed_length)
+ elog(ERROR, "lz4: decompression size mismatch: %d vs %zu",
+ ret, compressed_length);
+ break;
+ }
}
+
+ /* make sure we didn't overwrite this anywhere */
+ Assert(VARSIZE(attr) == uncompressed_length + VARHDRSZ);
+
+ return PointerGetDatum(attr);
}
+/*
+ * toast_uncompressed_length -
+ *
+ * Return the length a compressed Datum has after decompression. Includes
+ * varlena overhead.
+ */
+static Size
+toast_uncompressed_length(struct varlena *attr)
+{
+ ToastCompressionAlgo compression_type;
+ Size uncompressed_length;
+ Size compressed_length;
+ void *compressed_data;
+
+ Assert(VARATT_IS_COMPRESSED(attr));
+
+ toast_get_compress_size_algo(PointerGetDatum(attr),
+ &compression_type,
+ &compressed_length,
+ &uncompressed_length,
+ &compressed_data);
+
+ /* varlena overhead */
+ uncompressed_length += VARHDRSZ;
+ return uncompressed_length;
+}
/* ----------
* toast_save_datum -
@@ -1284,10 +1583,11 @@ toast_save_datum(Relation rel, Datum value,
}
else if (VARATT_IS_COMPRESSED(dval))
{
+ struct varlena *dval_a = (struct varlena *) dval;
data_p = VARDATA(dval);
data_todo = VARSIZE(dval) - VARHDRSZ;
/* rawsize in a compressed datum is just the size of the payload */
- toast_pointer.va_rawsize = VARRAWSIZE_4B_C(dval) + VARHDRSZ;
+ toast_pointer.va_rawsize = toast_uncompressed_length(dval_a);
toast_pointer.va_extsize = data_todo;
/* Assert that the numbers look like it's compressed */
Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer));
diff --git a/src/backend/utils/adt/pg_lzcompress.c b/src/backend/utils/adt/pg_lzcompress.c
index 66c64c1..3a9b6cf 100644
--- a/src/backend/utils/adt/pg_lzcompress.c
+++ b/src/backend/utils/adt/pg_lzcompress.c
@@ -9,8 +9,8 @@
* Entry routines:
*
* bool
- * pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
- * const PGLZ_Strategy *strategy);
+ * pglz_compress(const char *source, int32 slen, char *dest,
+ * const PGLZ_Strategy *strategy);
*
* source is the input data to be compressed.
*
@@ -23,12 +23,12 @@
* the compression algorithm. If NULL, the compiled
* in default strategy is used.
*
- * The return value is TRUE if compression succeeded,
- * FALSE if not; in the latter case the contents of dest
- * are undefined.
+ * The return value is the size of the compressed output if
+ * compression succeeded, 0 if not; in the latter case the
+ * contents of dest are undefined.
*
* void
- * pglz_decompress(const PGLZ_Header *source, char *dest)
+ * pglz_decompress(const char *source, char *dest)
*
* source is the compressed input.
*
@@ -42,25 +42,8 @@
*
* The decompression algorithm and internal data format:
*
- * PGLZ_Header is defined as
- *
- * typedef struct PGLZ_Header {
- * int32 vl_len_;
- * int32 rawsize;
- * }
- *
- * The header is followed by the compressed data itself.
- *
- * The data representation is easiest explained by describing
- * the process of decompression.
- *
- * If VARSIZE(x) == rawsize + sizeof(PGLZ_Header), then the data
- * is stored uncompressed as plain bytes. Thus, the decompressor
- * simply copies rawsize bytes from the location after the
- * header to the destination.
- *
- * Otherwise the first byte after the header tells what to do
- * the next 8 times. We call this the control byte.
+ * The first byte after the header - which this file never sees -
+ * tells what to do the next 8 times. We call this the control byte.
*
* An unset bit in the control byte means, that one uncompressed
* byte follows, which is copied from input to output.
@@ -225,18 +208,6 @@ static const PGLZ_Strategy strategy_default_data = {
const PGLZ_Strategy *const PGLZ_strategy_default = &strategy_default_data;
-static const PGLZ_Strategy strategy_always_data = {
- 0, /* Chunks of any size are compressed */
- INT_MAX,
- 0, /* It's enough to save one single byte */
- INT_MAX, /* Never give up early */
- 128, /* Stop history lookup if a match of 128 bytes
- * is found */
- 6 /* Look harder for a good match */
-};
-const PGLZ_Strategy *const PGLZ_strategy_always = &strategy_always_data;
-
-
/* ----------
* Statically allocated work arrays for history
* ----------
@@ -478,16 +449,16 @@ pglz_find_match(PGLZ_HistEntry **hstart, const char *input, const char *end,
* Compresses source into dest using strategy.
* ----------
*/
-bool
-pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
+int32
+pglz_compress(const void *source, int32 slen, void *dest,
const PGLZ_Strategy *strategy)
{
- unsigned char *bp = ((unsigned char *) dest) + sizeof(PGLZ_Header);
+ unsigned char *bp = (unsigned char *) dest;
unsigned char *bstart = bp;
int hist_next = 0;
bool hist_recycle = false;
const char *dp = source;
- const char *dend = source + slen;
+ const char *dend = ((char *) source) + slen;
unsigned char ctrl_dummy = 0;
unsigned char *ctrlp = &ctrl_dummy;
unsigned char ctrlb = 0;
@@ -514,12 +485,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
if (strategy->match_size_good <= 0 ||
slen < strategy->min_input_size ||
slen > strategy->max_input_size)
- return false;
-
- /*
- * Save the original source size in the header.
- */
- dest->rawsize = slen;
+ return 0;
/*
* Limit the match parameters to the supported range.
@@ -574,7 +540,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
* allow 4 slop bytes.
*/
if (bp - bstart >= result_max)
- return false;
+ return 0;
/*
* If we've emitted more than first_success_by bytes without finding
@@ -583,7 +549,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
* pre-compressed data).
*/
if (!found_match && bp - bstart >= strategy->first_success_by)
- return false;
+ return 0;
/*
* Try to find a match in the history
@@ -627,14 +593,9 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
*ctrlp = ctrlb;
result_size = bp - bstart;
if (result_size >= result_max)
- return false;
-
- /*
- * Success - need only fill in the actual length of the compressed datum.
- */
- SET_VARSIZE_COMPRESSED(dest, result_size + sizeof(PGLZ_Header));
+ return 0;
- return true;
+ return result_size;
}
@@ -645,17 +606,17 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
* ----------
*/
void
-pglz_decompress(const PGLZ_Header *source, char *dest)
+pglz_decompress(const void *source, int32 slen, void *dest, int32 dlen)
{
const unsigned char *sp;
const unsigned char *srcend;
unsigned char *dp;
unsigned char *destend;
- sp = ((const unsigned char *) source) + sizeof(PGLZ_Header);
- srcend = ((const unsigned char *) source) + VARSIZE(source);
+ sp = (const unsigned char *) source;
+ srcend = ((const unsigned char *) source) + slen;
dp = (unsigned char *) dest;
- destend = dp + source->rawsize;
+ destend = dp + dlen;
while (sp < srcend && dp < destend)
{
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 3a76536..788731e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -28,6 +28,7 @@
#include "access/gin.h"
#include "access/transam.h"
+#include "access/tuptoaster.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "catalog/namespace.h"
@@ -1890,6 +1891,16 @@ static struct config_int ConfigureNamesInt[] =
},
{
+ {"toast_compression_algo", PGC_USERSET, CLIENT_CONN_STATEMENT,
+ gettext_noop("chooses the compression algo: 0: pglz, 1: snappy, 2: lz4, 3: pglz_long"),
+ NULL
+ },
+ &toast_compression_algo,
+ 2, 0, 3,
+ NULL, NULL, NULL
+ },
+
+ {
{"vacuum_freeze_min_age", PGC_USERSET, CLIENT_CONN_STATEMENT,
gettext_noop("Minimum age at which VACUUM should freeze a table row."),
NULL
diff --git a/src/include/access/tuptoaster.h b/src/include/access/tuptoaster.h
index 6f4fc45..97334fd 100644
--- a/src/include/access/tuptoaster.h
+++ b/src/include/access/tuptoaster.h
@@ -94,6 +94,7 @@
sizeof(int32) - \
VARHDRSZ)
+extern int toast_compression_algo;
/* ----------
* toast_insert_or_update -
diff --git a/src/include/postgres.h b/src/include/postgres.h
index 30e1dee..6fd70c6 100644
--- a/src/include/postgres.h
+++ b/src/include/postgres.h
@@ -81,21 +81,15 @@ struct varatt_external
* compiler might otherwise think it could generate code that assumes
* alignment while touching fields of a 1-byte-header varlena.
*/
-typedef union
+
+/* Normal and inline compressed varlena (4-byte length) */
+typedef struct
{
- struct /* Normal varlena (4-byte length) */
- {
- uint32 va_header;
- char va_data[1];
- } va_4byte;
- struct /* Compressed-in-line format */
- {
- uint32 va_header;
- uint32 va_rawsize; /* Original data size (excludes header) */
- char va_data[1]; /* Compressed data */
- } va_compressed;
+ uint32 va_header;
+ char va_data[1];
} varattrib_4b;
+/* short inline uncompressed varlena (1-byte length) */
typedef struct
{
uint8 va_header;
@@ -158,16 +152,16 @@ typedef struct
/* VARSIZE_4B() should only be used on known-aligned data */
#define VARSIZE_4B(PTR) \
- (((varattrib_4b *) (PTR))->va_4byte.va_header & 0x3FFFFFFF)
+ (((varattrib_4b *) (PTR))->va_header & 0x3FFFFFFF)
#define VARSIZE_1B(PTR) \
(((varattrib_1b *) (PTR))->va_header & 0x7F)
#define VARSIZE_1B_E(PTR) \
(((varattrib_1b_e *) (PTR))->va_len_1be)
#define SET_VARSIZE_4B(PTR,len) \
- (((varattrib_4b *) (PTR))->va_4byte.va_header = (len) & 0x3FFFFFFF)
+ (((varattrib_4b *) (PTR))->va_header = (len) & 0x3FFFFFFF)
#define SET_VARSIZE_4B_C(PTR,len) \
- (((varattrib_4b *) (PTR))->va_4byte.va_header = ((len) & 0x3FFFFFFF) | 0x40000000)
+ (((varattrib_4b *) (PTR))->va_header = ((len) & 0x3FFFFFFF) | 0x40000000)
#define SET_VARSIZE_1B(PTR,len) \
(((varattrib_1b *) (PTR))->va_header = (len) | 0x80)
#define SET_VARSIZE_1B_E(PTR,len) \
@@ -190,16 +184,16 @@ typedef struct
/* VARSIZE_4B() should only be used on known-aligned data */
#define VARSIZE_4B(PTR) \
- ((((varattrib_4b *) (PTR))->va_4byte.va_header >> 2) & 0x3FFFFFFF)
+ ((((varattrib_4b *) (PTR))->va_header >> 2) & 0x3FFFFFFF)
#define VARSIZE_1B(PTR) \
((((varattrib_1b *) (PTR))->va_header >> 1) & 0x7F)
#define VARSIZE_1B_E(PTR) \
(((varattrib_1b_e *) (PTR))->va_len_1be)
#define SET_VARSIZE_4B(PTR,len) \
- (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2))
+ (((varattrib_4b *) (PTR))->va_header = (((uint32) (len)) << 2))
#define SET_VARSIZE_4B_C(PTR,len) \
- (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2) | 0x02)
+ (((varattrib_4b *) (PTR))->va_header = (((uint32) (len)) << 2) | 0x02)
#define SET_VARSIZE_1B(PTR,len) \
(((varattrib_1b *) (PTR))->va_header = (((uint8) (len)) << 1) | 0x01)
#define SET_VARSIZE_1B_E(PTR,len) \
@@ -217,14 +211,10 @@ typedef struct
#define VARHDRSZ_EXTERNAL 2
-#define VARDATA_4B(PTR) (((varattrib_4b *) (PTR))->va_4byte.va_data)
-#define VARDATA_4B_C(PTR) (((varattrib_4b *) (PTR))->va_compressed.va_data)
+#define VARDATA_4B(PTR) (((varattrib_4b *) (PTR))->va_data)
#define VARDATA_1B(PTR) (((varattrib_1b *) (PTR))->va_data)
#define VARDATA_1B_E(PTR) (((varattrib_1b_e *) (PTR))->va_data)
-#define VARRAWSIZE_4B_C(PTR) \
- (((varattrib_4b *) (PTR))->va_compressed.va_rawsize)
-
/* Externally visible macros */
/*
diff --git a/src/include/utils/pg_lzcompress.h b/src/include/utils/pg_lzcompress.h
index 4af24a3..eaa36e6 100644
--- a/src/include/utils/pg_lzcompress.h
+++ b/src/include/utils/pg_lzcompress.h
@@ -10,20 +10,6 @@
#ifndef _PG_LZCOMPRESS_H_
#define _PG_LZCOMPRESS_H_
-
-/* ----------
- * PGLZ_Header -
- *
- * The information at the start of the compressed data.
- * ----------
- */
-typedef struct PGLZ_Header
-{
- int32 vl_len_; /* varlena header (do not touch directly!) */
- int32 rawsize;
-} PGLZ_Header;
-
-
/* ----------
* PGLZ_MAX_OUTPUT -
*
@@ -31,17 +17,7 @@ typedef struct PGLZ_Header
* We allow 4 bytes for overrun before detecting compression failure.
* ----------
*/
-#define PGLZ_MAX_OUTPUT(_dlen) ((_dlen) + 4 + sizeof(PGLZ_Header))
-
-/* ----------
- * PGLZ_RAW_SIZE -
- *
- * Macro to determine the uncompressed data size contained
- * in the entry.
- * ----------
- */
-#define PGLZ_RAW_SIZE(_lzdata) ((_lzdata)->rawsize)
-
+#define PGLZ_MAX_OUTPUT(_dlen) ((_dlen) + 4)
/* ----------
* PGLZ_Strategy -
@@ -88,25 +64,21 @@ typedef struct PGLZ_Strategy
/* ----------
- * The standard strategies
+ * The standard strategy
*
* PGLZ_strategy_default Recommended default strategy for TOAST.
- *
- * PGLZ_strategy_always Try to compress inputs of any length.
- * Fallback to uncompressed storage only if
- * output would be larger than input.
* ----------
*/
extern const PGLZ_Strategy *const PGLZ_strategy_default;
-extern const PGLZ_Strategy *const PGLZ_strategy_always;
/* ----------
* Global function declarations
* ----------
*/
-extern bool pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
+extern int32 pglz_compress(const void *source, int32 slen, void *dest,
const PGLZ_Strategy *strategy);
-extern void pglz_decompress(const PGLZ_Header *source, char *dest);
+extern void pglz_decompress(const void *source, int32 slen, void *dest,
+ int32 dlen);
#endif /* _PG_LZCOMPRESS_H_ */
--
1.8.3.1
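For readers skimming the patch rather than applying it, the header layout documented in
the toast_get_compress_size_algo() comment above boils down to a little bit twiddling on
the leading uint32 of the compressed payload. The following standalone sketch is not part
of the patch; its function names and test values are invented purely for illustration.
It only demonstrates how, under that scheme, an algorithm id and the raw size could be
packed into and recovered from the length word plus the optional extra byte:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Illustration only: pack/unpack the "2 high bits plus optional extra byte"
 * scheme described in toast_get_compress_size_algo() above.  Algorithm ids
 * 0..2 live directly in the two high bits of the raw-size word; any higher
 * id stores b11 there and puts (id - 2) into one extra byte.
 */

/* Write the header into buf; returns the number of header bytes used. */
static size_t
pack_compress_header(uint8_t *buf, uint32_t rawsize, uint32_t algo)
{
    uint32_t word;
    size_t hdrlen = sizeof(uint32_t);

    assert(rawsize < (1U << 30));       /* toasted datums are below 1GB */

    if (algo <= 2)
        word = (algo << 30) | rawsize;
    else
    {
        word = (3U << 30) | rawsize;    /* b11 marks "id is in extra byte" */
        buf[sizeof(uint32_t)] = (uint8_t) (algo - 2);
        hdrlen += 1;
    }
    memcpy(buf, &word, sizeof(word));
    return hdrlen;
}

/* Read the header back; returns the number of header bytes consumed. */
static size_t
unpack_compress_header(const uint8_t *buf, uint32_t *rawsize, uint32_t *algo)
{
    uint32_t word;
    size_t hdrlen = sizeof(uint32_t);

    memcpy(&word, buf, sizeof(word));
    *rawsize = word & 0x3FFFFFFF;       /* mask away the two algorithm bits */
    *algo = word >> 30;

    if (*algo == 3)                     /* real id lives in the extra byte */
    {
        *algo = 2 + buf[sizeof(uint32_t)];
        hdrlen += 1;
    }
    return hdrlen;
}

int
main(void)
{
    uint8_t buf[8];
    uint32_t rawsize, algo, id;

    for (id = 0; id <= 4; id++)
    {
        size_t hdrlen = pack_compress_header(buf, 123456, id);

        unpack_compress_header(buf, &rawsize, &algo);
        printf("algo %u: %zu header bytes, decoded algo %u, rawsize %u\n",
               id, hdrlen, algo, rawsize);
        assert(algo == id && rawsize == 123456);
    }
    return 0;
}

In the patch itself the all-zero-bits case necessarily keeps its historical meaning of
plain pglz, since that is what every pre-existing on-disk compressed datum contains, and
the real code additionally has to deal with the varlena header sitting in front of this
word.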
On Thu, Jun 20, 2013 at 8:09 PM, Andres Freund <andres@2ndquadrant.com> wrote:
On 2013-06-15 12:20:28 +0200, Andres Freund wrote:
On 2013-06-14 21:56:52 -0400, Robert Haas wrote:
I don't think we need it. I think what we need to decide is which
algorithm is legally OK to use. And then put it in.
In the past, we've had a great deal of speculation about that legal
question from people who are not lawyers. Maybe it would be valuable
to get some opinions from people who ARE lawyers. Tom and Heikki both
work for real big companies which, I'm guessing, have substantial
legal departments; perhaps they could pursue getting the algorithms of
possible interest vetted. Or, I could try to find out whether it's
possible to do something similar through EnterpriseDB.
I personally don't think the legal arguments hold all that much water
for snappy and lz4. But then the opinion of a European non-lawyer doesn't
hold much either.
Both are widely used by a large number of open and closed projects, some of
which have patent grant clauses in their licenses. E.g. hadoop,
cassandra use lz4, and I'd be surprised if the companies behind those
have opened themselves to litigation.
I think we should preliminarily decide which algorithm to use before we
get lawyers involved. I'd be surprised if they can make such an analysis
faster than we can rule out one of them via benchmarks.
Will post an updated patch that includes lz4 as well.
Attached.
Changes:
* add lz4 compression algorithm (2 clause bsd)
* move compression algorithms into own subdirectory
* clean up compression/decompression functions
* allow 258 compression algorithms, uses 1byte extra for any but the
first three
* don't pass a varlena to pg_lzcompress.c anymore, but data directly
* add pglz_long as a test fourth compression method that uses the +1
byte encoding
* use postgres' endian detection in snappy for compatibility with osx
Based on the benchmarks I think we should go with lz4 only for now. The
patch provides the infrastructure should somebody else want to add more
or even proper configurability.
Todo:
* windows build support
* remove toast_compression_algo guc
* remove either snappy or lz4 support
* remove pglz_long support (just there for testing)
New benchmarks:
Table size:
List of relations
Schema | Name | Type | Owner | Size | Description
--------+--------------------+-------+--------+--------+-------------
public | messages_pglz | table | andres | 526 MB |
public | messages_snappy | table | andres | 523 MB |
public | messages_lz4 | table | andres | 522 MB |
public | messages_pglz_long | table | andres | 527 MB |
(4 rows)
Workstation (2xE5520, enough s_b for everything):
Data load:
pglz: 36643.384 ms
snappy: 24626.894 ms
lz4: 23871.421 ms
pglz_long: 37097.681 ms
COPY messages_* TO '/dev/null' WITH BINARY;
pglz: 3116.083 ms
snappy: 2524.388 ms
lz4: 2349.396 ms
pglz_long: 3104.134 ms
COPY (SELECT rawtxt FROM messages_*) TO '/dev/null' WITH BINARY;
pglz: 1609.969 ms
snappy: 1031.696 ms
lz4: 886.782 ms
pglz_long: 1606.803 ms
On my elderly laptop (core 2 duo), too little shared buffers:
Data load:
pglz: 39968.381 ms
snappy: 26952.330 ms
lz4: 29225.472 ms
pglz_long: 39929.568 ms
COPY messages_* TO '/dev/null' WITH BINARY;
pglz: 3920.588 ms
snappy: 3421.938 ms
lz4: 3311.540 ms
pglz_long: 3885.920 ms
COPY (SELECT rawtxt FROM messages_*) TO '/dev/null' WITH BINARY;
pglz: 2238.145 ms
snappy: 1753.403 ms
lz4: 1638.092 ms
pglz_long: 2227.804 ms
Well, the performance of both snappy and lz4 seems to be significantly
better than pglz. On these tests lz4 has a small edge but that might
not be true on other data sets. I still think the main issue is legal
review: are there any license or patent concerns about including
either of these algorithms in PG? If neither of them have issues, we
might need to experiment a little more before picking between them.
If one does and the other does not, well, then it's a short
conversation.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 2013-06-25 12:22:31 -0400, Robert Haas wrote:
On Thu, Jun 20, 2013 at 8:09 PM, Andres Freund <andres@2ndquadrant.com> wrote:
On 2013-06-15 12:20:28 +0200, Andres Freund wrote:
On 2013-06-14 21:56:52 -0400, Robert Haas wrote:
I don't think we need it. I think what we need to decide is which
algorithm is legally OK to use. And then put it in.
In the past, we've had a great deal of speculation about that legal
question from people who are not lawyers. Maybe it would be valuable
to get some opinions from people who ARE lawyers. Tom and Heikki both
work for real big companies which, I'm guessing, have substantial
legal departments; perhaps they could pursue getting the algorithms of
possible interest vetted. Or, I could try to find out whether it's
possible to do something similar through EnterpriseDB.
I personally don't think the legal arguments hold all that much water
for snappy and lz4. But then the opinion of a European non-lawyer doesn't
hold much either.
Both are widely used by a large number of open and closed projects, some of
which have patent grant clauses in their licenses. E.g. hadoop,
cassandra use lz4, and I'd be surprised if the companies behind those
have opened themselves to litigation.
I think we should preliminarily decide which algorithm to use before we
get lawyers involved. I'd be surprised if they can make such an analysis
faster than we can rule out one of them via benchmarks.
Will post an updated patch that includes lz4 as well.
Attached.
Well, the performance of both snappy and lz4 seems to be significantly
better than pglz. On these tests lz4 has a small edge but that might
not be true on other data sets.
From what I've seen of independent benchmarks on more varying datasets
and from what I tested (without pg in between) lz4 usually has a bigger
margin than this, especially on decompression.
The implementation also seems to be better prepared to run on more
platforms, e.g. it didn't require any fiddling with endian.h in contrast
to snappy.
But yes, "even" snappy would be a big improvement should lz4 turn out to
be problematic and the performance difference isn't big enough to rule
one out as I'd hoped.
I still think the main issue is legal
review: are there any license or patent concerns about including
either of these algorithms in PG? If neither of them have issues, we
might need to experiment a little more before picking between them.
If one does and the other does not, well, then it's a short
conversation.
True. So, how do we proceed on that?
The ASF decided it was safe to use lz4 in cassandra. Does anybody have
contacts over there?
Btw, I have the feeling we hold this topic to a higher standard wrt
patent issues than other work in postgres...
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
On 06/25/2013 11:42 AM, Andres Freund wrote:
True. So, how do we proceed on that?
The ASF decided it was safe to use lz4 in cassandra. Does anybody have
contacts over there?
Btw, I have the feeling we hold this topic to a higher standard wrt
patent issues than other work in postgres...
We have access to attorneys, both in the US and Canada. I will proceed
to ask for real legal advice.
However, can you tell me what exactly you are concerned about? lz4 is
under the BSD license, and released by Google. Why are we worried, exactly?
--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com
On 2013-06-25 12:08:22 -0700, Josh Berkus wrote:
On 06/25/2013 11:42 AM, Andres Freund wrote:
True. So, how do we proceed on that?
The ASF decided it was safe to use lz4 in cassandra. Does anybody have
contacts over there?Btw, I have the feeling we hold this topic to a higher standard wrt
patent issues than other work in postgres...
We have access to attorneys, both in the US and Canada. I will proceed
to ask for real legal advice.
Cool!
However, can you tell me what exactly you are concerned about? lz4 is
under the BSD license, and released by Google.
Snappy is released/copyrighted by google. lz4 by Yann Collet.
Both are under BSD licenses (3 and 2 clause variants respectively). I
don't think we need to worry about the license.
Why are we worried, exactly?
The concerns I heard about were all about patent issues.
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
On Tue, Jun 25, 2013 at 4:15 PM, Andres Freund <andres@2ndquadrant.com> wrote:
However, can you tell me what exactly you are concerned about? lz4 is
under the BSD license, and released by Google.
Snappy is released/copyrighted by google. lz4 by Yann Collet.
Both are under BSD licenses (3 and 2 clause variants respectively). I
don't think we need to worry about the license.
Why are we worried, exactly?
The concerns I heard about were all about patent issues.
IANAL, but patents have nothing to do with licenses. Licenses apply to
specific implementations, whereas software patents (ironically, since
they're forbidden to in general, but they do it anyway for software
patents) apply to processes. Two implementations may differ, and be
under different licenses, but the same patent may apply to both.
It's a sick state of affairs when you can implement lz4 completely
yourself and still be unclear about whether your own, clean-room
implementation is encumbered by patents or not.
A lawyer has to sift through patent applications to know whether this
particular implementation is encumbered, and that's a very
time-consuming process (and thus very expensive).
If you can take the effort, it would be greatly beneficial I imagine.
But I think you're underestimating what those lawyers will ask for it.
Josh Berkus <josh@agliodbs.com> writes:
However, can you tell me what exactly you are concerned about? lz4 is
under the BSD license, and released by Google. Why are we worried, exactly?
Patents. The license on the code doesn't matter --- worst case, if
someone objected, we could rewrite the algorithm ourselves to get out
of an alleged copyright violation. But if someone comes after us for
a patent violation we're screwed; or at least, our users who have
terabytes of data stored with an infringing algorithm are screwed.
Andres is right that we're paying closer attention to patent risks here
than we do most places. That's because we know that the whole arena
of data compression is a minefield of patents. It would be
irresponsible not to try to check.
regards, tom lane
On 06/25/2013 12:23 PM, Tom Lane wrote:
Josh Berkus <josh@agliodbs.com> writes:
However, can you tell me what exactly you are concerned about? lz4 is
under the BSD license, and released by Google. Why are we worried, exactly?
Patents. The license on the code doesn't matter --- worst case, if
someone objected, we could rewrite the algorithm ourselves to get out
of an alleged copyright violation. But if someone comes after us for
a patent violation we're screwed; or at least, our users who have
terabytes of data stored with an infringing algorithm are screwed.
Taking this off-list, because it is a legal matter. Particularly, it's
legally problematic to discuss patents on a public mailing list, as we
found out with the ARC patent.
--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com
I've been following this issue these last few months.
Having the latest and best compressors built-in is a fashionable feature
these days. And for good reasons.
I'm quite amazed that this issue is still considered a "legal risk".
To put this in perspective, the *whole world* is using LZ4 by now. It's
integrated directly into the Linux kernel for ARM, which means every smartphone on
the planet will have this piece of software integrated right at the factory.
And that's not just Linux. HBase has it. TokuDB has it. Delphix has it.
And PostgreSQL is stuck with what, pglz ?
How come any compressor which could put up some competition to pglz is
systematically pushed out of the field on the grounds of unverifiable "legal
risks"?
And why would pglz be much safer from the very same risks? From what I can
see, pglz is more complex, and therefore exposed to many more patent risks,
than simpler lz alternatives.
Seems the debate is overly biased in favor of pglz.
On Mon, Sep 30, 2013 at 1:49 PM, Huchev <hugochevrain@gmail.com> wrote:
How come any compressor which could put up some competition to pglz is
systematically pushed out of the field on the grounds of unverifiable "legal
risks"?
Because pglz has been around for a while and has not caused patent
trouble. The risks have been accepted and the downsides have not
materialized. Were pglz being written and distributed starting
today, perhaps your reasoning would be more compelling, but as-is the
pglz ship has sailed for quite some time and empirically it has not
been a problem.
That said, I hope the findings are in favor of lz4 or snappy
integration. It does seem lz4 has picked up a slight edge.
On Tue, Oct 1, 2013 at 9:56 PM, Daniel Farina <daniel@heroku.com> wrote:
On Mon, Sep 30, 2013 at 1:49 PM, Huchev <hugochevrain@gmail.com> wrote:
How come any compressor which could put up some competition to pglz is
systematically pushed out of the field on the grounds of unverifiable "legal
risks"?
Because pglz has been around for a while and has not caused patent
trouble. The risks have been accepted and the downsides have not
materialized. Were pglz being written and distributed starting
today, perhaps your reasoning would be more compelling, but as-is the
pglz ship has sailed for quite some time and empirically it has not
been a problem.
That said, I hope the findings are in favor of lz4 or snappy
integration. It does seem lz4 has picked up a slight edge.
Yeah, I'm also in favor of a new compression format, whatever we can
agree on. However, I'm uncertain we're actually moving toward that
goal in any meaningful way.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company