jsonb format is pessimal for toast compression

Started by Tom Lane · over 11 years ago · 190 messages
#1 Tom Lane
tgl@sss.pgh.pa.us

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.
This interacts poorly with the code in pglz_compress() that gives up if
it's found nothing compressible in the first first_success_by bytes of a
value-to-be-compressed. (first_success_by is 1024 in the default set of
compression parameters.)

As an example, here's gdb's report of the bitwise representation of the
example JSON value in the bug thread:

0x2ab85ac: 0x20000005 0x00000004 0x50003098 0x0000309f
0x2ab85bc: 0x000030ae 0x000030b8 0x000030cf 0x000030da
0x2ab85cc: 0x000030df 0x000030ee 0x00003105 0x6b6e756a
0x2ab85dc: 0x400000de 0x00000034 0x00000068 0x0000009c
0x2ab85ec: 0x000000d0 0x00000104 0x00000138 0x0000016c
0x2ab85fc: 0x000001a0 0x000001d4 0x00000208 0x0000023c
0x2ab860c: 0x00000270 0x000002a4 0x000002d8 0x0000030c
0x2ab861c: 0x00000340 0x00000374 0x000003a8 0x000003dc
0x2ab862c: 0x00000410 0x00000444 0x00000478 0x000004ac
0x2ab863c: 0x000004e0 0x00000514 0x00000548 0x0000057c
0x2ab864c: 0x000005b0 0x000005e4 0x00000618 0x0000064c
0x2ab865c: 0x00000680 0x000006b4 0x000006e8 0x0000071c
0x2ab866c: 0x00000750 0x00000784 0x000007b8 0x000007ec
0x2ab867c: 0x00000820 0x00000854 0x00000888 0x000008bc
0x2ab868c: 0x000008f0 0x00000924 0x00000958 0x0000098c
0x2ab869c: 0x000009c0 0x000009f4 0x00000a28 0x00000a5c
0x2ab86ac: 0x00000a90 0x00000ac4 0x00000af8 0x00000b2c
0x2ab86bc: 0x00000b60 0x00000b94 0x00000bc8 0x00000bfc
0x2ab86cc: 0x00000c30 0x00000c64 0x00000c98 0x00000ccc
0x2ab86dc: 0x00000d00 0x00000d34 0x00000d68 0x00000d9c
0x2ab86ec: 0x00000dd0 0x00000e04 0x00000e38 0x00000e6c
0x2ab86fc: 0x00000ea0 0x00000ed4 0x00000f08 0x00000f3c
0x2ab870c: 0x00000f70 0x00000fa4 0x00000fd8 0x0000100c
0x2ab871c: 0x00001040 0x00001074 0x000010a8 0x000010dc
0x2ab872c: 0x00001110 0x00001144 0x00001178 0x000011ac
0x2ab873c: 0x000011e0 0x00001214 0x00001248 0x0000127c
0x2ab874c: 0x000012b0 0x000012e4 0x00001318 0x0000134c
0x2ab875c: 0x00001380 0x000013b4 0x000013e8 0x0000141c
0x2ab876c: 0x00001450 0x00001484 0x000014b8 0x000014ec
0x2ab877c: 0x00001520 0x00001554 0x00001588 0x000015bc
0x2ab878c: 0x000015f0 0x00001624 0x00001658 0x0000168c
0x2ab879c: 0x000016c0 0x000016f4 0x00001728 0x0000175c
0x2ab87ac: 0x00001790 0x000017c4 0x000017f8 0x0000182c
0x2ab87bc: 0x00001860 0x00001894 0x000018c8 0x000018fc
0x2ab87cc: 0x00001930 0x00001964 0x00001998 0x000019cc
0x2ab87dc: 0x00001a00 0x00001a34 0x00001a68 0x00001a9c
0x2ab87ec: 0x00001ad0 0x00001b04 0x00001b38 0x00001b6c
0x2ab87fc: 0x00001ba0 0x00001bd4 0x00001c08 0x00001c3c
0x2ab880c: 0x00001c70 0x00001ca4 0x00001cd8 0x00001d0c
0x2ab881c: 0x00001d40 0x00001d74 0x00001da8 0x00001ddc
0x2ab882c: 0x00001e10 0x00001e44 0x00001e78 0x00001eac
0x2ab883c: 0x00001ee0 0x00001f14 0x00001f48 0x00001f7c
0x2ab884c: 0x00001fb0 0x00001fe4 0x00002018 0x0000204c
0x2ab885c: 0x00002080 0x000020b4 0x000020e8 0x0000211c
0x2ab886c: 0x00002150 0x00002184 0x000021b8 0x000021ec
0x2ab887c: 0x00002220 0x00002254 0x00002288 0x000022bc
0x2ab888c: 0x000022f0 0x00002324 0x00002358 0x0000238c
0x2ab889c: 0x000023c0 0x000023f4 0x00002428 0x0000245c
0x2ab88ac: 0x00002490 0x000024c4 0x000024f8 0x0000252c
0x2ab88bc: 0x00002560 0x00002594 0x000025c8 0x000025fc
0x2ab88cc: 0x00002630 0x00002664 0x00002698 0x000026cc
0x2ab88dc: 0x00002700 0x00002734 0x00002768 0x0000279c
0x2ab88ec: 0x000027d0 0x00002804 0x00002838 0x0000286c
0x2ab88fc: 0x000028a0 0x000028d4 0x00002908 0x0000293c
0x2ab890c: 0x00002970 0x000029a4 0x000029d8 0x00002a0c
0x2ab891c: 0x00002a40 0x00002a74 0x00002aa8 0x00002adc
0x2ab892c: 0x00002b10 0x00002b44 0x00002b78 0x00002bac
0x2ab893c: 0x00002be0 0x00002c14 0x00002c48 0x00002c7c
0x2ab894c: 0x00002cb0 0x00002ce4 0x00002d18 0x32343231
0x2ab895c: 0x74653534 0x74656577 0x33746577 0x77673534
0x2ab896c: 0x74657274 0x33347477 0x72777120 0x20717771
0x2ab897c: 0x65727771 0x20777120 0x66647372 0x73616b6c
0x2ab898c: 0x33353471 0x71772035 0x72777172 0x71727771
0x2ab899c: 0x77203277 0x72777172 0x71727771 0x33323233
0x2ab89ac: 0x6b207732 0x20657773 0x73616673 0x73207372
0x2ab89bc: 0x64736664 0x32343231 0x74653534 0x74656577
0x2ab89cc: 0x33746577 0x77673534 0x74657274 0x33347477
0x2ab89dc: 0x72777120 0x20717771 0x65727771 0x20777120
0x2ab89ec: 0x66647372 0x73616b6c 0x33353471 0x71772035
0x2ab89fc: 0x72777172 0x71727771 0x77203277 0x72777172
0x2ab8a0c: 0x71727771 0x33323233 0x6b207732 0x20657773
0x2ab8a1c: 0x73616673 0x73207372 0x64736664 0x32343231
0x2ab8a2c: 0x74653534 0x74656577 0x33746577 0x77673534
0x2ab8a3c: 0x74657274 0x33347477 0x72777120 0x20717771
0x2ab8a4c: 0x65727771 0x20777120 0x66647372 0x73616b6c
0x2ab8a5c: 0x33353471 0x71772035 0x72777172 0x71727771
0x2ab8a6c: 0x77203277 0x72777172 0x71727771 0x33323233
0x2ab8a7c: 0x6b207732 0x20657773 0x73616673 0x73207372
0x2ab8a8c: 0x64736664 0x32343231 0x74653534 0x74656577
0x2ab8a9c: 0x33746577 0x77673534 0x74657274 0x33347477
0x2ab8aac: 0x72777120 0x20717771 0x65727771 0x20777120
0x2ab8abc: 0x66647372 0x73616b6c 0x33353471 0x71772035
0x2ab8acc: 0x72777172 0x71727771 0x77203277 0x72777172
0x2ab8adc: 0x71727771 0x33323233 0x6b207732 0x20657773
0x2ab8aec: 0x73616673 0x73207372 0x64736664 0x32343231
0x2ab8afc: 0x74653534 0x74656577 0x33746577 0x77673534
...
0x2abb61c: 0x74657274 0x33347477 0x72777120 0x20717771
0x2abb62c: 0x65727771 0x20777120 0x66647372 0x73616b6c
0x2abb63c: 0x33353471 0x71772035 0x72777172 0x71727771
0x2abb64c: 0x77203277 0x72777172 0x71727771 0x33323233
0x2abb65c: 0x6b207732 0x20657773 0x73616673 0x73207372
0x2abb66c: 0x64736664 0x537a6962 0x41706574 0x73756220
0x2abb67c: 0x73656e69 0x74732073 0x45617065 0x746e6576
0x2abb68c: 0x656d6954 0x34313032 0x2d38302d 0x32203730
0x2abb69c: 0x33323a31 0x2e33333a 0x62393434 0x6f4c7a69
0x2abb6ac: 0x69746163 0x61506e6f 0x74736972 0x736e6172
0x2abb6bc: 0x69746361 0x61446e6f 0x30326574 0x302d3431
0x2abb6cc: 0x37302d38 0x3a313220 0x333a3332 0x34342e33

There is plenty of compressible data once we get into the repetitive
strings in the payload part --- but that starts at offset 944, and up to
that point there is nothing that pg_lzcompress can get a handle on. There
are, by definition, no sequences of 4 or more repeated bytes in that area.
I think in principle pg_lzcompress could decide to compress the 3-byte
sequences consisting of the high-order 24 bits of each offset; but it
doesn't choose to do so, probably because of the way its lookup hash table
works:

* pglz_hist_idx -
*
* Computes the history table slot for the lookup by the next 4
* characters in the input.
*
* NB: because we use the next 4 characters, we are not guaranteed to
* find 3-character matches; they very possibly will be in the wrong
* hash list. This seems an acceptable tradeoff for spreading out the
* hash keys more.

For jsonb header data, the "next 4 characters" are *always* different, so
only a chance hash collision can result in a match. There is therefore a
pretty good chance that no compression will occur before it gives up
because of first_success_by.
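
To make that concrete, here is a small stand-alone C sketch (purely
illustrative, not taken from the PostgreSQL sources): it brute-force counts
how many 4-byte windows in a buffer also occur earlier in the same buffer,
which is the precondition for an LZ-style match. A strictly increasing
offset array like the header above yields essentially none, while a
repetitive string payload yields plenty.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/*
 * Count 4-byte windows that also occur somewhere earlier in the buffer.
 * An LZ-style matcher such as pglz can only emit a match when that
 * happens, so a count of zero means there is nothing at all to compress.
 */
static int
repeated_windows(const unsigned char *buf, int len)
{
    int count = 0;

    for (int i = 1; i + 4 <= len; i++)
        for (int j = 0; j < i; j++)
            if (memcmp(buf + i, buf + j, 4) == 0)
            {
                count++;
                break;
            }
    return count;
}

int
main(void)
{
    uint32_t offsets[256];
    unsigned char payload[1024];

    /* strictly increasing ending offsets, stride 0x34 as in the dump above */
    for (int i = 0; i < 256; i++)
        offsets[i] = (uint32_t) (i + 1) * 0x34;

    /* repetitive payload, like the duplicated strings in the bug report */
    for (int i = 0; i < 1024; i++)
        payload[i] = "1242 45et weet wet345gwtretwt43 "[i % 32];

    /* the byte view of the offsets assumes little-endian, as in the gdb dump */
    printf("offset header : %d repeated 4-byte windows in %d bytes\n",
           repeated_windows((unsigned char *) offsets, (int) sizeof(offsets)),
           (int) sizeof(offsets));
    printf("string payload: %d repeated 4-byte windows in %d bytes\n",
           repeated_windows(payload, (int) sizeof(payload)),
           (int) sizeof(payload));
    return 0;
}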

I'm not sure if there is any easy fix for this. We could possibly change
the default first_success_by value, but I think that'd just be postponing
the problem to larger jsonb objects/arrays, and it would hurt performance
for genuinely incompressible data. A somewhat painful, but not yet
out-of-the-question, alternative is to change the jsonb on-disk
representation. Perhaps the JEntry array could be defined as containing
element lengths instead of element ending offsets. Not sure though if
that would break binary searching for JSON object keys.
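
As a purely illustrative sketch of that alternative (no actual JEntry code
involved), note how the same fixed-stride entries collapse into one repeated
value when expressed as lengths instead of ending offsets, which is exactly
the kind of input an LZ-style compressor can work with:

#include <stdio.h>
#include <stdint.h>

#define NENTRIES 8

int
main(void)
{
    uint32_t offsets[NENTRIES];
    uint32_t lengths[NENTRIES];

    /* ending offsets of eight 0x34-byte elements, as in the dump above */
    for (int i = 0; i < NENTRIES; i++)
        offsets[i] = (uint32_t) (i + 1) * 0x34;

    /* the equivalent length representation: every entry comes out identical */
    for (int i = 0; i < NENTRIES; i++)
        lengths[i] = offsets[i] - (i > 0 ? offsets[i - 1] : 0);

    for (int i = 0; i < NENTRIES; i++)
        printf("entry %d: ending offset 0x%08x   length 0x%08x\n",
               i, (unsigned) offsets[i], (unsigned) lengths[i]);
    return 0;
}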

regards, tom lane

#2 Larry White
ljw1001@gmail.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

Apologies if this is a ridiculous suggestion, but I believe that swapping
out the compression algorithm (for Snappy, for example) has been discussed
in the past. I wonder if that algorithm is sufficiently different that it
would produce a better result, and if that might not be preferable to some
of the other options.

#3 Stephen Frost
sfrost@snowman.net
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.
This interacts poorly with the code in pglz_compress() that gives up if
it's found nothing compressible in the first first_success_by bytes of a
value-to-be-compressed. (first_success_by is 1024 in the default set of
compression parameters.)

I haven't looked at this in any detail, so take this with a grain of
salt, but what about teaching pglz_compress about using an offset
farther into the data, if the incoming data is quite a bit larger than
1k? This is just a test to see if it's worthwhile to keep going, no? I
wonder if this might even be able to be provided as a type-specific
option, to avoid changing the behavior for types other than jsonb in
this regard.

(I'm imagining a boolean saying "pick a random sample", or perhaps a
function which can be called that'll return "here's where you wanna test
if this thing is gonna compress at all")

I'm rather disinclined to change the on-disk format because of this
specific test, that feels a bit like the tail wagging the dog to me,
especially as I do hope that some day we'll figure out a way to use a
better compression algorithm than pglz.

Thanks,

Stephen

#4 Ashutosh Bapat
ashutosh.bapat@enterprisedb.com
In reply to: Stephen Frost (#3)
Re: jsonb format is pessimal for toast compression

On Fri, Aug 8, 2014 at 10:48 AM, Stephen Frost <sfrost@snowman.net> wrote:

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.
This interacts poorly with the code in pglz_compress() that gives up if
it's found nothing compressible in the first first_success_by bytes of a
value-to-be-compressed. (first_success_by is 1024 in the default set of
compression parameters.)

I haven't looked at this in any detail, so take this with a grain of
salt, but what about teaching pglz_compress about using an offset
farther into the data, if the incoming data is quite a bit larger than
1k? This is just a test to see if it's worthwhile to keep going, no? I
wonder if this might even be able to be provided as a type-specific
option, to avoid changing the behavior for types other than jsonb in
this regard.

+1 for the offset idea. Or sample the data at the beginning, middle and end.
Obviously one could always come up with a worst case, but.

(I'm imagining a boolean saying "pick a random sample", or perhaps a
function which can be called that'll return "here's where you wanna test
if this thing is gonna compress at all")

I'm rather disinclined to change the on-disk format because of this
specific test, that feels a bit like the tail wagging the dog to me,
especially as I do hope that some day we'll figure out a way to use a
better compression algorithm than pglz.

Thanks,

Stephen

--
Best Wishes,
Ashutosh Bapat
EnterpriseDB Corporation
The Postgres Database Company

#5 Andrew Dunstan
andrew@dunslane.net
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 08/07/2014 11:17 PM, Tom Lane wrote:

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.
This interacts poorly with the code in pglz_compress() that gives up if
it's found nothing compressible in the first first_success_by bytes of a
value-to-be-compressed. (first_success_by is 1024 in the default set of
compression parameters.)

[snip]

There is plenty of compressible data once we get into the repetitive
strings in the payload part --- but that starts at offset 944, and up to
that point there is nothing that pg_lzcompress can get a handle on. There
are, by definition, no sequences of 4 or more repeated bytes in that area.
I think in principle pg_lzcompress could decide to compress the 3-byte
sequences consisting of the high-order 24 bits of each offset; but it
doesn't choose to do so, probably because of the way its lookup hash table
works:

* pglz_hist_idx -
*
* Computes the history table slot for the lookup by the next 4
* characters in the input.
*
* NB: because we use the next 4 characters, we are not guaranteed to
* find 3-character matches; they very possibly will be in the wrong
* hash list. This seems an acceptable tradeoff for spreading out the
* hash keys more.

For jsonb header data, the "next 4 characters" are *always* different, so
only a chance hash collision can result in a match. There is therefore a
pretty good chance that no compression will occur before it gives up
because of first_success_by.

I'm not sure if there is any easy fix for this. We could possibly change
the default first_success_by value, but I think that'd just be postponing
the problem to larger jsonb objects/arrays, and it would hurt performance
for genuinely incompressible data. A somewhat painful, but not yet
out-of-the-question, alternative is to change the jsonb on-disk
representation. Perhaps the JEntry array could be defined as containing
element lengths instead of element ending offsets. Not sure though if
that would break binary searching for JSON object keys.

Ouch.

Back when this structure was first presented at pgCon 2013, I wondered
if we shouldn't extract the strings into a dictionary, because of key
repetition, and convinced myself that this shouldn't be necessary
because in significant cases TOAST would take care of it.

Maybe we should have pglz_compress() look at the *last* 1024 bytes if it
can't find anything worth compressing in the first, for values larger
than a certain size.

It's worth noting that this is a fairly pathological case. AIUI the
example you constructed has an array with 100k string elements. I don't
think that's typical. So I suspect that unless I've misunderstood the
statement of the problem we're going to find that almost all the jsonb
we will be storing is still compressible.

cheers

andrew

#6 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Stephen Frost (#3)
Re: jsonb format is pessimal for toast compression

Stephen Frost <sfrost@snowman.net> writes:

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.
This interacts poorly with the code in pglz_compress() that gives up if
it's found nothing compressible in the first first_success_by bytes of a
value-to-be-compressed. (first_success_by is 1024 in the default set of
compression parameters.)

I haven't looked at this in any detail, so take this with a grain of
salt, but what about teaching pglz_compress about using an offset
farther into the data, if the incoming data is quite a bit larger than
1k? This is just a test to see if it's worthwhile to keep going, no?

Well, the point of the existing approach is that it's a *nearly free*
test to see if it's worthwhile to keep going; there's just one if-test
added in the outer loop of the compression code. (cf commit ad434473ebd2,
which added that along with some other changes.) AFAICS, what we'd have
to do to do it as you suggest would to execute compression on some subset
of the data and then throw away that work entirely. I do not find that
attractive, especially when for most datatypes there's no particular
reason to look at one subset instead of another.
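
Schematically, that check amounts to something like the toy loop below
(invented names and a brute-force matcher; this is an illustration of the
idea, not the pg_lzcompress() code): one extra test per input position, no
separate trial pass, and an incompressible first kilobyte aborts the whole
attempt.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define FIRST_SUCCESS_BY 1024

/*
 * Toy LZ-style scan with the cheap give-up rule described above: a single
 * extra comparison per input position, no separate trial compression pass.
 */
static int
toy_compress(const unsigned char *src, int len)
{
    int found_match = 0;

    for (int i = 0; i + 4 <= len; i++)
    {
        /* the early exit: nothing compressed yet and we're past the limit */
        if (!found_match && i >= FIRST_SUCCESS_BY)
            return -1;

        /* stand-in for the history lookup: any earlier 4-byte copy? */
        for (int j = 0; j < i; j++)
            if (memcmp(src + i, src + j, 4) == 0)
            {
                found_match = 1;
                break;
            }
    }
    return found_match ? 0 : -1;
}

int
main(void)
{
    uint32_t offsets[300];
    unsigned char buf[2400];

    /* 1200 bytes of strictly increasing offsets, then a repetitive tail */
    for (int i = 0; i < 300; i++)
        offsets[i] = (uint32_t) (i + 1) * 0x34;
    memcpy(buf, offsets, sizeof(offsets));
    memset(buf + sizeof(offsets), 'x', sizeof(buf) - sizeof(offsets));

    printf("toy_compress: %d (negative means it gave up before the tail)\n",
           toy_compress(buf, (int) sizeof(buf)));
    return 0;
}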

I'm rather disinclined to change the on-disk format because of this
specific test, that feels a bit like the tail wagging the dog to me,
especially as I do hope that some day we'll figure out a way to use a
better compression algorithm than pglz.

I'm unimpressed by that argument too, for a number of reasons:

1. The real problem here is that jsonb is emitting quite a bit of
fundamentally-nonrepetitive data, even when the user-visible input is very
repetitive. That's a compression-unfriendly transformation by anyone's
measure. Assuming that some future replacement for pg_lzcompress() will
nonetheless be able to compress the data strikes me as mostly wishful
thinking. Besides, we'd more than likely have a similar early-exit rule
in any substitute implementation, so that we'd still be at risk even if
it usually worked.

2. Are we going to ship 9.4 without fixing this? I definitely don't see
replacing pg_lzcompress as being on the agenda for 9.4, whereas changing
jsonb is still within the bounds of reason.

Considering all the hype that's built up around jsonb, shipping a design
with a fundamental performance handicap doesn't seem like a good plan
to me. We could perhaps band-aid around it by using different compression
parameters for jsonb, although that would require some painful API changes
since toast_compress_datum() doesn't know what datatype it's operating on.

regards, tom lane

#7 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andrew Dunstan (#5)
Re: jsonb format is pessimal for toast compression

Andrew Dunstan <andrew@dunslane.net> writes:

On 08/07/2014 11:17 PM, Tom Lane wrote:

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.

Ouch.

Back when this structure was first presented at pgCon 2013, I wondered
if we shouldn't extract the strings into a dictionary, because of key
repetition, and convinced myself that this shouldn't be necessary
because in significant cases TOAST would take care of it.

That's not really the issue here, I think. The problem is that a
relatively minor aspect of the representation, namely the choice to store
a series of offsets rather than a series of lengths, produces
nonrepetitive data even when the original input is repetitive.

Maybe we should have pglz_compress() look at the *last* 1024 bytes if it
can't find anything worth compressing in the first, for values larger
than a certain size.

Not possible with anything like the current implementation, since it's
just an on-the-fly status check not a trial compression.

It's worth noting that this is a fairly pathological case. AIUI the
example you constructed has an array with 100k string elements. I don't
think that's typical. So I suspect that unless I've misunderstood the
statement of the problem we're going to find that almost all the jsonb
we will be storing is still compressible.

Actually, the 100K-string example I constructed *did* compress. Larry's
example that's not compressing is only about 12kB. AFAICS, the threshold
for trouble is in the vicinity of 256 array or object entries (resulting
in a 1kB JEntry array). That doesn't seem especially high. There is a
probabilistic component as to whether the early-exit case will actually
fire, since any chance hash collision will probably result in some 3-byte
offset prefix getting compressed. But the fact that a beta tester tripped
over this doesn't leave me with a warm feeling about the odds that it
won't happen much in the field.

regards, tom lane

#8 Andrew Dunstan
andrew@dunslane.net
In reply to: Tom Lane (#7)
Re: jsonb format is pessimal for toast compression

On 08/08/2014 11:18 AM, Tom Lane wrote:

Andrew Dunstan <andrew@dunslane.net> writes:

On 08/07/2014 11:17 PM, Tom Lane wrote:

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.

Back when this structure was first presented at pgCon 2013, I wondered
if we shouldn't extract the strings into a dictionary, because of key
repetition, and convinced myself that this shouldn't be necessary
because in significant cases TOAST would take care of it.

That's not really the issue here, I think. The problem is that a
relatively minor aspect of the representation, namely the choice to store
a series of offsets rather than a series of lengths, produces
nonrepetitive data even when the original input is repetitive.

It would certainly be worth validating that changing this would fix the
problem.

I don't know how invasive that would be - I suspect (without looking
very closely) not terribly much.

2. Are we going to ship 9.4 without fixing this? I definitely don't see
replacing pg_lzcompress as being on the agenda for 9.4, whereas changing
jsonb is still within the bounds of reason.

Considering all the hype that's built up around jsonb, shipping a design
with a fundamental performance handicap doesn't seem like a good plan
to me. We could perhaps band-aid around it by using different compression
parameters for jsonb, although that would require some painful API changes
since toast_compress_datum() doesn't know what datatype it's operating on.

Yeah, it would be a bit painful, but after all finding out this sort of
thing is why we have betas.

cheers

andrew

#9 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andrew Dunstan (#8)
Re: jsonb format is pessimal for toast compression

Andrew Dunstan <andrew@dunslane.net> writes:

On 08/08/2014 11:18 AM, Tom Lane wrote:

That's not really the issue here, I think. The problem is that a
relatively minor aspect of the representation, namely the choice to store
a series of offsets rather than a series of lengths, produces
nonrepetitive data even when the original input is repetitive.

It would certainly be worth validating that changing this would fix the
problem.
I don't know how invasive that would be - I suspect (without looking
very closely) not terribly much.

I took a quick look and saw that this wouldn't be that easy to get around.
As I'd suspected upthread, there are places that do random access into a
JEntry array, such as the binary search in findJsonbValueFromContainer().
If we have to add up all the preceding lengths to locate the corresponding
value part, we lose the performance advantages of binary search. AFAICS
that's applied directly to the on-disk representation. I'd thought
perhaps there was always a transformation step to build a pointer list,
but nope.
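
To spell the trade-off out with a toy layout (hypothetical arrays, not the
real JsonbContainer/JEntry structures): stored ending offsets give the start
of any entry in O(1), which is what lets a key lookup binary-search the
entries, whereas stored lengths force a walk over the preceding entries at
every probe (unless positions are pre-computed, as discussed later in the
thread).

#include <stdio.h>
#include <stdint.h>

#define NENTRIES 8

/* hypothetical entry arrays; not the real JEntry representation */
static const uint32_t end_offsets[NENTRIES] = {
    0x34, 0x68, 0x9c, 0xd0, 0x104, 0x138, 0x16c, 0x1a0
};
static const uint32_t lengths[NENTRIES] = {
    0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34
};

/* O(1): start of entry i when ending offsets are stored */
static uint32_t
start_from_offsets(int i)
{
    return i == 0 ? 0 : end_offsets[i - 1];
}

/* O(i): start of entry i when only lengths are stored */
static uint32_t
start_from_lengths(int i)
{
    uint32_t pos = 0;

    for (int k = 0; k < i; k++)
        pos += lengths[k];
    return pos;
}

int
main(void)
{
    for (int i = 0; i < NENTRIES; i++)
        printf("entry %d starts at %u (offsets) / %u (lengths)\n",
               i, (unsigned) start_from_offsets(i),
               (unsigned) start_from_lengths(i));
    return 0;
}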

regards, tom lane

#10 John W Higgins
wishdev@gmail.com
In reply to: Tom Lane (#6)
Re: jsonb format is pessimal for toast compression

On Fri, Aug 8, 2014 at 8:02 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Stephen Frost <sfrost@snowman.net> writes:

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

I'm rather disinclined to change the on-disk format because of this
specific test, that feels a bit like the tail wagging the dog to me,
especially as I do hope that some day we'll figure out a way to use a
better compression algorithm than pglz.

I'm unimpressed by that argument too, for a number of reasons:

1. The real problem here is that jsonb is emitting quite a bit of
fundamentally-nonrepetitive data, even when the user-visible input is very
repetitive. That's a compression-unfriendly transformation by anyone's
measure. Assuming that some future replacement for pg_lzcompress() will
nonetheless be able to compress the data strikes me as mostly wishful
thinking. Besides, we'd more than likely have a similar early-exit rule
in any substitute implementation, so that we'd still be at risk even if
it usually worked.

Would an answer be to switch the location of the jsonb "header" data to the
end of the field as opposed to the beginning of the field? That would allow
pglz to see what it wants to see early on and go to work when possible?

Add an offset at the top of the field to show where to look - but then it
would be the same in terms of functionality outside of that? Or pretty
close?

John

#11 Tom Lane
tgl@sss.pgh.pa.us
In reply to: John W Higgins (#10)
Re: jsonb format is pessimal for toast compression

John W Higgins <wishdev@gmail.com> writes:

Would an answer be to switch the location of the jsonb "header" data to the
end of the field as opposed to the beginning of the field? That would allow
pglz to see what it wants to see early on and go to work when possible?

Hm, might work. Seems a bit odd, but it would make pglz_compress happier.

OTOH, the big-picture issue here is that jsonb is generating
noncompressible data in the first place. Putting it somewhere else in the
Datum doesn't change the fact that we're going to have bloated storage,
even if we dodge the early-exit problem. (I suspect the compression
disadvantage vs text/plain-json that I showed yesterday is coming largely
from that offset array.) But I don't currently see how to avoid that and
still preserve the fast binary-search key lookup property, which is surely
a nice thing to have.

regards, tom lane

#12 Andrew Dunstan
andrew@dunslane.net
In reply to: John W Higgins (#10)
Re: jsonb format is pessimal for toast compression

On 08/08/2014 12:04 PM, John W Higgins wrote:

Would an answer be to switch the location of the jsonb "header" data
to the end of the field as opposed to the beginning of the field? That
would allow pglz to see what it wants to see early on and go to work
when possible?

Add an offset at the top of the field to show where to look - but then
it would be the same in terms of functionality outside of that? Or
pretty close?

That might make building up jsonb structures piece by piece, as we do,
difficult.

cheers

andrew

#13 Andrew Dunstan
andrew@dunslane.net
In reply to: Tom Lane (#9)
Re: jsonb format is pessimal for toast compression

On 08/08/2014 11:54 AM, Tom Lane wrote:

Andrew Dunstan <andrew@dunslane.net> writes:

On 08/08/2014 11:18 AM, Tom Lane wrote:

That's not really the issue here, I think. The problem is that a
relatively minor aspect of the representation, namely the choice to store
a series of offsets rather than a series of lengths, produces
nonrepetitive data even when the original input is repetitive.

It would certainly be worth validating that changing this would fix the
problem.
I don't know how invasive that would be - I suspect (without looking
very closely) not terribly much.

I took a quick look and saw that this wouldn't be that easy to get around.
As I'd suspected upthread, there are places that do random access into a
JEntry array, such as the binary search in findJsonbValueFromContainer().
If we have to add up all the preceding lengths to locate the corresponding
value part, we lose the performance advantages of binary search. AFAICS
that's applied directly to the on-disk representation. I'd thought
perhaps there was always a transformation step to build a pointer list,
but nope.

It would be interesting to know what the performance hit would be if we
calculated the offsets/pointers on the fly, especially if we could cache
it somehow. The main benefit of binary search is in saving on
comparisons, especially of strings, ISTM, and that could still be
available - this would just be a bit of extra arithmetic.
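
A minimal sketch of that idea (invented layout and names; none of this is
the jsonb code): decode the lengths once into a scratch array of start
positions, then binary-search as before. The cost is one linear pass of
additions per container, after which each probe is O(1) arithmetic plus the
key comparison, just as with stored offsets.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define NENTRIES 6

int
main(void)
{
    /* hypothetical container: keys concatenated, sorted by (length, bytes) */
    const char blob[] = "echoalphabravodeltacharliefoxtrot";
    const uint32_t lengths[NENTRIES] = {4, 5, 5, 5, 7, 7};
    uint32_t starts[NENTRIES + 1];
    const char *needle = "delta";
    uint32_t nlen = (uint32_t) strlen(needle);

    /* one pass per container: cache start positions as prefix sums */
    starts[0] = 0;
    for (int i = 0; i < NENTRIES; i++)
        starts[i + 1] = starts[i] + lengths[i];

    /* ordinary binary search; each probe is O(1) arithmetic again */
    int lo = 0, hi = NENTRIES - 1;

    while (lo <= hi)
    {
        int mid = (lo + hi) / 2;
        uint32_t len = lengths[mid];
        int cmp;

        /* compare as (length, bytes), matching the assumed sort order */
        if (nlen != len)
            cmp = (nlen < len) ? -1 : 1;
        else
            cmp = memcmp(needle, blob + starts[mid], len);

        if (cmp == 0)
        {
            printf("\"%s\" is entry %d, payload bytes [%u, %u)\n",
                   needle, mid,
                   (unsigned) starts[mid], (unsigned) starts[mid + 1]);
            return 0;
        }
        if (cmp < 0)
            hi = mid - 1;
        else
            lo = mid + 1;
    }
    printf("\"%s\" not found\n", needle);
    return 0;
}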

cheers

andrew

#14 Teodor Sigaev
teodor@sigaev.ru
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

value-to-be-compressed. (first_success_by is 1024 in the default set of
compression parameters.)

Curious idea: we could swap the JEntry array and the values: values at the
beginning of the type will be caught by pg_lzcompress. But we will need to
know the offset of the JEntry array, so the header will grow to 8 bytes
(actually, it will be a varlena header!)

--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/

#15 Teodor Sigaev
teodor@sigaev.ru
In reply to: Teodor Sigaev (#14)
Re: jsonb format is pessimal for toast compression

Curious idea: we could swap the JEntry array and the values: values at the
beginning of the type will be caught by pg_lzcompress. But we will need to
know the offset of the JEntry array, so the header will grow to 8 bytes
(actually, it will be a varlena header!)

Maybe I wasn't clear: the jsonb type will start with the string collection
instead of the JEntry array; the JEntry array will be placed at the end of
the object/array. So pg_lzcompress will find repeatable 4-byte pieces in
the first 1024 bytes of the jsonb.

--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/

#16 Bruce Momjian
bruce@momjian.us
In reply to: Tom Lane (#6)
Re: jsonb format is pessimal for toast compression

On Fri, Aug 8, 2014 at 11:02:26AM -0400, Tom Lane wrote:

2. Are we going to ship 9.4 without fixing this? I definitely don't see
replacing pg_lzcompress as being on the agenda for 9.4, whereas changing
jsonb is still within the bounds of reason.

FYI, pg_upgrade could be taught to refuse to upgrade from earlier 9.4
betas and report the problem JSONB columns.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +

#17 Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 08/08/2014 08:02 AM, Tom Lane wrote:

2. Are we going to ship 9.4 without fixing this? I definitely don't see
replacing pg_lzcompress as being on the agenda for 9.4, whereas changing
jsonb is still within the bounds of reason.

Considering all the hype that's built up around jsonb, shipping a design
with a fundamental performance handicap doesn't seem like a good plan
to me. We could perhaps band-aid around it by using different compression
parameters for jsonb, although that would require some painful API changes
since toast_compress_datum() doesn't know what datatype it's operating on.

I would rather ship late than ship a noncompressible JSONB.

Once we ship 9.4, many users are going to load 100's of GB into JSONB
fields. Even if we fix the compressibility issue in 9.5, those users
won't be able to fix the compression without rewriting all their data,
which could be prohibitive. And we'll be in a position where we have
to support the 9.4 JSONB format/compression technique for years so that
users aren't blocked from upgrading.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

#18 Alexander Korotkov
aekorotkov@gmail.com
In reply to: Teodor Sigaev (#15)
Re: jsonb format is pessimal for toast compression

On Fri, Aug 8, 2014 at 9:14 PM, Teodor Sigaev <teodor@sigaev.ru> wrote:

Curious idea: we could swap the JEntry array and the values: values at the
beginning of the type will be caught by pg_lzcompress. But we will need to
know the offset of the JEntry array, so the header will grow to 8 bytes
(actually, it will be a varlena header!)

Maybe I wasn't clear: the jsonb type will start with the string collection
instead of the JEntry array; the JEntry array will be placed at the end of
the object/array. So pg_lzcompress will find repeatable 4-byte pieces in
the first 1024 bytes of the jsonb.

Another idea I have is that storing an offset in each JEntry is not
necessary to get the benefit of binary search. Namely, what if we store
offsets in every 8th JEntry and lengths in the others? The speed of binary
search will be about the same: the overhead is only calculating offsets
within the 8-entry chunk. But the lengths will probably repeat.
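
A rough sketch of how lookups could work under that scheme (the encoding
here is invented purely to illustrate the proposal): every 8th entry stores
an absolute ending offset, the others store lengths, so reconstructing any
entry's end costs at most seven additions while the entry stream itself
becomes mostly repeated length values.

#include <stdio.h>
#include <stdint.h>

#define NENTRIES 32
#define STRIDE 8

/*
 * entries[i] holds an absolute ending offset when i % STRIDE == 0 and the
 * element's length otherwise (an invented encoding, for illustration only).
 */
static uint32_t entries[NENTRIES];

/* reconstruct the ending offset of entry i: at most STRIDE-1 additions */
static uint32_t
end_offset(int i)
{
    int base = i - (i % STRIDE);
    uint32_t off = entries[base];

    for (int k = base + 1; k <= i; k++)
        off += entries[k];
    return off;
}

int
main(void)
{
    const uint32_t elem_len = 0x34;    /* fixed-size elements, as in the dump */

    for (int i = 0; i < NENTRIES; i++)
    {
        if (i % STRIDE == 0)
            entries[i] = (uint32_t) (i + 1) * elem_len;    /* absolute end offset */
        else
            entries[i] = elem_len;    /* plain length: highly repetitive */
    }

    /* spot-check: every reconstructed offset matches the plain offset scheme */
    for (int i = 0; i < NENTRIES; i++)
        printf("entry %2d: end offset 0x%04x (expected 0x%04x)\n",
               i, (unsigned) end_offset(i),
               (unsigned) ((uint32_t) (i + 1) * elem_len));
    return 0;
}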

------
With best regards,
Alexander Korotkov.

#19 Ants Aasma
ants@cybertec.at
In reply to: Andrew Dunstan (#13)
Re: jsonb format is pessimal for toast compression

On Fri, Aug 8, 2014 at 7:35 PM, Andrew Dunstan <andrew@dunslane.net> wrote:

I took a quick look and saw that this wouldn't be that easy to get around.
As I'd suspected upthread, there are places that do random access into a
JEntry array, such as the binary search in findJsonbValueFromContainer().
If we have to add up all the preceding lengths to locate the corresponding
value part, we lose the performance advantages of binary search. AFAICS
that's applied directly to the on-disk representation. I'd thought
perhaps there was always a transformation step to build a pointer list,
but nope.

It would be interesting to know what the performance hit would be if we
calculated the offsets/pointers on the fly, especially if we could cache it
somehow. The main benefit of binary search is in saving on comparisons,
especially of strings, ISTM, and that could still be available - this would
just be a bit of extra arithmetic.

I don't think binary search is the main problem here. Objects are
usually reasonably sized, while arrays are more likely to be huge. To
make matters worse, jsonb -> int goes from O(1) to O(n).

Regards,
Ants Aasma
--
Cybertec Schönig & Schönig GmbH
Gröhrmühlgasse 26
A-2700 Wiener Neustadt
Web: http://www.postgresql-support.de

#20 Hannu Krosing
hannu@2ndQuadrant.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 08/08/2014 06:17 AM, Tom Lane wrote:

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.

How hard and how expensive would it be to teach pg_lzcompress to
apply a delta filter on suitable data?

So that instead of the integers, their deltas will be fed to the "real"
compressor.
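
Illustrative sketch of such a delta filter (not an existing pg_lzcompress
option; the names are invented): a monotonically increasing offset array
turns into a stream of small, repeating deltas that an LZ-style matcher can
then get a handle on, and the transform is exactly reversible on
decompression.

#include <stdio.h>
#include <stdint.h>

#define NENTRIES 16

/* replace each value with its difference from the previous one */
static void
delta_encode(uint32_t *v, int n)
{
    uint32_t prev = 0;

    for (int i = 0; i < n; i++)
    {
        uint32_t cur = v[i];

        v[i] = cur - prev;
        prev = cur;
    }
}

/* reverse the transform before handing the data back to the caller */
static void
delta_decode(uint32_t *v, int n)
{
    uint32_t acc = 0;

    for (int i = 0; i < n; i++)
    {
        acc += v[i];
        v[i] = acc;
    }
}

int
main(void)
{
    uint32_t offsets[NENTRIES];

    for (int i = 0; i < NENTRIES; i++)
        offsets[i] = (uint32_t) (i + 1) * 0x34;    /* as in the dump above */

    delta_encode(offsets, NENTRIES);
    printf("deltas:");
    for (int i = 0; i < NENTRIES; i++)
        printf(" 0x%x", (unsigned) offsets[i]);
    printf("\n");

    delta_decode(offsets, NENTRIES);
    printf("restored last offset: 0x%x\n", (unsigned) offsets[NENTRIES - 1]);
    return 0;
}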

--
Hannu Krosing
PostgreSQL Consultant
Performance, Scalability and High Availability
2ndQuadrant Nordic OÜ

#21 Peter Geoghegan
pg@heroku.com
In reply to: Ants Aasma (#19)
Re: jsonb format is pessimal for toast compression

On Fri, Aug 8, 2014 at 12:41 PM, Ants Aasma <ants@cybertec.at> wrote:

I don't think binary search is the main problem here. Objects are
usually reasonably sized, while arrays are more likely to be huge. To
make matters worse, jsonb -> int goes from O(1) to O(n).

I don't think it's true that arrays are more likely to be huge. That
regression would be bad, but jsonb -> int is not the most compelling
operator by far. The indexable operators (in particular, @>) don't
support subscripting arrays like that, and with good reason.

--
Peter Geoghegan

#22 Peter Geoghegan
pg@heroku.com
In reply to: Josh Berkus (#17)
Re: jsonb format is pessimal for toast compression

On Fri, Aug 8, 2014 at 12:06 PM, Josh Berkus <josh@agliodbs.com> wrote:

Once we ship 9.4, many users are going to load 100's of GB into JSONB
fields. Even if we fix the compressibility issue in 9.5, those users
won't be able to fix the compression without rewriting all their data,
which could be prohibitive. And we'll be in a position where we have
to support the 9.4 JSONB format/compression technique for years so that
users aren't blocked from upgrading.

FWIW, if we take the delicious JSON data as representative, a table
storing that data as jsonb is 1374 MB in size, whereas in an equivalent
table with the data typed using the original json datatype (but with
white space differences more or less ignored, because it was created
using a jsonb -> json cast), the same data is 1352 MB.

Larry's complaint is valid; this is a real problem, and I'd like to
fix it before 9.4 is out. However, let us not lose sight of the fact
that JSON data is usually a poor target for TOAST compression. With
idiomatic usage, redundancy is very much more likely to appear across
rows, and not within individual Datums. Frankly, we aren't doing a
very good job there, and doing better requires an alternative
strategy.

--
Peter Geoghegan

#23 Larry White
ljw1001@gmail.com
In reply to: Peter Geoghegan (#22)
Re: jsonb format is pessimal for toast compression

I was not complaining; I think JSONB is awesome.

But I am one of those people who would like to put 100's of GB (or more)
JSON files into Postgres and I am concerned about file size and possible
future changes to the format.

#24 Stephen Frost
sfrost@snowman.net
In reply to: Tom Lane (#6)
Re: jsonb format is pessimal for toast compression

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

Stephen Frost <sfrost@snowman.net> writes:

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.
This interacts poorly with the code in pglz_compress() that gives up if
it's found nothing compressible in the first first_success_by bytes of a
value-to-be-compressed. (first_success_by is 1024 in the default set of
compression parameters.)

I haven't looked at this in any detail, so take this with a grain of
salt, but what about teaching pglz_compress about using an offset
farther into the data, if the incoming data is quite a bit larger than
1k? This is just a test to see if it's worthwhile to keep going, no?

Well, the point of the existing approach is that it's a *nearly free*
test to see if it's worthwhile to keep going; there's just one if-test
added in the outer loop of the compression code. (cf commit ad434473ebd2,
which added that along with some other changes.) AFAICS, what we'd have
to do to do it as you suggest would to execute compression on some subset
of the data and then throw away that work entirely. I do not find that
attractive, especially when for most datatypes there's no particular
reason to look at one subset instead of another.

Ah, I see- we were using the first block as it means we can reuse the
work done on it if we decide to continue with the compression. Makes
sense. We could possibly arrange to have the amount attempted depend on
the data type, but you point out that we can't do that without teaching
lower components about types, which is less than ideal.

What about considering how large the object is when we are analyzing if
it compresses well overall? That is- for a larger object, make a larger
effort to compress it. There's clearly a pessimistic case which could
arise from that, but it may be better than the current situation.
There's a clear risk that such an algorithm may well be very type
specific, meaning that we make things worse for some types (eg: for byteas
which end up never compressing well, we'd likely spend more CPU time
trying than we do today).

1. The real problem here is that jsonb is emitting quite a bit of
fundamentally-nonrepetitive data, even when the user-visible input is very
repetitive. That's a compression-unfriendly transformation by anyone's
measure. Assuming that some future replacement for pg_lzcompress() will
nonetheless be able to compress the data strikes me as mostly wishful
thinking. Besides, we'd more than likely have a similar early-exit rule
in any substitute implementation, so that we'd still be at risk even if
it usually worked.

I agree that jsonb ends up being nonrepetitive in part, which is why
I've been trying to push the discussion in the direction of making it
more likely for the highly-compressible data to be considered rather
than the start of the jsonb object. I don't care for our compression
algorithm having to be catered to in this regard in general though as
the exact same problem could, and likely does, exist in some real life
bytea-using PG implementations.

I disagree that another algorithm wouldn't be able to manage better on
this data than pglz. pglz, from my experience, is notoriously bad at
certain data sets which other algorithms are not as poorly impacted by.

2. Are we going to ship 9.4 without fixing this? I definitely don't see
replacing pg_lzcompress as being on the agenda for 9.4, whereas changing
jsonb is still within the bounds of reason.

I'd really hate to ship 9.4 without a fix for this, but I have a similar
hard time with shipping 9.4 without the binary search component..

Considering all the hype that's built up around jsonb, shipping a design
with a fundamental performance handicap doesn't seem like a good plan
to me. We could perhaps band-aid around it by using different compression
parameters for jsonb, although that would require some painful API changes
since toast_compress_datum() doesn't know what datatype it's operating on.

I don't like the idea of shipping with this handicap either.

Perhaps another option would be a new storage type which basically says
"just compress it, no matter what"? We'd be able to make that the
default for jsonb columns too, no? Again- I'll admit this is shooting
from the hip, but I wanted to get these thoughts out and I won't have
much more time tonight.

Thanks!

Stephen

#25 Stephen Frost
sfrost@snowman.net
In reply to: Bruce Momjian (#16)
Re: jsonb format is pessimal for toast compression

* Bruce Momjian (bruce@momjian.us) wrote:

On Fri, Aug 8, 2014 at 11:02:26AM -0400, Tom Lane wrote:

2. Are we going to ship 9.4 without fixing this? I definitely don't see
replacing pg_lzcompress as being on the agenda for 9.4, whereas changing
jsonb is still within the bounds of reason.

FYI, pg_upgrade could be taught to refuse to upgrade from earlier 9.4
betas and report the problem JSONB columns.

That is *not* a good solution..

Thanks,

Stephen

#26 Stephen Frost
sfrost@snowman.net
In reply to: Josh Berkus (#17)
Re: jsonb format is pessimal for toast compression

* Josh Berkus (josh@agliodbs.com) wrote:

On 08/08/2014 08:02 AM, Tom Lane wrote:

2. Are we going to ship 9.4 without fixing this? I definitely don't see
replacing pg_lzcompress as being on the agenda for 9.4, whereas changing
jsonb is still within the bounds of reason.

Considering all the hype that's built up around jsonb, shipping a design
with a fundamental performance handicap doesn't seem like a good plan
to me. We could perhaps band-aid around it by using different compression
parameters for jsonb, although that would require some painful API changes
since toast_compress_datum() doesn't know what datatype it's operating on.

I would rather ship late than ship a noncompressible JSONB.

Once we ship 9.4, many users are going to load 100's of GB into JSONB
fields. Even if we fix the compressibility issue in 9.5, those users
won't be able to fix the compression without rewriting all their data,
which could be prohibitive. And we'll be in a position where we have
to support the 9.4 JSONB format/compression technique for years so that
users aren't blocked from upgrading.

Would you accept removing the binary-search capability from jsonb just
to make it compressible? I certainly wouldn't. I'd hate to ship late
also, but I'd be willing to support that if we can find a good solution
to keep both compressability and binary-search (and provided it doesn't
delay us many months..).

Thanks,

Stephen

#27 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Stephen Frost (#24)
Re: jsonb format is pessimal for toast compression

Stephen Frost <sfrost@snowman.net> writes:

What about considering how large the object is when we are analyzing if
it compresses well overall?

Hmm, yeah, that's a possibility: we could redefine the limit at which
we bail out in terms of a fraction of the object size instead of a fixed
limit. However, that risks expending a large amount of work before we
bail, if we have a very large incompressible object --- which is not
exactly an unlikely case. Consider for example JPEG images stored as
bytea, which I believe I've heard of people doing. Another issue is
that it's not real clear that that fixes the problem for any fractional
size we'd want to use. In Larry's example of a jsonb value that fails
to compress, the header size is 940 bytes out of about 12K, so we'd be
needing to trial-compress about 10% of the object before we reach
compressible data --- and I doubt his example is worst-case.

1. The real problem here is that jsonb is emitting quite a bit of
fundamentally-nonrepetitive data, even when the user-visible input is very
repetitive. That's a compression-unfriendly transformation by anyone's
measure.

I disagree that another algorithm wouldn't be able to manage better on
this data than pglz. pglz, from my experience, is notoriously bad at
certain data sets which other algorithms are not as poorly impacted by.

Well, I used to be considered a compression expert, and I'm going to
disagree with you here. It's surely possible that other algorithms would
be able to get some traction where pglz fails to get any, but that doesn't
mean that presenting them with hard-to-compress data in the first place is
a good design decision. There is no scenario in which data like this is
going to be friendly to a general-purpose compression algorithm. It'd
be necessary to have explicit knowledge that the data consists of an
increasing series of four-byte integers to be able to do much with it.
And then such an assumption would break down once you got past the
header ...

Perhaps another option would be a new storage type which basically says
"just compress it, no matter what"? We'd be able to make that the
default for jsonb columns too, no?

Meh. We could do that, but it would still require adding arguments to
toast_compress_datum() that aren't there now. In any case, this is a
band-aid solution; and as Josh notes, once we ship 9.4 we are going to
be stuck with jsonb's on-disk representation pretty much forever.

regards, tom lane

#28Andrew Dunstan
andrew@dunslane.net
In reply to: Tom Lane (#27)
Re: jsonb format is pessimal for toast compression

On 08/08/2014 08:45 PM, Tom Lane wrote:

Perhaps another option would be a new storage type which basically says
"just compress it, no matter what"? We'd be able to make that the
default for jsonb columns too, no?

Meh. We could do that, but it would still require adding arguments to
toast_compress_datum() that aren't there now. In any case, this is a
band-aid solution; and as Josh notes, once we ship 9.4 we are going to
be stuck with jsonb's on-disk representation pretty much forever.

Yeah, and almost any other solution is likely to mean non-jsonb users
potentially paying a penalty for fixing this for jsonb. So if we can
adjust the jsonb layout to fix this problem I think we should do so.

cheers

andrew

#29Stephen Frost
sfrost@snowman.net
In reply to: Tom Lane (#27)
Re: jsonb format is pessimal for toast compression

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

Stephen Frost <sfrost@snowman.net> writes:

What about considering how large the object is when we are analyzing if
it compresses well overall?

Hmm, yeah, that's a possibility: we could redefine the limit at which
we bail out in terms of a fraction of the object size instead of a fixed
limit. However, that risks expending a large amount of work before we
bail, if we have a very large incompressible object --- which is not
exactly an unlikely case. Consider for example JPEG images stored as
bytea, which I believe I've heard of people doing. Another issue is
that it's not real clear that that fixes the problem for any fractional
size we'd want to use. In Larry's example of a jsonb value that fails
to compress, the header size is 940 bytes out of about 12K, so we'd be
needing to trial-compress about 10% of the object before we reach
compressible data --- and I doubt his example is worst-case.

Agreed; I tried to allude to that in my prior mail: there's clearly a
concern that we'd make things worse in certain situations. Then again,
at least for that case, we could recommend changing the storage type to
EXTERNAL.

1. The real problem here is that jsonb is emitting quite a bit of
fundamentally-nonrepetitive data, even when the user-visible input is very
repetitive. That's a compression-unfriendly transformation by anyone's
measure.

I disagree that another algorithm wouldn't be able to manage better on
this data than pglz. pglz, from my experience, is notoriously bad at
certain data sets which other algorithms are not as poorly impacted by.

Well, I used to be considered a compression expert, and I'm going to
disagree with you here. It's surely possible that other algorithms would
be able to get some traction where pglz fails to get any, but that doesn't
mean that presenting them with hard-to-compress data in the first place is
a good design decision. There is no scenario in which data like this is
going to be friendly to a general-purpose compression algorithm. It'd
be necessary to have explicit knowledge that the data consists of an
increasing series of four-byte integers to be able to do much with it.
And then such an assumption would break down once you got past the
header ...

I've wondered previously whether we, perhaps, missed the boat pretty
badly by not providing an explicitly versioned per-type compression
capability, such that we wouldn't be stuck with one compression
algorithm for all types, and would be able to version compression types
in a way that would allow us to change them over time, provided the
newer code always understands how to decode X-4 (or whatever) versions
back.

I do agree that it'd be great to represent every type in a highly
compressible way for the sake of the compression algorithm, but
I've not seen any good suggestions for how to make that happen and I've
got a hard time seeing how we could completely change the jsonb storage
format, retain the capabilities it has today, make it highly
compressible, and get 9.4 out this calendar year.

I expect we could trivially add padding into the jsonb header to make it
compress better, for the sake of this particular check, but then we're
going to always be compressing jsonb, even when the user data isn't
actually terribly good for compression, spending a good bit of CPU time
while we're at it.

Perhaps another option would be a new storage type which basically says
"just compress it, no matter what"? We'd be able to make that the
default for jsonb columns too, no?

Meh. We could do that, but it would still require adding arguments to
toast_compress_datum() that aren't there now. In any case, this is a
band-aid solution; and as Josh notes, once we ship 9.4 we are going to
be stuck with jsonb's on-disk representation pretty much forever.

I agree that we need to avoid changing jsonb's on-disk representation.
Have I missed where a good suggestion has been made about how to do that
which preserves the binary-search capabilities and doesn't make the code
much more difficult? Trying to move the header to the end just for the
sake of this doesn't strike me as a good solution as it'll make things
quite a bit more complicated. Is there a way we could interleave the
likely-compressible user data in with the header instead? I've not
looked, but it seems like that's the only reasonable approach to address
this issue in this manner. If that's simply done, then great, but it
strikes me as unlikely to be..

I'll just throw out a bit of a counter-point to all this also though- we
don't try to focus on making our on-disk representation of data,
generally, very compressible even though there are filesystems, such as
ZFS, which might benefit from certain rearrangements of our on-disk
formats (no, I don't have any specific recommendations in this vein, but
I certainly don't see anyone else asking after it or asking for us to be
concerned about it). Compression is great and I'd hate to see us have a
format that won't just work with it even though it might be beneficial in
many cases, but I feel the fault here is with the compression algorithm
and the decisions made as part of that operation and not really with
this particular data structure.

Thanks,

Stephen

#30Tom Lane
tgl@sss.pgh.pa.us
In reply to: Stephen Frost (#29)
Re: jsonb format is pessimal for toast compression

Stephen Frost <sfrost@snowman.net> writes:

I agree that we need to avoid changing jsonb's on-disk representation.

... post-release, I assume you mean.

Have I missed where a good suggestion has been made about how to do that
which preserves the binary-search capabilities and doesn't make the code
much more difficult?

We don't have one yet, but we've only been thinking about this for a few
hours.

Trying to move the header to the end just for the
sake of this doesn't strike me as a good solution as it'll make things
quite a bit more complicated. Is there a way we could interleave the
likely-compressible user data in with the header instead?

Yeah, I was wondering about that too, but I don't immediately see how to
do it without some sort of preprocessing step when we read the object
(which'd be morally equivalent to converting a series of lengths into a
pointer array). Binary search isn't going to work if the items it's
searching in aren't all the same size.

Having said that, I am not sure that a preprocessing step is a
deal-breaker. It'd be O(N), but with a pretty darn small constant factor,
and for plausible sizes of objects I think the binary search might still
dominate. Worth investigation perhaps.
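
For illustration, a minimal sketch of such a preprocessing step, turning
per-element lengths into an offset array in one O(N) pass (standalone C;
the names are hypothetical, not actual jsonb code, and malloc-failure
handling is omitted):

#include <stdint.h>
#include <stdlib.h>

/*
 * Illustrative sketch only: convert per-element lengths into start
 * offsets so that later per-element lookups are O(1).
 */
static uint32_t *
build_offsets(const uint32_t *lens, int nelems)
{
    uint32_t   *offsets = malloc((nelems + 1) * sizeof(uint32_t));
    uint32_t    off = 0;
    int         i;

    for (i = 0; i < nelems; i++)
    {
        offsets[i] = off;       /* start offset of element i */
        off += lens[i];         /* accumulate this element's length */
    }
    offsets[nelems] = off;      /* total payload size, as an end sentinel */
    return offsets;
}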

regards, tom lane

#31Stephen Frost
sfrost@snowman.net
In reply to: Tom Lane (#30)
Re: jsonb format is pessimal for toast compression

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

Stephen Frost <sfrost@snowman.net> writes:

I agree that we need to avoid changing jsonb's on-disk representation.

... post-release, I assume you mean.

Yes.

Have I missed where a good suggestion has been made about how to do that
which preserves the binary-search capabilities and doesn't make the code
much more difficult?

We don't have one yet, but we've only been thinking about this for a few
hours.

Fair enough.

Trying to move the header to the end just for the
sake of this doesn't strike me as a good solution as it'll make things
quite a bit more complicated. Is there a way we could interleave the
likely-compressible user data in with the header instead?

Yeah, I was wondering about that too, but I don't immediately see how to
do it without some sort of preprocessing step when we read the object
(which'd be morally equivalent to converting a series of lengths into a
pointer array). Binary search isn't going to work if the items it's
searching in aren't all the same size.

Having said that, I am not sure that a preprocessing step is a
deal-breaker. It'd be O(N), but with a pretty darn small constant factor,
and for plausible sizes of objects I think the binary search might still
dominate. Worth investigation perhaps.

For my part, I'm less concerned about a preprocessing step which happens
when we store the data and more concerned about ensuring that we're able
to extract data quickly. Perhaps that's simply because I'm used to
writes being more expensive than reads, but I'm not alone in that
regard either. I doubt I'll have time in the next couple of weeks to
look into this and if we're going to want this change for 9.4, we really
need someone working on it sooner than later. (to the crowd)- do we
have any takers for this investigation?

Thanks,

Stephen

#32Amit Kapila
amit.kapila16@gmail.com
In reply to: Tom Lane (#27)
Re: jsonb format is pessimal for toast compression

On Sat, Aug 9, 2014 at 6:15 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Stephen Frost <sfrost@snowman.net> writes:

What about considering how large the object is when we are analyzing if
it compresses well overall?

Hmm, yeah, that's a possibility: we could redefine the limit at which
we bail out in terms of a fraction of the object size instead of a fixed
limit. However, that risks expending a large amount of work before we
bail, if we have a very large incompressible object --- which is not
exactly an unlikely case. Consider for example JPEG images stored as
bytea, which I believe I've heard of people doing. Another issue is
that it's not real clear that that fixes the problem for any fractional
size we'd want to use. In Larry's example of a jsonb value that fails
to compress, the header size is 940 bytes out of about 12K, so we'd be
needing to trial-compress about 10% of the object before we reach
compressible data --- and I doubt his example is worst-case.

1. The real problem here is that jsonb is emitting quite a bit of
fundamentally-nonrepetitive data, even when the user-visible input is very
repetitive. That's a compression-unfriendly transformation by anyone's
measure.

I disagree that another algorithm wouldn't be able to manage better on
this data than pglz. pglz, from my experience, is notoriously bad at
certain data sets which other algorithms are not as poorly impacted by.

Well, I used to be considered a compression expert, and I'm going to
disagree with you here. It's surely possible that other algorithms would
be able to get some traction where pglz fails to get any,

During my previous work in this area, I had seen that some algorithms
use skipping logic, which can be useful for incompressible data followed
by compressible data, or in general as well. One such technique could
be: if we don't find any match for the first 4 bytes, then skip 4 bytes;
if we don't find a match again for the next 8 bytes, then skip 8
bytes; and keep on doing the same until we find the first match, in which
case we would go back to the beginning of the data. Here we could follow
this logic until we have actually compared a total of first_success_by bytes.
There can be caveats in this particular scheme of skipping, but I
just wanted to mention the skipping idea in general as a way to reduce
the number of situations where we bail out even though there is a
lot of compressible data.
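
A rough sketch of one way to read that skipping idea (illustrative only;
find_match() is a stand-in for pglz's history lookup, all names are
hypothetical, and the real matching loop is far more involved):

#include <stdbool.h>

/*
 * Illustrative sketch only: probe the first first_success_by bytes,
 * doubling the skip distance while nothing matches, instead of giving
 * up after examining them byte by byte.
 */
static bool
probe_finds_match(const char *input, int first_success_by,
                  bool (*find_match) (const char *p))
{
    const char *dp = input;
    const char *dend = input + first_success_by;
    int         skip = 4;

    while (dp < dend)
    {
        if (find_match(dp))
            return true;        /* found something compressible: proceed */
        dp += skip;             /* no match here: skip ahead ...          */
        skip *= 2;              /* ... and double the skip distance       */
    }
    return false;               /* bail out, as pglz does today */
}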

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#33Bruce Momjian
bruce@momjian.us
In reply to: Stephen Frost (#25)
Re: jsonb format is pessimal for toast compression

On Fri, Aug 8, 2014 at 08:25:04PM -0400, Stephen Frost wrote:

* Bruce Momjian (bruce@momjian.us) wrote:

On Fri, Aug 8, 2014 at 11:02:26AM -0400, Tom Lane wrote:

2. Are we going to ship 9.4 without fixing this? I definitely don't see
replacing pg_lzcompress as being on the agenda for 9.4, whereas changing
jsonb is still within the bounds of reason.

FYI, pg_upgrade could be taught to refuse to upgrade from earlier 9.4
betas and report the problem JSONB columns.

That is *not* a good solution..

If you change the JSONB binary format, and we can't read the old format,
that is the only option.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +

#34David G Johnston
david.g.johnston@gmail.com
In reply to: Amit Kapila (#32)
Re: jsonb format is pessimal for toast compression

akapila wrote:

On Sat, Aug 9, 2014 at 6:15 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Stephen Frost <sfrost@snowman.net> writes:

What about considering how large the object is when we are analyzing if
it compresses well overall?

Hmm, yeah, that's a possibility: we could redefine the limit at which
we bail out in terms of a fraction of the object size instead of a fixed
limit. However, that risks expending a large amount of work before we
bail, if we have a very large incompressible object --- which is not
exactly an unlikely case. Consider for example JPEG images stored as
bytea, which I believe I've heard of people doing. Another issue is
that it's not real clear that that fixes the problem for any fractional
size we'd want to use. In Larry's example of a jsonb value that fails
to compress, the header size is 940 bytes out of about 12K, so we'd be
needing to trial-compress about 10% of the object before we reach
compressible data --- and I doubt his example is worst-case.

1. The real problem here is that jsonb is emitting quite a bit of
fundamentally-nonrepetitive data, even when the user-visible input is very
repetitive. That's a compression-unfriendly transformation by anyone's
measure.

I disagree that another algorithm wouldn't be able to manage better on
this data than pglz. pglz, from my experience, is notoriously bad at
certain data sets which other algorithms are not as poorly impacted by.

Well, I used to be considered a compression expert, and I'm going to
disagree with you here. It's surely possible that other algorithms would
be able to get some traction where pglz fails to get any,

During my previous work in this area, I had seen that some algorithms
use skipping logic which can be useful for incompressible data followed
by compressible data or in general as well.

Random thought from the sideline...

This particular data type has the novel (within PostgreSQL) design of both a
(feature oriented - and sizeable) header and a payload. Is there some way
to add that model into the storage system so that, at a higher level,
separate attempts are made to compress each section, and then the compressed
(or not) results are written out adjacently, with a small header
indicating the length of the stored header and other metadata, like whether
each part is compressed and even the type that data represents? For reading
back into memory, the header-payload generic type is populated, then the
header and payload are decompressed as needed, and the two parts are fed
into the appropriate type constructor that understands and accepts the two
pieces.

Just hoping to spark an idea here - I don't know enough about the internals
to even guess how close I am to something feasible.

David J.

#35Stephen Frost
sfrost@snowman.net
In reply to: Bruce Momjian (#33)
Re: jsonb format is pessimal for toast compression

Bruce,

* Bruce Momjian (bruce@momjian.us) wrote:

On Fri, Aug 8, 2014 at 08:25:04PM -0400, Stephen Frost wrote:

* Bruce Momjian (bruce@momjian.us) wrote:

FYI, pg_upgrade could be taught to refuse to upgrade from earlier 9.4
betas and report the problem JSONB columns.

That is *not* a good solution..

If you change the JSONB binary format, and we can't read the old format,
that is the only option.

Apologies- I had thought you were suggesting this for a 9.4 -> 9.5
conversion, not for just 9.4beta to 9.4. Adding that to pg_upgrade to
address folks upgrading from betas would certainly be fine.

Thanks,

Stephen

#36Kevin Grittner
kgrittn@ymail.com
In reply to: Tom Lane (#30)
Re: jsonb format is pessimal for toast compression

Tom Lane <tgl@sss.pgh.pa.us> wrote:

Stephen Frost <sfrost@snowman.net> writes:

Trying to move the header to the end just for the sake of this
doesn't strike me as a good solution as it'll make things quite
a bit more complicated.

Why is that?  How much harder would it be to add a single offset
field to the front to point to the part we're shifting to the end?
It is not all that unusual to put a directory at the end, like in
the .zip file format.

Is there a way we could interleave the likely-compressible user
data in with the header instead?

Yeah, I was wondering about that too, but I don't immediately see
how to do it without some sort of preprocessing step when we read
the object (which'd be morally equivalent to converting a series
of lengths into a pointer array).

That sounds far more complex and fragile than just moving the
indexes to the end.

--
Kevin Grittner
EDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#37Tom Lane
tgl@sss.pgh.pa.us
In reply to: Kevin Grittner (#36)
Re: jsonb format is pessimal for toast compression

Kevin Grittner <kgrittn@ymail.com> writes:

Stephen Frost <sfrost@snowman.net> writes:

Trying to move the header to the end just for the sake of this
doesn't strike me as a good solution as it'll make things quite
a bit more complicated.

Why is that?  How much harder would it be to add a single offset
field to the front to point to the part we're shifting to the end?
It is not all that unusual to put a directory at the end, like in
the .zip file format.

Yeah, I was wondering that too. Arguably, directory-at-the-end would
be easier to work with for on-the-fly creation, not that we do any
such thing at the moment. I think the main thing that's bugging Stephen
is that doing that just to make pglz_compress happy seems like a kluge
(and I have to agree).

Here's a possibly more concrete thing to think about: we may very well
someday want to support JSONB object field or array element extraction
without reading all blocks of a large toasted JSONB value, if the value is
stored external without compression. We already went to the trouble of
creating analogous logic for substring extraction from a long uncompressed
text or bytea value, so I think this is a plausible future desire. With
the current format you could imagine grabbing the first TOAST chunk, and
then if you see the header is longer than that you can grab the remainder
of the header without any wasted I/O, and for the array-subscripting case
you'd now have enough info to fetch the element value from the body of
the JSONB without any wasted I/O. With directory-at-the-end you'd
have to read the first chunk just to get the directory pointer, and this
would most likely not give you any of the directory proper; but at least
you'd know exactly how big the directory is before you go to read it in.
The former case is probably slightly better. However, if you're doing an
object key lookup, not an array element fetch, neither of these formats is
really friendly at all, because each binary-search probe probably requires
bringing in one or two toast chunks from the body of the JSONB value so
you can look at the key text. I'm not sure if there's a way to redesign
the format to make that less painful/expensive --- but certainly, having
the key texts scattered through the JSONB value doesn't seem like a great
thing from this standpoint.

regards, tom lane

#38Robert Haas
robertmhaas@gmail.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

On Sat, Aug 9, 2014 at 3:51 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Kevin Grittner <kgrittn@ymail.com> writes:

Stephen Frost <sfrost@snowman.net> writes:

Trying to move the header to the end just for the sake of this
doesn't strike me as a good solution as it'll make things quite
a bit more complicated.

Why is that? How much harder would it be to add a single offset
field to the front to point to the part we're shifting to the end?
It is not all that unusual to put a directory at the end, like in
the .zip file format.

Yeah, I was wondering that too. Arguably, directory-at-the-end would
be easier to work with for on-the-fly creation, not that we do any
such thing at the moment. I think the main thing that's bugging Stephen
is that doing that just to make pglz_compress happy seems like a kluge
(and I have to agree).

Here's a possibly more concrete thing to think about: we may very well
someday want to support JSONB object field or array element extraction
without reading all blocks of a large toasted JSONB value, if the value is
stored external without compression. We already went to the trouble of
creating analogous logic for substring extraction from a long uncompressed
text or bytea value, so I think this is a plausible future desire. With
the current format you could imagine grabbing the first TOAST chunk, and
then if you see the header is longer than that you can grab the remainder
of the header without any wasted I/O, and for the array-subscripting case
you'd now have enough info to fetch the element value from the body of
the JSONB without any wasted I/O. With directory-at-the-end you'd
have to read the first chunk just to get the directory pointer, and this
would most likely not give you any of the directory proper; but at least
you'd know exactly how big the directory is before you go to read it in.
The former case is probably slightly better. However, if you're doing an
object key lookup, not an array element fetch, neither of these formats is
really friendly at all, because each binary-search probe probably requires
bringing in one or two toast chunks from the body of the JSONB value so
you can look at the key text. I'm not sure if there's a way to redesign
the format to make that less painful/expensive --- but certainly, having
the key texts scattered through the JSONB value doesn't seem like a great
thing from this standpoint.

I think that's a good point.

On the general topic, I don't think it's reasonable to imagine that
we're going to come up with a single heuristic that works well for
every kind of input data. What pglz is doing - assuming that if the
beginning of the data is incompressible then the rest probably is too
- is fundamentally reasonable, notwithstanding the fact that it
doesn't happen to work out well for JSONB. We might be able to tinker
with that general strategy in some way that seems to fix this case and
doesn't appear to break others, but there's some risk in that, and
there's no obvious reason in my mind why PGLZ should be required to fly
blind. So I think it would be a better idea to arrange some method by
which JSONB (and perhaps other data types) can provide compression
hints to pglz.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#39Tom Lane
tgl@sss.pgh.pa.us
In reply to: Robert Haas (#38)
Re: jsonb format is pessimal for toast compression

Robert Haas <robertmhaas@gmail.com> writes:

... I think it would be a better idea to arrange some method by
which JSONB (and perhaps other data types) can provide compression
hints to pglz.

I agree with that as a long-term goal, but not sure if it's sane to
push into 9.4.

What we could conceivably do now is (a) add a datatype OID argument to
toast_compress_datum, and (b) hard-wire the selection of a different
compression-parameters struct if it's JSONBOID. The actual fix would
then be to increase the first_success_by field of this alternate struct.
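
For concreteness, a minimal sketch of what the strategy selection in
(a) plus (b) might look like (illustrative only; the helper name is made
up, toast_compress_datum itself is not shown, and the usual PostgreSQL
headers for PGLZ_Strategy, PGLZ_strategy_default, and JSONBOID are
assumed):

/*
 * Illustrative sketch only: given the new datatype OID argument from (a),
 * hard-wire a looser strategy for jsonb as in (b).
 */
static const PGLZ_Strategy *
choose_compression_strategy(Oid typid)
{
    static PGLZ_Strategy jsonb_strategy;
    static bool jsonb_strategy_init = false;

    if (typid != JSONBOID)
        return PGLZ_strategy_default;

    if (!jsonb_strategy_init)
    {
        /* copy the default knobs, then disable the early bail-out */
        jsonb_strategy = *PGLZ_strategy_default;
        jsonb_strategy.first_success_by = INT_MAX;
        jsonb_strategy_init = true;
    }
    return &jsonb_strategy;
}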

I had been worrying about API-instability risks associated with (a),
but on reflection it seems unlikely that any third-party code calls
toast_compress_datum directly, and anyway it's not something we'd
be back-patching to before 9.4.

The main objection to (b) is that it wouldn't help for domains over jsonb
(and no, I don't want to insert a getBaseType call there to fix that).

A longer-term solution would be to make this some sort of type property
that domains could inherit, like typstorage is already. (Somebody
suggested dealing with this by adding more typstorage values, but
I don't find that attractive; typstorage is known in too many places.)
We'd need some thought about exactly what we want to expose, since
the specific knobs that pglz_compress has today aren't necessarily
good for the long term.

This is all kinda ugly really, but since I'm not hearing brilliant
ideas for redesigning jsonb's storage format, maybe this is the
best we can do for now.

regards, tom lane

#40Peter Geoghegan
pg@heroku.com
In reply to: Robert Haas (#38)
Re: jsonb format is pessimal for toast compression

On Mon, Aug 11, 2014 at 12:07 PM, Robert Haas <robertmhaas@gmail.com> wrote:

I think that's a good point.

I think that there may be something to be said for the current layout.
Having adjacent keys and values could take better advantage of CPU
cache characteristics. I've heard of approaches to improving B-Tree
locality that forced keys and values to be adjacent on individual
B-Tree pages [1], for example. I've heard of this more than once. And
FWIW, I believe based on earlier research of user requirements in this
area that very large jsonb datums are not considered all that
compelling. Document database systems have considerable limitations
here.

On the general topic, I don't think it's reasonable to imagine that
we're going to come up with a single heuristic that works well for
every kind of input data. What pglz is doing - assuming that if the
beginning of the data is incompressible then the rest probably is too
- is fundamentally reasonable, notwithstanding the fact that it
doesn't happen to work out well for JSONB. We might be able to tinker
with that general strategy in some way that seems to fix this case and
doesn't appear to break others, but there's some risk in that, and
there's no obvious reason in my mind why PGLZ should be required to fly
blind. So I think it would be a better idea to arrange some method by
which JSONB (and perhaps other data types) can provide compression
hints to pglz.

If there is to be any effort to make jsonb a more effective target for
compression, I imagine that that would have to target redundancy
between JSON documents. With idiomatic usage, we can expect plenty of
it.

[1]: http://www.vldb.org/conf/1999/P7.pdf , "We also forced each key
and child pointer to be adjacent to each other physically"
--
Peter Geoghegan

#41Stephen Frost
sfrost@snowman.net
In reply to: Tom Lane (#39)
Re: jsonb format is pessimal for toast compression

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

Robert Haas <robertmhaas@gmail.com> writes:

... I think it would be a better idea to arrange some method by
which JSONB (and perhaps other data types) can provide compression
hints to pglz.

I agree with that as a long-term goal, but not sure if it's sane to
push into 9.4.

Agreed.

What we could conceivably do now is (a) add a datatype OID argument to
toast_compress_datum, and (b) hard-wire the selection of a different
compression-parameters struct if it's JSONBOID. The actual fix would
then be to increase the first_success_by field of this alternate struct.

Isn't the offset-to-compressible-data variable though, depending on the
number of keys, etc? Would we be increasing first_success_by based off
of some function which inspects the object?

I had been worrying about API-instability risks associated with (a),
but on reflection it seems unlikely that any third-party code calls
toast_compress_datum directly, and anyway it's not something we'd
be back-patching to before 9.4.

Agreed.

The main objection to (b) is that it wouldn't help for domains over jsonb
(and no, I don't want to insert a getBaseType call there to fix that).

While not ideal, that seems like an acceptable compromise for 9.4 to me.

A longer-term solution would be to make this some sort of type property
that domains could inherit, like typstorage is already. (Somebody
suggested dealing with this by adding more typstorage values, but
I don't find that attractive; typstorage is known in too many places.)

Think that was me, and having it be something which domains can inherit
makes sense. Would we be able to use this approach to introduce
compression algorithms specific to a type (and to domains inherited from
that type), perhaps? Or even get to a point where we could have a chunk-based
compression scheme for certain types of objects (such as JSONB) where we
keep track of which keys exist at which points in the compressed object,
allowing us to skip to the specific chunk which contains the requested
key, similar to what we do with uncompressed data?

We'd need some thought about exactly what we want to expose, since
the specific knobs that pglz_compress has today aren't necessarily
good for the long term.

Agreed.

This is all kinda ugly really, but since I'm not hearing brilliant
ideas for redesigning jsonb's storage format, maybe this is the
best we can do for now.

This would certainly be an improvement over what's going on now, and I
love the idea of possibly being able to expand this in the future to do
more. What I'd hate to see is having all of this and only ever using it
to say "skip ahead another 1k for JSONB".

Thanks,

Stephen

#42Tom Lane
tgl@sss.pgh.pa.us
In reply to: Stephen Frost (#41)
Re: jsonb format is pessimal for toast compression

Stephen Frost <sfrost@snowman.net> writes:

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

What we could conceivably do now is (a) add a datatype OID argument to
toast_compress_datum, and (b) hard-wire the selection of a different
compression-parameters struct if it's JSONBOID. The actual fix would
then be to increase the first_success_by field of this alternate struct.

Isn't the offset-to-compressible-data variable though, depending on the
number of keys, etc? Would we be increasing first_success_by based off
of some function which inspects the object?

Given that this is a short-term hack, I'd be satisfied with setting it
to INT_MAX.

If we got more ambitious, we could consider improving the cutoff logic
so that it gives up at "x% of the object or n bytes, whichever comes
first"; but I'd want to see some hard evidence that that was useful
before adding any more cycles to pglz_compress.
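
For what it's worth, that cutoff would amount to something like the
following (illustrative only; all names are hypothetical):

/*
 * Illustrative sketch only: "x% of the object or n bytes, whichever
 * comes first", expressed as a probe limit.
 */
static int
probe_limit(int slen, int first_success_by, double max_fraction)
{
    int         frac_limit = (int) (slen * max_fraction);

    return (frac_limit < first_success_by) ? frac_limit : first_success_by;
}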

regards, tom lane

#43Stephen Frost
sfrost@snowman.net
In reply to: Peter Geoghegan (#40)
Re: jsonb format is pessimal for toast compression

* Peter Geoghegan (pg@heroku.com) wrote:

If there is to be any effort to make jsonb a more effective target for
compression, I imagine that that would have to target redundancy
between JSON documents. With idiomatic usage, we can expect plenty of
it.

While I certainly agree, that's a rather different animal to address and
doesn't hold a lot of relevance to the current problem. Or, to put it
another way, I don't think anyone is going to be surprised that two rows
containing the same data (even if they're inserted in the same
transaction and have the same visibility information) aren't compressed
together in some fashion.

We've got a clear example of someone, quite reasonably, expecting their
JSONB object to be compressed using the normal TOAST mechanism, and
we're failing to do that in cases where it's actually a win to do so.
That's the focus of this discussion and what needs to be addressed
before 9.4 goes out.

Thanks,

Stephen

#44Peter Geoghegan
pg@heroku.com
In reply to: Stephen Frost (#43)
Re: jsonb format is pessimal for toast compression

On Mon, Aug 11, 2014 at 1:01 PM, Stephen Frost <sfrost@snowman.net> wrote:

We've got a clear example of someone, quite reasonably, expecting their
JSONB object to be compressed using the normal TOAST mechanism, and
we're failing to do that in cases where it's actually a win to do so.
That's the focus of this discussion and what needs to be addressed
before 9.4 goes out.

Sure. I'm not trying to minimize that. We should fix it, certainly.
However, it does bear considering that JSON data, with each document
stored in a row is not an effective target for TOAST compression in
general, even as text.

--
Peter Geoghegan

#45Marti Raudsepp
marti@juffo.org
In reply to: Hannu Krosing (#20)
Re: jsonb format is pessimal for toast compression

On Fri, Aug 8, 2014 at 10:50 PM, Hannu Krosing <hannu@2ndquadrant.com> wrote:

How hard and how expensive would it be to teach pg_lzcompress to
apply a delta filter on suitable data ?

So that instead of integers their deltas will be fed to the "real"
compressor

Has anyone given this more thought? I know this might not be 9.4
material, but to me it sounds like the most promising approach, if
it's workable. This isn't a made up thing, the 7z and LZMA formats
also have an optional delta filter.

Of course with JSONB the problem is figuring out which parts to apply
the delta filter to, and which parts not.

This would also help with integer arrays, containing for example
foreign key values to a serial column. There's bound to be some
redundancy, as nearby serial values are likely to end up close
together. In one of my past projects we used to store large arrays of
integer fkeys, deliberately sorted for duplicate elimination.

For an ideal-case comparison, intar2 could compress to about the same
size as intar1 if it were compressed with a 4-byte-wide delta filter:

create table intar1 as select array(select 1::int from
generate_series(1,1000000)) a;
create table intar2 as select array(select generate_series(1,1000000)::int) a;

In PostgreSQL 9.3 the sizes are:
select pg_column_size(a) from intar1;
45810
select pg_column_size(a) from intar2;
4000020

So a factor of 87 difference.
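
For illustration, a toy sketch of such a 4-byte-wide delta filter
(standalone C, not tied to pg_lzcompress; names are made up):

#include <stdint.h>

/*
 * Toy sketch of a 4-byte-wide delta filter: store each 32-bit value as
 * its difference from the previous one, so that nearby integers become
 * small, repetitive deltas that a general-purpose compressor handles well.
 */
static void
delta_encode(uint32_t *v, int n)
{
    uint32_t    prev = 0;
    int         i;

    for (i = 0; i < n; i++)
    {
        uint32_t    cur = v[i];

        v[i] = cur - prev;      /* unsigned wraparound is harmless */
        prev = cur;
    }
}

static void
delta_decode(uint32_t *v, int n)
{
    uint32_t    acc = 0;
    int         i;

    for (i = 0; i < n; i++)
    {
        acc += v[i];            /* running sum undoes the deltas */
        v[i] = acc;
    }
}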

Regards,
Marti

#46Robert Haas
robertmhaas@gmail.com
In reply to: Tom Lane (#39)
Re: jsonb format is pessimal for toast compression

On Mon, Aug 11, 2014 at 3:35 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Robert Haas <robertmhaas@gmail.com> writes:

... I think it would be a better idea to arrange some method by
which JSONB (and perhaps other data types) can provide compression
hints to pglz.

I agree with that as a long-term goal, but not sure if it's sane to
push into 9.4.

What we could conceivably do now is (a) add a datatype OID argument to
toast_compress_datum, and (b) hard-wire the selection of a different
compression-parameters struct if it's JSONBOID. The actual fix would
then be to increase the first_success_by field of this alternate struct.

I think it would be perfectly sane to do that for 9.4. It may not be
perfect, but neither is what we have now.

A longer-term solution would be to make this some sort of type property
that domains could inherit, like typstorage is already. (Somebody
suggested dealing with this by adding more typstorage values, but
I don't find that attractive; typstorage is known in too many places.)
We'd need some thought about exactly what we want to expose, since
the specific knobs that pglz_compress has today aren't necessarily
good for the long term.

What would really be ideal here is if the JSON code could inform the
toast compression code "this many initial bytes are likely
incompressible, just pass them through without trying, and then start
compressing at byte N", where N is the byte following the TOC. But I
don't know that there's a reasonable way to implement that.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#47Bruce Momjian
bruce@momjian.us
In reply to: Peter Geoghegan (#44)
Re: jsonb format is pessimal for toast compression

On Mon, Aug 11, 2014 at 01:44:05PM -0700, Peter Geoghegan wrote:

On Mon, Aug 11, 2014 at 1:01 PM, Stephen Frost <sfrost@snowman.net> wrote:

We've got a clear example of someone, quite reasonably, expecting their
JSONB object to be compressed using the normal TOAST mechanism, and
we're failing to do that in cases where it's actually a win to do so.
That's the focus of this discussion and what needs to be addressed
before 9.4 goes out.

Sure. I'm not trying to minimize that. We should fix it, certainly.
However, it does bear considering that JSON data, with each document
stored in a row is not an effective target for TOAST compression in
general, even as text.

Seems we have two issues:

1) the header makes testing for compression likely to fail
2) use of pointers rather than offsets reduces compression potential

I understand we are focusing on #1, but how much does compression reduce
the storage size with and without #2? Seems we need to know that answer
before deciding if it is worth reducing the ability to do fast lookups
with #2.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +

#48Claudio Freire
klaussfreire@gmail.com
In reply to: Bruce Momjian (#47)
Re: jsonb format is pessimal for toast compression

On Tue, Aug 12, 2014 at 8:00 PM, Bruce Momjian <bruce@momjian.us> wrote:

On Mon, Aug 11, 2014 at 01:44:05PM -0700, Peter Geoghegan wrote:

On Mon, Aug 11, 2014 at 1:01 PM, Stephen Frost <sfrost@snowman.net> wrote:

We've got a clear example of someone, quite reasonably, expecting their
JSONB object to be compressed using the normal TOAST mechanism, and
we're failing to do that in cases where it's actually a win to do so.
That's the focus of this discussion and what needs to be addressed
before 9.4 goes out.

Sure. I'm not trying to minimize that. We should fix it, certainly.
However, it does bear considering that JSON data, with each document
stored in a row is not an effective target for TOAST compression in
general, even as text.

Seems we have two issues:

1) the header makes testing for compression likely to fail
2) use of pointers rather than offsets reduces compression potential

I do think the best solution for 2 is what's been proposed already, to
do delta-coding of the pointers in chunks (ie, 1 pointer, 15 deltas,
repeat).

But it does make binary search quite a bit more complex.

Alternatively, it could be somewhat compressed as follows:

Segment = 1 pointer head, 15 deltas
Pointer head = pointers[0]
delta[i] = pointers[i] - pointers[0] for i in 1..15

(delta to segment head, not previous value)

Now, you can have 4 types of segments: 8, 16, 32, or 64 bits, which is
the size of the deltas. You achieve between 8x and 1x compression, and
even when 1x (no compression), you make it easier for pglz to find
something compressible.

Accessing it is also simple, if you have a segment index (tough part here).

Replace the 15 with something that makes such a segment index very compact ;)
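
A toy sketch of the offset lookup under that layout, with a fixed
segment size of 16 and 32-bit deltas only (the variable-width segment
types are left out, and all names are hypothetical):

#include <stdint.h>

#define SEG_SIZE 16             /* 1 absolute head + 15 deltas */

typedef struct OffsetSegment
{
    uint32_t    head;                   /* absolute offset of entry 0   */
    uint32_t    delta[SEG_SIZE - 1];    /* offsets relative to the head */
} OffsetSegment;

/* Illustrative sketch only: recover the absolute offset of entry "index". */
static uint32_t
get_offset(const OffsetSegment *segs, int index)
{
    const OffsetSegment *seg = &segs[index / SEG_SIZE];
    int         slot = index % SEG_SIZE;

    return (slot == 0) ? seg->head : seg->head + seg->delta[slot - 1];
}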

#49Tom Lane
tgl@sss.pgh.pa.us
In reply to: Bruce Momjian (#47)
1 attachment(s)
Re: jsonb format is pessimal for toast compression

Bruce Momjian <bruce@momjian.us> writes:

Seems we have two issues:
1) the header makes testing for compression likely to fail
2) use of pointers rather than offsets reduces compression potential

I understand we are focusing on #1, but how much does compression reduce
the storage size with and without #2? Seems we need to know that answer
before deciding if it is worth reducing the ability to do fast lookups
with #2.

That's a fair question. I did a very very simple hack to replace the item
offsets with item lengths -- turns out that that mostly requires removing
some code that changes lengths to offsets ;-). I then loaded up Larry's
example of a noncompressible JSON value, and compared pg_column_size()
which is just about the right thing here since it reports datum size after
compression. Remembering that the textual representation is 12353 bytes:

json: 382 bytes
jsonb, using offsets: 12593 bytes
jsonb, using lengths: 406 bytes

So this confirms my suspicion that the choice of offsets not lengths
is what's killing compressibility. If it used lengths, jsonb would be
very nearly as compressible as the original text.

Hack attached in case anyone wants to collect more thorough statistics.
We'd not actually want to do it like this because of the large expense
of recomputing the offsets on-demand all the time. (It does pass the
regression tests, for what that's worth.)

regards, tom lane

Attachments:

jsonb-with-lengths-hack.patch (text/x-diff; charset=us-ascii)
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..2297504 100644
*** a/src/backend/utils/adt/jsonb_util.c
--- b/src/backend/utils/adt/jsonb_util.c
*************** convertJsonbArray(StringInfo buffer, JEn
*** 1378,1385 ****
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
  							JENTRY_POSMASK)));
  
- 		if (i > 0)
- 			meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
--- 1378,1383 ----
*************** convertJsonbObject(StringInfo buffer, JE
*** 1430,1437 ****
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
  							JENTRY_POSMASK)));
  
- 		if (i > 0)
- 			meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  
--- 1428,1433 ----
*************** convertJsonbObject(StringInfo buffer, JE
*** 1445,1451 ****
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
  							JENTRY_POSMASK)));
  
- 		meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
--- 1441,1446 ----
*************** uniqueifyJsonbObject(JsonbValue *object)
*** 1592,1594 ****
--- 1587,1600 ----
  		object->val.object.nPairs = res + 1 - object->val.object.pairs;
  	}
  }
+ 
+ uint32
+ jsonb_get_offset(const JEntry *ja, int index)
+ {
+ 	uint32	off = 0;
+ 	int i;
+ 
+ 	for (i = 0; i < index; i++)
+ 		off += JBE_LEN(ja, i);
+ 	return off;
+ }
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 5f2594b..c9b18e1 100644
*** a/src/include/utils/jsonb.h
--- b/src/include/utils/jsonb.h
*************** typedef uint32 JEntry;
*** 153,162 ****
   * Macros for getting the offset and length of an element. Note multiple
   * evaluations and access to prior array element.
   */
! #define JBE_ENDPOS(je_)			((je_) & JENTRY_POSMASK)
! #define JBE_OFF(ja, i)			((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
! #define JBE_LEN(ja, i)			((i) == 0 ? JBE_ENDPOS((ja)[i]) \
! 								 : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
  
  /*
   * A jsonb array or object node, within a Jsonb Datum.
--- 153,163 ----
   * Macros for getting the offset and length of an element. Note multiple
   * evaluations and access to prior array element.
   */
! #define JBE_LENFLD(je_)			((je_) & JENTRY_POSMASK)
! #define JBE_OFF(ja, i)			jsonb_get_offset(ja, i)
! #define JBE_LEN(ja, i)			JBE_LENFLD((ja)[i])
! 
! extern uint32 jsonb_get_offset(const JEntry *ja, int index);
  
  /*
   * A jsonb array or object node, within a Jsonb Datum.
#50Tom Lane
tgl@sss.pgh.pa.us
In reply to: Tom Lane (#49)
Re: jsonb format is pessimal for toast compression

I wrote:

That's a fair question. I did a very very simple hack to replace the item
offsets with item lengths -- turns out that that mostly requires removing
some code that changes lengths to offsets ;-). I then loaded up Larry's
example of a noncompressible JSON value, and compared pg_column_size()
which is just about the right thing here since it reports datum size after
compression. Remembering that the textual representation is 12353 bytes:

json: 382 bytes
jsonb, using offsets: 12593 bytes
jsonb, using lengths: 406 bytes

Oh, one more result: if I leave the representation alone, but change
the compression parameters to set first_success_by to INT_MAX, this
value takes up 1397 bytes. So that's better, but still more than a
3X penalty compared to using lengths. (Admittedly, this test value
probably is an outlier compared to normal practice, since it's a hundred
or so repetitions of the same two strings.)

regards, tom lane

#51Andrew Dunstan
andrew@dunslane.net
In reply to: Tom Lane (#50)
Re: jsonb format is pessimal for toast compression

On 08/13/2014 09:01 PM, Tom Lane wrote:

I wrote:

That's a fair question. I did a very very simple hack to replace the item
offsets with item lengths -- turns out that that mostly requires removing
some code that changes lengths to offsets ;-). I then loaded up Larry's
example of a noncompressible JSON value, and compared pg_column_size()
which is just about the right thing here since it reports datum size after
compression. Remembering that the textual representation is 12353 bytes:
json: 382 bytes
jsonb, using offsets: 12593 bytes
jsonb, using lengths: 406 bytes

Oh, one more result: if I leave the representation alone, but change
the compression parameters to set first_success_by to INT_MAX, this
value takes up 1397 bytes. So that's better, but still more than a
3X penalty compared to using lengths. (Admittedly, this test value
probably is an outlier compared to normal practice, since it's a hundred
or so repetitions of the same two strings.)

What does changing to lengths do to the speed of other operations?

cheers

andrew

#52Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andrew Dunstan (#51)
Re: jsonb format is pessimal for toast compression

Andrew Dunstan <andrew@dunslane.net> writes:

On 08/13/2014 09:01 PM, Tom Lane wrote:

That's a fair question. I did a very very simple hack to replace the item
offsets with item lengths -- turns out that that mostly requires removing
some code that changes lengths to offsets ;-).

What does changing to lengths do to the speed of other operations?

This was explicitly *not* an attempt to measure the speed issue. To do
a fair trial of that, you'd have to work a good bit harder, methinks.
Examining each of N items would involve O(N^2) work with the patch as
posted, but presumably you could get it down to less in most or all
cases --- in particular, sequential traversal could be done with little
added cost. But it'd take a lot more hacking.

regards, tom lane

#53Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Tom Lane (#50)
1 attachment(s)
Re: jsonb format is pessimal for toast compression

On 08/14/2014 04:01 AM, Tom Lane wrote:

I wrote:

That's a fair question. I did a very very simple hack to replace the item
offsets with item lengths -- turns out that that mostly requires removing
some code that changes lengths to offsets ;-). I then loaded up Larry's
example of a noncompressible JSON value, and compared pg_column_size()
which is just about the right thing here since it reports datum size after
compression. Remembering that the textual representation is 12353 bytes:

json: 382 bytes
jsonb, using offsets: 12593 bytes
jsonb, using lengths: 406 bytes

Oh, one more result: if I leave the representation alone, but change
the compression parameters to set first_success_by to INT_MAX, this
value takes up 1397 bytes. So that's better, but still more than a
3X penalty compared to using lengths. (Admittedly, this test value
probably is an outlier compared to normal practice, since it's a hundred
or so repetitions of the same two strings.)

For comparison, here's a patch that implements the scheme that Alexander
Korotkov suggested, where we store an offset every 8th element, and a
length in the others. It compresses Larry's example to 525 bytes.
Increasing the "stride" from 8 to 16 entries, it compresses to 461 bytes.

A nice thing about this patch is that it's on-disk compatible with the
current format, hence initdb is not required.

(The current comments claim that the first element in an array always
has the JENTRY_ISFIRST flag set; that is wrong, there is no such flag.
I removed the flag in commit d9daff0e, but apparently failed to update
the comment and the accompanying JBE_ISFIRST macro. Sorry about that,
will fix. This patch uses the bit that used to be JENTRY_ISFIRST to mark
entries that store a length instead of an end offset.).

- Heikki

Attachments:

jsonb-with-offsets-and-lengths-1.patch (text/x-diff)
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..47b2998 100644
--- a/src/backend/utils/adt/jsonb_util.c
+++ b/src/backend/utils/adt/jsonb_util.c
@@ -1378,8 +1378,10 @@ convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level
 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
 							JENTRY_POSMASK)));
 
-		if (i > 0)
+		if (i % JBE_STORE_LEN_STRIDE == 0)
 			meta = (meta & ~JENTRY_POSMASK) | totallen;
+		else
+			meta |= JENTRY_HAS_LEN;
 		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
 		metaoffset += sizeof(JEntry);
 	}
@@ -1430,11 +1432,14 @@ convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int leve
 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
 							JENTRY_POSMASK)));
 
-		if (i > 0)
+		if (i % JBE_STORE_LEN_STRIDE == 0)
 			meta = (meta & ~JENTRY_POSMASK) | totallen;
+		else
+			meta |= JENTRY_HAS_LEN;
 		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
 		metaoffset += sizeof(JEntry);
 
+		/* put value */
 		convertJsonbValue(buffer, &meta, &pair->value, level);
 		len = meta & JENTRY_POSMASK;
 		totallen += len;
@@ -1445,7 +1450,7 @@ convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int leve
 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
 							JENTRY_POSMASK)));
 
-		meta = (meta & ~JENTRY_POSMASK) | totallen;
+		meta |= JENTRY_HAS_LEN;
 		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
 		metaoffset += sizeof(JEntry);
 	}
@@ -1592,3 +1597,39 @@ uniqueifyJsonbObject(JsonbValue *object)
 		object->val.object.nPairs = res + 1 - object->val.object.pairs;
 	}
 }
+
+uint32
+jsonb_get_offset(const JEntry *ja, int index)
+{
+	uint32		off = 0;
+	int			i;
+
+	/*
+	 * Each absolute entry contains the *end* offset. Start offset of this
+	 * entry is equal to the end offset of the previous entry.
+	 */
+	for (i = index - 1; i >= 0; i--)
+	{
+		off += JBE_POSFLD(ja[i]);
+		if (!JBE_HAS_LEN(ja[i]))
+			break;
+	}
+	return off;
+}
+
+uint32
+jsonb_get_length(const JEntry *ja, int index)
+{
+	uint32		off;
+	uint32		len;
+
+	if (JBE_HAS_LEN(ja[index]))
+		len = JBE_POSFLD(ja[index]);
+	else
+	{
+		off = jsonb_get_offset(ja, index);
+		len = JBE_POSFLD(ja[index]) - off;
+	}
+
+	return len;
+}
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 5f2594b..dae6512 100644
--- a/src/include/utils/jsonb.h
+++ b/src/include/utils/jsonb.h
@@ -102,12 +102,12 @@ typedef struct JsonbValue JsonbValue;
  * to JB_FSCALAR | JB_FARRAY.
  *
  * To encode the length and offset of the variable-length portion of each
- * node in a compact way, the JEntry stores only the end offset within the
- * variable-length portion of the container node. For the first JEntry in the
- * container's JEntry array, that equals to the length of the node data. For
- * convenience, the JENTRY_ISFIRST flag is set. The begin offset and length
- * of the rest of the entries can be calculated using the end offset of the
- * previous JEntry in the array.
+ * node in a compact way, the JEntry stores either the length of the element,
+ * or its end offset within the variable-length portion of the container node.
+ * Entries that store a length are marked with the JENTRY_HAS_LEN flag, other
+ * entries store an end offset. The begin offset and length of each entry
+ * can be calculated by scanning backwards to the previous entry storing an
+ * end offset, and adding up the lengths of the elements in between.
  *
  * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
  * the variable-length portion of some node types is aligned to a 4-byte
@@ -121,7 +121,8 @@ typedef struct JsonbValue JsonbValue;
 /*
  * Jentry format.
  *
- * The least significant 28 bits store the end offset of the entry (see
+ * The least significant 28 bits store the end offset or the length of the
+ * entry, depending on whether JENTRY_HAS_LEN flag is set (see
  * JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits
  * are used to store the type of the entry. The most significant bit
  * is set on the first entry in an array of JEntrys.
@@ -139,8 +140,10 @@ typedef uint32 JEntry;
 #define JENTRY_ISNULL			0x40000000
 #define JENTRY_ISCONTAINER		0x50000000		/* array or object */
 
+#define JENTRY_HAS_LEN			0x80000000
+
 /* Note possible multiple evaluations */
-#define JBE_ISFIRST(je_)		(((je_) & JENTRY_ISFIRST) != 0)
+#define JBE_HAS_LEN(je_)		(((je_) & JENTRY_HAS_LEN) != 0)
 #define JBE_ISSTRING(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING)
 #define JBE_ISNUMERIC(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC)
 #define JBE_ISCONTAINER(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER)
@@ -150,13 +153,20 @@ typedef uint32 JEntry;
 #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
 
 /*
- * Macros for getting the offset and length of an element. Note multiple
- * evaluations and access to prior array element.
+ * Macros for getting the offset and length of an element.
  */
-#define JBE_ENDPOS(je_)			((je_) & JENTRY_POSMASK)
-#define JBE_OFF(ja, i)			((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
-#define JBE_LEN(ja, i)			((i) == 0 ? JBE_ENDPOS((ja)[i]) \
-								 : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
+#define JBE_POSFLD(je_)			((je_) & JENTRY_POSMASK)
+#define JBE_OFF(ja, i)			jsonb_get_offset(ja, i)
+#define JBE_LEN(ja, i)			jsonb_get_length(ja, i)
+
+/*
+ * Store an absolute end offset every JBE_STORE_LEN_STRIDE elements (for an
+ * array) or key/value pairs (for an object). Others are stored as lengths.
+ */
+#define JBE_STORE_LEN_STRIDE	8
+
+extern uint32 jsonb_get_offset(const JEntry *ja, int index);
+extern uint32 jsonb_get_length(const JEntry *ja, int index);
 
 /*
  * A jsonb array or object node, within a Jsonb Datum.
#54Tom Lane
tgl@sss.pgh.pa.us
In reply to: Heikki Linnakangas (#53)
Re: jsonb format is pessimal for toast compression

Heikki Linnakangas <hlinnakangas@vmware.com> writes:

For comparison, here's a patch that implements the scheme that Alexander
Korotkov suggested, where we store an offset every 8th element, and a
length in the others. It compresses Larry's example to 525 bytes.
Increasing the "stride" from 8 to 16 entries, it compresses to 461 bytes.

A nice thing about this patch is that it's on-disk compatible with the
current format, hence initdb is not required.

TBH, I think that's about the only nice thing about it :-(. It's
conceptually a mess. And while I agree that this way avoids creating
a big-O performance issue for large arrays/objects, I think the micro
performance is probably going to be not so good. The existing code is
based on the assumption that JBE_OFF() and JBE_LEN() are negligibly cheap;
but with a solution like this, it's guaranteed that one or the other is
going to be not-so-cheap.

I think if we're going to do anything to the representation at all,
we need to refactor the calling code; at least fixing the JsonbIterator
logic so that it tracks the current data offset rather than expecting to
able to compute it at no cost.

The difficulty in arguing about this is that unless we have an agreed-on
performance benchmark test, it's going to be a matter of unsupported
opinions whether one solution is faster than another. Have we got
anything that stresses key lookup and/or array indexing?
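
Something along these lines (all names here are invented) would at least stress
repeated key lookups against reasonably wide objects:

-- Hypothetical micro-benchmark: wide jsonb objects, then repeated lookups of
-- an individual key, which is where the JBE_OFF()/JBE_LEN() cost shows up.
CREATE TABLE jb_bench AS
SELECT (SELECT json_object_agg('key' || i, repeat('x', 40))
        FROM generate_series(1, 200) i)::jsonb AS doc
FROM generate_series(1, 100000);

EXPLAIN ANALYZE SELECT count(doc -> 'key199') FROM jb_bench;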

regards, tom lane


#55Bruce Momjian
bruce@momjian.us
In reply to: Tom Lane (#50)
Re: jsonb format is pessimal for toast compression

On Wed, Aug 13, 2014 at 09:01:43PM -0400, Tom Lane wrote:

I wrote:

That's a fair question. I did a very very simple hack to replace the item
offsets with item lengths -- turns out that that mostly requires removing
some code that changes lengths to offsets ;-). I then loaded up Larry's
example of a noncompressible JSON value, and compared pg_column_size()
which is just about the right thing here since it reports datum size after
compression. Remembering that the textual representation is 12353 bytes:

json: 382 bytes
jsonb, using offsets: 12593 bytes
jsonb, using lengths: 406 bytes

Oh, one more result: if I leave the representation alone, but change
the compression parameters to set first_success_by to INT_MAX, this
value takes up 1397 bytes. So that's better, but still more than a
3X penalty compared to using lengths. (Admittedly, this test value
probably is an outlier compared to normal practice, since it's a hundred
or so repetitions of the same two strings.)

Uh, can we get compression for actual documents, rather than duplicate
strings?

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +


#56Tom Lane
tgl@sss.pgh.pa.us
In reply to: Bruce Momjian (#55)
Re: jsonb format is pessimal for toast compression

Bruce Momjian <bruce@momjian.us> writes:

Uh, can we get compression for actual documents, rather than duplicate
strings?

[ shrug... ] What's your proposed set of "actual documents"?
I don't think we have any corpus of JSON docs that are all large
enough to need compression.

This gets back to the problem of what test case are we going to consider
while debating what solution to adopt.

regards, tom lane


#57Bruce Momjian
bruce@momjian.us
In reply to: Tom Lane (#56)
Re: jsonb format is pessimal for toast compression

On Thu, Aug 14, 2014 at 12:22:46PM -0400, Tom Lane wrote:

Bruce Momjian <bruce@momjian.us> writes:

Uh, can we get compression for actual documents, rather than duplicate
strings?

[ shrug... ] What's your proposed set of "actual documents"?
I don't think we have any corpus of JSON docs that are all large
enough to need compression.

This gets back to the problem of what test case are we going to consider
while debating what solution to adopt.

Uh, we just need one 12k JSON document from somewhere. Clearly this
is something we can easily get.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +


#58Merlin Moncure
mmoncure@gmail.com
In reply to: Bruce Momjian (#57)
Re: jsonb format is pessimal for toast compression

On Thu, Aug 14, 2014 at 11:52 AM, Bruce Momjian <bruce@momjian.us> wrote:

On Thu, Aug 14, 2014 at 12:22:46PM -0400, Tom Lane wrote:

Bruce Momjian <bruce@momjian.us> writes:

Uh, can we get compression for actual documents, rather than duplicate
strings?

[ shrug... ] What's your proposed set of "actual documents"?
I don't think we have any corpus of JSON docs that are all large
enough to need compression.

This gets back to the problem of what test case are we going to consider
while debating what solution to adopt.

Uh, we just need one 12k JSON document from somewhere. Clearly this
is something we can easily get.

it's trivial to make a large json[b] document:
select length(to_json(array(select row(a.*) from pg_attribute a))::TEXT);
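
To see how that fares after compression, one could store the same value as
jsonb and check pg_column_size(), which reports the datum size as stored:

-- A sketch (table name invented): keep the document as jsonb and compare the
-- stored, possibly compressed, datum size against its textual length.
CREATE TEMP TABLE big_doc AS
SELECT to_json(array(SELECT row(a.*) FROM pg_attribute a))::jsonb AS doc;

SELECT pg_column_size(doc) AS stored, length(doc::text) AS textual FROM big_doc;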



#59Tom Lane
tgl@sss.pgh.pa.us
In reply to: Bruce Momjian (#57)
Re: jsonb format is pessimal for toast compression

Bruce Momjian <bruce@momjian.us> writes:

On Thu, Aug 14, 2014 at 12:22:46PM -0400, Tom Lane wrote:

This gets back to the problem of what test case are we going to consider
while debating what solution to adopt.

Uh, we just need one 12k JSON document from somewhere. Clearly this
is something we can easily get.

I would put little faith in a single document as being representative.

To try to get some statistics about a real-world case, I looked at the
delicio.us dataset that someone posted a while back (1252973 JSON docs).
These have a minimum length (in text representation) of 604 bytes and
a maximum length of 5949 bytes, which means that they aren't going to
tell us all that much about large JSON docs, but this is better than
no data at all.

Since documents of only a couple hundred bytes aren't going to be subject
to compression, I made a table of four columns each containing the same
JSON data, so that each row would be long enough to force the toast logic
to try to do something. (Note that none of these documents are anywhere
near big enough to hit the refuses-to-compress problem.) Given that,
I get the following statistics for pg_column_size():

                               min      max      avg

JSON (text) representation     382     1155    526.5
HEAD's JSONB representation    493     1485    695.1
all-lengths representation     440     1257    615.3

So IOW, on this dataset the existing JSONB representation creates about
32% bloat compared to just storing the (compressed) user-visible text,
and switching to all-lengths would about halve that penalty.
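
For reference, the setup described above amounts to roughly the following
(table and column names here are invented):

-- Four copies of each document per row, so every row is wide enough for the
-- toast code to attempt compression; then stats over the stored datum sizes.
CREATE TABLE jb_corpus AS
SELECT doc AS j1, doc AS j2, doc AS j3, doc AS j4
FROM delicious_docs;

SELECT min(pg_column_size(j1)),
       max(pg_column_size(j1)),
       round(avg(pg_column_size(j1)), 1)
FROM jb_corpus;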

Maybe this is telling us it's not worth changing the representation,
and we should just go do something about the first_success_by threshold
and be done. I'm hesitant to draw such conclusions on the basis of a
single use-case though, especially one that doesn't really have that
much use for compression in the first place. Do we have other JSON
corpuses to look at?

regards, tom lane


#60Peter Geoghegan
pg@heroku.com
In reply to: Tom Lane (#59)
Re: jsonb format is pessimal for toast compression

On Thu, Aug 14, 2014 at 10:57 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Maybe this is telling us it's not worth changing the representation,
and we should just go do something about the first_success_by threshold
and be done. I'm hesitant to draw such conclusions on the basis of a
single use-case though, especially one that doesn't really have that
much use for compression in the first place. Do we have other JSON
corpuses to look at?

Yes. Pavel posted some representative JSON data a while back:
http://pgsql.cz/data/data.dump.gz (it's a plain dump)

--
Peter Geoghegan


#61Bruce Momjian
bruce@momjian.us
In reply to: Tom Lane (#59)
Re: jsonb format is pessimal for toast compression

On Thu, Aug 14, 2014 at 01:57:14PM -0400, Tom Lane wrote:

Maybe this is telling us it's not worth changing the representation,
and we should just go do something about the first_success_by threshold
and be done. I'm hesitant to draw such conclusions on the basis of a
single use-case though, especially one that doesn't really have that
much use for compression in the first place. Do we have other JSON
corpuses to look at?

Yes, that is what I was expecting --- once the whitespace and syntax
sugar is gone in JSONB, I was unclear how much compression would help.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +


#62Josh Berkus
josh@agliodbs.com
In reply to: Peter Geoghegan (#40)
Re: jsonb format is pessimal for toast compression

On 08/14/2014 11:13 AM, Bruce Momjian wrote:

On Thu, Aug 14, 2014 at 01:57:14PM -0400, Tom Lane wrote:

Maybe this is telling us it's not worth changing the representation,
and we should just go do something about the first_success_by threshold
and be done. I'm hesitant to draw such conclusions on the basis of a
single use-case though, especially one that doesn't really have that
much use for compression in the first place. Do we have other JSON
corpuses to look at?

Yes, that is what I was expecting --- once the whitespace and syntax
sugar is gone in JSONB, I was unclear how much compression would help.

I thought the destruction case was when we have enough top-level keys
that the offsets are more than 1K total, though, yes?

So we need to test that set ...
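
For instance, a generator along these lines (names invented) yields objects
whose leading JEntry array alone runs to several kB, well past the 1 kB that
pglz_compress examines before giving up:

-- Hypothetical test set: jsonb objects with a few hundred top-level keys.
CREATE TABLE wide_objects AS
SELECT (SELECT json_object_agg('k' || i, i)
        FROM generate_series(1, 400) i)::jsonb AS doc
FROM generate_series(1, 1000);

SELECT avg(pg_column_size(doc)) AS stored_size,
       avg(length(doc::text))   AS text_length
FROM wide_objects;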

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#63Oleg Bartunov
obartunov@gmail.com
In reply to: Tom Lane (#59)
Re: jsonb format is pessimal for toast compression

I did a quick test on the same bookmarks data to compare the performance of
9.4beta2 and 9.4beta2+patch.

The query was the same one we used in the PGCon presentation:
SELECT count(*) FROM jb WHERE jb @> '{"tags":[{"term":"NYC"}]}'::jsonb;

                  table size | time (ms)
9.4beta2:            1374 MB |      1160
9.4beta2+patch:      1373 MB |      1213

Yes, performance degrades, but not by much. There is also a small win in table
size, but the bookmarks are not big, so it's difficult to draw conclusions
about compression.
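
For reference, the table size figure can be obtained with something like:

SELECT pg_size_pretty(pg_total_relation_size('jb'));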

Oleg


#64Larry White
ljw1001@gmail.com
In reply to: Oleg Bartunov (#63)
1 attachment(s)
Re: jsonb format is pessimal for toast compression

I attached a json file of approximately 513K. It contains two repetitions
of a single json structure. The values are quasi-random. It might make a
decent test case of meaningfully sized data.

best


Attachments:

random.json.zip (application/zip)
#65Claudio Freire
klaussfreire@gmail.com
In reply to: Larry White (#64)
Re: jsonb format is pessimal for toast compression

On Thu, Aug 14, 2014 at 3:49 PM, Larry White <ljw1001@gmail.com> wrote:

I attached a json file of approximately 513K. It contains two repetitions of
a single json structure. The values are quasi-random. It might make a decent
test case of meaningfully sized data.

I have a 59M (in plain SQL; 10M compressed, 51M on-disk table size)
collection of real-world JSON data.

This data is mostly counters and ancillary info stored in json for the
flexibility more than anything else, since it's otherwise quite
structured: most values share a lot with each other (in key names),
but there's not much redundancy within single rows.

Value length stats (in text format):

min: 14
avg: 427
max: 23239
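
(For reference, a minimal sketch of how stats like these can be
computed; the table and column names "docs" and "doc" below are made up:)

SELECT min(length(doc::text)) AS min,
       round(avg(length(doc::text))) AS avg,
       max(length(doc::text)) AS max
FROM docs;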

If anyone's interested, contact me personally (I gotta anonymize the
info a bit first, since it's production info, and it's too big to
attach on the ML).


#66Claudio Freire
klaussfreire@gmail.com
In reply to: Claudio Freire (#65)
Re: jsonb format is pessimal for toast compression

On Thu, Aug 14, 2014 at 4:24 PM, Claudio Freire <klaussfreire@gmail.com> wrote:

On Thu, Aug 14, 2014 at 3:49 PM, Larry White <ljw1001@gmail.com> wrote:

I attached a json file of approximately 513K. It contains two repetitions of
a single json structure. The values are quasi-random. It might make a decent
test case of meaningfully sized data.

I have a 59M (in plain SQL; 10M compressed, 51M on-disk table size)
collection of real-world JSON data.

This data is mostly counters and ancillary info stored in json for the
flexibility more than anything else, since it's otherwise quite
structured: most values share a lot with each other (in key names),
but there's not much redundancy within single rows.

Value length stats (in text format):

min: 14
avg: 427
max: 23239

If anyone's interested, contact me personally (I gotta anonymize the
info a bit first, since it's production info, and it's too big to
attach on the ML).

Oh, that one has only a 13k TOAST table, so it's not very interesting.

But I've got another, very similar one: a 47M table with a 40M TOAST
table. Length distribution:

min: 19
avg: 474
max: 20370

Not sure why it's got a bigger TOAST table despite a similar length
distribution. It just shows how meaningless min/avg/max stats are :(


#67Tom Lane
tgl@sss.pgh.pa.us
In reply to: Peter Geoghegan (#60)
Re: jsonb format is pessimal for toast compression

Peter Geoghegan <pg@heroku.com> writes:

On Thu, Aug 14, 2014 at 10:57 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Maybe this is telling us it's not worth changing the representation,
and we should just go do something about the first_success_by threshold
and be done. I'm hesitant to draw such conclusions on the basis of a
single use-case though, especially one that doesn't really have that
much use for compression in the first place. Do we have other JSON
corpuses to look at?

Yes. Pavel posted some representative JSON data a while back:
http://pgsql.cz/data/data.dump.gz (it's a plain dump)

I did some quick stats on that. 206560 rows

                                        min     max     avg

external text representation            220  172685   880.3

JSON representation (compressed text)   224   78565   541.3

pg_column_size, JSONB HEAD repr.        225   82540   639.0

pg_column_size, all-lengths repr.       225   66794   531.1

So in this data, there definitely is some scope for compression:
just compressing the text gets about 38% savings. The all-lengths
hack is able to beat that slightly, but the all-offsets format is
well behind at 27%.
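
(A sketch of how numbers like these can be gathered; the table and
column names below are assumptions, and the HEAD vs. all-lengths rows
of course require loading the same dump into differently patched builds:)

-- external (uncompressed) text representation
SELECT min(length(v::text)), max(length(v::text)),
       round(avg(length(v::text)), 1)
FROM pavel_data;

-- on-disk size of the stored datum (run against a json or jsonb
-- column as appropriate)
SELECT min(pg_column_size(v)), max(pg_column_size(v)),
       round(avg(pg_column_size(v)), 1)
FROM pavel_data;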

Not sure what to conclude. It looks from both these examples like
we're talking about a 10 to 20 percent size penalty for JSON objects
that are big enough to need compression. Is that beyond our threshold
of pain? I'm not sure, but there is definitely room to argue that the
extra I/O costs will swamp any savings we get from faster access to
individual fields or array elements.

regards, tom lane


#68Gavin Flower
GavinFlower@archidevsys.co.nz
In reply to: Tom Lane (#67)
Re: jsonb format is pessimal for toast compression

On 15/08/14 09:47, Tom Lane wrote:

Peter Geoghegan <pg@heroku.com> writes:

On Thu, Aug 14, 2014 at 10:57 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Maybe this is telling us it's not worth changing the representation,
and we should just go do something about the first_success_by threshold
and be done. I'm hesitant to draw such conclusions on the basis of a
single use-case though, especially one that doesn't really have that
much use for compression in the first place. Do we have other JSON
corpuses to look at?

Yes. Pavel posted some representative JSON data a while back:
http://pgsql.cz/data/data.dump.gz (it's a plain dump)

I did some quick stats on that. 206560 rows

                                        min     max     avg

external text representation            220  172685   880.3

JSON representation (compressed text)   224   78565   541.3

pg_column_size, JSONB HEAD repr.        225   82540   639.0

pg_column_size, all-lengths repr.       225   66794   531.1

So in this data, there definitely is some scope for compression:
just compressing the text gets about 38% savings. The all-lengths
hack is able to beat that slightly, but the all-offsets format is
well behind at 27%.

Not sure what to conclude. It looks from both these examples like
we're talking about a 10 to 20 percent size penalty for JSON objects
that are big enough to need compression. Is that beyond our threshold
of pain? I'm not sure, but there is definitely room to argue that the
extra I/O costs will swamp any savings we get from faster access to
individual fields or array elements.

regards, tom lane

Curious, would adding the standard deviation help in characterising the
distribution of data values?

Also, you might like to consider additionally using the median value, and
possibly the 25% & 75% (or some such) values. I assume the 'avg' in
your table refers to the arithmetic mean. Sometimes the median is a
better measure of 'normal' than the arithmetic mean, and it can be useful
to note the difference between the two!

Graphing the values may also be useful. You might have two or more
distinct populations, which might show up as several distinct peaks - in
which case, that might suggest changes to the algorithm.

Many moons ago, I did a 400-level statistics course at University, of
which I've forgotten most. However, I'm aware of other potentially
useful measures, but I suspect that they would be too esoteric for the
current problem!

Cheers,
Gavin


#69Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

So, here's a destruction test case:

200,000 JSON values (plus 2 key columns)
Average width 4K (+/- 1K)
183 keys per JSON value
keys 10 to 30 characters
105 float values
70 integer values
8 text and date values
no nesting

The "jsonic" table is JSON
The "jsonbish" table is JSONB

(I can't share this data set, but it makes a good test case)
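
(Since the data set can't be shared, here is a rough, purely hypothetical
way to synthesize something of a similar shape on 9.4; the key names and
value distribution below are invented and only approximate the description
above, which mixes floats, integers, text and dates:)

CREATE TABLE jsonic AS
SELECT i AS id,
       d.doc                         -- json document with 183 short keys
FROM generate_series(1, 200000) AS i,
     LATERAL (
       SELECT json_object_agg('some_key_name_' || k,
                              round((random() * 1000)::numeric, 2)) AS doc
       FROM generate_series(1, 183) AS k
       WHERE i > 0                   -- reference i so the doc is rebuilt per row
     ) AS d;

CREATE TABLE jsonbish AS
SELECT id, doc::text::jsonb AS doc FROM jsonic;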

And, we see the effect:

postgres=# select pg_size_pretty(pg_total_relation_size('jsonic'));
pg_size_pretty
----------------
394 MB
(1 row)

postgres=# select pg_size_pretty(pg_total_relation_size('jsonbish'));
pg_size_pretty
----------------
1147 MB
(1 row)

So, pretty bad; JSONB is 200% larger than JSON.

I don't think having 183 top-level keys is all that unreasonable of a
use case. Some folks will be migrating from Mongo, Redis or Couch to
PostgreSQL, and might have a whole denormalized schema in JSON.

BTW, I find this peculiar:

postgres=# select pg_size_pretty(pg_relation_size('jsonic'));

pg_size_pretty
----------------
383 MB
(1 row)

postgres=# select pg_size_pretty(pg_relation_size('jsonbish'));

pg_size_pretty
----------------
11 MB
(1 row)

Next up: Tom's patch and indexing!

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#70Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#69)
Re: jsonb format is pessimal for toast compression

Josh Berkus <josh@agliodbs.com> writes:

So, here's a destruction test case:
200,000 JSON values (plus 2 key columns)
Average width 4K (+/- 1K)
183 keys per JSON value

Is that 183 keys exactly each time, or is 183 the average?
If it's the average, what's the min/max number of keys?

I ask because 183 would be below the threshold where I'd expect the
no-compression behavior to kick in.

And, we see the effect:

postgres=# select pg_size_pretty(pg_total_relation_size('jsonic'));
pg_size_pretty
----------------
394 MB
(1 row)

postgres=# select pg_size_pretty(pg_total_relation_size('jsonbish'));
pg_size_pretty
----------------
1147 MB
(1 row)

So, pretty bad; JSONB is 200% larger than JSON.

Ouch. But it's not clear how much of this is from the first_success_by
threshold and how much is from having poor compression even though we
escaped that trap.

BTW, I find this peculiar:

postgres=# select pg_size_pretty(pg_relation_size('jsonic'));

pg_size_pretty
----------------
383 MB
(1 row)

postgres=# select pg_size_pretty(pg_relation_size('jsonbish'));

pg_size_pretty
----------------
11 MB
(1 row)

pg_relation_size is just the main data fork; it excludes TOAST.
So what we can conclude is that most of the data got toasted out-of-line
in jsonb, while very little did in json. That probably just comes from
the average datum size being close to the push-out-of-line threshold,
so that worse compression puts it over the edge.
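
(That breakdown can be seen directly; a sketch using the table name
from this thread -- pg_table_size() counts the main fork plus TOAST and
the free-space/visibility maps, and pg_total_relation_size() adds indexes:)

SELECT pg_size_pretty(pg_relation_size('jsonbish'))        AS main_fork,
       pg_size_pretty(pg_table_size('jsonbish')
                      - pg_relation_size('jsonbish'))       AS toast_fsm_vm,
       pg_size_pretty(pg_total_relation_size('jsonbish'))   AS total;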

It would be useful to see min/max/avg of pg_column_size() in both
these cases.

regards, tom lane


#71Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

On 08/14/2014 04:02 PM, Tom Lane wrote:

Josh Berkus <josh@agliodbs.com> writes:

So, here's a destruction test case:
200,000 JSON values (plus 2 key columns)
Average width 4K (+/- 1K)
183 keys per JSON value

Is that 183 keys exactly each time, or is 183 the average?

Each time exactly.

It would be useful to see min/max/avg of pg_column_size() in both
these cases.

Well, this is 9.4, so I can do better than that. How about quartiles?

thetype | colsize_distribution
---------+----------------------------
json | {1777,1803,1890,1940,4424}
jsonb | {5902,5926,5978,6002,6208}
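
(The query isn't shown; a sketch of how such a distribution can be
produced with 9.4's ordered-set aggregates, assuming the jsonb column is
named row_to_json as in the EXPLAIN output later in the thread:)

SELECT 'jsonb' AS thetype,
       percentile_disc(array[0, 0.25, 0.5, 0.75, 1]::float8[])
         WITHIN GROUP (ORDER BY pg_column_size(row_to_json))
         AS colsize_distribution
FROM jsonbish;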

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#72Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#71)
Re: jsonb format is pessimal for toast compression

Josh Berkus <josh@agliodbs.com> writes:

On 08/14/2014 04:02 PM, Tom Lane wrote:

It would be useful to see min/max/avg of pg_column_size() in both
these cases.

Well, this is 9.4, so I can do better than that. How about quartiles?

thetype | colsize_distribution
---------+----------------------------
json | {1777,1803,1890,1940,4424}
jsonb | {5902,5926,5978,6002,6208}

OK.  That matches with the observation about being mostly toasted or not
--- the threshold for pushing out-of-line would be something a little
under 2KB depending on the other columns you had in the table.

What's more, it looks like the jsonb data is pretty much never getting
compressed --- the min is too high for that. So I'm guessing that this
example is mostly about the first_success_by threshold preventing any
compression from happening. Please, before looking at my other patch,
try this: in src/backend/utils/adt/pg_lzcompress.c, change line 221
thusly:

-	1024,						/* Give up if no compression in the first 1KB */
+	INT_MAX,					/* Give up if no compression in the first 1KB */

then reload the jsonb data and give us the same stats on that.

regards, tom lane


#73Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

What's more, it looks like the jsonb data is pretty much never getting
compressed --- the min is too high for that. So I'm guessing that this
example is mostly about the first_success_by threshold preventing any
compression from happening. Please, before looking at my other patch,
try this: in src/backend/utils/adt/pg_lzcompress.c, change line 221
thusly:

-	1024,						/* Give up if no compression in the first 1KB */
+	INT_MAX,					/* Give up if no compression in the first 1KB */

then reload the jsonb data and give us the same stats on that.

That helped things, but not as much as you'd think:

postgres=# select pg_size_pretty(pg_total_relation_size('jsonic'));

pg_size_pretty
----------------
394 MB
(1 row)

postgres=# select pg_size_pretty(pg_total_relation_size('jsonbish'));
pg_size_pretty
----------------
801 MB
(1 row)

What I find really strange is that the column size distribution is
exactly the same:

thetype | colsize_distribution
---------+----------------------------
json | {1777,1803,1890,1940,4424}
jsonb | {5902,5926,5978,6002,6208}

Shouldn't the lower end stuff be smaller?

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#74Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

On 08/14/2014 04:47 PM, Josh Berkus wrote:

thetype | colsize_distribution
---------+----------------------------
json | {1777,1803,1890,1940,4424}
jsonb | {5902,5926,5978,6002,6208}

Just realized my query was counting the whole row size instead of just
the column size. Here's just the JSON column:

Before changing to INT_MAX:

thetype | colsize_distribution
---------+----------------------------
json | {1741,1767,1854,1904,2292}
jsonb | {3551,5866,5910,5958,6168}

After:

thetype | colsize_distribution
---------+----------------------------
json | {1741,1767,1854,1904,2292}
jsonb | {3515,3543,3636,3690,4038}

So that did improve things, just not as much as we'd like.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#75Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

Before changing to INT_MAX:

thetype | colsize_distribution
---------+----------------------------
json | {1741,1767,1854,1904,2292}
jsonb | {3551,5866,5910,5958,6168}

After:

thetype | colsize_distribution
---------+----------------------------
json | {1741,1767,1854,1904,2292}
jsonb | {3515,3543,3636,3690,4038}

So that did improve things, just not as much as we'd like.

And with Tom's test patch:

postgres=# select pg_size_pretty(pg_total_relation_size('jsonic'));

pg_size_pretty
----------------
394 MB
(1 row)

postgres=# select pg_size_pretty(pg_total_relation_size('jsonbish'));
pg_size_pretty
----------------
541 MB
(1 row)

thetype | colsize_distribution
---------+----------------------------
json | {1741,1767,1854,1904,2292}
jsonb | {2037,2114,2288,2348,2746}

Since that improved things a *lot*, just +40% instead of +200%, I
thought I'd test some select queries. I decided to test a GIN lookup
and value extraction, since indexed lookup is really what I care about.

9.4b2 no patches:

postgres=# explain analyze select row_to_json -> 'kt1_total_sum' from
jsonbish where row_to_json @> '{ "rpt_per_dt" : "2003-06-30" }';
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on jsonbish (cost=29.55..582.92 rows=200 width=18)
(actual time=20.814..2845.454 rows=100423 loops=1)
Recheck Cond: (row_to_json @> '{"rpt_per_dt": "2003-06-30"}'::jsonb)
Heap Blocks: exact=1471
-> Bitmap Index Scan on jsonbish_row_to_json_idx (cost=0.00..29.50
rows=200 width=0) (actual time=20.551..20.551 rows=100423 loops=1)
Index Cond: (row_to_json @> '{"rpt_per_dt": "2003-06-30"}'::jsonb)
Planning time: 0.102 ms
Execution time: 2856.179 ms

9.4b2 TL patch:

postgres=# explain analyze select row_to_json -> 'kt1_total_sum' from
jsonbish where row_to_json @> '{ "rpt_per_dt" : "2003-06-30" }';
QUERY
PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on jsonbish (cost=29.55..582.92 rows=200 width=18)
(actual time=24.071..5201.687 rows=100423 loops=1)
Recheck Cond: (row_to_json @> '{"rpt_per_dt": "2003-06-30"}'::jsonb)
Heap Blocks: exact=1471
-> Bitmap Index Scan on jsonbish_row_to_json_idx (cost=0.00..29.50
rows=200 width=0) (actual time=23.779..23.779 rows=100423 loops=1)
Index Cond: (row_to_json @> '{"rpt_per_dt": "2003-06-30"}'::jsonb)
Planning time: 0.098 ms
Execution time: 5214.212 ms

... so, an 80% increase in lookup and extraction time from swapping
offsets for lengths. That's actually all extraction time; I tried
removing the extraction from the query, and without it the difference
between the two queries is statistically insignificant.
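
(The extraction-free variant referred to here would look roughly like
this: the same containment qual, but nothing pulled out of the documents
in the target list:)

EXPLAIN ANALYZE
SELECT 1
FROM jsonbish
WHERE row_to_json @> '{ "rpt_per_dt" : "2003-06-30" }';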

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#76Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#75)
Re: jsonb format is pessimal for toast compression

Josh Berkus <josh@agliodbs.com> writes:

And with Tom's test patch:
...
Since that improved things a *lot*, just +40% instead of +200%, I
thought I'd test some select queries.

That test patch is not meant to be fast; its ambition was only to see
what the effects on storage size would be. So I find this unsurprising:

... so, an 80% increase in lookup and extraction time for swapping
offsets for lengths.

We can certainly reduce that. The question was whether it would be
worth the effort to try. At this point, with three different test
data sets having shown clear space savings, I think it is worth
the effort. I'll poke into it tomorrow or over the weekend, unless
somebody beats me to it.

regards, tom lane


#77Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

On 08/14/2014 07:24 PM, Tom Lane wrote:

We can certainly reduce that. The question was whether it would be
worth the effort to try. At this point, with three different test
data sets having shown clear space savings, I think it is worth
the effort. I'll poke into it tomorrow or over the weekend, unless
somebody beats me to it.

Note that I specifically created that data set to be a worst case: many
top-level keys, no nesting, and small values. However, I don't think
it's an unrealistic worst case.

Interestingly, even in the unpatched, 1GB table case, the *index* on the
JSONB is only 60MB, which shows just how terrific the improvement in
GIN index size/performance is.
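
(For reference, that figure can be read off directly; the index name is
taken from the EXPLAIN output earlier in the thread:)

SELECT pg_size_pretty(pg_relation_size('jsonbish_row_to_json_idx'));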

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#78Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#77)
1 attachment(s)
Re: jsonb format is pessimal for toast compression

Josh Berkus <josh@agliodbs.com> writes:

On 08/14/2014 07:24 PM, Tom Lane wrote:

We can certainly reduce that. The question was whether it would be
worth the effort to try. At this point, with three different test
data sets having shown clear space savings, I think it is worth
the effort. I'll poke into it tomorrow or over the weekend, unless
somebody beats me to it.

Note that I specifically created that data set to be a worst case: many
top-level keys, no nesting, and small values. However, I don't think
it's an unrealistic worst case.

Interestingly, even on the unpatched, 1GB table case, the *index* on the
JSONB is only 60MB. Which shows just how terrific the improvement in
GIN index size/performance is.

I've been poking at this, and I think the main explanation for your result
is that with more JSONB documents being subject to compression, we're
spending more time in pglz_decompress. There's no free lunch in that
department: if you want compressed storage it's gonna cost ya to
decompress. The only way I can get decompression and TOAST access to not
dominate the profile on cases of this size is to ALTER COLUMN SET STORAGE
PLAIN. However, when I do that, I do see my test patch running about 25%
slower overall than HEAD on an "explain analyze select jfield -> 'key'
from table" type of query with 200-key documents with narrow fields (see
attached perl script that generates the test data).
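
(The storage-mode change described above is simply the following, shown
here with Josh's table/column names purely for illustration -- Tom's own
test table comes from the attached randomjson.pl. Note that SET STORAGE
only affects newly stored values, so the data has to be reloaded or the
table rewritten for it to apply:)

ALTER TABLE jsonbish ALTER COLUMN row_to_json SET STORAGE PLAIN;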

It seems difficult to improve much on that for this test case. I put some
logic into findJsonbValueFromContainer to calculate the offset sums just
once not once per binary-search iteration, but that only improved matters
5% at best. I still think it'd be worth modifying the JsonbIterator code
to avoid repetitive offset calculations, but that's not too relevant to
this test case.

Having said all that, I think this test is something of a contrived worst
case. More realistic cases are likely to have many fewer keys (so that
speed of the binary search loop is less of an issue) or else to have total
document sizes large enough that inline PLAIN storage isn't an option,
meaning that detoast+decompression costs will dominate.

regards, tom lane

Attachments:

randomjson.pl (text/x-perl; charset=us-ascii)
#79Arthur Silva
arthurprs@gmail.com
In reply to: Tom Lane (#78)
Re: jsonb format is pessimal for toast compression

I'm still getting up to speed on postgres development but I'd like to leave
an opinion.

We should add some sort of versioning to the jsonb format. This can be
explored in the future in many ways.

As for the current problem, we should explore the directory at the end
option. It should improve compression and keep good access performance.

A 4-byte header is sufficient to store the directory offset and some
versioning bits.
On 15/08/2014 17:39, "Tom Lane" <tgl@sss.pgh.pa.us> wrote:


Josh Berkus <josh@agliodbs.com> writes:

On 08/14/2014 07:24 PM, Tom Lane wrote:

We can certainly reduce that. The question was whether it would be
worth the effort to try. At this point, with three different test
data sets having shown clear space savings, I think it is worth
the effort. I'll poke into it tomorrow or over the weekend, unless
somebody beats me to it.

Note that I specifically created that data set to be a worst case: many
top-level keys, no nesting, and small values. However, I don't think
it's an unrealistic worst case.

Interestingly, even on the unpatched, 1GB table case, the *index* on the
JSONB is only 60MB. Which shows just how terrific the improvement in
GIN index size/performance is.

I've been poking at this, and I think the main explanation for your result
is that with more JSONB documents being subject to compression, we're
spending more time in pglz_decompress. There's no free lunch in that
department: if you want compressed storage it's gonna cost ya to
decompress. The only way I can get decompression and TOAST access to not
dominate the profile on cases of this size is to ALTER COLUMN SET STORAGE
PLAIN. However, when I do that, I do see my test patch running about 25%
slower overall than HEAD on an "explain analyze select jfield -> 'key'
from table" type of query with 200-key documents with narrow fields (see
attached perl script that generates the test data).

It seems difficult to improve much on that for this test case. I put some
logic into findJsonbValueFromContainer to calculate the offset sums just
once not once per binary-search iteration, but that only improved matters
5% at best. I still think it'd be worth modifying the JsonbIterator code
to avoid repetitive offset calculations, but that's not too relevant to
this test case.

Having said all that, I think this test is something of a contrived worst
case. More realistic cases are likely to have many fewer keys (so that
speed of the binary search loop is less of an issue) or else to have total
document sizes large enough that inline PLAIN storage isn't an option,
meaning that detoast+decompression costs will dominate.

regards, tom lane


#80Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: [Bad Attachment] Re: jsonb format is pessimal for toast compression

On 08/15/2014 01:38 PM, Tom Lane wrote:

I've been poking at this, and I think the main explanation for your result
is that with more JSONB documents being subject to compression, we're
spending more time in pglz_decompress. There's no free lunch in that
department: if you want compressed storage it's gonna cost ya to
decompress. The only way I can get decompression and TOAST access to not
dominate the profile on cases of this size is to ALTER COLUMN SET STORAGE
PLAIN. However, when I do that, I do see my test patch running about 25%
slower overall than HEAD on an "explain analyze select jfield -> 'key'
from table" type of query with 200-key documents with narrow fields (see
attached perl script that generates the test data).

Ok, that probably falls under the heading of "acceptable tradeoffs" then.

Having said all that, I think this test is something of a contrived worst
case. More realistic cases are likely to have many fewer keys (so that
speed of the binary search loop is less of an issue) or else to have total
document sizes large enough that inline PLAIN storage isn't an option,
meaning that detoast+decompression costs will dominate.

This was intended to be a worst case. However, I don't think that it's
the last time we'll see the case of having 100 to 200 keys each with
short values. That case was actually from some XML data which I'd
already converted into a regular table (hence every row having 183
keys), but if JSONB had been available when I started the project, I
might have chosen to store it as JSONB instead. It occurs to me that
the matching data from a personals website would very much fit the
pattern of having between 50 and 200 keys, each of which has a short value.

So we don't need to *optimize* for that case, but it also shouldn't be
disastrously slow or 300% of the size of comparable TEXT. Mind you, I
don't find +80% to be disastrously slow (especially not with a space
savings of 60%), so maybe that's good enough.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#81Tom Lane
tgl@sss.pgh.pa.us
In reply to: Arthur Silva (#79)
1 attachment(s)
Re: jsonb format is pessimal for toast compression

Arthur Silva <arthurprs@gmail.com> writes:

We should add some sort of versioning to the jsonb format. This can be
explored in the future in many ways.

If we end up making an incompatible change to the jsonb format, I would
support taking the opportunity to stick a version ID in there. But
I don't want to force a dump/reload cycle *only* to do that.

As for the current problem, we should explore the directory at the end
option. It should improve compression and keep good access performance.

Meh. Pushing the directory to the end is just a band-aid, and since it
would still force a dump/reload, it's not a very enticing band-aid.
The only thing it'd really fix is the first_success_by issue, which
we could fix *without* a dump/reload by using different compression
parameters for jsonb. Moving the directory to the end, by itself,
does nothing to fix the problem that the directory contents aren't
compressible --- and we now have pretty clear evidence that that is a
significant issue. (See for instance Josh's results that increasing
first_success_by did very little for the size of his dataset.)

I think the realistic alternatives at this point are either to
switch to all-lengths as in my test patch, or to use the hybrid approach
of Heikki's test patch. IMO the major attraction of Heikki's patch
is that it'd be upward compatible with existing beta installations,
ie no initdb required (but thus, no opportunity to squeeze in a version
identifier either). It's not showing up terribly well in the performance
tests I've been doing --- it's about halfway between HEAD and my patch on
that extract-a-key-from-a-PLAIN-stored-column test. But, just as with my
patch, there are things that could be done to micro-optimize it by
touching a bit more code.

I did some quick stats comparing compressed sizes for the delicio.us
data, printing quartiles as per Josh's lead:

all-lengths {440,569,609,655,1257}
Heikki's patch {456,582,624,671,1274}
HEAD {493,636,684,744,1485}

(As before, this is pg_column_size of the jsonb within a table whose rows
are wide enough to force tuptoaster.c to try to compress the jsonb;
otherwise many of these values wouldn't get compressed.) These documents
don't have enough keys to trigger the first_success_by issue, so that
HEAD doesn't look too awful, but still there's about an 11% gain from
switching from offsets to lengths. Heikki's method captures much of
that but not all.

Personally I'd prefer to go to the all-lengths approach, but a large
part of that comes from a subjective assessment that the hybrid approach
is too messy. Others might well disagree.

In case anyone else wants to do measurements on some more data sets,
attached is a copy of Heikki's patch updated to apply against git tip.

regards, tom lane

Attachments:

jsonb-with-offsets-and-lengths-2.patch (text/x-diff; charset=us-ascii)
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..47b2998 100644
*** a/src/backend/utils/adt/jsonb_util.c
--- b/src/backend/utils/adt/jsonb_util.c
*************** convertJsonbArray(StringInfo buffer, JEn
*** 1378,1385 ****
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
  							JENTRY_POSMASK)));
  
! 		if (i > 0)
  			meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
--- 1378,1387 ----
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
  							JENTRY_POSMASK)));
  
! 		if (i % JBE_STORE_LEN_STRIDE == 0)
  			meta = (meta & ~JENTRY_POSMASK) | totallen;
+ 		else
+ 			meta |= JENTRY_HAS_LEN;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
*************** convertJsonbObject(StringInfo buffer, JE
*** 1430,1440 ****
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
  							JENTRY_POSMASK)));
  
! 		if (i > 0)
  			meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  
  		convertJsonbValue(buffer, &meta, &pair->value, level);
  		len = meta & JENTRY_POSMASK;
  		totallen += len;
--- 1432,1445 ----
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
  							JENTRY_POSMASK)));
  
! 		if (i % JBE_STORE_LEN_STRIDE == 0)
  			meta = (meta & ~JENTRY_POSMASK) | totallen;
+ 		else
+ 			meta |= JENTRY_HAS_LEN;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  
+ 		/* put value */
  		convertJsonbValue(buffer, &meta, &pair->value, level);
  		len = meta & JENTRY_POSMASK;
  		totallen += len;
*************** convertJsonbObject(StringInfo buffer, JE
*** 1445,1451 ****
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
  							JENTRY_POSMASK)));
  
! 		meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
--- 1450,1456 ----
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
  							JENTRY_POSMASK)));
  
! 		meta |= JENTRY_HAS_LEN;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
*************** uniqueifyJsonbObject(JsonbValue *object)
*** 1592,1594 ****
--- 1597,1635 ----
  		object->val.object.nPairs = res + 1 - object->val.object.pairs;
  	}
  }
+ 
+ uint32
+ jsonb_get_offset(const JEntry *ja, int index)
+ {
+ 	uint32		off = 0;
+ 	int			i;
+ 
+ 	/*
+ 	 * Each absolute entry contains the *end* offset. Start offset of this
+ 	 * entry is equal to the end offset of the previous entry.
+ 	 */
+ 	for (i = index - 1; i >= 0; i--)
+ 	{
+ 		off += JBE_POSFLD(ja[i]);
+ 		if (!JBE_HAS_LEN(ja[i]))
+ 			break;
+ 	}
+ 	return off;
+ }
+ 
+ uint32
+ jsonb_get_length(const JEntry *ja, int index)
+ {
+ 	uint32		off;
+ 	uint32		len;
+ 
+ 	if (JBE_HAS_LEN(ja[index]))
+ 		len = JBE_POSFLD(ja[index]);
+ 	else
+ 	{
+ 		off = jsonb_get_offset(ja, index);
+ 		len = JBE_POSFLD(ja[index]) - off;
+ 	}
+ 
+ 	return len;
+ }
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 91e3e14..10a07bb 100644
*** a/src/include/utils/jsonb.h
--- b/src/include/utils/jsonb.h
*************** typedef struct JsonbValue JsonbValue;
*** 102,112 ****
   * to JB_FSCALAR | JB_FARRAY.
   *
   * To encode the length and offset of the variable-length portion of each
!  * node in a compact way, the JEntry stores only the end offset within the
!  * variable-length portion of the container node. For the first JEntry in the
!  * container's JEntry array, that equals to the length of the node data.  The
!  * begin offset and length of the rest of the entries can be calculated using
!  * the end offset of the previous JEntry in the array.
   *
   * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
   * the variable-length portion of some node types is aligned to a 4-byte
--- 102,113 ----
   * to JB_FSCALAR | JB_FARRAY.
   *
   * To encode the length and offset of the variable-length portion of each
!  * node in a compact way, the JEntry stores either the length of the element,
!  * or its end offset within the variable-length portion of the container node.
!  * Entries that store a length are marked with the JENTRY_HAS_LEN flag, other
!  * entries store an end offset. The begin offset and length of each entry
!  * can be calculated by scanning backwards to the previous entry storing an
!  * end offset, and adding up the lengths of the elements in between.
   *
   * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
   * the variable-length portion of some node types is aligned to a 4-byte
*************** typedef struct JsonbValue JsonbValue;
*** 120,134 ****
  /*
   * Jentry format.
   *
!  * The least significant 28 bits store the end offset of the entry (see
!  * JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits
!  * are used to store the type of the entry. The most significant bit
!  * is unused, and should be set to zero.
   */
  typedef uint32 JEntry;
  
  #define JENTRY_POSMASK			0x0FFFFFFF
  #define JENTRY_TYPEMASK			0x70000000
  
  /* values stored in the type bits */
  #define JENTRY_ISSTRING			0x00000000
--- 121,136 ----
  /*
   * Jentry format.
   *
!  * The least significant 28 bits store the end offset or the length of the
!  * entry, depending on whether the JENTRY_HAS_LEN flag is set (see
!  * JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The other three bits
!  * are used to store the type of the entry.
   */
  typedef uint32 JEntry;
  
  #define JENTRY_POSMASK			0x0FFFFFFF
  #define JENTRY_TYPEMASK			0x70000000
+ #define JENTRY_HAS_LEN			0x80000000
  
  /* values stored in the type bits */
  #define JENTRY_ISSTRING			0x00000000
*************** typedef uint32 JEntry;
*** 146,160 ****
  #define JBE_ISBOOL_TRUE(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_TRUE)
  #define JBE_ISBOOL_FALSE(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE)
  #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
  
  /*
!  * Macros for getting the offset and length of an element. Note multiple
!  * evaluations and access to prior array element.
   */
! #define JBE_ENDPOS(je_)			((je_) & JENTRY_POSMASK)
! #define JBE_OFF(ja, i)			((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
! #define JBE_LEN(ja, i)			((i) == 0 ? JBE_ENDPOS((ja)[i]) \
! 								 : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
  
  /*
   * A jsonb array or object node, within a Jsonb Datum.
--- 148,170 ----
  #define JBE_ISBOOL_TRUE(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_TRUE)
  #define JBE_ISBOOL_FALSE(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE)
  #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
+ #define JBE_HAS_LEN(je_)		(((je_) & JENTRY_HAS_LEN) != 0)
  
  /*
!  * Macros for getting the offset and length of an element.
   */
! #define JBE_POSFLD(je_)			((je_) & JENTRY_POSMASK)
! #define JBE_OFF(ja, i)			jsonb_get_offset(ja, i)
! #define JBE_LEN(ja, i)			jsonb_get_length(ja, i)
! 
! /*
!  * Store an absolute end offset every JBE_STORE_LEN_STRIDE elements (for an
!  * array) or key/value pairs (for an object). Others are stored as lengths.
!  */
! #define JBE_STORE_LEN_STRIDE	8
! 
! extern uint32 jsonb_get_offset(const JEntry *ja, int index);
! extern uint32 jsonb_get_length(const JEntry *ja, int index);
  
  /*
   * A jsonb array or object node, within a Jsonb Datum.
#82Arthur Silva
arthurprs@gmail.com
In reply to: Tom Lane (#81)
Re: jsonb format is pessimal for toast compression

On Fri, Aug 15, 2014 at 8:19 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Arthur Silva <arthurprs@gmail.com> writes:

We should add some sort of versioning to the jsonb format. This can be
explored in the future in many ways.

If we end up making an incompatible change to the jsonb format, I would
support taking the opportunity to stick a version ID in there. But
I don't want to force a dump/reload cycle *only* to do that.

As for the current problem, we should explore the directory at the end
option. It should improve compression and keep good access performance.

Meh. Pushing the directory to the end is just a band-aid, and since it
would still force a dump/reload, it's not a very enticing band-aid.
The only thing it'd really fix is the first_success_by issue, which
we could fix *without* a dump/reload by using different compression
parameters for jsonb. Moving the directory to the end, by itself,
does nothing to fix the problem that the directory contents aren't
compressible --- and we now have pretty clear evidence that that is a
significant issue. (See for instance Josh's results that increasing
first_success_by did very little for the size of his dataset.)

I think the realistic alternatives at this point are either to
switch to all-lengths as in my test patch, or to use the hybrid approach
of Heikki's test patch. IMO the major attraction of Heikki's patch
is that it'd be upward compatible with existing beta installations,
ie no initdb required (but thus, no opportunity to squeeze in a version
identifier either). It's not showing up terribly well in the performance
tests I've been doing --- it's about halfway between HEAD and my patch on
that extract-a-key-from-a-PLAIN-stored-column test. But, just as with my
patch, there are things that could be done to micro-optimize it by
touching a bit more code.

I did some quick stats comparing compressed sizes for the delicio.us
data, printing quartiles as per Josh's lead:

all-lengths {440,569,609,655,1257}
Heikki's patch {456,582,624,671,1274}
HEAD {493,636,684,744,1485}

(As before, this is pg_column_size of the jsonb within a table whose rows
are wide enough to force tuptoaster.c to try to compress the jsonb;
otherwise many of these values wouldn't get compressed.) These documents
don't have enough keys to trigger the first_success_by issue, so that
HEAD doesn't look too awful, but still there's about an 11% gain from
switching from offsets to lengths. Heikki's method captures much of
that but not all.

Personally I'd prefer to go to the all-lengths approach, but a large
part of that comes from a subjective assessment that the hybrid approach
is too messy. Others might well disagree.

In case anyone else wants to do measurements on some more data sets,
attached is a copy of Heikki's patch updated to apply against git tip.

regards, tom lane

I agree that versioning might sound silly at this point, but let's keep it
in mind.
Row-level compression is quite slow by itself, so it seems odd to me to pay
a 25% performance penalty everywhere for the sake of a better
compression ratio in the directory area.
Consider, for example, an optimization that stuffs integers (up to 28 bits)
inside the JEntry itself. That alone would save 8 bytes for each integer.

#83Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

On 08/15/2014 04:19 PM, Tom Lane wrote:

Personally I'd prefer to go to the all-lengths approach, but a large
part of that comes from a subjective assessment that the hybrid approach
is too messy. Others might well disagree.

In case anyone else wants to do measurements on some more data sets,
attached is a copy of Heikki's patch updated to apply against git tip.

Note that this is not 100% comparable because I'm running it against a
git checkout, while the earlier tests were against beta2. However, Heikki's
patch looks like a bust on this dataset -- see below.

postgres=# select pg_size_pretty(pg_total_relation_size('jsonic'));
pg_size_pretty
----------------
394 MB
(1 row)

postgres=# select pg_size_pretty(pg_total_relation_size('jsonbish'));

pg_size_pretty
----------------
542 MB

Extraction Test:

postgres=# explain analyze select row_to_json -> 'kt1_total_sum' from
jsonbish where row_to_json @> '{ "rpt_per_dt" : "2003-06-30" }';
QUERY
PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on jsonbish (cost=29.55..582.92 rows=200 width=18)
(actual time=22.742..5281.823 rows=100423 loops=1)
Recheck Cond: (row_to_json @> '{"rpt_per_dt": "2003-06-30"}'::jsonb)
Heap Blocks: exact=1471
-> Bitmap Index Scan on jsonbish_row_to_json_idx (cost=0.00..29.50
rows=200 width=0) (actual time=22.445..22.445 rows=100423 loops=1)
Index Cond: (row_to_json @> '{"rpt_per_dt": "2003-06-30"}'::jsonb)
Planning time: 0.095 ms
Execution time: 5292.047 ms
(7 rows)

So, that extraction test is about 1% *slower* than the basic Tom Lane
lengths-only patch, and still 80% slower than original JSONB. And it's
the same size as the lengths-only version.

Huh?

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#84Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#83)
1 attachment(s)
Re: jsonb format is pessimal for toast compression

Josh Berkus <josh@agliodbs.com> writes:

On 08/15/2014 04:19 PM, Tom Lane wrote:

Personally I'd prefer to go to the all-lengths approach, but a large
part of that comes from a subjective assessment that the hybrid approach
is too messy. Others might well disagree.

... So, that extraction test is about 1% *slower* than the basic Tom Lane
lengths-only patch, and still 80% slower than original JSONB. And it's
the same size as the lengths-only version.

Since it's looking like this might be the direction we want to go, I took
the time to flesh out my proof-of-concept patch. The attached version
takes care of cosmetic issues (like fixing the comments), and includes
code to avoid O(N^2) penalties in findJsonbValueFromContainer and
JsonbIteratorNext. I'm not sure whether those changes will help
noticeably on Josh's test case; for me, they seemed worth making, but
they do not bring the code back to full speed parity with the all-offsets
version. But as we've been discussing, it seems likely that those costs
would be swamped by compression and I/O considerations in most scenarios
with large documents; and of course for small documents it hardly matters.

Even if we don't go this way, there are parts of this patch that would
need to get committed. I found for instance that convertJsonbArray and
convertJsonbObject have insufficient defenses against overflowing the
overall length field for the array or object.

For my own part, I'm satisfied with the patch as attached (modulo the
need to teach pg_upgrade about the incompatibility). There remains the
question of whether to take this opportunity to add a version ID to the
binary format. I'm not as excited about that idea as I originally was;
having now studied the code more carefully, I think that any expansion
would likely happen by adding more type codes and/or commandeering the
currently-unused high-order bit of JEntrys. We don't need a version ID
in the header for that. Moreover, if we did have such an ID, it would be
notationally painful to get it to most of the places that might need it.

regards, tom lane

Attachments:

jsonb-all-lengths.patch (text/x-diff; charset=us-ascii)
diff --git a/src/backend/utils/adt/jsonb.c b/src/backend/utils/adt/jsonb.c
index 2fd87fc..456011a 100644
*** a/src/backend/utils/adt/jsonb.c
--- b/src/backend/utils/adt/jsonb.c
*************** jsonb_from_cstring(char *json, int len)
*** 196,207 ****
  static size_t
  checkStringLen(size_t len)
  {
! 	if (len > JENTRY_POSMASK)
  		ereport(ERROR,
  				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  				 errmsg("string too long to represent as jsonb string"),
  				 errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.",
! 						   JENTRY_POSMASK)));
  
  	return len;
  }
--- 196,207 ----
  static size_t
  checkStringLen(size_t len)
  {
! 	if (len > JENTRY_LENMASK)
  		ereport(ERROR,
  				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  				 errmsg("string too long to represent as jsonb string"),
  				 errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.",
! 						   JENTRY_LENMASK)));
  
  	return len;
  }
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..e47eaea 100644
*** a/src/backend/utils/adt/jsonb_util.c
--- b/src/backend/utils/adt/jsonb_util.c
***************
*** 26,40 ****
   * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
   * reserved for that in the JsonbContainer.header field.
   *
!  * (the total size of an array's elements is also limited by JENTRY_POSMASK,
   * but we're not concerned about that here)
   */
  #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
  #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
  
! static void fillJsonbValue(JEntry *array, int index, char *base_addr,
  			   JsonbValue *result);
! static bool	equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static int	compareJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static Jsonb *convertToJsonb(JsonbValue *val);
  static void convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
--- 26,41 ----
   * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
   * reserved for that in the JsonbContainer.header field.
   *
!  * (the total size of an array's elements is also limited by JENTRY_LENMASK,
   * but we're not concerned about that here)
   */
  #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
  #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
  
! static void fillJsonbValue(JEntry *children, int index,
! 			   char *base_addr, uint32 offset,
  			   JsonbValue *result);
! static bool equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static int	compareJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static Jsonb *convertToJsonb(JsonbValue *val);
  static void convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
*************** JsonbValueToJsonb(JsonbValue *val)
*** 108,113 ****
--- 109,135 ----
  }
  
  /*
+  * Get the offset of the variable-length portion of a Jsonb node within
+  * the variable-length-data part of its container.  The node is identified
+  * by index within the container's JEntry array.
+  *
+  * We do this by adding up the lengths of all the previous nodes'
+  * variable-length portions.  It's best to avoid using this function when
+  * iterating through all the nodes in a container, since that would result
+  * in O(N^2) work.
+  */
+ uint32
+ getJsonbOffset(const JEntry *ja, int index)
+ {
+ 	uint32		off = 0;
+ 	int			i;
+ 
+ 	for (i = 0; i < index; i++)
+ 		off += JBE_LEN(ja, i);
+ 	return off;
+ }
+ 
+ /*
   * BT comparator worker function.  Returns an integer less than, equal to, or
   * greater than zero, indicating whether a is less than, equal to, or greater
   * than b.  Consistent with the requirements for a B-Tree operator class
*************** findJsonbValueFromContainer(JsonbContain
*** 279,324 ****
  	if (flags & JB_FARRAY & container->header)
  	{
  		char	   *base_addr = (char *) (children + count);
  		int			i;
  
  		for (i = 0; i < count; i++)
  		{
! 			fillJsonbValue(children, i, base_addr, result);
  
  			if (key->type == result->type)
  			{
  				if (equalsJsonbScalarValue(key, result))
  					return result;
  			}
  		}
  	}
  	else if (flags & JB_FOBJECT & container->header)
  	{
  		/* Since this is an object, account for *Pairs* of Jentrys */
  		char	   *base_addr = (char *) (children + count * 2);
  		uint32		stopLow = 0,
! 					stopMiddle;
  
! 		/* Object key past by caller must be a string */
  		Assert(key->type == jbvString);
  
  		/* Binary search on object/pair keys *only* */
! 		while (stopLow < count)
  		{
  			int			index;
  			int			difference;
  			JsonbValue	candidate;
  
  			/*
! 			 * Note how we compensate for the fact that we're iterating
! 			 * through pairs (not entries) throughout.
  			 */
- 			stopMiddle = stopLow + (count - stopLow) / 2;
- 
  			index = stopMiddle * 2;
  
  			candidate.type = jbvString;
! 			candidate.val.string.val = base_addr + JBE_OFF(children, index);
  			candidate.val.string.len = JBE_LEN(children, index);
  
  			difference = lengthCompareJsonbStringValue(&candidate, key);
--- 301,370 ----
  	if (flags & JB_FARRAY & container->header)
  	{
  		char	   *base_addr = (char *) (children + count);
+ 		uint32		offset = 0;
  		int			i;
  
  		for (i = 0; i < count; i++)
  		{
! 			fillJsonbValue(children, i, base_addr, offset, result);
  
  			if (key->type == result->type)
  			{
  				if (equalsJsonbScalarValue(key, result))
  					return result;
  			}
+ 
+ 			offset += JBE_LEN(children, i);
  		}
  	}
  	else if (flags & JB_FOBJECT & container->header)
  	{
  		/* Since this is an object, account for *Pairs* of Jentrys */
  		char	   *base_addr = (char *) (children + count * 2);
+ 		uint32	   *offsets;
+ 		uint32		lastoff;
+ 		int			lastoffpos;
  		uint32		stopLow = 0,
! 					stopHigh = count;
  
! 		/* Object key passed by caller must be a string */
  		Assert(key->type == jbvString);
  
+ 		/*
+ 		 * We use a cache to avoid redundant getJsonbOffset() computations
+ 		 * inside the search loop.  Note that count may well be zero at this
+ 		 * point; to avoid an ugly special case for initializing lastoff and
+ 		 * lastoffpos, we allocate one extra array element.
+ 		 */
+ 		offsets = (uint32 *) palloc((count * 2 + 1) * sizeof(uint32));
+ 		offsets[0] = lastoff = 0;
+ 		lastoffpos = 0;
+ 
  		/* Binary search on object/pair keys *only* */
! 		while (stopLow < stopHigh)
  		{
+ 			uint32		stopMiddle;
  			int			index;
  			int			difference;
  			JsonbValue	candidate;
  
+ 			stopMiddle = stopLow + (stopHigh - stopLow) / 2;
+ 
  			/*
! 			 * Compensate for the fact that we're searching through pairs (not
! 			 * entries).
  			 */
  			index = stopMiddle * 2;
  
+ 			/* Update the offsets cache through at least index+1 */
+ 			while (lastoffpos <= index)
+ 			{
+ 				lastoff += JBE_LEN(children, lastoffpos);
+ 				offsets[++lastoffpos] = lastoff;
+ 			}
+ 
  			candidate.type = jbvString;
! 			candidate.val.string.val = base_addr + offsets[index];
  			candidate.val.string.len = JBE_LEN(children, index);
  
  			difference = lengthCompareJsonbStringValue(&candidate, key);
*************** findJsonbValueFromContainer(JsonbContain
*** 326,333 ****
  			if (difference == 0)
  			{
  				/* Found our key, return value */
! 				fillJsonbValue(children, index + 1, base_addr, result);
  
  				return result;
  			}
  			else
--- 372,382 ----
  			if (difference == 0)
  			{
  				/* Found our key, return value */
! 				fillJsonbValue(children, index + 1,
! 							   base_addr, offsets[index + 1],
! 							   result);
  
+ 				pfree(offsets);
  				return result;
  			}
  			else
*************** findJsonbValueFromContainer(JsonbContain
*** 335,343 ****
  				if (difference < 0)
  					stopLow = stopMiddle + 1;
  				else
! 					count = stopMiddle;
  			}
  		}
  	}
  
  	/* Not found */
--- 384,394 ----
  				if (difference < 0)
  					stopLow = stopMiddle + 1;
  				else
! 					stopHigh = stopMiddle;
  			}
  		}
+ 
+ 		pfree(offsets);
  	}
  
  	/* Not found */
*************** getIthJsonbValueFromContainer(JsonbConta
*** 368,374 ****
  
  	result = palloc(sizeof(JsonbValue));
  
! 	fillJsonbValue(container->children, i, base_addr, result);
  
  	return result;
  }
--- 419,427 ----
  
  	result = palloc(sizeof(JsonbValue));
  
! 	fillJsonbValue(container->children, i, base_addr,
! 				   getJsonbOffset(container->children, i),
! 				   result);
  
  	return result;
  }
*************** getIthJsonbValueFromContainer(JsonbConta
*** 377,387 ****
   * A helper function to fill in a JsonbValue to represent an element of an
   * array, or a key or value of an object.
   *
   * A nested array or object will be returned as jbvBinary, ie. it won't be
   * expanded.
   */
  static void
! fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
  {
  	JEntry		entry = children[index];
  
--- 430,446 ----
   * A helper function to fill in a JsonbValue to represent an element of an
   * array, or a key or value of an object.
   *
+  * The node's JEntry is at children[index], and its variable-length data
+  * is at base_addr + offset.  We make the caller determine the offset since
+  * in many cases the caller can amortize the work across multiple children.
+  *
   * A nested array or object will be returned as jbvBinary, ie. it won't be
   * expanded.
   */
  static void
! fillJsonbValue(JEntry *children, int index,
! 			   char *base_addr, uint32 offset,
! 			   JsonbValue *result)
  {
  	JEntry		entry = children[index];
  
*************** fillJsonbValue(JEntry *children, int ind
*** 392,405 ****
  	else if (JBE_ISSTRING(entry))
  	{
  		result->type = jbvString;
! 		result->val.string.val = base_addr + JBE_OFF(children, index);
! 		result->val.string.len = JBE_LEN(children, index);
  		Assert(result->val.string.len >= 0);
  	}
  	else if (JBE_ISNUMERIC(entry))
  	{
  		result->type = jbvNumeric;
! 		result->val.numeric = (Numeric) (base_addr + INTALIGN(JBE_OFF(children, index)));
  	}
  	else if (JBE_ISBOOL_TRUE(entry))
  	{
--- 451,464 ----
  	else if (JBE_ISSTRING(entry))
  	{
  		result->type = jbvString;
! 		result->val.string.val = base_addr + offset;
! 		result->val.string.len = JBE_LENFLD(entry);
  		Assert(result->val.string.len >= 0);
  	}
  	else if (JBE_ISNUMERIC(entry))
  	{
  		result->type = jbvNumeric;
! 		result->val.numeric = (Numeric) (base_addr + INTALIGN(offset));
  	}
  	else if (JBE_ISBOOL_TRUE(entry))
  	{
*************** fillJsonbValue(JEntry *children, int ind
*** 415,422 ****
  	{
  		Assert(JBE_ISCONTAINER(entry));
  		result->type = jbvBinary;
! 		result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(JBE_OFF(children, index)));
! 		result->val.binary.len = JBE_LEN(children, index) - (INTALIGN(JBE_OFF(children, index)) - JBE_OFF(children, index));
  	}
  }
  
--- 474,482 ----
  	{
  		Assert(JBE_ISCONTAINER(entry));
  		result->type = jbvBinary;
! 		/* Remove alignment padding from data pointer and len */
! 		result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(offset));
! 		result->val.binary.len = JBE_LENFLD(entry) - (INTALIGN(offset) - offset);
  	}
  }
  
*************** recurse:
*** 668,680 ****
  			 * a full conversion
  			 */
  			val->val.array.rawScalar = (*it)->isScalar;
! 			(*it)->i = 0;
  			/* Set state for next call */
  			(*it)->state = JBI_ARRAY_ELEM;
  			return WJB_BEGIN_ARRAY;
  
  		case JBI_ARRAY_ELEM:
! 			if ((*it)->i >= (*it)->nElems)
  			{
  				/*
  				 * All elements within array already processed.  Report this
--- 728,741 ----
  			 * a full conversion
  			 */
  			val->val.array.rawScalar = (*it)->isScalar;
! 			(*it)->curIndex = 0;
! 			(*it)->curDataOffset = 0;
  			/* Set state for next call */
  			(*it)->state = JBI_ARRAY_ELEM;
  			return WJB_BEGIN_ARRAY;
  
  		case JBI_ARRAY_ELEM:
! 			if ((*it)->curIndex >= (*it)->nElems)
  			{
  				/*
  				 * All elements within array already processed.  Report this
*************** recurse:
*** 686,692 ****
  				return WJB_END_ARRAY;
  			}
  
! 			fillJsonbValue((*it)->children, (*it)->i++, (*it)->dataProper, val);
  
  			if (!IsAJsonbScalar(val) && !skipNested)
  			{
--- 747,758 ----
  				return WJB_END_ARRAY;
  			}
  
! 			fillJsonbValue((*it)->children, (*it)->curIndex,
! 						   (*it)->dataProper, (*it)->curDataOffset,
! 						   val);
! 
! 			(*it)->curDataOffset += JBE_LEN((*it)->children, (*it)->curIndex);
! 			(*it)->curIndex++;
  
  			if (!IsAJsonbScalar(val) && !skipNested)
  			{
*************** recurse:
*** 712,724 ****
  			 * v->val.object.pairs is not actually set, because we aren't
  			 * doing a full conversion
  			 */
! 			(*it)->i = 0;
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  			return WJB_BEGIN_OBJECT;
  
  		case JBI_OBJECT_KEY:
! 			if ((*it)->i >= (*it)->nElems)
  			{
  				/*
  				 * All pairs within object already processed.  Report this to
--- 778,791 ----
  			 * v->val.object.pairs is not actually set, because we aren't
  			 * doing a full conversion
  			 */
! 			(*it)->curIndex = 0;
! 			(*it)->curDataOffset = 0;
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  			return WJB_BEGIN_OBJECT;
  
  		case JBI_OBJECT_KEY:
! 			if ((*it)->curIndex >= (*it)->nElems)
  			{
  				/*
  				 * All pairs within object already processed.  Report this to
*************** recurse:
*** 732,738 ****
  			else
  			{
  				/* Return key of a key/value pair.  */
! 				fillJsonbValue((*it)->children, (*it)->i * 2, (*it)->dataProper, val);
  				if (val->type != jbvString)
  					elog(ERROR, "unexpected jsonb type as object key");
  
--- 799,807 ----
  			else
  			{
  				/* Return key of a key/value pair.  */
! 				fillJsonbValue((*it)->children, (*it)->curIndex * 2,
! 							   (*it)->dataProper, (*it)->curDataOffset,
! 							   val);
  				if (val->type != jbvString)
  					elog(ERROR, "unexpected jsonb type as object key");
  
*************** recurse:
*** 745,752 ****
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  
! 			fillJsonbValue((*it)->children, ((*it)->i++) * 2 + 1,
! 						   (*it)->dataProper, val);
  
  			/*
  			 * Value may be a container, in which case we recurse with new,
--- 814,829 ----
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  
! 			(*it)->curDataOffset += JBE_LEN((*it)->children,
! 											(*it)->curIndex * 2);
! 
! 			fillJsonbValue((*it)->children, (*it)->curIndex * 2 + 1,
! 						   (*it)->dataProper, (*it)->curDataOffset,
! 						   val);
! 
! 			(*it)->curDataOffset += JBE_LEN((*it)->children,
! 											(*it)->curIndex * 2 + 1);
! 			(*it)->curIndex++;
  
  			/*
  			 * Value may be a container, in which case we recurse with new,
*************** convertJsonbArray(StringInfo buffer, JEn
*** 1340,1353 ****
  	int			totallen;
  	uint32		header;
  
! 	/* Initialize pointer into conversion buffer at this level */
  	offset = buffer->len;
  
  	padBufferToInt(buffer);
  
  	/*
! 	 * Construct the header Jentry, stored in the beginning of the variable-
! 	 * length payload.
  	 */
  	header = val->val.array.nElems | JB_FARRAY;
  	if (val->val.array.rawScalar)
--- 1417,1431 ----
  	int			totallen;
  	uint32		header;
  
! 	/* Remember where variable-length data starts for this array */
  	offset = buffer->len;
  
+ 	/* Align to 4-byte boundary (any padding counts as part of my data) */
  	padBufferToInt(buffer);
  
  	/*
! 	 * Construct the header Jentry and store it in the beginning of the
! 	 * variable-length payload.
  	 */
  	header = val->val.array.nElems | JB_FARRAY;
  	if (val->val.array.rawScalar)
*************** convertJsonbArray(StringInfo buffer, JEn
*** 1358,1364 ****
  	}
  
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
! 	/* reserve space for the JEntries of the elements. */
  	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.array.nElems);
  
  	totallen = 0;
--- 1436,1443 ----
  	}
  
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
! 
! 	/* Reserve space for the JEntries of the elements. */
  	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.array.nElems);
  
  	totallen = 0;
*************** convertJsonbArray(StringInfo buffer, JEn
*** 1368,1391 ****
  		int			len;
  		JEntry		meta;
  
  		convertJsonbValue(buffer, &meta, elem, level + 1);
- 		len = meta & JENTRY_POSMASK;
- 		totallen += len;
  
! 		if (totallen > JENTRY_POSMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 							JENTRY_POSMASK)));
  
- 		if (i > 0)
- 			meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
  
  	totallen = buffer->len - offset;
  
  	/* Initialize the header of this node, in the container's JEntry array */
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
--- 1447,1485 ----
  		int			len;
  		JEntry		meta;
  
+ 		/*
+ 		 * Convert element, producing a JEntry and appending its
+ 		 * variable-length data to buffer
+ 		 */
  		convertJsonbValue(buffer, &meta, elem, level + 1);
  
! 		/*
! 		 * Bail out if total variable-length data exceeds what will fit in a
! 		 * JEntry length field.  We check this in each iteration, not just
! 		 * once at the end, to forestall possible integer overflow.
! 		 */
! 		len = JBE_LENFLD(meta);
! 		totallen += len;
! 		if (totallen > JENTRY_LENMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 							JENTRY_LENMASK)));
  
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
  
+ 	/* Total data size is everything we've appended to buffer */
  	totallen = buffer->len - offset;
  
+ 	/* Check length again, since we didn't include the metadata above */
+ 	if (totallen > JENTRY_LENMASK)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ 				 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
+ 						JENTRY_LENMASK)));
+ 
  	/* Initialize the header of this node, in the container's JEntry array */
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
*************** convertJsonbArray(StringInfo buffer, JEn
*** 1393,1457 ****
  static void
  convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
  {
- 	uint32		header;
  	int			offset;
  	int			metaoffset;
  	int			i;
  	int			totallen;
  
! 	/* Initialize pointer into conversion buffer at this level */
  	offset = buffer->len;
  
  	padBufferToInt(buffer);
  
! 	/* Initialize header */
  	header = val->val.object.nPairs | JB_FOBJECT;
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
  
! 	/* reserve space for the JEntries of the keys and values */
  	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.object.nPairs * 2);
  
  	totallen = 0;
  	for (i = 0; i < val->val.object.nPairs; i++)
  	{
! 		JsonbPair *pair = &val->val.object.pairs[i];
! 		int len;
! 		JEntry meta;
  
! 		/* put key */
  		convertJsonbScalar(buffer, &meta, &pair->key);
  
! 		len = meta & JENTRY_POSMASK;
  		totallen += len;
  
- 		if (totallen > JENTRY_POSMASK)
- 			ereport(ERROR,
- 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
- 							JENTRY_POSMASK)));
- 
- 		if (i > 0)
- 			meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  
! 		convertJsonbValue(buffer, &meta, &pair->value, level);
! 		len = meta & JENTRY_POSMASK;
! 		totallen += len;
  
! 		if (totallen > JENTRY_POSMASK)
! 			ereport(ERROR,
! 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 							JENTRY_POSMASK)));
  
- 		meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
  
  	totallen = buffer->len - offset;
  
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
  
--- 1487,1570 ----
  static void
  convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
  {
  	int			offset;
  	int			metaoffset;
  	int			i;
  	int			totallen;
+ 	uint32		header;
  
! 	/* Remember where variable-length data starts for this object */
  	offset = buffer->len;
  
+ 	/* Align to 4-byte boundary (any padding counts as part of my data) */
  	padBufferToInt(buffer);
  
! 	/*
! 	 * Construct the header Jentry and store it in the beginning of the
! 	 * variable-length payload.
! 	 */
  	header = val->val.object.nPairs | JB_FOBJECT;
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
  
! 	/* Reserve space for the JEntries of the keys and values. */
  	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.object.nPairs * 2);
  
  	totallen = 0;
  	for (i = 0; i < val->val.object.nPairs; i++)
  	{
! 		JsonbPair  *pair = &val->val.object.pairs[i];
! 		int			len;
! 		JEntry		meta;
  
! 		/*
! 		 * Convert key, producing a JEntry and appending its variable-length
! 		 * data to buffer
! 		 */
  		convertJsonbScalar(buffer, &meta, &pair->key);
  
! 		len = JBE_LENFLD(meta);
  		totallen += len;
  
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  
! 		/*
! 		 * Convert value, producing a JEntry and appending its variable-length
! 		 * data to buffer
! 		 */
! 		convertJsonbValue(buffer, &meta, &pair->value, level + 1);
  
! 		len = JBE_LENFLD(meta);
! 		totallen += len;
  
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
+ 
+ 		/*
+ 		 * Bail out if total variable-length data exceeds what will fit in a
+ 		 * JEntry length field.  We check this in each iteration, not just
+ 		 * once at the end, to forestall possible integer overflow.  But it
+ 		 * should be sufficient to check once per iteration, since
+ 		 * JENTRY_LENMASK is several bits narrower than int.
+ 		 */
+ 		if (totallen > JENTRY_LENMASK)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ 					 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+ 							JENTRY_LENMASK)));
  	}
  
+ 	/* Total data size is everything we've appended to buffer */
  	totallen = buffer->len - offset;
  
+ 	/* Check length again, since we didn't include the metadata above */
+ 	if (totallen > JENTRY_LENMASK)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ 				 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+ 						JENTRY_LENMASK)));
+ 
+ 	/* Initialize the header of this node, in the container's JEntry array */
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
  
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 91e3e14..b9a4314 100644
*** a/src/include/utils/jsonb.h
--- b/src/include/utils/jsonb.h
*************** typedef struct JsonbValue JsonbValue;
*** 83,91 ****
   * buffer is accessed, but they can also be deep copied and passed around.
   *
   * Jsonb is a tree structure. Each node in the tree consists of a JEntry
!  * header, and a variable-length content.  The JEntry header indicates what
!  * kind of a node it is, e.g. a string or an array, and the offset and length
!  * of its variable-length portion within the container.
   *
   * The JEntry and the content of a node are not stored physically together.
   * Instead, the container array or object has an array that holds the JEntrys
--- 83,91 ----
   * buffer is accessed, but they can also be deep copied and passed around.
   *
   * Jsonb is a tree structure. Each node in the tree consists of a JEntry
!  * header and a variable-length content (possibly of zero size).  The JEntry
!  * header indicates what kind of a node it is, e.g. a string or an array,
!  * and includes the length of its variable-length portion.
   *
   * The JEntry and the content of a node are not stored physically together.
   * Instead, the container array or object has an array that holds the JEntrys
*************** typedef struct JsonbValue JsonbValue;
*** 95,133 ****
   * hold its JEntry. Hence, no JEntry header is stored for the root node.  It
   * is implicitly known that the root node must be an array or an object,
   * so we can get away without the type indicator as long as we can distinguish
!  * the two.  For that purpose, both an array and an object begins with a uint32
   * header field, which contains an JB_FOBJECT or JB_FARRAY flag.  When a naked
   * scalar value needs to be stored as a Jsonb value, what we actually store is
   * an array with one element, with the flags in the array's header field set
   * to JB_FSCALAR | JB_FARRAY.
   *
-  * To encode the length and offset of the variable-length portion of each
-  * node in a compact way, the JEntry stores only the end offset within the
-  * variable-length portion of the container node. For the first JEntry in the
-  * container's JEntry array, that equals to the length of the node data.  The
-  * begin offset and length of the rest of the entries can be calculated using
-  * the end offset of the previous JEntry in the array.
-  *
   * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
   * the variable-length portion of some node types is aligned to a 4-byte
   * boundary, while others are not. When alignment is needed, the padding is
   * in the beginning of the node that requires it. For example, if a numeric
   * node is stored after a string node, so that the numeric node begins at
   * offset 3, the variable-length portion of the numeric node will begin with
!  * one padding byte.
   */
  
  /*
   * Jentry format.
   *
!  * The least significant 28 bits store the end offset of the entry (see
!  * JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits
!  * are used to store the type of the entry. The most significant bit
!  * is unused, and should be set to zero.
   */
  typedef uint32 JEntry;
  
! #define JENTRY_POSMASK			0x0FFFFFFF
  #define JENTRY_TYPEMASK			0x70000000
  
  /* values stored in the type bits */
--- 95,126 ----
   * hold its JEntry. Hence, no JEntry header is stored for the root node.  It
   * is implicitly known that the root node must be an array or an object,
   * so we can get away without the type indicator as long as we can distinguish
!  * the two.  For that purpose, both an array and an object begin with a uint32
   * header field, which contains an JB_FOBJECT or JB_FARRAY flag.  When a naked
   * scalar value needs to be stored as a Jsonb value, what we actually store is
   * an array with one element, with the flags in the array's header field set
   * to JB_FSCALAR | JB_FARRAY.
   *
   * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
   * the variable-length portion of some node types is aligned to a 4-byte
   * boundary, while others are not. When alignment is needed, the padding is
   * in the beginning of the node that requires it. For example, if a numeric
   * node is stored after a string node, so that the numeric node begins at
   * offset 3, the variable-length portion of the numeric node will begin with
!  * one padding byte so that the actual numeric data is 4-byte aligned.
   */
  
  /*
   * Jentry format.
   *
!  * The least significant 28 bits store the data length of the entry (see
!  * JBE_LENFLD and JBE_LEN macros below). The next three bits store the type
!  * of the entry. The most significant bit is reserved for future use, and
!  * should be set to zero.
   */
  typedef uint32 JEntry;
  
! #define JENTRY_LENMASK			0x0FFFFFFF
  #define JENTRY_TYPEMASK			0x70000000
  
  /* values stored in the type bits */
*************** typedef uint32 JEntry;
*** 148,160 ****
  #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
  
  /*
!  * Macros for getting the offset and length of an element. Note multiple
!  * evaluations and access to prior array element.
   */
! #define JBE_ENDPOS(je_)			((je_) & JENTRY_POSMASK)
! #define JBE_OFF(ja, i)			((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
! #define JBE_LEN(ja, i)			((i) == 0 ? JBE_ENDPOS((ja)[i]) \
! 								 : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
  
  /*
   * A jsonb array or object node, within a Jsonb Datum.
--- 141,150 ----
  #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
  
  /*
!  * Macros for getting the data length of a JEntry.
   */
! #define JBE_LENFLD(je_)			((je_) & JENTRY_LENMASK)
! #define JBE_LEN(ja, i)			JBE_LENFLD((ja)[i])
  
  /*
   * A jsonb array or object node, within a Jsonb Datum.
*************** typedef struct JsonbIterator
*** 287,306 ****
  {
  	/* Container being iterated */
  	JsonbContainer *container;
! 	uint32		nElems;			/* Number of elements in children array (will be
! 								 * nPairs for objects) */
  	bool		isScalar;		/* Pseudo-array scalar value? */
! 	JEntry	   *children;
  
  	/* Current item in buffer (up to nElems, but must * 2 for objects) */
! 	int			i;
  
! 	/*
! 	 * Data proper.  This points just past end of children array.
! 	 * We use the JBE_OFF() macro on the Jentrys to find offsets of each
! 	 * child in this area.
! 	 */
! 	char	   *dataProper;
  
  	/* Private state */
  	JsonbIterState state;
--- 277,294 ----
  {
  	/* Container being iterated */
  	JsonbContainer *container;
! 	uint32		nElems;			/* Number of elements in children array (will
! 								 * be nPairs for objects) */
  	bool		isScalar;		/* Pseudo-array scalar value? */
! 	JEntry	   *children;		/* JEntrys for child nodes */
! 	/* Data proper.  This points just past end of children array */
! 	char	   *dataProper;
  
  	/* Current item in buffer (up to nElems, but must * 2 for objects) */
! 	int			curIndex;
  
! 	/* Data offset corresponding to current item */
! 	uint32		curDataOffset;
  
  	/* Private state */
  	JsonbIterState state;
*************** extern Datum gin_consistent_jsonb_path(P
*** 344,349 ****
--- 332,338 ----
  extern Datum gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS);
  
  /* Support functions */
+ extern uint32 getJsonbOffset(const JEntry *ja, int index);
  extern int	compareJsonbContainers(JsonbContainer *a, JsonbContainer *b);
  extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader,
  							uint32 flags,
#85Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

On 08/20/2014 08:29 AM, Tom Lane wrote:

Since it's looking like this might be the direction we want to go, I took
the time to flesh out my proof-of-concept patch. The attached version
takes care of cosmetic issues (like fixing the comments), and includes
code to avoid O(N^2) penalties in findJsonbValueFromContainer and
JsonbIteratorNext

OK, will test.

This means we need a beta3, no?

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#86Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#85)
Re: jsonb format is pessimal for toast compression

Josh Berkus <josh@agliodbs.com> writes:

This means we need a beta3, no?

If we change the on-disk format, I'd say so. So we don't want to wait
around too long before deciding.

regards, tom lane


#87Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

On 08/20/2014 08:29 AM, Tom Lane wrote:

Josh Berkus <josh@agliodbs.com> writes:

On 08/15/2014 04:19 PM, Tom Lane wrote:

Personally I'd prefer to go to the all-lengths approach, but a large
part of that comes from a subjective assessment that the hybrid approach
is too messy. Others might well disagree.

... So, that extraction test is about 1% *slower* than the basic Tom Lane
lengths-only patch, and still 80% slower than original JSONB. And it's
the same size as the lengths-only version.

Since it's looking like this might be the direction we want to go, I took
the time to flesh out my proof-of-concept patch. The attached version
takes care of cosmetic issues (like fixing the comments), and includes
code to avoid O(N^2) penalties in findJsonbValueFromContainer and
JsonbIteratorNext. I'm not sure whether those changes will help
noticeably on Josh's test case; for me, they seemed worth making, but
they do not bring the code back to full speed parity with the all-offsets
version. But as we've been discussing, it seems likely that those costs
would be swamped by compression and I/O considerations in most scenarios
with large documents; and of course for small documents it hardly matters.

Table sizes and extraction times are unchanged from the prior patch
based on my workload.

We should be comparing all-lengths vs length-and-offset maybe using
another workload as well ...

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#88Arthur Silva
arthurprs@gmail.com
In reply to: Josh Berkus (#87)
Re: jsonb format is pessimal for toast compression

What data are you using right now Josh?

There's the github archive http://www.githubarchive.org/
Here's some sample data https://gist.github.com/igrigorik/2017462

--
Arthur Silva

On Wed, Aug 20, 2014 at 6:09 PM, Josh Berkus <josh@agliodbs.com> wrote:

On 08/20/2014 08:29 AM, Tom Lane wrote:

Josh Berkus <josh@agliodbs.com> writes:

On 08/15/2014 04:19 PM, Tom Lane wrote:

Personally I'd prefer to go to the all-lengths approach, but a large
part of that comes from a subjective assessment that the hybrid approach
is too messy. Others might well disagree.

... So, that extraction test is about 1% *slower* than the basic Tom Lane
lengths-only patch, and still 80% slower than original JSONB. And it's
the same size as the lengths-only version.

Since it's looking like this might be the direction we want to go, I took
the time to flesh out my proof-of-concept patch. The attached version
takes care of cosmetic issues (like fixing the comments), and includes
code to avoid O(N^2) penalties in findJsonbValueFromContainer and
JsonbIteratorNext. I'm not sure whether those changes will help
noticeably on Josh's test case; for me, they seemed worth making, but
they do not bring the code back to full speed parity with the all-offsets
version. But as we've been discussing, it seems likely that those costs
would be swamped by compression and I/O considerations in most scenarios
with large documents; and of course for small documents it hardly matters.

Table sizes and extraction times are unchanged from the prior patch
based on my workload.

We should be comparing all-lengths vs length-and-offset maybe using
another workload as well ...

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

#89Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

On 08/20/2014 03:42 PM, Arthur Silva wrote:

What data are you using right now Josh?

The same data as upthread.

Can you test the three patches (9.4 head, 9.4 with Tom's cleanup of
Heikki's patch, and 9.4 with Tom's latest lengths-only) on your workload?

I'm concerned that my workload is unusual and don't want us to make this
decision based entirely on it.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#90Arthur Silva
arthurprs@gmail.com
In reply to: Josh Berkus (#89)
Re: jsonb format is pessimal for toast compression

On Thu, Aug 21, 2014 at 6:20 PM, Josh Berkus <josh@agliodbs.com> wrote:

On 08/20/2014 03:42 PM, Arthur Silva wrote:

What data are you using right now Josh?

The same data as upthread.

Can you test the three patches (9.4 head, 9.4 with Tom's cleanup of
Heikki's patch, and 9.4 with Tom's latest lengths-only) on your workload?

I'm concerned that my workload is unusual and don't want us to make this
decision based entirely on it.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

Here are my test results so far with the github archive data.

It's important to keep in mind that the PushEvent event objects that I use
in the queries only contain a small number of keys (8, to be precise), so
these tests don't really stress the changed code.

Anyway, in this dataset (with the small objects) using the all-lengths
patch provides small compression savings, but the overhead is minimal.

----------------

Test data: 610MB of Json -- 341969 items

Index size (jsonb_ops): 331MB

Test query 1: SELECT data->'url', data->'actor' FROM t_json WHERE data @>
'{"type": "PushEvent"}'
Test query 1 items: 169732

Test query 2: SELECT data FROM t_json WHERE data @> '{"type": "PushEvent"}'
Test query 2 items:

----------------
HEAD (aka, all offsets) EXTENDED
Size: 374MB
Toast Size: 145MB

Test query 1 runtime: 680ms
Test query 2 runtime: 405ms
----------------
HEAD (aka, all offsets) EXTERNAL
Size: 366MB
Toast Size: 333MB

Test query 1 runtime: 505ms
Test query 2 runtime: 350ms
----------------
All Lengths (Tom Lane patch) EXTENDED
Size: 379MB
Toast Size: 108MB

Test query 1 runtime: 720ms
Test query 2 runtime: 420ms
----------------
All Lengths (Tom Lane patch) EXTERNAL
Size: 366MB
Toast Size: 333MB

Test query 1 runtime: 525ms
Test query 2 runtime: 355ms

--
Arthur Silva

#91Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Tom Lane (#81)
Re: jsonb format is pessimal for toast compression

On 08/16/2014 02:19 AM, Tom Lane wrote:

I think the realistic alternatives at this point are either to
switch to all-lengths as in my test patch, or to use the hybrid approach
of Heikki's test patch. IMO the major attraction of Heikki's patch
is that it'd be upward compatible with existing beta installations,
ie no initdb required (but thus, no opportunity to squeeze in a version
identifier either). It's not showing up terribly well in the performance
tests I've been doing --- it's about halfway between HEAD and my patch on
that extract-a-key-from-a-PLAIN-stored-column test. But, just as with my
patch, there are things that could be done to micro-optimize it by
touching a bit more code.

I did some quick stats comparing compressed sizes for the delicio.us
data, printing quartiles as per Josh's lead:

all-lengths {440,569,609,655,1257}
Heikki's patch {456,582,624,671,1274}
HEAD {493,636,684,744,1485}

(As before, this is pg_column_size of the jsonb within a table whose rows
are wide enough to force tuptoaster.c to try to compress the jsonb;
otherwise many of these values wouldn't get compressed.) These documents
don't have enough keys to trigger the first_success_by issue, so that
HEAD doesn't look too awful, but still there's about an 11% gain from
switching from offsets to lengths. Heikki's method captures much of
that but not all.

Personally I'd prefer to go to the all-lengths approach, but a large
part of that comes from a subjective assessment that the hybrid approach
is too messy. Others might well disagree.

It's not too pretty, no. But it would be nice to not have to make a
tradeoff between lookup speed and compressibility.

Yet another idea is to store all lengths, but add an additional array of
offsets to JsonbContainer. The array would contain the offset of, say,
every 16th element. It would be very small compared to the lengths
array, but would greatly speed up random access on a large array/object.
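
For concreteness, a minimal stand-alone sketch of that idea (the stride of
16 and the names JB_OFFSET_STRIDE and child_offset are assumptions made up
for this illustration, not code from any posted patch):

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch only: stride, names and test values are made up. */
#define JB_OFFSET_STRIDE 16

typedef uint32_t JEntry;        /* reduced here to just a length field */

/*
 * Start offset of child i, given the per-child lengths plus a sparse
 * offsets[] array holding the offsets of children 0, 16, 32, ...
 * At most JB_OFFSET_STRIDE - 1 lengths ever need to be summed.
 */
static uint32_t
child_offset(const JEntry *lengths, const uint32_t *offsets, int i)
{
    int         base = i / JB_OFFSET_STRIDE;
    uint32_t    off = offsets[base];
    int         j;

    for (j = base * JB_OFFSET_STRIDE; j < i; j++)
        off += lengths[j];
    return off;
}

int
main(void)
{
    JEntry      lengths[40];
    uint32_t    offsets[3];     /* offsets of children 0, 16 and 32 */
    uint32_t    off = 0;
    int         i;

    for (i = 0; i < 40; i++)
    {
        if (i % JB_OFFSET_STRIDE == 0)
            offsets[i / JB_OFFSET_STRIDE] = off;
        lengths[i] = 5;         /* pretend every child is 5 bytes long */
        off += lengths[i];
    }

    printf("offset of child 37 = %u\n",
           (unsigned) child_offset(lengths, offsets, 37));
    return 0;
}

With a stride of 16, even a 1000-child container would carry only 63 extra
uint32s, and no lookup ever sums more than 15 length fields.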

- Heikki


#92Tom Lane
tgl@sss.pgh.pa.us
In reply to: Heikki Linnakangas (#91)
Re: jsonb format is pessimal for toast compression

Heikki Linnakangas <hlinnakangas@vmware.com> writes:

On 08/16/2014 02:19 AM, Tom Lane wrote:

I think the realistic alternatives at this point are either to
switch to all-lengths as in my test patch, or to use the hybrid approach
of Heikki's test patch. ...
Personally I'd prefer to go to the all-lengths approach, but a large
part of that comes from a subjective assessment that the hybrid approach
is too messy. Others might well disagree.

It's not too pretty, no. But it would be nice to not have to make a
tradeoff between lookup speed and compressibility.

Yet another idea is to store all lengths, but add an additional array of
offsets to JsonbContainer. The array would contain the offset of, say,
every 16th element. It would be very small compared to the lengths
array, but would greatly speed up random access on a large array/object.

That does nothing to address my basic concern about the patch, which is
that it's too complicated and therefore bug-prone. Moreover, it'd lose
on-disk compatibility which is really the sole saving grace of the
proposal.

My feeling about it at this point is that the apparent speed gain from
using offsets is illusory: in practically all real-world cases where there
are enough keys or array elements for it to matter, costs associated with
compression (or rather failure to compress) will dominate any savings we
get from offset-assisted lookups. I agree that the evidence for this
opinion is pretty thin ... but the evidence against it is nonexistent.
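
To make the cost being weighed here concrete: under the all-lengths format,
recovering a child's start offset means summing the preceding length fields,
presumably along the lines of this stand-alone sketch (JBE_LENFLD is reduced
to masking out the 28-bit length field, and the _sketch suffix marks this as
an approximation rather than the actual getJsonbOffset code):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t JEntry;

#define JENTRY_LENMASK  0x0FFFFFFF
#define JBE_LENFLD(je)  ((je) & JENTRY_LENMASK)

/*
 * Stand-alone approximation: with lengths-only JEntrys, the start offset
 * of child 'index' is the sum of the lengths of all preceding children,
 * i.e. O(index) work, versus O(1) with an end-offset representation.
 */
static uint32_t
getJsonbOffset_sketch(const JEntry *ja, int index)
{
    uint32_t    off = 0;
    int         i;

    for (i = 0; i < index; i++)
        off += JBE_LENFLD(ja[i]);
    return off;
}

int
main(void)
{
    JEntry      lens[4] = {3, 7, 2, 11};    /* toy length fields */

    /* child 3 starts after 3 + 7 + 2 = 12 bytes */
    printf("%u\n", (unsigned) getJsonbOffset_sketch(lens, 3));
    return 0;
}

The argument above is that this linear walk, even repeated inside lookups,
tends to be cheaper in practice than the I/O and decompression costs of the
less compressible all-offsets form.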

regards, tom lane


#93Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

On 08/26/2014 07:51 AM, Tom Lane wrote:

My feeling about it at this point is that the apparent speed gain from
using offsets is illusory: in practically all real-world cases where there
are enough keys or array elements for it to matter, costs associated with
compression (or rather failure to compress) will dominate any savings we
get from offset-assisted lookups. I agree that the evidence for this
opinion is pretty thin ... but the evidence against it is nonexistent.

Well, I have shown one test case where the all-lengths approach is a net
penalty. However, for that to be the case, you have to have the
following conditions *all* be true:

* lots of top-level keys
* short values
* rows which are on the borderline for TOAST
* table which fits in RAM

... so that's a "special case" and if it's sub-optimal, no biggie. Also,
it's not like it's an order-of-magnitude slower.

Anyway, I called for feedback on my blog, and have gotten some:

http://www.databasesoup.com/2014/08/the-great-jsonb-tradeoff.html

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#94Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#93)
Re: jsonb format is pessimal for toast compression

Josh Berkus <josh@agliodbs.com> writes:

Anyway, I called for feedback on my blog, and have gotten some:
http://www.databasesoup.com/2014/08/the-great-jsonb-tradeoff.html

I was hoping you'd get some useful data from that, but so far it seems
like a rehash of points made in the on-list thread :-(

regards, tom lane


#95Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#37)
Re: jsonb format is pessimal for toast compression

On 08/26/2014 11:40 AM, Tom Lane wrote:

Josh Berkus <josh@agliodbs.com> writes:

Anyway, I called for feedback on my blog, and have gotten some:
http://www.databasesoup.com/2014/08/the-great-jsonb-tradeoff.html

I was hoping you'd get some useful data from that, but so far it seems
like a rehash of points made in the on-list thread :-(

regards, tom lane

yah, me too. :-(

Unfortunately even the outside commenters don't seem to understand that
storage size *is* related to speed; it's exchanging I/O speed for CPU speed.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#96Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#95)
Re: jsonb format is pessimal for toast compression

Josh Berkus <josh@agliodbs.com> writes:

On 08/26/2014 11:40 AM, Tom Lane wrote:

I was hoping you'd get some useful data from that, but so far it seems
like a rehash of points made in the on-list thread :-(

Unfortunately even the outside commenters don't seem to understand that
storage size *is* related to speed; it's exchanging I/O speed for CPU speed.

Yeah, exactly. Given current hardware trends, data compression is
becoming more of a win not less as time goes on: CPU cycles are cheap
even compared to main memory access, let alone mass storage. So I'm
thinking we want to adopt a compression-friendly data format even if
it measures out as a small loss currently.

I wish it were cache-friendly too, per the upthread tangent about having
to fetch keys from all over the place within a large JSON object.

... and while I was typing that sentence, lightning struck. The existing
arrangement of object subfields with keys and values interleaved is just
plain dumb. We should rearrange that as all the keys in order, then all
the values in the same order. Then the keys are naturally adjacent in
memory and object-key searches become much more cache-friendly: you
probably touch most of the key portion of the object, but none of the
values portion, until you know exactly what part of the latter to fetch.
This approach might complicate the lookup logic marginally but I bet not
very much; and it will be a huge help if we ever want to do smart access
to EXTERNAL (non-compressed) JSON values.
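
As a stand-alone illustration of that layout (a linear key scan stands in
for the real code's binary search, and every name below is made up for the
sketch rather than taken from jsonb_util.c): with keys stored as
children[0..count) and values as children[count..2*count), a lookup touches
only the contiguous key strings, and the matching value is simply child
i + count.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint32_t JEntry;        /* reduced here to just a length field */

/*
 * Toy object layout: children[0..count) hold the key lengths in sorted key
 * order, children[count..2*count) hold the value lengths in matching order,
 * and the string data of all children is concatenated in that same order.
 */
static const char *
object_lookup(const JEntry *children, int count, const char *data,
              const char *key, uint32_t *val_len)
{
    uint32_t    off = 0;
    int         i;

    /* scan the contiguous keys region only */
    for (i = 0; i < count; i++)
    {
        uint32_t    len = children[i];

        if (len == strlen(key) && memcmp(data + off, key, len) == 0)
        {
            /*
             * The value is child i + count; its start offset is the sum of
             * the lengths of everything that precedes it.
             */
            uint32_t    voff = off + len;
            int         j;

            for (j = i + 1; j < count + i; j++)
                voff += children[j];
            *val_len = children[count + i];
            return data + voff;
        }
        off += len;
    }
    return NULL;                /* key not present */
}

int
main(void)
{
    /* two pairs, {"id": "42", "name": "jsonb"}, stored keys-then-values */
    JEntry      children[4] = {2, 4, 2, 5}; /* "id", "name", "42", "jsonb" */
    const char *data = "idname42jsonb";
    uint32_t    vlen = 0;
    const char *v = object_lookup(children, 2, data, "name", &vlen);

    printf("%.*s\n", (int) vlen, v);        /* prints "jsonb" */
    return 0;
}

The values region is not touched at all until the key comparison succeeds,
which is the cache behavior described above.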

I will go prototype that just to see how much code rearrangement is
required.

regards, tom lane


#97Andres Freund
andres@2ndquadrant.com
In reply to: Tom Lane (#96)
Re: jsonb format is pessimal for toast compression

On 2014-08-26 15:01:27 -0400, Tom Lane wrote:

Josh Berkus <josh@agliodbs.com> writes:

On 08/26/2014 11:40 AM, Tom Lane wrote:

I was hoping you'd get some useful data from that, but so far it seems
like a rehash of points made in the on-list thread :-(

Unfortunately even the outside commenters don't seem to understand that
storage size *is* related to speed; it's exchanging I/O speed for CPU speed.

Yeah, exactly. Given current hardware trends, data compression is
becoming more of a win not less as time goes on: CPU cycles are cheap
even compared to main memory access, let alone mass storage. So I'm
thinking we want to adopt a compression-friendly data format even if
it measures out as a small loss currently.

On the other hand, the majority of databases these days fit into main
memory due to its increasing size, and postgres is more often CPU than
IO bound.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#98Laurence Rowe
l@lrowe.co.uk
In reply to: Josh Berkus (#93)
Re: jsonb format is pessimal for toast compression

On 26 August 2014 11:34, Josh Berkus <josh@agliodbs.com> wrote:

On 08/26/2014 07:51 AM, Tom Lane wrote:

My feeling about it at this point is that the apparent speed gain from
using offsets is illusory: in practically all real-world cases where there
are enough keys or array elements for it to matter, costs associated with
compression (or rather failure to compress) will dominate any savings we
get from offset-assisted lookups. I agree that the evidence for this
opinion is pretty thin ... but the evidence against it is nonexistent.

Well, I have shown one test case where the all-lengths approach is a net
penalty. However, for that to be the case, you have to have the
following conditions *all* be true:

* lots of top-level keys
* short values
* rows which are on the borderline for TOAST
* table which fits in RAM

... so that's a "special case" and if it's sub-optimal, no biggie. Also,
it's not like it's an order-of-magnitude slower.

Anyway, I called for feedback on my blog, and have gotten some:

http://www.databasesoup.com/2014/08/the-great-jsonb-tradeoff.html

It would be really interesting to see your results with column STORAGE
EXTERNAL for that benchmark. I think it is important to separate out the
slowdown due to decompression now being needed vs. that inherent in the new
format; we can always switch off compression on a per-column basis using
STORAGE EXTERNAL.

My JSON data has smallish objects with a small number of keys; it barely
compresses at all with the patch and shows similar results to Arthur's
data. Across ~500K rows I get:

encoded=# select count(properties->>'submitted_by') from compressed;
count
--------
431948
(1 row)

Time: 250.512 ms

encoded=# select count(properties->>'submitted_by') from uncompressed;
count
--------
431948
(1 row)

Time: 218.552 ms

Laurence

#99Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#97)
Re: jsonb format is pessimal for toast compression

Andres Freund <andres@2ndquadrant.com> writes:

On 2014-08-26 15:01:27 -0400, Tom Lane wrote:

Yeah, exactly. Given current hardware trends, data compression is
becoming more of a win not less as time goes on: CPU cycles are cheap
even compared to main memory access, let alone mass storage. So I'm
thinking we want to adopt a compression-friendly data format even if
it measures out as a small loss currently.

On the other hand, the majority of databases these days fit into main
memory due to its increasing size, and postgres is more often CPU than
IO bound.

Well, better data compression helps make that true ;-). And don't forget
cache effects; actual main memory is considered slow these days.

regards, tom lane


#100Claudio Freire
klaussfreire@gmail.com
In reply to: Tom Lane (#96)
Re: jsonb format is pessimal for toast compression

On Tue, Aug 26, 2014 at 4:01 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Josh Berkus <josh@agliodbs.com> writes:

On 08/26/2014 11:40 AM, Tom Lane wrote:

I was hoping you'd get some useful data from that, but so far it seems
like a rehash of points made in the on-list thread :-(

Unfortunately even the outside commenters don't seem to understand that
storage size *is* related to speed; it's exchanging I/O speed for CPU speed.

Yeah, exactly. Given current hardware trends, data compression is
becoming more of a win not less as time goes on: CPU cycles are cheap
even compared to main memory access, let alone mass storage. So I'm
thinking we want to adopt a compression-friendly data format even if
it measures out as a small loss currently.

I wish it were cache-friendly too, per the upthread tangent about having
to fetch keys from all over the place within a large JSON object.

What about my earlier proposal?

An in-memory compressed representation would greatly help cache
locality, more so if you pack keys as you mentioned.


#101Andres Freund
andres@2ndquadrant.com
In reply to: Tom Lane (#99)
Re: jsonb format is pessimal for toast compression

On 2014-08-26 15:17:13 -0400, Tom Lane wrote:

Andres Freund <andres@2ndquadrant.com> writes:

On 2014-08-26 15:01:27 -0400, Tom Lane wrote:

Yeah, exactly. Given current hardware trends, data compression is
becoming more of a win not less as time goes on: CPU cycles are cheap
even compared to main memory access, let alone mass storage. So I'm
thinking we want to adopt a compression-friendly data format even if
it measures out as a small loss currently.

On the other hand the majority of databases these day fit into main
memory due to its increasing sizes and postgres is more often CPU than
IO bound.

Well, better data compression helps make that true ;-).

People disable toast compression though because it results in better
performance :(. Part of that could be fixed by a faster compression
method, part of it by decompressing less often. But still.

And don't forget cache effects; actual main memory is considered slow
these days.

Right. But that plays the other way round too. Compressed datums need to
be copied to be accessed uncompressed. Whereas at least in comparison to
inline compressed datums that's not necessary.

Anyway, that's just to say that I don't really agree that CPU overhead
is a worthy price to pay for storage efficiency if the gains are small.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#102Peter Geoghegan
pg@heroku.com
In reply to: Andres Freund (#101)
Re: jsonb format is pessimal for toast compression

On Tue, Aug 26, 2014 at 12:27 PM, Andres Freund <andres@2ndquadrant.com> wrote:

Anyway, that's just to say that I don't really agree that CPU overhead
is a worthy price to pay for storage efficiency if the gains are small.

+1

--
Peter Geoghegan


#103Josh Berkus
josh@agliodbs.com
In reply to: Heikki Linnakangas (#91)
Re: jsonb format is pessimal for toast compression

On 08/26/2014 12:27 PM, Andres Freund wrote:

Anyway, that's just to say that I don't really agree that CPU overhead
is a worthy price to pay for storage efficiency if the gains are small.

But in this case the gains aren't small; we're talking up to 60% smaller
storage.

Testing STORAGE EXTENDED soon.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#104Tom Lane
tgl@sss.pgh.pa.us
In reply to: Tom Lane (#96)
2 attachment(s)
Re: jsonb format is pessimal for toast compression

I wrote:

I wish it were cache-friendly too, per the upthread tangent about having
to fetch keys from all over the place within a large JSON object.

... and while I was typing that sentence, lightning struck. The existing
arrangement of object subfields with keys and values interleaved is just
plain dumb. We should rearrange that as all the keys in order, then all
the values in the same order. Then the keys are naturally adjacent in
memory and object-key searches become much more cache-friendly: you
probably touch most of the key portion of the object, but none of the
values portion, until you know exactly what part of the latter to fetch.
This approach might complicate the lookup logic marginally but I bet not
very much; and it will be a huge help if we ever want to do smart access
to EXTERNAL (non-compressed) JSON values.

I will go prototype that just to see how much code rearrangement is
required.

This looks pretty good from a coding point of view. I have not had time
yet to see if it affects the speed of the benchmark cases we've been
trying. I suspect that it won't make much difference in them. I think
if we do decide to make an on-disk format change, we should seriously
consider including this change.

The same concept could be applied to offset-based storage of course,
although I rather doubt that we'd make that combination of choices since
it would be giving up on-disk compatibility for benefits that are mostly
in the future.

Attached are two patches: one is a "delta" against the last jsonb-lengths
patch I posted, and the other is a "merged" patch showing the total change
from HEAD, for ease of application.

regards, tom lane

Attachments:

jsonb-lengths-delta.patch (text/x-diff; charset=us-ascii)
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index e47eaea..4e7fe67 100644
*** a/src/backend/utils/adt/jsonb_util.c
--- b/src/backend/utils/adt/jsonb_util.c
***************
*** 26,33 ****
   * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
   * reserved for that in the JsonbContainer.header field.
   *
!  * (the total size of an array's elements is also limited by JENTRY_LENMASK,
!  * but we're not concerned about that here)
   */
  #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
  #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
--- 26,33 ----
   * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
   * reserved for that in the JsonbContainer.header field.
   *
!  * (The total size of an array's or object's elements is also limited by
!  * JENTRY_LENMASK, but we're not concerned about that here.)
   */
  #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
  #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
*************** findJsonbValueFromContainer(JsonbContain
*** 294,303 ****
  {
  	JEntry	   *children = container->children;
  	int			count = (container->header & JB_CMASK);
! 	JsonbValue *result = palloc(sizeof(JsonbValue));
  
  	Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0);
  
  	if (flags & JB_FARRAY & container->header)
  	{
  		char	   *base_addr = (char *) (children + count);
--- 294,309 ----
  {
  	JEntry	   *children = container->children;
  	int			count = (container->header & JB_CMASK);
! 	JsonbValue *result;
  
  	Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0);
  
+ 	/* Quick out without a palloc cycle if object/array is empty */
+ 	if (count <= 0)
+ 		return NULL;
+ 
+ 	result = palloc(sizeof(JsonbValue));
+ 
  	if (flags & JB_FARRAY & container->header)
  	{
  		char	   *base_addr = (char *) (children + count);
*************** findJsonbValueFromContainer(JsonbContain
*** 323,329 ****
  		char	   *base_addr = (char *) (children + count * 2);
  		uint32	   *offsets;
  		uint32		lastoff;
! 		int			lastoffpos;
  		uint32		stopLow = 0,
  					stopHigh = count;
  
--- 329,335 ----
  		char	   *base_addr = (char *) (children + count * 2);
  		uint32	   *offsets;
  		uint32		lastoff;
! 		int			i;
  		uint32		stopLow = 0,
  					stopHigh = count;
  
*************** findJsonbValueFromContainer(JsonbContain
*** 332,379 ****
  
  		/*
  		 * We use a cache to avoid redundant getJsonbOffset() computations
! 		 * inside the search loop.  Note that count may well be zero at this
! 		 * point; to avoid an ugly special case for initializing lastoff and
! 		 * lastoffpos, we allocate one extra array element.
  		 */
! 		offsets = (uint32 *) palloc((count * 2 + 1) * sizeof(uint32));
! 		offsets[0] = lastoff = 0;
! 		lastoffpos = 0;
  
  		/* Binary search on object/pair keys *only* */
  		while (stopLow < stopHigh)
  		{
  			uint32		stopMiddle;
- 			int			index;
  			int			difference;
  			JsonbValue	candidate;
  
  			stopMiddle = stopLow + (stopHigh - stopLow) / 2;
  
- 			/*
- 			 * Compensate for the fact that we're searching through pairs (not
- 			 * entries).
- 			 */
- 			index = stopMiddle * 2;
- 
- 			/* Update the offsets cache through at least index+1 */
- 			while (lastoffpos <= index)
- 			{
- 				lastoff += JBE_LEN(children, lastoffpos);
- 				offsets[++lastoffpos] = lastoff;
- 			}
- 
  			candidate.type = jbvString;
! 			candidate.val.string.val = base_addr + offsets[index];
! 			candidate.val.string.len = JBE_LEN(children, index);
  
  			difference = lengthCompareJsonbStringValue(&candidate, key);
  
  			if (difference == 0)
  			{
! 				/* Found our key, return value */
! 				fillJsonbValue(children, index + 1,
! 							   base_addr, offsets[index + 1],
  							   result);
  
  				pfree(offsets);
--- 338,383 ----
  
  		/*
  		 * We use a cache to avoid redundant getJsonbOffset() computations
! 		 * inside the search loop.  The entire cache can be filled immediately
! 		 * since we expect to need the last offset for value access.  (This
! 		 * choice could lose if the key is not present, but avoiding extra
! 		 * logic inside the search loop probably makes up for that.)
  		 */
! 		offsets = (uint32 *) palloc(count * sizeof(uint32));
! 		lastoff = 0;
! 		for (i = 0; i < count; i++)
! 		{
! 			offsets[i] = lastoff;
! 			lastoff += JBE_LEN(children, i);
! 		}
! 		/* lastoff now has the offset of the first value item */
  
  		/* Binary search on object/pair keys *only* */
  		while (stopLow < stopHigh)
  		{
  			uint32		stopMiddle;
  			int			difference;
  			JsonbValue	candidate;
  
  			stopMiddle = stopLow + (stopHigh - stopLow) / 2;
  
  			candidate.type = jbvString;
! 			candidate.val.string.val = base_addr + offsets[stopMiddle];
! 			candidate.val.string.len = JBE_LEN(children, stopMiddle);
  
  			difference = lengthCompareJsonbStringValue(&candidate, key);
  
  			if (difference == 0)
  			{
! 				/* Found our key, return corresponding value */
! 				int			index = stopMiddle + count;
! 
! 				/* navigate to appropriate offset */
! 				for (i = count; i < index; i++)
! 					lastoff += JBE_LEN(children, i);
! 
! 				fillJsonbValue(children, index,
! 							   base_addr, lastoff,
  							   result);
  
  				pfree(offsets);
*************** recurse:
*** 730,735 ****
--- 734,740 ----
  			val->val.array.rawScalar = (*it)->isScalar;
  			(*it)->curIndex = 0;
  			(*it)->curDataOffset = 0;
+ 			(*it)->curValueOffset = 0;	/* not actually used */
  			/* Set state for next call */
  			(*it)->state = JBI_ARRAY_ELEM;
  			return WJB_BEGIN_ARRAY;
*************** recurse:
*** 780,785 ****
--- 785,792 ----
  			 */
  			(*it)->curIndex = 0;
  			(*it)->curDataOffset = 0;
+ 			(*it)->curValueOffset = getJsonbOffset((*it)->children,
+ 												   (*it)->nElems);
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  			return WJB_BEGIN_OBJECT;
*************** recurse:
*** 799,805 ****
  			else
  			{
  				/* Return key of a key/value pair.  */
! 				fillJsonbValue((*it)->children, (*it)->curIndex * 2,
  							   (*it)->dataProper, (*it)->curDataOffset,
  							   val);
  				if (val->type != jbvString)
--- 806,812 ----
  			else
  			{
  				/* Return key of a key/value pair.  */
! 				fillJsonbValue((*it)->children, (*it)->curIndex,
  							   (*it)->dataProper, (*it)->curDataOffset,
  							   val);
  				if (val->type != jbvString)
*************** recurse:
*** 814,828 ****
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  
! 			(*it)->curDataOffset += JBE_LEN((*it)->children,
! 											(*it)->curIndex * 2);
! 
! 			fillJsonbValue((*it)->children, (*it)->curIndex * 2 + 1,
! 						   (*it)->dataProper, (*it)->curDataOffset,
  						   val);
  
  			(*it)->curDataOffset += JBE_LEN((*it)->children,
! 											(*it)->curIndex * 2 + 1);
  			(*it)->curIndex++;
  
  			/*
--- 821,834 ----
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  
! 			fillJsonbValue((*it)->children, (*it)->curIndex + (*it)->nElems,
! 						   (*it)->dataProper, (*it)->curValueOffset,
  						   val);
  
  			(*it)->curDataOffset += JBE_LEN((*it)->children,
! 											(*it)->curIndex);
! 			(*it)->curValueOffset += JBE_LEN((*it)->children,
! 											 (*it)->curIndex + (*it)->nElems);
  			(*it)->curIndex++;
  
  			/*
*************** convertJsonbObject(StringInfo buffer, JE
*** 1509,1514 ****
--- 1515,1524 ----
  	/* Reserve space for the JEntries of the keys and values. */
  	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.object.nPairs * 2);
  
+ 	/*
+ 	 * Iterate over the keys, then over the values, since that is the ordering
+ 	 * we want in the on-disk representation.
+ 	 */
  	totallen = 0;
  	for (i = 0; i < val->val.object.nPairs; i++)
  	{
*************** convertJsonbObject(StringInfo buffer, JE
*** 1529,1534 ****
--- 1539,1561 ----
  		metaoffset += sizeof(JEntry);
  
  		/*
+ 		 * Bail out if total variable-length data exceeds what will fit in a
+ 		 * JEntry length field.  We check this in each iteration, not just
+ 		 * once at the end, to forestall possible integer overflow.
+ 		 */
+ 		if (totallen > JENTRY_LENMASK)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ 					 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+ 							JENTRY_LENMASK)));
+ 	}
+ 	for (i = 0; i < val->val.object.nPairs; i++)
+ 	{
+ 		JsonbPair  *pair = &val->val.object.pairs[i];
+ 		int			len;
+ 		JEntry		meta;
+ 
+ 		/*
  		 * Convert value, producing a JEntry and appending its variable-length
  		 * data to buffer
  		 */
*************** convertJsonbObject(StringInfo buffer, JE
*** 1543,1551 ****
  		/*
  		 * Bail out if total variable-length data exceeds what will fit in a
  		 * JEntry length field.  We check this in each iteration, not just
! 		 * once at the end, to forestall possible integer overflow.  But it
! 		 * should be sufficient to check once per iteration, since
! 		 * JENTRY_LENMASK is several bits narrower than int.
  		 */
  		if (totallen > JENTRY_LENMASK)
  			ereport(ERROR,
--- 1570,1576 ----
  		/*
  		 * Bail out if total variable-length data exceeds what will fit in a
  		 * JEntry length field.  We check this in each iteration, not just
! 		 * once at the end, to forestall possible integer overflow.
  		 */
  		if (totallen > JENTRY_LENMASK)
  			ereport(ERROR,
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index b9a4314..f9472af 100644
*** a/src/include/utils/jsonb.h
--- b/src/include/utils/jsonb.h
*************** typedef uint32 JEntry;
*** 149,156 ****
  /*
   * A jsonb array or object node, within a Jsonb Datum.
   *
!  * An array has one child for each element. An object has two children for
!  * each key/value pair.
   */
  typedef struct JsonbContainer
  {
--- 149,160 ----
  /*
   * A jsonb array or object node, within a Jsonb Datum.
   *
!  * An array has one child for each element, stored in array order.
!  *
!  * An object has two children for each key/value pair.  The keys all appear
!  * first, in key sort order; then the values appear, in an order matching the
!  * key order.  This arrangement keeps the keys compact in memory, making a
!  * search for a particular key more cache-friendly.
   */
  typedef struct JsonbContainer
  {
*************** typedef struct JsonbContainer
*** 162,169 ****
  } JsonbContainer;
  
  /* flags for the header-field in JsonbContainer */
! #define JB_CMASK				0x0FFFFFFF
! #define JB_FSCALAR				0x10000000
  #define JB_FOBJECT				0x20000000
  #define JB_FARRAY				0x40000000
  
--- 166,173 ----
  } JsonbContainer;
  
  /* flags for the header-field in JsonbContainer */
! #define JB_CMASK				0x0FFFFFFF		/* mask for count field */
! #define JB_FSCALAR				0x10000000		/* flag bits */
  #define JB_FOBJECT				0x20000000
  #define JB_FARRAY				0x40000000
  
*************** struct JsonbValue
*** 238,255 ****
  									 (jsonbval)->type <= jbvBool)
  
  /*
!  * Pair within an Object.
   *
!  * Pairs with duplicate keys are de-duplicated.  We store the order for the
!  * benefit of doing so in a well-defined way with respect to the original
!  * observed order (which is "last observed wins").  This is only used briefly
!  * when originally constructing a Jsonb.
   */
  struct JsonbPair
  {
  	JsonbValue	key;			/* Must be a jbvString */
  	JsonbValue	value;			/* May be of any type */
! 	uint32		order;			/* preserves order of pairs with equal keys */
  };
  
  /* Conversion state used when parsing Jsonb from text, or for type coercion */
--- 242,261 ----
  									 (jsonbval)->type <= jbvBool)
  
  /*
!  * Key/value pair within an Object.
   *
!  * This struct type is only used briefly while constructing a Jsonb; it is
!  * *not* the on-disk representation.
!  *
!  * Pairs with duplicate keys are de-duplicated.  We store the originally
!  * observed pair ordering for the purpose of removing duplicates in a
!  * well-defined way (which is "last observed wins").
   */
  struct JsonbPair
  {
  	JsonbValue	key;			/* Must be a jbvString */
  	JsonbValue	value;			/* May be of any type */
! 	uint32		order;			/* Pair's index in original sequence */
  };
  
  /* Conversion state used when parsing Jsonb from text, or for type coercion */
*************** typedef struct JsonbIterator
*** 284,295 ****
  	/* Data proper.  This points just past end of children array */
  	char	   *dataProper;
  
! 	/* Current item in buffer (up to nElems, but must * 2 for objects) */
  	int			curIndex;
  
  	/* Data offset corresponding to current item */
  	uint32		curDataOffset;
  
  	/* Private state */
  	JsonbIterState state;
  
--- 290,308 ----
  	/* Data proper.  This points just past end of children array */
  	char	   *dataProper;
  
! 	/* Current item in buffer (up to nElems) */
  	int			curIndex;
  
  	/* Data offset corresponding to current item */
  	uint32		curDataOffset;
  
+ 	/*
+ 	 * If the container is an object, we want to return keys and values
+ 	 * alternately; so curDataOffset points to the current key, and
+ 	 * curValueOffset points to the current value.
+ 	 */
+ 	uint32		curValueOffset;
+ 
  	/* Private state */
  	JsonbIterState state;
  
jsonb-lengths-merged.patch (text/x-diff; charset=us-ascii)
diff --git a/src/backend/utils/adt/jsonb.c b/src/backend/utils/adt/jsonb.c
index 2fd87fc..456011a 100644
*** a/src/backend/utils/adt/jsonb.c
--- b/src/backend/utils/adt/jsonb.c
*************** jsonb_from_cstring(char *json, int len)
*** 196,207 ****
  static size_t
  checkStringLen(size_t len)
  {
! 	if (len > JENTRY_POSMASK)
  		ereport(ERROR,
  				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  				 errmsg("string too long to represent as jsonb string"),
  				 errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.",
! 						   JENTRY_POSMASK)));
  
  	return len;
  }
--- 196,207 ----
  static size_t
  checkStringLen(size_t len)
  {
! 	if (len > JENTRY_LENMASK)
  		ereport(ERROR,
  				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  				 errmsg("string too long to represent as jsonb string"),
  				 errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.",
! 						   JENTRY_LENMASK)));
  
  	return len;
  }
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..4e7fe67 100644
*** a/src/backend/utils/adt/jsonb_util.c
--- b/src/backend/utils/adt/jsonb_util.c
***************
*** 26,40 ****
   * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
   * reserved for that in the JsonbContainer.header field.
   *
!  * (the total size of an array's elements is also limited by JENTRY_POSMASK,
!  * but we're not concerned about that here)
   */
  #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
  #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
  
! static void fillJsonbValue(JEntry *array, int index, char *base_addr,
  			   JsonbValue *result);
! static bool	equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static int	compareJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static Jsonb *convertToJsonb(JsonbValue *val);
  static void convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
--- 26,41 ----
   * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
   * reserved for that in the JsonbContainer.header field.
   *
!  * (The total size of an array's or object's elements is also limited by
!  * JENTRY_LENMASK, but we're not concerned about that here.)
   */
  #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
  #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
  
! static void fillJsonbValue(JEntry *children, int index,
! 			   char *base_addr, uint32 offset,
  			   JsonbValue *result);
! static bool equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static int	compareJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static Jsonb *convertToJsonb(JsonbValue *val);
  static void convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
*************** static void convertJsonbArray(StringInfo
*** 42,48 ****
  static void convertJsonbObject(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
  static void convertJsonbScalar(StringInfo buffer, JEntry *header, JsonbValue *scalarVal);
  
! static int reserveFromBuffer(StringInfo buffer, int len);
  static void appendToBuffer(StringInfo buffer, const char *data, int len);
  static void copyToBuffer(StringInfo buffer, int offset, const char *data, int len);
  static short padBufferToInt(StringInfo buffer);
--- 43,49 ----
  static void convertJsonbObject(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
  static void convertJsonbScalar(StringInfo buffer, JEntry *header, JsonbValue *scalarVal);
  
! static int	reserveFromBuffer(StringInfo buffer, int len);
  static void appendToBuffer(StringInfo buffer, const char *data, int len);
  static void copyToBuffer(StringInfo buffer, int offset, const char *data, int len);
  static short padBufferToInt(StringInfo buffer);
*************** JsonbValueToJsonb(JsonbValue *val)
*** 108,113 ****
--- 109,135 ----
  }
  
  /*
+  * Get the offset of the variable-length portion of a Jsonb node within
+  * the variable-length-data part of its container.  The node is identified
+  * by index within the container's JEntry array.
+  *
+  * We do this by adding up the lengths of all the previous nodes'
+  * variable-length portions.  It's best to avoid using this function when
+  * iterating through all the nodes in a container, since that would result
+  * in O(N^2) work.
+  */
+ uint32
+ getJsonbOffset(const JEntry *ja, int index)
+ {
+ 	uint32		off = 0;
+ 	int			i;
+ 
+ 	for (i = 0; i < index; i++)
+ 		off += JBE_LEN(ja, i);
+ 	return off;
+ }
+ 
+ /*
   * BT comparator worker function.  Returns an integer less than, equal to, or
   * greater than zero, indicating whether a is less than, equal to, or greater
   * than b.  Consistent with the requirements for a B-Tree operator class
*************** compareJsonbContainers(JsonbContainer *a
*** 201,207 ****
  			 *
  			 * If the two values were of the same container type, then there'd
  			 * have been a chance to observe the variation in the number of
! 			 * elements/pairs (when processing WJB_BEGIN_OBJECT, say).  They're
  			 * either two heterogeneously-typed containers, or a container and
  			 * some scalar type.
  			 *
--- 223,229 ----
  			 *
  			 * If the two values were of the same container type, then there'd
  			 * have been a chance to observe the variation in the number of
! 			 * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're
  			 * either two heterogeneously-typed containers, or a container and
  			 * some scalar type.
  			 *
*************** findJsonbValueFromContainer(JsonbContain
*** 272,333 ****
  {
  	JEntry	   *children = container->children;
  	int			count = (container->header & JB_CMASK);
! 	JsonbValue *result = palloc(sizeof(JsonbValue));
  
  	Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0);
  
  	if (flags & JB_FARRAY & container->header)
  	{
  		char	   *base_addr = (char *) (children + count);
  		int			i;
  
  		for (i = 0; i < count; i++)
  		{
! 			fillJsonbValue(children, i, base_addr, result);
  
  			if (key->type == result->type)
  			{
  				if (equalsJsonbScalarValue(key, result))
  					return result;
  			}
  		}
  	}
  	else if (flags & JB_FOBJECT & container->header)
  	{
  		/* Since this is an object, account for *Pairs* of Jentrys */
  		char	   *base_addr = (char *) (children + count * 2);
  		uint32		stopLow = 0,
! 					stopMiddle;
  
! 		/* Object key past by caller must be a string */
  		Assert(key->type == jbvString);
  
  		/* Binary search on object/pair keys *only* */
! 		while (stopLow < count)
  		{
! 			int			index;
  			int			difference;
  			JsonbValue	candidate;
  
! 			/*
! 			 * Note how we compensate for the fact that we're iterating
! 			 * through pairs (not entries) throughout.
! 			 */
! 			stopMiddle = stopLow + (count - stopLow) / 2;
! 
! 			index = stopMiddle * 2;
  
  			candidate.type = jbvString;
! 			candidate.val.string.val = base_addr + JBE_OFF(children, index);
! 			candidate.val.string.len = JBE_LEN(children, index);
  
  			difference = lengthCompareJsonbStringValue(&candidate, key);
  
  			if (difference == 0)
  			{
! 				/* Found our key, return value */
! 				fillJsonbValue(children, index + 1, base_addr, result);
  
  				return result;
  			}
  			else
--- 294,386 ----
  {
  	JEntry	   *children = container->children;
  	int			count = (container->header & JB_CMASK);
! 	JsonbValue *result;
  
  	Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0);
  
+ 	/* Quick out without a palloc cycle if object/array is empty */
+ 	if (count <= 0)
+ 		return NULL;
+ 
+ 	result = palloc(sizeof(JsonbValue));
+ 
  	if (flags & JB_FARRAY & container->header)
  	{
  		char	   *base_addr = (char *) (children + count);
+ 		uint32		offset = 0;
  		int			i;
  
  		for (i = 0; i < count; i++)
  		{
! 			fillJsonbValue(children, i, base_addr, offset, result);
  
  			if (key->type == result->type)
  			{
  				if (equalsJsonbScalarValue(key, result))
  					return result;
  			}
+ 
+ 			offset += JBE_LEN(children, i);
  		}
  	}
  	else if (flags & JB_FOBJECT & container->header)
  	{
  		/* Since this is an object, account for *Pairs* of Jentrys */
  		char	   *base_addr = (char *) (children + count * 2);
+ 		uint32	   *offsets;
+ 		uint32		lastoff;
+ 		int			i;
  		uint32		stopLow = 0,
! 					stopHigh = count;
  
! 		/* Object key passed by caller must be a string */
  		Assert(key->type == jbvString);
  
+ 		/*
+ 		 * We use a cache to avoid redundant getJsonbOffset() computations
+ 		 * inside the search loop.  The entire cache can be filled immediately
+ 		 * since we expect to need the last offset for value access.  (This
+ 		 * choice could lose if the key is not present, but avoiding extra
+ 		 * logic inside the search loop probably makes up for that.)
+ 		 */
+ 		offsets = (uint32 *) palloc(count * sizeof(uint32));
+ 		lastoff = 0;
+ 		for (i = 0; i < count; i++)
+ 		{
+ 			offsets[i] = lastoff;
+ 			lastoff += JBE_LEN(children, i);
+ 		}
+ 		/* lastoff now has the offset of the first value item */
+ 
  		/* Binary search on object/pair keys *only* */
! 		while (stopLow < stopHigh)
  		{
! 			uint32		stopMiddle;
  			int			difference;
  			JsonbValue	candidate;
  
! 			stopMiddle = stopLow + (stopHigh - stopLow) / 2;
  
  			candidate.type = jbvString;
! 			candidate.val.string.val = base_addr + offsets[stopMiddle];
! 			candidate.val.string.len = JBE_LEN(children, stopMiddle);
  
  			difference = lengthCompareJsonbStringValue(&candidate, key);
  
  			if (difference == 0)
  			{
! 				/* Found our key, return corresponding value */
! 				int			index = stopMiddle + count;
! 
! 				/* navigate to appropriate offset */
! 				for (i = count; i < index; i++)
! 					lastoff += JBE_LEN(children, i);
! 
! 				fillJsonbValue(children, index,
! 							   base_addr, lastoff,
! 							   result);
  
+ 				pfree(offsets);
  				return result;
  			}
  			else
*************** findJsonbValueFromContainer(JsonbContain
*** 335,343 ****
  				if (difference < 0)
  					stopLow = stopMiddle + 1;
  				else
! 					count = stopMiddle;
  			}
  		}
  	}
  
  	/* Not found */
--- 388,398 ----
  				if (difference < 0)
  					stopLow = stopMiddle + 1;
  				else
! 					stopHigh = stopMiddle;
  			}
  		}
+ 
+ 		pfree(offsets);
  	}
  
  	/* Not found */
*************** getIthJsonbValueFromContainer(JsonbConta
*** 368,374 ****
  
  	result = palloc(sizeof(JsonbValue));
  
! 	fillJsonbValue(container->children, i, base_addr, result);
  
  	return result;
  }
--- 423,431 ----
  
  	result = palloc(sizeof(JsonbValue));
  
! 	fillJsonbValue(container->children, i, base_addr,
! 				   getJsonbOffset(container->children, i),
! 				   result);
  
  	return result;
  }
*************** getIthJsonbValueFromContainer(JsonbConta
*** 377,387 ****
   * A helper function to fill in a JsonbValue to represent an element of an
   * array, or a key or value of an object.
   *
   * A nested array or object will be returned as jbvBinary, ie. it won't be
   * expanded.
   */
  static void
! fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
  {
  	JEntry		entry = children[index];
  
--- 434,450 ----
   * A helper function to fill in a JsonbValue to represent an element of an
   * array, or a key or value of an object.
   *
+  * The node's JEntry is at children[index], and its variable-length data
+  * is at base_addr + offset.  We make the caller determine the offset since
+  * in many cases the caller can amortize the work across multiple children.
+  *
   * A nested array or object will be returned as jbvBinary, ie. it won't be
   * expanded.
   */
  static void
! fillJsonbValue(JEntry *children, int index,
! 			   char *base_addr, uint32 offset,
! 			   JsonbValue *result)
  {
  	JEntry		entry = children[index];
  
*************** fillJsonbValue(JEntry *children, int ind
*** 392,405 ****
  	else if (JBE_ISSTRING(entry))
  	{
  		result->type = jbvString;
! 		result->val.string.val = base_addr + JBE_OFF(children, index);
! 		result->val.string.len = JBE_LEN(children, index);
  		Assert(result->val.string.len >= 0);
  	}
  	else if (JBE_ISNUMERIC(entry))
  	{
  		result->type = jbvNumeric;
! 		result->val.numeric = (Numeric) (base_addr + INTALIGN(JBE_OFF(children, index)));
  	}
  	else if (JBE_ISBOOL_TRUE(entry))
  	{
--- 455,468 ----
  	else if (JBE_ISSTRING(entry))
  	{
  		result->type = jbvString;
! 		result->val.string.val = base_addr + offset;
! 		result->val.string.len = JBE_LENFLD(entry);
  		Assert(result->val.string.len >= 0);
  	}
  	else if (JBE_ISNUMERIC(entry))
  	{
  		result->type = jbvNumeric;
! 		result->val.numeric = (Numeric) (base_addr + INTALIGN(offset));
  	}
  	else if (JBE_ISBOOL_TRUE(entry))
  	{
*************** fillJsonbValue(JEntry *children, int ind
*** 415,422 ****
  	{
  		Assert(JBE_ISCONTAINER(entry));
  		result->type = jbvBinary;
! 		result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(JBE_OFF(children, index)));
! 		result->val.binary.len = JBE_LEN(children, index) - (INTALIGN(JBE_OFF(children, index)) - JBE_OFF(children, index));
  	}
  }
  
--- 478,486 ----
  	{
  		Assert(JBE_ISCONTAINER(entry));
  		result->type = jbvBinary;
! 		/* Remove alignment padding from data pointer and len */
! 		result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(offset));
! 		result->val.binary.len = JBE_LENFLD(entry) - (INTALIGN(offset) - offset);
  	}
  }
  
*************** recurse:
*** 668,680 ****
  			 * a full conversion
  			 */
  			val->val.array.rawScalar = (*it)->isScalar;
! 			(*it)->i = 0;
  			/* Set state for next call */
  			(*it)->state = JBI_ARRAY_ELEM;
  			return WJB_BEGIN_ARRAY;
  
  		case JBI_ARRAY_ELEM:
! 			if ((*it)->i >= (*it)->nElems)
  			{
  				/*
  				 * All elements within array already processed.  Report this
--- 732,746 ----
  			 * a full conversion
  			 */
  			val->val.array.rawScalar = (*it)->isScalar;
! 			(*it)->curIndex = 0;
! 			(*it)->curDataOffset = 0;
! 			(*it)->curValueOffset = 0;	/* not actually used */
  			/* Set state for next call */
  			(*it)->state = JBI_ARRAY_ELEM;
  			return WJB_BEGIN_ARRAY;
  
  		case JBI_ARRAY_ELEM:
! 			if ((*it)->curIndex >= (*it)->nElems)
  			{
  				/*
  				 * All elements within array already processed.  Report this
*************** recurse:
*** 686,692 ****
  				return WJB_END_ARRAY;
  			}
  
! 			fillJsonbValue((*it)->children, (*it)->i++, (*it)->dataProper, val);
  
  			if (!IsAJsonbScalar(val) && !skipNested)
  			{
--- 752,763 ----
  				return WJB_END_ARRAY;
  			}
  
! 			fillJsonbValue((*it)->children, (*it)->curIndex,
! 						   (*it)->dataProper, (*it)->curDataOffset,
! 						   val);
! 
! 			(*it)->curDataOffset += JBE_LEN((*it)->children, (*it)->curIndex);
! 			(*it)->curIndex++;
  
  			if (!IsAJsonbScalar(val) && !skipNested)
  			{
*************** recurse:
*** 697,704 ****
  			else
  			{
  				/*
! 				 * Scalar item in array, or a container and caller didn't
! 				 * want us to recurse into it.
  				 */
  				return WJB_ELEM;
  			}
--- 768,775 ----
  			else
  			{
  				/*
! 				 * Scalar item in array, or a container and caller didn't want
! 				 * us to recurse into it.
  				 */
  				return WJB_ELEM;
  			}
*************** recurse:
*** 712,724 ****
  			 * v->val.object.pairs is not actually set, because we aren't
  			 * doing a full conversion
  			 */
! 			(*it)->i = 0;
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  			return WJB_BEGIN_OBJECT;
  
  		case JBI_OBJECT_KEY:
! 			if ((*it)->i >= (*it)->nElems)
  			{
  				/*
  				 * All pairs within object already processed.  Report this to
--- 783,798 ----
  			 * v->val.object.pairs is not actually set, because we aren't
  			 * doing a full conversion
  			 */
! 			(*it)->curIndex = 0;
! 			(*it)->curDataOffset = 0;
! 			(*it)->curValueOffset = getJsonbOffset((*it)->children,
! 												   (*it)->nElems);
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  			return WJB_BEGIN_OBJECT;
  
  		case JBI_OBJECT_KEY:
! 			if ((*it)->curIndex >= (*it)->nElems)
  			{
  				/*
  				 * All pairs within object already processed.  Report this to
*************** recurse:
*** 732,738 ****
  			else
  			{
  				/* Return key of a key/value pair.  */
! 				fillJsonbValue((*it)->children, (*it)->i * 2, (*it)->dataProper, val);
  				if (val->type != jbvString)
  					elog(ERROR, "unexpected jsonb type as object key");
  
--- 806,814 ----
  			else
  			{
  				/* Return key of a key/value pair.  */
! 				fillJsonbValue((*it)->children, (*it)->curIndex,
! 							   (*it)->dataProper, (*it)->curDataOffset,
! 							   val);
  				if (val->type != jbvString)
  					elog(ERROR, "unexpected jsonb type as object key");
  
*************** recurse:
*** 745,752 ****
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  
! 			fillJsonbValue((*it)->children, ((*it)->i++) * 2 + 1,
! 						   (*it)->dataProper, val);
  
  			/*
  			 * Value may be a container, in which case we recurse with new,
--- 821,835 ----
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  
! 			fillJsonbValue((*it)->children, (*it)->curIndex + (*it)->nElems,
! 						   (*it)->dataProper, (*it)->curValueOffset,
! 						   val);
! 
! 			(*it)->curDataOffset += JBE_LEN((*it)->children,
! 											(*it)->curIndex);
! 			(*it)->curValueOffset += JBE_LEN((*it)->children,
! 											 (*it)->curIndex + (*it)->nElems);
! 			(*it)->curIndex++;
  
  			/*
  			 * Value may be a container, in which case we recurse with new,
*************** reserveFromBuffer(StringInfo buffer, int
*** 1209,1216 ****
  	buffer->len += len;
  
  	/*
! 	 * Keep a trailing null in place, even though it's not useful for us;
! 	 * it seems best to preserve the invariants of StringInfos.
  	 */
  	buffer->data[buffer->len] = '\0';
  
--- 1292,1299 ----
  	buffer->len += len;
  
  	/*
! 	 * Keep a trailing null in place, even though it's not useful for us; it
! 	 * seems best to preserve the invariants of StringInfos.
  	 */
  	buffer->data[buffer->len] = '\0';
  
*************** convertToJsonb(JsonbValue *val)
*** 1284,1291 ****
  
  	/*
  	 * Note: the JEntry of the root is discarded. Therefore the root
! 	 * JsonbContainer struct must contain enough information to tell what
! 	 * kind of value it is.
  	 */
  
  	res = (Jsonb *) buffer.data;
--- 1367,1374 ----
  
  	/*
  	 * Note: the JEntry of the root is discarded. Therefore the root
! 	 * JsonbContainer struct must contain enough information to tell what kind
! 	 * of value it is.
  	 */
  
  	res = (Jsonb *) buffer.data;
*************** convertJsonbValue(StringInfo buffer, JEn
*** 1315,1324 ****
  		return;
  
  	/*
! 	 * A JsonbValue passed as val should never have a type of jbvBinary,
! 	 * and neither should any of its sub-components. Those values will be
! 	 * produced by convertJsonbArray and convertJsonbObject, the results of
! 	 * which will not be passed back to this function as an argument.
  	 */
  
  	if (IsAJsonbScalar(val))
--- 1398,1407 ----
  		return;
  
  	/*
! 	 * A JsonbValue passed as val should never have a type of jbvBinary, and
! 	 * neither should any of its sub-components. Those values will be produced
! 	 * by convertJsonbArray and convertJsonbObject, the results of which will
! 	 * not be passed back to this function as an argument.
  	 */
  
  	if (IsAJsonbScalar(val))
*************** convertJsonbArray(StringInfo buffer, JEn
*** 1340,1353 ****
  	int			totallen;
  	uint32		header;
  
! 	/* Initialize pointer into conversion buffer at this level */
  	offset = buffer->len;
  
  	padBufferToInt(buffer);
  
  	/*
! 	 * Construct the header Jentry, stored in the beginning of the variable-
! 	 * length payload.
  	 */
  	header = val->val.array.nElems | JB_FARRAY;
  	if (val->val.array.rawScalar)
--- 1423,1437 ----
  	int			totallen;
  	uint32		header;
  
! 	/* Remember where variable-length data starts for this array */
  	offset = buffer->len;
  
+ 	/* Align to 4-byte boundary (any padding counts as part of my data) */
  	padBufferToInt(buffer);
  
  	/*
! 	 * Construct the header Jentry and store it in the beginning of the
! 	 * variable-length payload.
  	 */
  	header = val->val.array.nElems | JB_FARRAY;
  	if (val->val.array.rawScalar)
*************** convertJsonbArray(StringInfo buffer, JEn
*** 1358,1364 ****
  	}
  
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
! 	/* reserve space for the JEntries of the elements. */
  	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.array.nElems);
  
  	totallen = 0;
--- 1442,1449 ----
  	}
  
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
! 
! 	/* Reserve space for the JEntries of the elements. */
  	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.array.nElems);
  
  	totallen = 0;
*************** convertJsonbArray(StringInfo buffer, JEn
*** 1368,1391 ****
  		int			len;
  		JEntry		meta;
  
  		convertJsonbValue(buffer, &meta, elem, level + 1);
- 		len = meta & JENTRY_POSMASK;
- 		totallen += len;
  
! 		if (totallen > JENTRY_POSMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 							JENTRY_POSMASK)));
  
- 		if (i > 0)
- 			meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
  
  	totallen = buffer->len - offset;
  
  	/* Initialize the header of this node, in the container's JEntry array */
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
--- 1453,1491 ----
  		int			len;
  		JEntry		meta;
  
+ 		/*
+ 		 * Convert element, producing a JEntry and appending its
+ 		 * variable-length data to buffer
+ 		 */
  		convertJsonbValue(buffer, &meta, elem, level + 1);
  
! 		/*
! 		 * Bail out if total variable-length data exceeds what will fit in a
! 		 * JEntry length field.  We check this in each iteration, not just
! 		 * once at the end, to forestall possible integer overflow.
! 		 */
! 		len = JBE_LENFLD(meta);
! 		totallen += len;
! 		if (totallen > JENTRY_LENMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 							JENTRY_LENMASK)));
  
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
  
+ 	/* Total data size is everything we've appended to buffer */
  	totallen = buffer->len - offset;
  
+ 	/* Check length again, since we didn't include the metadata above */
+ 	if (totallen > JENTRY_LENMASK)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ 				 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
+ 						JENTRY_LENMASK)));
+ 
  	/* Initialize the header of this node, in the container's JEntry array */
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
*************** convertJsonbArray(StringInfo buffer, JEn
*** 1393,1457 ****
  static void
  convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
  {
- 	uint32		header;
  	int			offset;
  	int			metaoffset;
  	int			i;
  	int			totallen;
  
! 	/* Initialize pointer into conversion buffer at this level */
  	offset = buffer->len;
  
  	padBufferToInt(buffer);
  
! 	/* Initialize header */
  	header = val->val.object.nPairs | JB_FOBJECT;
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
  
! 	/* reserve space for the JEntries of the keys and values */
  	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.object.nPairs * 2);
  
  	totallen = 0;
  	for (i = 0; i < val->val.object.nPairs; i++)
  	{
! 		JsonbPair *pair = &val->val.object.pairs[i];
! 		int len;
! 		JEntry meta;
  
! 		/* put key */
  		convertJsonbScalar(buffer, &meta, &pair->key);
  
! 		len = meta & JENTRY_POSMASK;
  		totallen += len;
  
- 		if (totallen > JENTRY_POSMASK)
- 			ereport(ERROR,
- 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
- 							JENTRY_POSMASK)));
- 
- 		if (i > 0)
- 			meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  
! 		convertJsonbValue(buffer, &meta, &pair->value, level);
! 		len = meta & JENTRY_POSMASK;
! 		totallen += len;
! 
! 		if (totallen > JENTRY_POSMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 							JENTRY_POSMASK)));
  
- 		meta = (meta & ~JENTRY_POSMASK) | totallen;
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  	}
  
  	totallen = buffer->len - offset;
  
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
  
--- 1493,1595 ----
  static void
  convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
  {
  	int			offset;
  	int			metaoffset;
  	int			i;
  	int			totallen;
+ 	uint32		header;
  
! 	/* Remember where variable-length data starts for this object */
  	offset = buffer->len;
  
+ 	/* Align to 4-byte boundary (any padding counts as part of my data) */
  	padBufferToInt(buffer);
  
! 	/*
! 	 * Construct the header Jentry and store it in the beginning of the
! 	 * variable-length payload.
! 	 */
  	header = val->val.object.nPairs | JB_FOBJECT;
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
  
! 	/* Reserve space for the JEntries of the keys and values. */
  	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.object.nPairs * 2);
  
+ 	/*
+ 	 * Iterate over the keys, then over the values, since that is the ordering
+ 	 * we want in the on-disk representation.
+ 	 */
  	totallen = 0;
  	for (i = 0; i < val->val.object.nPairs; i++)
  	{
! 		JsonbPair  *pair = &val->val.object.pairs[i];
! 		int			len;
! 		JEntry		meta;
  
! 		/*
! 		 * Convert key, producing a JEntry and appending its variable-length
! 		 * data to buffer
! 		 */
  		convertJsonbScalar(buffer, &meta, &pair->key);
  
! 		len = JBE_LENFLD(meta);
  		totallen += len;
  
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
  
! 		/*
! 		 * Bail out if total variable-length data exceeds what will fit in a
! 		 * JEntry length field.  We check this in each iteration, not just
! 		 * once at the end, to forestall possible integer overflow.
! 		 */
! 		if (totallen > JENTRY_LENMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
! 							JENTRY_LENMASK)));
! 	}
! 	for (i = 0; i < val->val.object.nPairs; i++)
! 	{
! 		JsonbPair  *pair = &val->val.object.pairs[i];
! 		int			len;
! 		JEntry		meta;
! 
! 		/*
! 		 * Convert value, producing a JEntry and appending its variable-length
! 		 * data to buffer
! 		 */
! 		convertJsonbValue(buffer, &meta, &pair->value, level + 1);
! 
! 		len = JBE_LENFLD(meta);
! 		totallen += len;
  
  		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
  		metaoffset += sizeof(JEntry);
+ 
+ 		/*
+ 		 * Bail out if total variable-length data exceeds what will fit in a
+ 		 * JEntry length field.  We check this in each iteration, not just
+ 		 * once at the end, to forestall possible integer overflow.
+ 		 */
+ 		if (totallen > JENTRY_LENMASK)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ 					 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+ 							JENTRY_LENMASK)));
  	}
  
+ 	/* Total data size is everything we've appended to buffer */
  	totallen = buffer->len - offset;
  
+ 	/* Check length again, since we didn't include the metadata above */
+ 	if (totallen > JENTRY_LENMASK)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ 				 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+ 						JENTRY_LENMASK)));
+ 
+ 	/* Initialize the header of this node, in the container's JEntry array */
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
  
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 91e3e14..f9472af 100644
*** a/src/include/utils/jsonb.h
--- b/src/include/utils/jsonb.h
*************** typedef struct JsonbValue JsonbValue;
*** 83,91 ****
   * buffer is accessed, but they can also be deep copied and passed around.
   *
   * Jsonb is a tree structure. Each node in the tree consists of a JEntry
!  * header, and a variable-length content.  The JEntry header indicates what
!  * kind of a node it is, e.g. a string or an array, and the offset and length
!  * of its variable-length portion within the container.
   *
   * The JEntry and the content of a node are not stored physically together.
   * Instead, the container array or object has an array that holds the JEntrys
--- 83,91 ----
   * buffer is accessed, but they can also be deep copied and passed around.
   *
   * Jsonb is a tree structure. Each node in the tree consists of a JEntry
!  * header and a variable-length content (possibly of zero size).  The JEntry
!  * header indicates what kind of a node it is, e.g. a string or an array,
!  * and includes the length of its variable-length portion.
   *
   * The JEntry and the content of a node are not stored physically together.
   * Instead, the container array or object has an array that holds the JEntrys
*************** typedef struct JsonbValue JsonbValue;
*** 95,133 ****
   * hold its JEntry. Hence, no JEntry header is stored for the root node.  It
   * is implicitly known that the root node must be an array or an object,
   * so we can get away without the type indicator as long as we can distinguish
!  * the two.  For that purpose, both an array and an object begins with a uint32
   * header field, which contains an JB_FOBJECT or JB_FARRAY flag.  When a naked
   * scalar value needs to be stored as a Jsonb value, what we actually store is
   * an array with one element, with the flags in the array's header field set
   * to JB_FSCALAR | JB_FARRAY.
   *
-  * To encode the length and offset of the variable-length portion of each
-  * node in a compact way, the JEntry stores only the end offset within the
-  * variable-length portion of the container node. For the first JEntry in the
-  * container's JEntry array, that equals to the length of the node data.  The
-  * begin offset and length of the rest of the entries can be calculated using
-  * the end offset of the previous JEntry in the array.
-  *
   * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
   * the variable-length portion of some node types is aligned to a 4-byte
   * boundary, while others are not. When alignment is needed, the padding is
   * in the beginning of the node that requires it. For example, if a numeric
   * node is stored after a string node, so that the numeric node begins at
   * offset 3, the variable-length portion of the numeric node will begin with
!  * one padding byte.
   */
  
  /*
   * Jentry format.
   *
!  * The least significant 28 bits store the end offset of the entry (see
!  * JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits
!  * are used to store the type of the entry. The most significant bit
!  * is unused, and should be set to zero.
   */
  typedef uint32 JEntry;
  
! #define JENTRY_POSMASK			0x0FFFFFFF
  #define JENTRY_TYPEMASK			0x70000000
  
  /* values stored in the type bits */
--- 95,126 ----
   * hold its JEntry. Hence, no JEntry header is stored for the root node.  It
   * is implicitly known that the root node must be an array or an object,
   * so we can get away without the type indicator as long as we can distinguish
!  * the two.  For that purpose, both an array and an object begin with a uint32
   * header field, which contains an JB_FOBJECT or JB_FARRAY flag.  When a naked
   * scalar value needs to be stored as a Jsonb value, what we actually store is
   * an array with one element, with the flags in the array's header field set
   * to JB_FSCALAR | JB_FARRAY.
   *
   * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
   * the variable-length portion of some node types is aligned to a 4-byte
   * boundary, while others are not. When alignment is needed, the padding is
   * in the beginning of the node that requires it. For example, if a numeric
   * node is stored after a string node, so that the numeric node begins at
   * offset 3, the variable-length portion of the numeric node will begin with
!  * one padding byte so that the actual numeric data is 4-byte aligned.
   */
  
  /*
   * Jentry format.
   *
!  * The least significant 28 bits store the data length of the entry (see
!  * JBE_LENFLD and JBE_LEN macros below). The next three bits store the type
!  * of the entry. The most significant bit is reserved for future use, and
!  * should be set to zero.
   */
  typedef uint32 JEntry;
  
! #define JENTRY_LENMASK			0x0FFFFFFF
  #define JENTRY_TYPEMASK			0x70000000
  
  /* values stored in the type bits */
*************** typedef uint32 JEntry;
*** 148,166 ****
  #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
  
  /*
!  * Macros for getting the offset and length of an element. Note multiple
!  * evaluations and access to prior array element.
   */
! #define JBE_ENDPOS(je_)			((je_) & JENTRY_POSMASK)
! #define JBE_OFF(ja, i)			((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
! #define JBE_LEN(ja, i)			((i) == 0 ? JBE_ENDPOS((ja)[i]) \
! 								 : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
  
  /*
   * A jsonb array or object node, within a Jsonb Datum.
   *
!  * An array has one child for each element. An object has two children for
!  * each key/value pair.
   */
  typedef struct JsonbContainer
  {
--- 141,160 ----
  #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
  
  /*
!  * Macros for getting the data length of a JEntry.
   */
! #define JBE_LENFLD(je_)			((je_) & JENTRY_LENMASK)
! #define JBE_LEN(ja, i)			JBE_LENFLD((ja)[i])
  
  /*
   * A jsonb array or object node, within a Jsonb Datum.
   *
!  * An array has one child for each element, stored in array order.
!  *
!  * An object has two children for each key/value pair.  The keys all appear
!  * first, in key sort order; then the values appear, in an order matching the
!  * key order.  This arrangement keeps the keys compact in memory, making a
!  * search for a particular key more cache-friendly.
   */
  typedef struct JsonbContainer
  {
*************** typedef struct JsonbContainer
*** 172,179 ****
  } JsonbContainer;
  
  /* flags for the header-field in JsonbContainer */
! #define JB_CMASK				0x0FFFFFFF
! #define JB_FSCALAR				0x10000000
  #define JB_FOBJECT				0x20000000
  #define JB_FARRAY				0x40000000
  
--- 166,173 ----
  } JsonbContainer;
  
  /* flags for the header-field in JsonbContainer */
! #define JB_CMASK				0x0FFFFFFF		/* mask for count field */
! #define JB_FSCALAR				0x10000000		/* flag bits */
  #define JB_FOBJECT				0x20000000
  #define JB_FARRAY				0x40000000
  
*************** struct JsonbValue
*** 248,265 ****
  									 (jsonbval)->type <= jbvBool)
  
  /*
!  * Pair within an Object.
   *
!  * Pairs with duplicate keys are de-duplicated.  We store the order for the
!  * benefit of doing so in a well-defined way with respect to the original
!  * observed order (which is "last observed wins").  This is only used briefly
!  * when originally constructing a Jsonb.
   */
  struct JsonbPair
  {
  	JsonbValue	key;			/* Must be a jbvString */
  	JsonbValue	value;			/* May be of any type */
! 	uint32		order;			/* preserves order of pairs with equal keys */
  };
  
  /* Conversion state used when parsing Jsonb from text, or for type coercion */
--- 242,261 ----
  									 (jsonbval)->type <= jbvBool)
  
  /*
!  * Key/value pair within an Object.
   *
!  * This struct type is only used briefly while constructing a Jsonb; it is
!  * *not* the on-disk representation.
!  *
!  * Pairs with duplicate keys are de-duplicated.  We store the originally
!  * observed pair ordering for the purpose of removing duplicates in a
!  * well-defined way (which is "last observed wins").
   */
  struct JsonbPair
  {
  	JsonbValue	key;			/* Must be a jbvString */
  	JsonbValue	value;			/* May be of any type */
! 	uint32		order;			/* Pair's index in original sequence */
  };
  
  /* Conversion state used when parsing Jsonb from text, or for type coercion */
*************** typedef struct JsonbIterator
*** 287,306 ****
  {
  	/* Container being iterated */
  	JsonbContainer *container;
! 	uint32		nElems;			/* Number of elements in children array (will be
! 								 * nPairs for objects) */
  	bool		isScalar;		/* Pseudo-array scalar value? */
! 	JEntry	   *children;
  
! 	/* Current item in buffer (up to nElems, but must * 2 for objects) */
! 	int			i;
  
  	/*
! 	 * Data proper.  This points just past end of children array.
! 	 * We use the JBE_OFF() macro on the Jentrys to find offsets of each
! 	 * child in this area.
  	 */
! 	char	   *dataProper;
  
  	/* Private state */
  	JsonbIterState state;
--- 283,307 ----
  {
  	/* Container being iterated */
  	JsonbContainer *container;
! 	uint32		nElems;			/* Number of elements in children array (will
! 								 * be nPairs for objects) */
  	bool		isScalar;		/* Pseudo-array scalar value? */
! 	JEntry	   *children;		/* JEntrys for child nodes */
! 	/* Data proper.  This points just past end of children array */
! 	char	   *dataProper;
  
! 	/* Current item in buffer (up to nElems) */
! 	int			curIndex;
! 
! 	/* Data offset corresponding to current item */
! 	uint32		curDataOffset;
  
  	/*
! 	 * If the container is an object, we want to return keys and values
! 	 * alternately; so curDataOffset points to the current key, and
! 	 * curValueOffset points to the current value.
  	 */
! 	uint32		curValueOffset;
  
  	/* Private state */
  	JsonbIterState state;
*************** extern Datum gin_consistent_jsonb_path(P
*** 344,349 ****
--- 345,351 ----
  extern Datum gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS);
  
  /* Support functions */
+ extern uint32 getJsonbOffset(const JEntry *ja, int index);
  extern int	compareJsonbContainers(JsonbContainer *a, JsonbContainer *b);
  extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader,
  							uint32 flags,
#105Arthur Silva
arthurprs@gmail.com
In reply to: Tom Lane (#104)
Re: jsonb format is pessimal for toast compression

Tom, here are the results with the GitHub data (8 top-level keys) only. Here's a
sample object: https://gist.github.com/igrigorik/2017462

All-Lengths + Cache-Aware EXTERNAL
Query 1: 516ms
Query 2: 350ms

The difference is small, but it's definitely faster, which makes sense since
cache-line misses are probably slightly reduced.
As in the previous runs, I ran each query a dozen times and took the average
after excluding runs with a high deviation.

compare to (copied from my previous email)

HEAD (aka, all offsets) EXTERNAL
Test query 1 runtime: 505ms
Test query 2 runtime: 350ms

All Lengths (Tom Lane patch) EXTERNAL
Test query 1 runtime: 525ms
Test query 2 runtime: 355ms

--
Arthur Silva

On Tue, Aug 26, 2014 at 7:11 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:


I wrote:

I wish it were cache-friendly too, per the upthread tangent about having
to fetch keys from all over the place within a large JSON object.

... and while I was typing that sentence, lightning struck. The existing
arrangement of object subfields with keys and values interleaved is just
plain dumb. We should rearrange that as all the keys in order, then all
the values in the same order. Then the keys are naturally adjacent in
memory and object-key searches become much more cache-friendly: you
probably touch most of the key portion of the object, but none of the
values portion, until you know exactly what part of the latter to fetch.
This approach might complicate the lookup logic marginally but I bet not
very much; and it will be a huge help if we ever want to do smart access
to EXTERNAL (non-compressed) JSON values.

I will go prototype that just to see how much code rearrangement is
required.

This looks pretty good from a coding point of view. I have not had time
yet to see if it affects the speed of the benchmark cases we've been
trying. I suspect that it won't make much difference in them. I think
if we do decide to make an on-disk format change, we should seriously
consider including this change.

The same concept could be applied to offset-based storage of course,
although I rather doubt that we'd make that combination of choices since
it would be giving up on-disk compatibility for benefits that are mostly
in the future.

Attached are two patches: one is a "delta" against the last jsonb-lengths
patch I posted, and the other is a "merged" patch showing the total change
from HEAD, for ease of application.

regards, tom lane
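
To make the keys-first idea concrete, here is a minimal stand-alone sketch.
It uses simplified, made-up structures (MiniObject, find_value, a plain
length array) rather than the actual JsonbContainer/JEntry format, and it
recomputes child offsets per probe where the patch caches them; it only
illustrates why a key search over such a layout touches nothing but the
keys region until the matching value's offset is known.

/*
 * Toy illustration, not the real jsonb on-disk format: an object with
 * npairs pairs stores 2*npairs length words (keys first, then values),
 * followed by all key bytes and then all value bytes.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

typedef struct MiniObject
{
    int         npairs;     /* number of key/value pairs */
    uint32_t   *lens;       /* 2*npairs lengths: keys, then values */
    const char *data;       /* key bytes, then value bytes, no separators */
} MiniObject;

/* offset of child i = sum of the preceding lengths (cf. getJsonbOffset) */
static uint32_t
child_offset(const MiniObject *obj, int index)
{
    uint32_t    off = 0;
    int         i;

    for (i = 0; i < index; i++)
        off += obj->lens[i];
    return off;
}

/* return the value paired with "key", or NULL if it is absent */
static const char *
find_value(const MiniObject *obj, const char *key, uint32_t *vallen)
{
    uint32_t    keylen = (uint32_t) strlen(key);
    int         lo = 0,
                hi = obj->npairs;

    while (lo < hi)
    {
        int         mid = lo + (hi - lo) / 2;
        const char *cand = obj->data + child_offset(obj, mid);
        uint32_t    candlen = obj->lens[mid];
        int         cmp;

        /* keys are kept sorted by length, then by bytes, as in jsonb */
        if (candlen != keylen)
            cmp = (candlen < keylen) ? -1 : 1;
        else
            cmp = memcmp(cand, key, keylen);

        if (cmp == 0)
        {
            int         validx = mid + obj->npairs;     /* matching value */

            *vallen = obj->lens[validx];
            return obj->data + child_offset(obj, validx);
        }
        if (cmp < 0)
            lo = mid + 1;
        else
            hi = mid;
    }
    return NULL;
}

int
main(void)
{
    /* {"a": "1", "bb": "22", "ccc": "333"} in keys-first storage */
    uint32_t    lens[] = {1, 2, 3, 1, 2, 3};
    MiniObject  obj = {3, lens, "abbccc122333"};
    uint32_t    vallen = 0;
    const char *val = find_value(&obj, "bb", &vallen);

    if (val != NULL)
        printf("bb -> %.*s\n", (int) vallen, val);
    return 0;
}

This prints "bb -> 22"; the search never dereferences into the values half
of the data area until the key comparison has succeeded, which is exactly
the access pattern that makes the layout cache-friendly.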

#106Peter Geoghegan
pg@heroku.com
In reply to: Arthur Silva (#105)
Re: jsonb format is pessimal for toast compression

On Tue, Aug 26, 2014 at 8:41 PM, Arthur Silva <arthurprs@gmail.com> wrote:

The difference is small but I's definitely faster, which makes sense since
cache line misses are probably slightly reduced.
As in the previous runs, I ran the query a dozen times and took the average
after excluding runs with a high deviation.

I'm not surprised that it hasn't beaten HEAD. I haven't studied the
problem in detail, but I don't think that the "cache awareness" of the
new revision is necessarily a distinct advantage.

--
Peter Geoghegan


#107Tom Lane
tgl@sss.pgh.pa.us
In reply to: Peter Geoghegan (#106)
Re: jsonb format is pessimal for toast compression

Peter Geoghegan <pg@heroku.com> writes:

I'm not surprised that it hasn't beaten HEAD. I haven't studied the
problem in detail, but I don't think that the "cache awareness" of the
new revision is necessarily a distinct advantage.

I doubt it's a significant advantage in the current state of the code;
I'm happy if it's not a loss. I was looking ahead to someday fetching key
values efficiently from large EXTERNAL (ie out-of-line-but-not-compressed)
JSON values, analogously to the existing optimization for fetching text
substrings from EXTERNAL text values. As mentioned upthread, the current
JSONB representation would be seriously unfriendly to such a thing.

regards, tom lane


#108Arthur Silva
arthurprs@gmail.com
In reply to: Peter Geoghegan (#106)
Re: jsonb format is pessimal for toast compression

It won't be faster by any means, but it should definitely be incorporated
if any format changes are made (like Tom already suggested).

I think it's important we gather at least two more things before making any
calls:
* Josh's tests with the cache-aware patch, which should confirm that the
cache-aware layout is indeed preferred
* Tests with TOAST hacked to use lz4 instead, which might ease any decisions

--
Arthur Silva

On Wed, Aug 27, 2014 at 12:53 AM, Peter Geoghegan <pg@heroku.com> wrote:


On Tue, Aug 26, 2014 at 8:41 PM, Arthur Silva <arthurprs@gmail.com> wrote:

The difference is small but I's definitely faster, which makes sense

since

cache line misses are probably slightly reduced.
As in the previous runs, I ran the query a dozen times and took the

average

after excluding runs with a high deviation.

I'm not surprised that it hasn't beaten HEAD. I haven't studied the
problem in detail, but I don't think that the "cache awareness" of the
new revision is necessarily a distinct advantage.

--
Peter Geoghegan

#109Arthur Silva
arthurprs@gmail.com
In reply to: Arthur Silva (#108)
1 attachment(s)
Re: jsonb format is pessimal for toast compression

On Wed, Aug 27, 2014 at 1:09 AM, Arthur Silva <arthurprs@gmail.com> wrote:

It won't be faster by any means, but it should definitely be incorporated
if any format changes are made (like Tom already suggested).

I think it's important we gather at least 2 more things before making any
calls:
* Josh tests w/ cache aware patch, which should confirm cache aware is
indeed prefered
* Tests with toast hacked to use lz4 instead, which might ease any
decisions

--
Arthur Silva

On Wed, Aug 27, 2014 at 12:53 AM, Peter Geoghegan <pg@heroku.com> wrote:

On Tue, Aug 26, 2014 at 8:41 PM, Arthur Silva <arthurprs@gmail.com>
wrote:

The difference is small but I's definitely faster, which makes sense

since

cache line misses are probably slightly reduced.
As in the previous runs, I ran the query a dozen times and took the

average

after excluding runs with a high deviation.

I'm not surprised that it hasn't beaten HEAD. I haven't studied the
problem in detail, but I don't think that the "cache awareness" of the
new revision is necessarily a distinct advantage.

--
Peter Geoghegan

I'm attaching a quick-and-dirty patch that uses lz4 compression instead of
pglz, in case someone wants to experiment with it. It seems to work in my
test environment; I'll run more tests when I get home.
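
For anyone who wants to poke at the lz4 path outside the backend first, the
following is a small stand-alone round-trip sketch using the same two
library calls the patch relies on (LZ4_compress_limitedOutput and
LZ4_uncompress, from the liblz4 of that vintage; build with -llz4). The
buffers and strings are made up for illustration; this is not part of the
patch.

/* stand-alone lz4 round trip, mirroring the calls used in lz4.patch */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <lz4.h>

int
main(void)
{
    const char *src = "jsonb jsonb jsonb jsonb jsonb jsonb jsonb jsonb";
    int         slen = (int) strlen(src);
    int         bound = LZ4_compressBound(slen);
    char       *compressed = malloc(bound);
    char       *roundtrip = malloc(slen + 1);
    int         csize;
    int         rres;

    /* returns 0 if the result would not fit within 'bound' bytes */
    csize = LZ4_compress_limitedOutput(src, compressed, slen, bound);
    if (csize == 0)
    {
        fprintf(stderr, "input not compressible within limit\n");
        return 1;
    }

    /*
     * Decompression needs the original length, which the patch takes from
     * PGLZ_Header.rawsize; a negative return means corrupt input.
     */
    rres = LZ4_uncompress(compressed, roundtrip, slen);
    if (rres < 0)
    {
        fprintf(stderr, "compressed data is corrupt\n");
        return 1;
    }
    roundtrip[slen] = '\0';

    printf("%d -> %d bytes, roundtrip %s\n", slen, csize,
           strcmp(src, roundtrip) == 0 ? "ok" : "MISMATCH");
    free(compressed);
    free(roundtrip);
    return 0;
}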

PS: gotta love Gmail's fixed default of top-posting...

Attachments:

lz4.patch (application/octet-stream)
diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile
index 8374533..44aa586 100644
--- a/src/backend/utils/Makefile
+++ b/src/backend/utils/Makefile
@@ -9,7 +9,7 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS        = fmgrtab.o
-SUBDIRS     = adt cache error fmgr hash init mb misc mmgr resowner sort time
+SUBDIRS     = adt cache compression error fmgr hash init mb misc mmgr resowner sort time
 
 # location of Catalog.pm
 catalogdir  = $(top_srcdir)/src/backend/catalog
diff --git a/src/backend/utils/adt/pg_lzcompress.c b/src/backend/utils/adt/pg_lzcompress.c
index fe08890..f918fa4 100644
--- a/src/backend/utils/adt/pg_lzcompress.c
+++ b/src/backend/utils/adt/pg_lzcompress.c
@@ -178,6 +178,7 @@
 
 #include "utils/pg_lzcompress.h"
 
+#include "compression/lz4.h"
 
 /* ----------
  * Local definitions
@@ -499,26 +500,10 @@ bool
 pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 			  const PGLZ_Strategy *strategy)
 {
-	unsigned char *bp = ((unsigned char *) dest) + sizeof(PGLZ_Header);
-	unsigned char *bstart = bp;
-	int			hist_next = 1;
-	bool		hist_recycle = false;
-	const char *dp = source;
-	const char *dend = source + slen;
-	unsigned char ctrl_dummy = 0;
-	unsigned char *ctrlp = &ctrl_dummy;
-	unsigned char ctrlb = 0;
-	unsigned char ctrl = 0;
-	bool		found_match = false;
-	int32		match_len;
-	int32		match_off;
-	int32		good_match;
-	int32		good_drop;
-	int32		result_size;
-	int32		result_max;
-	int32		need_rate;
-	int			hashsz;
-	int			mask;
+	char *bp = ((unsigned char *) dest) + sizeof(PGLZ_Header);
+	int result_size;
+	int32 need_rate = strategy->min_comp_rate;
+	int32 result_max;
 
 	/*
 	 * Our fallback strategy is the default.
@@ -535,27 +520,6 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 		slen > strategy->max_input_size)
 		return false;
 
-	/*
-	 * Save the original source size in the header.
-	 */
-	dest->rawsize = slen;
-
-	/*
-	 * Limit the match parameters to the supported range.
-	 */
-	good_match = strategy->match_size_good;
-	if (good_match > PGLZ_MAX_MATCH)
-		good_match = PGLZ_MAX_MATCH;
-	else if (good_match < 17)
-		good_match = 17;
-
-	good_drop = strategy->match_size_drop;
-	if (good_drop < 0)
-		good_drop = 0;
-	else if (good_drop > 100)
-		good_drop = 100;
-
-	need_rate = strategy->min_comp_rate;
 	if (need_rate < 0)
 		need_rate = 0;
 	else if (need_rate > 99)
@@ -563,7 +527,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 
 	/*
 	 * Compute the maximum result size allowed by the strategy, namely the
-	 * input size minus the minimum wanted compression rate.  This had better
+	 * input size minus the minimum wanted compression rate. This had better
 	 * be <= slen, else we might overrun the provided output buffer.
 	 */
 	if (slen > (INT_MAX / 100))
@@ -575,95 +539,12 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 		result_max = (slen * (100 - need_rate)) / 100;
 
 	/*
-	 * Experiments suggest that these hash sizes work pretty well. A large
-	 * hash table minimizes collision, but has a higher startup cost. For a
-	 * small input, the startup cost dominates. The table size must be a power
-	 * of two.
-	 */
-	if (slen < 128)
-		hashsz = 512;
-	else if (slen < 256)
-		hashsz = 1024;
-	else if (slen < 512)
-		hashsz = 2048;
-	else if (slen < 1024)
-		hashsz = 4096;
-	else
-		hashsz = 8192;
-	mask = hashsz - 1;
-
-	/*
-	 * Initialize the history lists to empty.  We do not need to zero the
-	 * hist_entries[] array; its entries are initialized as they are used.
-	 */
-	memset(hist_start, 0, hashsz * sizeof(int16));
-
-	/*
-	 * Compress the source directly into the output buffer.
+	 * Save the original source size in the header.
 	 */
-	while (dp < dend)
-	{
-		/*
-		 * If we already exceeded the maximum result size, fail.
-		 *
-		 * We check once per loop; since the loop body could emit as many as 4
-		 * bytes (a control byte and 3-byte tag), PGLZ_MAX_OUTPUT() had better
-		 * allow 4 slop bytes.
-		 */
-		if (bp - bstart >= result_max)
-			return false;
-
-		/*
-		 * If we've emitted more than first_success_by bytes without finding
-		 * anything compressible at all, fail.  This lets us fall out
-		 * reasonably quickly when looking at incompressible input (such as
-		 * pre-compressed data).
-		 */
-		if (!found_match && bp - bstart >= strategy->first_success_by)
-			return false;
-
-		/*
-		 * Try to find a match in the history
-		 */
-		if (pglz_find_match(hist_start, dp, dend, &match_len,
-							&match_off, good_match, good_drop, mask))
-		{
-			/*
-			 * Create the tag and add history entries for all matched
-			 * characters.
-			 */
-			pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off);
-			while (match_len--)
-			{
-				pglz_hist_add(hist_start, hist_entries,
-							  hist_next, hist_recycle,
-							  dp, dend, mask);
-				dp++;			/* Do not do this ++ in the line above! */
-				/* The macro would do it four times - Jan.  */
-			}
-			found_match = true;
-		}
-		else
-		{
-			/*
-			 * No match found. Copy one literal byte.
-			 */
-			pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
-			pglz_hist_add(hist_start, hist_entries,
-						  hist_next, hist_recycle,
-						  dp, dend, mask);
-			dp++;				/* Do not do this ++ in the line above! */
-			/* The macro would do it four times - Jan.  */
-		}
-	}
+	dest->rawsize = slen;
 
-	/*
-	 * Write out the last control byte and check that we haven't overrun the
-	 * output size allowed by the strategy.
-	 */
-	*ctrlp = ctrlb;
-	result_size = bp - bstart;
-	if (result_size >= result_max)
+	result_size = LZ4_compress_limitedOutput(source, bp, slen, result_max);
+	if (!result_size)
 		return false;
 
 	/*
@@ -684,93 +565,13 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 void
 pglz_decompress(const PGLZ_Header *source, char *dest)
 {
-	const unsigned char *sp;
-	const unsigned char *srcend;
-	unsigned char *dp;
-	unsigned char *destend;
-
-	sp = ((const unsigned char *) source) + sizeof(PGLZ_Header);
-	srcend = ((const unsigned char *) source) + VARSIZE(source);
-	dp = (unsigned char *) dest;
-	destend = dp + source->rawsize;
-
-	while (sp < srcend && dp < destend)
-	{
-		/*
-		 * Read one control byte and process the next 8 items (or as many as
-		 * remain in the compressed input).
-		 */
-		unsigned char ctrl = *sp++;
-		int			ctrlc;
-
-		for (ctrlc = 0; ctrlc < 8 && sp < srcend; ctrlc++)
-		{
-			if (ctrl & 1)
-			{
-				/*
-				 * Otherwise it contains the match length minus 3 and the
-				 * upper 4 bits of the offset. The next following byte
-				 * contains the lower 8 bits of the offset. If the length is
-				 * coded as 18, another extension tag byte tells how much
-				 * longer the match really was (0-255).
-				 */
-				int32		len;
-				int32		off;
-
-				len = (sp[0] & 0x0f) + 3;
-				off = ((sp[0] & 0xf0) << 4) | sp[1];
-				sp += 2;
-				if (len == 18)
-					len += *sp++;
-
-				/*
-				 * Check for output buffer overrun, to ensure we don't clobber
-				 * memory in case of corrupt input.  Note: we must advance dp
-				 * here to ensure the error is detected below the loop.  We
-				 * don't simply put the elog inside the loop since that will
-				 * probably interfere with optimization.
-				 */
-				if (dp + len > destend)
-				{
-					dp += len;
-					break;
-				}
-
-				/*
-				 * Now we copy the bytes specified by the tag from OUTPUT to
-				 * OUTPUT. It is dangerous and platform dependent to use
-				 * memcpy() here, because the copied areas could overlap
-				 * extremely!
-				 */
-				while (len--)
-				{
-					*dp = dp[-off];
-					dp++;
-				}
-			}
-			else
-			{
-				/*
-				 * An unset control bit means LITERAL BYTE. So we just copy
-				 * one from INPUT to OUTPUT.
-				 */
-				if (dp >= destend)		/* check for buffer overrun */
-					break;		/* do not clobber memory */
-
-				*dp++ = *sp++;
-			}
-
-			/*
-			 * Advance the control bit
-			 */
-			ctrl >>= 1;
-		}
-	}
+	const char *sp = ((const char *) source) + sizeof(PGLZ_Header);
+	int lz4_result = LZ4_uncompress(sp, dest, source->rawsize);
 
 	/*
 	 * Check we decompressed the right amount.
 	 */
-	if (dp != destend || sp != srcend)
+	if (lz4_result < 0)
 		elog(ERROR, "compressed data is corrupt");
 
 	/*
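
To make the new code path concrete, here is a minimal standalone sketch of the
round trip that the reworked pglz_compress()/pglz_decompress() delegate to.
It is illustrative only and not part of the patch: the test string, buffer
sizes and main() harness are invented, and the include path assumes the new
src/include/compression/lz4.h added below.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "compression/lz4.h"

int
main(void)
{
	const char *src = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbb";
	int			slen = (int) strlen(src);
	int			result_max = slen - 1;	/* demand at least one byte saved */
	char	   *compressed = malloc(slen);
	char	   *restored = malloc(slen + 1);
	int			csize;
	int			in_read;

	/* Returns 0 if the output cannot fit within result_max bytes. */
	csize = LZ4_compress_limitedOutput(src, compressed, slen, result_max);
	if (csize == 0)
	{
		printf("not compressible enough, would be stored uncompressed\n");
		return 1;
	}

	/*
	 * LZ4_uncompress() needs the original (raw) length, which the patch
	 * takes from PGLZ_Header.rawsize; a negative result means corrupt data.
	 */
	in_read = LZ4_uncompress(compressed, restored, slen);
	if (in_read < 0)
	{
		printf("compressed data is corrupt\n");
		return 1;
	}
	restored[slen] = '\0';
	printf("%d bytes -> %d compressed, round trip ok = %d\n",
		   slen, csize, strcmp(src, restored) == 0);

	free(compressed);
	free(restored);
	return 0;
}

The error paths mirror the patched functions: a zero return from the
compressor means the minimum compression rate was not met, and a negative
return from the decompressor is reported as corruption.
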
diff --git a/src/backend/utils/compression/Makefile b/src/backend/utils/compression/Makefile
new file mode 100644
index 0000000..933e162
--- /dev/null
+++ b/src/backend/utils/compression/Makefile
@@ -0,0 +1,17 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for utils/compression
+#
+# IDENTIFICATION
+#    src/backend/utils/compression/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/utils/compression
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = lz4.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/compression/lz4.c b/src/backend/utils/compression/lz4.c
new file mode 100644
index 0000000..841a2f4
--- /dev/null
+++ b/src/backend/utils/compression/lz4.c
@@ -0,0 +1,1249 @@
+/*
+   LZ4 - Fast LZ compression algorithm
+   Copyright (C) 2011-2014, Yann Collet.
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - LZ4 source repository : http://code.google.com/p/lz4/
+   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/**************************************
+   Tuning parameters
+**************************************/
+/*
+ * HEAPMODE :
+ * Select how default compression functions will allocate memory for their hash table,
+ * in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)).
+ */
+#define HEAPMODE 0
+
+
+/**************************************
+   CPU Feature Detection
+**************************************/
+/* 32 or 64 bits ? */
+#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \
+  || defined(__powerpc64__) || defined(__powerpc64le__) \
+  || defined(__ppc64__) || defined(__ppc64le__) \
+  || defined(__PPC64__) || defined(__PPC64LE__) \
+  || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) )   /* Detects 64 bits mode */
+#  define LZ4_ARCH64 1
+#else
+#  define LZ4_ARCH64 0
+#endif
+#define LZ4_32BITS (sizeof(void*)==4)
+#define LZ4_64BITS (sizeof(void*)==8)
+
+/*
+ * Little Endian or Big Endian ?
+ * Overwrite the #define below if you know your architecture endianness
+ */
+#include <stdlib.h>   /* Apparently required to detect endianness */
+#if defined (__GLIBC__)
+#  include <endian.h>
+#  if (__BYTE_ORDER == __BIG_ENDIAN)
+#     define LZ4_BIG_ENDIAN 1
+#  endif
+#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
+#  define LZ4_BIG_ENDIAN 1
+#elif defined(__sparc) || defined(__sparc__) \
+   || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
+   || defined(__hpux)  || defined(__hppa) \
+   || defined(_MIPSEB) || defined(__s390__)
+#  define LZ4_BIG_ENDIAN 1
+#else
+/* Little Endian assumed. PDP Endian and other very rare endian format are unsupported. */
+#endif
+
+/*
+ * Unaligned memory access is automatically enabled for "common" CPU, such as x86.
+ * For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property
+ * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance
+ */
+#if defined(__ARM_FEATURE_UNALIGNED)
+#  define LZ4_FORCE_UNALIGNED_ACCESS 1
+#endif
+
+/* Define this parameter if your target system or compiler does not support hardware bit count */
+#if defined(_MSC_VER) && defined(_WIN32_WCE)   /* Visual Studio for Windows CE does not support Hardware bit count */
+#  define LZ4_FORCE_SW_BITCOUNT
+#endif
+
+/*
+ * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE :
+ * This option may provide a small boost to performance for some big endian cpu, although probably modest.
+ * You may set this option to 1 if data will remain within closed environment.
+ * This option is useless on Little_Endian CPU (such as x86)
+ */
+
+/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */
+
+
+/**************************************
+ Compiler Options
+**************************************/
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
+/* "restrict" is a known keyword */
+#else
+#  define restrict /* Disable restrict */
+#endif
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  if LZ4_ARCH64   /* 64-bits */
+#    pragma intrinsic(_BitScanForward64) /* For Visual 2005 */
+#    pragma intrinsic(_BitScanReverse64) /* For Visual 2005 */
+#  else            /* 32-bits */
+#    pragma intrinsic(_BitScanForward)   /* For Visual 2005 */
+#    pragma intrinsic(_BitScanReverse)   /* For Visual 2005 */
+#  endif
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#else
+#  ifdef __GNUC__
+#    define FORCE_INLINE static inline __attribute__((always_inline))
+#  else
+#    define FORCE_INLINE static inline
+#  endif
+#endif
+
+#ifdef _MSC_VER  /* Visual Studio */
+#  define lz4_bswap16(x) _byteswap_ushort(x)
+#else
+#  define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8)))
+#endif
+
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
+#  define expect(expr,value)    (__builtin_expect ((expr),(value)) )
+#else
+#  define expect(expr,value)    (expr)
+#endif
+
+#define likely(expr)     expect((expr) != 0, 1)
+#define unlikely(expr)   expect((expr) != 0, 0)
+
+
+/**************************************
+   Memory routines
+**************************************/
+#include <stdlib.h>   /* malloc, calloc, free */
+#define ALLOCATOR(n,s) calloc(n,s)
+#define FREEMEM        free
+#include <string.h>   /* memset, memcpy */
+#define MEM_INIT       memset
+
+
+/**************************************
+   Includes
+**************************************/
+#include "compression/lz4.h"
+
+
+/**************************************
+   Basic Types
+**************************************/
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+#endif
+
+#if defined(__GNUC__)  && !defined(LZ4_FORCE_UNALIGNED_ACCESS)
+#  define _PACKED __attribute__ ((packed))
+#else
+#  define _PACKED
+#endif
+
+#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+#  if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+#    pragma pack(1)
+#  else
+#    pragma pack(push, 1)
+#  endif
+#endif
+
+typedef struct { U16 v; }  _PACKED U16_S;
+typedef struct { U32 v; }  _PACKED U32_S;
+typedef struct { U64 v; }  _PACKED U64_S;
+typedef struct {size_t v;} _PACKED size_t_S;
+
+#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+#  if defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+#    pragma pack(0)
+#  else
+#    pragma pack(pop)
+#  endif
+#endif
+
+#define A16(x)   (((U16_S *)(x))->v)
+#define A32(x)   (((U32_S *)(x))->v)
+#define A64(x)   (((U64_S *)(x))->v)
+#define AARCH(x) (((size_t_S *)(x))->v)
+
+
+/**************************************
+   Constants
+**************************************/
+#define LZ4_HASHLOG   (LZ4_MEMORY_USAGE-2)
+#define HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
+#define HASH_SIZE_U32 (1 << LZ4_HASHLOG)
+
+#define MINMATCH 4
+
+#define COPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH+MINMATCH)
+static const int LZ4_minLength = (MFLIMIT+1);
+
+#define KB *(1U<<10)
+#define MB *(1U<<20)
+#define GB *(1U<<30)
+
+#define LZ4_64KLIMIT ((64 KB) + (MFLIMIT-1))
+#define SKIPSTRENGTH 6   /* Increasing this value will make the compression run slower on incompressible data */
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define ML_BITS  4
+#define ML_MASK  ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+
+/**************************************
+   Structures and local types
+**************************************/
+typedef struct {
+    U32  hashTable[HASH_SIZE_U32];
+    U32  currentOffset;
+    U32  initCheck;
+    const BYTE* dictionary;
+    const BYTE* bufferStart;
+    U32  dictSize;
+} LZ4_stream_t_internal;
+
+typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive;
+typedef enum { byPtr, byU32, byU16 } tableType_t;
+
+typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive;
+typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
+
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
+typedef enum { full = 0, partial = 1 } earlyEnd_directive;
+
+
+/**************************************
+   Architecture-specific macros
+**************************************/
+#define STEPSIZE                  sizeof(size_t)
+#define LZ4_COPYSTEP(d,s)         { AARCH(d) = AARCH(s); d+=STEPSIZE; s+=STEPSIZE; }
+#define LZ4_COPY8(d,s)            { LZ4_COPYSTEP(d,s); if (STEPSIZE<8) LZ4_COPYSTEP(d,s); }
+
+#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
+#  define LZ4_READ_LITTLEENDIAN_16(d,s,p) { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
+#  define LZ4_WRITE_LITTLEENDIAN_16(p,i)  { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p+=2; }
+#else      /* Little Endian */
+#  define LZ4_READ_LITTLEENDIAN_16(d,s,p) { d = (s) - A16(p); }
+#  define LZ4_WRITE_LITTLEENDIAN_16(p,v)  { A16(p) = v; p+=2; }
+#endif
+
+
+/**************************************
+   Macros
+**************************************/
+#define LZ4_STATIC_ASSERT(c)    { enum { LZ4_static_assert = 1/(!!(c)) }; }   /* use only *after* variable declarations */
+#if LZ4_ARCH64 || !defined(__GNUC__)
+#  define LZ4_WILDCOPY(d,s,e)   { do { LZ4_COPY8(d,s) } while (d<e); }        /* at the end, d>=e; */
+#else
+#  define LZ4_WILDCOPY(d,s,e)   { if (likely(e-d <= 8)) LZ4_COPY8(d,s) else do { LZ4_COPY8(d,s) } while (d<e); }
+#endif
+
+
+/****************************
+   Private local functions
+****************************/
+#if LZ4_ARCH64
+
+static int LZ4_NbCommonBytes (register U64 val)
+{
+# if defined(LZ4_BIG_ENDIAN)
+#   if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+    unsigned long r = 0;
+    _BitScanReverse64( &r, val );
+    return (int)(r>>3);
+#   elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+    return (__builtin_clzll(val) >> 3);
+#   else
+    int r;
+    if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
+    if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+    r += (!val);
+    return r;
+#   endif
+# else
+#   if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+    unsigned long r = 0;
+    _BitScanForward64( &r, val );
+    return (int)(r>>3);
+#   elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+    return (__builtin_ctzll(val) >> 3);
+#   else
+    static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+    return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+#   endif
+# endif
+}
+
+#else
+
+static int LZ4_NbCommonBytes (register U32 val)
+{
+# if defined(LZ4_BIG_ENDIAN)
+#   if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+    unsigned long r = 0;
+    _BitScanReverse( &r, val );
+    return (int)(r>>3);
+#   elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+    return (__builtin_clz(val) >> 3);
+#   else
+    int r;
+    if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+    r += (!val);
+    return r;
+#   endif
+# else
+#   if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+    unsigned long r;
+    _BitScanForward( &r, val );
+    return (int)(r>>3);
+#   elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+    return (__builtin_ctz(val) >> 3);
+#   else
+    static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+    return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+#   endif
+# endif
+}
+
+#endif
+
+
+/********************************
+   Compression functions
+********************************/
+int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; }
+int LZ4_compressBound(int isize)  { return LZ4_COMPRESSBOUND(isize); }
+
+static int LZ4_hashSequence(U32 sequence, tableType_t tableType)
+{
+    if (tableType == byU16)
+        return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1)));
+    else
+        return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG));
+}
+
+static int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); }
+
+static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    switch (tableType)
+    {
+    case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; }
+    case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; }
+    case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; }
+    }
+}
+
+static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    U32 h = LZ4_hashPosition(p, tableType);
+    LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
+}
+
+static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; }
+    if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; }
+    { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; }   /* default, to ensure a return */
+}
+
+static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    U32 h = LZ4_hashPosition(p, tableType);
+    return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase);
+}
+
+static unsigned LZ4_count(const BYTE* pIn, const BYTE* pRef, const BYTE* pInLimit)
+{
+    const BYTE* const pStart = pIn;
+
+    while (likely(pIn<pInLimit-(STEPSIZE-1)))
+    {
+        size_t diff = AARCH(pRef) ^ AARCH(pIn);
+        if (!diff) { pIn+=STEPSIZE; pRef+=STEPSIZE; continue; }
+        pIn += LZ4_NbCommonBytes(diff);
+        return (unsigned)(pIn - pStart);
+    }
+    if (LZ4_64BITS) if ((pIn<(pInLimit-3)) && (A32(pRef) == A32(pIn))) { pIn+=4; pRef+=4; }
+    if ((pIn<(pInLimit-1)) && (A16(pRef) == A16(pIn))) { pIn+=2; pRef+=2; }
+    if ((pIn<pInLimit) && (*pRef == *pIn)) pIn++;
+
+    return (unsigned)(pIn - pStart);
+}
+
+
+static int LZ4_compress_generic(
+                 void* ctx,
+                 const char* source,
+                 char* dest,
+                 int inputSize,
+                 int maxOutputSize,
+
+                 limitedOutput_directive outputLimited,
+                 tableType_t tableType,
+                 dict_directive dict,
+                 dictIssue_directive dictIssue)
+{
+    LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx;
+
+    const BYTE* ip = (const BYTE*) source;
+    const BYTE* base;
+    const BYTE* lowLimit;
+    const BYTE* const lowRefLimit = ip - dictPtr->dictSize;
+    const BYTE* const dictionary = dictPtr->dictionary;
+    const BYTE* const dictEnd = dictionary + dictPtr->dictSize;
+    const size_t dictDelta = dictEnd - (const BYTE*)source;
+    const BYTE* anchor = (const BYTE*) source;
+    const BYTE* const iend = ip + inputSize;
+    const BYTE* const mflimit = iend - MFLIMIT;
+    const BYTE* const matchlimit = iend - LASTLITERALS;
+
+    BYTE* op = (BYTE*) dest;
+    BYTE* const olimit = op + maxOutputSize;
+
+    const int skipStrength = SKIPSTRENGTH;
+    U32 forwardH;
+    size_t refDelta=0;
+
+    /* Init conditions */
+    if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0;          /* Unsupported input size, too large (or negative) */
+    switch(dict)
+    {
+    case noDict:
+    default:
+        base = (const BYTE*)source;
+        lowLimit = (const BYTE*)source;
+        break;
+    case withPrefix64k:
+        base = (const BYTE*)source - dictPtr->currentOffset;
+        lowLimit = (const BYTE*)source - dictPtr->dictSize;
+        break;
+    case usingExtDict:
+        base = (const BYTE*)source - dictPtr->currentOffset;
+        lowLimit = (const BYTE*)source;
+        break;
+    }
+    if ((tableType == byU16) && (inputSize>=(int)LZ4_64KLIMIT)) return 0;   /* Size too large (not within 64K limit) */
+    if (inputSize<LZ4_minLength) goto _last_literals;                       /* Input too small, no compression (all literals) */
+
+    /* First Byte */
+    LZ4_putPosition(ip, ctx, tableType, base);
+    ip++; forwardH = LZ4_hashPosition(ip, tableType);
+
+    /* Main Loop */
+    for ( ; ; )
+    {
+        const BYTE* ref;
+        BYTE* token;
+        {
+            const BYTE* forwardIp = ip;
+            unsigned step=1;
+            unsigned searchMatchNb = (1U << skipStrength);
+
+            /* Find a match */
+            do {
+                U32 h = forwardH;
+                ip = forwardIp;
+                forwardIp += step;
+                step = searchMatchNb++ >> skipStrength;
+
+                if (unlikely(forwardIp > mflimit)) goto _last_literals;
+
+                ref = LZ4_getPositionOnHash(h, ctx, tableType, base);
+                if (dict==usingExtDict)
+                {
+                    if (ref<(const BYTE*)source)
+                    {
+                        refDelta = dictDelta;
+                        lowLimit = dictionary;
+                    }
+                    else
+                    {
+                        refDelta = 0;
+                        lowLimit = (const BYTE*)source;
+                    }
+                }
+                forwardH = LZ4_hashPosition(forwardIp, tableType);
+                LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
+
+            } while ( ((dictIssue==dictSmall) ? (ref < lowRefLimit) : 0)
+                || ((tableType==byU16) ? 0 : (ref + MAX_DISTANCE < ip))
+                || (A32(ref+refDelta) != A32(ip)) );
+        }
+
+        /* Catch up */
+        while ((ip>anchor) && (ref+refDelta > lowLimit) && (unlikely(ip[-1]==ref[refDelta-1]))) { ip--; ref--; }
+
+        {
+            /* Encode Literal length */
+            unsigned litLength = (unsigned)(ip - anchor);
+            token = op++;
+            if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)))
+                return 0;   /* Check output limit */
+            if (litLength>=RUN_MASK)
+            {
+                int len = (int)litLength-RUN_MASK;
+                *token=(RUN_MASK<<ML_BITS);
+                for(; len >= 255 ; len-=255) *op++ = 255;
+                *op++ = (BYTE)len;
+            }
+            else *token = (BYTE)(litLength<<ML_BITS);
+
+            /* Copy Literals */
+            { BYTE* end = op+litLength; LZ4_WILDCOPY(op,anchor,end); op=end; }
+        }
+
+_next_match:
+        /* Encode Offset */
+        LZ4_WRITE_LITTLEENDIAN_16(op, (U16)(ip-ref));
+
+        /* Encode MatchLength */
+        {
+            unsigned matchLength;
+
+            if ((dict==usingExtDict) && (lowLimit==dictionary))
+            {
+                const BYTE* limit;
+                ref += refDelta;
+                limit = ip + (dictEnd-ref);
+                if (limit > matchlimit) limit = matchlimit;
+                matchLength = LZ4_count(ip+MINMATCH, ref+MINMATCH, limit);
+                ip += MINMATCH + matchLength;
+                if (ip==limit)
+                {
+                    unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit);
+                    matchLength += more;
+                    ip += more;
+                }
+            }
+            else
+            {
+                matchLength = LZ4_count(ip+MINMATCH, ref+MINMATCH, matchlimit);
+                ip += MINMATCH + matchLength;
+            }
+
+            if (matchLength>=ML_MASK)
+            {
+                if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit)))
+                    return 0;    /* Check output limit */
+                *token += ML_MASK;
+                matchLength -= ML_MASK;
+                for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; }
+                if (matchLength >= 255) { matchLength-=255; *op++ = 255; }
+                *op++ = (BYTE)matchLength;
+            }
+            else *token += (BYTE)(matchLength);
+        }
+
+        anchor = ip;
+
+        /* Test end of chunk */
+        if (ip > mflimit) break;
+
+        /* Fill table */
+        LZ4_putPosition(ip-2, ctx, tableType, base);
+
+        /* Test next position */
+        ref = LZ4_getPosition(ip, ctx, tableType, base);
+        if (dict==usingExtDict)
+        {
+            if (ref<(const BYTE*)source)
+            {
+                refDelta = dictDelta;
+                lowLimit = dictionary;
+            }
+            else
+            {
+                refDelta = 0;
+                lowLimit = (const BYTE*)source;
+            }
+        }
+        LZ4_putPosition(ip, ctx, tableType, base);
+        if ( ((dictIssue==dictSmall) ? (ref>=lowRefLimit) : 1)
+            && (ref+MAX_DISTANCE>=ip)
+            && (A32(ref+refDelta)==A32(ip)) )
+        { token=op++; *token=0; goto _next_match; }
+
+        /* Prepare next loop */
+        forwardH = LZ4_hashPosition(++ip, tableType);
+    }
+
+_last_literals:
+    /* Encode Last Literals */
+    {
+        int lastRun = (int)(iend - anchor);
+        if ((outputLimited) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize))
+            return 0;   /* Check output limit */
+        if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<<ML_BITS); lastRun-=RUN_MASK; for(; lastRun >= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; }
+        else *op++ = (BYTE)(lastRun<<ML_BITS);
+        memcpy(op, anchor, iend - anchor);
+        op += iend-anchor;
+    }
+
+    /* End */
+    return (int) (((char*)op)-dest);
+}
+
+
+int LZ4_compress(const char* source, char* dest, int inputSize)
+{
+#if (HEAPMODE)
+    void* ctx = ALLOCATOR(LZ4_STREAMSIZE_U32, 4);   /* Aligned on 4-bytes boundaries */
+#else
+    U32 ctx[LZ4_STREAMSIZE_U32] = {0};      /* Ensure data is aligned on 4-bytes boundaries */
+#endif
+    int result;
+
+    if (inputSize < (int)LZ4_64KLIMIT)
+        result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue);
+    else
+        result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, 0, notLimited, LZ4_64BITS ? byU32 : byPtr, noDict, noDictIssue);
+
+#if (HEAPMODE)
+    FREEMEM(ctx);
+#endif
+    return result;
+}
+
+int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+#if (HEAPMODE)
+    void* ctx = ALLOCATOR(LZ4_STREAMSIZE_U32, 4);   /* Aligned on 4-bytes boundaries */
+#else
+    U32 ctx[LZ4_STREAMSIZE_U32] = {0};      /* Ensure data is aligned on 4-bytes boundaries */
+#endif
+    int result;
+
+    if (inputSize < (int)LZ4_64KLIMIT)
+        result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue);
+    else
+        result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64BITS ? byU32 : byPtr, noDict, noDictIssue);
+
+#if (HEAPMODE)
+    FREEMEM(ctx);
+#endif
+    return result;
+}
+
+
+/*****************************************
+   Experimental : Streaming functions
+*****************************************/
+
+/*
+ * LZ4_resetStream
+ * Use this function once, to init a newly allocated LZ4_stream_t structure
+ */
+void LZ4_resetStream (LZ4_stream_t* LZ4_stream)
+{
+    MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t));
+}
+
+LZ4_stream_t* LZ4_createStream(void)
+{
+    LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(4, LZ4_STREAMSIZE_U32);
+    LZ4_resetStream(lz4s);
+    return lz4s;
+}
+
+int LZ4_freeStream (LZ4_stream_t* LZ4_stream)
+{
+    FREEMEM(LZ4_stream);
+    return (0);
+}
+
+
+int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize)
+{
+    LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
+    const BYTE* p = (const BYTE*)dictionary;
+    const BYTE* const dictEnd = p + dictSize;
+    const BYTE* base;
+
+    LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal));    /* A compilation error here means LZ4_STREAMSIZE is not large enough */
+    if (dict->initCheck) LZ4_resetStream(LZ4_dict);                         /* Uninitialized structure detected */
+
+    if (dictSize < MINMATCH)
+    {
+        dict->dictionary = NULL;
+        dict->dictSize = 0;
+        return 1;
+    }
+
+    if (p <= dictEnd - 64 KB) p = dictEnd - 64 KB;
+    base = p - dict->currentOffset;
+    dict->dictionary = p;
+    dict->dictSize = (U32)(dictEnd - p);
+    dict->currentOffset += dict->dictSize;
+
+    while (p <= dictEnd-MINMATCH)
+    {
+        LZ4_putPosition(p, dict, byU32, base);
+        p+=3;
+    }
+
+    return 1;
+}
+
+
+static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src)
+{
+    if ((LZ4_dict->currentOffset > 0x80000000) ||
+        ((size_t)LZ4_dict->currentOffset > (size_t)src))   /* address space overflow */
+    {
+        /* rescale hash table */
+        U32 delta = LZ4_dict->currentOffset - 64 KB;
+        const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize;
+        int i;
+        for (i=0; i<HASH_SIZE_U32; i++)
+        {
+            if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0;
+            else LZ4_dict->hashTable[i] -= delta;
+        }
+        LZ4_dict->currentOffset = 64 KB;
+        if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB;
+        LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize;
+    }
+}
+
+
+FORCE_INLINE int LZ4_compress_continue_generic (void* LZ4_stream, const char* source, char* dest, int inputSize,
+                                                int maxOutputSize, limitedOutput_directive limit)
+{
+    LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream;
+    const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
+
+    const BYTE* smallest = (const BYTE*) source;
+    if (streamPtr->initCheck) return 0;   /* Uninitialized structure detected */
+    if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd;
+    LZ4_renormDictT(streamPtr, smallest);
+
+    /* Check overlapping input/dictionary space */
+    {
+        const BYTE* sourceEnd = (const BYTE*) source + inputSize;
+        if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd))
+        {
+            streamPtr->dictSize = (U32)(dictEnd - sourceEnd);
+            if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB;
+            if (streamPtr->dictSize < 4) streamPtr->dictSize = 0;
+            streamPtr->dictionary = dictEnd - streamPtr->dictSize;
+        }
+    }
+
+    /* prefix mode : source data follows dictionary */
+    if (dictEnd == (const BYTE*)source)
+    {
+        int result;
+        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, withPrefix64k, dictSmall);
+        else
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, withPrefix64k, noDictIssue);
+        streamPtr->dictSize += (U32)inputSize;
+        streamPtr->currentOffset += (U32)inputSize;
+        return result;
+    }
+
+    /* external dictionary mode */
+    {
+        int result;
+        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, usingExtDict, dictSmall);
+        else
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, usingExtDict, noDictIssue);
+        streamPtr->dictionary = (const BYTE*)source;
+        streamPtr->dictSize = (U32)inputSize;
+        streamPtr->currentOffset += (U32)inputSize;
+        return result;
+    }
+}
+
+int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize)
+{
+    return LZ4_compress_continue_generic(LZ4_stream, source, dest, inputSize, 0, notLimited);
+}
+
+int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+    return LZ4_compress_continue_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput);
+}
+
+
+/* Hidden debug function, to force separate dictionary mode */
+int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize)
+{
+    LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict;
+    int result;
+    const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
+
+    const BYTE* smallest = dictEnd;
+    if (smallest > (const BYTE*) source) smallest = (const BYTE*) source;
+    LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest);
+
+    result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue);
+
+    streamPtr->dictionary = (const BYTE*)source;
+    streamPtr->dictSize = (U32)inputSize;
+    streamPtr->currentOffset += (U32)inputSize;
+
+    return result;
+}
+
+
+int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize)
+{
+    LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
+    const BYTE* previousDictEnd = dict->dictionary + dict->dictSize;
+
+    if ((U32)dictSize > 64 KB) dictSize = 64 KB;   /* useless to define a dictionary > 64 KB */
+    if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize;
+
+    memcpy(safeBuffer, previousDictEnd - dictSize, dictSize);
+
+    dict->dictionary = (const BYTE*)safeBuffer;
+    dict->dictSize = (U32)dictSize;
+
+    return 1;
+}
+
+
+
+/****************************
+   Decompression functions
+****************************/
+/*
+ * This generic decompression function covers all use cases.
+ * It shall be instantiated several times, using different sets of directives
+ * Note that it is essential this generic function is really inlined,
+ * in order to remove useless branches during compilation optimisation.
+ */
+FORCE_INLINE int LZ4_decompress_generic(
+                 const char* source,
+                 char* dest,
+                 int inputSize,
+                 int outputSize,         /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. */
+
+                 int endOnInput,         /* endOnOutputSize, endOnInputSize */
+                 int partialDecoding,    /* full, partial */
+                 int targetOutputSize,   /* only used if partialDecoding==partial */
+                 int dict,               /* noDict, withPrefix64k, usingExtDict */
+                 const char* dictStart,  /* only if dict==usingExtDict */
+                 int dictSize            /* note : = 0 if noDict */
+                 )
+{
+    /* Local Variables */
+    const BYTE* restrict ip = (const BYTE*) source;
+    const BYTE* ref;
+    const BYTE* const iend = ip + inputSize;
+
+    BYTE* op = (BYTE*) dest;
+    BYTE* const oend = op + outputSize;
+    BYTE* cpy;
+    BYTE* oexit = op + targetOutputSize;
+    const BYTE* const lowLimit = (const BYTE*)dest - dictSize;
+
+    const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize;
+    const size_t dec32table[] = {4-0, 4-3, 4-2, 4-3, 4-0, 4-0, 4-0, 4-0};   /* note : static reduces speed for LZ4_decompress_safe() on GCC64 */
+    static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+
+    const int safeDecode = (endOnInput==endOnInputSize);
+    const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
+
+
+    /* Special cases */
+    if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT;                         /* targetOutputSize too high => decode everything */
+    if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1;  /* Empty output buffer */
+    if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1);
+
+
+    /* Main Loop */
+    while (1)
+    {
+        unsigned token;
+        size_t length;
+
+        /* get runlength */
+        token = *ip++;
+        if ((length=(token>>ML_BITS)) == RUN_MASK)
+        {
+            unsigned s;
+            do
+            {
+                s = *ip++;
+                length += s;
+            }
+            while (likely((endOnInput)?ip<iend-RUN_MASK:1) && (s==255));
+            if ((safeDecode) && LZ4_32BITS && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error;   /* overflow detection */
+            if ((safeDecode) && LZ4_32BITS && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error;   /* overflow detection */
+        }
+
+        /* copy literals */
+        cpy = op+length;
+        if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
+            || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
+        {
+            if (partialDecoding)
+            {
+                if (cpy > oend) goto _output_error;                           /* Error : write attempt beyond end of output buffer */
+                if ((endOnInput) && (ip+length > iend)) goto _output_error;   /* Error : read attempt beyond end of input buffer */
+            }
+            else
+            {
+                if ((!endOnInput) && (cpy != oend)) goto _output_error;       /* Error : block decoding must stop exactly there */
+                if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error;   /* Error : input must be consumed */
+            }
+            memcpy(op, ip, length);
+            ip += length;
+            op += length;
+            break;     /* Necessarily EOF, due to parsing restrictions */
+        }
+        LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy;
+
+        /* get offset */
+        LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2;
+        if ((checkOffset) && (unlikely(ref < lowLimit))) goto _output_error;   /* Error : offset outside destination buffer */
+
+        /* get matchlength */
+        if ((length=(token&ML_MASK)) == ML_MASK)
+        {
+            unsigned s;
+            do
+            {
+                if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error;
+                s = *ip++;
+                length += s;
+            } while (s==255);
+            if ((safeDecode) && LZ4_32BITS && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error;   /* overflow detection */
+        }
+
+        /* check external dictionary */
+        if ((dict==usingExtDict) && (ref < (BYTE* const)dest))
+        {
+            if (unlikely(op+length+MINMATCH > oend-LASTLITERALS)) goto _output_error;
+
+            if (length+MINMATCH <= (size_t)(dest-(char*)ref))
+            {
+                ref = dictEnd - (dest-(char*)ref);
+                memcpy(op, ref, length+MINMATCH);
+                op += length+MINMATCH;
+            }
+            else
+            {
+                size_t copySize = (size_t)(dest-(char*)ref);
+                memcpy(op, dictEnd - copySize, copySize);
+                op += copySize;
+                copySize = length+MINMATCH - copySize;
+                if (copySize > (size_t)((char*)op-dest))   /* overlap */
+                {
+                    BYTE* const endOfMatch = op + copySize;
+                    const BYTE* copyFrom = (BYTE*)dest;
+                    while (op < endOfMatch) *op++ = *copyFrom++;
+                }
+                else
+                {
+                    memcpy(op, dest, copySize);
+                    op += copySize;
+                }
+            }
+            continue;
+        }
+
+        /* copy repeated sequence */
+        if (unlikely((op-ref)<(int)STEPSIZE))
+        {
+            const size_t dec64 = dec64table[LZ4_32BITS ? 0 : op-ref];
+            op[0] = ref[0];
+            op[1] = ref[1];
+            op[2] = ref[2];
+            op[3] = ref[3];
+            ref += dec32table[op-ref];
+            A32(op+4) = A32(ref);
+            op += STEPSIZE; ref -= dec64;
+        } else { LZ4_COPYSTEP(op,ref); }
+        cpy = op + length - (STEPSIZE-4);
+
+        if (unlikely(cpy>oend-COPYLENGTH-(STEPSIZE-4)))
+        {
+            if (cpy > oend-LASTLITERALS) goto _output_error;    /* Error : last 5 bytes must be literals */
+            if (op<oend-COPYLENGTH) LZ4_WILDCOPY(op, ref, (oend-COPYLENGTH));
+            while(op<cpy) *op++=*ref++;
+            op=cpy;
+            continue;
+        }
+        LZ4_WILDCOPY(op, ref, cpy);
+        op=cpy;   /* correction */
+    }
+
+    /* end of decoding */
+    if (endOnInput)
+       return (int) (((char*)op)-dest);     /* Nb of output bytes decoded */
+    else
+       return (int) (((char*)ip)-source);   /* Nb of input bytes read */
+
+    /* Overflow error detected */
+_output_error:
+    return (int) (-(((char*)ip)-source))-1;
+}
+
+
+int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, full, 0, noDict, NULL, 0);
+}
+
+int LZ4_decompress_safe_partial(const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, partial, targetOutputSize, noDict, NULL, 0);
+}
+
+int LZ4_decompress_fast(const char* source, char* dest, int originalSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, NULL, 64 KB);
+}
+
+/* streaming decompression functions */
+
+typedef struct
+{
+    const char* dictionary;
+    int dictSize;
+} LZ4_streamDecode_t_internal;
+
+/*
+ * If you prefer dynamic allocation methods,
+ * LZ4_createStreamDecode()
+ * provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure.
+ */
+LZ4_streamDecode_t* LZ4_createStreamDecode(void)
+{
+    LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOCATOR(sizeof(U32), LZ4_STREAMDECODESIZE_U32);
+    MEM_INIT(lz4s, 0, LZ4_STREAMDECODESIZE);
+    return lz4s;
+}
+
+int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream)
+{
+    FREEMEM(LZ4_stream);
+    return 0;
+}
+
+/*
+ * LZ4_setStreamDecode
+ * Use this function to instruct where to find the dictionary
+ * This function is not necessary if previous data is still available where it was decoded.
+ * Loading a size of 0 is allowed (same effect as no dictionary).
+ * Return : 1 if OK, 0 if error
+ */
+int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    lz4sd->dictionary = dictionary;
+    lz4sd->dictSize = dictSize;
+    return 1;
+}
+
+/*
+*_continue() :
+    These decoding functions allow decompression of multiple blocks in "streaming" mode.
+    Previously decoded blocks must still be available at the memory position where they were decoded.
+    If it's not possible, save the relevant part of decoded data into a safe buffer,
+    and indicate where it stands using LZ4_setDictDecode()
+*/
+int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    int result;
+
+    result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, lz4sd->dictionary, lz4sd->dictSize);
+    if (result <= 0) return result;
+    if (lz4sd->dictionary + lz4sd->dictSize == dest)
+    {
+        lz4sd->dictSize += result;
+    }
+    else
+    {
+        lz4sd->dictionary = dest;
+        lz4sd->dictSize = result;
+    }
+
+    return result;
+}
+
+int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    int result;
+
+    result = LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, lz4sd->dictionary, lz4sd->dictSize);
+    if (result <= 0) return result;
+    if (lz4sd->dictionary + lz4sd->dictSize == dest)
+    {
+        lz4sd->dictSize += result;
+    }
+    else
+    {
+        lz4sd->dictionary = dest;
+        lz4sd->dictSize = result;
+    }
+
+    return result;
+}
+
+
+/*
+Advanced decoding functions :
+*_usingDict() :
+    These decoding functions work the same as "_continue" ones,
+    the dictionary must be explicitly provided within parameters
+*/
+
+int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, dictStart, dictSize);
+}
+
+int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, dictStart, dictSize);
+}
+
+
+/***************************************************
+    Obsolete Functions
+***************************************************/
+/*
+These function names are deprecated and should no longer be used.
+They are only provided here for compatibility with older user programs.
+- LZ4_uncompress is totally equivalent to LZ4_decompress_fast
+- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe
+*/
+int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); }
+int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
+
+
+/* Obsolete Streaming functions */
+
+int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; }
+
+static void LZ4_init(LZ4_stream_t_internal* lz4ds, const BYTE* base)
+{
+    MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE);
+    lz4ds->bufferStart = base;
+}
+
+int LZ4_resetStreamState(void* state, const char* inputBuffer)
+{
+    if ((((size_t)state) & 3) != 0) return 1;   /* Error : pointer is not aligned on 4-bytes boundary */
+    LZ4_init((LZ4_stream_t_internal*)state, (const BYTE*)inputBuffer);
+    return 0;
+}
+
+void* LZ4_create (const char* inputBuffer)
+{
+    void* lz4ds = ALLOCATOR(4, LZ4_STREAMSIZE_U32);
+    LZ4_init ((LZ4_stream_t_internal*)lz4ds, (const BYTE*)inputBuffer);
+    return lz4ds;
+}
+
+char* LZ4_slideInputBuffer (void* LZ4_Data)
+{
+    LZ4_stream_t_internal* lz4ds = (LZ4_stream_t_internal*)LZ4_Data;
+
+    LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)lz4ds->bufferStart, 64 KB);
+
+    return (char*)(lz4ds->bufferStart + 64 KB);
+}
+
+/*  Obsolete compression functions using user-allocated state */
+
+int LZ4_sizeofState() { return LZ4_STREAMSIZE; }
+
+int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize)
+{
+    if (((size_t)(state)&3) != 0) return 0;   /* Error : state is not aligned on 4-bytes boundary */
+    MEM_INIT(state, 0, LZ4_STREAMSIZE);
+
+    if (inputSize < (int)LZ4_64KLIMIT)
+        return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue);
+    else
+        return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64BITS ? byU32 : byPtr, noDict, noDictIssue);
+}
+
+int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+    if (((size_t)(state)&3) != 0) return 0;   /* Error : state is not aligned on 4-bytes boundary */
+    MEM_INIT(state, 0, LZ4_STREAMSIZE);
+
+    if (inputSize < (int)LZ4_64KLIMIT)
+        return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue);
+    else
+        return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64BITS ? byU32 : byPtr, noDict, noDictIssue);
+}
+
+/* Obsolete streaming decompression functions */
+
+int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, NULL, 64 KB);
+}
+
+int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, NULL, 64 KB);
+}
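
As a companion sketch (again illustrative, not part of the patch), the header
below also supports the uncapped path: size the destination with
LZ4_compressBound() and decode with the bounds-checked LZ4_decompress_safe(),
which never reads or writes outside the supplied buffers. The sample text and
harness are invented for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "compression/lz4.h"

int
main(void)
{
	const char *src = "the quick brown fox jumps over the lazy dog "
		"the quick brown fox jumps over the lazy dog";
	int			slen = (int) strlen(src);
	int			bound = LZ4_compressBound(slen);	/* worst-case output size */
	char	   *compressed = malloc(bound);
	char	   *restored = malloc(slen);
	int			csize;
	int			dsize;

	csize = LZ4_compress(src, compressed, slen);	/* never overruns 'bound' */
	if (csize == 0)
	{
		printf("compression failed\n");
		return 1;
	}

	/* The safe decoder is the right choice for untrusted input. */
	dsize = LZ4_decompress_safe(compressed, restored, csize, slen);
	if (dsize < 0)
	{
		printf("compressed data is corrupt\n");
		return 1;
	}
	printf("%d bytes -> %d compressed -> %d restored, ok = %d\n",
		   slen, csize, dsize, memcmp(src, restored, slen) == 0);

	free(compressed);
	free(restored);
	return 0;
}

Unlike LZ4_uncompress()/LZ4_decompress_fast() used in pglz_decompress() above,
LZ4_decompress_safe() pays a small speed cost for rejecting malformed or
malicious input, which may be worth considering for on-disk data.
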
diff --git a/src/include/compression/lz4.h b/src/include/compression/lz4.h
new file mode 100644
index 0000000..f8327f0
--- /dev/null
+++ b/src/include/compression/lz4.h
@@ -0,0 +1,326 @@
+/*
+   LZ4 - Fast LZ compression algorithm
+   Header File
+   Copyright (C) 2011-2014, Yann Collet.
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - LZ4 source repository : http://code.google.com/p/lz4/
+   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/**************************************
+   Version
+**************************************/
+#define LZ4_VERSION_MAJOR    1    /* for major interface/format changes  */
+#define LZ4_VERSION_MINOR    3    /* for minor interface/format changes  */
+#define LZ4_VERSION_RELEASE  0    /* for tweaks, bug-fixes, or development */
+#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
+int LZ4_versionNumber (void);
+
+/**************************************
+   Tuning parameter
+**************************************/
+/*
+ * LZ4_MEMORY_USAGE :
+ * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+ * Increasing memory usage improves compression ratio
+ * Reduced memory usage can improve speed, due to cache effect
+ * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
+ */
+#define LZ4_MEMORY_USAGE 14
+
+
+/**************************************
+   Simple Functions
+**************************************/
+
+int LZ4_compress        (const char* source, char* dest, int inputSize);
+int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize);
+
+/*
+LZ4_compress() :
+    Compresses 'inputSize' bytes from 'source' into 'dest'.
+    Destination buffer must be already allocated,
+    and must be sized to handle worst case situations (input data not compressible)
+    Worst case size evaluation is provided by function LZ4_compressBound()
+    inputSize : Max supported value is LZ4_MAX_INPUT_SIZE
+    return : the number of bytes written in buffer dest
+             or 0 if the compression fails
+
+LZ4_decompress_safe() :
+    compressedSize : is obviously the source size
+    maxDecompressedSize : is the size of the destination buffer, which must be already allocated.
+    return : the number of bytes decompressed into the destination buffer (necessarily <= maxDecompressedSize)
+             If the destination buffer is not large enough, decoding will stop and output an error code (<0).
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             This function is protected against buffer overflow exploits :
+             it never writes outside of output buffer, and never reads outside of input buffer.
+             Therefore, it is protected against malicious data packets.
+*/
+
+
+/*
+Note :
+    Should you prefer to explicitly allocate compression-table memory using your own allocation method,
+    use the streaming functions provided below, simply reset the memory area between each call to LZ4_compress_continue()
+*/
+
+
+/**************************************
+   Advanced Functions
+**************************************/
+#define LZ4_MAX_INPUT_SIZE        0x7E000000   /* 2 113 929 216 bytes */
+#define LZ4_COMPRESSBOUND(isize)  ((unsigned int)(isize) > (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
+
+/*
+LZ4_compressBound() :
+    Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible)
+    primarily useful for memory allocation of output buffer.
+    macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation).
+
+    isize  : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE
+    return : maximum output size in a "worst case" scenario
+             or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE)
+*/
+int LZ4_compressBound(int isize);
+
+
+/*
+LZ4_compress_limitedOutput() :
+    Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'.
+    If it cannot achieve it, compression will stop, and result of the function will be zero.
+    This function never writes outside of provided output buffer.
+
+    inputSize  : Max supported value is LZ4_MAX_INPUT_VALUE
+    maxOutputSize : is the size of the destination buffer (which must be already allocated)
+    return : the number of bytes written in buffer 'dest'
+             or 0 if the compression fails
+*/
+int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize);
+
+
+/*
+LZ4_compress_withState() :
+    Same compression functions, but using an externally allocated memory space to store compression state.
+    Use LZ4_sizeofState() to know how much memory must be allocated,
+    and then, provide it as 'void* state' to compression functions.
+*/
+int LZ4_sizeofState(void);
+int LZ4_compress_withState               (void* state, const char* source, char* dest, int inputSize);
+int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
+
+
+/*
+LZ4_decompress_fast() :
+    originalSize : is the original and therefore uncompressed size
+    return : the number of bytes read from the source buffer (in other words, the compressed size)
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes.
+    note : This function fully respects memory boundaries for properly formed compressed data.
+           It is a bit faster than LZ4_decompress_safe().
+           However, it does not provide any protection against intentionally modified data streams (malicious input).
+           Use this function in trusted environments only (data to decode comes from a trusted source).
+*/
+int LZ4_decompress_fast (const char* source, char* dest, int originalSize);
+
+
+/*
+LZ4_decompress_safe_partial() :
+    This function decompresses a compressed block of size 'compressedSize' at position 'source'
+    into destination buffer 'dest' of size 'maxDecompressedSize'.
+    The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached,
+    reducing decompression time.
+    return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize)
+       Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller.
+             Always control how many bytes were decoded.
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
+*/
+int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize);
+
+
+/***********************************************
+   Experimental Streaming Compression Functions
+***********************************************/
+
+#define LZ4_STREAMSIZE_U32 ((1 << (LZ4_MEMORY_USAGE-2)) + 8)
+#define LZ4_STREAMSIZE     (LZ4_STREAMSIZE_U32 * sizeof(unsigned int))
+/*
+ * LZ4_stream_t
+ * information structure to track an LZ4 stream.
+ * important : init this structure content before first use !
+ */
+typedef struct { unsigned int table[LZ4_STREAMSIZE_U32]; } LZ4_stream_t;
+
+/*
+ * LZ4_resetStream
+ * Use this function to init a newly allocated LZ4_stream_t structure
+ * You can also reset an existing LZ4_stream_t structure
+ */
+void LZ4_resetStream (LZ4_stream_t* LZ4_stream);
+
+/*
+ * If you prefer dynamic allocation methods,
+ * LZ4_createStream will allocate and initialize an LZ4_stream_t structure
+ * LZ4_freeStream releases its memory.
+ */
+LZ4_stream_t* LZ4_createStream(void);
+int           LZ4_freeStream (LZ4_stream_t* LZ4_stream);
+
+/*
+ * LZ4_loadDict
+ * Use this function to load a static dictionary into LZ4_stream.
+ * Any previous data will be forgotten, only 'dictionary' will remain in memory.
+ * Loading a size of 0 is allowed.
+ * Return : 1 if OK, 0 if error
+ */
+int LZ4_loadDict (LZ4_stream_t* LZ4_stream, const char* dictionary, int dictSize);
+
+/*
+ * LZ4_compress_continue
+ * Compress data block 'source', using blocks compressed before as dictionary to improve compression ratio
+ * Previous data blocks are assumed to still be present at their previous location.
+ */
+int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize);
+
+/*
+ * LZ4_compress_limitedOutput_continue
+ * Same as before, but also specify a maximum target compressed size (maxOutputSize)
+ * If objective cannot be met, compression exits, and returns a zero.
+ */
+int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/*
+ * LZ4_saveDict
+ * If previously compressed data block is not guaranteed to remain available at its memory location
+ * save it into a safe place (char* safeBuffer)
+ * Note : you don't need to call LZ4_loadDict() afterwards,
+ *        the dictionary is immediately usable; you can therefore call LZ4_compress_continue() again
+ * Return : 1 if OK, 0 if error
+ * Note : any dictSize > 64 KB will be interpreted as 64KB.
+ */
+int LZ4_saveDict (LZ4_stream_t* LZ4_stream, char* safeBuffer, int dictSize);
+
+
+/************************************************
+  Experimental Streaming Decompression Functions
+************************************************/
+
+#define LZ4_STREAMDECODESIZE_U32 4
+#define LZ4_STREAMDECODESIZE     (LZ4_STREAMDECODESIZE_U32 * sizeof(unsigned int))
+/*
+ * LZ4_streamDecode_t
+ * information structure to track an LZ4 stream.
+ * important : init this structure content using LZ4_setStreamDecode or memset() before first use !
+ */
+typedef struct { unsigned int table[LZ4_STREAMDECODESIZE_U32]; } LZ4_streamDecode_t;
+
+/*
+ * LZ4_setStreamDecode
+ * Use this function to instruct where to find the dictionary.
+ * This function can be used to specify a static dictionary,
+ * or to instruct where to find some previously decoded data saved into a different memory space.
+ * Setting a size of 0 is allowed (same effect as no dictionary).
+ * Return : 1 if OK, 0 if error
+ */
+int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
+
+/*
+ * If you prefer dynamic allocation methods,
+ * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure
+ * LZ4_freeStreamDecode releases its memory.
+ */
+LZ4_streamDecode_t* LZ4_createStreamDecode(void);
+int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
+
+/*
+*_continue() :
+    These decoding functions allow decompression of multiple blocks in "streaming" mode.
+    Previously decoded blocks must still be available at the memory position where they were decoded.
+    If it's not possible, save the relevant part of decoded data into a safe buffer,
+    and indicate its new address using LZ4_setStreamDecode()
+*/
+int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize);
+int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize);
+
+
+/*
+Advanced decoding functions :
+*_usingDict() :
+    These decoding functions work the same as
+    a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue()
+    all together into a single function call.
+    It doesn't use nor update an LZ4_streamDecode_t structure.
+*/
+int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize);
+int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize);
+
+
+
+/**************************************
+   Obsolete Functions
+**************************************/
+/*
+Obsolete decompression functions
+These function names are deprecated and should no longer be used.
+They are only provided here for compatibility with older user programs.
+- LZ4_uncompress is the same as LZ4_decompress_fast
+- LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe
+These function prototypes are now disabled; uncomment them if you really need them.
+It is highly recommended to stop using these functions and migrate to the newer ones */
+/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */
+/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */
+
+/*
+ * If you prefer dynamic allocation methods,
+ * LZ4_createStreamDecode()
+ * provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure.
+ * LZ4_free just frees it.
+ */
+/* void* LZ4_createStreamDecode(void); */
+/*int   LZ4_free (void* LZ4_stream);    yes, it's the same one as for compression */
+
+/* Obsolete streaming functions; use new streaming interface whenever possible */
+void* LZ4_create (const char* inputBuffer);
+int   LZ4_sizeofStreamState(void);
+int   LZ4_resetStreamState(void* state, const char* inputBuffer);
+char* LZ4_slideInputBuffer (void* state);
+
+/* Obsolete streaming decoding functions */
+int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int compressedSize, int maxOutputSize);
+int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int originalSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
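
For reference, the simple API documented in the header above is used roughly as follows. This is a minimal sketch, assuming the header is included as "lz4.h"; allocation error handling is elided.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lz4.h"

int main(void)
{
    const char *src = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";  /* repetitive input compresses well */
    int   srcSize = (int) strlen(src) + 1;

    /* LZ4_compressBound() gives the worst-case output size for srcSize input bytes */
    int   bound = LZ4_compressBound(srcSize);
    char *compressed = malloc(bound);
    char *restored = malloc(srcSize);

    int compSize = LZ4_compress(src, compressed, srcSize);
    if (compSize == 0)
        return 1;                       /* compression failed */

    /* safe variant: never writes more than srcSize bytes into 'restored' */
    int decSize = LZ4_decompress_safe(compressed, restored, compSize, srcSize);
    if (decSize < 0)
        return 1;                       /* malformed compressed data */

    printf("%d -> %d -> %d bytes\n", srcSize, compSize, decSize);
    free(compressed);
    free(restored);
    return 0;
}
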
#110Jan Wieck
jan@wi3ck.info
In reply to: Robert Haas (#46)
Re: jsonb format is pessimal for toast compression

On 08/12/2014 10:58 AM, Robert Haas wrote:

What would really be ideal here is if the JSON code could inform the
toast compression code "this many initial bytes are likely
incompressible, just pass them through without trying, and then start
compressing at byte N", where N is the byte following the TOC. But I
don't know that there's a reasonable way to implement that.

Sorry, being late for the party.

Anyhow, this strikes me as a good basic direction of thought. But I
think we should put the burden on the data type, not on toast. To do
that data types could have an optional toast_hint_values() function,
which the toast code can call with the actual datum at hand and its
default parameter array. The hint values function then can modify that
parameter array, telling toast how much to skip, how hard to try (or not
at all) and so on. A data type specific function should know much better
how to figure out how compressible a particular datum may be.
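
To illustrate only the shape of that idea (nothing like this exists in PostgreSQL today; the struct and callback below are hypothetical):

#include <stdint.h>
#include <stdbool.h>

/* Hypothetical hint block a data type could fill in for the toaster. */
typedef struct ToastCompressHints
{
    int32_t skip_bytes;        /* leading bytes to pass through uncompressed   */
    int32_t first_success_by;  /* give-up window to hand to pglz_compress()    */
    bool    try_compression;   /* false => store this datum uncompressed       */
} ToastCompressHints;

/*
 * Hypothetical per-datatype callback.  For jsonb, the natural choice would
 * be to skip the JEntry offset table; 'toc_len' stands in for whatever the
 * data type computes as the length of that incompressible header.
 */
static void
jsonb_toast_hint_values(int32_t toc_len, ToastCompressHints *hints)
{
    hints->skip_bytes = toc_len;        /* don't even try to compress the TOC     */
    hints->first_success_by = 1024;     /* start the 1KB give-up window after it  */
    hints->try_compression = true;
}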

Certainly nothing for 9.4, but it might require changing the toast API
in a different way than just handing it an oid and hard-coding the
JASONBOID case into toast for 9.4. If we are going to change the API, we
might as well do it right.

Regards,
Jan

--
Jan Wieck
Senior Software Engineer
http://slony.info


#111Jan Wieck
jan@wi3ck.info
In reply to: Andrew Dunstan (#5)
Re: jsonb format is pessimal for toast compression

On 08/08/2014 10:21 AM, Andrew Dunstan wrote:

On 08/07/2014 11:17 PM, Tom Lane wrote:

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.
This interacts poorly with the code in pglz_compress() that gives up if
it's found nothing compressible in the first first_success_by bytes of a
value-to-be-compressed. (first_success_by is 1024 in the default set of
compression parameters.)

[snip]

There is plenty of compressible data once we get into the repetitive
strings in the payload part --- but that starts at offset 944, and up to
that point there is nothing that pg_lzcompress can get a handle on. There
are, by definition, no sequences of 4 or more repeated bytes in that area.
I think in principle pg_lzcompress could decide to compress the 3-byte
sequences consisting of the high-order 24 bits of each offset; but it
doesn't choose to do so, probably because of the way its lookup hash table
works:

* pglz_hist_idx -
*
* Computes the history table slot for the lookup by the next 4
* characters in the input.
*
* NB: because we use the next 4 characters, we are not guaranteed to
* find 3-character matches; they very possibly will be in the wrong
* hash list. This seems an acceptable tradeoff for spreading out the
* hash keys more.

For jsonb header data, the "next 4 characters" are *always* different, so
only a chance hash collision can result in a match. There is therefore a
pretty good chance that no compression will occur before it gives up
because of first_success_by.

I'm not sure if there is any easy fix for this. We could possibly change
the default first_success_by value, but I think that'd just be postponing
the problem to larger jsonb objects/arrays, and it would hurt performance
for genuinely incompressible data. A somewhat painful, but not yet
out-of-the-question, alternative is to change the jsonb on-disk
representation. Perhaps the JEntry array could be defined as containing
element lengths instead of element ending offsets. Not sure though if
that would break binary searching for JSON object keys.
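
The effect is easy to reproduce outside the server with a toy matcher. The following standalone sketch (not pglz itself) just counts how many 4-byte windows repeat in a simulated JEntry array holding end offsets versus lengths:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NELEMS  200
#define ELEMLEN 52   /* pretend every array element is 52 bytes long */

/* Count 4-byte windows that also occur earlier in the buffer -- a crude
 * stand-in for "how much an LZ-style matcher can latch onto". */
static int
repeated_windows(const unsigned char *buf, int len)
{
    int hits = 0;

    for (int i = 1; i + 4 <= len; i++)
        for (int j = 0; j < i; j++)
            if (memcmp(buf + i, buf + j, 4) == 0)
            {
                hits++;
                break;
            }
    return hits;
}

int main(void)
{
    uint32_t offsets[NELEMS];
    uint32_t lengths[NELEMS];

    for (int i = 0; i < NELEMS; i++)
    {
        lengths[i] = ELEMLEN;                       /* same value every time      */
        offsets[i] = (uint32_t) (i + 1) * ELEMLEN;  /* strictly increasing values */
    }

    printf("offsets: %d repeated 4-byte windows\n",
           repeated_windows((unsigned char *) offsets, sizeof(offsets)));
    printf("lengths: %d repeated 4-byte windows\n",
           repeated_windows((unsigned char *) lengths, sizeof(lengths)));
    return 0;
}

The lengths buffer is a single 4-byte pattern repeated end to end, while the offsets buffer gives the matcher almost nothing to reuse.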

Ouch.

Back when this structure was first presented at pgCon 2013, I wondered
if we shouldn't extract the strings into a dictionary, because of key
repetition, and convinced myself that this shouldn't be necessary
because in significant cases TOAST would take care of it.

Maybe we should have pglz_compress() look at the *last* 1024 bytes if it
can't find anything worth compressing in the first, for values larger
than a certain size.

It's worth noting that this is a fairly pathological case. AIUI the
example you constructed has an array with 100k string elements. I don't
think that's typical. So I suspect that unless I've misunderstood the
statement of the problem we're going to find that almost all the jsonb
we will be storing is still compressible.

I also think that a substantial part of the problem of coming up with a
"representative" data sample is because the size of the incompressible
data at the beginning is somewhat tied to the overall size of the datum
itself. This may or may not be true in any particular use case, but as a
general rule of thumb I would assume that the larger the JSONB document,
the larger the offset array at the beginning.

Would changing 1024 to a fraction of the datum length for the time being
give us enough room to come up with a proper solution for 9.5?
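
As a rough sketch of that stopgap (hypothetical code, not what pglz_compress() does today, where first_success_by comes from a fixed PGLZ_Strategy constant):

/*
 * Hypothetical replacement for the fixed 1024-byte give-up threshold:
 * scale it with the input, so a large jsonb datum with a long offset
 * header still gets a chance at its compressible tail.
 */
static int
first_success_threshold(int source_len)
{
    int threshold = source_len / 4;    /* look at up to a quarter of the input */

    if (threshold < 1024)
        threshold = 1024;              /* never give up earlier than today */
    return threshold;
}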

Regards,
Jan

--
Jan Wieck
Senior Software Engineer
http://slony.info


#112Jan Wieck
jan@wi3ck.info
In reply to: Tom Lane (#7)
Re: jsonb format is pessimal for toast compression

On 08/08/2014 11:18 AM, Tom Lane wrote:

Andrew Dunstan <andrew@dunslane.net> writes:

On 08/07/2014 11:17 PM, Tom Lane wrote:

I looked into the issue reported in bug #11109. The problem appears to be
that jsonb's on-disk format is designed in such a way that the leading
portion of any JSON array or object will be fairly incompressible, because
it consists mostly of a strictly-increasing series of integer offsets.

Ouch.

Back when this structure was first presented at pgCon 2013, I wondered
if we shouldn't extract the strings into a dictionary, because of key
repetition, and convinced myself that this shouldn't be necessary
because in significant cases TOAST would take care of it.

That's not really the issue here, I think. The problem is that a
relatively minor aspect of the representation, namely the choice to store
a series of offsets rather than a series of lengths, produces
nonrepetitive data even when the original input is repetitive.

This is only because the input data was exact copies of the same strings
over and over again. PGLZ can very well compress slightly less identical
strings of varying lengths too. Not as well, but well enough. But I
suspect such input data would make it fail again, even with lengths.

Regards,
Jan

--
Jan Wieck
Senior Software Engineer
http://slony.info


#113David E. Wheeler
david@justatheory.com
In reply to: Jan Wieck (#112)
Re: jsonb format is pessimal for toast compression

On Sep 4, 2014, at 7:26 PM, Jan Wieck <jan@wi3ck.info> wrote:

This is only because the input data was exact copies of the same strings over and over again. PGLZ can very well compress slightly less identical strings of varying lengths too. Not as well, but well enough. But I suspect such input data would make it fail again, even with lengths.

We had a bit of discussion about JSONB compression at PDXPUG Day this morning. Josh polled the room, and about half thought we should apply the patch for better compression, while the other half seemed to want faster access operations. (Some folks no doubt voted for both.) But in the ensuing discussion, I started to think that maybe we should leave it as it is, for two reasons:

1. There has been a fair amount of discussion about ways to better deal with this in future releases, such as hints to TOAST about how to compress, or the application of different compression algorithms (or pluggable compression). I’m assuming that leaving it as-is does not remove those possibilities.

2. The major advantage of JSONB is fast access operations. If those are not as important for a given use case as storage space, there’s still the JSON type, which *does* compress reasonably well. IOW, we already have a JSON alternative that compresses well. So why make the same (or similar) trade-offs with JSONB?

Just my $0.02. I would like to see some consensus on this, soon, though, as I am eager to get 9.4 and JSONB, regardless of the outcome!

Best,

David

#114Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

So, I finally got time to test Tom's latest patch on this.

TLDR: we want to go with Tom's latest patch and release beta3.

Figures:

So I tested HEAD against the latest lengths patch. Per Arthur Silva, I
checked uncompressed times for JSONB against compressed times. This
changed the picture considerably.

TABLE SIZES
-----------

HEAD

?column? | pg_size_pretty
---------------------+----------------
json text format | 393 MB
jsonb: compressed | 1147 MB
jsonb: uncompressed | 1221 MB

PATCHED

?column? | pg_size_pretty
---------------------+----------------
json text format | 394 MB
jsonb: compressed | 525 MB
jsonb: uncompressed | 1200 MB

EXTRACTION TIMES
----------------

HEAD

Q1 (search via GIN index followed by extracting 100,000 values from rows):

jsonb compressed: 4000
jsonb uncompressed: 3250

Q2 (seq scan and extract 200,000 values from rows):

json: 11700
jsonb compressed: 3150
jsonb uncompressed: 2700

PATCHED

Q1:

jsonb compressed: 6750
jsonb uncompressed: 3350

Q2:

json: 11796
jsonb compressed: 4700
jsonb uncompressed: 2650

----------------------

Conclusion: with Tom's patch, compressed JSONB is 55% smaller when
compressed (EXTENDED). Extraction times are 50% to 70% slower, but this
appears to be almost entirely due to decompression overhead. When not
compressing (EXTERNAL), extraction times for patch versions are
statistically the same as HEAD, and file sizes are similar to HEAD.

USER REACTION
-------------

I polled at both PDXpgDay and at FOSS4G, asking some ~80 Postgres
users how they would feel about a compression vs. extraction time
tradeoff. The audience was evenly split.

However, with the current patch, the user can choose. Users who know
enough for performance tuning can set JSONB columns to EXTERNAL, and get
the same performance as the unpatched version.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#115Stephen Frost
sfrost@snowman.net
In reply to: Josh Berkus (#114)
Re: jsonb format is pessimal for toast compression

* Josh Berkus (josh@agliodbs.com) wrote:

TLDR: we want to go with Tom's latest patch and release beta3.

Having not even read the rest- yes please. We really need to get beta3
out and figure out when we're going to actually release 9.4...
Admittedly, the last month has been good and we've been fixing issues,
but it'd really be good to wrap 9.4 up.

Conclusion: with Tom's patch, compressed JSONB is 55% smaller when
compressed (EXTENDED). Extraction times are 50% to 70% slower, but this
appears to be almost entirely due to decompression overhead. When not
compressing (EXTERNAL), extraction times for patch versions are
statistically the same as HEAD, and file sizes are similar to HEAD.

Not really a surprise.

I polled at both PDXpgDay and at FOSS4G, asking some ~~ 80 Postgres
users how they would feel about a compression vs. extraction time
tradeoff. The audience was evenly split.

Also not a surprise.

However, with the current patch, the user can choose. Users who know
enough for performance tuning can set JSONB columns to EXTERNAL, and get
the same performance as the unpatched version.

Agreed.

Thanks,

Stephen

#116Arthur Silva
arthurprs@gmail.com
In reply to: Josh Berkus (#114)
Re: jsonb format is pessimal for toast compression

On Thu, Sep 11, 2014 at 10:01 PM, Josh Berkus <josh@agliodbs.com> wrote:

So, I finally got time to test Tom's latest patch on this.

TLDR: we want to go with Tom's latest patch and release beta3.

Figures:

So I tested HEAD against the latest lengths patch. Per Arthur Silva, I
checked uncompressed times for JSONB against compressed times. This
changed the picture considerably.

TABLE SIZES
-----------

HEAD

?column? | pg_size_pretty
---------------------+----------------
json text format | 393 MB
jsonb: compressed | 1147 MB
jsonb: uncompressed | 1221 MB

PATCHED

?column? | pg_size_pretty
---------------------+----------------
json text format | 394 MB
jsonb: compressed | 525 MB
jsonb: uncompressed | 1200 MB

EXTRACTION TIMES
----------------

HEAD

Q1 (search via GIN index followed by extracting 100,000 values from rows):

jsonb compressed: 4000
jsonb uncompressed: 3250

Q2 (seq scan and extract 200,000 values from rows):

json: 11700
jsonb compressed: 3150
jsonb uncompressed: 2700

PATCHED

Q1:

jsonb compressed: 6750
jsonb uncompressed: 3350

Q2:

json: 11796
jsonb compressed: 4700
jsonb uncompressed: 2650

----------------------

Conclusion: with Tom's patch, compressed JSONB is 55% smaller when
compressed (EXTENDED). Extraction times are 50% to 70% slower, but this
appears to be almost entirely due to decompression overhead. When not
compressing (EXTERNAL), extraction times for patch versions are
statistically the same as HEAD, and file sizes are similar to HEAD.

USER REACTION
-------------

I polled at both PDXpgDay and at FOSS4G, asking some ~~ 80 Postgres
users how they would feel about a compression vs. extraction time
tradeoff. The audience was evenly split.

However, with the current patch, the user can choose. Users who know
enough for performance tuning can set JSONB columns to EXTERNAL, and get
the same performance as the unpatched version.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


The compression ratio difference is exaggerated in this case but it does
support the conclusion that Tom's patch alleviates the extraction penalty.

In my testing with the github archive data the savings <->
performance-penalty tradeoff was fine, but I'm not confident in those results since
there were only 8 top level keys.
For comparison, some twitter api objects[1] have 30+ top level keys. If I
have time in the next couple of days I'll conduct some tests with the
public twitter fire-hose data.

[1]: https://dev.twitter.com/rest/reference/get/statuses/home_timeline

#117Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/11/2014 06:56 PM, Arthur Silva wrote:

In my testings with the github archive data the savings <->
performance-penalty was fine, but I'm not confident in those results
since there were only 8 top level keys.

Well, we did want to see that the patch doesn't create a regression with
data which doesn't fall into the problem case area, and your test did
that nicely.

For comparison, some twitter api objects[1] have 30+ top level keys. If
I have time in the next couple of days I'll conduct some testings with
the public twitter fire-hose data.

Yah, if we have enough time for me to get the Mozilla Socorro test
environment working, I can also test with Mozilla crash data. That has
some deep nesting and very large values.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#118Robert Haas
robertmhaas@gmail.com
In reply to: Josh Berkus (#114)
Re: jsonb format is pessimal for toast compression

On Thu, Sep 11, 2014 at 9:01 PM, Josh Berkus <josh@agliodbs.com> wrote:

So, I finally got time to test Tom's latest patch on this.

TLDR: we want to go with Tom's latest patch and release beta3.

Figures:

So I tested HEAD against the latest lengths patch. Per Arthur Silva, I
checked uncompressed times for JSONB against compressed times. This
changed the picture considerably.

Did you

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#119Robert Haas
robertmhaas@gmail.com
In reply to: Robert Haas (#118)
Re: jsonb format is pessimal for toast compression

On Fri, Sep 12, 2014 at 1:00 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Thu, Sep 11, 2014 at 9:01 PM, Josh Berkus <josh@agliodbs.com> wrote:

So, I finally got time to test Tom's latest patch on this.

TLDR: we want to go with Tom's latest patch and release beta3.

Figures:

So I tested HEAD against the latest lengths patch. Per Arthur Silva, I
checked uncompressed times for JSONB against compressed times. This
changed the picture considerably.

Did you

Blah.

Did you test Heikki's patch from here?

/messages/by-id/53EC8194.4020804@vmware.com

Tom didn't like it, but I thought it was rather clever.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#120Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/12/2014 10:00 AM, Robert Haas wrote:

On Fri, Sep 12, 2014 at 1:00 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Thu, Sep 11, 2014 at 9:01 PM, Josh Berkus <josh@agliodbs.com> wrote:

So, I finally got time to test Tom's latest patch on this.

TLDR: we want to go with Tom's latest patch and release beta3.

Figures:

So I tested HEAD against the latest lengths patch. Per Arthur Silva, I
checked uncompressed times for JSONB against compressed times. This
changed the picture considerably.

Did you

Blah.

Did you test Heikki's patch from here?

/messages/by-id/53EC8194.4020804@vmware.com

Tom didn't like it, but I thought it was rather clever.

Yes, I posted the results for that a couple weeks ago; Tom had posted a
cleaned-up version of that patch, but materially it made no difference
in sizes or extraction times compared with Tom's lengths-only patch.
Same for Arthur's tests.

It's certainly possible that there is a test case for which Heikki's
approach is superior, but if so we haven't seen it. And since that
approach is also more complicated, sticking with the simpler
lengths-only approach seems like the way to go.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#121Robert Haas
robertmhaas@gmail.com
In reply to: Josh Berkus (#120)
Re: jsonb format is pessimal for toast compression

On Fri, Sep 12, 2014 at 1:11 PM, Josh Berkus <josh@agliodbs.com> wrote:

On 09/12/2014 10:00 AM, Robert Haas wrote:

On Fri, Sep 12, 2014 at 1:00 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Thu, Sep 11, 2014 at 9:01 PM, Josh Berkus <josh@agliodbs.com> wrote:

So, I finally got time to test Tom's latest patch on this.

TLDR: we want to go with Tom's latest patch and release beta3.

Figures:

So I tested HEAD against the latest lengths patch. Per Arthur Silva, I
checked uncompressed times for JSONB against compressed times. This
changed the picture considerably.

Did you

Blah.

Did you test Heikki's patch from here?

/messages/by-id/53EC8194.4020804@vmware.com

Tom didn't like it, but I thought it was rather clever.

Yes, I posted the results for that a couple weeks ago; Tom had posted a
cleaned-up version of that patch, but materially it made no difference
in sizes or extraction times compared with Tom's lengths-only patch.
Same for Arthur's tests.

It's certainly possible that there is a test case for which Heikki's
approach is superior, but if so we haven't seen it. And since it's
approach is also more complicated, sticking with the simpler
lengths-only approach seems like the way to go.

Huh, OK. I'm slightly surprised, but that's why we benchmark these things.

Thanks for following up on this.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#122Tom Lane
tgl@sss.pgh.pa.us
In reply to: Robert Haas (#121)
Re: jsonb format is pessimal for toast compression

Robert Haas <robertmhaas@gmail.com> writes:

On Fri, Sep 12, 2014 at 1:11 PM, Josh Berkus <josh@agliodbs.com> wrote:

It's certainly possible that there is a test case for which Heikki's
approach is superior, but if so we haven't seen it. And since it's
approach is also more complicated, sticking with the simpler
lengths-only approach seems like the way to go.

Huh, OK. I'm slightly surprised, but that's why we benchmark these things.

The argument for Heikki's patch was never that it would offer better
performance; it's obvious (at least to me) that it won't. The argument
was that it'd be upward-compatible with what we're doing now, so that
we'd not have to force an on-disk compatibility break with 9.4beta2.

regards, tom lane


#123Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Tom Lane (#122)
Re: jsonb format is pessimal for toast compression

On 09/12/2014 08:52 PM, Tom Lane wrote:

Robert Haas <robertmhaas@gmail.com> writes:

On Fri, Sep 12, 2014 at 1:11 PM, Josh Berkus <josh@agliodbs.com> wrote:

It's certainly possible that there is a test case for which Heikki's
approach is superior, but if so we haven't seen it. And since it's
approach is also more complicated, sticking with the simpler
lengths-only approach seems like the way to go.

Huh, OK. I'm slightly surprised, but that's why we benchmark these things.

The argument for Heikki's patch was never that it would offer better
performance; it's obvious (at least to me) that it won't.

Performance was one argument for sure. It's not hard to come up with a
case where the all-lengths approach is much slower: take a huge array
with, say, a million elements, and fetch the last element in a tight loop.
And do that in a PL/pgSQL function without storing the datum to disk, so
that it doesn't get toasted. Not a very common thing to do in real life,
although something like that might come up if you do a lot of json
processing in PL/pgSQL. But storing offsets makes that case faster.

IOW, something like this:

do $$
declare
ja jsonb;
i int4;
begin
select json_agg(g) into ja from generate_series(1, 100000) g;
for i in 1..100000 loop
perform ja ->> 90000;
end loop;
end;
$$;

should perform much better with current git master or "my patch", than
with the all-lengths patch.
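
The underlying difference can be sketched outside PostgreSQL as well (illustrative only, not the actual JEntry code):

#include <stddef.h>
#include <stdint.h>

/* With stored end offsets, finding where element i starts is O(1):
 * it begins where element i-1 ended. */
static size_t
element_start_from_offsets(const uint32_t *end_offsets, int i)
{
    return (i == 0) ? 0 : end_offsets[i - 1];
}

/* With stored lengths, finding element i means summing every preceding
 * length -- O(i), which is what hurts when repeatedly fetching the last
 * of a million elements. */
static size_t
element_start_from_lengths(const uint32_t *lengths, int i)
{
    size_t pos = 0;

    for (int k = 0; k < i; k++)
        pos += lengths[k];
    return pos;
}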

I'm OK with going for the all-lengths approach anyway; it's simpler, and
working with huge arrays is hopefully not that common. But it's not a
completely open-and-shut case.

- Heikki


#124Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/12/2014 01:30 PM, Heikki Linnakangas wrote:

Performance was one argument for sure. It's not hard to come up with a
case where the all-lengths approach is much slower: take a huge array
with, say, million elements, and fetch the last element in a tight loop.
And do that in a PL/pgSQL function without storing the datum to disk, so
that it doesn't get toasted. Not a very common thing to do in real life,
although something like that might come up if you do a lot of json
processing in PL/pgSQL. but storing offsets makes that faster.

While I didn't post the results (because they were uninteresting), I did
specifically test the "last element" in a set of 200 elements for
all-lengths vs. original offsets for JSONB, and the results were not
statistically different.

I did not test against your patch; is there some reason why your patch
would be faster for the "last element" case than the original offsets
version?

If not, I think the corner case is so obscure as to be not worth
optimizing for. I can't imagine that more than a tiny minority of our
users are going to have thousands of keys per datum.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#125Claudio Freire
klaussfreire@gmail.com
In reply to: Josh Berkus (#124)
Re: jsonb format is pessimal for toast compression

On Mon, Sep 15, 2014 at 2:12 PM, Josh Berkus <josh@agliodbs.com> wrote:

If not, I think the corner case is so obscure as to be not worth
optimizing for. I can't imagine that more than a tiny minority of our
users are going to have thousands of keys per datum.

Worst case is linear cost scaling with the number of keys, so how
expensive it gets depends on how many keys there are.

It would have an effect only on uncompressed jsonb, since compressed
jsonb already pays a linear cost for decompression.

I'd suggest testing performance with a large number of small keys in
uncompressed form. It's bound to show a noticeable regression there.

Now, a large number of small keys could be 200 or 2000, or even 20k. I'd guess
several values should be tested to find the shape of the curve.


#126Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/15/2014 10:23 AM, Claudio Freire wrote:

Now, large small keys could be 200 or 2000, or even 20k. I'd guess
several should be tested to find the shape of the curve.

Well, we know that it's not noticeable with 200, and that it is
noticeable with 100K. It's only worth testing further if we think that
having more than 200 top-level keys in one JSONB value is going to be a
use case for more than 0.1% of our users. I personally do not.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#127Claudio Freire
klaussfreire@gmail.com
In reply to: Josh Berkus (#126)
Re: jsonb format is pessimal for toast compression

On Mon, Sep 15, 2014 at 4:09 PM, Josh Berkus <josh@agliodbs.com> wrote:

On 09/15/2014 10:23 AM, Claudio Freire wrote:

Now, large small keys could be 200 or 2000, or even 20k. I'd guess
several should be tested to find the shape of the curve.

Well, we know that it's not noticeable with 200, and that it is
noticeable with 100K. It's only worth testing further if we think that
having more than 200 top-level keys in one JSONB value is going to be a
use case for more than 0.1% of our users. I personally do not.

Yes, but bear in mind that the worst case is exactly at the use case
jsonb was designed to speed up: element access within relatively big
json documents.

Having them uncompressed is to be expected, because people using jsonb will
often favor speed over compactness if it's a tradeoff (otherwise
they'd use plain json).

So while you're right that it's perhaps above what would be a common
use case, the range "somewhere between 200 and 100K" for the tipping
point seems overly imprecise to me.


#128Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/15/2014 12:15 PM, Claudio Freire wrote:

So while you're right that it's perhaps above what would be a common
use case, the range "somewhere between 200 and 100K" for the tipping
point seems overly imprecise to me.

Well, then, you know how to solve that.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#129Claudio Freire
klaussfreire@gmail.com
In reply to: Josh Berkus (#128)
Re: jsonb format is pessimal for toast compression

On Mon, Sep 15, 2014 at 4:17 PM, Josh Berkus <josh@agliodbs.com> wrote:

On 09/15/2014 12:15 PM, Claudio Freire wrote:

So while you're right that it's perhaps above what would be a common
use case, the range "somewhere between 200 and 100K" for the tipping
point seems overly imprecise to me.

Well, then, you know how to solve that.

I was hoping testing with other numbers was as simple as hitting a key for
someone else.

But sure. I'll set something up.


#130Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/15/2014 12:25 PM, Claudio Freire wrote:

On Mon, Sep 15, 2014 at 4:17 PM, Josh Berkus <josh@agliodbs.com> wrote:

On 09/15/2014 12:15 PM, Claudio Freire wrote:

So while you're right that it's perhaps above what would be a common
use case, the range "somewhere between 200 and 100K" for the tipping
point seems overly imprecise to me.

Well, then, you know how to solve that.

I was hoping testing with other numbers was a simple hitting a key for
someone else.

Nope. My test case has a fixed size.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#131Robert Haas
robertmhaas@gmail.com
In reply to: Josh Berkus (#126)
Re: jsonb format is pessimal for toast compression

On Mon, Sep 15, 2014 at 3:09 PM, Josh Berkus <josh@agliodbs.com> wrote:

On 09/15/2014 10:23 AM, Claudio Freire wrote:

Now, large small keys could be 200 or 2000, or even 20k. I'd guess
several should be tested to find the shape of the curve.

Well, we know that it's not noticeable with 200, and that it is
noticeable with 100K. It's only worth testing further if we think that
having more than 200 top-level keys in one JSONB value is going to be a
use case for more than 0.1% of our users. I personally do not.

FWIW, I have written one (1) application that uses JSONB and it has
one sub-object (not the top-level object) that in the most typical
configuration contains precisely 270 keys. Now, granted, that is not
the top-level object, if that distinction is actually relevant here,
but color me just a bit skeptical of this claim anyway. This was just
a casual thing I did for my own use, not anything industrial strength,
so it's hard to believe I'm stressing the system more than 99.9% of
users will.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#132Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/15/2014 02:16 PM, Robert Haas wrote:

On Mon, Sep 15, 2014 at 3:09 PM, Josh Berkus <josh@agliodbs.com> wrote:

On 09/15/2014 10:23 AM, Claudio Freire wrote:

Now, large small keys could be 200 or 2000, or even 20k. I'd guess
several should be tested to find the shape of the curve.

Well, we know that it's not noticeable with 200, and that it is
noticeable with 100K. It's only worth testing further if we think that
having more than 200 top-level keys in one JSONB value is going to be a
use case for more than 0.1% of our users. I personally do not.

FWIW, I have written one (1) application that uses JSONB and it has
one sub-object (not the top-level object) that in the most typical
configuration contains precisely 270 keys. Now, granted, that is not
the top-level object, if that distinction is actually relevant here,
but color me just a bit skeptical of this claim anyway. This was just
a casual thing I did for my own use, not anything industrial strength,
so it's hard to believe I'm stressing the system more than 99.9% of
users will.

Actually, having the keys all at the same level *is* relevant for the
issue we're discussing. If those 270 keys are organized in a tree, it's
not the same as having them all on one level (and not as problematic).

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#133Peter Geoghegan
pg@heroku.com
In reply to: Josh Berkus (#132)
Re: jsonb format is pessimal for toast compression

On Mon, Sep 15, 2014 at 4:05 PM, Josh Berkus <josh@agliodbs.com> wrote:

Actually, having the keys all at the same level *is* relevant for the
issue we're discussing. If those 270 keys are organized in a tree, it's
not the same as having them all on one level (and not as problematic).

I believe Robert meant that the 270 keys are not at the top level, but
are at some level (in other words, some object has 270 pairs). That is
equivalent to having them at the top level for the purposes of this
discussion.

FWIW, I am slightly concerned about weighing use cases around very
large JSON documents too heavily. Having enormous jsonb documents just
isn't going to work out that well, but neither will equivalent designs
in popular document database systems for similar reasons. For example,
the maximum BSON document size supported by MongoDB is 16 megabytes,
and that seems to be something that their users don't care too much
about. Having 270 pairs in an object isn't unreasonable, but it isn't
going to be all that common either.

--
Peter Geoghegan


#134Arthur Silva
arthurprs@gmail.com
In reply to: Arthur Silva (#116)
Re: jsonb format is pessimal for toast compression

I couldn't get my hands on the twitter data but I'm generating my own. The
json template is http://paste2.org/wJ1dfcjw and data was generated with
http://www.json-generator.com/. It has 35 top level keys, just in case
someone is wondering.
I generated 10000 random objects and I'm inserting them repeatedly until I
got 320k rows.

Test query: SELECT data->>'name', data->>'email' FROM t_json
Test storage: EXTERNAL
Test jsonb lengths quartiles: {1278,1587,1731,1871,2231}
Tom's lengths+cache aware: 455ms
HEAD: 440ms

This is a realistic-ish workload in my opinion and Tom's patch performs
within 4% of HEAD.

Due to the overall lengths I couldn't really test compressibility, so I
re-ran the test. This time I inserted an array of 2 objects in each row, as
in: [obj, obj];
The objects were taken in sequence from the 10000 pool so contents match
in both tests.

Test query: SELECT data #> '{0, name}', data #> '{0, email}', data #> '{1,
name}', data #> '{1, email}' FROM t_json
Test storage: EXTENDED
HEAD: 17mb table + 878mb toast
HEAD size quartiles: {2015,2500,2591,2711,3483}
HEAD query runtime: 15s
Tom's: 220mb table + 580mb toast
Tom's size quartiles: {1665,1984,2061,2142.25,2384}
Tom's query runtime: 13s

This is an intriguing edge case in which Tom's patch actually outperforms the
base implementation for 3~4kb jsons.

#135Craig Ringer
craig@2ndquadrant.com
In reply to: Peter Geoghegan (#133)
Re: jsonb format is pessimal for toast compression

On 09/16/2014 07:44 AM, Peter Geoghegan wrote:

FWIW, I am slightly concerned about weighing use cases around very
large JSON documents too heavily. Having enormous jsonb documents just
isn't going to work out that well, but neither will equivalent designs
in popular document database systems for similar reasons. For example,
the maximum BSON document size supported by MongoDB is 16 megabytes,
and that seems to be something that their users don't care too much
about. Having 270 pairs in an object isn't unreasonable, but it isn't
going to be all that common either.

Also, at a certain size the fact that Pg must rewrite the whole document
for any change to it starts to introduce other practical challenges.

Anyway - this is looking like the change will go in, and with it a
catversion bump. Introduction of a jsonb version/flags byte might be
worthwhile at the same time. It seems likely that there'll be more room
for improvement in jsonb, possibly even down to using different formats
for different data.

Is it worth paying a byte per value to save on possible upgrade pain?
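
A sketch of the sort of prefix being suggested (purely hypothetical layout; the real jsonb header is different):

#include <stdint.h>

/*
 * Hypothetical one-byte format tag ahead of the container data, so a
 * future release could switch entry encodings (offsets vs. lengths,
 * or something else entirely) without another on-disk break.
 */
typedef struct
{
    uint8_t format_version;            /* e.g. 1 = offsets, 2 = lengths */
    /* container bytes follow, interpreted according to format_version */
} HypotheticalJsonbPrefix;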

--
Craig Ringer http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#136Robert Haas
robertmhaas@gmail.com
In reply to: Peter Geoghegan (#133)
Re: jsonb format is pessimal for toast compression

On Mon, Sep 15, 2014 at 7:44 PM, Peter Geoghegan <pg@heroku.com> wrote:

On Mon, Sep 15, 2014 at 4:05 PM, Josh Berkus <josh@agliodbs.com> wrote:

Actually, having the keys all at the same level *is* relevant for the
issue we're discussing. If those 270 keys are organized in a tree, it's
not the same as having them all on one level (and not as problematic).

I believe Robert meant that the 270 keys are not at the top level, but
are at some level (in other words, some object has 270 pairs). That is
equivalent to having them at the top level for the purposes of this
discussion.

Yes, that's exactly what I meant.

FWIW, I am slightly concerned about weighing use cases around very
large JSON documents too heavily. Having enormous jsonb documents just
isn't going to work out that well, but neither will equivalent designs
in popular document database systems for similar reasons. For example,
the maximum BSON document size supported by MongoDB is 16 megabytes,
and that seems to be something that their users don't care too much
about. Having 270 pairs in an object isn't unreasonable, but it isn't
going to be all that common either.

The JSON documents in this case were not particularly large. These
objects were < 100kB; they just had a lot of keys. I'm a little
baffled by the apparent theme that people think that (object size) /
(# of keys) will tend to be large. Maybe there will be some instances
where that's the case, but it's not what I'd expect. I would expect
people to use JSON to serialize structured data in situations where
normalizing would be unwieldy.

For example, pick your favorite Facebook or Smartphone game - Plants
vs. Zombies, Farmville, Candy Crush Saga, whatever. Or even a
traditional board game like chess. Think about what the game state
looks like as an abstract object. Almost without exception, you've
got some kind of game board with a bunch of squares and then you have
a bunch of pieces (plants, crops, candies, pawns) that are positioned
on those squares. Now you want to store this in a database. You're
certainly not going to have a table column per square, and EAV would
be stupid, so what's left? You could use an array, but an array of
strings might not be descriptive enough; for a square in Farmville,
for example, you might need to know the type of crop, and whether it
was fertilized with special magic fertilizer, and when it's going to
be ready to harvest, and when it'll wither if not harvested. So a
JSON is a pretty natural structure: an array of arrays of objects. If
you have a 30x30 farm, you'll have 900 keys. If you have a 50x50
farm, which probably means you're spending real money to buy imaginary
plants, you'll have 2500 keys.

(For the record, I have no actual knowledge of how any of these games
are implemented under the hood. I'm just speculating on how I would
have done it.)

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#137Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/16/2014 06:31 AM, Robert Haas wrote:

On Mon, Sep 15, 2014 at 7:44 PM, Peter Geoghegan <pg@heroku.com> wrote:

On Mon, Sep 15, 2014 at 4:05 PM, Josh Berkus <josh@agliodbs.com> wrote:

Actually, having the keys all at the same level *is* relevant for the
issue we're discussing. If those 270 keys are organized in a tree, it's
not the same as having them all on one level (and not as problematic).

I believe Robert meant that the 270 keys are not at the top level, but
are at some level (in other words, some object has 270 pairs). That is
equivalent to having them at the top level for the purposes of this
discussion.

Yes, that's exactly what I meant.

FWIW, I am slightly concerned about weighing use cases around very
large JSON documents too heavily. Having enormous jsonb documents just
isn't going to work out that well, but neither will equivalent designs
in popular document database systems for similar reasons. For example,
the maximum BSON document size supported by MongoDB is 16 megabytes,
and that seems to be something that their users don't care too much
about. Having 270 pairs in an object isn't unreasonable, but it isn't
going to be all that common either.

Well, I can only judge from the use cases I personally have, none of
which involve more than 100 keys at any level for most rows. So far
I've seen some people argue hypothetical use cases involving hundreds of
keys per level, but nobody who *actually* has such a use case. Also,
note that we currently don't know where the "last value" extraction
becomes a performance problem at this stage, except that it's somewhere
between 200 and 100,000. Also, we don't have a test which shows the
hybrid approach (Heikki's patch) performing better with 1000's of keys.

Basically, if someone is going to make a serious case for Heikki's
hybrid approach over the simpler lengths-only approach, then please post
some test data showing the benefit ASAP, since I can't demonstrate it.
Otherwise, let's get beta 3 out the door so we can get the 9.4 release
train moving again.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#138Robert Haas
robertmhaas@gmail.com
In reply to: Josh Berkus (#137)
Re: jsonb format is pessimal for toast compression

On Tue, Sep 16, 2014 at 12:47 PM, Josh Berkus <josh@agliodbs.com> wrote:

On 09/16/2014 06:31 AM, Robert Haas wrote:

On Mon, Sep 15, 2014 at 7:44 PM, Peter Geoghegan <pg@heroku.com> wrote:

On Mon, Sep 15, 2014 at 4:05 PM, Josh Berkus <josh@agliodbs.com> wrote:

Actually, having the keys all at the same level *is* relevant for the
issue we're discussing. If those 270 keys are organized in a tree, it's
not the same as having them all on one level (and not as problematic).

I believe Robert meant that the 270 keys are not at the top level, but
are at some level (in other words, some object has 270 pairs). That is
equivalent to having them at the top level for the purposes of this
discussion.

Yes, that's exactly what I meant.

FWIW, I am slightly concerned about weighing use cases around very
large JSON documents too heavily. Having enormous jsonb documents just
isn't going to work out that well, but neither will equivalent designs
in popular document database systems for similar reasons. For example,
the maximum BSON document size supported by MongoDB is 16 megabytes,
and that seems to be something that their users don't care too much
about. Having 270 pairs in an object isn't unreasonable, but it isn't
going to be all that common either.

Well, I can only judge from the use cases I personally have, none of
which involve more than 100 keys at any level for most rows. So far
I've seen some people argue hypotetical use cases involving hundreds of
keys per level, but nobody who *actually* has such a use case.

I already told you that I did, and that it was the one and only app I
had written for JSONB.

Also,
note that we currently don't know where the "last value" extraction
becomes a performance problem at this stage, except that it's somewhere
between 200 and 100,000. Also, we don't have a test which shows the
hybrid approach (Heikki's patch) performing better with 1000's of keys.

Fair point.

Basically, if someone is going to make a serious case for Heikki's
hybrid approach over the simpler lengths-only approach, then please post
some test data showing the benefit ASAP, since I can't demonstrate it.
Otherwise, let's get beta 3 out the door so we can get the 9.4 release
train moving again.

I don't personally care about this enough to spend more time on it. I
told you my extremely limited experience because it seems to
contradict your broader experience. If you don't care, you don't
care.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#139Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/16/2014 09:54 AM, Robert Haas wrote:

On Tue, Sep 16, 2014 at 12:47 PM, Josh Berkus <josh@agliodbs.com> wrote:

On 09/16/2014 06:31 AM, Robert Haas wrote:

On Mon, Sep 15, 2014 at 7:44 PM, Peter Geoghegan <pg@heroku.com> wrote:

On Mon, Sep 15, 2014 at 4:05 PM, Josh Berkus <josh@agliodbs.com> wrote:

Actually, having the keys all at the same level *is* relevant for the
issue we're discussing. If those 270 keys are organized in a tree, it's
not the same as having them all on one level (and not as problematic).

I believe Robert meant that the 270 keys are not at the top level, but
are at some level (in other words, some object has 270 pairs). That is
equivalent to having them at the top level for the purposes of this
discussion.

Yes, that's exactly what I meant.

FWIW, I am slightly concerned about weighing use cases around very
large JSON documents too heavily. Having enormous jsonb documents just
isn't going to work out that well, but neither will equivalent designs
in popular document database systems for similar reasons. For example,
the maximum BSON document size supported by MongoDB is 16 megabytes,
and that seems to be something that their users don't care too much
about. Having 270 pairs in an object isn't unreasonable, but it isn't
going to be all that common either.

Well, I can only judge from the use cases I personally have, none of
which involve more than 100 keys at any level for most rows. So far
I've seen some people argue hypotetical use cases involving hundreds of
keys per level, but nobody who *actually* has such a use case.

I already told you that I did, and that it was the only and only app I
had written for JSONB.

Ah, ok, I thought yours was a test case. Did you check how it performed
on the two patches at all? My tests with 185 keys didn't show any
difference, including for a "last key" case.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#140Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Josh Berkus (#137)
1 attachment(s)
Re: jsonb format is pessimal for toast compression

On 09/16/2014 07:47 PM, Josh Berkus wrote:

On 09/16/2014 06:31 AM, Robert Haas wrote:

On Mon, Sep 15, 2014 at 7:44 PM, Peter Geoghegan <pg@heroku.com> wrote:

On Mon, Sep 15, 2014 at 4:05 PM, Josh Berkus <josh@agliodbs.com> wrote:

Actually, having the keys all at the same level *is* relevant for the
issue we're discussing. If those 270 keys are organized in a tree, it's
not the same as having them all on one level (and not as problematic).

I believe Robert meant that the 270 keys are not at the top level, but
are at some level (in other words, some object has 270 pairs). That is
equivalent to having them at the top level for the purposes of this
discussion.

Yes, that's exactly what I meant.

FWIW, I am slightly concerned about weighing use cases around very
large JSON documents too heavily. Having enormous jsonb documents just
isn't going to work out that well, but neither will equivalent designs
in popular document database systems for similar reasons. For example,
the maximum BSON document size supported by MongoDB is 16 megabytes,
and that seems to be something that their users don't care too much
about. Having 270 pairs in an object isn't unreasonable, but it isn't
going to be all that common either.

Well, I can only judge from the use cases I personally have, none of
which involve more than 100 keys at any level for most rows. So far
I've seen some people argue hypothetical use cases involving hundreds of
keys per level, but nobody who *actually* has such a use case. Also,
note that we currently don't know where the "last value" extraction
becomes a performance problem at this stage, except that it's somewhere
between 200 and 100,000. Also, we don't have a test which shows the
hybrid approach (Heikki's patch) performing better with 1000's of keys.

Basically, if someone is going to make a serious case for Heikki's
hybrid approach over the simpler lengths-only approach, then please post
some test data showing the benefit ASAP, since I can't demonstrate it.
Otherwise, let's get beta 3 out the door so we can get the 9.4 release
train moving again.

Are you looking for someone with a real-life scenario, or just a synthetic
test case? The latter is easy to do.

See the attached test program. It's basically the same one I posted earlier.
Here are the results from my laptop with Tom's jsonb-lengths-merged.patch:

postgres=# select * from testtimes ;
elem | duration_ms
------+-------------
11 | 0.289508
12 | 0.288122
13 | 0.290558
14 | 0.287889
15 | 0.286303
17 | 0.290415
19 | 0.289829
21 | 0.289783
23 | 0.287104
25 | 0.289834
28 | 0.290735
31 | 0.291844
34 | 0.293454
37 | 0.293866
41 | 0.291217
45 | 0.289243
50 | 0.290385
55 | 0.292085
61 | 0.290892
67 | 0.292335
74 | 0.292561
81 | 0.291416
89 | 0.295714
98 | 0.29844
108 | 0.297421
119 | 0.299471
131 | 0.299877
144 | 0.301604
158 | 0.303365
174 | 0.304203
191 | 0.303596
210 | 0.306526
231 | 0.304189
254 | 0.307782
279 | 0.307372
307 | 0.306873
338 | 0.310471
372 | 0.3151
409 | 0.320354
450 | 0.32038
495 | 0.322127
545 | 0.323256
600 | 0.330419
660 | 0.334226
726 | 0.336951
799 | 0.34108
879 | 0.347746
967 | 0.354275
1064 | 0.356696
1170 | 0.366906
1287 | 0.375352
1416 | 0.392952
1558 | 0.392907
1714 | 0.402157
1885 | 0.412384
2074 | 0.425958
2281 | 0.435415
2509 | 0.45301
2760 | 0.469983
3036 | 0.487329
3340 | 0.505505
3674 | 0.530412
4041 | 0.552585
4445 | 0.581815
4890 | 0.610509
5379 | 0.642885
5917 | 0.680395
6509 | 0.713849
7160 | 0.757561
7876 | 0.805225
8664 | 0.856142
9530 | 0.913255
(72 rows)

That's up to 9530 elements - it's pretty easy to extrapolate from there
to higher counts, since it's O(n).

With unpatched git master, the runtime is flat, regardless of which
element is queried, at about 0.29 ms. With
jsonb-with-offsets-and-lengths-2.patch, there's no difference that I
could measure.
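
(The attached jsonb-lengths.sql is not reproduced in this archive; as a rough
sketch of the same kind of synthetic test -- table and column names invented
here for illustration, not taken from the attachment -- one could do
something like:

-- Build jsonb arrays of increasing size, then extract the last element of
-- each.  With an all-lengths format each extraction is O(n); with offsets,
-- or the hybrid format, it stays roughly constant.
CREATE TABLE jsonb_bench AS
SELECT n,
       (SELECT json_agg(repeat('x', 26))
          FROM generate_series(1, n))::jsonb AS doc
FROM generate_series(100, 10000, 100) AS n;

-- Run with \timing on, comparing patched vs. unpatched builds.
SELECT n, doc -> (n - 1) FROM jsonb_bench;
)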

The difference starts to be meaningful at around 500 entries. In
practice, I doubt anyone's going to notice until you start talking about
tens of thousands of entries.

I'll leave it up to the jury to decide if we care or not. It seems like
a fairly unusual use case, where you push around large enough arrays or
objects to notice. Then again, I'm sure *someone* will do it. People do
strange things, and they find ways to abuse the features that the
original developers didn't think of.

- Heikki

Attachments:

jsonb-lengths.sql (application/sql)
#141Claudio Freire
klaussfreire@gmail.com
In reply to: Heikki Linnakangas (#140)
Re: jsonb format is pessimal for toast compression

On Tue, Sep 16, 2014 at 3:12 PM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

I'll leave it up to the jury to decide if we care or not. It seems like a
fairly unusual use case, where you push around large enough arrays or
objects to notice. Then again, I'm sure *someone* will do it. People do
strange things, and they find ways to abuse the features that the original
developers didn't think of.

Again, it's not abusing the feature. It's using it. Jsonb is
supposed to be fast for this.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#142Robert Haas
robertmhaas@gmail.com
In reply to: Josh Berkus (#139)
Re: jsonb format is pessimal for toast compression

On Tue, Sep 16, 2014 at 1:11 PM, Josh Berkus <josh@agliodbs.com> wrote:

Well, I can only judge from the use cases I personally have, none of
which involve more than 100 keys at any level for most rows. So far
I've seen some people argue hypothetical use cases involving hundreds of
keys per level, but nobody who *actually* has such a use case.

I already told you that I did, and that it was the one and only app I
had written for JSONB.

Ah, ok, I thought yours was a test case. Did you check how it performed
on the two patches at all? My tests with 185 keys didn't show any
difference, including for a "last key" case.

No, I didn't test it. But I think Heikki's test results pretty much
tell us everything there is to see here. This isn't really that
complicated; I've read a few papers on index compression over the
years and they seem to often use techniques that have the same general
flavor as what Heikki did here, adding complexity in the data format
to gain other advantages. So I don't think we should be put off.

Basically, I think that if we make a decision to use Tom's patch
rather than Heikki's patch, we're deciding that the initial decision,
by the folks who wrote the original jsonb code, to make array access
less than O(n) was misguided. While that could be true, I'd prefer to
bet that those folks knew what they were doing. The only reason
we're even considering changing it is that the array of lengths
doesn't compress well, and we've got an approach that fixes that
problem while preserving the advantages of fast lookup. We should
have a darn fine reason to say no to that approach, and "it didn't
benefit my particular use case" is not it.

In practice, I'm not very surprised that the impact doesn't seem too
bad when you're running SQL queries from the client. There's so much
other overhead, for de-TOASTing and client communication and even just
planner and executor costs, that this gets lost in the noise. But
think about a PL/pgsql procedure, say, where somebody might loop over
all of the elements in an array. If those operations go from O(1) to
O(n), then the loop goes from O(n) to O(n^2). I will bet you a
beverage of your choice that somebody will find that behavior within a
year of release and be dismayed by it.
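
(A minimal sketch of the kind of loop being described -- not code from the
thread, and the sizes are arbitrary; the point is only that each per-element
lookup costs O(i) under a lengths-only format, so the whole loop becomes
O(n^2):

DO $$
DECLARE
    doc   jsonb := (SELECT json_agg(g) FROM generate_series(1, 10000) g)::jsonb;
    total numeric := 0;
BEGIN
    -- Under the all-lengths format, each doc ->> i has to walk the
    -- preceding lengths, so this loop degrades from O(n) to O(n^2).
    FOR i IN 0 .. jsonb_array_length(doc) - 1 LOOP
        total := total + (doc ->> i)::numeric;
    END LOOP;
    RAISE NOTICE 'sum = %', total;
END;
$$;
)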

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#143Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

Heikki, Robert:

On 09/16/2014 11:12 AM, Heikki Linnakangas wrote:

Are you looking for someone with a real-life scenario, or just a synthetic
test case? The latter is easy to do.

See the attached test program. It's basically the same one I posted earlier.
Here are the results from my laptop with Tom's jsonb-lengths-merged.patch:

Thanks for that!

postgres=# select * from testtimes ;
elem | duration_ms
------+-------------
3674 | 0.530412
4041 | 0.552585
4445 | 0.581815

This looks like the level at which the difference gets to be really
noticeable. Note that this is completely swamped by the difference
between compressed vs. uncompressed though.

With unpatched git master, the runtime is flat, regardless of which
element is queried, at about 0.29 ms. With
jsonb-with-offsets-and-lengths-2.patch, there's no difference that I
could measure.

OK, thanks.

The difference starts to be meaningful at around 500 entries. In
practice, I doubt anyone's going to notice until you start talking about
tens of thousands of entries.

I'll leave it up to the jury to decide if we care or not. It seems like
a fairly unusual use case, where you push around large enough arrays or
objects to notice. Then again, I'm sure *someone* will do it. People do
strange things, and they find ways to abuse the features that the
original developers didn't think of.

Right, but the question is whether it's worth having more complex code
and data structures in order to support what certainly *seems* to be a
fairly obscure use-case, that is more than 4000 keys at the same level.
And it's not like it stops working or becomes completely unresponsive
at that level; it's just double the response time.

On 09/16/2014 12:20 PM, Robert Haas wrote:

Basically, I think that if we make a decision to use Tom's patch
rather than Heikki's patch, we're deciding that the initial decision,
by the folks who wrote the original jsonb code, to make array access
less than O(n) was misguided. While that could be true, I'd prefer to
bet that those folks knew what they were doing. The only reason
we're even considering changing it is that the array of lengths
doesn't compress well, and we've got an approach that fixes that
problem while preserving the advantages of fast lookup. We should
have a darn fine reason to say no to that approach, and "it didn't
benefit my particular use case" is not it.

Do you feel that way *as a code maintainer*? That is, if you ended up
maintaining the JSONB code, would you still feel that it's worth the
extra complexity? Because that will be the main cost here.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#144Robert Haas
robertmhaas@gmail.com
In reply to: Josh Berkus (#143)
Re: jsonb format is pessimal for toast compression

On Tue, Sep 16, 2014 at 3:24 PM, Josh Berkus <josh@agliodbs.com> wrote:

Do you feel that way *as a code maintainer*? That is, if you ended up
maintaining the JSONB code, would you still feel that it's worth the
extra complexity? Because that will be the main cost here.

I feel that Heikki doesn't have a reputation for writing or committing
unmaintainable code.

I haven't reviewed the patch.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#145Petr Jelinek
petr@2ndquadrant.com
In reply to: Robert Haas (#142)
Re: jsonb format is pessimal for toast compression

On 16/09/14 21:20, Robert Haas wrote:

In practice, I'm not very surprised that the impact doesn't seem too
bad when you're running SQL queries from the client. There's so much
other overhead, for de-TOASTing and client communication and even just
planner and executor costs, that this gets lost in the noise. But
think about a PL/pgsql procedure, say, where somebody might loop over
all of the elements in an array. If those operations go from O(1) to
O(n), then the loop goes from O(n) to O(n^2). I will bet you a
beverage of your choice that somebody will find that behavior within a
year of release and be dismayed by it.

As somebody who did see a server melt (quite literally that time,
unfortunately) thanks to the CPU overhead of operations on varlena
arrays: +1 (in fact +many).

Especially if we are trying to promote the json improvements in 9.4 as
a "best of both worlds" kind of thing.

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#146Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Robert Haas (#144)
Re: jsonb format is pessimal for toast compression

On 09/16/2014 10:37 PM, Robert Haas wrote:

On Tue, Sep 16, 2014 at 3:24 PM, Josh Berkus <josh@agliodbs.com> wrote:

Do you feel that way *as a code maintainer*? That is, if you ended up
maintaining the JSONB code, would you still feel that it's worth the
extra complexity? Because that will be the main cost here.

I feel that Heikki doesn't have a reputation for writing or committing
unmaintainable code.

I haven't reviewed the patch.

The patch I posted was not pretty, but I'm sure it could be refined to
something sensible.

There are many possible variations of the basic scheme of storing mostly
lengths, but an offset for every N elements. I replaced the length with an
offset on some elements and used a flag bit to indicate which it is.
Perhaps a simpler approach would be to store lengths, but also store a
separate smaller array of offsets, after the lengths array.

I can write a patch if we want to go that way.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#147Arthur Silva
arthurprs@gmail.com
In reply to: Robert Haas (#142)
Re: jsonb format is pessimal for toast compression

On Tue, Sep 16, 2014 at 4:20 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Tue, Sep 16, 2014 at 1:11 PM, Josh Berkus <josh@agliodbs.com> wrote:

Well, I can only judge from the use cases I personally have, none of
which involve more than 100 keys at any level for most rows. So far
I've seen some people argue hypothetical use cases involving hundreds of
keys per level, but nobody who *actually* has such a use case.

I already told you that I did, and that it was the one and only app I
had written for JSONB.

Ah, ok, I thought yours was a test case. Did you check how it performed
on the two patches at all? My tests with 185 keys didn't show any
difference, including for a "last key" case.

No, I didn't test it. But I think Heikki's test results pretty much
tell us everything there is to see here. This isn't really that
complicated; I've read a few papers on index compression over the
years and they seem to often use techniques that have the same general
flavor as what Heikki did here, adding complexity in the data format
to gain other advantages. So I don't think we should be put off.

I second this reasoning. Even though I ran a couple of very realistic test
cases that support all-lengths, I do feel that the hybrid approach would be
better, as it covers all bases. To put things in perspective, Tom's latest
patch isn't much simpler either.

Since it would still be a breaking change, we should consider changing the
layout to key-key-key-value-value-value, as it seems to pay off.

Basically, I think that if we make a decision to use Tom's patch
rather than Heikki's patch, we're deciding that the initial decision,
by the folks who wrote the original jsonb code, to make array access
less than O(n) was misguided. While that could be true, I'd prefer to
bet that those folks knew what they were doing. The only reason
we're even considering changing it is that the array of lengths
doesn't compress well, and we've got an approach that fixes that
problem while preserving the advantages of fast lookup. We should
have a darn fine reason to say no to that approach, and "it didn't
benefit my particular use case" is not it.

In practice, I'm not very surprised that the impact doesn't seem too
bad when you're running SQL queries from the client. There's so much
other overhead, for de-TOASTing and client communication and even just
planner and executor costs, that this gets lost in the noise. But
think about a PL/pgsql procedure, say, where somebody might loop over
all of the elements in an array. If those operations go from O(1) to
O(n), then the loop goes from O(n) to O(n^2). I will bet you a
beverage of your choice that somebody will find that behavior within a
year of release and be dismayed by it.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

In reply to: Robert Haas (#142)
Re: jsonb format is pessimal for toast compression

----- Quote from Robert Haas (robertmhaas@gmail.com), on 16.09.2014 at 22:20 -----

In practice, I'm not very surprised that the impact doesn't seem too
bad when you're running SQL queries from the client. There's so much
other overhead, for de-TOASTing and client communication and even just
planner and executor costs, that this gets lost in the noise. But
think about a PL/pgsql procedure, say, where somebody might loop over
all of the elements in an array. If those operations go from O(1) to
O(n), then the loop goes from O(n) to O(n^2). I will bet you a
beverage of your choice that somebody will find that behavior within a
year of release and be dismayed by it.

Hi,

I can imagine a situation exactly like that. We could use a jsonb object to
represent sparse vectors in the database, where the key is the dimension
and the value is the value. So they could easily grow to thousands of
dimensions. Once you have that in the database it is easy to go and
write some simple numeric computations on these vectors; let's say you
want a dot product of 2 sparse vectors. If random access inside one
vector goes to O(n^2), then the dot product computation goes to
O(n^2*m^2), so not pretty.
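
(A hedged sketch of what that could look like -- the function and names below
are invented for illustration, not anything from the thread. Key search in a
jsonb object is a binary search, but under the all-lengths format locating
each value still means summing the preceding lengths, which is where the
per-probe cost comes from:

-- Sparse vectors stored as jsonb objects: {"dimension": value, ...}
CREATE OR REPLACE FUNCTION sparse_dot(a jsonb, b jsonb) RETURNS numeric AS $$
    -- Walk the keys of a and probe b for each one; every probe into b is a
    -- random access into a potentially large object.
    SELECT coalesce(sum(av.value::numeric * (b ->> av.key)::numeric), 0)
    FROM jsonb_each_text(a) AS av
    WHERE b ? av.key;
$$ LANGUAGE sql STABLE;

SELECT sparse_dot('{"3": 1.5, "17": 2.0}', '{"17": 4.0, "99": 1.0}');
-- => 8.00
)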

I am not saying that the DB is the right place to do this type of
computation, but it is sometimes convenient to have it in the DB as well.

Regards,
luben

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#149Tom Lane
tgl@sss.pgh.pa.us
In reply to: Heikki Linnakangas (#146)
Re: jsonb format is pessimal for toast compression

Heikki Linnakangas <hlinnakangas@vmware.com> writes:

On 09/16/2014 10:37 PM, Robert Haas wrote:

On Tue, Sep 16, 2014 at 3:24 PM, Josh Berkus <josh@agliodbs.com> wrote:

Do you feel that way *as a code maintainer*? That is, if you ended up
maintaining the JSONB code, would you still feel that it's worth the
extra complexity? Because that will be the main cost here.

I feel that Heikki doesn't have a reputation for writing or committing
unmaintainable code.
I haven't reviewed the patch.

The patch I posted was not pretty, but I'm sure it could be refined to
something sensible.

We're somewhat comparing apples and oranges here, in that I pushed my
approach to something that I think is of committable quality (and which,
not incidentally, fixes some existing bugs that we'd need to fix in any
case); while Heikki's patch was just proof-of-concept. It would be worth
pushing Heikki's patch to committable quality so that we had a more
complete understanding of just what the complexity difference really is.

There are many possible variations of the basic scheme of storing mostly
lengths, but an offset for every N elements. I replaced the length with
offset on some element and used a flag bit to indicate which it is.

Aside from the complexity issue, a demerit of Heikki's solution is that it
eats up a flag bit that we may well wish we had back later. On the other
hand, there's definitely something to be said for not breaking
pg_upgrade-ability of 9.4beta databases.

Perhaps a simpler approach would be to store lengths, but also store a
separate smaller array of offsets, after the lengths array.

That way would also give up on-disk compatibility, and I'm not sure it's
any simpler in practice than your existing solution.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#150Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/16/2014 08:45 PM, Tom Lane wrote:

We're somewhat comparing apples and oranges here, in that I pushed my
approach to something that I think is of committable quality (and which,
not incidentally, fixes some existing bugs that we'd need to fix in any
case); while Heikki's patch was just proof-of-concept. It would be worth
pushing Heikki's patch to committable quality so that we had a more
complete understanding of just what the complexity difference really is.

Is anyone actually working on this?

If not, I'm voting for the all-lengths patch so that we can get 9.4 out
the door.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#151Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Josh Berkus (#150)
Re: jsonb format is pessimal for toast compression

On 09/18/2014 07:53 PM, Josh Berkus wrote:

On 09/16/2014 08:45 PM, Tom Lane wrote:

We're somewhat comparing apples and oranges here, in that I pushed my
approach to something that I think is of committable quality (and which,
not incidentally, fixes some existing bugs that we'd need to fix in any
case); while Heikki's patch was just proof-of-concept. It would be worth
pushing Heikki's patch to committable quality so that we had a more
complete understanding of just what the complexity difference really is.

Is anyone actually working on this?

If not, I'm voting for the all-lengths patch so that we can get 9.4 out
the door.

I'll try to write a more polished patch tomorrow. We'll then see what it
looks like, and can decide if we want it.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#152Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Heikki Linnakangas (#151)
2 attachment(s)
Re: jsonb format is pessimal for toast compression

On 09/18/2014 09:27 PM, Heikki Linnakangas wrote:

On 09/18/2014 07:53 PM, Josh Berkus wrote:

On 09/16/2014 08:45 PM, Tom Lane wrote:

We're somewhat comparing apples and oranges here, in that I pushed my
approach to something that I think is of committable quality (and which,
not incidentally, fixes some existing bugs that we'd need to fix in any
case); while Heikki's patch was just proof-of-concept. It would be worth
pushing Heikki's patch to committable quality so that we had a more
complete understanding of just what the complexity difference really is.

Is anyone actually working on this?

If not, I'm voting for the all-lengths patch so that we can get 9.4 out
the door.

I'll try to write a more polished patch tomorrow. We'll then see what it
looks like, and can decide if we want it.

Ok, here are two patches. One is a refined version of my earlier patch,
and the other implements the separate offsets array approach. They are
both based on Tom's jsonb-lengths-merged.patch, so they include all the
whitespace fixes etc. he mentioned.

There is no big difference in terms of code complexity between the
patches. IMHO the separate offsets array is easier to understand, but it
makes for more complicated accessor macros to find the beginning of the
variable-length data.

Unlike Tom's patch, these patches don't cache any offsets when doing a
binary search. Doesn't seem worth it, when the access time is O(1) anyway.

Both of these patches have a #define JB_OFFSET_STRIDE for the "stride
size". For the separate offsets array, the offsets array has one element
for every JB_OFFSET_STRIDE children. For the other patch, every
JB_OFFSET_STRIDE child stores the end offset, while others store the
length. A smaller value makes random access faster, at the cost of
compressibility / on-disk size. I haven't done any measurements to find
the optimal value, the values in the patches are arbitrary.
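
(For a rough sense of the tradeoff: with the stride of 16 used in these
patches, reconstructing the start offset of any child means jumping back to
the nearest stored end offset, at most 15 entries away, and adding up the
lengths in between -- a bounded amount of work no matter how many thousands
of children the container has. Meanwhile only one stored offset per 16
children is monotonically increasing, so the bulk of the JEntry data stays
about as compressible as in the all-lengths format.)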

I think we should bite the bullet and break compatibility with 9.4beta2
format, even if we go with "my patch". In a jsonb object, it makes sense
to store all the keys first, like Tom did, because of cache benefits,
and the future possibility to do smart EXTERNAL access. Also, even if we
can make the on-disk format compatible, it's weird that you can get
different runtime behavior with datums created with a beta version.
Seems more clear to just require a pg_dump + restore.

Tom: You mentioned earlier that your patch fixes some existing bugs.
What were they? There were a bunch of whitespace and comment fixes that
we should apply in any case, but I couldn't see any actual bugs. I think
we should apply those fixes separately, to make sure we don't forget
about them, and to make it easier to review these patches.

- Heikki

Attachments:

jsonb-with-offsets-and-lengths-3.patch (text/x-diff):
diff --git a/src/backend/utils/adt/jsonb.c b/src/backend/utils/adt/jsonb.c
index 2fd87fc..456011a 100644
--- a/src/backend/utils/adt/jsonb.c
+++ b/src/backend/utils/adt/jsonb.c
@@ -196,12 +196,12 @@ jsonb_from_cstring(char *json, int len)
 static size_t
 checkStringLen(size_t len)
 {
-	if (len > JENTRY_POSMASK)
+	if (len > JENTRY_LENMASK)
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 				 errmsg("string too long to represent as jsonb string"),
 				 errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.",
-						   JENTRY_POSMASK)));
+						   JENTRY_LENMASK)));
 
 	return len;
 }
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..7f7ed4f 100644
--- a/src/backend/utils/adt/jsonb_util.c
+++ b/src/backend/utils/adt/jsonb_util.c
@@ -26,15 +26,16 @@
  * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
  * reserved for that in the JsonbContainer.header field.
  *
- * (the total size of an array's elements is also limited by JENTRY_POSMASK,
- * but we're not concerned about that here)
+ * (The total size of an array's or object's elements is also limited by
+ * JENTRY_LENMASK, but we're not concerned about that here.)
  */
 #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
 #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
 
-static void fillJsonbValue(JEntry *array, int index, char *base_addr,
+static void fillJsonbValue(JEntry *children, int index,
+			   char *base_addr, uint32 offset,
 			   JsonbValue *result);
-static bool	equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
+static bool equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
 static int	compareJsonbScalarValue(JsonbValue *a, JsonbValue *b);
 static Jsonb *convertToJsonb(JsonbValue *val);
 static void convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
@@ -42,7 +43,7 @@ static void convertJsonbArray(StringInfo buffer, JEntry *header, JsonbValue *val
 static void convertJsonbObject(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
 static void convertJsonbScalar(StringInfo buffer, JEntry *header, JsonbValue *scalarVal);
 
-static int reserveFromBuffer(StringInfo buffer, int len);
+static int	reserveFromBuffer(StringInfo buffer, int len);
 static void appendToBuffer(StringInfo buffer, const char *data, int len);
 static void copyToBuffer(StringInfo buffer, int offset, const char *data, int len);
 static short padBufferToInt(StringInfo buffer);
@@ -108,6 +109,57 @@ JsonbValueToJsonb(JsonbValue *val)
 }
 
 /*
+ * Get the offset of the variable-length portion of a Jsonb node within
+ * the variable-length-data part of its container.  The node is identified
+ * by index within the container's JEntry array.
+ */
+static uint32
+getJsonbOffset(const JsonbContainer *jc, int index)
+{
+	int			i;
+	uint32		offset = 0;
+
+	/*
+	 * Start offset of this entry is equal to the end offset of the previous
+	 * entry. Walk backwards to the previous entry stored as an end offset,
+	 * and count up the lengths in between.
+	 */
+	for (i = index - 1; i >= 0; i--)
+	{
+		offset += JBE_LENFLD(jc->children[i]);
+		if (!JBE_HAS_LEN(jc->children[i]))
+			break;
+	}
+
+	return offset;
+}
+
+/*
+ * Get the length of a child in a container.
+ */
+static uint32
+getJsonbLength(const JsonbContainer *jc, int index)
+{
+	uint32          off;
+	uint32          len;
+
+	/*
+	 * If the length is stored directly in the JEntry, return that. Otherwise,
+	 * get the begin offset of the entry, and subtract it from the stored end
+	 * offset.
+	 */
+	if (JBE_HAS_LEN(jc->children[index]))
+		len = JBE_LENFLD(jc->children[index]);
+	else
+	{
+		off = getJsonbOffset(jc, index);
+		len = JBE_LENFLD(jc->children[index]) - off;
+	}
+
+	return len;
+}
+
+/*
  * BT comparator worker function.  Returns an integer less than, equal to, or
  * greater than zero, indicating whether a is less than, equal to, or greater
  * than b.  Consistent with the requirements for a B-Tree operator class
@@ -201,7 +253,7 @@ compareJsonbContainers(JsonbContainer *a, JsonbContainer *b)
 			 *
 			 * If the two values were of the same container type, then there'd
 			 * have been a chance to observe the variation in the number of
-			 * elements/pairs (when processing WJB_BEGIN_OBJECT, say).  They're
+			 * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're
 			 * either two heterogeneously-typed containers, or a container and
 			 * some scalar type.
 			 *
@@ -271,62 +323,72 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags,
 							JsonbValue *key)
 {
 	JEntry	   *children = container->children;
+	char	   *base_addr = JB_CONTAINER_CHILD_DATA(container);
 	int			count = (container->header & JB_CMASK);
-	JsonbValue *result = palloc(sizeof(JsonbValue));
+	JsonbValue *result;
 
 	Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0);
 
+	/* Quick out without a palloc cycle if object/array is empty */
+	if (count <= 0)
+		return NULL;
+
+	result = palloc(sizeof(JsonbValue));
+
 	if (flags & JB_FARRAY & container->header)
 	{
-		char	   *base_addr = (char *) (children + count);
+		uint32		offset = 0;
 		int			i;
 
 		for (i = 0; i < count; i++)
 		{
-			fillJsonbValue(children, i, base_addr, result);
+			fillJsonbValue(children, i, base_addr, offset, result);
 
 			if (key->type == result->type)
 			{
 				if (equalsJsonbScalarValue(key, result))
 					return result;
 			}
+
+			if (JBE_HAS_LEN(children[i]))
+				offset += JBE_LENFLD(children[i]);
+			else
+				offset = JBE_LENFLD(children[i]);
 		}
 	}
 	else if (flags & JB_FOBJECT & container->header)
 	{
-		/* Since this is an object, account for *Pairs* of Jentrys */
-		char	   *base_addr = (char *) (children + count * 2);
+		uint32		lastoff;
 		uint32		stopLow = 0,
-					stopMiddle;
+					stopHigh = count;
 
-		/* Object key past by caller must be a string */
+		/* Object key passed by caller must be a string */
 		Assert(key->type == jbvString);
 
 		/* Binary search on object/pair keys *only* */
-		while (stopLow < count)
+		while (stopLow < stopHigh)
 		{
-			int			index;
+			uint32		stopMiddle;
 			int			difference;
 			JsonbValue	candidate;
 
-			/*
-			 * Note how we compensate for the fact that we're iterating
-			 * through pairs (not entries) throughout.
-			 */
-			stopMiddle = stopLow + (count - stopLow) / 2;
-
-			index = stopMiddle * 2;
+			stopMiddle = stopLow + (stopHigh - stopLow) / 2;
 
 			candidate.type = jbvString;
-			candidate.val.string.val = base_addr + JBE_OFF(children, index);
-			candidate.val.string.len = JBE_LEN(children, index);
+			candidate.val.string.val =
+				base_addr + getJsonbOffset(container, stopMiddle);
+			candidate.val.string.len = getJsonbLength(container, stopMiddle);
 
 			difference = lengthCompareJsonbStringValue(&candidate, key);
 
 			if (difference == 0)
 			{
-				/* Found our key, return value */
-				fillJsonbValue(children, index + 1, base_addr, result);
+				/* Found our key, return corresponding value */
+				int			index = stopMiddle + count;
+
+				lastoff = getJsonbOffset(container, index);
+
+				fillJsonbValue(children, index, base_addr, lastoff, result);
 
 				return result;
 			}
@@ -335,7 +397,7 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags,
 				if (difference < 0)
 					stopLow = stopMiddle + 1;
 				else
-					count = stopMiddle;
+					stopHigh = stopMiddle;
 			}
 		}
 	}
@@ -361,14 +423,16 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i)
 		elog(ERROR, "not a jsonb array");
 
 	nelements = container->header & JB_CMASK;
-	base_addr = (char *) &container->children[nelements];
+	base_addr = JB_CONTAINER_CHILD_DATA(container);
 
 	if (i >= nelements)
 		return NULL;
 
 	result = palloc(sizeof(JsonbValue));
 
-	fillJsonbValue(container->children, i, base_addr, result);
+	fillJsonbValue(container->children, i, base_addr,
+				   getJsonbOffset(container, i),
+				   result);
 
 	return result;
 }
@@ -377,11 +441,17 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i)
  * A helper function to fill in a JsonbValue to represent an element of an
  * array, or a key or value of an object.
  *
+ * The node's JEntry is at children[index], and its variable-length data
+ * is at base_addr + offset.  We make the caller determine the offset since
+ * in many cases the caller can amortize the work across multiple children.
+ *
  * A nested array or object will be returned as jbvBinary, ie. it won't be
  * expanded.
  */
 static void
-fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
+fillJsonbValue(JEntry *children, int index,
+			   char *base_addr, uint32 offset,
+			   JsonbValue *result)
 {
 	JEntry		entry = children[index];
 
@@ -392,14 +462,14 @@ fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
 	else if (JBE_ISSTRING(entry))
 	{
 		result->type = jbvString;
-		result->val.string.val = base_addr + JBE_OFF(children, index);
-		result->val.string.len = JBE_LEN(children, index);
+		result->val.string.val = base_addr + offset;
+		result->val.string.len = JBE_LENFLD(entry);
 		Assert(result->val.string.len >= 0);
 	}
 	else if (JBE_ISNUMERIC(entry))
 	{
 		result->type = jbvNumeric;
-		result->val.numeric = (Numeric) (base_addr + INTALIGN(JBE_OFF(children, index)));
+		result->val.numeric = (Numeric) (base_addr + INTALIGN(offset));
 	}
 	else if (JBE_ISBOOL_TRUE(entry))
 	{
@@ -415,8 +485,9 @@ fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
 	{
 		Assert(JBE_ISCONTAINER(entry));
 		result->type = jbvBinary;
-		result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(JBE_OFF(children, index)));
-		result->val.binary.len = JBE_LEN(children, index) - (INTALIGN(JBE_OFF(children, index)) - JBE_OFF(children, index));
+		/* Remove alignment padding from data pointer and len */
+		result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(offset));
+		result->val.binary.len = JBE_LENFLD(entry) - (INTALIGN(offset) - offset);
 	}
 }
 
@@ -646,6 +717,8 @@ JsonbIteratorInit(JsonbContainer *container)
 JsonbIteratorToken
 JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested)
 {
+	JEntry je;
+
 	if (*it == NULL)
 		return WJB_DONE;
 
@@ -668,13 +741,15 @@ recurse:
 			 * a full conversion
 			 */
 			val->val.array.rawScalar = (*it)->isScalar;
-			(*it)->i = 0;
+			(*it)->curIndex = 0;
+			(*it)->curDataOffset = 0;
+			(*it)->curValueOffset = 0;	/* not actually used */
 			/* Set state for next call */
 			(*it)->state = JBI_ARRAY_ELEM;
 			return WJB_BEGIN_ARRAY;
 
 		case JBI_ARRAY_ELEM:
-			if ((*it)->i >= (*it)->nElems)
+			if ((*it)->curIndex >= (*it)->nElems)
 			{
 				/*
 				 * All elements within array already processed.  Report this
@@ -686,7 +761,16 @@ recurse:
 				return WJB_END_ARRAY;
 			}
 
-			fillJsonbValue((*it)->children, (*it)->i++, (*it)->dataProper, val);
+			fillJsonbValue((*it)->children, (*it)->curIndex,
+						   (*it)->dataProper, (*it)->curDataOffset,
+						   val);
+
+			je = (*it)->children[(*it)->curIndex];
+			if (JBE_HAS_LEN(je))
+				(*it)->curDataOffset += JBE_LENFLD(je);
+			else
+				(*it)->curDataOffset = JBE_LENFLD(je);
+			(*it)->curIndex++;
 
 			if (!IsAJsonbScalar(val) && !skipNested)
 			{
@@ -697,8 +781,8 @@ recurse:
 			else
 			{
 				/*
-				 * Scalar item in array, or a container and caller didn't
-				 * want us to recurse into it.
+				 * Scalar item in array, or a container and caller didn't want
+				 * us to recurse into it.
 				 */
 				return WJB_ELEM;
 			}
@@ -712,13 +796,15 @@ recurse:
 			 * v->val.object.pairs is not actually set, because we aren't
 			 * doing a full conversion
 			 */
-			(*it)->i = 0;
+			(*it)->curIndex = 0;
+			(*it)->curDataOffset = 0;
+			(*it)->curValueOffset = getJsonbOffset((*it)->container, (*it)->nElems);
 			/* Set state for next call */
 			(*it)->state = JBI_OBJECT_KEY;
 			return WJB_BEGIN_OBJECT;
 
 		case JBI_OBJECT_KEY:
-			if ((*it)->i >= (*it)->nElems)
+			if ((*it)->curIndex >= (*it)->nElems)
 			{
 				/*
 				 * All pairs within object already processed.  Report this to
@@ -732,7 +818,9 @@ recurse:
 			else
 			{
 				/* Return key of a key/value pair.  */
-				fillJsonbValue((*it)->children, (*it)->i * 2, (*it)->dataProper, val);
+				fillJsonbValue((*it)->children, (*it)->curIndex,
+							   (*it)->dataProper, (*it)->curDataOffset,
+							   val);
 				if (val->type != jbvString)
 					elog(ERROR, "unexpected jsonb type as object key");
 
@@ -745,8 +833,21 @@ recurse:
 			/* Set state for next call */
 			(*it)->state = JBI_OBJECT_KEY;
 
-			fillJsonbValue((*it)->children, ((*it)->i++) * 2 + 1,
-						   (*it)->dataProper, val);
+			fillJsonbValue((*it)->children, (*it)->curIndex + (*it)->nElems,
+						   (*it)->dataProper, (*it)->curValueOffset,
+						   val);
+
+			je = (*it)->children[(*it)->curIndex];
+			if (JBE_HAS_LEN(je))
+				(*it)->curDataOffset += JBE_LENFLD(je);
+			else
+				(*it)->curDataOffset = JBE_LENFLD(je);
+			je = (*it)->children[(*it)->curIndex + (*it)->nElems];
+			if (JBE_HAS_LEN(je))
+				(*it)->curValueOffset += JBE_LENFLD(je);
+			else
+				(*it)->curValueOffset = JBE_LENFLD(je);
+			(*it)->curIndex++;
 
 			/*
 			 * Value may be a container, in which case we recurse with new,
@@ -781,12 +882,11 @@ iteratorFromContainer(JsonbContainer *container, JsonbIterator *parent)
 
 	/* Array starts just after header */
 	it->children = container->children;
+	it->dataProper = JB_CONTAINER_CHILD_DATA(it->container);
 
 	switch (container->header & (JB_FARRAY | JB_FOBJECT))
 	{
 		case JB_FARRAY:
-			it->dataProper =
-				(char *) it->children + it->nElems * sizeof(JEntry);
 			it->isScalar = (container->header & JB_FSCALAR) != 0;
 			/* This is either a "raw scalar", or an array */
 			Assert(!it->isScalar || it->nElems == 1);
@@ -795,13 +895,6 @@ iteratorFromContainer(JsonbContainer *container, JsonbIterator *parent)
 			break;
 
 		case JB_FOBJECT:
-
-			/*
-			 * Offset reflects that nElems indicates JsonbPairs in an object.
-			 * Each key and each value contain Jentry metadata just the same.
-			 */
-			it->dataProper =
-				(char *) it->children + it->nElems * sizeof(JEntry) * 2;
 			it->state = JBI_OBJECT_START;
 			break;
 
@@ -1209,8 +1302,8 @@ reserveFromBuffer(StringInfo buffer, int len)
 	buffer->len += len;
 
 	/*
-	 * Keep a trailing null in place, even though it's not useful for us;
-	 * it seems best to preserve the invariants of StringInfos.
+	 * Keep a trailing null in place, even though it's not useful for us; it
+	 * seems best to preserve the invariants of StringInfos.
 	 */
 	buffer->data[buffer->len] = '\0';
 
@@ -1284,8 +1377,8 @@ convertToJsonb(JsonbValue *val)
 
 	/*
 	 * Note: the JEntry of the root is discarded. Therefore the root
-	 * JsonbContainer struct must contain enough information to tell what
-	 * kind of value it is.
+	 * JsonbContainer struct must contain enough information to tell what kind
+	 * of value it is.
 	 */
 
 	res = (Jsonb *) buffer.data;
@@ -1315,10 +1408,10 @@ convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level)
 		return;
 
 	/*
-	 * A JsonbValue passed as val should never have a type of jbvBinary,
-	 * and neither should any of its sub-components. Those values will be
-	 * produced by convertJsonbArray and convertJsonbObject, the results of
-	 * which will not be passed back to this function as an argument.
+	 * A JsonbValue passed as val should never have a type of jbvBinary, and
+	 * neither should any of its sub-components. Those values will be produced
+	 * by convertJsonbArray and convertJsonbObject, the results of which will
+	 * not be passed back to this function as an argument.
 	 */
 
 	if (IsAJsonbScalar(val))
@@ -1334,57 +1427,75 @@ convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level)
 static void
 convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
 {
-	int			offset;
-	int			metaoffset;
+	int			base_offset;
+	int			jentry_offset;
 	int			i;
 	int			totallen;
 	uint32		header;
+	int			nElems = val->val.array.nElems;
 
-	/* Initialize pointer into conversion buffer at this level */
-	offset = buffer->len;
+	/* Remember where in the buffer this array starts. */
+	base_offset = buffer->len;
 
+	/* Align to 4-byte boundary (any padding counts as part of my data) */
 	padBufferToInt(buffer);
 
 	/*
-	 * Construct the header Jentry, stored in the beginning of the variable-
-	 * length payload.
+	 * Construct the header Jentry and store it in the beginning of the
+	 * variable-length payload.
 	 */
-	header = val->val.array.nElems | JB_FARRAY;
+	header = nElems | JB_FARRAY;
 	if (val->val.array.rawScalar)
 	{
-		Assert(val->val.array.nElems == 1);
+		Assert(nElems == 1);
 		Assert(level == 0);
 		header |= JB_FSCALAR;
 	}
 
 	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
-	/* reserve space for the JEntries of the elements. */
-	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.array.nElems);
+
+	/* Reserve space for the JEntries of the elements. */
+	jentry_offset = reserveFromBuffer(buffer, sizeof(JEntry) * nElems);
 
 	totallen = 0;
-	for (i = 0; i < val->val.array.nElems; i++)
+	for (i = 0; i < nElems; i++)
 	{
 		JsonbValue *elem = &val->val.array.elems[i];
 		int			len;
 		JEntry		meta;
 
+		/*
+		 * Convert element, producing a JEntry and appending its
+		 * variable-length data to buffer
+		 */
 		convertJsonbValue(buffer, &meta, elem, level + 1);
-		len = meta & JENTRY_POSMASK;
-		totallen += len;
 
-		if (totallen > JENTRY_POSMASK)
+		/*
+		 * Bail out if total variable-length data exceeds what will fit in a
+		 * JEntry length field.  We check this in each iteration, not just
+		 * once at the end, to forestall possible integer overflow.
+		 */
+		len = JBE_LENFLD(meta);
+		totallen += len;
+		if (totallen > JENTRY_LENMASK)
 			ereport(ERROR,
 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
-							JENTRY_POSMASK)));
+							JENTRY_LENMASK)));
 
-		if (i > 0)
-			meta = (meta & ~JENTRY_POSMASK) | totallen;
-		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
-		metaoffset += sizeof(JEntry);
+		copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
+		jentry_offset += sizeof(JEntry);
 	}
 
-	totallen = buffer->len - offset;
+	/* Total data size is everything we've appended to buffer */
+	totallen = buffer->len - base_offset;
+
+	/* Check length again, since we didn't include the metadata above */
+	if (totallen > JENTRY_LENMASK)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
+						JENTRY_LENMASK)));
 
 	/* Initialize the header of this node, in the container's JEntry array */
 	*pheader = JENTRY_ISCONTAINER | totallen;
@@ -1393,65 +1504,104 @@ convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level
 static void
 convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
 {
-	uint32		header;
-	int			offset;
-	int			metaoffset;
+	int			base_offset;
+	int			jentry_offset;
 	int			i;
 	int			totallen;
+	uint32		header;
+	int			nPairs = val->val.object.nPairs;
 
-	/* Initialize pointer into conversion buffer at this level */
-	offset = buffer->len;
+	/* Remember where in the buffer this object starts. */
+	base_offset = buffer->len;
 
+	/* Align to 4-byte boundary (any padding counts as part of my data) */
 	padBufferToInt(buffer);
 
-	/* Initialize header */
-	header = val->val.object.nPairs | JB_FOBJECT;
+	/*
+	 * Construct the header Jentry and store it in the beginning of the
+	 * variable-length payload.
+	 */
+	header = nPairs | JB_FOBJECT;
 	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
 
-	/* reserve space for the JEntries of the keys and values */
-	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.object.nPairs * 2);
+	/* Reserve space for the JEntries of the keys and values. */
+	jentry_offset = reserveFromBuffer(buffer, sizeof(JEntry) * nPairs * 2);
 
+	/*
+	 * Iterate over the keys, then over the values, since that is the ordering
+	 * we want in the on-disk representation.
+	 */
 	totallen = 0;
-	for (i = 0; i < val->val.object.nPairs; i++)
+	for (i = 0; i < nPairs; i++)
 	{
-		JsonbPair *pair = &val->val.object.pairs[i];
-		int len;
-		JEntry meta;
+		JsonbPair  *pair = &val->val.object.pairs[i];
+		int			len;
+		JEntry		meta;
 
-		/* put key */
+		/*
+		 * Convert key, producing a JEntry and appending its variable-length
+		 * data to buffer
+		 */
 		convertJsonbScalar(buffer, &meta, &pair->key);
 
-		len = meta & JENTRY_POSMASK;
+		len = JBE_LENFLD(meta);
 		totallen += len;
 
-		if (totallen > JENTRY_POSMASK)
+		copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
+		jentry_offset += sizeof(JEntry);
+
+		/*
+		 * Bail out if total variable-length data exceeds what will fit in a
+		 * JEntry length field.  We check this in each iteration, not just
+		 * once at the end, to forestall possible integer overflow.
+		 */
+		if (totallen > JENTRY_LENMASK)
 			ereport(ERROR,
 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
-							JENTRY_POSMASK)));
+					 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+							JENTRY_LENMASK)));
+	}
+	for (i = 0; i < nPairs; i++)
+	{
+		JsonbPair  *pair = &val->val.object.pairs[i];
+		int			len;
+		JEntry		meta;
 
-		if (i > 0)
-			meta = (meta & ~JENTRY_POSMASK) | totallen;
-		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
-		metaoffset += sizeof(JEntry);
+		/*
+		 * Convert value, producing a JEntry and appending its variable-length
+		 * data to buffer
+		 */
+		convertJsonbValue(buffer, &meta, &pair->value, level + 1);
 
-		convertJsonbValue(buffer, &meta, &pair->value, level);
-		len = meta & JENTRY_POSMASK;
+		len = JBE_LENFLD(meta);
 		totallen += len;
 
-		if (totallen > JENTRY_POSMASK)
+		copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
+		jentry_offset += sizeof(JEntry);
+
+		/*
+		 * Bail out if total variable-length data exceeds what will fit in a
+		 * JEntry length field.  We check this in each iteration, not just
+		 * once at the end, to forestall possible integer overflow.
+		 */
+		if (totallen > JENTRY_LENMASK)
 			ereport(ERROR,
 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
-							JENTRY_POSMASK)));
-
-		meta = (meta & ~JENTRY_POSMASK) | totallen;
-		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
-		metaoffset += sizeof(JEntry);
+					 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+							JENTRY_LENMASK)));
 	}
 
-	totallen = buffer->len - offset;
+	/* Total data size is everything we've appended to buffer */
+	totallen = buffer->len - base_offset;
 
+	/* Check length again, since we didn't include the metadata above */
+	if (totallen > JENTRY_LENMASK)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+						JENTRY_LENMASK)));
+
+	/* Initialize the header of this node, in the container's JEntry array */
 	*pheader = JENTRY_ISCONTAINER | totallen;
 }
 
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 91e3e14..9f02319 100644
--- a/src/include/utils/jsonb.h
+++ b/src/include/utils/jsonb.h
@@ -83,9 +83,9 @@ typedef struct JsonbValue JsonbValue;
  * buffer is accessed, but they can also be deep copied and passed around.
  *
  * Jsonb is a tree structure. Each node in the tree consists of a JEntry
- * header, and a variable-length content.  The JEntry header indicates what
- * kind of a node it is, e.g. a string or an array, and the offset and length
- * of its variable-length portion within the container.
+ * header and a variable-length content (possibly of zero size).  The JEntry
+ * header indicates what kind of a node it is, e.g. a string or an array,
+ * and includes the length of its variable-length portion.
  *
  * The JEntry and the content of a node are not stored physically together.
  * Instead, the container array or object has an array that holds the JEntrys
@@ -95,39 +95,31 @@ typedef struct JsonbValue JsonbValue;
  * hold its JEntry. Hence, no JEntry header is stored for the root node.  It
  * is implicitly known that the root node must be an array or an object,
  * so we can get away without the type indicator as long as we can distinguish
- * the two.  For that purpose, both an array and an object begins with a uint32
+ * the two.  For that purpose, both an array and an object begin with a uint32
  * header field, which contains an JB_FOBJECT or JB_FARRAY flag.  When a naked
  * scalar value needs to be stored as a Jsonb value, what we actually store is
  * an array with one element, with the flags in the array's header field set
  * to JB_FSCALAR | JB_FARRAY.
  *
- * To encode the length and offset of the variable-length portion of each
- * node in a compact way, the JEntry stores only the end offset within the
- * variable-length portion of the container node. For the first JEntry in the
- * container's JEntry array, that equals to the length of the node data.  The
- * begin offset and length of the rest of the entries can be calculated using
- * the end offset of the previous JEntry in the array.
- *
  * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
  * the variable-length portion of some node types is aligned to a 4-byte
  * boundary, while others are not. When alignment is needed, the padding is
  * in the beginning of the node that requires it. For example, if a numeric
  * node is stored after a string node, so that the numeric node begins at
  * offset 3, the variable-length portion of the numeric node will begin with
- * one padding byte.
+ * one padding byte so that the actual numeric data is 4-byte aligned.
  */
 
 /*
  * Jentry format.
  *
- * The least significant 28 bits store the end offset of the entry (see
- * JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits
- * are used to store the type of the entry. The most significant bit
- * is unused, and should be set to zero.
+ * The least significant 28 bits store the data length of the entry (see
+ * JBE_LENFLD and JBE_LEN macros below). The next three bits store the type
+ * of the entry.
  */
 typedef uint32 JEntry;
 
-#define JENTRY_POSMASK			0x0FFFFFFF
+#define JENTRY_LENMASK			0x0FFFFFFF
 #define JENTRY_TYPEMASK			0x70000000
 
 /* values stored in the type bits */
@@ -138,7 +130,10 @@ typedef uint32 JEntry;
 #define JENTRY_ISNULL			0x40000000
 #define JENTRY_ISCONTAINER		0x50000000		/* array or object */
 
+#define JENTRY_HAS_LEN			0x80000000
+
 /* Note possible multiple evaluations */
+#define JBE_HAS_LEN(je_)		(((je_) & JENTRY_HAS_LEN) != 0)
 #define JBE_ISSTRING(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING)
 #define JBE_ISNUMERIC(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC)
 #define JBE_ISCONTAINER(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER)
@@ -148,19 +143,25 @@ typedef uint32 JEntry;
 #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
 
 /*
- * Macros for getting the offset and length of an element. Note multiple
- * evaluations and access to prior array element.
+ * Macros for getting the data length/offset of a JEntry.
  */
-#define JBE_ENDPOS(je_)			((je_) & JENTRY_POSMASK)
-#define JBE_OFF(ja, i)			((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
-#define JBE_LEN(ja, i)			((i) == 0 ? JBE_ENDPOS((ja)[i]) \
-								 : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
+#define JBE_LENFLD(je_)			((je_) & JENTRY_LENMASK)
 
 /*
  * A jsonb array or object node, within a Jsonb Datum.
  *
- * An array has one child for each element. An object has two children for
- * each key/value pair.
+ * An array has one child for each element, stored in array order.
+ *
+ * An object has two children for each key/value pair.  The keys all appear
+ * first, in key sort order; then the values appear, in an order matching the
+ * key order.  This arrangement keeps the keys compact in memory, making a
+ * search for a particular key more cache-friendly.
+ *
+ * The JEntry for each child contains the length or offset of the entry.
+ * Some entries store the end offset of the child, while others store the
+ * length. This is a compromise to make the array compressible, but also
+ * provide random access to large arrays without having to add up all the
+ * lengths.
  */
 typedef struct JsonbContainer
 {
@@ -172,11 +173,26 @@ typedef struct JsonbContainer
 } JsonbContainer;
 
 /* flags for the header-field in JsonbContainer */
-#define JB_CMASK				0x0FFFFFFF
-#define JB_FSCALAR				0x10000000
+#define JB_CMASK				0x0FFFFFFF		/* mask for count field */
+#define JB_FSCALAR				0x10000000		/* flag bits */
 #define JB_FOBJECT				0x20000000
 #define JB_FARRAY				0x40000000
 
+/* For every JB_OFFSET_STRIDE children, an offset is stored. */
+#define JB_OFFSET_STRIDE		16
+
+/*
+ * Accessor macros for the fields of a JsonbContainer.
+ *
+ * Beware of multiple evaluation!
+ */
+#define JB_CONTAINER_NCHILDREN(jc) \
+	(((jc)->header & JB_CMASK) * (((jc)->header & JB_FOBJECT) ? 2 : 1))
+
+#define JB_CONTAINER_CHILD_DATA(jc) \
+	((char *) &(jc)->children[JB_CONTAINER_NCHILDREN(jc)])
+
+
 /* The top-level on-disk format for a jsonb datum. */
 typedef struct
 {
@@ -248,18 +264,20 @@ struct JsonbValue
 									 (jsonbval)->type <= jbvBool)
 
 /*
- * Pair within an Object.
+ * Key/value pair within an Object.
+ *
+ * This struct type is only used briefly while constructing a Jsonb; it is
+ * *not* the on-disk representation.
  *
- * Pairs with duplicate keys are de-duplicated.  We store the order for the
- * benefit of doing so in a well-defined way with respect to the original
- * observed order (which is "last observed wins").  This is only used briefly
- * when originally constructing a Jsonb.
+ * Pairs with duplicate keys are de-duplicated.  We store the originally
+ * observed pair ordering for the purpose of removing duplicates in a
+ * well-defined way (which is "last observed wins").
  */
 struct JsonbPair
 {
 	JsonbValue	key;			/* Must be a jbvString */
 	JsonbValue	value;			/* May be of any type */
-	uint32		order;			/* preserves order of pairs with equal keys */
+	uint32		order;			/* Pair's index in original sequence */
 };
 
 /* Conversion state used when parsing Jsonb from text, or for type coercion */
@@ -287,20 +305,25 @@ typedef struct JsonbIterator
 {
 	/* Container being iterated */
 	JsonbContainer *container;
-	uint32		nElems;			/* Number of elements in children array (will be
-								 * nPairs for objects) */
+	uint32		nElems;			/* Number of elements in children array (will
+								 * be nPairs for objects) */
 	bool		isScalar;		/* Pseudo-array scalar value? */
-	JEntry	   *children;
+	JEntry	   *children;		/* JEntrys for child nodes */
+	/* Data proper.  This points to the beginning of the variable-length data */
+	char	   *dataProper;
+
+	/* Current item in buffer (up to nElems) */
+	int			curIndex;
 
-	/* Current item in buffer (up to nElems, but must * 2 for objects) */
-	int			i;
+	/* Data offset corresponding to current item */
+	uint32		curDataOffset;
 
 	/*
-	 * Data proper.  This points just past end of children array.
-	 * We use the JBE_OFF() macro on the Jentrys to find offsets of each
-	 * child in this area.
+	 * If the container is an object, we want to return keys and values
+	 * alternately; so curDataOffset points to the current key, and
+	 * curValueOffset points to the current value.
 	 */
-	char	   *dataProper;
+	uint32		curValueOffset;
 
 	/* Private state */
 	JsonbIterState state;
Attachment: jsonb-with-separate-offsets-array-1.patch (text/x-diff)
diff --git a/src/backend/utils/adt/jsonb.c b/src/backend/utils/adt/jsonb.c
index 2fd87fc..456011a 100644
--- a/src/backend/utils/adt/jsonb.c
+++ b/src/backend/utils/adt/jsonb.c
@@ -196,12 +196,12 @@ jsonb_from_cstring(char *json, int len)
 static size_t
 checkStringLen(size_t len)
 {
-	if (len > JENTRY_POSMASK)
+	if (len > JENTRY_LENMASK)
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 				 errmsg("string too long to represent as jsonb string"),
 				 errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.",
-						   JENTRY_POSMASK)));
+						   JENTRY_LENMASK)));
 
 	return len;
 }
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..7f6223a 100644
--- a/src/backend/utils/adt/jsonb_util.c
+++ b/src/backend/utils/adt/jsonb_util.c
@@ -26,15 +26,16 @@
  * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
  * reserved for that in the JsonbContainer.header field.
  *
- * (the total size of an array's elements is also limited by JENTRY_POSMASK,
- * but we're not concerned about that here)
+ * (The total size of an array's or object's elements is also limited by
+ * JENTRY_LENMASK, but we're not concerned about that here.)
  */
 #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
 #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
 
-static void fillJsonbValue(JEntry *array, int index, char *base_addr,
+static void fillJsonbValue(JEntry *children, int index,
+			   char *base_addr, uint32 offset,
 			   JsonbValue *result);
-static bool	equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
+static bool equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
 static int	compareJsonbScalarValue(JsonbValue *a, JsonbValue *b);
 static Jsonb *convertToJsonb(JsonbValue *val);
 static void convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
@@ -42,7 +43,7 @@ static void convertJsonbArray(StringInfo buffer, JEntry *header, JsonbValue *val
 static void convertJsonbObject(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
 static void convertJsonbScalar(StringInfo buffer, JEntry *header, JsonbValue *scalarVal);
 
-static int reserveFromBuffer(StringInfo buffer, int len);
+static int	reserveFromBuffer(StringInfo buffer, int len);
 static void appendToBuffer(StringInfo buffer, const char *data, int len);
 static void copyToBuffer(StringInfo buffer, int offset, const char *data, int len);
 static short padBufferToInt(StringInfo buffer);
@@ -108,6 +109,32 @@ JsonbValueToJsonb(JsonbValue *val)
 }
 
 /*
+ * Get the offset of the variable-length portion of a Jsonb node within
+ * the variable-length-data part of its container.  The node is identified
+ * by index within the container's JEntry array.
+ */
+static uint32
+getJsonbOffset(const JsonbContainer *jc, int index)
+{
+	int			i;
+	int			stride;
+	uint32		offset;
+
+	/* First, look up nearest offset stored in the offset array. */
+	stride = index / JB_OFFSET_STRIDE;
+	if (stride == 0)
+		offset = 0;
+	else
+		offset = JB_CONTAINER_OFFSETS(jc)[stride - 1];
+
+	/* Add up lengths of previous nodes to get the offset. */
+	for (i = stride * JB_OFFSET_STRIDE; i < index; i++)
+		offset += JBE_LEN(jc->children, i);
+
+	return offset;
+}
+
+/*
  * BT comparator worker function.  Returns an integer less than, equal to, or
  * greater than zero, indicating whether a is less than, equal to, or greater
  * than b.  Consistent with the requirements for a B-Tree operator class
@@ -201,7 +228,7 @@ compareJsonbContainers(JsonbContainer *a, JsonbContainer *b)
 			 *
 			 * If the two values were of the same container type, then there'd
 			 * have been a chance to observe the variation in the number of
-			 * elements/pairs (when processing WJB_BEGIN_OBJECT, say).  They're
+			 * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're
 			 * either two heterogeneously-typed containers, or a container and
 			 * some scalar type.
 			 *
@@ -271,62 +298,69 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags,
 							JsonbValue *key)
 {
 	JEntry	   *children = container->children;
+	char	   *base_addr = JB_CONTAINER_CHILD_DATA(container);
 	int			count = (container->header & JB_CMASK);
-	JsonbValue *result = palloc(sizeof(JsonbValue));
+	JsonbValue *result;
 
 	Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0);
 
+	/* Quick out without a palloc cycle if object/array is empty */
+	if (count <= 0)
+		return NULL;
+
+	result = palloc(sizeof(JsonbValue));
+
 	if (flags & JB_FARRAY & container->header)
 	{
-		char	   *base_addr = (char *) (children + count);
+		uint32		offset = 0;
 		int			i;
 
 		for (i = 0; i < count; i++)
 		{
-			fillJsonbValue(children, i, base_addr, result);
+			fillJsonbValue(children, i, base_addr, offset, result);
 
 			if (key->type == result->type)
 			{
 				if (equalsJsonbScalarValue(key, result))
 					return result;
 			}
+
+			offset += JBE_LEN(children, i);
 		}
 	}
 	else if (flags & JB_FOBJECT & container->header)
 	{
-		/* Since this is an object, account for *Pairs* of Jentrys */
-		char	   *base_addr = (char *) (children + count * 2);
+		uint32		lastoff;
 		uint32		stopLow = 0,
-					stopMiddle;
+					stopHigh = count;
 
-		/* Object key past by caller must be a string */
+		/* Object key passed by caller must be a string */
 		Assert(key->type == jbvString);
 
 		/* Binary search on object/pair keys *only* */
-		while (stopLow < count)
+		while (stopLow < stopHigh)
 		{
-			int			index;
+			uint32		stopMiddle;
 			int			difference;
 			JsonbValue	candidate;
 
-			/*
-			 * Note how we compensate for the fact that we're iterating
-			 * through pairs (not entries) throughout.
-			 */
-			stopMiddle = stopLow + (count - stopLow) / 2;
-
-			index = stopMiddle * 2;
+			stopMiddle = stopLow + (stopHigh - stopLow) / 2;
 
 			candidate.type = jbvString;
-			candidate.val.string.val = base_addr + JBE_OFF(children, index);
-			candidate.val.string.len = JBE_LEN(children, index);
+			candidate.val.string.val =
+				base_addr + getJsonbOffset(container, stopMiddle);
+			candidate.val.string.len = JBE_LEN(children, stopMiddle);
 
 			difference = lengthCompareJsonbStringValue(&candidate, key);
 
 			if (difference == 0)
 			{
-				/* Found our key, return value */
-				fillJsonbValue(children, index + 1, base_addr, result);
+				/* Found our key, return corresponding value */
+				int			index = stopMiddle + count;
+
+				lastoff = getJsonbOffset(container, index);
+
+				fillJsonbValue(children, index, base_addr, lastoff, result);
 
 				return result;
 			}
@@ -335,7 +369,7 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags,
 				if (difference < 0)
 					stopLow = stopMiddle + 1;
 				else
-					count = stopMiddle;
+					stopHigh = stopMiddle;
 			}
 		}
 	}
@@ -361,14 +395,16 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i)
 		elog(ERROR, "not a jsonb array");
 
 	nelements = container->header & JB_CMASK;
-	base_addr = (char *) &container->children[nelements];
+	base_addr = JB_CONTAINER_CHILD_DATA(container);
 
 	if (i >= nelements)
 		return NULL;
 
 	result = palloc(sizeof(JsonbValue));
 
-	fillJsonbValue(container->children, i, base_addr, result);
+	fillJsonbValue(container->children, i, base_addr,
+				   getJsonbOffset(container, i),
+				   result);
 
 	return result;
 }
@@ -377,11 +413,17 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i)
  * A helper function to fill in a JsonbValue to represent an element of an
  * array, or a key or value of an object.
  *
+ * The node's JEntry is at children[index], and its variable-length data
+ * is at base_addr + offset.  We make the caller determine the offset since
+ * in many cases the caller can amortize the work across multiple children.
+ *
  * A nested array or object will be returned as jbvBinary, ie. it won't be
  * expanded.
  */
 static void
-fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
+fillJsonbValue(JEntry *children, int index,
+			   char *base_addr, uint32 offset,
+			   JsonbValue *result)
 {
 	JEntry		entry = children[index];
 
@@ -392,14 +434,14 @@ fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
 	else if (JBE_ISSTRING(entry))
 	{
 		result->type = jbvString;
-		result->val.string.val = base_addr + JBE_OFF(children, index);
-		result->val.string.len = JBE_LEN(children, index);
+		result->val.string.val = base_addr + offset;
+		result->val.string.len = JBE_LENFLD(entry);
 		Assert(result->val.string.len >= 0);
 	}
 	else if (JBE_ISNUMERIC(entry))
 	{
 		result->type = jbvNumeric;
-		result->val.numeric = (Numeric) (base_addr + INTALIGN(JBE_OFF(children, index)));
+		result->val.numeric = (Numeric) (base_addr + INTALIGN(offset));
 	}
 	else if (JBE_ISBOOL_TRUE(entry))
 	{
@@ -415,8 +457,9 @@ fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
 	{
 		Assert(JBE_ISCONTAINER(entry));
 		result->type = jbvBinary;
-		result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(JBE_OFF(children, index)));
-		result->val.binary.len = JBE_LEN(children, index) - (INTALIGN(JBE_OFF(children, index)) - JBE_OFF(children, index));
+		/* Remove alignment padding from data pointer and len */
+		result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(offset));
+		result->val.binary.len = JBE_LENFLD(entry) - (INTALIGN(offset) - offset);
 	}
 }
 
@@ -668,13 +711,15 @@ recurse:
 			 * a full conversion
 			 */
 			val->val.array.rawScalar = (*it)->isScalar;
-			(*it)->i = 0;
+			(*it)->curIndex = 0;
+			(*it)->curDataOffset = 0;
+			(*it)->curValueOffset = 0;	/* not actually used */
 			/* Set state for next call */
 			(*it)->state = JBI_ARRAY_ELEM;
 			return WJB_BEGIN_ARRAY;
 
 		case JBI_ARRAY_ELEM:
-			if ((*it)->i >= (*it)->nElems)
+			if ((*it)->curIndex >= (*it)->nElems)
 			{
 				/*
 				 * All elements within array already processed.  Report this
@@ -686,7 +731,12 @@ recurse:
 				return WJB_END_ARRAY;
 			}
 
-			fillJsonbValue((*it)->children, (*it)->i++, (*it)->dataProper, val);
+			fillJsonbValue((*it)->children, (*it)->curIndex,
+						   (*it)->dataProper, (*it)->curDataOffset,
+						   val);
+
+			(*it)->curDataOffset += JBE_LEN((*it)->children, (*it)->curIndex);
+			(*it)->curIndex++;
 
 			if (!IsAJsonbScalar(val) && !skipNested)
 			{
@@ -697,8 +747,8 @@ recurse:
 			else
 			{
 				/*
-				 * Scalar item in array, or a container and caller didn't
-				 * want us to recurse into it.
+				 * Scalar item in array, or a container and caller didn't want
+				 * us to recurse into it.
 				 */
 				return WJB_ELEM;
 			}
@@ -712,13 +762,15 @@ recurse:
 			 * v->val.object.pairs is not actually set, because we aren't
 			 * doing a full conversion
 			 */
-			(*it)->i = 0;
+			(*it)->curIndex = 0;
+			(*it)->curDataOffset = 0;
+			(*it)->curValueOffset = getJsonbOffset((*it)->container, (*it)->nElems);
 			/* Set state for next call */
 			(*it)->state = JBI_OBJECT_KEY;
 			return WJB_BEGIN_OBJECT;
 
 		case JBI_OBJECT_KEY:
-			if ((*it)->i >= (*it)->nElems)
+			if ((*it)->curIndex >= (*it)->nElems)
 			{
 				/*
 				 * All pairs within object already processed.  Report this to
@@ -732,7 +784,9 @@ recurse:
 			else
 			{
 				/* Return key of a key/value pair.  */
-				fillJsonbValue((*it)->children, (*it)->i * 2, (*it)->dataProper, val);
+				fillJsonbValue((*it)->children, (*it)->curIndex,
+							   (*it)->dataProper, (*it)->curDataOffset,
+							   val);
 				if (val->type != jbvString)
 					elog(ERROR, "unexpected jsonb type as object key");
 
@@ -745,8 +799,15 @@ recurse:
 			/* Set state for next call */
 			(*it)->state = JBI_OBJECT_KEY;
 
-			fillJsonbValue((*it)->children, ((*it)->i++) * 2 + 1,
-						   (*it)->dataProper, val);
+			fillJsonbValue((*it)->children, (*it)->curIndex + (*it)->nElems,
+						   (*it)->dataProper, (*it)->curValueOffset,
+						   val);
+
+			(*it)->curDataOffset += JBE_LEN((*it)->children,
+											(*it)->curIndex);
+			(*it)->curValueOffset += JBE_LEN((*it)->children,
+											 (*it)->curIndex + (*it)->nElems);
+			(*it)->curIndex++;
 
 			/*
 			 * Value may be a container, in which case we recurse with new,
@@ -781,12 +842,11 @@ iteratorFromContainer(JsonbContainer *container, JsonbIterator *parent)
 
 	/* Array starts just after header */
 	it->children = container->children;
+	it->dataProper = JB_CONTAINER_CHILD_DATA(it->container);
 
 	switch (container->header & (JB_FARRAY | JB_FOBJECT))
 	{
 		case JB_FARRAY:
-			it->dataProper =
-				(char *) it->children + it->nElems * sizeof(JEntry);
 			it->isScalar = (container->header & JB_FSCALAR) != 0;
 			/* This is either a "raw scalar", or an array */
 			Assert(!it->isScalar || it->nElems == 1);
@@ -795,13 +855,6 @@ iteratorFromContainer(JsonbContainer *container, JsonbIterator *parent)
 			break;
 
 		case JB_FOBJECT:
-
-			/*
-			 * Offset reflects that nElems indicates JsonbPairs in an object.
-			 * Each key and each value contain Jentry metadata just the same.
-			 */
-			it->dataProper =
-				(char *) it->children + it->nElems * sizeof(JEntry) * 2;
 			it->state = JBI_OBJECT_START;
 			break;
 
@@ -1209,8 +1262,8 @@ reserveFromBuffer(StringInfo buffer, int len)
 	buffer->len += len;
 
 	/*
-	 * Keep a trailing null in place, even though it's not useful for us;
-	 * it seems best to preserve the invariants of StringInfos.
+	 * Keep a trailing null in place, even though it's not useful for us; it
+	 * seems best to preserve the invariants of StringInfos.
 	 */
 	buffer->data[buffer->len] = '\0';
 
@@ -1284,8 +1337,8 @@ convertToJsonb(JsonbValue *val)
 
 	/*
 	 * Note: the JEntry of the root is discarded. Therefore the root
-	 * JsonbContainer struct must contain enough information to tell what
-	 * kind of value it is.
+	 * JsonbContainer struct must contain enough information to tell what kind
+	 * of value it is.
 	 */
 
 	res = (Jsonb *) buffer.data;
@@ -1315,10 +1368,10 @@ convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level)
 		return;
 
 	/*
-	 * A JsonbValue passed as val should never have a type of jbvBinary,
-	 * and neither should any of its sub-components. Those values will be
-	 * produced by convertJsonbArray and convertJsonbObject, the results of
-	 * which will not be passed back to this function as an argument.
+	 * A JsonbValue passed as val should never have a type of jbvBinary, and
+	 * neither should any of its sub-components. Those values will be produced
+	 * by convertJsonbArray and convertJsonbObject, the results of which will
+	 * not be passed back to this function as an argument.
 	 */
 
 	if (IsAJsonbScalar(val))
@@ -1334,57 +1387,91 @@ convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level)
 static void
 convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
 {
-	int			offset;
-	int			metaoffset;
+	int			base_offset;
+	int			jentry_offset;
+	int			offsets_offset;
 	int			i;
 	int			totallen;
 	uint32		header;
+	int			nElems = val->val.array.nElems;
 
-	/* Initialize pointer into conversion buffer at this level */
-	offset = buffer->len;
+	/* Remember where in the buffer this array starts. */
+	base_offset = buffer->len;
 
+	/* Align to 4-byte boundary (any padding counts as part of my data) */
 	padBufferToInt(buffer);
 
 	/*
-	 * Construct the header Jentry, stored in the beginning of the variable-
-	 * length payload.
+	 * Construct the header Jentry and store it in the beginning of the
+	 * variable-length payload.
 	 */
-	header = val->val.array.nElems | JB_FARRAY;
+	header = nElems | JB_FARRAY;
 	if (val->val.array.rawScalar)
 	{
-		Assert(val->val.array.nElems == 1);
+		Assert(nElems == 1);
 		Assert(level == 0);
 		header |= JB_FSCALAR;
 	}
 
 	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
-	/* reserve space for the JEntries of the elements. */
-	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.array.nElems);
+
+	/* Reserve space for the JEntries of the elements. */
+	jentry_offset = reserveFromBuffer(buffer, sizeof(JEntry) * nElems);
+
+	/* Reserve space for the offsets array */
+	offsets_offset = reserveFromBuffer(buffer,
+						   sizeof(uint32) * JB_CONTAINER_NUM_OFFSETS(nElems));
 
 	totallen = 0;
-	for (i = 0; i < val->val.array.nElems; i++)
+	for (i = 0; i < nElems; i++)
 	{
 		JsonbValue *elem = &val->val.array.elems[i];
 		int			len;
 		JEntry		meta;
 
+		/*
+		 * Every JB_OFFSET_STRIDE elements, store the current offset (from
+		 * the beginning of the variable-length portion) in the offsets array.
+		 */
+		if (i % JB_OFFSET_STRIDE == 0 && i > 0)
+		{
+			uint32 off = (uint32) totallen;
+			copyToBuffer(buffer, offsets_offset, (char *) &off, sizeof(uint32));
+			offsets_offset += sizeof(uint32);
+		}
+
+		/*
+		 * Convert element, producing a JEntry and appending its
+		 * variable-length data to buffer
+		 */
 		convertJsonbValue(buffer, &meta, elem, level + 1);
-		len = meta & JENTRY_POSMASK;
-		totallen += len;
 
-		if (totallen > JENTRY_POSMASK)
+		/*
+		 * Bail out if total variable-length data exceeds what will fit in a
+		 * JEntry length field.  We check this in each iteration, not just
+		 * once at the end, to forestall possible integer overflow.
+		 */
+		len = JBE_LENFLD(meta);
+		totallen += len;
+		if (totallen > JENTRY_LENMASK)
 			ereport(ERROR,
 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
-							JENTRY_POSMASK)));
+							JENTRY_LENMASK)));
 
-		if (i > 0)
-			meta = (meta & ~JENTRY_POSMASK) | totallen;
-		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
-		metaoffset += sizeof(JEntry);
+		copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
+		jentry_offset += sizeof(JEntry);
 	}
 
-	totallen = buffer->len - offset;
+	/* Total data size is everything we've appended to buffer */
+	totallen = buffer->len - base_offset;
+
+	/* Check length again, since we didn't include the metadata above */
+	if (totallen > JENTRY_LENMASK)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
+						JENTRY_LENMASK)));
 
 	/* Initialize the header of this node, in the container's JEntry array */
 	*pheader = JENTRY_ISCONTAINER | totallen;
@@ -1393,65 +1480,131 @@ convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level
 static void
 convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
 {
-	uint32		header;
-	int			offset;
-	int			metaoffset;
+	int			base_offset;
+	int			jentry_offset;
+	int			offsets_offset;
 	int			i;
 	int			totallen;
+	uint32		header;
+	int			nPairs = val->val.object.nPairs;
 
-	/* Initialize pointer into conversion buffer at this level */
-	offset = buffer->len;
+	/* Remember where in the buffer this object starts. */
+	base_offset = buffer->len;
 
+	/* Align to 4-byte boundary (any padding counts as part of my data) */
 	padBufferToInt(buffer);
 
-	/* Initialize header */
-	header = val->val.object.nPairs | JB_FOBJECT;
+	/*
+	 * Construct the header Jentry and store it in the beginning of the
+	 * variable-length payload.
+	 */
+	header = nPairs | JB_FOBJECT;
 	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
 
-	/* reserve space for the JEntries of the keys and values */
-	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.object.nPairs * 2);
+	/* Reserve space for the JEntries of the keys and values. */
+	jentry_offset = reserveFromBuffer(buffer, sizeof(JEntry) * nPairs * 2);
+
+	/* Reserve space for the offsets array */
+	offsets_offset = reserveFromBuffer(buffer,
+					   sizeof(uint32) * JB_CONTAINER_NUM_OFFSETS(nPairs * 2));
 
+	/*
+	 * Iterate over the keys, then over the values, since that is the ordering
+	 * we want in the on-disk representation.
+	 */
 	totallen = 0;
-	for (i = 0; i < val->val.object.nPairs; i++)
+	for (i = 0; i < nPairs; i++)
 	{
-		JsonbPair *pair = &val->val.object.pairs[i];
-		int len;
-		JEntry meta;
+		JsonbPair  *pair = &val->val.object.pairs[i];
+		int			len;
+		JEntry		meta;
 
-		/* put key */
+		/*
+		 * Every JB_OFFSET_STRIDE elements, store the current offset (from
+		 * the beginning of the variable-length portion) in the offsets array.
+		 */
+		if (i % JB_OFFSET_STRIDE == 0 && i > 0)
+		{
+			uint32 off = (uint32) totallen;
+			copyToBuffer(buffer, offsets_offset, (char *) &off, sizeof(uint32));
+			offsets_offset += sizeof(uint32);
+		}
+
+		/*
+		 * Convert key, producing a JEntry and appending its variable-length
+		 * data to buffer
+		 */
 		convertJsonbScalar(buffer, &meta, &pair->key);
 
-		len = meta & JENTRY_POSMASK;
+		len = JBE_LENFLD(meta);
 		totallen += len;
 
-		if (totallen > JENTRY_POSMASK)
+		copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
+		jentry_offset += sizeof(JEntry);
+
+		/*
+		 * Bail out if total variable-length data exceeds what will fit in a
+		 * JEntry length field.  We check this in each iteration, not just
+		 * once at the end, to forestall possible integer overflow.
+		 */
+		if (totallen > JENTRY_LENMASK)
 			ereport(ERROR,
 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
-							JENTRY_POSMASK)));
+					 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+							JENTRY_LENMASK)));
+	}
+	for (i = 0; i < nPairs; i++)
+	{
+		JsonbPair  *pair = &val->val.object.pairs[i];
+		int			len;
+		JEntry		meta;
+
+		/*
+		 * Every JB_OFFSET_STRIDE elements, store the current offset (from
+		 * the beginning of the variable-length portion) in the offsets array.
+		 */
+		if ((nPairs + i) % JB_OFFSET_STRIDE == 0)
+		{
+			uint32 off = (uint32) totallen;
+			copyToBuffer(buffer, offsets_offset, (char *) &off, sizeof(uint32));
+			offsets_offset += sizeof(uint32);
+		}
 
-		if (i > 0)
-			meta = (meta & ~JENTRY_POSMASK) | totallen;
-		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
-		metaoffset += sizeof(JEntry);
+		/*
+		 * Convert value, producing a JEntry and appending its variable-length
+		 * data to buffer
+		 */
+		convertJsonbValue(buffer, &meta, &pair->value, level + 1);
 
-		convertJsonbValue(buffer, &meta, &pair->value, level);
-		len = meta & JENTRY_POSMASK;
+		len = JBE_LENFLD(meta);
 		totallen += len;
 
-		if (totallen > JENTRY_POSMASK)
+		copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
+		jentry_offset += sizeof(JEntry);
+
+		/*
+		 * Bail out if total variable-length data exceeds what will fit in a
+		 * JEntry length field.  We check this in each iteration, not just
+		 * once at the end, to forestall possible integer overflow.
+		 */
+		if (totallen > JENTRY_LENMASK)
 			ereport(ERROR,
 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
-							JENTRY_POSMASK)));
-
-		meta = (meta & ~JENTRY_POSMASK) | totallen;
-		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
-		metaoffset += sizeof(JEntry);
+					 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+							JENTRY_LENMASK)));
 	}
 
-	totallen = buffer->len - offset;
+	/* Total data size is everything we've appended to buffer */
+	totallen = buffer->len - base_offset;
 
+	/* Check length again, since we didn't include the metadata above */
+	if (totallen > JENTRY_LENMASK)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+						JENTRY_LENMASK)));
+
+	/* Initialize the header of this node, in the container's JEntry array */
 	*pheader = JENTRY_ISCONTAINER | totallen;
 }
 
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 91e3e14..7c3c93c 100644
--- a/src/include/utils/jsonb.h
+++ b/src/include/utils/jsonb.h
@@ -83,9 +83,9 @@ typedef struct JsonbValue JsonbValue;
  * buffer is accessed, but they can also be deep copied and passed around.
  *
  * Jsonb is a tree structure. Each node in the tree consists of a JEntry
- * header, and a variable-length content.  The JEntry header indicates what
- * kind of a node it is, e.g. a string or an array, and the offset and length
- * of its variable-length portion within the container.
+ * header and a variable-length content (possibly of zero size).  The JEntry
+ * header indicates what kind of a node it is, e.g. a string or an array,
+ * and includes the length of its variable-length portion.
  *
  * The JEntry and the content of a node are not stored physically together.
  * Instead, the container array or object has an array that holds the JEntrys
@@ -95,39 +95,32 @@ typedef struct JsonbValue JsonbValue;
  * hold its JEntry. Hence, no JEntry header is stored for the root node.  It
  * is implicitly known that the root node must be an array or an object,
  * so we can get away without the type indicator as long as we can distinguish
- * the two.  For that purpose, both an array and an object begins with a uint32
+ * the two.  For that purpose, both an array and an object begin with a uint32
  * header field, which contains an JB_FOBJECT or JB_FARRAY flag.  When a naked
  * scalar value needs to be stored as a Jsonb value, what we actually store is
  * an array with one element, with the flags in the array's header field set
  * to JB_FSCALAR | JB_FARRAY.
  *
- * To encode the length and offset of the variable-length portion of each
- * node in a compact way, the JEntry stores only the end offset within the
- * variable-length portion of the container node. For the first JEntry in the
- * container's JEntry array, that equals to the length of the node data.  The
- * begin offset and length of the rest of the entries can be calculated using
- * the end offset of the previous JEntry in the array.
- *
  * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
  * the variable-length portion of some node types is aligned to a 4-byte
  * boundary, while others are not. When alignment is needed, the padding is
  * in the beginning of the node that requires it. For example, if a numeric
  * node is stored after a string node, so that the numeric node begins at
  * offset 3, the variable-length portion of the numeric node will begin with
- * one padding byte.
+ * one padding byte so that the actual numeric data is 4-byte aligned.
  */
 
 /*
  * Jentry format.
  *
- * The least significant 28 bits store the end offset of the entry (see
- * JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits
- * are used to store the type of the entry. The most significant bit
- * is unused, and should be set to zero.
+ * The least significant 28 bits store the data length of the entry (see
+ * JBE_LENFLD and JBE_LEN macros below). The next three bits store the type
+ * of the entry. The most significant bit is reserved for future use, and
+ * should be set to zero.
  */
 typedef uint32 JEntry;
 
-#define JENTRY_POSMASK			0x0FFFFFFF
+#define JENTRY_LENMASK			0x0FFFFFFF
 #define JENTRY_TYPEMASK			0x70000000
 
 /* values stored in the type bits */
@@ -148,19 +141,30 @@ typedef uint32 JEntry;
 #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
 
 /*
- * Macros for getting the offset and length of an element. Note multiple
- * evaluations and access to prior array element.
+ * Macros for getting the data length of a JEntry.
  */
-#define JBE_ENDPOS(je_)			((je_) & JENTRY_POSMASK)
-#define JBE_OFF(ja, i)			((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
-#define JBE_LEN(ja, i)			((i) == 0 ? JBE_ENDPOS((ja)[i]) \
-								 : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
+#define JBE_LENFLD(je_)			((je_) & JENTRY_LENMASK)
+#define JBE_LEN(ja, i)			JBE_LENFLD((ja)[i])
 
 /*
  * A jsonb array or object node, within a Jsonb Datum.
  *
- * An array has one child for each element. An object has two children for
- * each key/value pair.
+ * An array has one child for each element, stored in array order.
+ *
+ * An object has two children for each key/value pair.  The keys all appear
+ * first, in key sort order; then the values appear, in an order matching the
+ * key order.  This arrangement keeps the keys compact in memory, making a
+ * search for a particular key more cache-friendly.
+ *
+ * The JEntry for each child contains the length of the entry. The offset of
+ * a child's variable-length data, can be found by adding up the lengths of
+ * all preceding children.
+ *
+ * To speed up random access of a large array or object, a redundant array of
+ * offsets is stored after the 'children' array. It contains one element for
+ * every JB_OFFSET_STRIDE children. The offset of the first child is always 0,
+ * which is not included in the offset array. An array or object with less
+ * than JB_OFFSET_STRIDE children has no offsets array.
  */
 typedef struct JsonbContainer
 {
@@ -168,15 +172,37 @@ typedef struct JsonbContainer
 								 * flags */
 	JEntry		children[1];	/* variable length */
 
+	/* an uint32 array of offsets follows. */
+
 	/* the data for each child node follows. */
 } JsonbContainer;
 
 /* flags for the header-field in JsonbContainer */
-#define JB_CMASK				0x0FFFFFFF
-#define JB_FSCALAR				0x10000000
+#define JB_CMASK				0x0FFFFFFF		/* mask for count field */
+#define JB_FSCALAR				0x10000000		/* flag bits */
 #define JB_FOBJECT				0x20000000
 #define JB_FARRAY				0x40000000
 
+/* For every JB_OFFSET_STRIDE children, an offset is stored. */
+#define JB_OFFSET_STRIDE		32
+
+/*
+ * Accessor macros for the fields of a JsonbContainer.
+ *
+ * Beware of multiple evaluation!
+ */
+#define JB_CONTAINER_NCHILDREN(jc) \
+	(((jc)->header & JB_CMASK) * (((jc)->header & JB_FOBJECT) ? 2 : 1))
+
+/* # of offsets stored in the offsets array, for given number of children */
+#define JB_CONTAINER_NUM_OFFSETS(nchildren) \
+	(((nchildren) <= JB_OFFSET_STRIDE) ? 0 : (((nchildren) - 1) / JB_OFFSET_STRIDE))
+
+#define JB_CONTAINER_OFFSETS(jc) \
+	((uint32 *) &(jc)->children[JB_CONTAINER_NCHILDREN(jc)])
+#define JB_CONTAINER_CHILD_DATA(jc) \
+	((char *) &JB_CONTAINER_OFFSETS(jc)[JB_CONTAINER_NUM_OFFSETS(JB_CONTAINER_NCHILDREN(jc))])
+
 /* The top-level on-disk format for a jsonb datum. */
 typedef struct
 {
@@ -248,18 +274,20 @@ struct JsonbValue
 									 (jsonbval)->type <= jbvBool)
 
 /*
- * Pair within an Object.
+ * Key/value pair within an Object.
  *
- * Pairs with duplicate keys are de-duplicated.  We store the order for the
- * benefit of doing so in a well-defined way with respect to the original
- * observed order (which is "last observed wins").  This is only used briefly
- * when originally constructing a Jsonb.
+ * This struct type is only used briefly while constructing a Jsonb; it is
+ * *not* the on-disk representation.
+ *
+ * Pairs with duplicate keys are de-duplicated.  We store the originally
+ * observed pair ordering for the purpose of removing duplicates in a
+ * well-defined way (which is "last observed wins").
  */
 struct JsonbPair
 {
 	JsonbValue	key;			/* Must be a jbvString */
 	JsonbValue	value;			/* May be of any type */
-	uint32		order;			/* preserves order of pairs with equal keys */
+	uint32		order;			/* Pair's index in original sequence */
 };
 
 /* Conversion state used when parsing Jsonb from text, or for type coercion */
@@ -287,20 +315,25 @@ typedef struct JsonbIterator
 {
 	/* Container being iterated */
 	JsonbContainer *container;
-	uint32		nElems;			/* Number of elements in children array (will be
-								 * nPairs for objects) */
+	uint32		nElems;			/* Number of elements in children array (will
+								 * be nPairs for objects) */
 	bool		isScalar;		/* Pseudo-array scalar value? */
-	JEntry	   *children;
+	JEntry	   *children;		/* JEntrys for child nodes */
+	/* Data proper.  This points to the beginning of the variable-length data */
+	char	   *dataProper;
 
-	/* Current item in buffer (up to nElems, but must * 2 for objects) */
-	int			i;
+	/* Current item in buffer (up to nElems) */
+	int			curIndex;
+
+	/* Data offset corresponding to current item */
+	uint32		curDataOffset;
 
 	/*
-	 * Data proper.  This points just past end of children array.
-	 * We use the JBE_OFF() macro on the Jentrys to find offsets of each
-	 * child in this area.
+	 * If the container is an object, we want to return keys and values
+	 * alternately; so curDataOffset points to the current key, and
+	 * curValueOffset points to the current value.
 	 */
-	char	   *dataProper;
+	uint32		curValueOffset;
 
 	/* Private state */
 	JsonbIterState state;
#153Tom Lane
tgl@sss.pgh.pa.us
In reply to: Heikki Linnakangas (#152)
Re: jsonb format is pessimal for toast compression

Heikki Linnakangas <hlinnakangas@vmware.com> writes:

Tom: You mentioned earlier that your patch fixes some existing bugs.
What were they?

What I remember at the moment (sans caffeine) is that the routines for
assembling jsonb values out of field data were lacking some necessary
tests for overflow of the size/offset fields. If you like I can apply
those fixes separately, but I think they were sufficiently integrated with
other changes in the logic that it wouldn't really help much for patch
reviewability.
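
To make that concrete, a minimal sketch of the sort of guard in question (hypothetical names: nChildren, children[] and append_child() stand in for the real conversion loop; this is not the committed code):

    int         totallen = 0;
    int         i;

    for (i = 0; i < nChildren; i++)
    {
        /* append child i's variable-length data, returning its length */
        int         len = append_child(buffer, &children[i]);

        totallen += len;

        /*
         * Re-check on every iteration, not just once at the end, so the
         * running total cannot quietly overflow before we notice.
         */
        if (totallen > JENTRY_LENMASK)
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("total size of jsonb elements exceeds the maximum of %u bytes",
                            JENTRY_LENMASK)));
    }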

regards, tom lane


#154Josh Berkus
josh@agliodbs.com
In reply to: Tom Lane (#1)
Re: jsonb format is pessimal for toast compression

On 09/19/2014 07:07 AM, Tom Lane wrote:

Heikki Linnakangas <hlinnakangas@vmware.com> writes:

Tom: You mentioned earlier that your patch fixes some existing bugs.
What were they?

What I remember at the moment (sans caffeine) is that the routines for
assembling jsonb values out of field data were lacking some necessary
tests for overflow of the size/offset fields. If you like I can apply
those fixes separately, but I think they were sufficiently integrated with
other changes in the logic that it wouldn't really help much for patch
reviewability.

Where are we on this? Do we have a patch ready for testing?

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#155Peter Geoghegan
pg@heroku.com
In reply to: Heikki Linnakangas (#152)
Re: jsonb format is pessimal for toast compression

On Fri, Sep 19, 2014 at 5:40 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

I think we should bite the bullet and break compatibility with 9.4beta2
format, even if we go with "my patch". In a jsonb object, it makes sense to
store all the keys first, like Tom did, because of cache benefits, and the
future possibility to do smart EXTERNAL access. Also, even if we can make
the on-disk format compatible, it's weird that you can get different runtime
behavior with datums created with a beta version. Seems more clear to just
require a pg_dump + restore.

I vote for going with your patch, and breaking compatibility for the
reasons stated here (though I'm skeptical of the claims about cache
benefits, FWIW).

--
Peter Geoghegan


#156Tom Lane
tgl@sss.pgh.pa.us
In reply to: Peter Geoghegan (#155)
Re: jsonb format is pessimal for toast compression

Peter Geoghegan <pg@heroku.com> writes:

On Fri, Sep 19, 2014 at 5:40 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

I think we should bite the bullet and break compatibility with 9.4beta2
format, even if we go with "my patch". In a jsonb object, it makes sense to
store all the keys first, like Tom did, because of cache benefits, and the
future possibility to do smart EXTERNAL access. Also, even if we can make
the on-disk format compatible, it's weird that you can get different runtime
behavior with datums created with a beta version. Seems more clear to just
require a pg_dump + restore.

I vote for going with your patch, and breaking compatibility for the
reasons stated here (though I'm skeptical of the claims about cache
benefits, FWIW).

I'm also skeptical of that, but I think the potential for smart EXTERNAL
access is a valid consideration.

I've not had time to read Heikki's updated patch yet --- has anyone
else compared the two patches for code readability? If they're fairly
close on that score, then I'd agree his approach is the best solution.
(I will look at his code, but I'm not sure I'm the most unbiased
observer.)

regards, tom lane


#157Jan Wieck
jan@wi3ck.info
In reply to: Craig Ringer (#135)
Re: jsonb format is pessimal for toast compression

On 09/15/2014 09:46 PM, Craig Ringer wrote:

On 09/16/2014 07:44 AM, Peter Geoghegan wrote:

FWIW, I am slightly concerned about weighing use cases around very
large JSON documents too heavily. Having enormous jsonb documents just
isn't going to work out that well, but neither will equivalent designs
in popular document database systems for similar reasons. For example,
the maximum BSON document size supported by MongoDB is 16 megabytes,
and that seems to be something that their users don't care too much
about. Having 270 pairs in an object isn't unreasonable, but it isn't
going to be all that common either.

Also, at a certain size the fact that Pg must rewrite the whole document
for any change to it starts to introduce other practical changes.

Anyway - this is looking like the change will go in, and with it a
catversion bump. Introduction of a jsonb version/flags byte might be
worthwhile at the same time. It seems likely that there'll be more room
for improvement in jsonb, possibly even down to using different formats
for different data.

Is it worth paying a byte per value to save on possible upgrade pain?

This comment seems to have drowned in the discussion.

If there indeed has to be a catversion bump in the process of this, then
I agree with Craig.

Jan

--
Jan Wieck
Senior Software Engineer
http://slony.info


#158Peter Geoghegan
pg@heroku.com
In reply to: Jan Wieck (#157)
Re: jsonb format is pessimal for toast compression

On Tue, Sep 23, 2014 at 10:02 PM, Jan Wieck <jan@wi3ck.info> wrote:

Is it worth paying a byte per value to save on possible upgrade pain?

This comment seems to have drowned in the discussion.

If there indeed has to be a catversion bump in the process of this, then I
agree with Craig.

-1. We already have a reserved bit.

--
Peter Geoghegan


#159Tom Lane
tgl@sss.pgh.pa.us
In reply to: Jan Wieck (#157)
Re: jsonb format is pessimal for toast compression

Jan Wieck <jan@wi3ck.info> writes:

On 09/15/2014 09:46 PM, Craig Ringer wrote:

Anyway - this is looking like the change will go in, and with it a
catversion bump. Introduction of a jsonb version/flags byte might be
worthwhile at the same time. It seems likely that there'll be more room
for improvement in jsonb, possibly even down to using different formats
for different data.

Is it worth paying a byte per value to save on possible upgrade pain?

If there indeed has to be a catversion bump in the process of this, then
I agree with Craig.

FWIW, I don't really. To begin with, it wouldn't be a byte per value,
it'd be four bytes, because we need word-alignment of the jsonb contents
so there's noplace to squeeze in an ID byte for free. Secondly, as I
wrote in <15378.1408548595@sss.pgh.pa.us>:

: There remains the
: question of whether to take this opportunity to add a version ID to the
: binary format. I'm not as excited about that idea as I originally was;
: having now studied the code more carefully, I think that any expansion
: would likely happen by adding more type codes and/or commandeering the
: currently-unused high-order bit of JEntrys. We don't need a version ID
: in the header for that. Moreover, if we did have such an ID, it would be
: notationally painful to get it to most of the places that might need it.

Heikki's patch would eat up the high-order JEntry bits, but the other
points remain.
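
For onlookers, the alignment point is just ordinary struct padding; a rough illustration (hypothetical struct names, not proposed code):

    typedef struct
    {
        int32       vl_len_;    /* varlena header */
        uint32      header;     /* count + flags; word-aligned payload follows */
    } JsonbAsToday;             /* 8 bytes */

    typedef struct
    {
        int32       vl_len_;    /* varlena header */
        uint8       version;    /* one byte of information ... */
        /*
         * ... but three padding bytes land here, because the uint32 header
         * (and the JEntry array after it) must stay 4-byte aligned.
         */
        uint32      header;
    } JsonbWithVersionByte;     /* 12 bytes on the usual ABIs */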

regards, tom lane


#160Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Tom Lane (#159)
Re: jsonb format is pessimal for toast compression

On 09/24/2014 08:16 AM, Tom Lane wrote:

Jan Wieck <jan@wi3ck.info> writes:

On 09/15/2014 09:46 PM, Craig Ringer wrote:

Anyway - this is looking like the change will go in, and with it a
catversion bump. Introduction of a jsonb version/flags byte might be
worthwhile at the same time. It seems likely that there'll be more room
for improvement in jsonb, possibly even down to using different formats
for different data.

Is it worth paying a byte per value to save on possible upgrade pain?

If there indeed has to be a catversion bump in the process of this, then
I agree with Craig.

FWIW, I don't really. To begin with, it wouldn't be a byte per value,
it'd be four bytes, because we need word-alignment of the jsonb contents
so there's noplace to squeeze in an ID byte for free. Secondly, as I
wrote in <15378.1408548595@sss.pgh.pa.us>:

: There remains the
: question of whether to take this opportunity to add a version ID to the
: binary format. I'm not as excited about that idea as I originally was;
: having now studied the code more carefully, I think that any expansion
: would likely happen by adding more type codes and/or commandeering the
: currently-unused high-order bit of JEntrys. We don't need a version ID
: in the header for that. Moreover, if we did have such an ID, it would be
: notationally painful to get it to most of the places that might need it.

Heikki's patch would eat up the high-order JEntry bits, but the other
points remain.

If we don't need to be backwards-compatible with the 9.4beta on-disk
format, we don't necessarily need to eat the high-order JEntry bit. You
can just assume that every nth element is stored as an offset, and
the rest as lengths. Although it would be nice to have the flag for it
explicitly.
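
To spell that out, one way to read it (an assumption on my part, with made-up names and stride, not taken from either posted patch) is that entries at indexes 0, n, 2n, ... hold the end offset of their child, and everything else holds a plain length:

    #define STRIDE      32          /* the assumed "n" */
    #define LENMASK     0x0FFFFFFF  /* low 28 bits, as in a JEntry */

    static uint32
    child_start_offset(const uint32 *entries, int i)
    {
        uint32      offset;
        int         j;

        if (i == 0)
            return 0;

        /* jump to the nearest preceding entry that is stored as an end offset */
        j = ((i - 1) / STRIDE) * STRIDE;
        offset = entries[j] & LENMASK;  /* end of child j == start of child j+1 */

        /* the entries in between hold plain lengths; add them up */
        for (j = j + 1; j < i; j++)
            offset += entries[j] & LENMASK;

        return offset;
    }

The catch is that the reader has to know the writer's STRIDE, which is why an explicit flag would be nicer.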

There are also a few free bits in the JsonbContainer header that can be
used as a version ID in the future. So I don't think we need to change
the format to add an explicit version ID field.

- Heikki


#161Tom Lane
tgl@sss.pgh.pa.us
In reply to: Heikki Linnakangas (#160)
Re: jsonb format is pessimal for toast compression

Heikki Linnakangas <hlinnakangas@vmware.com> writes:

On 09/24/2014 08:16 AM, Tom Lane wrote:

Heikki's patch would eat up the high-order JEntry bits, but the other
points remain.

If we don't need to be backwards-compatible with the 9.4beta on-disk
format, we don't necessarily need to eat the high-order JEntry bit. You
can just assume that every nth element is stored as an offset, and
the rest as lengths. Although it would be nice to have the flag for it
explicitly.

If we go with this approach, I think that we *should* eat the high bit
for it. The main reason I want to do that is that it avoids having to
engrave the value of N on stone tablets. I think that we should use
a pretty large value of N --- maybe 32 or so --- and having the freedom
to change it later based on experience seems like a good thing.
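
A sketch of what the flag buys (hypothetical names again, not a finished patch): if the spare high bit of a JEntry means "this field is an end offset rather than a length", the reader just scans back to the nearest flagged entry and never needs to know which stride the writer used.

    #define JENTRY_OFFLENMASK   0x0FFFFFFF
    #define JENTRY_HAS_OFF      0x80000000  /* the currently-unused high bit */

    static uint32
    child_start_offset_flagged(const JEntry *entries, int i)
    {
        uint32      offset = 0;
        int         j;

        for (j = i - 1; j >= 0; j--)
        {
            offset += entries[j] & JENTRY_OFFLENMASK;
            if (entries[j] & JENTRY_HAS_OFF)
                break;          /* that entry held a cumulative end offset */
        }
        return offset;
    }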

regards, tom lane


#162Andres Freund
andres@2ndquadrant.com
In reply to: Heikki Linnakangas (#152)
Re: jsonb format is pessimal for toast compression

On 2014-09-19 15:40:14 +0300, Heikki Linnakangas wrote:

On 09/18/2014 09:27 PM, Heikki Linnakangas wrote:

I'll try to write a more polished patch tomorrow. We'll then see what it
looks like, and can decide if we want it.

Ok, here are two patches. One is a refined version of my earlier patch, and
the other implements the separate offsets array approach. They are both
based on Tom's jsonb-lengths-merged.patch, so they include all the
whitespace fixes etc. he mentioned.

There is no big difference in terms of code complexity between the patches.
IMHO the separate offsets array is easier to understand, but it makes for
more complicated accessor macros to find the beginning of the
variable-length data.

I personally am pretty clearly in favor of Heikki's version. I think it
could stand to slightly expand the reasoning behind the mixed
length/offset format; it's not immediately obvious why the offsets are
problematic for compression. Otherwise, based on a cursory look, it
looks good.

But independent of which version is chosen, we *REALLY* need to make the
decision soon. This issue has held up the next beta (like jsonb has
blocked previous beta) for *weeks*.

Personally it doesn't make me very happy that Heikki and Tom had to be
the people stepping up to fix this.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#163Josh Berkus
josh@agliodbs.com
In reply to: Heikki Linnakangas (#140)
Re: jsonb format is pessimal for toast compression

On 09/25/2014 09:01 AM, Andres Freund wrote:

But independent of which version is chosen, we *REALLY* need to make the
decision soon. This issue has held up the next beta (like jsonb has
blocked previous beta) for *weeks*.

Yes, please!

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#164Bruce Momjian
bruce@momjian.us
In reply to: Andres Freund (#162)
Re: jsonb format is pessimal for toast compression

On Thu, Sep 25, 2014 at 06:01:08PM +0200, Andres Freund wrote:

But independent of which version is chosen, we *REALLY* need to make the
decision soon. This issue has held up the next beta (like jsonb has
blocked previous beta) for *weeks*.

Personally it doesn't make me very happy that Heikki and Tom had to be
the people stepping up to fix this.

I think there are a few reasons this has been delayed, aside from the
scheduling ones:

1. compression issues were a surprise, and we are wondering if
there are any other surprises
2. pg_upgrade makes future data format changes problematic
3. 9.3 multi-xact bugs spooked us into being more careful

I am not sure what we can do to increase our speed based on these items.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +


#165Josh Berkus
josh@agliodbs.com
In reply to: Josh Berkus (#143)
Re: jsonb format is pessimal for toast compression

On 09/25/2014 10:14 AM, Bruce Momjian wrote:

On Thu, Sep 25, 2014 at 06:01:08PM +0200, Andres Freund wrote:

But independent of which version is chosen, we *REALLY* need to make the
decision soon. This issue has held up the next beta (like jsonb has
blocked previous beta) for *weeks*.

Personally it doesn't make me very happy that Heikki and Tom had to be
the people stepping up to fix this.

I think there are a few reasons this has been delayed, aside from the
scheduling ones:

1. compression issues were a surprise, and we are wondering if
there are any other surprises
2. pg_upgrade makes future data format changes problematic
3. 9.3 multi-xact bugs spooked us into being more careful

I am not sure what we can do to increase our speed based on these items.

Alternately, this is delayed because:

1. We have one tested patch to fix the issue.

2. However, people are convinced that there's a better patch possible.

3. But nobody is working on this better patch except "in their spare time".

Given this, I once again vote for releasing based on Tom's lengths-only
patch, which is done, tested, and ready to go.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#166Andres Freund
andres@2ndquadrant.com
In reply to: Josh Berkus (#165)
Re: jsonb format is pessimal for toast compression

On 2014-09-25 10:18:24 -0700, Josh Berkus wrote:

On 09/25/2014 10:14 AM, Bruce Momjian wrote:

On Thu, Sep 25, 2014 at 06:01:08PM +0200, Andres Freund wrote:

But independent of which version is chosen, we *REALLY* need to make the
decision soon. This issue has held up the next beta (like jsonb has
blocked previous beta) for *weeks*.

Personally it doesn't make me very happy that Heikki and Tom had to be
the people stepping up to fix this.

I think there are a few reasons this has been delayed, aside from the
scheduling ones:

1. compression issues were a surprise, and we are wondering if
there are any other surprises
2. pg_upgrade makes future data format changes problematic
3. 9.3 multi-xact bugs spooked us into being more careful

I am not sure what we can do to increase our speed based on these items.

Alternately, this is delayed because:

1. We have one tested patch to fix the issue.

2. However, people are convinced that there's a better patch possible.

3. But nobody is working on this better patch except "in their spare time".

Given this, I once again vote for releasing based on Tom's lengths-only
patch, which is done, tested, and ready to go.

Heikki's patch is there and polished.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#167Josh Berkus
josh@agliodbs.com
In reply to: Heikki Linnakangas (#146)
Re: jsonb format is pessimal for toast compression

On 09/25/2014 10:20 AM, Andres Freund wrote:

On 2014-09-25 10:18:24 -0700, Josh Berkus wrote:

On 09/25/2014 10:14 AM, Bruce Momjian wrote:

On Thu, Sep 25, 2014 at 06:01:08PM +0200, Andres Freund wrote:

But independent of which version is chosen, we *REALLY* need to make the
decision soon. This issue has held up the next beta (like jsonb has
blocked previous beta) for *weeks*.

Personally it doesn't make me very happy that Heikki and Tom had to be
the people stepping up to fix this.

I think there are a few reasons this has been delayed, aside from the
scheduling ones:

1. compression issues were a surprise, and we are wondering if
there are any other surprises
2. pg_upgrade makes future data format changes problematic
3. 9.3 multi-xact bugs spooked us into being more careful

I am not sure what we can do to increase our speed based on these items.

Alternately, this is delayed because:

1. We have one tested patch to fix the issue.

2. However, people are convinced that there's a better patch possible.

3. But nobody is working on this better patch except "in their spare time".

Given this, I once again vote for releasing based on Tom's lengths-only
patch, which is done, tested, and ready to go.

Heikki's patch is there and polished.

If Heikki says it's ready, I'll test. So far he's said that it wasn't
done yet.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#168Andres Freund
andres@2ndquadrant.com
In reply to: Josh Berkus (#167)
Re: jsonb format is pessimal for toast compression

On 2014-09-25 10:25:24 -0700, Josh Berkus wrote:

If Heikki says it's ready, I'll test. So far he's said that it wasn't
done yet.

/messages/by-id/541C242E.3030004@vmware.com

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#169Josh Berkus
josh@agliodbs.com
In reply to: Josh Berkus (#150)
Re: jsonb format is pessimal for toast compression

On 09/25/2014 10:26 AM, Andres Freund wrote:

On 2014-09-25 10:25:24 -0700, Josh Berkus wrote:

If Heikki says it's ready, I'll test. So far he's said that it wasn't
done yet.

/messages/by-id/541C242E.3030004@vmware.com

Yeah, and that didn't include some of Tom's bug fixes apparently, per
the succeeding message. Which is why I asked Heikki if he was done, to
which he has not replied.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


#170Andres Freund
andres@2ndquadrant.com
In reply to: Josh Berkus (#169)
Re: jsonb format is pessimal for toast compression

On 2014-09-25 10:29:51 -0700, Josh Berkus wrote:

On 09/25/2014 10:26 AM, Andres Freund wrote:

On 2014-09-25 10:25:24 -0700, Josh Berkus wrote:

If Heikki says it's ready, I'll test. So far he's said that it wasn't
done yet.

/messages/by-id/541C242E.3030004@vmware.com

Yeah, and that didn't include some of Tom's bug fixes apparently, per
the succeeding message. Which is why I asked Heikki if he was done, to
which he has not replied.

Well, Heikki said he doesn't see any fixes in Tom's patch. But either
way, this isn't anything that should prevent you from testing.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#171Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#169)
Re: jsonb format is pessimal for toast compression

Josh Berkus <josh@agliodbs.com> writes:

On 09/25/2014 10:26 AM, Andres Freund wrote:

On 2014-09-25 10:25:24 -0700, Josh Berkus wrote:

If Heikki says it's ready, I'll test. So far he's said that it wasn't
done yet.

/messages/by-id/541C242E.3030004@vmware.com

Yeah, and that didn't include some of Tom's bug fixes apparently, per
the succeeding message. Which is why I asked Heikki if he was done, to
which he has not replied.

I took a quick look at the two patches Heikki posted. I find the
"separate offsets array" approach unappealing. It takes more space
than the other approaches, and that space will be filled with data
that we already know will not be at all compressible. Moreover,
AFAICS we'd have to engrave the stride on stone tablets, which as
I already mentioned I'd really like to not do.

The "offsets-and-lengths" patch seems like the approach we ought to
compare to my patch, but it looks pretty unfinished to me: AFAICS it
includes logic to understand offsets sprinkled into a mostly-lengths
array, but no logic that would actually *store* any such offsets,
which means it's going to act just like my patch for performance
purposes.
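
To make that concrete, decoding such a mixed offsets-and-lengths array amounts
to something like the following standalone C sketch: walk backwards, summing
lengths, until an entry that carries a stored end offset is found. The names
and flag layout here are assumptions for illustration only, not code taken
from either patch.

    #include <stdint.h>

    typedef uint32_t JEntrySketch;

    #define SKETCH_OFFLENMASK  0x0FFFFFFF   /* low bits: length or end offset */
    #define SKETCH_HAS_OFF     0x80000000   /* set when the low bits hold an offset */

    /*
     * Recover the start offset of child 'index' from an entry array that
     * mostly stores lengths, with an occasional stored end offset.
     */
    uint32_t
    sketch_child_offset(const JEntrySketch *children, int index)
    {
        uint32_t    offset = 0;
        int         i;

        for (i = index - 1; i >= 0; i--)
        {
            offset += children[i] & SKETCH_OFFLENMASK;
            if (children[i] & SKETCH_HAS_OFF)
                break;          /* found a stored end offset; stop summing */
        }
        return offset;
    }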

In the interests of pushing this forward, I will work today on
trying to finish and review Heikki's offsets-and-lengths patch
so that we have something we can do performance testing on.
I doubt that the performance testing will tell us anything we
don't expect, but we should do it anyway.

regards, tom lane

#172Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Bruce Momjian (#164)
Re: jsonb format is pessimal for toast compression

Bruce Momjian wrote:

3. 9.3 multi-xact bugs spooked us into being more careful

Uh. Multixact changes in 9.3 were infinitely more invasive than the
jsonb changes will ever be. a) they touched basic visibility design and routines,
which are complex, understood by very few people, and have remained
mostly unchanged for ages; b) they changed on-disk format for an
underlying support structure, requiring pg_upgrade to handle the
conversion; c) they added new catalog infrastructure to keep track of
required freezing; d) they introduced new uint32 counters subject to
wraparound; e) they introduced a novel user of slru.c with 5-char long
filenames; f) they messed with tuple locking protocol and EvalPlanQual
logic for traversing update chains. Maybe I'm forgetting others.

JSONB has none of these properties. As far as I can see, the only hairy
issue here (other than getting Josh Berkus to actually test the proposed
patches) is that JSONB is changing on-disk format; but we're avoiding
most issues there by dictating that people with existing JSONB databases
need to pg_dump them, i.e. there is no conversion step being written for
pg_upgrade.

It's good to be careful; it's even better to be more careful. I too
have learned a lesson there.

Anyway I have no opinion on the JSONB stuff, other than considering that
ignoring performance for large arrays and large objects seems to run
counter to the whole point of JSONB in the first place (and of course
failing to compress is part of that, too.)

--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#173Josh Berkus
josh@agliodbs.com
In reply to: Josh Berkus (#150)
Re: jsonb format is pessimal for toast compression

On 09/25/2014 11:22 AM, Tom Lane wrote:

In the interests of pushing this forward, I will work today on
trying to finish and review Heikki's offsets-and-lengths patch
so that we have something we can do performance testing on.
I doubt that the performance testing will tell us anything we
don't expect, but we should do it anyway.

OK. I'll spend some time trying to get Socorro with JSONB working so
that I'll have a second test case.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

#174Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#173)
Re: jsonb format is pessimal for toast compression

BTW, it seems like there is consensus that we ought to reorder the items
in a jsonb object to have keys first and then values, independently of the
other issues under discussion. This means we *will* be breaking on-disk
compatibility with 9.4beta2, which means pg_upgrade will need to be taught
to refuse an upgrade if the database contains any jsonb columns. Bruce,
do you have time to crank out a patch for that?
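
(To illustrate the reordering with a hypothetical two-pair object such as
{"b": 2, "a": 1}: the child JEntrys and their data would be stored as
key "a", key "b", value 1, value 2, rather than interleaved as key, value,
key, value. Keys are already kept in sorted order, so only the key/value
interleaving changes.)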

regards, tom lane

#175Bruce Momjian
bruce@momjian.us
In reply to: Tom Lane (#174)
Re: jsonb format is pessimal for toast compression

On Thu, Sep 25, 2014 at 02:39:37PM -0400, Tom Lane wrote:

BTW, it seems like there is consensus that we ought to reorder the items
in a jsonb object to have keys first and then values, independently of the
other issues under discussion. This means we *will* be breaking on-disk
compatibility with 9.4beta2, which means pg_upgrade will need to be taught
to refuse an upgrade if the database contains any jsonb columns. Bruce,
do you have time to crank out a patch for that?

Yes, I can do that easily. Tell me when you want it --- I just need a
catalog version number to trigger on.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +

#176Andres Freund
andres@2ndquadrant.com
In reply to: Bruce Momjian (#175)
Re: jsonb format is pessimal for toast compression

On 2014-09-25 14:46:18 -0400, Bruce Momjian wrote:

On Thu, Sep 25, 2014 at 02:39:37PM -0400, Tom Lane wrote:

BTW, it seems like there is consensus that we ought to reorder the items
in a jsonb object to have keys first and then values, independently of the
other issues under discussion. This means we *will* be breaking on-disk
compatibility with 9.4beta2, which means pg_upgrade will need to be taught
to refuse an upgrade if the database contains any jsonb columns. Bruce,
do you have time to crank out a patch for that?

Yes, I can do that easily. Tell me when you want it --- I just need a
catalog version number to trigger on.

Do you plan to make it conditional on jsonb being used in the database?
That would help reduce the pain for testers who haven't used jsonb.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#177Bruce Momjian
bruce@momjian.us
In reply to: Andres Freund (#176)
Re: jsonb format is pessimal for toast compression

On Thu, Sep 25, 2014 at 09:00:07PM +0200, Andres Freund wrote:

On 2014-09-25 14:46:18 -0400, Bruce Momjian wrote:

On Thu, Sep 25, 2014 at 02:39:37PM -0400, Tom Lane wrote:

BTW, it seems like there is consensus that we ought to reorder the items
in a jsonb object to have keys first and then values, independently of the
other issues under discussion. This means we *will* be breaking on-disk
compatibility with 9.4beta2, which means pg_upgrade will need to be taught
to refuse an upgrade if the database contains any jsonb columns. Bruce,
do you have time to crank out a patch for that?

Yes, I can do that easily. Tell me when you want it --- I just need a
catalog version number to trigger on.

Do you plan to make it conditional on jsonb being used in the database?
That would help reduce the pain for testers who haven't used jsonb.

Yes, I already have code that scans pg_attribute looking for columns
with problematic data types, outputs them to a file, and then throws an
error.
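
A standalone sketch of that kind of scan, using plain libpq rather than
pg_upgrade's own helpers, might look like the following. The query shape,
connection string, and error handling are assumptions for illustration,
not the actual patch.

    #include <stdio.h>
    #include <libpq-fe.h>

    int
    main(void)
    {
        PGconn     *conn = PQconnectdb("dbname=postgres");
        PGresult   *res;
        int         ntup;
        int         i;

        if (PQstatus(conn) != CONNECTION_OK)
        {
            fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
            return 1;
        }

        /* Find user columns whose declared type is jsonb */
        res = PQexec(conn,
                     "SELECT n.nspname, c.relname, a.attname "
                     "FROM pg_catalog.pg_class c, "
                     "     pg_catalog.pg_namespace n, "
                     "     pg_catalog.pg_attribute a "
                     "WHERE c.oid = a.attrelid "
                     "  AND c.relnamespace = n.oid "
                     "  AND a.attnum > 0 "
                     "  AND NOT a.attisdropped "
                     "  AND a.atttypid = 'pg_catalog.jsonb'::pg_catalog.regtype");

        if (PQresultStatus(res) != PGRES_TUPLES_OK)
        {
            fprintf(stderr, "query failed: %s", PQerrorMessage(conn));
            PQclear(res);
            PQfinish(conn);
            return 1;
        }

        /*
         * Report each offending column; a real check would also have to
         * consider domains, composite types, and so on.
         */
        ntup = PQntuples(res);
        for (i = 0; i < ntup; i++)
            printf("%s.%s.%s\n",
                   PQgetvalue(res, i, 0),
                   PQgetvalue(res, i, 1),
                   PQgetvalue(res, i, 2));

        if (ntup > 0)
            fprintf(stderr, "database contains jsonb columns; refuse the upgrade\n");

        PQclear(res);
        PQfinish(conn);
        return ntup > 0 ? 2 : 0;
    }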

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +

#178Tom Lane
tgl@sss.pgh.pa.us
In reply to: Tom Lane (#171)
1 attachment(s)
Re: jsonb format is pessimal for toast compression

I wrote:

The "offsets-and-lengths" patch seems like the approach we ought to
compare to my patch, but it looks pretty unfinished to me: AFAICS it
includes logic to understand offsets sprinkled into a mostly-lengths
array, but no logic that would actually *store* any such offsets,
which means it's going to act just like my patch for performance
purposes.

In the interests of pushing this forward, I will work today on
trying to finish and review Heikki's offsets-and-lengths patch
so that we have something we can do performance testing on.
I doubt that the performance testing will tell us anything we
don't expect, but we should do it anyway.

I've now done that, and attached is what I think would be a committable
version. Having done this work, I no longer think that this approach
is significantly messier code-wise than the all-lengths version, and
it does have the merit of not degrading on very large objects/arrays.
So at the moment I'm leaning to this solution not the all-lengths one.

To get a sense of the compression effects of varying the stride distance,
I repeated the compression measurements I'd done on 14 August with Pavel's
geometry data (<24077.1408052877@sss.pgh.pa.us>). The upshot of that was

                                            min     max      avg

external text representation                220  172685    880.3
JSON representation (compressed text)       224   78565    541.3
pg_column_size, JSONB HEAD repr.            225   82540    639.0
pg_column_size, all-lengths repr.           225   66794    531.1

Here's what I get with this patch and different stride distances:

JB_OFFSET_STRIDE = 8                        225   68551    559.7
JB_OFFSET_STRIDE = 16                       225   67601    552.3
JB_OFFSET_STRIDE = 32                       225   67120    547.4
JB_OFFSET_STRIDE = 64                       225   66886    546.9
JB_OFFSET_STRIDE = 128                      225   66879    546.9
JB_OFFSET_STRIDE = 256                      225   66846    546.8

So at least for that test data, 32 seems like the sweet spot.
We are giving up a couple percent of space in comparison to the
all-lengths version, but this is probably an acceptable tradeoff
for not degrading on very large arrays.
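
(To spell out the tradeoff at that setting: only one JEntry in 32 stores a
raw end offset, which is what keeps the JEntry array compressible, while
recovering the offset or length of any one child means examining at most
32 JEntrys, so access stays O(1) no matter how large the container is.)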

I've not done any speed testing.

regards, tom lane

Attachments:

jsonb-with-offsets-and-lengths-4.patch (text/x-diff; charset=us-ascii)
diff --git a/src/backend/utils/adt/jsonb.c b/src/backend/utils/adt/jsonb.c
index 2fd87fc..9beebb3 100644
*** a/src/backend/utils/adt/jsonb.c
--- b/src/backend/utils/adt/jsonb.c
*************** jsonb_from_cstring(char *json, int len)
*** 196,207 ****
  static size_t
  checkStringLen(size_t len)
  {
! 	if (len > JENTRY_POSMASK)
  		ereport(ERROR,
  				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  				 errmsg("string too long to represent as jsonb string"),
  				 errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.",
! 						   JENTRY_POSMASK)));
  
  	return len;
  }
--- 196,207 ----
  static size_t
  checkStringLen(size_t len)
  {
! 	if (len > JENTRY_OFFLENMASK)
  		ereport(ERROR,
  				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  				 errmsg("string too long to represent as jsonb string"),
  				 errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.",
! 						   JENTRY_OFFLENMASK)));
  
  	return len;
  }
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..f157df3 100644
*** a/src/backend/utils/adt/jsonb_util.c
--- b/src/backend/utils/adt/jsonb_util.c
***************
*** 26,40 ****
   * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
   * reserved for that in the JsonbContainer.header field.
   *
!  * (the total size of an array's elements is also limited by JENTRY_POSMASK,
!  * but we're not concerned about that here)
   */
  #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
  #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
  
! static void fillJsonbValue(JEntry *array, int index, char *base_addr,
  			   JsonbValue *result);
! static bool	equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static int	compareJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static Jsonb *convertToJsonb(JsonbValue *val);
  static void convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
--- 26,41 ----
   * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
   * reserved for that in the JsonbContainer.header field.
   *
!  * (The total size of an array's or object's elements is also limited by
!  * JENTRY_OFFLENMASK, but we're not concerned about that here.)
   */
  #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
  #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
  
! static void fillJsonbValue(JsonbContainer *container, int index,
! 			   char *base_addr, uint32 offset,
  			   JsonbValue *result);
! static bool equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static int	compareJsonbScalarValue(JsonbValue *a, JsonbValue *b);
  static Jsonb *convertToJsonb(JsonbValue *val);
  static void convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
*************** static void convertJsonbArray(StringInfo
*** 42,48 ****
  static void convertJsonbObject(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
  static void convertJsonbScalar(StringInfo buffer, JEntry *header, JsonbValue *scalarVal);
  
! static int reserveFromBuffer(StringInfo buffer, int len);
  static void appendToBuffer(StringInfo buffer, const char *data, int len);
  static void copyToBuffer(StringInfo buffer, int offset, const char *data, int len);
  static short padBufferToInt(StringInfo buffer);
--- 43,49 ----
  static void convertJsonbObject(StringInfo buffer, JEntry *header, JsonbValue *val, int level);
  static void convertJsonbScalar(StringInfo buffer, JEntry *header, JsonbValue *scalarVal);
  
! static int	reserveFromBuffer(StringInfo buffer, int len);
  static void appendToBuffer(StringInfo buffer, const char *data, int len);
  static void copyToBuffer(StringInfo buffer, int offset, const char *data, int len);
  static short padBufferToInt(StringInfo buffer);
*************** JsonbValueToJsonb(JsonbValue *val)
*** 108,113 ****
--- 109,166 ----
  }
  
  /*
+  * Get the offset of the variable-length portion of a Jsonb node within
+  * the variable-length-data part of its container.  The node is identified
+  * by index within the container's JEntry array.
+  */
+ uint32
+ getJsonbOffset(const JsonbContainer *jc, int index)
+ {
+ 	uint32		offset = 0;
+ 	int			i;
+ 
+ 	/*
+ 	 * Start offset of this entry is equal to the end offset of the previous
+ 	 * entry.  Walk backwards to the most recent entry stored as an end
+ 	 * offset, returning that offset plus any lengths in between.
+ 	 */
+ 	for (i = index - 1; i >= 0; i--)
+ 	{
+ 		offset += JBE_OFFLENFLD(jc->children[i]);
+ 		if (JBE_HAS_OFF(jc->children[i]))
+ 			break;
+ 	}
+ 
+ 	return offset;
+ }
+ 
+ /*
+  * Get the length of the variable-length portion of a Jsonb node.
+  * The node is identified by index within the container's JEntry array.
+  */
+ uint32
+ getJsonbLength(const JsonbContainer *jc, int index)
+ {
+ 	uint32		off;
+ 	uint32		len;
+ 
+ 	/*
+ 	 * If the length is stored directly in the JEntry, just return it.
+ 	 * Otherwise, get the begin offset of the entry, and subtract that from
+ 	 * the stored end+1 offset.
+ 	 */
+ 	if (JBE_HAS_OFF(jc->children[index]))
+ 	{
+ 		off = getJsonbOffset(jc, index);
+ 		len = JBE_OFFLENFLD(jc->children[index]) - off;
+ 	}
+ 	else
+ 		len = JBE_OFFLENFLD(jc->children[index]);
+ 
+ 	return len;
+ }
+ 
+ /*
   * BT comparator worker function.  Returns an integer less than, equal to, or
   * greater than zero, indicating whether a is less than, equal to, or greater
   * than b.  Consistent with the requirements for a B-Tree operator class
*************** compareJsonbContainers(JsonbContainer *a
*** 201,207 ****
  			 *
  			 * If the two values were of the same container type, then there'd
  			 * have been a chance to observe the variation in the number of
! 			 * elements/pairs (when processing WJB_BEGIN_OBJECT, say).  They're
  			 * either two heterogeneously-typed containers, or a container and
  			 * some scalar type.
  			 *
--- 254,260 ----
  			 *
  			 * If the two values were of the same container type, then there'd
  			 * have been a chance to observe the variation in the number of
! 			 * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're
  			 * either two heterogeneously-typed containers, or a container and
  			 * some scalar type.
  			 *
*************** findJsonbValueFromContainer(JsonbContain
*** 272,295 ****
  {
  	JEntry	   *children = container->children;
  	int			count = (container->header & JB_CMASK);
! 	JsonbValue *result = palloc(sizeof(JsonbValue));
  
  	Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0);
  
  	if (flags & JB_FARRAY & container->header)
  	{
  		char	   *base_addr = (char *) (children + count);
  		int			i;
  
  		for (i = 0; i < count; i++)
  		{
! 			fillJsonbValue(children, i, base_addr, result);
  
  			if (key->type == result->type)
  			{
  				if (equalsJsonbScalarValue(key, result))
  					return result;
  			}
  		}
  	}
  	else if (flags & JB_FOBJECT & container->header)
--- 325,357 ----
  {
  	JEntry	   *children = container->children;
  	int			count = (container->header & JB_CMASK);
! 	JsonbValue *result;
  
  	Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0);
  
+ 	/* Quick out without a palloc cycle if object/array is empty */
+ 	if (count <= 0)
+ 		return NULL;
+ 
+ 	result = palloc(sizeof(JsonbValue));
+ 
  	if (flags & JB_FARRAY & container->header)
  	{
  		char	   *base_addr = (char *) (children + count);
+ 		uint32		offset = 0;
  		int			i;
  
  		for (i = 0; i < count; i++)
  		{
! 			fillJsonbValue(container, i, base_addr, offset, result);
  
  			if (key->type == result->type)
  			{
  				if (equalsJsonbScalarValue(key, result))
  					return result;
  			}
+ 
+ 			JBE_ADVANCE_OFFSET(offset, children[i]);
  		}
  	}
  	else if (flags & JB_FOBJECT & container->header)
*************** findJsonbValueFromContainer(JsonbContain
*** 297,332 ****
  		/* Since this is an object, account for *Pairs* of Jentrys */
  		char	   *base_addr = (char *) (children + count * 2);
  		uint32		stopLow = 0,
! 					stopMiddle;
  
! 		/* Object key past by caller must be a string */
  		Assert(key->type == jbvString);
  
  		/* Binary search on object/pair keys *only* */
! 		while (stopLow < count)
  		{
! 			int			index;
  			int			difference;
  			JsonbValue	candidate;
  
! 			/*
! 			 * Note how we compensate for the fact that we're iterating
! 			 * through pairs (not entries) throughout.
! 			 */
! 			stopMiddle = stopLow + (count - stopLow) / 2;
! 
! 			index = stopMiddle * 2;
  
  			candidate.type = jbvString;
! 			candidate.val.string.val = base_addr + JBE_OFF(children, index);
! 			candidate.val.string.len = JBE_LEN(children, index);
  
  			difference = lengthCompareJsonbStringValue(&candidate, key);
  
  			if (difference == 0)
  			{
! 				/* Found our key, return value */
! 				fillJsonbValue(children, index + 1, base_addr, result);
  
  				return result;
  			}
--- 359,393 ----
  		/* Since this is an object, account for *Pairs* of Jentrys */
  		char	   *base_addr = (char *) (children + count * 2);
  		uint32		stopLow = 0,
! 					stopHigh = count;
  
! 		/* Object key passed by caller must be a string */
  		Assert(key->type == jbvString);
  
  		/* Binary search on object/pair keys *only* */
! 		while (stopLow < stopHigh)
  		{
! 			uint32		stopMiddle;
  			int			difference;
  			JsonbValue	candidate;
  
! 			stopMiddle = stopLow + (stopHigh - stopLow) / 2;
  
  			candidate.type = jbvString;
! 			candidate.val.string.val =
! 				base_addr + getJsonbOffset(container, stopMiddle);
! 			candidate.val.string.len = getJsonbLength(container, stopMiddle);
  
  			difference = lengthCompareJsonbStringValue(&candidate, key);
  
  			if (difference == 0)
  			{
! 				/* Found our key, return corresponding value */
! 				int			index = stopMiddle + count;
! 
! 				fillJsonbValue(container, index, base_addr,
! 							   getJsonbOffset(container, index),
! 							   result);
  
  				return result;
  			}
*************** findJsonbValueFromContainer(JsonbContain
*** 335,341 ****
  				if (difference < 0)
  					stopLow = stopMiddle + 1;
  				else
! 					count = stopMiddle;
  			}
  		}
  	}
--- 396,402 ----
  				if (difference < 0)
  					stopLow = stopMiddle + 1;
  				else
! 					stopHigh = stopMiddle;
  			}
  		}
  	}
*************** getIthJsonbValueFromContainer(JsonbConta
*** 368,374 ****
  
  	result = palloc(sizeof(JsonbValue));
  
! 	fillJsonbValue(container->children, i, base_addr, result);
  
  	return result;
  }
--- 429,437 ----
  
  	result = palloc(sizeof(JsonbValue));
  
! 	fillJsonbValue(container, i, base_addr,
! 				   getJsonbOffset(container, i),
! 				   result);
  
  	return result;
  }
*************** getIthJsonbValueFromContainer(JsonbConta
*** 377,389 ****
   * A helper function to fill in a JsonbValue to represent an element of an
   * array, or a key or value of an object.
   *
   * A nested array or object will be returned as jbvBinary, ie. it won't be
   * expanded.
   */
  static void
! fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
  {
! 	JEntry		entry = children[index];
  
  	if (JBE_ISNULL(entry))
  	{
--- 440,459 ----
   * A helper function to fill in a JsonbValue to represent an element of an
   * array, or a key or value of an object.
   *
+  * The node's JEntry is at container->children[index], and its variable-length
+  * data is at base_addr + offset.  We make the caller determine the offset
+  * since in many cases the caller can amortize that work across multiple
+  * children.  When it can't, it can just call getJsonbOffset().
+  *
   * A nested array or object will be returned as jbvBinary, ie. it won't be
   * expanded.
   */
  static void
! fillJsonbValue(JsonbContainer *container, int index,
! 			   char *base_addr, uint32 offset,
! 			   JsonbValue *result)
  {
! 	JEntry		entry = container->children[index];
  
  	if (JBE_ISNULL(entry))
  	{
*************** fillJsonbValue(JEntry *children, int ind
*** 392,405 ****
  	else if (JBE_ISSTRING(entry))
  	{
  		result->type = jbvString;
! 		result->val.string.val = base_addr + JBE_OFF(children, index);
! 		result->val.string.len = JBE_LEN(children, index);
  		Assert(result->val.string.len >= 0);
  	}
  	else if (JBE_ISNUMERIC(entry))
  	{
  		result->type = jbvNumeric;
! 		result->val.numeric = (Numeric) (base_addr + INTALIGN(JBE_OFF(children, index)));
  	}
  	else if (JBE_ISBOOL_TRUE(entry))
  	{
--- 462,475 ----
  	else if (JBE_ISSTRING(entry))
  	{
  		result->type = jbvString;
! 		result->val.string.val = base_addr + offset;
! 		result->val.string.len = getJsonbLength(container, index);
  		Assert(result->val.string.len >= 0);
  	}
  	else if (JBE_ISNUMERIC(entry))
  	{
  		result->type = jbvNumeric;
! 		result->val.numeric = (Numeric) (base_addr + INTALIGN(offset));
  	}
  	else if (JBE_ISBOOL_TRUE(entry))
  	{
*************** fillJsonbValue(JEntry *children, int ind
*** 415,422 ****
  	{
  		Assert(JBE_ISCONTAINER(entry));
  		result->type = jbvBinary;
! 		result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(JBE_OFF(children, index)));
! 		result->val.binary.len = JBE_LEN(children, index) - (INTALIGN(JBE_OFF(children, index)) - JBE_OFF(children, index));
  	}
  }
  
--- 485,494 ----
  	{
  		Assert(JBE_ISCONTAINER(entry));
  		result->type = jbvBinary;
! 		/* Remove alignment padding from data pointer and length */
! 		result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(offset));
! 		result->val.binary.len = getJsonbLength(container, index) -
! 			(INTALIGN(offset) - offset);
  	}
  }
  
*************** recurse:
*** 668,680 ****
  			 * a full conversion
  			 */
  			val->val.array.rawScalar = (*it)->isScalar;
! 			(*it)->i = 0;
  			/* Set state for next call */
  			(*it)->state = JBI_ARRAY_ELEM;
  			return WJB_BEGIN_ARRAY;
  
  		case JBI_ARRAY_ELEM:
! 			if ((*it)->i >= (*it)->nElems)
  			{
  				/*
  				 * All elements within array already processed.  Report this
--- 740,754 ----
  			 * a full conversion
  			 */
  			val->val.array.rawScalar = (*it)->isScalar;
! 			(*it)->curIndex = 0;
! 			(*it)->curDataOffset = 0;
! 			(*it)->curValueOffset = 0;	/* not actually used */
  			/* Set state for next call */
  			(*it)->state = JBI_ARRAY_ELEM;
  			return WJB_BEGIN_ARRAY;
  
  		case JBI_ARRAY_ELEM:
! 			if ((*it)->curIndex >= (*it)->nElems)
  			{
  				/*
  				 * All elements within array already processed.  Report this
*************** recurse:
*** 686,692 ****
  				return WJB_END_ARRAY;
  			}
  
! 			fillJsonbValue((*it)->children, (*it)->i++, (*it)->dataProper, val);
  
  			if (!IsAJsonbScalar(val) && !skipNested)
  			{
--- 760,772 ----
  				return WJB_END_ARRAY;
  			}
  
! 			fillJsonbValue((*it)->container, (*it)->curIndex,
! 						   (*it)->dataProper, (*it)->curDataOffset,
! 						   val);
! 
! 			JBE_ADVANCE_OFFSET((*it)->curDataOffset,
! 							   (*it)->children[(*it)->curIndex]);
! 			(*it)->curIndex++;
  
  			if (!IsAJsonbScalar(val) && !skipNested)
  			{
*************** recurse:
*** 697,704 ****
  			else
  			{
  				/*
! 				 * Scalar item in array, or a container and caller didn't
! 				 * want us to recurse into it.
  				 */
  				return WJB_ELEM;
  			}
--- 777,784 ----
  			else
  			{
  				/*
! 				 * Scalar item in array, or a container and caller didn't want
! 				 * us to recurse into it.
  				 */
  				return WJB_ELEM;
  			}
*************** recurse:
*** 712,724 ****
  			 * v->val.object.pairs is not actually set, because we aren't
  			 * doing a full conversion
  			 */
! 			(*it)->i = 0;
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  			return WJB_BEGIN_OBJECT;
  
  		case JBI_OBJECT_KEY:
! 			if ((*it)->i >= (*it)->nElems)
  			{
  				/*
  				 * All pairs within object already processed.  Report this to
--- 792,807 ----
  			 * v->val.object.pairs is not actually set, because we aren't
  			 * doing a full conversion
  			 */
! 			(*it)->curIndex = 0;
! 			(*it)->curDataOffset = 0;
! 			(*it)->curValueOffset = getJsonbOffset((*it)->container,
! 												   (*it)->nElems);
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  			return WJB_BEGIN_OBJECT;
  
  		case JBI_OBJECT_KEY:
! 			if ((*it)->curIndex >= (*it)->nElems)
  			{
  				/*
  				 * All pairs within object already processed.  Report this to
*************** recurse:
*** 732,738 ****
  			else
  			{
  				/* Return key of a key/value pair.  */
! 				fillJsonbValue((*it)->children, (*it)->i * 2, (*it)->dataProper, val);
  				if (val->type != jbvString)
  					elog(ERROR, "unexpected jsonb type as object key");
  
--- 815,823 ----
  			else
  			{
  				/* Return key of a key/value pair.  */
! 				fillJsonbValue((*it)->container, (*it)->curIndex,
! 							   (*it)->dataProper, (*it)->curDataOffset,
! 							   val);
  				if (val->type != jbvString)
  					elog(ERROR, "unexpected jsonb type as object key");
  
*************** recurse:
*** 745,752 ****
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  
! 			fillJsonbValue((*it)->children, ((*it)->i++) * 2 + 1,
! 						   (*it)->dataProper, val);
  
  			/*
  			 * Value may be a container, in which case we recurse with new,
--- 830,844 ----
  			/* Set state for next call */
  			(*it)->state = JBI_OBJECT_KEY;
  
! 			fillJsonbValue((*it)->container, (*it)->curIndex + (*it)->nElems,
! 						   (*it)->dataProper, (*it)->curValueOffset,
! 						   val);
! 
! 			JBE_ADVANCE_OFFSET((*it)->curDataOffset,
! 							   (*it)->children[(*it)->curIndex]);
! 			JBE_ADVANCE_OFFSET((*it)->curValueOffset,
! 						   (*it)->children[(*it)->curIndex + (*it)->nElems]);
! 			(*it)->curIndex++;
  
  			/*
  			 * Value may be a container, in which case we recurse with new,
*************** iteratorFromContainer(JsonbContainer *co
*** 795,805 ****
  			break;
  
  		case JB_FOBJECT:
- 
- 			/*
- 			 * Offset reflects that nElems indicates JsonbPairs in an object.
- 			 * Each key and each value contain Jentry metadata just the same.
- 			 */
  			it->dataProper =
  				(char *) it->children + it->nElems * sizeof(JEntry) * 2;
  			it->state = JBI_OBJECT_START;
--- 887,892 ----
*************** reserveFromBuffer(StringInfo buffer, int
*** 1209,1216 ****
  	buffer->len += len;
  
  	/*
! 	 * Keep a trailing null in place, even though it's not useful for us;
! 	 * it seems best to preserve the invariants of StringInfos.
  	 */
  	buffer->data[buffer->len] = '\0';
  
--- 1296,1303 ----
  	buffer->len += len;
  
  	/*
! 	 * Keep a trailing null in place, even though it's not useful for us; it
! 	 * seems best to preserve the invariants of StringInfos.
  	 */
  	buffer->data[buffer->len] = '\0';
  
*************** convertToJsonb(JsonbValue *val)
*** 1284,1291 ****
  
  	/*
  	 * Note: the JEntry of the root is discarded. Therefore the root
! 	 * JsonbContainer struct must contain enough information to tell what
! 	 * kind of value it is.
  	 */
  
  	res = (Jsonb *) buffer.data;
--- 1371,1378 ----
  
  	/*
  	 * Note: the JEntry of the root is discarded. Therefore the root
! 	 * JsonbContainer struct must contain enough information to tell what kind
! 	 * of value it is.
  	 */
  
  	res = (Jsonb *) buffer.data;
*************** convertToJsonb(JsonbValue *val)
*** 1298,1307 ****
  /*
   * Subroutine of convertJsonb: serialize a single JsonbValue into buffer.
   *
!  * The JEntry header for this node is returned in *header. It is filled in
!  * with the length of this value, but if it is stored in an array or an
!  * object (which is always, except for the root node), it is the caller's
!  * responsibility to adjust it with the offset within the container.
   *
   * If the value is an array or an object, this recurses. 'level' is only used
   * for debugging purposes.
--- 1385,1394 ----
  /*
   * Subroutine of convertJsonb: serialize a single JsonbValue into buffer.
   *
!  * The JEntry header for this node is returned in *header.  It is filled in
!  * with the length of this value and appropriate type bits.  If we wish to
!  * store an end offset rather than a length, it is the caller's responsibility
!  * to adjust for that.
   *
   * If the value is an array or an object, this recurses. 'level' is only used
   * for debugging purposes.
*************** convertJsonbValue(StringInfo buffer, JEn
*** 1315,1324 ****
  		return;
  
  	/*
! 	 * A JsonbValue passed as val should never have a type of jbvBinary,
! 	 * and neither should any of its sub-components. Those values will be
! 	 * produced by convertJsonbArray and convertJsonbObject, the results of
! 	 * which will not be passed back to this function as an argument.
  	 */
  
  	if (IsAJsonbScalar(val))
--- 1402,1411 ----
  		return;
  
  	/*
! 	 * A JsonbValue passed as val should never have a type of jbvBinary, and
! 	 * neither should any of its sub-components. Those values will be produced
! 	 * by convertJsonbArray and convertJsonbObject, the results of which will
! 	 * not be passed back to this function as an argument.
  	 */
  
  	if (IsAJsonbScalar(val))
*************** convertJsonbValue(StringInfo buffer, JEn
*** 1334,1457 ****
  static void
  convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
  {
! 	int			offset;
! 	int			metaoffset;
  	int			i;
  	int			totallen;
  	uint32		header;
  
! 	/* Initialize pointer into conversion buffer at this level */
! 	offset = buffer->len;
  
  	padBufferToInt(buffer);
  
  	/*
! 	 * Construct the header Jentry, stored in the beginning of the variable-
! 	 * length payload.
  	 */
! 	header = val->val.array.nElems | JB_FARRAY;
  	if (val->val.array.rawScalar)
  	{
! 		Assert(val->val.array.nElems == 1);
  		Assert(level == 0);
  		header |= JB_FSCALAR;
  	}
  
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
! 	/* reserve space for the JEntries of the elements. */
! 	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.array.nElems);
  
  	totallen = 0;
! 	for (i = 0; i < val->val.array.nElems; i++)
  	{
  		JsonbValue *elem = &val->val.array.elems[i];
  		int			len;
  		JEntry		meta;
  
  		convertJsonbValue(buffer, &meta, elem, level + 1);
! 		len = meta & JENTRY_POSMASK;
  		totallen += len;
  
! 		if (totallen > JENTRY_POSMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 							JENTRY_POSMASK)));
  
! 		if (i > 0)
! 			meta = (meta & ~JENTRY_POSMASK) | totallen;
! 		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
! 		metaoffset += sizeof(JEntry);
  	}
  
! 	totallen = buffer->len - offset;
  
! 	/* Initialize the header of this node, in the container's JEntry array */
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
  
  static void
  convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
  {
! 	uint32		header;
! 	int			offset;
! 	int			metaoffset;
  	int			i;
  	int			totallen;
  
! 	/* Initialize pointer into conversion buffer at this level */
! 	offset = buffer->len;
  
  	padBufferToInt(buffer);
  
! 	/* Initialize header */
! 	header = val->val.object.nPairs | JB_FOBJECT;
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
  
! 	/* reserve space for the JEntries of the keys and values */
! 	metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.object.nPairs * 2);
  
  	totallen = 0;
! 	for (i = 0; i < val->val.object.nPairs; i++)
  	{
! 		JsonbPair *pair = &val->val.object.pairs[i];
! 		int len;
! 		JEntry meta;
  
! 		/* put key */
  		convertJsonbScalar(buffer, &meta, &pair->key);
  
! 		len = meta & JENTRY_POSMASK;
  		totallen += len;
  
! 		if (totallen > JENTRY_POSMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 							JENTRY_POSMASK)));
  
! 		if (i > 0)
! 			meta = (meta & ~JENTRY_POSMASK) | totallen;
! 		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
! 		metaoffset += sizeof(JEntry);
  
! 		convertJsonbValue(buffer, &meta, &pair->value, level);
! 		len = meta & JENTRY_POSMASK;
  		totallen += len;
  
! 		if (totallen > JENTRY_POSMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 							JENTRY_POSMASK)));
  
! 		meta = (meta & ~JENTRY_POSMASK) | totallen;
! 		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
! 		metaoffset += sizeof(JEntry);
  	}
  
! 	totallen = buffer->len - offset;
  
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
  
--- 1421,1620 ----
  static void
  convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
  {
! 	int			base_offset;
! 	int			jentry_offset;
  	int			i;
  	int			totallen;
  	uint32		header;
+ 	int			nElems = val->val.array.nElems;
  
! 	/* Remember where in the buffer this array starts. */
! 	base_offset = buffer->len;
  
+ 	/* Align to 4-byte boundary (any padding counts as part of my data) */
  	padBufferToInt(buffer);
  
  	/*
! 	 * Construct the header Jentry and store it in the beginning of the
! 	 * variable-length payload.
  	 */
! 	header = nElems | JB_FARRAY;
  	if (val->val.array.rawScalar)
  	{
! 		Assert(nElems == 1);
  		Assert(level == 0);
  		header |= JB_FSCALAR;
  	}
  
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
! 
! 	/* Reserve space for the JEntries of the elements. */
! 	jentry_offset = reserveFromBuffer(buffer, sizeof(JEntry) * nElems);
  
  	totallen = 0;
! 	for (i = 0; i < nElems; i++)
  	{
  		JsonbValue *elem = &val->val.array.elems[i];
  		int			len;
  		JEntry		meta;
  
+ 		/*
+ 		 * Convert element, producing a JEntry and appending its
+ 		 * variable-length data to buffer
+ 		 */
  		convertJsonbValue(buffer, &meta, elem, level + 1);
! 
! 		len = JBE_OFFLENFLD(meta);
  		totallen += len;
  
! 		/*
! 		 * Bail out if total variable-length data exceeds what will fit in a
! 		 * JEntry length field.  We check this in each iteration, not just
! 		 * once at the end, to forestall possible integer overflow.
! 		 */
! 		if (totallen > JENTRY_OFFLENMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 							JENTRY_OFFLENMASK)));
  
! 		/*
! 		 * Convert each JB_OFFSET_STRIDE'th length to an offset.
! 		 */
! 		if ((i % JB_OFFSET_STRIDE) == 0)
! 			meta = (meta & JENTRY_TYPEMASK) | totallen | JENTRY_HAS_OFF;
! 
! 		copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
! 		jentry_offset += sizeof(JEntry);
  	}
  
! 	/* Total data size is everything we've appended to buffer */
! 	totallen = buffer->len - base_offset;
  
! 	/* Check length again, since we didn't include the metadata above */
! 	if (totallen > JENTRY_OFFLENMASK)
! 		ereport(ERROR,
! 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 				 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
! 						JENTRY_OFFLENMASK)));
! 
! 	/* Initialize the header of this node in the container's JEntry array */
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
  
  static void
  convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
  {
! 	int			base_offset;
! 	int			jentry_offset;
  	int			i;
  	int			totallen;
+ 	uint32		header;
+ 	int			nPairs = val->val.object.nPairs;
  
! 	/* Remember where in the buffer this object starts. */
! 	base_offset = buffer->len;
  
+ 	/* Align to 4-byte boundary (any padding counts as part of my data) */
  	padBufferToInt(buffer);
  
! 	/*
! 	 * Construct the header Jentry and store it in the beginning of the
! 	 * variable-length payload.
! 	 */
! 	header = nPairs | JB_FOBJECT;
  	appendToBuffer(buffer, (char *) &header, sizeof(uint32));
  
! 	/* Reserve space for the JEntries of the keys and values. */
! 	jentry_offset = reserveFromBuffer(buffer, sizeof(JEntry) * nPairs * 2);
  
+ 	/*
+ 	 * Iterate over the keys, then over the values, since that is the ordering
+ 	 * we want in the on-disk representation.
+ 	 */
  	totallen = 0;
! 	for (i = 0; i < nPairs; i++)
  	{
! 		JsonbPair  *pair = &val->val.object.pairs[i];
! 		int			len;
! 		JEntry		meta;
  
! 		/*
! 		 * Convert key, producing a JEntry and appending its variable-length
! 		 * data to buffer
! 		 */
  		convertJsonbScalar(buffer, &meta, &pair->key);
  
! 		len = JBE_OFFLENFLD(meta);
  		totallen += len;
  
! 		/*
! 		 * Bail out if total variable-length data exceeds what will fit in a
! 		 * JEntry length field.  We check this in each iteration, not just
! 		 * once at the end, to forestall possible integer overflow.
! 		 */
! 		if (totallen > JENTRY_OFFLENMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
! 							JENTRY_OFFLENMASK)));
  
! 		/*
! 		 * Convert each JB_OFFSET_STRIDE'th length to an offset.
! 		 */
! 		if ((i % JB_OFFSET_STRIDE) == 0)
! 			meta = (meta & JENTRY_TYPEMASK) | totallen | JENTRY_HAS_OFF;
  
! 		copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
! 		jentry_offset += sizeof(JEntry);
! 	}
! 	for (i = 0; i < nPairs; i++)
! 	{
! 		JsonbPair  *pair = &val->val.object.pairs[i];
! 		int			len;
! 		JEntry		meta;
! 
! 		/*
! 		 * Convert value, producing a JEntry and appending its variable-length
! 		 * data to buffer
! 		 */
! 		convertJsonbValue(buffer, &meta, &pair->value, level + 1);
! 
! 		len = JBE_OFFLENFLD(meta);
  		totallen += len;
  
! 		/*
! 		 * Bail out if total variable-length data exceeds what will fit in a
! 		 * JEntry length field.  We check this in each iteration, not just
! 		 * once at the end, to forestall possible integer overflow.
! 		 */
! 		if (totallen > JENTRY_OFFLENMASK)
  			ereport(ERROR,
  					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
! 							JENTRY_OFFLENMASK)));
  
! 		/*
! 		 * Convert each JB_OFFSET_STRIDE'th length to an offset.
! 		 */
! 		if (((i + nPairs) % JB_OFFSET_STRIDE) == 0)
! 			meta = (meta & JENTRY_TYPEMASK) | totallen | JENTRY_HAS_OFF;
! 
! 		copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
! 		jentry_offset += sizeof(JEntry);
  	}
  
! 	/* Total data size is everything we've appended to buffer */
! 	totallen = buffer->len - base_offset;
  
+ 	/* Check length again, since we didn't include the metadata above */
+ 	if (totallen > JENTRY_OFFLENMASK)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ 				 errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
+ 						JENTRY_OFFLENMASK)));
+ 
+ 	/* Initialize the header of this node in the container's JEntry array */
  	*pheader = JENTRY_ISCONTAINER | totallen;
  }
  
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 91e3e14..b89e4cb 100644
*** a/src/include/utils/jsonb.h
--- b/src/include/utils/jsonb.h
*************** typedef struct JsonbValue JsonbValue;
*** 83,91 ****
   * buffer is accessed, but they can also be deep copied and passed around.
   *
   * Jsonb is a tree structure. Each node in the tree consists of a JEntry
!  * header, and a variable-length content.  The JEntry header indicates what
!  * kind of a node it is, e.g. a string or an array, and the offset and length
!  * of its variable-length portion within the container.
   *
   * The JEntry and the content of a node are not stored physically together.
   * Instead, the container array or object has an array that holds the JEntrys
--- 83,91 ----
   * buffer is accessed, but they can also be deep copied and passed around.
   *
   * Jsonb is a tree structure. Each node in the tree consists of a JEntry
!  * header and a variable-length content (possibly of zero size).  The JEntry
!  * header indicates what kind of a node it is, e.g. a string or an array,
!  * and provides the length of its variable-length portion.
   *
   * The JEntry and the content of a node are not stored physically together.
   * Instead, the container array or object has an array that holds the JEntrys
*************** typedef struct JsonbValue JsonbValue;
*** 95,134 ****
   * hold its JEntry. Hence, no JEntry header is stored for the root node.  It
   * is implicitly known that the root node must be an array or an object,
   * so we can get away without the type indicator as long as we can distinguish
!  * the two.  For that purpose, both an array and an object begins with a uint32
   * header field, which contains an JB_FOBJECT or JB_FARRAY flag.  When a naked
   * scalar value needs to be stored as a Jsonb value, what we actually store is
   * an array with one element, with the flags in the array's header field set
   * to JB_FSCALAR | JB_FARRAY.
   *
-  * To encode the length and offset of the variable-length portion of each
-  * node in a compact way, the JEntry stores only the end offset within the
-  * variable-length portion of the container node. For the first JEntry in the
-  * container's JEntry array, that equals to the length of the node data.  The
-  * begin offset and length of the rest of the entries can be calculated using
-  * the end offset of the previous JEntry in the array.
-  *
   * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
   * the variable-length portion of some node types is aligned to a 4-byte
   * boundary, while others are not. When alignment is needed, the padding is
   * in the beginning of the node that requires it. For example, if a numeric
   * node is stored after a string node, so that the numeric node begins at
   * offset 3, the variable-length portion of the numeric node will begin with
!  * one padding byte.
   */
  
  /*
!  * Jentry format.
   *
!  * The least significant 28 bits store the end offset of the entry (see
!  * JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits
!  * are used to store the type of the entry. The most significant bit
!  * is unused, and should be set to zero.
   */
  typedef uint32 JEntry;
  
! #define JENTRY_POSMASK			0x0FFFFFFF
  #define JENTRY_TYPEMASK			0x70000000
  
  /* values stored in the type bits */
  #define JENTRY_ISSTRING			0x00000000
--- 95,146 ----
   * hold its JEntry. Hence, no JEntry header is stored for the root node.  It
   * is implicitly known that the root node must be an array or an object,
   * so we can get away without the type indicator as long as we can distinguish
!  * the two.  For that purpose, both an array and an object begin with a uint32
   * header field, which contains an JB_FOBJECT or JB_FARRAY flag.  When a naked
   * scalar value needs to be stored as a Jsonb value, what we actually store is
   * an array with one element, with the flags in the array's header field set
   * to JB_FSCALAR | JB_FARRAY.
   *
   * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
   * the variable-length portion of some node types is aligned to a 4-byte
   * boundary, while others are not. When alignment is needed, the padding is
   * in the beginning of the node that requires it. For example, if a numeric
   * node is stored after a string node, so that the numeric node begins at
   * offset 3, the variable-length portion of the numeric node will begin with
!  * one padding byte so that the actual numeric data is 4-byte aligned.
   */
  
  /*
!  * JEntry format.
   *
!  * The least significant 28 bits store either the data length of the entry,
!  * or its end+1 offset from the start of the variable-length portion of the
!  * containing object.  The next three bits store the type of the entry, and
!  * the high-order bit tells whether the least significant bits store a length
!  * or an offset.
!  *
!  * The reason for the offset-or-length complication is to compromise between
!  * access speed and data compressibility.  In the initial design each JEntry
!  * always stored an offset, but this resulted in JEntry arrays with horrible
!  * compressibility properties, so that TOAST compression of a JSONB did not
!  * work well.  Storing only lengths would greatly improve compressibility,
!  * but it makes random access into large arrays expensive (O(N) not O(1)).
!  * So what we do is store an offset in every JB_OFFSET_STRIDE'th JEntry and
!  * a length in the rest.  This results in reasonably compressible data (as
!  * long as the stride isn't too small).  We may have to examine as many as
!  * JB_OFFSET_STRIDE JEntrys in order to find out the offset or length of any
!  * given item, but that's still O(1) no matter how large the container is.
!  *
!  * We could avoid eating a flag bit for this purpose if we were to store
!  * the stride in the container header, or if we were willing to treat the
!  * stride as an unchangeable constant.  Neither of those options is very
!  * attractive though.
   */
  typedef uint32 JEntry;
  
! #define JENTRY_OFFLENMASK		0x0FFFFFFF
  #define JENTRY_TYPEMASK			0x70000000
+ #define JENTRY_HAS_OFF			0x80000000
  
  /* values stored in the type bits */
  #define JENTRY_ISSTRING			0x00000000
*************** typedef uint32 JEntry;
*** 138,144 ****
  #define JENTRY_ISNULL			0x40000000
  #define JENTRY_ISCONTAINER		0x50000000		/* array or object */
  
! /* Note possible multiple evaluations */
  #define JBE_ISSTRING(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING)
  #define JBE_ISNUMERIC(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC)
  #define JBE_ISCONTAINER(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER)
--- 150,158 ----
  #define JENTRY_ISNULL			0x40000000
  #define JENTRY_ISCONTAINER		0x50000000		/* array or object */
  
! /* Access macros.  Note possible multiple evaluations */
! #define JBE_OFFLENFLD(je_)		((je_) & JENTRY_OFFLENMASK)
! #define JBE_HAS_OFF(je_)		(((je_) & JENTRY_HAS_OFF) != 0)
  #define JBE_ISSTRING(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING)
  #define JBE_ISNUMERIC(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC)
  #define JBE_ISCONTAINER(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER)
*************** typedef uint32 JEntry;
*** 147,166 ****
  #define JBE_ISBOOL_FALSE(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE)
  #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
  
  /*
!  * Macros for getting the offset and length of an element. Note multiple
!  * evaluations and access to prior array element.
   */
! #define JBE_ENDPOS(je_)			((je_) & JENTRY_POSMASK)
! #define JBE_OFF(ja, i)			((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
! #define JBE_LEN(ja, i)			((i) == 0 ? JBE_ENDPOS((ja)[i]) \
! 								 : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
  
  /*
   * A jsonb array or object node, within a Jsonb Datum.
   *
!  * An array has one child for each element. An object has two children for
!  * each key/value pair.
   */
  typedef struct JsonbContainer
  {
--- 161,194 ----
  #define JBE_ISBOOL_FALSE(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE)
  #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
  
+ /* Macro for advancing an offset variable to the next JEntry */
+ #define JBE_ADVANCE_OFFSET(offset, je) \
+ 	do { \
+ 		JEntry	je_ = (je); \
+ 		if (JBE_HAS_OFF(je_)) \
+ 			(offset) = JBE_OFFLENFLD(je_); \
+ 		else \
+ 			(offset) += JBE_OFFLENFLD(je_); \
+ 	} while(0)
+ 
  /*
!  * We store an offset, not a length, every JB_OFFSET_STRIDE children.
!  * Caution: this macro should only be referenced when creating a JSONB
!  * value.  When examining an existing value, pay attention to the HAS_OFF
!  * bits instead.  This allows changes in the offset-placement heuristic
!  * without breaking on-disk compatibility.
   */
! #define JB_OFFSET_STRIDE		32
  
  /*
   * A jsonb array or object node, within a Jsonb Datum.
   *
!  * An array has one child for each element, stored in array order.
!  *
!  * An object has two children for each key/value pair.  The keys all appear
!  * first, in key sort order; then the values appear, in an order matching the
!  * key order.  This arrangement keeps the keys compact in memory, making a
!  * search for a particular key more cache-friendly.
   */
  typedef struct JsonbContainer
  {
*************** typedef struct JsonbContainer
*** 172,179 ****
  } JsonbContainer;
  
  /* flags for the header-field in JsonbContainer */
! #define JB_CMASK				0x0FFFFFFF
! #define JB_FSCALAR				0x10000000
  #define JB_FOBJECT				0x20000000
  #define JB_FARRAY				0x40000000
  
--- 200,207 ----
  } JsonbContainer;
  
  /* flags for the header-field in JsonbContainer */
! #define JB_CMASK				0x0FFFFFFF		/* mask for count field */
! #define JB_FSCALAR				0x10000000		/* flag bits */
  #define JB_FOBJECT				0x20000000
  #define JB_FARRAY				0x40000000
  
*************** struct JsonbValue
*** 248,265 ****
  									 (jsonbval)->type <= jbvBool)
  
  /*
!  * Pair within an Object.
   *
!  * Pairs with duplicate keys are de-duplicated.  We store the order for the
!  * benefit of doing so in a well-defined way with respect to the original
!  * observed order (which is "last observed wins").  This is only used briefly
!  * when originally constructing a Jsonb.
   */
  struct JsonbPair
  {
  	JsonbValue	key;			/* Must be a jbvString */
  	JsonbValue	value;			/* May be of any type */
! 	uint32		order;			/* preserves order of pairs with equal keys */
  };
  
  /* Conversion state used when parsing Jsonb from text, or for type coercion */
--- 276,295 ----
  									 (jsonbval)->type <= jbvBool)
  
  /*
!  * Key/value pair within an Object.
   *
!  * This struct type is only used briefly while constructing a Jsonb; it is
!  * *not* the on-disk representation.
!  *
!  * Pairs with duplicate keys are de-duplicated.  We store the originally
!  * observed pair ordering for the purpose of removing duplicates in a
!  * well-defined way (which is "last observed wins").
   */
  struct JsonbPair
  {
  	JsonbValue	key;			/* Must be a jbvString */
  	JsonbValue	value;			/* May be of any type */
! 	uint32		order;			/* Pair's index in original sequence */
  };
  
  /* Conversion state used when parsing Jsonb from text, or for type coercion */
*************** typedef struct JsonbIterator
*** 287,306 ****
  {
  	/* Container being iterated */
  	JsonbContainer *container;
! 	uint32		nElems;			/* Number of elements in children array (will be
! 								 * nPairs for objects) */
  	bool		isScalar;		/* Pseudo-array scalar value? */
! 	JEntry	   *children;
  
! 	/* Current item in buffer (up to nElems, but must * 2 for objects) */
! 	int			i;
  
  	/*
! 	 * Data proper.  This points just past end of children array.
! 	 * We use the JBE_OFF() macro on the Jentrys to find offsets of each
! 	 * child in this area.
  	 */
! 	char	   *dataProper;
  
  	/* Private state */
  	JsonbIterState state;
--- 317,341 ----
  {
  	/* Container being iterated */
  	JsonbContainer *container;
! 	uint32		nElems;			/* Number of elements in children array (will
! 								 * be nPairs for objects) */
  	bool		isScalar;		/* Pseudo-array scalar value? */
! 	JEntry	   *children;		/* JEntrys for child nodes */
! 	/* Data proper.  This points to the beginning of the variable-length data */
! 	char	   *dataProper;
  
! 	/* Current item in buffer (up to nElems) */
! 	int			curIndex;
! 
! 	/* Data offset corresponding to current item */
! 	uint32		curDataOffset;
  
  	/*
! 	 * If the container is an object, we want to return keys and values
! 	 * alternately; so curDataOffset points to the current key, and
! 	 * curValueOffset points to the current value.
  	 */
! 	uint32		curValueOffset;
  
  	/* Private state */
  	JsonbIterState state;
*************** extern Datum gin_consistent_jsonb_path(P
*** 344,349 ****
--- 379,386 ----
  extern Datum gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS);
  
  /* Support functions */
+ extern uint32 getJsonbOffset(const JsonbContainer *jc, int index);
+ extern uint32 getJsonbLength(const JsonbContainer *jc, int index);
  extern int	compareJsonbContainers(JsonbContainer *a, JsonbContainer *b);
  extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader,
  							uint32 flags,
#179Josh Berkus
josh@agliodbs.com
In reply to: Josh Berkus (#150)
Re: jsonb format is pessimal for toast compression

On 09/25/2014 08:10 PM, Tom Lane wrote:

I wrote:

The "offsets-and-lengths" patch seems like the approach we ought to
compare to my patch, but it looks pretty unfinished to me: AFAICS it
includes logic to understand offsets sprinkled into a mostly-lengths
array, but no logic that would actually *store* any such offsets,
which means it's going to act just like my patch for performance
purposes.

In the interests of pushing this forward, I will work today on
trying to finish and review Heikki's offsets-and-lengths patch
so that we have something we can do performance testing on.
I doubt that the performance testing will tell us anything we
don't expect, but we should do it anyway.

I've now done that, and attached is what I think would be a committable
version. Having done this work, I no longer think that this approach
is significantly messier code-wise than the all-lengths version, and
it does have the merit of not degrading on very large objects/arrays.
So at the moment I'm leaning to this solution not the all-lengths one.

To get a sense of the compression effects of varying the stride distance,
I repeated the compression measurements I'd done on 14 August with Pavel's
geometry data (<24077.1408052877@sss.pgh.pa.us>). The upshot of that was

                                            min     max      avg

external text representation                220  172685    880.3
JSON representation (compressed text)       224   78565    541.3
pg_column_size, JSONB HEAD repr.            225   82540    639.0
pg_column_size, all-lengths repr.           225   66794    531.1

Here's what I get with this patch and different stride distances:

JB_OFFSET_STRIDE = 8                        225   68551    559.7
JB_OFFSET_STRIDE = 16                       225   67601    552.3
JB_OFFSET_STRIDE = 32                       225   67120    547.4
JB_OFFSET_STRIDE = 64                       225   66886    546.9
JB_OFFSET_STRIDE = 128                      225   66879    546.9
JB_OFFSET_STRIDE = 256                      225   66846    546.8

So at least for that test data, 32 seems like the sweet spot.
We are giving up a couple percent of space in comparison to the
all-lengths version, but this is probably an acceptable tradeoff
for not degrading on very large arrays.

I've not done any speed testing.

I'll do some tomorrow. I should have some different DBs to test on, too.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
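
To make the JB_OFFSET_STRIDE numbers above concrete, here is a minimal
sketch (illustrative only; JENTRY_HAS_OFF and JENTRY_OFFLENMASK are assumed
names) of the write-side rule being tuned: most children record their own
length, but every stride'th child records the running end offset instead,
flagged so that readers can tell the two kinds apart.

    #define JB_OFFSET_STRIDE 32     /* the knob varied in the table above */

    /*
     * Sketch only: build the JEntry word for the i-th child.  Lengths
     * dominate the stream (which keeps it compressible), while the periodic
     * offsets cap how many lengths a reader must sum to locate any child.
     */
    static uint32
    make_jentry_sketch(int i, uint32 child_len, uint32 running_end_offset)
    {
        if (i % JB_OFFSET_STRIDE == 0)
            return (running_end_offset & JENTRY_OFFLENMASK) | JENTRY_HAS_OFF;
        return child_len & JENTRY_OFFLENMASK;
    }

With a stride of 32, roughly one JEntry word in 32 is a strictly increasing
offset, which is consistent with the compressed sizes above being only a
couple percent worse than the all-lengths layout while lookup cost no longer
degrades on very large arrays.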

#180Josh Berkus
josh@agliodbs.com
In reply to: Josh Berkus (#150)
Re: jsonb format is pessimal for toast compression

All,

These results have become a bit complex, so it's spreadsheet time.

https://docs.google.com/spreadsheets/d/1Mokpx3EqlbWlFDIkF9qzpM7NneN9z-QOXWSzws3E-R4

Some details:

The length-and-offset test was performed using a more recent 9.4
checkout than the other two tests. That was regrettable (the result of a
mistake with git), since the results tell me that there have been some
other changes in the meantime.

I added two new datasets:

errlog2 is a simple, 4-column error log in JSON format, with 2 small
values and 2 large values in each datum. It was there to check if any
of our changes affected the performance or size of such simple
structures (answer: no).

processed_b is a synthetic version of Mozilla Socorro's crash dumps,
about 900,000 of them, with nearly identical JSON on each row. These are
large JSON values (around 4KB each) with a broad mix of values and 5
levels of nesting. However, no level has very many keys; the top level
has the most, with up to 40 keys. Unlike the other data sets, I can
provide a copy of processed_b for the asking.

So, some observations:

* Data sizes with length-and-offset are slightly (3%) larger than
all-lengths for the pathological case (jsonbish) and unaffected for
other cases.

* Even large, complex JSON (processed_b) gets better compression with
the two patches than with head, although only slightly better (16%).

* This better compression for processed_b leads to slightly slower
extraction (6-7%), and, surprisingly, slower extraction for
length-and-offset than for all-lengths (about 2%).

* In the pathological case, length-and-offset was notably faster on Q1
than all-lengths (24%), and somewhat slower on Q2 (8%). I think this
shows me that I don't understand which JSON keys are "at the end".

* Notably, length-and-offset when uncompressed (EXTERNAL) was faster on
Q1 than head! This was surprising enough that I retested it.

Overall, I'm satisfied with the performance of the length-and-offset
patch.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#181Josh Berkus
josh@agliodbs.com
In reply to: Josh Berkus (#150)
Re: jsonb format is pessimal for toast compression

On 09/26/2014 06:20 PM, Josh Berkus wrote:

Overall, I'm satisfied with the performance of the length-and-offset
patch.

Oh, also ... no bugs found.

So, can we get Beta3 out now?

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#182Tom Lane
tgl@sss.pgh.pa.us
In reply to: Josh Berkus (#181)
Re: jsonb format is pessimal for toast compression

Josh Berkus <josh@agliodbs.com> writes:

So, can we get Beta3 out now?

If nobody else steps up and says they want to do some performance
testing, I'll push the latest lengths+offsets patch tomorrow.

Are any of the other open items listed at
https://wiki.postgresql.org/wiki/PostgreSQL_9.4_Open_Items
things that we must-fix-before-beta3?

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#183Tom Lane
tgl@sss.pgh.pa.us
In reply to: Bruce Momjian (#175)
Re: jsonb format is pessimal for toast compression

Bruce Momjian <bruce@momjian.us> writes:

On Thu, Sep 25, 2014 at 02:39:37PM -0400, Tom Lane wrote:

BTW, it seems like there is consensus that we ought to reorder the items
in a jsonb object to have keys first and then values, independently of the
other issues under discussion. This means we *will* be breaking on-disk
compatibility with 9.4beta2, which means pg_upgrade will need to be taught
to refuse an upgrade if the database contains any jsonb columns. Bruce,
do you have time to crank out a patch for that?

Yes, I can do that easily. Tell me when you want it --- I just need a
catalog version number to trigger on.

Done --- 201409291 is the cutover point.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#184Stephen Frost
sfrost@snowman.net
In reply to: Tom Lane (#183)
Re: jsonb format is pessimal for toast compression

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

Bruce Momjian <bruce@momjian.us> writes:

On Thu, Sep 25, 2014 at 02:39:37PM -0400, Tom Lane wrote:

BTW, it seems like there is consensus that we ought to reorder the items
in a jsonb object to have keys first and then values, independently of the
other issues under discussion. This means we *will* be breaking on-disk
compatibility with 9.4beta2, which means pg_upgrade will need to be taught
to refuse an upgrade if the database contains any jsonb columns. Bruce,
do you have time to crank out a patch for that?

Yes, I can do that easily. Tell me when you want it --- I just need a
catalog version number to trigger on.

Done --- 201409291 is the cutover point.

Just to clarify- the commit bumped the catversion to 201409292, so
version <= 201409291 has the old format while version > 201409291 has
the new format. There was no 201409291, so I suppose it doesn't matter
too much, but technically 'version >= 201409291' wouldn't be accurate.

I'm guessing this all makes sense for how pg_upgrade works, but I found
it a bit surprising that the version mentioned as the cutover point
wasn't the catversion committed.

Thanks,

Stephen

#185Tom Lane
tgl@sss.pgh.pa.us
In reply to: Stephen Frost (#184)
Re: jsonb format is pessimal for toast compression

Stephen Frost <sfrost@snowman.net> writes:

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

Done --- 201409291 is the cutover point.

Just to clarify- the commit bumped the catversion to 201409292, so
version <= 201409291 has the old format while version > 201409291 has
the new format. There was no 201409291, so I suppose it doesn't matter
too much, but technically 'version >= 201409291' wouldn't be accurate.

Nope. See my response to Andrew: ...1 is the cutover commit Bruce
should use, because that's what it is in 9.4.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#186Stephen Frost
sfrost@snowman.net
In reply to: Tom Lane (#185)
Re: jsonb format is pessimal for toast compression

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

Stephen Frost <sfrost@snowman.net> writes:

* Tom Lane (tgl@sss.pgh.pa.us) wrote:

Done --- 201409291 is the cutover point.

Just to clarify- the commit bumped the catversion to 201409292, so
version <= 201409291 has the old format while version > 201409291 has
the new format. There was no 201409291, so I suppose it doesn't matter
too much, but technically 'version >= 201409291' wouldn't be accurate.

Nope. See my response to Andrew: ...1 is the cutover commit Bruce
should use, because that's what it is in 9.4.

Yup, makes sense.

Thanks!

Stephen

#187Arthur Silva
arthurprs@gmail.com
In reply to: Josh Berkus (#181)
Re: jsonb format is pessimal for toast compression

On Mon, Sep 29, 2014 at 12:19 AM, Josh Berkus <josh@agliodbs.com> wrote:

On 09/26/2014 06:20 PM, Josh Berkus wrote:

Overall, I'm satisfied with the performance of the length-and-offset
patch.

Oh, also ... no bugs found.

So, can we get Beta3 out now?

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

What's the call on the stride length? Are we going to keep it hardcoded?

#188Tom Lane
tgl@sss.pgh.pa.us
In reply to: Arthur Silva (#187)
Re: jsonb format is pessimal for toast compression

Arthur Silva <arthurprs@gmail.com> writes:

What's the call on the stride length? Are we going to keep it hardcoded?

At the moment it's 32, but we could change it without forcing a new
initdb. I ran a simple test that seemed to show 32 was a good choice,
but if anyone else wants to try other cases, go for it.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
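
As a side note on why no initdb is needed: in this design the stride is
purely a write-time policy.  Each JEntry flags for itself whether its field
holds an offset or a length, so a reader never has to know what stride the
writer used, and values written with different strides can coexist on disk.
A minimal reader-side sketch (JENTRY_HAS_OFF assumed, as before):

    /*
     * Sketch only: readers branch on the per-entry flag, never on
     * JB_OFFSET_STRIDE, so the constant can change without breaking
     * already-stored jsonb values.
     */
    static inline bool
    jentry_has_offset_sketch(uint32 je)
    {
        return (je & JENTRY_HAS_OFF) != 0;
    }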

#189Josh Berkus
josh@agliodbs.com
In reply to: Josh Berkus (#150)
Re: jsonb format is pessimal for toast compression

On 09/29/2014 11:49 AM, Arthur Silva wrote:

What's the call on the stride length? Are we going to keep it hardcoded?

Please, yes. The complications caused by a variable stride length would
be horrible.

--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#190Bruce Momjian
bruce@momjian.us
In reply to: Tom Lane (#183)
1 attachment(s)
Re: jsonb format is pessimal for toast compression

On Mon, Sep 29, 2014 at 12:30:40PM -0400, Tom Lane wrote:

Bruce Momjian <bruce@momjian.us> writes:

On Thu, Sep 25, 2014 at 02:39:37PM -0400, Tom Lane wrote:

BTW, it seems like there is consensus that we ought to reorder the items
in a jsonb object to have keys first and then values, independently of the
other issues under discussion. This means we *will* be breaking on-disk
compatibility with 9.4beta2, which means pg_upgrade will need to be taught
to refuse an upgrade if the database contains any jsonb columns. Bruce,
do you have time to crank out a patch for that?

Yes, I can do that easily. Tell me when you want it --- I just need a
catalog version number to trigger on.

Done --- 201409291 is the cutover point.

Attached patch applied to head, and backpatched to 9.4. I think we need
to keep this in all future pg_upgrade versions in case someone from the
beta tries to jump versions, e.g. 9.4 beta1 to 9.5.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +

Attachments:

pg_upgrade.diff (text/x-diff; charset=us-ascii)
diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
new file mode 100644
index 88fe12d..bbfcab7
*** a/contrib/pg_upgrade/check.c
--- b/contrib/pg_upgrade/check.c
*************** static void check_is_install_user(Cluste
*** 24,29 ****
--- 24,30 ----
  static void check_for_prepared_transactions(ClusterInfo *cluster);
  static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster);
  static void check_for_reg_data_type_usage(ClusterInfo *cluster);
+ static void check_for_jsonb_9_4_usage(ClusterInfo *cluster);
  static void get_bin_version(ClusterInfo *cluster);
  static char *get_canonical_locale_name(int category, const char *locale);
  
*************** check_and_dump_old_cluster(bool live_che
*** 99,104 ****
--- 100,108 ----
  	check_for_prepared_transactions(&old_cluster);
  	check_for_reg_data_type_usage(&old_cluster);
  	check_for_isn_and_int8_passing_mismatch(&old_cluster);
+ 	if (GET_MAJOR_VERSION(old_cluster.major_version) == 904 &&
+ 		old_cluster.controldata.cat_ver < JSONB_FORMAT_CHANGE_CAT_VER)
+ 		check_for_jsonb_9_4_usage(&old_cluster);
  
  	/* Pre-PG 9.4 had a different 'line' data type internal format */
  	if (GET_MAJOR_VERSION(old_cluster.major_version) <= 903)
*************** check_for_reg_data_type_usage(ClusterInf
*** 911,916 ****
--- 915,1010 ----
  				 "    %s\n\n", output_path);
  	}
  	else
+ 		check_ok();
+ }
+ 
+ 
+ /*
+  * check_for_jsonb_9_4_usage()
+  *
+  *	JSONB changed its storage format during 9.4 beta, so check for it.
+  */
+ static void
+ check_for_jsonb_9_4_usage(ClusterInfo *cluster)
+ {
+ 	int			dbnum;
+ 	FILE	   *script = NULL;
+ 	bool		found = false;
+ 	char		output_path[MAXPGPATH];
+ 
+ 	prep_status("Checking for JSONB user data types");
+ 
+ 	snprintf(output_path, sizeof(output_path), "tables_using_jsonb.txt");
+ 
+ 	for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++)
+ 	{
+ 		PGresult   *res;
+ 		bool		db_used = false;
+ 		int			ntups;
+ 		int			rowno;
+ 		int			i_nspname,
+ 					i_relname,
+ 					i_attname;
+ 		DbInfo	   *active_db = &cluster->dbarr.dbs[dbnum];
+ 		PGconn	   *conn = connectToServer(cluster, active_db->db_name);
+ 
+ 		/*
+ 		 * While several relkinds don't store any data, e.g. views, they can
+ 		 * be used to define data types of other columns, so we check all
+ 		 * relkinds.
+ 		 */
+ 		res = executeQueryOrDie(conn,
+ 								"SELECT n.nspname, c.relname, a.attname "
+ 								"FROM	pg_catalog.pg_class c, "
+ 								"		pg_catalog.pg_namespace n, "
+ 								"		pg_catalog.pg_attribute a "
+ 								"WHERE	c.oid = a.attrelid AND "
+ 								"		NOT a.attisdropped AND "
+ 								"		a.atttypid = 'pg_catalog.jsonb'::pg_catalog.regtype AND "
+ 								"		c.relnamespace = n.oid AND "
+ 		/* exclude possible orphaned temp tables */
+ 								"  		n.nspname !~ '^pg_temp_' AND "
+ 							  "		n.nspname NOT IN ('pg_catalog', 'information_schema')");
+ 
+ 		ntups = PQntuples(res);
+ 		i_nspname = PQfnumber(res, "nspname");
+ 		i_relname = PQfnumber(res, "relname");
+ 		i_attname = PQfnumber(res, "attname");
+ 		for (rowno = 0; rowno < ntups; rowno++)
+ 		{
+ 			found = true;
+ 			if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
+ 				pg_fatal("Could not open file \"%s\": %s\n",
+ 						 output_path, getErrorText(errno));
+ 			if (!db_used)
+ 			{
+ 				fprintf(script, "Database: %s\n", active_db->db_name);
+ 				db_used = true;
+ 			}
+ 			fprintf(script, "  %s.%s.%s\n",
+ 					PQgetvalue(res, rowno, i_nspname),
+ 					PQgetvalue(res, rowno, i_relname),
+ 					PQgetvalue(res, rowno, i_attname));
+ 		}
+ 
+ 		PQclear(res);
+ 
+ 		PQfinish(conn);
+ 	}
+ 
+ 	if (script)
+ 		fclose(script);
+ 
+ 	if (found)
+ 	{
+ 		pg_log(PG_REPORT, "fatal\n");
+ 		pg_fatal("Your installation contains one of the JSONB data types in user tables.\n"
+ 		 "The internal format of JSONB changed during 9.4 beta so this cluster cannot currently\n"
+ 				 "be upgraded.  You can remove the problem tables and restart the upgrade.  A list\n"
+ 				 "of the problem columns is in the file:\n"
+ 				 "    %s\n\n", output_path);
+ 	}
+ 	else
  		check_ok();
  }
  
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
new file mode 100644
index 0207391..56a7505
*** a/contrib/pg_upgrade/pg_upgrade.h
--- b/contrib/pg_upgrade/pg_upgrade.h
*************** extern char *output_files[];
*** 122,127 ****
--- 122,132 ----
  #define LARGE_OBJECT_SIZE_PG_CONTROL_VER 942
  
  /*
+  * change in JSONB format during 9.4 beta
+  */
+ #define JSONB_FORMAT_CHANGE_CAT_VER 201409291
+ 
+ /*
   * Each relation is represented by a relinfo structure.
   */
  typedef struct