PATCH: two slab-like memory allocators

Started by Tomas Vondraover 9 years ago102 messages
#1Tomas Vondra
tomas.vondra@2ndquadrant.com
2 attachment(s)

Hi,

Back in the bug #14231 thread [1]/messages/by-id/20160706185502.1426.28143@wrigleys.postgresql.org, dealing with performance issues in
reorderbuffer due to excessive number of expensive free() calls, I've
proposed to resolve that by a custom slab-like memory allocator,
suitable for fixed-size allocations. I'd like to put this into the next
CF, and it's probably too invasive change to count as a bugfix anyway.

[1]: /messages/by-id/20160706185502.1426.28143@wrigleys.postgresql.org
/messages/by-id/20160706185502.1426.28143@wrigleys.postgresql.org

This patch actually includes two new memory allocators (not one). Very
brief summary (for more detailed explanation of the ideas, see comments
at the beginning of slab.c and genslab.c):

Slab
----
* suitable for fixed-length allocations (other pallocs fail)
* much simpler than AllocSet (no global freelist management etc.)
* free space is tracked per block (using a simple bitmap)
* which allows freeing the block once all chunks are freed (AllocSet
will hold the memory forever, in the hope of reusing it)

GenSlab
-------
* suitable for non-fixed-length allocations, but with chunks of mostly
the same size (initially unknown, the context will tune itself)
* a combination AllocSet and Slab (or a sequence of Slab allocators)
* the goal is to do most allocations in Slab context
* there's always a single 'current' Slab context, and every now and and
then it's replaced with a new generation (with the chunk size computed
from recent requests)
* the AllocSet context is used for chunks too large for current Slab

So none of this is meant as a universal replacement of AllocSet, but in
the suitable cases the results seem really promising. For example for
the simple test query in [1]/messages/by-id/20160706185502.1426.28143@wrigleys.postgresql.org, the performance improvement is this:

N | master | patched
-----------------------------
10000 | 100ms | 100ms
50000 | 15000ms | 350ms
100000 | 146000ms | 700ms
200000 | ? | 1400ms

That's a fairly significant improvement, and the submitted version of
the patches should perform even better (~2x, IIRC).

There's a bunch of TODOs - e.g. handling of realloc() calls in the
GenSlab, and probably things I haven't thought about.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

0001-simple-slab-allocator-fixed-size-allocations.patchbinary/octet-stream; name=0001-simple-slab-allocator-fixed-size-allocations.patchDownload
From 1ecbda98030e5742378f4d5115bccddc3b154f17 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Tue, 19 Jul 2016 00:15:54 +0200
Subject: [PATCH 1/2] simple slab allocator (fixed-size allocations)

---
 src/backend/replication/logical/reorderbuffer.c | 163 ++---
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/slab.c                   | 836 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   2 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |  13 +
 src/include/utils/memutils.h                    |  11 +
 7 files changed, 935 insertions(+), 93 deletions(-)
 create mode 100644 src/backend/utils/mmgr/slab.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 77375d9..c6c61e8 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -135,6 +135,9 @@ typedef struct ReorderBufferDiskChange
 	/* data follows */
 } ReorderBufferDiskChange;
 
+/* 10k tuples seems like a reasonable value (~80MB with MaxHeapTupleSize) */
+#define		TUPLES_PER_GENERATION		10000
+
 /*
  * Maximum number of changes kept in memory, per transaction. After that,
  * changes are spooled to disk.
@@ -156,9 +159,6 @@ static const Size max_changes_in_memory = 4096;
  * major bottleneck, especially when spilling to disk while decoding batch
  * workloads.
  */
-static const Size max_cached_changes = 4096 * 2;
-static const Size max_cached_tuplebufs = 4096 * 2;		/* ~8MB */
-static const Size max_cached_transactions = 512;
 
 
 /* ---------------------------------------
@@ -243,6 +243,28 @@ ReorderBufferAllocate(void)
 
 	buffer->context = new_ctx;
 
+	buffer->change_context = SlabContextCreate(new_ctx,
+									"Change",
+									SLAB_DEFAULT_BLOCK_SIZE,
+									sizeof(ReorderBufferChange));
+
+	buffer->txn_context = SlabContextCreate(new_ctx,
+									"TXN",
+									SLAB_DEFAULT_BLOCK_SIZE,
+									sizeof(ReorderBufferTXN));
+
+	buffer->tup_context_slab = SlabContextCreate(new_ctx,
+									"TuplesSlab",
+									SLAB_LARGE_BLOCK_SIZE,
+									sizeof(ReorderBufferTupleBuf) +
+									MAXIMUM_ALIGNOF + MaxHeapTupleSize);
+
+	buffer->tup_context_oversized = AllocSetContextCreate(new_ctx,
+									"TuplesOversized",
+									ALLOCSET_DEFAULT_MINSIZE,
+									ALLOCSET_DEFAULT_INITSIZE,
+									ALLOCSET_DEFAULT_MAXSIZE);
+
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
 	hash_ctl.hcxt = buffer->context;
@@ -262,11 +284,17 @@ ReorderBufferAllocate(void)
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
+	buffer->tuples_count = 0;
+	buffer->tuples_size = 0;
+
 	dlist_init(&buffer->toplevel_by_lsn);
 	dlist_init(&buffer->cached_transactions);
 	dlist_init(&buffer->cached_changes);
 	slist_init(&buffer->cached_tuplebufs);
 
+	buffer->current_size = sizeof(ReorderBufferTupleBuf) +
+						   MAXIMUM_ALIGNOF + MaxHeapTupleSize;
+
 	return buffer;
 }
 
@@ -293,19 +321,8 @@ ReorderBufferGetTXN(ReorderBuffer *rb)
 {
 	ReorderBufferTXN *txn;
 
-	/* check the slab cache */
-	if (rb->nr_cached_transactions > 0)
-	{
-		rb->nr_cached_transactions--;
-		txn = (ReorderBufferTXN *)
-			dlist_container(ReorderBufferTXN, node,
-							dlist_pop_head_node(&rb->cached_transactions));
-	}
-	else
-	{
-		txn = (ReorderBufferTXN *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN));
-	}
+	txn = (ReorderBufferTXN *)
+			MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
 
 	memset(txn, 0, sizeof(ReorderBufferTXN));
 
@@ -346,18 +363,7 @@ ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
 		txn->invalidations = NULL;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_transactions < max_cached_transactions)
-	{
-		rb->nr_cached_transactions++;
-		dlist_push_head(&rb->cached_transactions, &txn->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN));
-		VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node));
-	}
-	else
-	{
-		pfree(txn);
-	}
+	pfree(txn);
 }
 
 /*
@@ -368,19 +374,8 @@ ReorderBufferGetChange(ReorderBuffer *rb)
 {
 	ReorderBufferChange *change;
 
-	/* check the slab cache */
-	if (rb->nr_cached_changes)
-	{
-		rb->nr_cached_changes--;
-		change = (ReorderBufferChange *)
-			dlist_container(ReorderBufferChange, node,
-							dlist_pop_head_node(&rb->cached_changes));
-	}
-	else
-	{
-		change = (ReorderBufferChange *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange));
-	}
+	change = (ReorderBufferChange *)
+			MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
 
 	memset(change, 0, sizeof(ReorderBufferChange));
 	return change;
@@ -436,21 +431,9 @@ ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change)
 			break;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_changes < max_cached_changes)
-	{
-		rb->nr_cached_changes++;
-		dlist_push_head(&rb->cached_changes, &change->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange));
-		VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node));
-	}
-	else
-	{
-		pfree(change);
-	}
+	pfree(change);
 }
 
-
 /*
  * Get an unused, possibly preallocated, ReorderBufferTupleBuf fitting at
  * least a tuple of size tuple_len (excluding header overhead).
@@ -463,37 +446,49 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/*
-	 * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-	 * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-	 * generated for oldtuples can be bigger, as they don't have out-of-line
-	 * toast columns.
-	 */
-	if (alloc_len < MaxHeapTupleSize)
-		alloc_len = MaxHeapTupleSize;
+	/* see if we need to allocate a new context generation */
+	if (rb->tuples_count == TUPLES_PER_GENERATION)
+	{
+		Size	new_size;
+		Size	avg_length = (rb->tuples_size / rb->tuples_count);
+
+		/* mark the current SLAB context for automatic destruction */
+		SlabAutodestruct(rb->tup_context_slab);
+
+		/* assume +50% is enough slack to fit most tuples into the slab context */
+		new_size = MAXALIGN(avg_length * 1.5);
 
+		rb->current_size = new_size;
+		rb->tup_context_slab = SlabContextCreate(rb->context,
+									"TuplesSlab",
+									SLAB_LARGE_BLOCK_SIZE,
+									sizeof(ReorderBufferTupleBuf) +
+									MAXIMUM_ALIGNOF + rb->current_size);
+
+		/* we could also recreate the aset context, with block sizes set so
+		 * that the palloc always does malloc(), but not sure about that */
+
+		rb->tuples_count = 0;
+		rb->tuples_size = 0;
+	}
+
+	rb->tuples_count += 1;
+	rb->tuples_size  += alloc_len;
 
 	/* if small enough, check the slab cache */
-	if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
+	if (alloc_len <= rb->current_size)
 	{
-		rb->nr_cached_tuplebufs--;
-		tuple = slist_container(ReorderBufferTupleBuf, node,
-								slist_pop_head_node(&rb->cached_tuplebufs));
-		Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-		memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-		VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
+		tuple = (ReorderBufferTupleBuf *)
+			MemoryContextAlloc(rb->tup_context_slab,
+							   sizeof(ReorderBufferTupleBuf) +
+							   MAXIMUM_ALIGNOF + rb->current_size);
+		tuple->alloc_tuple_size = rb->current_size;
 		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-		memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
 	}
 	else
 	{
 		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->context,
+			MemoryContextAlloc(rb->tup_context_oversized,
 							   sizeof(ReorderBufferTupleBuf) +
 							   MAXIMUM_ALIGNOF + alloc_len);
 		tuple->alloc_tuple_size = alloc_len;
@@ -512,21 +507,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
-	/* check whether to put into the slab cache, oversized tuples never are */
-	if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-		rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs++;
-		slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-	}
-	else
-	{
-		pfree(tuple);
-	}
+	pfree(tuple);
 }
 
 /*
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index b2403e1..321289f 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o mcxt.o portalmem.o
+OBJS = aset.o mcxt.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
new file mode 100644
index 0000000..aa751cd
--- /dev/null
+++ b/src/backend/utils/mmgr/slab.c
@@ -0,0 +1,836 @@
+/*-------------------------------------------------------------------------
+ *
+ * slab.c
+ *	  SLAB allocator definitions.
+ *
+ * SLAB is a custom memory context MemoryContext implementation designed for
+ * cases of equally-sized objects.
+ *
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/slab.c
+ *
+ *
+ *	The constant allocation size allows significant simplification and various
+ *	optimizations that are not possible in AllocSet. Firstly, we can get rid
+ *	of the doubling and carve the blocks into chunks of exactly the right size
+ *	(plus alignment), now wasting memory.
+ *
+ *	The allocator also does not need the complex array of freelists (one for
+ *	each possible size), and as the number of chunks per block is constant,
+ *	which allows us to store the per-block freelist directly in the block as
+ *	a bitmap, and organize the blocks into lists by number of free chunks.
+ *
+ *	The in-block free bitmap allows us to quickly determine whether the whole
+ *	block is empty, and free it in that case. This is another major difference
+ *	compared to AllocSet, which never frees the allocated memory (unless the
+ *	whole context is reset, which is not very practical in most cases).
+ *
+ *	To make the freeing more likely, we use the global freelist starting from
+ *	the most full blocks, and reusing those. This works particularly well for
+ *	use cases that allocate a lot of objects, use them for a while and then
+ *	free most of them at once.
+ *
+ *
+ *	About CLOBBER_FREED_MEMORY:
+ *
+ *	If this symbol is defined, all freed memory is overwritten with 0x7F's.
+ *	This is useful for catching places that reference already-freed memory.
+ *
+ *	About MEMORY_CONTEXT_CHECKING:
+ *
+ *	Since we usually round request sizes up to the next power of 2, there
+ *	is often some unused space immediately after a requested data area.
+ *	Thus, if someone makes the common error of writing past what they've
+ *	requested, the problem is likely to go unnoticed ... until the day when
+ *	there *isn't* any wasted space, perhaps because of different memory
+ *	alignment on a new platform, or some other effect.  To catch this sort
+ *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ *	the requested space whenever the request is less than the actual chunk
+ *	size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *
+ *	About USE_VALGRIND and Valgrind client requests:
+ *
+ *	Valgrind provides "client request" macros that exchange information with
+ *	the host Valgrind (if any).  Under !USE_VALGRIND, memdebug.h stubs out
+ *	currently-used macros.
+ *
+ *	When running under Valgrind, we want a NOACCESS memory region both before
+ *	and after the allocation.  The chunk header is tempting as the preceding
+ *	region, but mcxt.c expects to able to examine the standard chunk header
+ *	fields.  Therefore, we use, when available, the requested_size field and
+ *	any subsequent padding.  requested_size is made NOACCESS before returning
+ *	a chunk pointer to a caller.  However, to reduce client request traffic,
+ *	it is kept DEFINED in chunks on the free list.
+ *
+ *	The rounded-up capacity of the chunk usually acts as a post-allocation
+ *	NOACCESS region.  If the request consumes precisely the entire chunk,
+ *	there is no such region; another chunk header may immediately follow.  In
+ *	that case, Valgrind will not detect access beyond the end of the chunk.
+ *
+ *	See also the cooperating Valgrind client requests in mcxt.c.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+
+#define SLAB_BLOCKHDRSZ	MAXALIGN(sizeof(SlabBlockData))
+#define SLAB_CHUNKHDRSZ	MAXALIGN(sizeof(SlabChunkData))
+
+/* Portion of SLAB_CHUNKHDRSZ examined outside slab.c. */
+#define SLAB_CHUNK_PUBLIC	\
+	(offsetof(SlabChunkData, size) + sizeof(Size))
+
+/* Portion of SLAB_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define SLAB_CHUNK_USED	\
+	(offsetof(SlabChunkData, requested_size) + sizeof(Size))
+#else
+#define SLAB_CHUNK_USED	\
+	(offsetof(SlabChunkData, size) + sizeof(Size))
+#endif
+
+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;
+
+/*
+ * SlabPointer
+ *		Aligned pointer which may be a member of an allocation set.
+ */
+typedef void *SlabPointer;
+
+/*
+ * SlabContext is our standard implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+	/* Allocation parameters for this context: */
+	Size		chunkSize;		/* chunk size */
+	Size		fullChunkSize;	/* chunk size including header and alignment */
+	Size		blockSize;		/* block size */
+	int			chunksPerBlock;	/* number of chunks per block */
+	int			minFreeCount;	/* min number of free chunks in any block */
+	int			nblocks;		/* number of blocks allocated */
+	bool		autodestruct;	/* destruct after freeing the last block */
+	/* Info about storage allocated in this context: */
+	SlabBlock	freelist[1];	/* free lists (block-level) */
+} SlabContext;
+
+typedef SlabContext *Slab;
+
+typedef struct SlabBlockData
+{
+	Slab		slab;			/* slab that owns this block */
+	SlabBlock	prev;			/* previous block in slab's block list */
+	SlabBlock	next;			/* next block in slab's blocks list */
+	int			nfree;			/* number of free chunks */
+	int			firstFreeChunk;	/* index of the first free chunk in the block */
+	char	   *bitmapptr;		/* pointer to free bitmap */
+}	SlabBlockData;
+
+/*
+ * SlabChunk
+ *		The prefix of each piece of memory in an SlabBlock
+ *
+ * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h.
+ */
+typedef struct SlabChunkData
+{
+	/* block owning this chunk */
+	void	   *block;
+	/* aset is the owning aset if allocated, or the freelist link if free */
+	void	   *aset;
+	/* size is always the size of the usable space in the chunk */
+	Size		size;
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* when debugging memory usage, also store actual requested size */
+	/* this is zero in a free chunk */
+	Size		requested_size;
+#endif
+}	SlabChunkData;
+
+/*
+ * SlabPointerIsValid
+ *		True iff pointer is valid allocation pointer.
+ */
+#define SlabPointerIsValid(pointer) PointerIsValid(pointer)
+
+/*
+ * SlabIsValid
+ *		True iff set is valid allocation set.
+ */
+#define SlabIsValid(set) PointerIsValid(set)
+
+#define SlabPointerGetChunk(ptr)	\
+					((SlabChunk)(((char *)(ptr)) - SLAB_CHUNKHDRSZ))
+#define SlabChunkGetPointer(chk)	\
+					((SlabPointer)(((char *)(chk)) + SLAB_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Slab contexts.
+ */
+static void *SlabAlloc(MemoryContext context, Size size);
+static void SlabFree(MemoryContext context, void *pointer);
+static void *SlabRealloc(MemoryContext context, void *pointer, Size size);
+static void SlabInit(MemoryContext context);
+static void SlabReset(MemoryContext context);
+static void SlabDelete(MemoryContext context);
+static Size SlabGetChunkSpace(MemoryContext context, void *pointer);
+static bool SlabIsEmpty(MemoryContext context);
+static void SlabStats(MemoryContext context, int level, bool print,
+			  MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void SlabCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Slab contexts.
+ */
+static MemoryContextMethods SlabMethods = {
+	SlabAlloc,
+	SlabFree,
+	SlabRealloc,
+	SlabInit,
+	SlabReset,
+	SlabDelete,
+	SlabGetChunkSpace,
+	SlabIsEmpty,
+	SlabStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,SlabCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "AllocFree: %s: %p, %d\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define SlabAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define SlabFreeInfo(_cxt, _chunk)
+#define SlabAllocInfo(_cxt, _chunk)
+#endif
+
+
+#ifdef CLOBBER_FREED_MEMORY
+
+/* Wipe freed memory for debugging purposes */
+static void
+wipe_mem(void *ptr, size_t size)
+{
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	memset(ptr, 0x7F, size);
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, size);
+}
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void
+set_sentinel(void *base, Size offset)
+{
+	char	   *ptr = (char *) base + offset;
+
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, 1);
+	*ptr = 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+}
+
+static bool
+sentinel_ok(const void *base, Size offset)
+{
+	const char *ptr = (const char *) base + offset;
+	bool		ret;
+
+	VALGRIND_MAKE_MEM_DEFINED(ptr, 1);
+	ret = *ptr == 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+
+	return ret;
+}
+#endif
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+/*
+ * Fill a just-allocated piece of memory with "random" data.  It's not really
+ * very random, just a repeating sequence with a length that's prime.  What
+ * we mainly want out of it is to have a good probability that two palloc's
+ * of the same number of bytes start out containing different data.
+ *
+ * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
+ * we fill it.  Filling the region makes it DEFINED, so make it UNDEFINED
+ * again afterward.  Whether to finally make it UNDEFINED or NOACCESS is
+ * fairly arbitrary.  UNDEFINED is more convenient for SlabRealloc(), and
+ * other callers have no preference.
+ */
+static void
+randomize_mem(char *ptr, size_t size)
+{
+	static int	save_ctr = 1;
+	size_t		remaining = size;
+	int			ctr;
+
+	ctr = save_ctr;
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	while (remaining-- > 0)
+	{
+		*ptr++ = ctr;
+		if (++ctr > 251)
+			ctr = 1;
+	}
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
+	save_ctr = ctr;
+}
+#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * SlabContextCreate
+ *		Create a new Slab context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (for debugging --- string will be copied)
+ * blockSize: allocation block size
+ * chunkSize: allocation chunk size
+ */
+MemoryContext
+SlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize)
+{
+	int		i, chunksPerBlock;
+	Size	fullChunkSize;
+	Slab	set;
+
+	/* chunk, including SLAB header (both addresses nicely aligned) */
+	fullChunkSize = MAXALIGN(sizeof(SlabChunkData) + MAXALIGN(chunkSize));
+
+	/* so how many chunks can we fit into a block, including header and bitmap */
+	chunksPerBlock = 0;
+	for (i = 0; i <= blockSize / chunkSize; i++)
+	{
+		Size bitmapsize = ((i + 7) / 8);
+
+		/* repeat until we hit the block size */
+		if (((i * fullChunkSize) + sizeof(SlabBlockData) + bitmapsize) > blockSize)
+			break;
+
+		chunksPerBlock = i;
+	}
+
+	/* if we can't fit at least one chunk into the block, we're hosed */
+	Assert(chunksPerBlock > 0);
+
+	/* Do the type-independent part of context creation */
+	set = (Slab) MemoryContextCreate(T_SlabContext,
+									 /* allocate context and freelist at once */
+									 (offsetof(SlabContext, freelist) + sizeof(SlabChunk) * (chunksPerBlock + 1)),
+									 &SlabMethods,
+									 parent,
+									 name);
+
+	set->blockSize = blockSize;
+	set->chunkSize = chunkSize;
+	set->fullChunkSize = fullChunkSize;
+	set->chunksPerBlock = chunksPerBlock;
+	set->nblocks = 0;
+	set->minFreeCount = 0;
+	set->autodestruct = false;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * SlabInit
+ *		Context-type-specific initialization routine. SlabContext does not
+ *		need anything extra, at this moment.
+ */
+static void
+SlabInit(MemoryContext context)
+{
+	/*
+	 * Since MemoryContextCreate already zeroed the context node, we don't
+	 * have to do anything here: it's already OK.
+	 */
+}
+
+/*
+ * SlabReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+SlabReset(MemoryContext context)
+{
+	int		i;
+	Slab	set = (Slab) context;
+
+	AssertArg(SlabIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	SlabCheck(context);
+#endif
+
+
+	/* walk over freelists and free the blocks */
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		SlabBlock block = set->freelist[i];
+		set->freelist[i] = NULL;
+
+		while (block != NULL)
+		{
+			SlabBlock	next = block->next;
+
+			/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+			wipe_mem(block, set->blockSize);
+#endif
+			free(block);
+			set->nblocks--;
+
+			block = next;
+		}
+	}
+
+	set->minFreeCount = 0;
+	set->autodestruct = false;
+
+	Assert(set->nblocks == 0);
+}
+
+/*
+ * SlabDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call SlabReset().
+ */
+static void
+SlabDelete(MemoryContext context)
+{
+	/* just reset the context */
+	SlabReset(context);
+}
+
+/* operations on the freelist - adding/removing/moving blocks */
+static void
+remove_from_freelist(Slab set, SlabBlock block, int nfree_old)
+{
+	/* either it has a previous block, or it's the first block in list */
+	if (block->prev)
+		block->prev->next = block->next;
+	else
+		set->freelist[nfree_old] = block->next;
+
+	/* if it has a next block, update it too */
+	if (block->next)
+		block->next->prev = block->prev;
+
+	block->prev = NULL;
+	block->next = NULL;
+}
+
+static void
+add_to_freelist(Slab set, SlabBlock block)
+{
+	/* otherwise add it to the proper freelist bin */
+	if (set->freelist[block->nfree])
+		set->freelist[block->nfree]->prev = block;
+
+	block->next = set->freelist[block->nfree];
+	set->freelist[block->nfree] = block;
+}
+
+static void
+move_in_freelist(Slab set, SlabBlock block, int nfree_old)
+{
+	remove_from_freelist(set, block, nfree_old);
+	add_to_freelist(set, block);
+}
+
+
+/*
+ * SlabAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - SLAB_BLOCKHDRSZ - SLAB_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+SlabAlloc(MemoryContext context, Size size)
+{
+	Slab	set = (Slab) context;
+	SlabBlock	block;
+	SlabChunk	chunk;
+	int			idx;
+
+	AssertArg(SlabIsValid(set));
+
+	Assert(size == set->chunkSize);
+	Assert((set->minFreeCount >= 0) && (set->minFreeCount < set->chunksPerBlock));
+
+	/*
+	 * If there are no free chunks in any existing block, create a new block
+	 * and put it to the last freelist bucket.
+	 */
+	if (set->minFreeCount == 0)
+	{
+		block = (SlabBlock)malloc(set->blockSize);
+
+		if (block == NULL)
+			return NULL;
+
+		memset(block, 0, set->blockSize);
+
+		block->slab = set;
+		block->nfree = set->chunksPerBlock;
+		block->prev = NULL;
+		block->next = NULL;
+		block->firstFreeChunk = 0;
+
+		/* the free bitmap is placed at the end */
+		block->bitmapptr
+			= ((char *) block) + set->blockSize - ((set->chunksPerBlock + 7) / 8);
+
+		/*
+		 * And add it to the last freelist with all chunks empty (we know
+		 * there are no blocks in the freelist, otherwise we wouldn't need
+		 * a new block.
+		 */
+		set->freelist[set->chunksPerBlock] = block;
+		set->minFreeCount = set->chunksPerBlock;
+		set->nblocks += 1;
+	}
+
+	/* grab the block from the freelist (even the new block is there) */
+	block = set->freelist[set->minFreeCount];
+
+	/* make sure we actually got a valid block, with matching nfree */
+	Assert(block != NULL);
+	Assert(set->minFreeCount == block->nfree);
+	Assert(block->nfree > 0);
+
+	Assert((char*)block < block->bitmapptr);
+	Assert((char*)block + set->blockSize > block->bitmapptr);
+
+	/* we know the first free chunk */
+	idx = block->firstFreeChunk;
+
+	/* make sure the chunk index is valid, and that it's marked as empty */
+	Assert((idx >= 0) && (idx < set->chunksPerBlock));
+	Assert(!((block->bitmapptr[idx/8] & (0x01 << (idx % 8)))));
+
+	/* mark the chunk as used (set 1 to the bit) */
+	block->bitmapptr[idx/8] |= (0x01 << (idx % 8));
+
+	/* compute the chunk location block start (after the block header) */
+	chunk = (SlabChunk) ((char*)block + sizeof(SlabBlockData)
+									  + (idx * set->fullChunkSize));
+
+	/*
+	 * update the block nfree count, and also the minFreeCount as we've
+	 * decreased nfree for a block with the minimum count
+	 */
+	block->nfree--;
+	set->minFreeCount = block->nfree;
+
+	/* but we need to find the next one, for the next alloc call (unless the
+	 * block just got full, in that case simply set it to -1 */
+	if (block->nfree == 0)
+		block->firstFreeChunk = set->chunksPerBlock;
+	else
+	{
+		/* look for the next free chunk in the block, after the first one */
+		while ((++block->firstFreeChunk) < set->chunksPerBlock)
+		{
+			int byte = block->firstFreeChunk / 8;
+			int bit  = block->firstFreeChunk % 8;
+
+			/* stop when you find 0 (unused chunk) */
+			if (! (block->bitmapptr[byte] & (0x01 << bit)))
+				break;
+		}
+
+		/* must have found the free chunk */
+		Assert(block->firstFreeChunk != set->chunksPerBlock);
+	}
+
+	/* move the block to the right place in the freelist */
+	move_in_freelist(set, block, (block->nfree + 1));
+
+	/* but if the minimum is 0, we need to look for a new one */
+	if (set->minFreeCount == 0)
+		for (idx = 1; idx <= set->chunksPerBlock; idx++)
+			if (set->freelist[idx])
+			{
+				set->minFreeCount = idx;
+				break;
+			}
+
+	if (set->minFreeCount == set->chunksPerBlock)
+		set->minFreeCount = 0;
+
+	/* Prepare to initialize the chunk header. */
+	VALGRIND_MAKE_MEM_UNDEFINED(chunk, SLAB_CHUNK_USED);
+
+	chunk->aset = (void *) set;
+	chunk->block = (void *) block;
+	chunk->size = MAXALIGN(size);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	chunk->requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+							   sizeof(chunk->requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->size)
+		set_sentinel(SlabChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) SlabChunkGetPointer(chunk), size);
+#endif
+
+	SlabAllocInfo(set, chunk);
+	return SlabChunkGetPointer(chunk);
+}
+
+/*
+ * SlabFree
+ *		Frees allocated memory; memory is removed from the set.
+ */
+static void
+SlabFree(MemoryContext context, void *pointer)
+{
+	int		idx;
+	Slab	set = (Slab) context;
+	SlabChunk	chunk = SlabPointerGetChunk(pointer);
+	SlabBlock	block = chunk->block;
+
+	SlabFreeInfo(set, chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+							  sizeof(chunk->requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->requested_size < chunk->size)
+		if (!sentinel_ok(pointer, chunk->requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+	/* compute index wrt to block start */
+	idx = ((char*)chunk - ((char*)block + sizeof(SlabBlockData))) / set->fullChunkSize;
+
+	Assert((block->bitmapptr[idx/8] & (0x01 << (idx % 8))));
+
+	/* mark the chunk as unused (set 0 to the bit), and update block nfree count */
+	block->bitmapptr[idx/8] ^= (0x01 << (idx % 8));
+	block->nfree++;
+	block->firstFreeChunk = Min(block->firstFreeChunk, idx);
+
+	Assert(block->nfree > 0);
+	Assert(block->nfree <= set->chunksPerBlock);
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->requested_size = 0;
+#endif
+
+	/* now decide what to do with the block */
+
+	/*
+	 * See if we need to update the minFreeCount field for the set - we only
+	 * need to do that if the block had that number of free chunks before we
+	 * freed one. In that case, we check if there still are blocks with that
+	 * number of free chunks - we can simply check if the chunk has siblings.
+	 * Otherwise we simply increment the value by one, as the new block is
+	 * still the one with minimum free chunks (even without the one chunk).
+	 */
+	if (set->minFreeCount == (block->nfree-1))
+		if ((block->prev == NULL) && (block->next == NULL)) /* no other blocks */
+		{
+			/* but if we made the block entirely free, we'll free it */
+			if (block->nfree == set->chunksPerBlock)
+				set->minFreeCount = 0;
+			else
+				set->minFreeCount++;
+		}
+
+	/* remove the block from a freelist */
+	remove_from_freelist(set, block, block->nfree-1);
+
+	/* If the block is now completely empty, free it. */
+	if (block->nfree == set->chunksPerBlock)
+	{
+		free(block);
+		set->nblocks--;
+	}
+	else
+		add_to_freelist(set, block);
+
+	Assert(set->nblocks >= 0);
+
+	/*
+	 * If we've just released the last block in the context, destruct it.
+	 *
+	 * XXX But don't do that if the context has children.
+	 */
+	if (set->autodestruct && (set->nblocks == 0) && (context->firstchild == NULL))
+		MemoryContextDelete(context);
+}
+
+/*
+ * SlabRealloc
+ *		As Slab is designed for allocating equally-sized chunks of memory, it
+ *		can't really do an actual realloc. However we try to be gentle and
+ *		allow calls with exactly the same size as in that case we can simply
+ *		return the same chunk. When the size differs, we fail with assert
+ *		failure or return NULL.
+ */
+static void *
+SlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Slab	set = (Slab)context;
+
+	/* can't do actual realloc with slab, but let's try to be gentle */
+	if (size == set->chunkSize)
+		return pointer;
+
+	/* we can't really do repalloc with this allocator */
+	Assert(false);
+
+	return NULL;
+}
+
+/*
+ * SlabGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+SlabGetChunkSpace(MemoryContext context, void *pointer)
+{
+	SlabChunk	chunk = SlabPointerGetChunk(pointer);
+
+	return chunk->size + SLAB_CHUNKHDRSZ;
+}
+
+/*
+ * SlabIsEmpty
+ *		Is an Slab empty of any allocated space?
+ */
+static bool
+SlabIsEmpty(MemoryContext context)
+{
+	Slab		set = (Slab)context;
+	return (set->nblocks == 0);
+}
+
+/*
+ * SlabStats
+ *		Compute stats about memory consumption of an Slab.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Slab into *totals.
+ */
+static void
+SlabStats(MemoryContext context, int level, bool print,
+			  MemoryContextCounters *totals)
+{
+	Slab		set = (Slab) context;
+	Size		nblocks = 0;
+	Size		freechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+	int			i;
+
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		SlabBlock block = set->freelist[i];
+		while (block != NULL)
+		{
+			nblocks++;
+			totalspace += set->blockSize;
+			freespace += set->fullChunkSize * block->nfree;
+			freechunks += block->nfree;
+			block = block->next;
+		}
+	}
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"%s: %zu total in %zd blocks; %zu free (%zd chunks); %zu used; autodestruct %d\n",
+				set->header.name, totalspace, nblocks, freespace, freechunks,
+				totalspace - freespace, set->autodestruct);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += freechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+void
+SlabAutodestruct(MemoryContext context)
+{
+	Slab	set = (Slab)context;
+
+	set->autodestruct = true;
+}
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * SlabCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+SlabCheck(MemoryContext context)
+{
+	/* FIXME */
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index ba069cc..92a7478 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,6 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext)))
+	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 6b850e4..62005bb 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -274,6 +274,7 @@ typedef enum NodeTag
 	 */
 	T_MemoryContext = 600,
 	T_AllocSetContext,
+	T_SlabContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 9e209ae..e8a8d77 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -331,6 +331,19 @@ struct ReorderBuffer
 	MemoryContext context;
 
 	/*
+	 * slab contexts for change and TXN objects.
+	 */
+	MemoryContext change_context;
+	MemoryContext txn_context;
+	MemoryContext tup_context_slab;
+	MemoryContext tup_context_oversized;
+
+	/* counters for current generation of tuples */
+	int		tuples_count;
+	Size	tuples_size;
+	Size	current_size;
+
+	/*
 	 * Data structure slab cache.
 	 *
 	 * We allocate/deallocate some structures very frequently, to avoid bigger
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index ae07705..cc63f27 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -135,6 +135,14 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
 					  Size initBlockSize,
 					  Size maxBlockSize);
 
+/* slab.c */
+extern MemoryContext SlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize);
+
+extern void SlabAutodestruct(MemoryContext context);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
@@ -159,4 +167,7 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
  */
 #define ALLOCSET_SEPARATE_THRESHOLD  8192
 
+#define SLAB_DEFAULT_BLOCK_SIZE		8192
+#define SLAB_LARGE_BLOCK_SIZE		(8 * 1024 * 1024)
+
 #endif   /* MEMUTILS_H */
-- 
2.5.5

0002-generational-slab-auto-tuning-allocator.patchbinary/octet-stream; name=0002-generational-slab-auto-tuning-allocator.patchDownload
From 91cbdc65420388f671487a3f45d86ac7146dce4d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Wed, 20 Jul 2016 23:46:36 +0200
Subject: [PATCH 2/2] generational slab (auto-tuning allocator)

---
 src/backend/replication/logical/reorderbuffer.c |  71 +----
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/genslab.c                | 347 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   4 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |   8 +-
 src/include/utils/memutils.h                    |   7 +
 7 files changed, 369 insertions(+), 71 deletions(-)
 create mode 100644 src/backend/utils/mmgr/genslab.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index c6c61e8..cc2ec44 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -253,17 +253,12 @@ ReorderBufferAllocate(void)
 									SLAB_DEFAULT_BLOCK_SIZE,
 									sizeof(ReorderBufferTXN));
 
-	buffer->tup_context_slab = SlabContextCreate(new_ctx,
+	buffer->tup_context = GenSlabContextCreate(new_ctx,
 									"TuplesSlab",
 									SLAB_LARGE_BLOCK_SIZE,
 									sizeof(ReorderBufferTupleBuf) +
-									MAXIMUM_ALIGNOF + MaxHeapTupleSize);
-
-	buffer->tup_context_oversized = AllocSetContextCreate(new_ctx,
-									"TuplesOversized",
-									ALLOCSET_DEFAULT_MINSIZE,
-									ALLOCSET_DEFAULT_INITSIZE,
-									ALLOCSET_DEFAULT_MAXSIZE);
+									MAXIMUM_ALIGNOF + MaxHeapTupleSize,
+									TUPLES_PER_GENERATION);
 
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
@@ -284,17 +279,11 @@ ReorderBufferAllocate(void)
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
-	buffer->tuples_count = 0;
-	buffer->tuples_size = 0;
-
 	dlist_init(&buffer->toplevel_by_lsn);
 	dlist_init(&buffer->cached_transactions);
 	dlist_init(&buffer->cached_changes);
 	slist_init(&buffer->cached_tuplebufs);
 
-	buffer->current_size = sizeof(ReorderBufferTupleBuf) +
-						   MAXIMUM_ALIGNOF + MaxHeapTupleSize;
-
 	return buffer;
 }
 
@@ -446,54 +435,12 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/* see if we need to allocate a new context generation */
-	if (rb->tuples_count == TUPLES_PER_GENERATION)
-	{
-		Size	new_size;
-		Size	avg_length = (rb->tuples_size / rb->tuples_count);
-
-		/* mark the current SLAB context for automatic destruction */
-		SlabAutodestruct(rb->tup_context_slab);
-
-		/* assume +50% is enough slack to fit most tuples into the slab context */
-		new_size = MAXALIGN(avg_length * 1.5);
-
-		rb->current_size = new_size;
-		rb->tup_context_slab = SlabContextCreate(rb->context,
-									"TuplesSlab",
-									SLAB_LARGE_BLOCK_SIZE,
-									sizeof(ReorderBufferTupleBuf) +
-									MAXIMUM_ALIGNOF + rb->current_size);
-
-		/* we could also recreate the aset context, with block sizes set so
-		 * that the palloc always does malloc(), but not sure about that */
-
-		rb->tuples_count = 0;
-		rb->tuples_size = 0;
-	}
-
-	rb->tuples_count += 1;
-	rb->tuples_size  += alloc_len;
-
-	/* if small enough, check the slab cache */
-	if (alloc_len <= rb->current_size)
-	{
-		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->tup_context_slab,
-							   sizeof(ReorderBufferTupleBuf) +
-							   MAXIMUM_ALIGNOF + rb->current_size);
-		tuple->alloc_tuple_size = rb->current_size;
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-	}
-	else
-	{
-		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->tup_context_oversized,
-							   sizeof(ReorderBufferTupleBuf) +
-							   MAXIMUM_ALIGNOF + alloc_len);
-		tuple->alloc_tuple_size = alloc_len;
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-	}
+	tuple = (ReorderBufferTupleBuf *)
+		MemoryContextAlloc(rb->tup_context,
+						   sizeof(ReorderBufferTupleBuf) +
+						   MAXIMUM_ALIGNOF + alloc_len);
+	tuple->alloc_tuple_size = alloc_len;
+	tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
 
 	return tuple;
 }
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index 321289f..08b5e3a 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o mcxt.o portalmem.o slab.o
+OBJS = aset.o genslab.o mcxt.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/genslab.c b/src/backend/utils/mmgr/genslab.c
new file mode 100644
index 0000000..1f300aa
--- /dev/null
+++ b/src/backend/utils/mmgr/genslab.c
@@ -0,0 +1,347 @@
+/*-------------------------------------------------------------------------
+ *
+ * genslab.c
+ *	  Generational SLAB allocator definitions.
+ *
+ * An extension of the SLAB allocator relaxing the fixed-size limitation by
+ * using a generational design.
+ *
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/genslab.c
+ *
+ *
+ *	The simple SLAB allocator only allows allocating chunks with exactly the
+ *	same size. That only works for some special cases, e.g. when the context
+ *	is only used for instances of a single structure with fixed size.
+ * 
+ *	This implementation tries to relax this restriction by treating the chunk
+ *	size as an upper boundary, and using a regular AllocSet context to serve
+ *	requests for larger pieces of memory.
+ *
+ *	Furthermore, instead of using a single SLAB context (fixing the maximum
+ *	chunk size) it's possible to automatically tune the chunk size based on
+ *	past allocations. This is done by replacing the single SLAB context with
+ *	a sequence of contexts (with only the last one used for allocations).
+ *
+ *	This works particularly well when we can't predict the size of the
+ *	objects easily, but we know that the size is unlikely to vary too much.
+ *	It also works quite nicely when the memory is freed in about the same
+ *	sequence as it was allocated, because the old SLAB contexts will get
+ *	empty and freed automatically (one of the benefits of SLAB contexts).
+ *
+ *	A good example is ReorderBuffer - the tuples tend to be of about the
+ *	same size, and freed in roughly the same sequence as allocated.
+ *
+ *	In a sense, this delegates the allocation to actual implementations,
+ *	which also handle CLOBBER_FREED_MEMORY and MEMORY_CONTEXT_CHECKING.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+
+/*
+ * GenSlabContext is a self-tuning version of SlabContext.
+ */
+typedef struct GenSlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+
+	MemoryContext	slab;
+	MemoryContext	aset;
+
+	/* SLAB parameters */
+	Size		blockSize;		/* block size */
+	Size		chunkSize;		/* chunk size */
+
+	/* counters used for tuning chunk size */
+
+	Size		nbytes;			/* bytes allocated (as requested) */
+	int			nallocations;	/* number of allocations */
+	int			maxallocations;	/* self-tune after number of allocations */
+
+} GenSlabContext;
+
+typedef GenSlabContext *GenSlab;
+
+/*
+ * These functions implement the MemoryContext API for GenSlab contexts.
+ */
+static void *GenSlabAlloc(MemoryContext context, Size size);
+static void GenSlabFree(MemoryContext context, void *pointer);
+static void *GenSlabRealloc(MemoryContext context, void *pointer, Size size);
+static void GenSlabInit(MemoryContext context);
+static void GenSlabReset(MemoryContext context);
+static void GenSlabDelete(MemoryContext context);
+static Size GenSlabGetChunkSpace(MemoryContext context, void *pointer);
+static bool GenSlabIsEmpty(MemoryContext context);
+static void GenSlabStats(MemoryContext context, int level, bool print,
+			  MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void GenSlabCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Slab contexts.
+ */
+static MemoryContextMethods GenSlabMethods = {
+	GenSlabAlloc,
+	GenSlabFree,
+	GenSlabRealloc,
+	GenSlabInit,
+	GenSlabReset,
+	GenSlabDelete,
+	GenSlabGetChunkSpace,
+	GenSlabIsEmpty,
+	GenSlabStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,GenSlabCheck
+#endif
+};
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenSlabContextCreate
+ *		Create a new GenSlab context.
+ */
+MemoryContext
+GenSlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize,
+					  int maxAllocations)
+{
+	GenSlab	set;
+
+	/* Do the type-independent part of context creation */
+	set = (GenSlab) MemoryContextCreate(T_GenSlabContext,
+										sizeof(GenSlabContext),
+										&GenSlabMethods,
+										parent,
+										name);
+
+	/* the default context */
+	set->slab = SlabContextCreate((MemoryContext)set,
+								  "slab",
+								  blockSize,
+								  chunkSize);
+
+	/*
+	 * TODO Maybe we could set the parameters so that all requests exceeding
+	 * the SLAB chunk size (and thus falling through to the AllocSet) also
+	 * exceed allocChunkLimit and thus get allocated using malloc(). That's
+	 * more expensive, but vast majority of requests should be handled by
+	 * the SLAB context anyway. And chunks over allocChunkLimit are freed
+	 * immediately, which is also nice.
+	 */
+	set->aset = AllocSetContextCreate((MemoryContext)set,
+									 "oversized",
+									 ALLOCSET_DEFAULT_MINSIZE,
+									 ALLOCSET_DEFAULT_INITSIZE,
+									 ALLOCSET_DEFAULT_MAXSIZE);
+
+	set->blockSize = blockSize;
+	set->nbytes = 0;
+	set->nallocations = 0;
+	set->maxallocations = maxAllocations;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * GenSlabInit
+ *		Context-type-specific initialization routine. Simply delegate the
+ *		child contexts.
+ */
+static void
+GenSlabInit(MemoryContext context)
+{
+	GenSlab set = (GenSlab)context;
+
+	set->nallocations = 0;
+	set->nbytes = 0;
+}
+
+/*
+ * GenSlabReset
+ *		Frees all memory which is allocated in the given set. We also get
+ *		rid of all the old SLAB generations and only keep the current one.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+GenSlabReset(MemoryContext context)
+{
+	GenSlab	set = (GenSlab) context;
+
+	set->nallocations = 0;
+	set->nbytes = 0;
+}
+
+/*
+ * GenSlabDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We don't really need to do anything special
+ *		as MemoryContextDelete deletes child contexts automatically.
+ */
+static void
+GenSlabDelete(MemoryContext context)
+{
+	/* just reset (although not really necessary) */
+	GenSlabReset(context);
+}
+
+/*
+ * GenSlabAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - SLAB_BLOCKHDRSZ - SLAB_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+GenSlabAlloc(MemoryContext context, Size size)
+{
+	GenSlab	set = (GenSlab) context;
+
+	/* do we need to auto-tune the SLAB chunk size */
+	if (set->nallocations > set->maxallocations)
+	{
+		/*
+		 * TODO we could also assume the requests follow normal distribution,
+		 * computing stddev and then computing a chosen percentile (e.g. 0.95).
+		 * For now we simply use 1.5x the average, as it's simple.
+		 */
+
+		/* compute the new chunk size */
+		Size chunkSize = (1.5 * set->nbytes) / set->nallocations;
+
+		/* mark for autodestruction */
+		SlabAutodestruct(set->slab);
+
+		set->slab = SlabContextCreate((MemoryContext)set,
+									  "slab",
+									  set->blockSize,
+									  chunkSize);
+
+		set->chunkSize = chunkSize;
+		set->nallocations = 0;
+		set->nbytes = 0;
+	}
+
+	if (size <= set->chunkSize)
+		return MemoryContextAlloc(set->slab, set->chunkSize);
+	else
+		return MemoryContextAlloc(set->aset, size);
+}
+
+/*
+ * GenSlabFree
+ *		As the memory is actually allocated in other contexts, we should
+ *		never really get here.
+ *
+ * FIXME Although someone could call MemoryContextFree directly.
+ */
+static void
+GenSlabFree(MemoryContext context, void *pointer)
+{
+	return pfree(pointer);
+}
+
+/*
+ * GenSlabRealloc
+ *		As the memory is actually allocated in other contexts, we should
+ *		never really get here.
+ *
+ * FIXME Although someone could call MemoryContextRealloc directly.
+ */
+static void *
+GenSlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+	return repalloc(pointer, size);
+}
+
+/*
+ * GenSlabGetChunkSpace
+ *		As the memory is actually allocated in other contexts, we should
+ *		never really get here.
+ *
+ * FIXME Although someone could call MemoryContextGetChunkSpace directly.
+ */
+static Size
+GenSlabGetChunkSpace(MemoryContext context, void *pointer)
+{
+	return GetMemoryChunkSpace(pointer);
+}
+
+/*
+ * GenSlabIsEmpty
+ *		Is an GenSlab empty of any allocated space?
+ *
+ * TODO This does not really work, as MemoryContextIsEmpty returns false if
+ * 		there are any children, and GenSlab always has at least two.
+ */
+static bool
+GenSlabIsEmpty(MemoryContext context)
+{
+	/* */
+	return true;
+}
+
+/*
+ * GenSlabStats
+ *		Compute stats about memory consumption of an GenSlab.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Slab into *totals.
+ */
+static void
+GenSlabStats(MemoryContext context, int level, bool print,
+			  MemoryContextCounters *totals)
+{
+	GenSlab		set = (GenSlab) context;
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr, "%s\n", set->header.name);
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenSlabCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+GenSlabCheck(MemoryContext context)
+{
+	
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index 92a7478..aae2349 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,8 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
+	 (IsA((context), AllocSetContext) || \
+	  IsA((context), SlabContext) || \
+	  IsA((context), GenSlabContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 62005bb..9ba5a6a 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -275,6 +275,7 @@ typedef enum NodeTag
 	T_MemoryContext = 600,
 	T_AllocSetContext,
 	T_SlabContext,
+	T_GenSlabContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index e8a8d77..2dfab26 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -335,13 +335,7 @@ struct ReorderBuffer
 	 */
 	MemoryContext change_context;
 	MemoryContext txn_context;
-	MemoryContext tup_context_slab;
-	MemoryContext tup_context_oversized;
-
-	/* counters for current generation of tuples */
-	int		tuples_count;
-	Size	tuples_size;
-	Size	current_size;
+	MemoryContext tup_context;
 
 	/*
 	 * Data structure slab cache.
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index cc63f27..f0b6372 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -143,6 +143,13 @@ extern MemoryContext SlabContextCreate(MemoryContext parent,
 
 extern void SlabAutodestruct(MemoryContext context);
 
+/* genslab.c */
+extern MemoryContext GenSlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize,
+					  int maxAllocations);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
-- 
2.5.5

#2Petr Jelinek
petr@2ndquadrant.com
In reply to: Tomas Vondra (#1)
Re: PATCH: two slab-like memory allocators

Hi Tomas,

On 02/08/16 17:44, Tomas Vondra wrote:

This patch actually includes two new memory allocators (not one). Very
brief summary (for more detailed explanation of the ideas, see comments
at the beginning of slab.c and genslab.c):

Slab
----
* suitable for fixed-length allocations (other pallocs fail)
* much simpler than AllocSet (no global freelist management etc.)
* free space is tracked per block (using a simple bitmap)
* which allows freeing the block once all chunks are freed (AllocSet
will hold the memory forever, in the hope of reusing it)

GenSlab
-------
* suitable for non-fixed-length allocations, but with chunks of mostly
the same size (initially unknown, the context will tune itself)
* a combination AllocSet and Slab (or a sequence of Slab allocators)
* the goal is to do most allocations in Slab context
* there's always a single 'current' Slab context, and every now and and
then it's replaced with a new generation (with the chunk size computed
from recent requests)
* the AllocSet context is used for chunks too large for current Slab

So it's just wrapper around the other two allocators to make this
usecase easier to handle. Do you expect there will be use for the
GenSlab eventually outside of the reoderbuffer?

So none of this is meant as a universal replacement of AllocSet, but in
the suitable cases the results seem really promising. For example for
the simple test query in [1], the performance improvement is this:

N | master | patched
-----------------------------
10000 | 100ms | 100ms
50000 | 15000ms | 350ms
100000 | 146000ms | 700ms
200000 | ? | 1400ms

That's a fairly significant improvement, and the submitted version of
the patches should perform even better (~2x, IIRC).

I agree that it improves performance quite nicely and that reoderbuffer
could use this.

About the code. I am not quite sure that this needs to be split into two
patches especially if 1/3 of the second patch is the removal of the code
added by the first one and otherwise it's quite small and
straightforward. That is unless you expect the GenSlab to not go in.

Slab:
In general it seems understandable, the initial description helps to
understand what's happening well enough.

One thing I don't understand however is why the freelist is both array
and doubly linked list and why there is new implementation of said
doubly linked list given that we have dlist.

+/*
+ * SlabContext is our standard implementation of MemoryContext.
+ *

Really?

+/*
+ * SlabChunk
+ *		The prefix of each piece of memory in an SlabBlock
+ *
+ * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h.
+ */

Is this true? Why? And if it is then why doesn't the SlabChunk actually
match the StandardChunkHeader?

+#define SlabPointerIsValid(pointer) PointerIsValid(pointer)

What's the point of this given that it's defined in the .c file?

+static void *
+SlabAlloc(MemoryContext context, Size size)
+{
+	Slab	set = (Slab) context;
+	SlabBlock	block;
+	SlabChunk	chunk;
+	int			idx;
+
+	AssertArg(SlabIsValid(set));
+
+	Assert(size == set->chunkSize);

I wonder if there should be stronger protection (ie, elog) for the size
matching.

+static void *
+SlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Slab	set = (Slab)context;
+
+	/* can't do actual realloc with slab, but let's try to be gentle */
+	if (size == set->chunkSize)
+		return pointer;
+
+	/* we can't really do repalloc with this allocator */
+	Assert(false);

This IMHO should definitely be elog.

+static void
+SlabCheck(MemoryContext context)
+{
+	/* FIXME */
+}

Do you plan to implement this interface?

+#define SLAB_DEFAULT_BLOCK_SIZE		8192
+#define SLAB_LARGE_BLOCK_SIZE		(8 * 1024 * 1024)

I am guessing this is based on max_cached_tuplebufs? Maybe these could
be written with same style?

GenSlab:

Since this is relatively simple wrapper it looks mostly ok to me. The
only issue I have here is that I am not quite sure about those FIXME
functions (Free, Realloc, GetChunkSpace). It's slightly weird to call to
mcxt but I guess the alternative there is to error out so this is
probably preferable. Would want to hear other opinions here.

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#3Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Petr Jelinek (#2)
Re: PATCH: two slab-like memory allocators

On 09/25/2016 08:48 PM, Petr Jelinek wrote:

Hi Tomas,

On 02/08/16 17:44, Tomas Vondra wrote:

This patch actually includes two new memory allocators (not one). Very
brief summary (for more detailed explanation of the ideas, see comments
at the beginning of slab.c and genslab.c):

Slab
----
* suitable for fixed-length allocations (other pallocs fail)
* much simpler than AllocSet (no global freelist management etc.)
* free space is tracked per block (using a simple bitmap)
* which allows freeing the block once all chunks are freed (AllocSet
will hold the memory forever, in the hope of reusing it)

GenSlab
-------
* suitable for non-fixed-length allocations, but with chunks of mostly
the same size (initially unknown, the context will tune itself)
* a combination AllocSet and Slab (or a sequence of Slab allocators)
* the goal is to do most allocations in Slab context
* there's always a single 'current' Slab context, and every now and and
then it's replaced with a new generation (with the chunk size computed
from recent requests)
* the AllocSet context is used for chunks too large for current Slab

So it's just wrapper around the other two allocators to make this
usecase easier to handle. Do you expect there will be use for the
GenSlab eventually outside of the reoderbuffer?

Yes, you might say it's just a wrapper around the other two allocators,
but it *also* includes the logic of recomputing chunk size etc.

I haven't thought about other places that might benefit from these new
allocators very much - in general, it's useful for places that produce a
stream of equally-sized items (GenSlab relaxes this), that are pfreed()
in ~FIFO manner (i.e. roughly in the order of allocation).

So none of this is meant as a universal replacement of AllocSet,
but in the suitable cases the results seem really promising. For
example for the simple test query in [1], the performance
improvement is this:

N | master | patched
-----------------------------
10000 | 100ms | 100ms
50000 | 15000ms | 350ms
100000 | 146000ms | 700ms
200000 | ? | 1400ms

That's a fairly significant improvement, and the submitted version
of the patches should perform even better (~2x, IIRC).

I agree that it improves performance quite nicely and that
reoderbuffer could use this.

About the code. I am not quite sure that this needs to be split into
two patches especially if 1/3 of the second patch is the removal of
the code added by the first one and otherwise it's quite small and
straightforward. That is unless you expect the GenSlab to not go in.

I don't know - it seemed natural to first introduce the Slab, as it's
easier to discuss it separately, and it works for 2 of the 3 contexts
needed in reorderbuffer.

GenSlab is an improvement of Slab, or rather based on it, so that it
works for the third context. And it introduces some additional ideas
(particularly the generational design, etc.)

Of course, none of this means it has to be committed in two separate
chunks, or that I don't expect GenSlab to get committed ...

Slab:
In general it seems understandable, the initial description helps to
understand what's happening well enough.

One thing I don't understand however is why the freelist is both
array and doubly linked list and why there is new implementation of
said doubly linked list given that we have dlist.

Hmm, perhaps that should be explained better.

In AllocSet, we only have a global freelist of chunks, i.e. we have a
list of free chunks for each possible size (there's 11 sizes, starting
with 8 bytes and then doubling the size). So freelist[0] is a list of
free 8B chunks, freelist[1] is a list of free 16B chunks, etc.

In Slab, the freelist has two levels - first there's a bitmap on each
block (which is possible, as the chunks have constant size), tracking
which chunks of that particular block are free. This makes it trivial to
check that all chunks on the block are free, and free the whole block
(which is impossible with AllocSet).

Second, the freelist at the context level tracks blocks with a given
number of free chunks - so freelist[0] tracks completely full blocks,
freelist[1] is a list of blocks with 1 free chunk, etc. This is used to
reuse space on almost full blocks first, in the hope that some of the
less full blocks will get completely empty (and freed to the OS).

Is that clear?

+/*
+ * SlabContext is our standard implementation of MemoryContext.
+ *

Really?

Meh, that's clearly a bogus comment.

+/*
+ * SlabChunk
+ *		The prefix of each piece of memory in an SlabBlock
+ *
+ * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h.
+ */

Is this true? Why? And if it is then why doesn't the SlabChunk
actually match the StandardChunkHeader?

It is true, a lot of stuff in MemoryContext infrastructure relies on
that. For example when you do pfree(ptr), we actually do something like

header = (StandardChunkHeader*)(ptr - sizeof(StandardChunkHeader))

to get the chunk header - which includes pointer to the memory context
and other useful stuff.

This also means we can put additional fields before StandardChunkHeader
as that does not break this pointer arithmetic, i.e. SlabChunkData is
effectively defined like this:

typedef struct SlabChunkData
{
/* block owning this chunk */
void *block;

/* standard header */
StandardChunkHeader header;
} SlabChunkData;

+#define SlabPointerIsValid(pointer) PointerIsValid(pointer)

What's the point of this given that it's defined in the .c file?

Meh, I've copied this from aset.c, but I see it's useless there too.

+static void *
+SlabAlloc(MemoryContext context, Size size)
+{
+	Slab	set = (Slab) context;
+	SlabBlock	block;
+	SlabChunk	chunk;
+	int			idx;
+
+	AssertArg(SlabIsValid(set));
+
+	Assert(size == set->chunkSize);

I wonder if there should be stronger protection (ie, elog) for the size
matching.

Perhaps, I'm not opposed to that.

+static void *
+SlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Slab	set = (Slab)context;
+
+	/* can't do actual realloc with slab, but let's try to be gentle */
+	if (size == set->chunkSize)
+		return pointer;
+
+	/* we can't really do repalloc with this allocator */
+	Assert(false);

This IMHO should definitely be elog.

Yeah, you're probably right.

+static void
+SlabCheck(MemoryContext context)
+{
+	/* FIXME */
+}

Do you plan to implement this interface?

Yes, although I'm not sure what checks should go there. The only thing I
can think of right now is checking that the number of free chunks on a
block (according to the bitmap) matches the freelist index.

+#define SLAB_DEFAULT_BLOCK_SIZE		8192
+#define SLAB_LARGE_BLOCK_SIZE		(8 * 1024 * 1024)

I am guessing this is based on max_cached_tuplebufs? Maybe these
could be written with same style?

Not sure I understand what you mean by "based on"? I don't quite
remember how I came up with those constants, but I guess 8kB and 8MB
seemed like good values.

Also, what style you mean? I've used the same style as for ALLOCSET_*
constants in the same file.

GenSlab:

Since this is relatively simple wrapper it looks mostly ok to me. The
only issue I have here is that I am not quite sure about those FIXME
functions (Free, Realloc, GetChunkSpace). It's slightly weird to call to
mcxt but I guess the alternative there is to error out so this is
probably preferable. Would want to hear other opinions here.

Yeah, I'd like to get some opinions on that too - that's why I left
there the FIXMEs, actually.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#4Petr Jelinek
petr@2ndquadrant.com
In reply to: Tomas Vondra (#3)
Re: PATCH: two slab-like memory allocators

On 25/09/16 22:17, Tomas Vondra wrote:

On 09/25/2016 08:48 PM, Petr Jelinek wrote:

Slab:
In general it seems understandable, the initial description helps to
understand what's happening well enough.

One thing I don't understand however is why the freelist is both
array and doubly linked list and why there is new implementation of
said doubly linked list given that we have dlist.

Hmm, perhaps that should be explained better.

In AllocSet, we only have a global freelist of chunks, i.e. we have a
list of free chunks for each possible size (there's 11 sizes, starting
with 8 bytes and then doubling the size). So freelist[0] is a list of
free 8B chunks, freelist[1] is a list of free 16B chunks, etc.

In Slab, the freelist has two levels - first there's a bitmap on each
block (which is possible, as the chunks have constant size), tracking
which chunks of that particular block are free. This makes it trivial to
check that all chunks on the block are free, and free the whole block
(which is impossible with AllocSet).

Second, the freelist at the context level tracks blocks with a given
number of free chunks - so freelist[0] tracks completely full blocks,
freelist[1] is a list of blocks with 1 free chunk, etc. This is used to
reuse space on almost full blocks first, in the hope that some of the
less full blocks will get completely empty (and freed to the OS).

Is that clear?

Ah okay, makes sense, the documentation of this could be improved then
though as it's all squashed into single sentence that wasn't quite clear
for me.

+/*
+ * SlabChunk
+ *        The prefix of each piece of memory in an SlabBlock
+ *
+ * NB: this MUST match StandardChunkHeader as defined by
utils/memutils.h.
+ */

Is this true? Why? And if it is then why doesn't the SlabChunk
actually match the StandardChunkHeader?

It is true, a lot of stuff in MemoryContext infrastructure relies on
that. For example when you do pfree(ptr), we actually do something like

header = (StandardChunkHeader*)(ptr - sizeof(StandardChunkHeader))

to get the chunk header - which includes pointer to the memory context
and other useful stuff.

This also means we can put additional fields before StandardChunkHeader
as that does not break this pointer arithmetic, i.e. SlabChunkData is
effectively defined like this:

typedef struct SlabChunkData
{
/* block owning this chunk */
void *block;

/* standard header */
StandardChunkHeader header;
} SlabChunkData;

Yes but your struct then does not match StandardChunkHeader exactly so
it should be explained in more detail (the aset.c where this comment is
also present has struct that matches StandardChunkHeader so it's
sufficient there).

+static void
+SlabCheck(MemoryContext context)
+{
+    /* FIXME */
+}

Do you plan to implement this interface?

Yes, although I'm not sure what checks should go there. The only thing I
can think of right now is checking that the number of free chunks on a
block (according to the bitmap) matches the freelist index.

Yeah this context does not seem like it needs too much checking. The
freelist vs free chunks check sounds ok to me. I guess GenSlab then will
call the checks for the underlying contexts.

+#define SLAB_DEFAULT_BLOCK_SIZE        8192
+#define SLAB_LARGE_BLOCK_SIZE        (8 * 1024 * 1024)

I am guessing this is based on max_cached_tuplebufs? Maybe these
could be written with same style?

Not sure I understand what you mean by "based on"? I don't quite
remember how I came up with those constants, but I guess 8kB and 8MB
seemed like good values.

Also, what style you mean? I've used the same style as for ALLOCSET_*
constants in the same file.

I mean using 8 * 1024 for SLAB_DEFAULT_BLOCK_SIZE so that it's more
readable. ALLOCSET_* does that too (with exception of
ALLOCSET_SEPARATE_THRESHOLD which I have no idea why it's different from
rest of the code).

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#5Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Petr Jelinek (#4)
2 attachment(s)
Re: PATCH: two slab-like memory allocators

Hi,

Attached is v2 of the patch, updated based on the review. That means:

- Better comment explaining how free chunks are tracked in Slab context.

- Removed the unused SlabPointerIsValid macro.

- Modified the comment before SlabChunkData, explaining how it relates
to StandardChunkHeader.

- Replaced the two Assert() calls with elog().

- Implemented SlabCheck(). I've ended up with quite a few checks there,
checking pointers between the context, block and chunks, checks due
to MEMORY_CONTEXT_CHECKING etc. And of course, cross-checking the
number of free chunks (bitmap, freelist vs. chunk header).

- I've also modified SlabContextCreate() to compute chunksPerBlock a
bit more efficiently (use a simple formula instead of the loop, which
might be a bit too expensive for large blocks / small chunks).

I haven't done any changes to GenSlab, but I do have a few notes:

Firstly, I've realized there's an issue when chunkSize gets too large -
once it exceeds blockSize, the SlabContextCreate() fails as it's
impossible to place a single chunk into the block. In reorderbuffer,
this may happen when the tuples (allocated in tup_context) get larger
than 8MB, as the context uses SLAB_LARGE_BLOCK_SIZE (which is 8MB).

For Slab the elog(ERROR) is fine as both parameters are controlled by
the developer directly, but GenSlab computes the chunkSize on the fly,
so we must not let it fail like that - that'd result in unpredictable
failures, which is not very nice.

I see two ways to fix this. We may either increase the block size
automatically - e.g. instead of specifying specifying chunkSize and
blockSize when creating the Slab, specify chunkSize and chunksPerBlock
(and then choose the smallest 2^k block large enough). For example with
chunkSize=96 and chunksPerBlock=1000, we'd get 128kB blocks, as that's
the closest 2^k block larger than 96000 bytes.

But maybe there's a simpler solution - we may simply cap the chunkSize
(in GenSlab) to ALLOC_CHUNK_LIMIT. That's fine, because AllocSet handles
those requests in a special way - for example instead of tracking them
in freelist, those chunks got freed immediately.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

0001-simple-slab-allocator-fixed-size-allocations.patchbinary/octet-stream; name=0001-simple-slab-allocator-fixed-size-allocations.patchDownload
From d97ea94fc77b50d3b95159497331ef7dc5a93fae Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Tue, 19 Jul 2016 00:15:54 +0200
Subject: [PATCH 1/2] simple slab allocator (fixed-size allocations)

improvements based on review by Petr Jelinek

- Better comment explaining how free chunks are tracked, etc.
- Removed unused SlabPointerIsValid macro.
- Added comment to SlabBlockData, explaining the fields.
- Modified SlabChunkData comment, explaining relation to StandardChunkHeader.
- Modified SlabContextCreate() to compute chunksPerBlock a bit more efficiently.
- Implemented SlabCheck().
- Replace two Assert() calls with elog().
---
 src/backend/replication/logical/reorderbuffer.c | 163 ++--
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/slab.c                   | 940 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   2 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |  13 +
 src/include/utils/memutils.h                    |  11 +
 7 files changed, 1039 insertions(+), 93 deletions(-)
 create mode 100644 src/backend/utils/mmgr/slab.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 9b430b9..00e2b7b 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -135,6 +135,9 @@ typedef struct ReorderBufferDiskChange
 	/* data follows */
 } ReorderBufferDiskChange;
 
+/* 10k tuples seems like a reasonable value (~80MB with MaxHeapTupleSize) */
+#define		TUPLES_PER_GENERATION		10000
+
 /*
  * Maximum number of changes kept in memory, per transaction. After that,
  * changes are spooled to disk.
@@ -156,9 +159,6 @@ static const Size max_changes_in_memory = 4096;
  * major bottleneck, especially when spilling to disk while decoding batch
  * workloads.
  */
-static const Size max_cached_changes = 4096 * 2;
-static const Size max_cached_tuplebufs = 4096 * 2;		/* ~8MB */
-static const Size max_cached_transactions = 512;
 
 
 /* ---------------------------------------
@@ -241,6 +241,28 @@ ReorderBufferAllocate(void)
 
 	buffer->context = new_ctx;
 
+	buffer->change_context = SlabContextCreate(new_ctx,
+									"Change",
+									SLAB_DEFAULT_BLOCK_SIZE,
+									sizeof(ReorderBufferChange));
+
+	buffer->txn_context = SlabContextCreate(new_ctx,
+									"TXN",
+									SLAB_DEFAULT_BLOCK_SIZE,
+									sizeof(ReorderBufferTXN));
+
+	buffer->tup_context_slab = SlabContextCreate(new_ctx,
+									"TuplesSlab",
+									SLAB_LARGE_BLOCK_SIZE,
+									sizeof(ReorderBufferTupleBuf) +
+									MAXIMUM_ALIGNOF + MaxHeapTupleSize);
+
+	buffer->tup_context_oversized = AllocSetContextCreate(new_ctx,
+									"TuplesOversized",
+									ALLOCSET_DEFAULT_MINSIZE,
+									ALLOCSET_DEFAULT_INITSIZE,
+									ALLOCSET_DEFAULT_MAXSIZE);
+
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
 	hash_ctl.hcxt = buffer->context;
@@ -260,11 +282,17 @@ ReorderBufferAllocate(void)
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
+	buffer->tuples_count = 0;
+	buffer->tuples_size = 0;
+
 	dlist_init(&buffer->toplevel_by_lsn);
 	dlist_init(&buffer->cached_transactions);
 	dlist_init(&buffer->cached_changes);
 	slist_init(&buffer->cached_tuplebufs);
 
+	buffer->current_size = sizeof(ReorderBufferTupleBuf) +
+						   MAXIMUM_ALIGNOF + MaxHeapTupleSize;
+
 	return buffer;
 }
 
@@ -291,19 +319,8 @@ ReorderBufferGetTXN(ReorderBuffer *rb)
 {
 	ReorderBufferTXN *txn;
 
-	/* check the slab cache */
-	if (rb->nr_cached_transactions > 0)
-	{
-		rb->nr_cached_transactions--;
-		txn = (ReorderBufferTXN *)
-			dlist_container(ReorderBufferTXN, node,
-							dlist_pop_head_node(&rb->cached_transactions));
-	}
-	else
-	{
-		txn = (ReorderBufferTXN *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN));
-	}
+	txn = (ReorderBufferTXN *)
+			MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
 
 	memset(txn, 0, sizeof(ReorderBufferTXN));
 
@@ -344,18 +361,7 @@ ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
 		txn->invalidations = NULL;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_transactions < max_cached_transactions)
-	{
-		rb->nr_cached_transactions++;
-		dlist_push_head(&rb->cached_transactions, &txn->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN));
-		VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node));
-	}
-	else
-	{
-		pfree(txn);
-	}
+	pfree(txn);
 }
 
 /*
@@ -366,19 +372,8 @@ ReorderBufferGetChange(ReorderBuffer *rb)
 {
 	ReorderBufferChange *change;
 
-	/* check the slab cache */
-	if (rb->nr_cached_changes)
-	{
-		rb->nr_cached_changes--;
-		change = (ReorderBufferChange *)
-			dlist_container(ReorderBufferChange, node,
-							dlist_pop_head_node(&rb->cached_changes));
-	}
-	else
-	{
-		change = (ReorderBufferChange *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange));
-	}
+	change = (ReorderBufferChange *)
+			MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
 
 	memset(change, 0, sizeof(ReorderBufferChange));
 	return change;
@@ -434,21 +429,9 @@ ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change)
 			break;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_changes < max_cached_changes)
-	{
-		rb->nr_cached_changes++;
-		dlist_push_head(&rb->cached_changes, &change->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange));
-		VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node));
-	}
-	else
-	{
-		pfree(change);
-	}
+	pfree(change);
 }
 
-
 /*
  * Get an unused, possibly preallocated, ReorderBufferTupleBuf fitting at
  * least a tuple of size tuple_len (excluding header overhead).
@@ -461,37 +444,49 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/*
-	 * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-	 * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-	 * generated for oldtuples can be bigger, as they don't have out-of-line
-	 * toast columns.
-	 */
-	if (alloc_len < MaxHeapTupleSize)
-		alloc_len = MaxHeapTupleSize;
+	/* see if we need to allocate a new context generation */
+	if (rb->tuples_count == TUPLES_PER_GENERATION)
+	{
+		Size	new_size;
+		Size	avg_length = (rb->tuples_size / rb->tuples_count);
+
+		/* mark the current SLAB context for automatic destruction */
+		SlabAutodestruct(rb->tup_context_slab);
+
+		/* assume +50% is enough slack to fit most tuples into the slab context */
+		new_size = MAXALIGN(avg_length * 1.5);
+
+		rb->current_size = new_size;
+		rb->tup_context_slab = SlabContextCreate(rb->context,
+									"TuplesSlab",
+									SLAB_LARGE_BLOCK_SIZE,
+									sizeof(ReorderBufferTupleBuf) +
+									MAXIMUM_ALIGNOF + rb->current_size);
+
+		/* we could also recreate the aset context, with block sizes set so
+		 * that the palloc always does malloc(), but not sure about that */
+
+		rb->tuples_count = 0;
+		rb->tuples_size = 0;
+	}
 
+	rb->tuples_count += 1;
+	rb->tuples_size  += alloc_len;
 
 	/* if small enough, check the slab cache */
-	if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
+	if (alloc_len <= rb->current_size)
 	{
-		rb->nr_cached_tuplebufs--;
-		tuple = slist_container(ReorderBufferTupleBuf, node,
-								slist_pop_head_node(&rb->cached_tuplebufs));
-		Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-		memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-		VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
+		tuple = (ReorderBufferTupleBuf *)
+			MemoryContextAlloc(rb->tup_context_slab,
+							   sizeof(ReorderBufferTupleBuf) +
+							   MAXIMUM_ALIGNOF + rb->current_size);
+		tuple->alloc_tuple_size = rb->current_size;
 		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-		memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
 	}
 	else
 	{
 		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->context,
+			MemoryContextAlloc(rb->tup_context_oversized,
 							   sizeof(ReorderBufferTupleBuf) +
 							   MAXIMUM_ALIGNOF + alloc_len);
 		tuple->alloc_tuple_size = alloc_len;
@@ -510,21 +505,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
-	/* check whether to put into the slab cache, oversized tuples never are */
-	if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-		rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs++;
-		slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-	}
-	else
-	{
-		pfree(tuple);
-	}
+	pfree(tuple);
 }
 
 /*
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index b2403e1..321289f 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o mcxt.o portalmem.o
+OBJS = aset.o mcxt.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
new file mode 100644
index 0000000..bde0ea1
--- /dev/null
+++ b/src/backend/utils/mmgr/slab.c
@@ -0,0 +1,940 @@
+/*-------------------------------------------------------------------------
+ *
+ * slab.c
+ *	  SLAB allocator definitions.
+ *
+ * SLAB is a custom memory context MemoryContext implementation designed for
+ * cases of equally-sized objects.
+ *
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/slab.c
+ *
+ *
+ *	The constant allocation size allows significant simplification and various
+ *	optimizations that are not possible in AllocSet. Firstly, we can get rid
+ *	of the doubling and carve the blocks into chunks of exactly the right size
+ *	(plus alignment), now wasting memory.
+ *
+ *	The information about free chunks is maintained both at the block level and
+ *	global (context) level. This is possible as the chunk size (and thus also
+ *	the number of chunks per block) is fixed.
+ *
+ *	Each block includes a simple bitmap tracking which chunks are used/free.
+ *	This makes it trivial to check if all chunks on the block are free, and
+ *	eventually free the whole block (which is almost impossible with AllocSet,
+ *	as it stores free chunks from all blocks in a single global freelist).
+ *
+ *	At the context level, we use 'freelist' array to track blocks grouped by
+ *	number of free chunks. For example freelist[0] is a list of completely full
+ *	blocks, freelist[1] is a block with a single free chunk, etc.
+ *
+ *	This also allows various optimizations - for example when searching for
+ *	free chunk, we the allocator reuses space from the most full blocks first,
+ *	in the hope that some of the less full blocks will get completely empty
+ *	(and returned back to the OS).
+ *
+ *	For each block, we maintain pointer to the first free chunk - this is quite
+ *	cheap and allows us to skip all the preceding used chunks, eliminating
+ *	a significant number of lookups in many common usage patters. In the worst
+ *	case this performs as if the pointer was not maintained.
+ *
+ *
+ *	About CLOBBER_FREED_MEMORY:
+ *
+ *	If this symbol is defined, all freed memory is overwritten with 0x7F's.
+ *	This is useful for catching places that reference already-freed memory.
+ *
+ *	About MEMORY_CONTEXT_CHECKING:
+ *
+ *	Since we usually round request sizes up to the next power of 2, there
+ *	is often some unused space immediately after a requested data area.
+ *	Thus, if someone makes the common error of writing past what they've
+ *	requested, the problem is likely to go unnoticed ... until the day when
+ *	there *isn't* any wasted space, perhaps because of different memory
+ *	alignment on a new platform, or some other effect.  To catch this sort
+ *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ *	the requested space whenever the request is less than the actual chunk
+ *	size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *
+ *	About USE_VALGRIND and Valgrind client requests:
+ *
+ *	Valgrind provides "client request" macros that exchange information with
+ *	the host Valgrind (if any).  Under !USE_VALGRIND, memdebug.h stubs out
+ *	currently-used macros.
+ *
+ *	When running under Valgrind, we want a NOACCESS memory region both before
+ *	and after the allocation.  The chunk header is tempting as the preceding
+ *	region, but mcxt.c expects to able to examine the standard chunk header
+ *	fields.  Therefore, we use, when available, the requested_size field and
+ *	any subsequent padding.  requested_size is made NOACCESS before returning
+ *	a chunk pointer to a caller.  However, to reduce client request traffic,
+ *	it is kept DEFINED in chunks on the free list.
+ *
+ *	The rounded-up capacity of the chunk usually acts as a post-allocation
+ *	NOACCESS region.  If the request consumes precisely the entire chunk,
+ *	there is no such region; another chunk header may immediately follow.  In
+ *	that case, Valgrind will not detect access beyond the end of the chunk.
+ *
+ *	See also the cooperating Valgrind client requests in mcxt.c.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+
+#define SLAB_BLOCKHDRSZ	MAXALIGN(sizeof(SlabBlockData))
+#define SLAB_CHUNKHDRSZ	MAXALIGN(sizeof(SlabChunkData))
+
+/* Portion of SLAB_CHUNKHDRSZ examined outside slab.c. */
+#define SLAB_CHUNK_PUBLIC	\
+	(offsetof(SlabChunkData, size) + sizeof(Size))
+
+/* Portion of SLAB_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define SLAB_CHUNK_USED	\
+	(offsetof(SlabChunkData, requested_size) + sizeof(Size))
+#else
+#define SLAB_CHUNK_USED	\
+	(offsetof(SlabChunkData, size) + sizeof(Size))
+#endif
+
+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;
+
+/*
+ * SlabPointer
+ *		Aligned pointer which may be a member of an allocation set.
+ */
+typedef void *SlabPointer;
+
+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+	/* Allocation parameters for this context: */
+	Size		chunkSize;		/* chunk size */
+	Size		fullChunkSize;	/* chunk size including header and alignment */
+	Size		blockSize;		/* block size */
+	int			chunksPerBlock;	/* number of chunks per block */
+	int			minFreeCount;	/* min number of free chunks in any block */
+	int			nblocks;		/* number of blocks allocated */
+	bool		autodestruct;	/* destruct after freeing the last block */
+	/* Info about storage allocated in this context: */
+	SlabBlock	freelist[1];	/* free lists (block-level) */
+} SlabContext;
+
+typedef SlabContext *Slab;
+
+/*
+ * SlabBlockData
+ *		Structure of a single block in SLAB allocator.
+ *
+ * slab: context owning this block
+ * prev, next: used for doubly-linked list of blocks in global freelist
+ * nfree: number of free chunks in this block
+ * firstFreeChunk: pointer to the first free chunk
+ * bitmapptr: pointer to the free bitmap (tracking free chunks)
+ */
+typedef struct SlabBlockData
+{
+	Slab		slab;			/* slab that owns this block */
+	SlabBlock	prev;			/* previous block in slab's block list */
+	SlabBlock	next;			/* next block in slab's blocks list */
+	int			nfree;			/* number of free chunks */
+	int			firstFreeChunk;	/* index of the first free chunk in the block */
+	char	   *bitmapptr;		/* pointer to free bitmap */
+}	SlabBlockData;
+
+/*
+ * SlabChunk
+ *		The prefix of each piece of memory in an SlabBlock
+ *
+ * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h.
+ * However it's possible to fields in front of the StandardChunkHeader fields,
+ * which is used to add pointer to the block.
+ */
+typedef struct SlabChunkData
+{
+	/* block owning this chunk */
+	void	   *block;
+	/* slab is the owning slab context */
+	void	   *slab;
+	/* size is always the size of the usable space in the chunk */
+	Size		size;
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* when debugging memory usage, also store actual requested size */
+	/* this is zero in a free chunk */
+	Size		requested_size;
+#endif
+}	SlabChunkData;
+
+
+/*
+ * SlabIsValid
+ *		True iff set is valid allocation set.
+ */
+#define SlabIsValid(set) PointerIsValid(set)
+
+#define SlabPointerGetChunk(ptr)	\
+					((SlabChunk)(((char *)(ptr)) - SLAB_CHUNKHDRSZ))
+#define SlabChunkGetPointer(chk)	\
+					((SlabPointer)(((char *)(chk)) + SLAB_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Slab contexts.
+ */
+static void *SlabAlloc(MemoryContext context, Size size);
+static void SlabFree(MemoryContext context, void *pointer);
+static void *SlabRealloc(MemoryContext context, void *pointer, Size size);
+static void SlabInit(MemoryContext context);
+static void SlabReset(MemoryContext context);
+static void SlabDelete(MemoryContext context);
+static Size SlabGetChunkSpace(MemoryContext context, void *pointer);
+static bool SlabIsEmpty(MemoryContext context);
+static void SlabStats(MemoryContext context, int level, bool print,
+			  MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void SlabCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Slab contexts.
+ */
+static MemoryContextMethods SlabMethods = {
+	SlabAlloc,
+	SlabFree,
+	SlabRealloc,
+	SlabInit,
+	SlabReset,
+	SlabDelete,
+	SlabGetChunkSpace,
+	SlabIsEmpty,
+	SlabStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,SlabCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabFree: %s: %p, %d\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define SlabAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabAlloc: %s: %p, %d\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define SlabFreeInfo(_cxt, _chunk)
+#define SlabAllocInfo(_cxt, _chunk)
+#endif
+
+
+#ifdef CLOBBER_FREED_MEMORY
+
+/* Wipe freed memory for debugging purposes */
+static void
+wipe_mem(void *ptr, size_t size)
+{
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	memset(ptr, 0x7F, size);
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, size);
+}
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void
+set_sentinel(void *base, Size offset)
+{
+	char	   *ptr = (char *) base + offset;
+
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, 1);
+	*ptr = 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+}
+
+static bool
+sentinel_ok(const void *base, Size offset)
+{
+	const char *ptr = (const char *) base + offset;
+	bool		ret;
+
+	VALGRIND_MAKE_MEM_DEFINED(ptr, 1);
+	ret = *ptr == 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+
+	return ret;
+}
+#endif
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+/*
+ * Fill a just-allocated piece of memory with "random" data.  It's not really
+ * very random, just a repeating sequence with a length that's prime.  What
+ * we mainly want out of it is to have a good probability that two palloc's
+ * of the same number of bytes start out containing different data.
+ *
+ * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
+ * we fill it.  Filling the region makes it DEFINED, so make it UNDEFINED
+ * again afterward.  Whether to finally make it UNDEFINED or NOACCESS is
+ * fairly arbitrary.  UNDEFINED is more convenient for SlabRealloc(), and
+ * other callers have no preference.
+ */
+static void
+randomize_mem(char *ptr, size_t size)
+{
+	static int	save_ctr = 1;
+	size_t		remaining = size;
+	int			ctr;
+
+	ctr = save_ctr;
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	while (remaining-- > 0)
+	{
+		*ptr++ = ctr;
+		if (++ctr > 251)
+			ctr = 1;
+	}
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
+	save_ctr = ctr;
+}
+#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * SlabContextCreate
+ *		Create a new Slab context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (for debugging --- string will be copied)
+ * blockSize: allocation block size
+ * chunkSize: allocation chunk size
+ */
+MemoryContext
+SlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize)
+{
+	int		chunksPerBlock;
+	Size	fullChunkSize;
+	Slab	set;
+
+	/* chunk, including SLAB header (both addresses nicely aligned) */
+	fullChunkSize = MAXALIGN(sizeof(SlabChunkData) + MAXALIGN(chunkSize));
+
+	/* make sure the block can store at least one chunk (with 1B for a bitmap)? */
+	if (blockSize - sizeof(SlabChunkData) < fullChunkSize + 1)
+		elog(ERROR, "block size %ld for slab is too small for chunks %ld",
+					blockSize, chunkSize);
+
+	/* so how many chunks can we fit into a block, including header and bitmap? */
+	chunksPerBlock
+		=  (8 * (blockSize - sizeof(SlabBlockData)) - 7) / (8 * fullChunkSize + 1);
+
+	/* if we can't fit at least one chunk into the block, we're hosed */
+	Assert(chunksPerBlock > 0);
+
+	/* make sure the chunks (and bitmap) actually fit on the block  */
+	Assert(fullChunkSize * chunksPerBlock + ((chunksPerBlock + 7) / 8) + sizeof(SlabBlockData) <= blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (Slab) MemoryContextCreate(T_SlabContext,
+									 /* allocate context and freelist at once */
+									 (offsetof(SlabContext, freelist) + sizeof(SlabChunk) * (chunksPerBlock + 1)),
+									 &SlabMethods,
+									 parent,
+									 name);
+
+	set->blockSize = blockSize;
+	set->chunkSize = chunkSize;
+	set->fullChunkSize = fullChunkSize;
+	set->chunksPerBlock = chunksPerBlock;
+	set->nblocks = 0;
+	set->minFreeCount = 0;
+	set->autodestruct = false;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * SlabInit
+ *		Context-type-specific initialization routine. SlabContext does not
+ *		need anything extra, at this moment.
+ */
+static void
+SlabInit(MemoryContext context)
+{
+	/*
+	 * Since MemoryContextCreate already zeroed the context node, we don't
+	 * have to do anything here: it's already OK.
+	 */
+}
+
+/*
+ * SlabReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+SlabReset(MemoryContext context)
+{
+	int		i;
+	Slab	set = (Slab) context;
+
+	AssertArg(SlabIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	SlabCheck(context);
+#endif
+
+	/* walk over freelists and free the blocks */
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		SlabBlock block = set->freelist[i];
+		set->freelist[i] = NULL;
+
+		while (block != NULL)
+		{
+			SlabBlock	next = block->next;
+
+			/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+			wipe_mem(block, set->blockSize);
+#endif
+			free(block);
+			set->nblocks--;
+
+			block = next;
+		}
+	}
+
+	set->minFreeCount = 0;
+	set->autodestruct = false;
+
+	Assert(set->nblocks == 0);
+}
+
+/*
+ * SlabDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call SlabReset().
+ */
+static void
+SlabDelete(MemoryContext context)
+{
+	/* just reset the context */
+	SlabReset(context);
+}
+
+/* operations on the freelist - adding/removing/moving blocks */
+static void
+remove_from_freelist(Slab set, SlabBlock block, int nfree_old)
+{
+	/* either it has a previous block, or it's the first block in list */
+	if (block->prev)
+		block->prev->next = block->next;
+	else
+		set->freelist[nfree_old] = block->next;
+
+	/* if it has a next block, update it too */
+	if (block->next)
+		block->next->prev = block->prev;
+
+	block->prev = NULL;
+	block->next = NULL;
+}
+
+static void
+add_to_freelist(Slab set, SlabBlock block)
+{
+	/* otherwise add it to the proper freelist bin */
+	if (set->freelist[block->nfree])
+		set->freelist[block->nfree]->prev = block;
+
+	block->next = set->freelist[block->nfree];
+	set->freelist[block->nfree] = block;
+}
+
+static void
+move_in_freelist(Slab set, SlabBlock block, int nfree_old)
+{
+	remove_from_freelist(set, block, nfree_old);
+	add_to_freelist(set, block);
+}
+
+
+/*
+ * SlabAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - SLAB_BLOCKHDRSZ - SLAB_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+SlabAlloc(MemoryContext context, Size size)
+{
+	Slab	set = (Slab) context;
+	SlabBlock	block;
+	SlabChunk	chunk;
+	int			idx;
+
+	AssertArg(SlabIsValid(set));
+
+	Assert((set->minFreeCount >= 0) && (set->minFreeCount < set->chunksPerBlock));
+
+	/* make sure we only allow correct request size */
+	if (size != set->chunkSize)
+		elog(ERROR, "unexpected alloc chunk size %ld (expected %ld)",
+					size, set->chunkSize);
+
+	/*
+	 * If there are no free chunks in any existing block, create a new block
+	 * and put it to the last freelist bucket.
+	 */
+	if (set->minFreeCount == 0)
+	{
+		block = (SlabBlock)malloc(set->blockSize);
+
+		if (block == NULL)
+			return NULL;
+
+		memset(block, 0, set->blockSize);
+
+		block->slab = set;
+		block->nfree = set->chunksPerBlock;
+		block->prev = NULL;
+		block->next = NULL;
+		block->firstFreeChunk = 0;
+
+		/* the free bitmap is placed at the end */
+		block->bitmapptr
+			= ((char *) block) + set->blockSize - ((set->chunksPerBlock + 7) / 8);
+
+		/*
+		 * And add it to the last freelist with all chunks empty (we know
+		 * there are no blocks in the freelist, otherwise we wouldn't need
+		 * a new block.
+		 */
+		set->freelist[set->chunksPerBlock] = block;
+		set->minFreeCount = set->chunksPerBlock;
+		set->nblocks += 1;
+	}
+
+	/* grab the block from the freelist (even the new block is there) */
+	block = set->freelist[set->minFreeCount];
+
+	/* make sure we actually got a valid block, with matching nfree */
+	Assert(block != NULL);
+	Assert(set->minFreeCount == block->nfree);
+	Assert(block->nfree > 0);
+
+	Assert((char*)block < block->bitmapptr);
+	Assert((char*)block + set->blockSize > block->bitmapptr);
+
+	/* we know the first free chunk */
+	idx = block->firstFreeChunk;
+
+	/* make sure the chunk index is valid, and that it's marked as empty */
+	Assert((idx >= 0) && (idx < set->chunksPerBlock));
+	Assert(!((block->bitmapptr[idx/8] & (0x01 << (idx % 8)))));
+
+	/* mark the chunk as used (set 1 to the bit) */
+	block->bitmapptr[idx/8] |= (0x01 << (idx % 8));
+
+	/* compute the chunk location block start (after the block header) */
+	chunk = (SlabChunk) ((char*)block + sizeof(SlabBlockData)
+									  + (idx * set->fullChunkSize));
+
+	/*
+	 * update the block nfree count, and also the minFreeCount as we've
+	 * decreased nfree for a block with the minimum count
+	 */
+	block->nfree--;
+	set->minFreeCount = block->nfree;
+
+	/* but we need to find the next one, for the next alloc call (unless the
+	 * block just got full, in that case simply set it to -1 */
+	if (block->nfree == 0)
+		block->firstFreeChunk = set->chunksPerBlock;
+	else
+	{
+		/* look for the next free chunk in the block, after the first one */
+		while ((++block->firstFreeChunk) < set->chunksPerBlock)
+		{
+			int byte = block->firstFreeChunk / 8;
+			int bit  = block->firstFreeChunk % 8;
+
+			/* stop when you find 0 (unused chunk) */
+			if (! (block->bitmapptr[byte] & (0x01 << bit)))
+				break;
+		}
+
+		/* must have found the free chunk */
+		Assert(block->firstFreeChunk != set->chunksPerBlock);
+	}
+
+	/* move the block to the right place in the freelist */
+	move_in_freelist(set, block, (block->nfree + 1));
+
+	/* but if the minimum is 0, we need to look for a new one */
+	if (set->minFreeCount == 0)
+		for (idx = 1; idx <= set->chunksPerBlock; idx++)
+			if (set->freelist[idx])
+			{
+				set->minFreeCount = idx;
+				break;
+			}
+
+	if (set->minFreeCount == set->chunksPerBlock)
+		set->minFreeCount = 0;
+
+	/* Prepare to initialize the chunk header. */
+	VALGRIND_MAKE_MEM_UNDEFINED(chunk, SLAB_CHUNK_USED);
+
+	chunk->slab = (void *) set;
+	chunk->block = (void *) block;
+	chunk->size = MAXALIGN(size);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	chunk->requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+							   sizeof(chunk->requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->size)
+		set_sentinel(SlabChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) SlabChunkGetPointer(chunk), size);
+#endif
+
+	SlabAllocInfo(set, chunk);
+	return SlabChunkGetPointer(chunk);
+}
+
+/*
+ * SlabFree
+ *		Frees allocated memory; memory is removed from the set.
+ */
+static void
+SlabFree(MemoryContext context, void *pointer)
+{
+	int		idx;
+	Slab	set = (Slab) context;
+	SlabChunk	chunk = SlabPointerGetChunk(pointer);
+	SlabBlock	block = chunk->block;
+
+	SlabFreeInfo(set, chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+							  sizeof(chunk->requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->requested_size < chunk->size)
+		if (!sentinel_ok(pointer, chunk->requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+	/* compute index wrt to block start */
+	idx = ((char*)chunk - ((char*)block + sizeof(SlabBlockData))) / set->fullChunkSize;
+
+	Assert((block->bitmapptr[idx/8] & (0x01 << (idx % 8))));
+
+	/* mark the chunk as unused (set 0 to the bit), and update block nfree count */
+	block->bitmapptr[idx/8] ^= (0x01 << (idx % 8));
+	block->nfree++;
+	block->firstFreeChunk = Min(block->firstFreeChunk, idx);
+
+	Assert(block->nfree > 0);
+	Assert(block->nfree <= set->chunksPerBlock);
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->requested_size = 0;
+#endif
+
+	/* now decide what to do with the block */
+
+	/*
+	 * See if we need to update the minFreeCount field for the set - we only
+	 * need to do that if the block had that number of free chunks before we
+	 * freed one. In that case, we check if there still are blocks with that
+	 * number of free chunks - we can simply check if the chunk has siblings.
+	 * Otherwise we simply increment the value by one, as the new block is
+	 * still the one with minimum free chunks (even without the one chunk).
+	 */
+	if (set->minFreeCount == (block->nfree-1))
+		if ((block->prev == NULL) && (block->next == NULL)) /* no other blocks */
+		{
+			/* but if we made the block entirely free, we'll free it */
+			if (block->nfree == set->chunksPerBlock)
+				set->minFreeCount = 0;
+			else
+				set->minFreeCount++;
+		}
+
+	/* remove the block from a freelist */
+	remove_from_freelist(set, block, block->nfree-1);
+
+	/* If the block is now completely empty, free it. */
+	if (block->nfree == set->chunksPerBlock)
+	{
+		free(block);
+		set->nblocks--;
+	}
+	else
+		add_to_freelist(set, block);
+
+	Assert(set->nblocks >= 0);
+
+	/*
+	 * If we've just released the last block in the context, destruct it.
+	 *
+	 * XXX But don't do that if the context has children.
+	 */
+	if (set->autodestruct && (set->nblocks == 0) && (context->firstchild == NULL))
+		MemoryContextDelete(context);
+}
+
+/*
+ * SlabRealloc
+ *		As Slab is designed for allocating equally-sized chunks of memory, it
+ *		can't really do an actual realloc. However we try to be gentle and
+ *		allow calls with exactly the same size as in that case we can simply
+ *		return the same chunk. When the size differs, we fail with assert
+ *		failure or return NULL.
+ *
+ *	XXX We might be even gentler and allow cases when (size < chunkSize).
+ */
+static void *
+SlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Slab	set = (Slab)context;
+
+	/* can't do actual realloc with slab, but let's try to be gentle */
+	if (size == set->chunkSize)
+		return pointer;
+
+	elog(ERROR, "slab allocator does not support realloc()");
+}
+
+/*
+ * SlabGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+SlabGetChunkSpace(MemoryContext context, void *pointer)
+{
+	SlabChunk	chunk = SlabPointerGetChunk(pointer);
+
+	return chunk->size + SLAB_CHUNKHDRSZ;
+}
+
+/*
+ * SlabIsEmpty
+ *		Is an Slab empty of any allocated space?
+ */
+static bool
+SlabIsEmpty(MemoryContext context)
+{
+	Slab		set = (Slab)context;
+	return (set->nblocks == 0);
+}
+
+/*
+ * SlabStats
+ *		Compute stats about memory consumption of an Slab.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Slab into *totals.
+ */
+static void
+SlabStats(MemoryContext context, int level, bool print,
+			  MemoryContextCounters *totals)
+{
+	Slab		set = (Slab) context;
+	Size		nblocks = 0;
+	Size		freechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+	int			i;
+
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		SlabBlock block = set->freelist[i];
+		while (block != NULL)
+		{
+			nblocks++;
+			totalspace += set->blockSize;
+			freespace += set->fullChunkSize * block->nfree;
+			freechunks += block->nfree;
+			block = block->next;
+		}
+	}
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"%s: %zu total in %zd blocks; %zu free (%zd chunks); %zu used; autodestruct %d\n",
+				set->header.name, totalspace, nblocks, freespace, freechunks,
+				totalspace - freespace, set->autodestruct);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += freechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+void
+SlabAutodestruct(MemoryContext context)
+{
+	Slab	set = (Slab)context;
+
+	Assert(IsA(set, SlabContext));
+
+	set->autodestruct = true;
+}
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * SlabCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+SlabCheck(MemoryContext context)
+{
+	int i;
+	Slab	slab = (Slab) context;
+	char	   *name = slab->header.name;
+
+	for (i = 0; i <= slab->chunksPerBlock; i++)
+	{
+		int j, nfree;
+		SlabBlock block = slab->freelist[i];
+
+		/* no entries in this freelist slot */
+		if (! block)
+			continue;
+
+		if (block->slab != (void *) slab)
+			elog(WARNING, "problem in slab %s: bogus slab link in block %p",
+				 name, block);
+
+		/*
+		 * Make sure the number of free chunks (in the block header) matches
+		 * position in the freelist.
+		 */
+		if (block->nfree != i)
+			elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match freelist %d",
+				 name, block->nfree, block, i);
+
+		/*
+		 * Now walk through the chunks, count the free ones and also perform
+		 * some additional checks for the used ones.
+		 */
+
+		nfree = 0;
+		for (j = 0; j <= slab->chunksPerBlock; j++)
+		{
+			/* non-zero bit in the bitmap means chunk the chunk is used */
+			if ((block->bitmapptr[j/8] & (0x01 << (j % 8))) != 0)
+			{
+				SlabChunk chunk = (SlabChunk) ((char*)block + sizeof(SlabBlockData)
+								  + (j * slab->fullChunkSize));
+
+				VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+										  sizeof(chunk->requested_size));
+
+				/* we're in a no-freelist branch */
+				VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+										   sizeof(chunk->requested_size));
+
+				/* chunks have both block and slab pointers, so check both of them */
+
+				if (chunk->block != block)
+					elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p",
+						 name, block, chunk);
+
+				if (chunk->slab != slab)
+					elog(WARNING, "problem in slab %s: bogus slab link in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* now make sure the chunk size is correct */
+				if (chunk->size != MAXALIGN(slab->chunkSize))
+					elog(WARNING, "problem in slab %s: bogus chunk size in block %p, chunk %p",
+						 name, block, chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+				/* now make sure the chunk size is correct */
+				if (chunk->size != slab->chunkSize)
+					elog(WARNING, "problem in slab %s: bogus chunk requested size in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* there might be sentinel (thanks to alignment) */
+				if (chunk->requested_size < chunk->size &&
+					!sentinel_ok(chunk, SLAB_CHUNKHDRSZ + chunk->requested_size))
+					elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
+						 name, block, chunk);
+#endif
+			}
+			else
+				/* free chunk */
+				nfree += 1;
+		}
+
+		/*
+		 * Make sure we got the expected number of free chunks (as tracked in
+		 * the block header).
+		 */
+		if (nfree != block->nfree)
+			elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match bitmap %d",
+				 name, block->nfree, block, nfree);
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index ba069cc..92a7478 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,6 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext)))
+	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 88297bb..2cbbfec 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -275,6 +275,7 @@ typedef enum NodeTag
 	 */
 	T_MemoryContext = 600,
 	T_AllocSetContext,
+	T_SlabContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 9e209ae..e8a8d77 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -331,6 +331,19 @@ struct ReorderBuffer
 	MemoryContext context;
 
 	/*
+	 * slab contexts for change and TXN objects.
+	 */
+	MemoryContext change_context;
+	MemoryContext txn_context;
+	MemoryContext tup_context_slab;
+	MemoryContext tup_context_oversized;
+
+	/* counters for current generation of tuples */
+	int		tuples_count;
+	Size	tuples_size;
+	Size	current_size;
+
+	/*
 	 * Data structure slab cache.
 	 *
 	 * We allocate/deallocate some structures very frequently, to avoid bigger
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index e6334a2..fd2c9c2 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -135,6 +135,14 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
 					  Size initBlockSize,
 					  Size maxBlockSize);
 
+/* slab.c */
+extern MemoryContext SlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize);
+
+extern void SlabAutodestruct(MemoryContext context);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
@@ -171,4 +179,7 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
  */
 #define ALLOCSET_SEPARATE_THRESHOLD  8192
 
+#define SLAB_DEFAULT_BLOCK_SIZE		8192
+#define SLAB_LARGE_BLOCK_SIZE		(8 * 1024 * 1024)
+
 #endif   /* MEMUTILS_H */
-- 
2.5.5

0002-generational-slab-auto-tuning-allocator.patchbinary/octet-stream; name=0002-generational-slab-auto-tuning-allocator.patchDownload
From 64a0b078127d371bc64520c508b253058dc70b09 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Wed, 20 Jul 2016 23:46:36 +0200
Subject: [PATCH 2/2] generational slab (auto-tuning allocator)

---
 src/backend/replication/logical/reorderbuffer.c |  71 +----
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/genslab.c                | 347 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   4 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |   8 +-
 src/include/utils/memutils.h                    |   7 +
 7 files changed, 369 insertions(+), 71 deletions(-)
 create mode 100644 src/backend/utils/mmgr/genslab.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 00e2b7b..42a3792 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -251,17 +251,12 @@ ReorderBufferAllocate(void)
 									SLAB_DEFAULT_BLOCK_SIZE,
 									sizeof(ReorderBufferTXN));
 
-	buffer->tup_context_slab = SlabContextCreate(new_ctx,
+	buffer->tup_context = GenSlabContextCreate(new_ctx,
 									"TuplesSlab",
 									SLAB_LARGE_BLOCK_SIZE,
 									sizeof(ReorderBufferTupleBuf) +
-									MAXIMUM_ALIGNOF + MaxHeapTupleSize);
-
-	buffer->tup_context_oversized = AllocSetContextCreate(new_ctx,
-									"TuplesOversized",
-									ALLOCSET_DEFAULT_MINSIZE,
-									ALLOCSET_DEFAULT_INITSIZE,
-									ALLOCSET_DEFAULT_MAXSIZE);
+									MAXIMUM_ALIGNOF + MaxHeapTupleSize,
+									TUPLES_PER_GENERATION);
 
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
@@ -282,17 +277,11 @@ ReorderBufferAllocate(void)
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
-	buffer->tuples_count = 0;
-	buffer->tuples_size = 0;
-
 	dlist_init(&buffer->toplevel_by_lsn);
 	dlist_init(&buffer->cached_transactions);
 	dlist_init(&buffer->cached_changes);
 	slist_init(&buffer->cached_tuplebufs);
 
-	buffer->current_size = sizeof(ReorderBufferTupleBuf) +
-						   MAXIMUM_ALIGNOF + MaxHeapTupleSize;
-
 	return buffer;
 }
 
@@ -444,54 +433,12 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/* see if we need to allocate a new context generation */
-	if (rb->tuples_count == TUPLES_PER_GENERATION)
-	{
-		Size	new_size;
-		Size	avg_length = (rb->tuples_size / rb->tuples_count);
-
-		/* mark the current SLAB context for automatic destruction */
-		SlabAutodestruct(rb->tup_context_slab);
-
-		/* assume +50% is enough slack to fit most tuples into the slab context */
-		new_size = MAXALIGN(avg_length * 1.5);
-
-		rb->current_size = new_size;
-		rb->tup_context_slab = SlabContextCreate(rb->context,
-									"TuplesSlab",
-									SLAB_LARGE_BLOCK_SIZE,
-									sizeof(ReorderBufferTupleBuf) +
-									MAXIMUM_ALIGNOF + rb->current_size);
-
-		/* we could also recreate the aset context, with block sizes set so
-		 * that the palloc always does malloc(), but not sure about that */
-
-		rb->tuples_count = 0;
-		rb->tuples_size = 0;
-	}
-
-	rb->tuples_count += 1;
-	rb->tuples_size  += alloc_len;
-
-	/* if small enough, check the slab cache */
-	if (alloc_len <= rb->current_size)
-	{
-		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->tup_context_slab,
-							   sizeof(ReorderBufferTupleBuf) +
-							   MAXIMUM_ALIGNOF + rb->current_size);
-		tuple->alloc_tuple_size = rb->current_size;
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-	}
-	else
-	{
-		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->tup_context_oversized,
-							   sizeof(ReorderBufferTupleBuf) +
-							   MAXIMUM_ALIGNOF + alloc_len);
-		tuple->alloc_tuple_size = alloc_len;
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-	}
+	tuple = (ReorderBufferTupleBuf *)
+		MemoryContextAlloc(rb->tup_context,
+						   sizeof(ReorderBufferTupleBuf) +
+						   MAXIMUM_ALIGNOF + alloc_len);
+	tuple->alloc_tuple_size = alloc_len;
+	tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
 
 	return tuple;
 }
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index 321289f..08b5e3a 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o mcxt.o portalmem.o slab.o
+OBJS = aset.o genslab.o mcxt.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/genslab.c b/src/backend/utils/mmgr/genslab.c
new file mode 100644
index 0000000..1f300aa
--- /dev/null
+++ b/src/backend/utils/mmgr/genslab.c
@@ -0,0 +1,347 @@
+/*-------------------------------------------------------------------------
+ *
+ * genslab.c
+ *	  Generational SLAB allocator definitions.
+ *
+ * An extension of the SLAB allocator relaxing the fixed-size limitation by
+ * using a generational design.
+ *
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/genslab.c
+ *
+ *
+ *	The simple SLAB allocator only allows allocating chunks with exactly the
+ *	same size. That only works for some special cases, e.g. when the context
+ *	is only used for instances of a single structure with fixed size.
+ * 
+ *	This implementation tries to relax this restriction by treating the chunk
+ *	size as an upper boundary, and using a regular AllocSet context to serve
+ *	requests for larger pieces of memory.
+ *
+ *	Furthermore, instead of using a single SLAB context (fixing the maximum
+ *	chunk size) it's possible to automatically tune the chunk size based on
+ *	past allocations. This is done by replacing the single SLAB context with
+ *	a sequence of contexts (with only the last one used for allocations).
+ *
+ *	This works particularly well when we can't predict the size of the
+ *	objects easily, but we know that the size is unlikely to vary too much.
+ *	It also works quite nicely when the memory is freed in about the same
+ *	sequence as it was allocated, because the old SLAB contexts will get
+ *	empty and freed automatically (one of the benefits of SLAB contexts).
+ *
+ *	A good example is ReorderBuffer - the tuples tend to be of about the
+ *	same size, and freed in roughly the same sequence as allocated.
+ *
+ *	In a sense, this delegates the allocation to actual implementations,
+ *	which also handle CLOBBER_FREED_MEMORY and MEMORY_CONTEXT_CHECKING.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+
+/*
+ * GenSlabContext is a self-tuning version of SlabContext.
+ */
+typedef struct GenSlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+
+	MemoryContext	slab;
+	MemoryContext	aset;
+
+	/* SLAB parameters */
+	Size		blockSize;		/* block size */
+	Size		chunkSize;		/* chunk size */
+
+	/* counters used for tuning chunk size */
+
+	Size		nbytes;			/* bytes allocated (as requested) */
+	int			nallocations;	/* number of allocations */
+	int			maxallocations;	/* self-tune after number of allocations */
+
+} GenSlabContext;
+
+typedef GenSlabContext *GenSlab;
+
+/*
+ * These functions implement the MemoryContext API for GenSlab contexts.
+ */
+static void *GenSlabAlloc(MemoryContext context, Size size);
+static void GenSlabFree(MemoryContext context, void *pointer);
+static void *GenSlabRealloc(MemoryContext context, void *pointer, Size size);
+static void GenSlabInit(MemoryContext context);
+static void GenSlabReset(MemoryContext context);
+static void GenSlabDelete(MemoryContext context);
+static Size GenSlabGetChunkSpace(MemoryContext context, void *pointer);
+static bool GenSlabIsEmpty(MemoryContext context);
+static void GenSlabStats(MemoryContext context, int level, bool print,
+			  MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void GenSlabCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Slab contexts.
+ */
+static MemoryContextMethods GenSlabMethods = {
+	GenSlabAlloc,
+	GenSlabFree,
+	GenSlabRealloc,
+	GenSlabInit,
+	GenSlabReset,
+	GenSlabDelete,
+	GenSlabGetChunkSpace,
+	GenSlabIsEmpty,
+	GenSlabStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,GenSlabCheck
+#endif
+};
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenSlabContextCreate
+ *		Create a new GenSlab context.
+ */
+MemoryContext
+GenSlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize,
+					  int maxAllocations)
+{
+	GenSlab	set;
+
+	/* Do the type-independent part of context creation */
+	set = (GenSlab) MemoryContextCreate(T_GenSlabContext,
+										sizeof(GenSlabContext),
+										&GenSlabMethods,
+										parent,
+										name);
+
+	/* the default context */
+	set->slab = SlabContextCreate((MemoryContext)set,
+								  "slab",
+								  blockSize,
+								  chunkSize);
+
+	/*
+	 * TODO Maybe we could set the parameters so that all requests exceeding
+	 * the SLAB chunk size (and thus falling through to the AllocSet) also
+	 * exceed allocChunkLimit and thus get allocated using malloc(). That's
+	 * more expensive, but vast majority of requests should be handled by
+	 * the SLAB context anyway. And chunks over allocChunkLimit are freed
+	 * immediately, which is also nice.
+	 */
+	set->aset = AllocSetContextCreate((MemoryContext)set,
+									 "oversized",
+									 ALLOCSET_DEFAULT_MINSIZE,
+									 ALLOCSET_DEFAULT_INITSIZE,
+									 ALLOCSET_DEFAULT_MAXSIZE);
+
+	set->blockSize = blockSize;
+	set->nbytes = 0;
+	set->nallocations = 0;
+	set->maxallocations = maxAllocations;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * GenSlabInit
+ *		Context-type-specific initialization routine. Simply delegate the
+ *		child contexts.
+ */
+static void
+GenSlabInit(MemoryContext context)
+{
+	GenSlab set = (GenSlab)context;
+
+	set->nallocations = 0;
+	set->nbytes = 0;
+}
+
+/*
+ * GenSlabReset
+ *		Frees all memory which is allocated in the given set. We also get
+ *		rid of all the old SLAB generations and only keep the current one.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+GenSlabReset(MemoryContext context)
+{
+	GenSlab	set = (GenSlab) context;
+
+	set->nallocations = 0;
+	set->nbytes = 0;
+}
+
+/*
+ * GenSlabDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We don't really need to do anything special
+ *		as MemoryContextDelete deletes child contexts automatically.
+ */
+static void
+GenSlabDelete(MemoryContext context)
+{
+	/* just reset (although not really necessary) */
+	GenSlabReset(context);
+}
+
+/*
+ * GenSlabAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - SLAB_BLOCKHDRSZ - SLAB_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+GenSlabAlloc(MemoryContext context, Size size)
+{
+	GenSlab	set = (GenSlab) context;
+
+	/* do we need to auto-tune the SLAB chunk size */
+	if (set->nallocations > set->maxallocations)
+	{
+		/*
+		 * TODO we could also assume the requests follow normal distribution,
+		 * computing stddev and then computing a chosen percentile (e.g. 0.95).
+		 * For now we simply use 1.5x the average, as it's simple.
+		 */
+
+		/* compute the new chunk size */
+		Size chunkSize = (1.5 * set->nbytes) / set->nallocations;
+
+		/* mark for autodestruction */
+		SlabAutodestruct(set->slab);
+
+		set->slab = SlabContextCreate((MemoryContext)set,
+									  "slab",
+									  set->blockSize,
+									  chunkSize);
+
+		set->chunkSize = chunkSize;
+		set->nallocations = 0;
+		set->nbytes = 0;
+	}
+
+	if (size <= set->chunkSize)
+		return MemoryContextAlloc(set->slab, set->chunkSize);
+	else
+		return MemoryContextAlloc(set->aset, size);
+}
+
+/*
+ * GenSlabFree
+ *		As the memory is actually allocated in other contexts, we should
+ *		never really get here.
+ *
+ * FIXME Although someone could call MemoryContextFree directly.
+ */
+static void
+GenSlabFree(MemoryContext context, void *pointer)
+{
+	return pfree(pointer);
+}
+
+/*
+ * GenSlabRealloc
+ *		As the memory is actually allocated in other contexts, we should
+ *		never really get here.
+ *
+ * FIXME Although someone could call MemoryContextRealloc directly.
+ */
+static void *
+GenSlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+	return repalloc(pointer, size);
+}
+
+/*
+ * GenSlabGetChunkSpace
+ *		As the memory is actually allocated in other contexts, we should
+ *		never really get here.
+ *
+ * FIXME Although someone could call MemoryContextGetChunkSpace directly.
+ */
+static Size
+GenSlabGetChunkSpace(MemoryContext context, void *pointer)
+{
+	return GetMemoryChunkSpace(pointer);
+}
+
+/*
+ * GenSlabIsEmpty
+ *		Is an GenSlab empty of any allocated space?
+ *
+ * TODO This does not really work, as MemoryContextIsEmpty returns false if
+ * 		there are any children, and GenSlab always has at least two.
+ */
+static bool
+GenSlabIsEmpty(MemoryContext context)
+{
+	/* */
+	return true;
+}
+
+/*
+ * GenSlabStats
+ *		Compute stats about memory consumption of an GenSlab.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Slab into *totals.
+ */
+static void
+GenSlabStats(MemoryContext context, int level, bool print,
+			  MemoryContextCounters *totals)
+{
+	GenSlab		set = (GenSlab) context;
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr, "%s\n", set->header.name);
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenSlabCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+GenSlabCheck(MemoryContext context)
+{
+	
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index 92a7478..aae2349 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,8 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
+	 (IsA((context), AllocSetContext) || \
+	  IsA((context), SlabContext) || \
+	  IsA((context), GenSlabContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 2cbbfec..2992b6e 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -276,6 +276,7 @@ typedef enum NodeTag
 	T_MemoryContext = 600,
 	T_AllocSetContext,
 	T_SlabContext,
+	T_GenSlabContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index e8a8d77..2dfab26 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -335,13 +335,7 @@ struct ReorderBuffer
 	 */
 	MemoryContext change_context;
 	MemoryContext txn_context;
-	MemoryContext tup_context_slab;
-	MemoryContext tup_context_oversized;
-
-	/* counters for current generation of tuples */
-	int		tuples_count;
-	Size	tuples_size;
-	Size	current_size;
+	MemoryContext tup_context;
 
 	/*
 	 * Data structure slab cache.
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index fd2c9c2..f4417d5 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -143,6 +143,13 @@ extern MemoryContext SlabContextCreate(MemoryContext parent,
 
 extern void SlabAutodestruct(MemoryContext context);
 
+/* genslab.c */
+extern MemoryContext GenSlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize,
+					  int maxAllocations);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
-- 
2.5.5

#6John Gorman
johngorman2@gmail.com
In reply to: Tomas Vondra (#5)
Re: PATCH: two slab-like memory allocators

I reproduced the quadradic pfree performance problem and verified that
these patches solved it.

The slab.c data structures and functions contain no quadradic components.

I noticed the sizing loop in SlabContextCreate() and came up with
a similar formula to determine chunksPerBlock that you arrived at.

Firstly, I've realized there's an issue when chunkSize gets too
large - once it exceeds blockSize, the SlabContextCreate() fails
as it's impossible to place a single chunk into the block. In
reorderbuffer, this may happen when the tuples (allocated in
tup_context) get larger than 8MB, as the context uses
SLAB_LARGE_BLOCK_SIZE (which is 8MB).

But maybe there's a simpler solution - we may simply cap the
chunkSize (in GenSlab) to ALLOC_CHUNK_LIMIT. That's fine, because
AllocSet handles those requests in a special way - for example
instead of tracking them in freelist, those chunks got freed
immediately.

I like this approach because it fixes the performance problems
with smaller allocations and doesn't change how larger
allocations are handled.

In slab.c it looks like a line in the top comments could be clearer.
Perhaps this is what is meant.

< * (plus alignment), now wasting memory.

* (plus alignment), not wasting memory.

In slab.c some lines are over 80 characters could be folded.

It would be nice to give each patch version a unique file name.

Nice patch, I enjoyed reading it!

Best, John

John Gorman

On Mon, Sep 26, 2016 at 10:10 PM, Tomas Vondra <tomas.vondra@2ndquadrant.com

Show quoted text

wrote:

Hi,

Attached is v2 of the patch, updated based on the review. That means:

- Better comment explaining how free chunks are tracked in Slab context.

- Removed the unused SlabPointerIsValid macro.

- Modified the comment before SlabChunkData, explaining how it relates
to StandardChunkHeader.

- Replaced the two Assert() calls with elog().

- Implemented SlabCheck(). I've ended up with quite a few checks there,
checking pointers between the context, block and chunks, checks due
to MEMORY_CONTEXT_CHECKING etc. And of course, cross-checking the
number of free chunks (bitmap, freelist vs. chunk header).

- I've also modified SlabContextCreate() to compute chunksPerBlock a
bit more efficiently (use a simple formula instead of the loop, which
might be a bit too expensive for large blocks / small chunks).

I haven't done any changes to GenSlab, but I do have a few notes:

Firstly, I've realized there's an issue when chunkSize gets too large -
once it exceeds blockSize, the SlabContextCreate() fails as it's impossible
to place a single chunk into the block. In reorderbuffer, this may happen
when the tuples (allocated in tup_context) get larger than 8MB, as the
context uses SLAB_LARGE_BLOCK_SIZE (which is 8MB).

For Slab the elog(ERROR) is fine as both parameters are controlled by the
developer directly, but GenSlab computes the chunkSize on the fly, so we
must not let it fail like that - that'd result in unpredictable failures,
which is not very nice.

I see two ways to fix this. We may either increase the block size
automatically - e.g. instead of specifying specifying chunkSize and
blockSize when creating the Slab, specify chunkSize and chunksPerBlock (and
then choose the smallest 2^k block large enough). For example with
chunkSize=96 and chunksPerBlock=1000, we'd get 128kB blocks, as that's the
closest 2^k block larger than 96000 bytes.

But maybe there's a simpler solution - we may simply cap the chunkSize (in
GenSlab) to ALLOC_CHUNK_LIMIT. That's fine, because AllocSet handles those
requests in a special way - for example instead of tracking them in
freelist, those chunks got freed immediately.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#7Jim Nasby
Jim.Nasby@BlueTreble.com
In reply to: Tomas Vondra (#5)
Re: PATCH: two slab-like memory allocators

On 9/26/16 9:10 PM, Tomas Vondra wrote:

Attached is v2 of the patch, updated based on the review. That means:

+ /* make sure the block can store at least one chunk (with 1B for a
bitmap)? */
(and the comment below it)

I find the question to be confusing... I think these would be better as

+	/* make sure the block can store at least one chunk (with 1B for a 
bitmap) */
and
+	/* number of chunks can we fit into a block, including header and 
bitmap */

I'm also wondering if a 1B freespace bitmap is actually useful. If
nothing else, there should probably be a #define for the initial bitmap
size.

+ /* otherwise add it to the proper freelist bin */
Looks like something went missing... :)

Should zeroing the block in SlabAlloc be optional like it is with
palloc/palloc0?

+	/*
+	 * If there are no free chunks in any existing block, create a new block
+	 * and put it to the last freelist bucket.
+	 */
+	if (set->minFreeCount == 0)
Couldn't there be blocks that have a free count > minFreeCount? (In 
which case you don't want to just alloc a new block, no?)

Nevermind, after reading further down I understand what's going on. I
got confused by "we've decreased nfree for a block with the minimum
count" until I got to the part that deals with minFreeCount = 0. I think
it'd be helpful if the "we've decreased nfree" comment mentioned that if
nfree is 0 we'll look for the correct value after updating the freelists.

+ block->bitmapptr[idx/8] |= (0x01 << (idx % 8));
Did you consider using a pre-defined map of values to avoid the shift? I
know I've seen that somewhere in the code...

Patch 2...

Doesn't GenSlabReset() need to actually free something? If the call
magically percolates to the other contexts it'd be nice to note that in
the comment.
--
Jim Nasby, Data Architect, Blue Treble Consulting, Austin TX
Experts in Analytics, Data Architecture and PostgreSQL
Data in Trouble? Get it in Treble! http://BlueTreble.com
855-TREBLE2 (855-873-2532) mobile: 512-569-9461

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#8Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Jim Nasby (#7)
Re: PATCH: two slab-like memory allocators

On 10/02/2016 01:53 AM, Jim Nasby wrote:

On 9/26/16 9:10 PM, Tomas Vondra wrote:

Attached is v2 of the patch, updated based on the review. That means:

+ /* make sure the block can store at least one chunk (with 1B for a
bitmap)? */
(and the comment below it)

I find the question to be confusing... I think these would be better as

+    /* make sure the block can store at least one chunk (with 1B for a
bitmap) */
and
+    /* number of chunks can we fit into a block, including header and
bitmap */

Thanks, will rephrase the comments a bit.

I'm also wondering if a 1B freespace bitmap is actually useful. If
nothing else, there should probably be a #define for the initial
bitmap size.

That's not the point. The point is that we need to store at least one
entry per block, with one bit in a bitmap. But we can't store just a
single byte - we always allocate at least 1 byte. So it's more an
explanation for the "1" literal in the check, nothing more.

+ /* otherwise add it to the proper freelist bin */
Looks like something went missing... :)

Ummm? The patch contains this:

+	/* otherwise add it to the proper freelist bin */
+	if (set->freelist[block->nfree])
+		set->freelist[block->nfree]->prev = block;
+
+	block->next = set->freelist[block->nfree];
+	set->freelist[block->nfree] = block;

Which does exactly the thing it should do. Or what is missing?

Should zeroing the block in SlabAlloc be optional like it is with
palloc/palloc0?

Good catch. The memset(,0) should not be in SlabAlloc() as all, as the
zeroing is handled in mctx.c, independently of the implementation.

+    /*
+     * If there are no free chunks in any existing block, create a new
block
+     * and put it to the last freelist bucket.
+     */
+    if (set->minFreeCount == 0)
Couldn't there be blocks that have a free count > minFreeCount? (In
which case you don't want to just alloc a new block, no?)

Nevermind, after reading further down I understand what's going on. I
got confused by "we've decreased nfree for a block with the minimum
count" until I got to the part that deals with minFreeCount = 0. I think
it'd be helpful if the "we've decreased nfree" comment mentioned that if
nfree is 0 we'll look for the correct value after updating the freelists.

Right. I think it'd be good to add an assert ensuring the minFreeCount
value matches the block freelist. Or at least SlabCheck() could check this.

+ block->bitmapptr[idx/8] |= (0x01 << (idx % 8));
Did you consider using a pre-defined map of values to avoid the shift? I
know I've seen that somewhere in the code...

Patch 2...

Doesn't GenSlabReset() need to actually free something? If the call
magically percolates to the other contexts it'd be nice to note that in
the comment.

No, the other contexts are created as children of GenSlab, so the memory
context infrastructure resets them automatically. I don't think this
needs to be mentioned in the comments - it's pretty basic part of the
parent-child context relationship.

Thanks!

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#9Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: John Gorman (#6)
Re: PATCH: two slab-like memory allocators

On 10/02/2016 12:23 AM, John Gorman wrote:

I reproduced the quadradic pfree performance problem and verified
that these patches solved it.

The slab.c data structures and functions contain no quadradic
components.

I noticed the sizing loop in SlabContextCreate() and came up with a
similar formula to determine chunksPerBlock that you arrived at.

;-)

Firstly, I've realized there's an issue when chunkSize gets too
large - once it exceeds blockSize, the SlabContextCreate() fails
as it's impossible to place a single chunk into the block. In
reorderbuffer, this may happen when the tuples (allocated in
tup_context) get larger than 8MB, as the context uses
SLAB_LARGE_BLOCK_SIZE (which is 8MB).

But maybe there's a simpler solution - we may simply cap the
chunkSize (in GenSlab) to ALLOC_CHUNK_LIMIT. That's fine, because
AllocSet handles those requests in a special way - for example
instead of tracking them in freelist, those chunks got freed
immediately.

I like this approach because it fixes the performance problems
with smaller allocations and doesn't change how larger
allocations are handled.

Right.

In slab.c it looks like a line in the top comments could be clearer.
Perhaps this is what is meant.

< * (plus alignment), now wasting memory.

* (plus alignment), not wasting memory.

In slab.c some lines are over 80 characters could be folded.

It would be nice to give each patch version a unique file name.

OK, will fix.

Nice patch, I enjoyed reading it!

;-)

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#10Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: John Gorman (#6)
Re: PATCH: two slab-like memory allocators

On 10/02/2016 12:23 AM, John Gorman wrote:

I reproduced the quadradic pfree performance problem and verified that
these patches solved it.

The slab.c data structures and functions contain no quadradic components.

I noticed the sizing loop in SlabContextCreate() and came up with
a similar formula to determine chunksPerBlock that you arrived at.

Firstly, I've realized there's an issue when chunkSize gets too
large - once it exceeds blockSize, the SlabContextCreate() fails
as it's impossible to place a single chunk into the block. In
reorderbuffer, this may happen when the tuples (allocated in
tup_context) get larger than 8MB, as the context uses
SLAB_LARGE_BLOCK_SIZE (which is 8MB).

But maybe there's a simpler solution - we may simply cap the
chunkSize (in GenSlab) to ALLOC_CHUNK_LIMIT. That's fine, because
AllocSet handles those requests in a special way - for example
instead of tracking them in freelist, those chunks got freed
immediately.

I like this approach because it fixes the performance problems
with smaller allocations and doesn't change how larger
allocations are handled.

One more comment about GenSlab, particularly about unpredictability of
the repalloc() behavior. It works for "large" chunks allocated in the
AllocSet part, and mostly does not work for "small" chunks allocated in
the SlabContext. Moreover, the chunkSize changes over time, so for two
chunks of the same size, repalloc() may work on one of them and fail on
the other, depending on time of allocation.

With SlabContext it's perfectly predictable - repalloc() call fails
unless the chunk size is exactly the same as before (which is perhaps a
bit pointless, but if we decide to fail even in this case it'll work
100% time).

But with GenSlabContext it's unclear whether the call fails or not.

I don't like this unpredictability - I'd much rather have consistent
failures (making sure people don't do repalloc() on with GenSlab). But I
don't see a nice way to achieve that - the repalloc() call does not go
through GenSlabRealloc() at all, but directly to SlabRealloc() or
AllocSetRealloc().

The best solution I can think of is adding an alternate version of
AllocSetMethods, pointing to a different AllocSetReset implementation.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#11Michael Paquier
michael.paquier@gmail.com
In reply to: Tomas Vondra (#10)
Re: PATCH: two slab-like memory allocators

On Sun, Oct 2, 2016 at 10:15 AM, Tomas Vondra
<tomas.vondra@2ndquadrant.com> wrote:

One more comment about GenSlab, particularly about unpredictability of the
repalloc() behavior. It works for "large" chunks allocated in the AllocSet
part, and mostly does not work for "small" chunks allocated in the
SlabContext. Moreover, the chunkSize changes over time, so for two chunks of
the same size, repalloc() may work on one of them and fail on the other,
depending on time of allocation.

With SlabContext it's perfectly predictable - repalloc() call fails unless
the chunk size is exactly the same as before (which is perhaps a bit
pointless, but if we decide to fail even in this case it'll work 100% time).

But with GenSlabContext it's unclear whether the call fails or not.

I don't like this unpredictability - I'd much rather have consistent
failures (making sure people don't do repalloc() on with GenSlab). But I
don't see a nice way to achieve that - the repalloc() call does not go
through GenSlabRealloc() at all, but directly to SlabRealloc() or
AllocSetRealloc().

The best solution I can think of is adding an alternate version of
AllocSetMethods, pointing to a different AllocSetReset implementation.

You guys are still playing with this patch, so moved to next CF with
"waiting on author".
--
Michael

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#12John Gorman
johngorman2@gmail.com
In reply to: Tomas Vondra (#10)
Re: PATCH: two slab-like memory allocators

SlabContext has a parent context. It can delegate requests that
it cannot handle to the parent context which is GenSlab. Genslab
can send them to the sister aset context.

This could handle all reallocs so there will be no surprises.

Remind me again why we cannot realloc in place for sizes
smaller than chunkSize? GenSlab is happy to allocate smaller
sizes and put them into the fixed size chunks.

Maybe SlabAlloc can be happy with sizes up to chunkSize.

if (size <= set->chunkSize)
return MemoryContextAlloc(set->slab, size);
else
return MemoryContextAlloc(set->aset, size);

On Sat, Oct 1, 2016 at 10:15 PM, Tomas Vondra <tomas.vondra@2ndquadrant.com>
wrote:

Show quoted text

On 10/02/2016 12:23 AM, John Gorman wrote:

I reproduced the quadradic pfree performance problem and verified that
these patches solved it.

The slab.c data structures and functions contain no quadradic components.

I noticed the sizing loop in SlabContextCreate() and came up with
a similar formula to determine chunksPerBlock that you arrived at.

Firstly, I've realized there's an issue when chunkSize gets too

large - once it exceeds blockSize, the SlabContextCreate() fails
as it's impossible to place a single chunk into the block. In
reorderbuffer, this may happen when the tuples (allocated in
tup_context) get larger than 8MB, as the context uses
SLAB_LARGE_BLOCK_SIZE (which is 8MB).

But maybe there's a simpler solution - we may simply cap the
chunkSize (in GenSlab) to ALLOC_CHUNK_LIMIT. That's fine, because
AllocSet handles those requests in a special way - for example
instead of tracking them in freelist, those chunks got freed
immediately.

I like this approach because it fixes the performance problems
with smaller allocations and doesn't change how larger
allocations are handled.

One more comment about GenSlab, particularly about unpredictability of the
repalloc() behavior. It works for "large" chunks allocated in the AllocSet
part, and mostly does not work for "small" chunks allocated in the
SlabContext. Moreover, the chunkSize changes over time, so for two chunks
of the same size, repalloc() may work on one of them and fail on the other,
depending on time of allocation.

With SlabContext it's perfectly predictable - repalloc() call fails unless
the chunk size is exactly the same as before (which is perhaps a bit
pointless, but if we decide to fail even in this case it'll work 100% time).

But with GenSlabContext it's unclear whether the call fails or not.

I don't like this unpredictability - I'd much rather have consistent
failures (making sure people don't do repalloc() on with GenSlab). But I
don't see a nice way to achieve that - the repalloc() call does not go
through GenSlabRealloc() at all, but directly to SlabRealloc() or
AllocSetRealloc().

The best solution I can think of is adding an alternate version of
AllocSetMethods, pointing to a different AllocSetReset implementation.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#13Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: John Gorman (#12)
Re: PATCH: two slab-like memory allocators

On 10/04/2016 09:44 PM, John Gorman wrote:

SlabContext has a parent context. It can delegate requests that
it cannot handle to the parent context which is GenSlab. Genslab
can send them to the sister aset context.

But Slab may also be used separately, not just as part of GenSlab
(actually, reorderbuffer has two such contexts). That complicates things
quite a bit, and it also seems a bit awkward, because:

(a) It'd require a flag in SlabContext (or perhaps a pointer to the
second context), which introduces coupling between the contexts.

(b) SlabContext was meant to be extremely simple (based on the "single
chunk size" idea), and this contradicts that a bit.

(c) It'd move chunks between the memory contexts in unpredictable ways
(although the user should treat it as a single context, and not reset
the parts independently for example).

This could handle all reallocs so there will be no surprises.

Yeah, but it's also

Remind me again why we cannot realloc in place for sizes
smaller than chunkSize? GenSlab is happy to allocate smaller
sizes and put them into the fixed size chunks.

Maybe SlabAlloc can be happy with sizes up to chunkSize.

if (size <= set->chunkSize)
return MemoryContextAlloc(set->slab, size);
else
return MemoryContextAlloc(set->aset, size);

That'd be certainly possible, but it seems a bit strange as the whole
Slab is based on the idea that all chunks have the same size. Moreover,
people usually realloc() to a larger chunk, so it does not really fix
anything in practice.

In GenSlab, the situation is more complicated. But I don't like the
coupling / moving chunks between contexts, etc.

If realloc() support is a hard requirement, it immediately rules out
SlabContext() as an independent memory context. Which seems stupid, as
independent Slab contexts are quite useful for reorderbuffer use case.

For GenSlab the situation is less clear, as there probably are ways to
make it work, but I'd vote to keep it simple for now, and simply do
elog(ERROR) in the realloc() methods - both for Slab and GenSlab. The
current use case (reorderbuffer) does not need that, and it seems like a
can of worms to me.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#14Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Tomas Vondra (#13)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

Hi,

attached is v3 of the patches, with a few minor fixes in Slab, and much
larger fixes in GenSlab.

Slab (minor fixes)
------------------
- Removed the unnecessary memset() of new blocks in SlabAlloc(),
although we still need to zero the free bitmap at the end of the block.
- Renamed minFreeCount to minFreeChunks, added a few comments explaining
why/how firstFreeChunk and minFreeChunks are maintained.
- Fixed / improved a bunch of additional comments, based on feedback.

GenSlab
-------
Fixed a bunch of bugs that made GenSlab utterly useless. Firstly,
chunkSize was not stored in GenSlabContextCreate(), so this check in
SlabAlloc()

if (size <= set->chunkSize)
return MemoryContextAlloc(set->slab, set->chunkSize);
else
return MemoryContextAlloc(set->aset, size);

always fell through to the set->aset case, not allocating stuff in the
Slab at all.

Secondly, nallocations / nbytes counters were not updated at all, so the
Slab was never recreated, so GenSlab was not really generational.

This only affected 1 of 3 contexts in ReorderBuffer, but apparently
those "important ones" to affect performance.

Both issues are fixed in the attached v3, which also introduces two
additional improvements discussed in this thread:

- The chunk size is limited by ALLOCSET_SEPARATE_THRESHOLD, as for large
chunks AllocSet works just fine (thanks to keeping them out of free list
etc.)

- Instead of specifying blockSize and chunkSize, GenSlabCreate() now
accepts three parameters - minBlockSize, minChunkCount and chunkSize,
and computes the minimum block size (>= minBlockSize), sufficient to
store minChunkCount chunks, each chunkSize bytes. This works much better
in the auto-tuning scenario.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

slab-allocators-v3.tgzapplication/x-compressed-tar; name=slab-allocators-v3.tgzDownload
�2z�W�\ys����S�zvm
tY��1��mf$�+������G=j��>���}����/�&^���%��2���Ufv���n����'��gZm������=q��~����Qga�����|��9>>�����n�'���w�G���'p	��|�����oD�Y\��'�b3~�p������~>��\�q�m�9�^�����V�t��>������X'�c������#������?�6�#�7�.����)����������t��;v0Wo��%��S�C����1��9:�.����i�o�����]~���?;�@'���F��"���z�����K?��eF� }(\��J|�q(~�����z�-.d�P��8E���g���O�,X�I(��g�	3�"M�^:�������r���	r��_����'�s17�0����#4�8�� �=h�4����WzN����oL�e�	�Rab(=R��M�1C�f|��#�uj��_��h�4 �Hb��/2$�),7�t '�v���B�C�J�J(��F������e �#��x�`�H,�x&�Lid�]Qh�[�]��a��6��1�����`%�	������h��s������|�_��r����?���hm���<�?���������\��G���#����/�Y��$��[���=Qb�W\Qa����Ay�Z���/e���Z��GL������Gc6ZMC�Gz�/��ua���9&t��k�^O��v{
�g�?�?������#��ux��N;�nW��K��N�|)��A�������q,Z��T��x�����(;���)4���_�1����'�I�����~]��i�Y]�[0���q�@�FR�#�����(f��A�`z����^_p�^��,�;�7(n"��� %�k�v����`4�2��n����oj���up����{H��'s�'s�8�(����\��V�2W�X�m@`?2mT_G�O���DJOD��A�&�9��+�������n�d�k��Q,p	���cEf��cf-������1����dq�I�8ZH�\[��L����<r�
W�i��es)���{/0�*��A ��IG�������'f�I��
sj�"�|
V|�cN�H2t������Cz�����G�5*��������C�:M$_�Ph��y���\����6�Mg��w55��Y���}�E�����b�~���������������o���6�I� :Sj6���3��G��2��|����I�,�4c��v���Kvu~�q�������������_���W��7�?��:�l^d^!�r`A�B#����znae�W����.��������1������`�8�V#j33��`^�^�(�}+�����:0:7R��4�l�����qq��&/,E���q��=���o@v���L�$�+�x��m�E>p����|Lo��%�q�[
X2�Q����=��.k���(@v���t`�7�V��V]5�������E[Hjxm�`I)g����R������8	}eS��'�'{����Z�Scg?���(��-���@��Mbp��
����J��m 5w"��~�����;�m����om��v�GP��c�=$QS&C��G�k��I�Ack��E������K��'�(���Wz��?!�5�
u(\H*��M��y���om���`�����&�q����h����C�5��q�X��[��J72D��w���\S�7_��0��,��P�f!fOP�B����nS"��7�Z���}@R����c���@�/�l����������?��������7��l���Q1]O��42����&J}.��Sq2��1��I�>8��
Ax����B��8�a��`��B��KZ����>B�����I���!Y��&}��0)&�O��A���Urs���}������C�q�?�
�g6�[	j��F�S� �����.��������p�F��o�����SN�
R�SJ1���y[��/�������P�Q�ZP6/�X*�t���J����c����Hy������v
(��oc(�EC>bk�:���X/��M���vp�����b��XW�����scZ1��Ec�I��q3V"��M��0����,��?0N �R��`����$�V<"�w�hyKs�Q0+V�+�����c/E����p���E�9JH�h���t*CC��J8��*3�L���`��N��+@�v�%s;��,�2�}�^���U[�cxGR
���%�}�Uc�@��\-	����J#�X����+D��M�s
�H��W��)�9�g�9������H�����������v*5��L��
�k�}�t5,������yJ�73������7���}
w��M�sNY�I%��Z@���L�K#��=��i�y�����*�+�-���iQ^TN���`g�g��
����&M�CE��RG���-H��1�;������Irj�N�t��4��!{�i	@�iA�D4SN�k�w������T���j����
��7�@�L4������'2����x�R����W�\��f�*��DtJ{
�[�M���Q5�����3�Gct�������m�����;��_G���h4��_~\�8���4U��R���=����R�����,�(��H�[�;��J��ZoO��%�bU�];*�-��hU�V�m��9et�1=qy[���f��;���:Y��!6��C�YEs5eGJ�9�����u��fW��M^����/��?E�f����a&�G�.,G�����s�El��+{+[�l�8;���*t����6�K�Bh
���%��-�Z��d�]����U�m������aqwl�<3&v���������N&����iWe�������y��+�>R������U'2��F���V��{ {��A��}:��4�������Q�`�O����pb9n�	@��:��bl%����Ng_��u�B�{#?�Ibh���X��%���F�5��@��G�����`3�@���h(���c�Y��
~g~����9��:T'5��"��w(v�����G��M��w����x�v)�h����x��������O�wx>�{�W(n�+lTze�5ip�$d:��@�dB�Aa&
��p��(.5�.s����m0f$�	���&xv��@�!��

���q,V�;���a7��7�x��^���V�=>t��c$�������n�axI��Z�v�����au�'L����A�%�-�Lw��8�N��f��x0C7H"",bw���P��[�0�5$���~���0���Ae��C)��Q"6���	�_ 3�����J���@L���
�h�WB���'��c� ��fDM(>�)(����s�R*���],�\�$<>2�����e��D����������q�T�n�06Cs��G��#��kH <MC��H�Tf����S��~�cZn<7|����0:wL�{����R-�����}pamdLT ���P[��)�!#U���)^X�8v9<=���`Xz���������d9�D��
A���&xB�J���O��2�������-��#d4Cw�^�	�`]��B
�5�)�*:���e�	1�~K�����#~!x?��&�Y��\�lROMb��v�5f,�a���]�,�*�E1���sZN>%�����`�i��,US$��P&�����9�>����6AH`c��:�`��Q0��0^��6*L\����Z0���|��)���ZDu����Q��d�4Vh:�!F�$��G���
*�U4���1�DD�V������]�7R(����6:f�+y�~�Dxb�. q�L^�'�����g@��b{	�C��
Y���f�`�b�8�"�����
�H0�!]������C�m9����b@�L�x�|��Q��
\P�vd�H��|����n��RBQ�)>��,e�{�J'e5'd�b��P+��5��kS)����0D��C����Y��k��M\^}����?��1��|��o�1�	�=Z���C�|��1�#P4G�GxK�e��}VC�����(��.z��PP�H�AIg��.J�Y�oc<�Pm��y6���,�����������-a�")�A�n:
�_����,t�����`���7�b������(���{�w>��0)�M:jj0�4���Z	��&�N�K�=1xS<�0�?*�����T}���(�`��`������G ?
@R�j�|�,������mo7/���n�����	�kI��
��$^��6
��NZ�I��7����?T�H�	�mH��]���|
�M�T�%��6����@�H���H�A�U�vn��R{��p��q��$�!X�'2A�n�AL����lR��:C�5�������s�4�7�,Iry�w�"�r��&�?��4t1����-c��v:�;q1OzQ���Q"�G��>�-b��:���Rm`v���`��>>��������V2��@���c7�!�XB]A1�2����[�>B�5O�������������HL(�(0������Mu;�F:V�x��#�S�w4R���fP�k|BQD�0����Z
rKK��R`��FCb�$�N����t=�i���r�����'Fd:�1�����O�xie�P��cM�t"��Y�\�`��12�����j�O�����V�:��8*��������{1���w�b�BX,�6��mBT�������m�LL!��so�t�l>X�pRk��9�Z���*�`In�����I�������A��Ns@�<��k�^�*D&/FT��a��@�g���6����)(!S�HJNy��\3��k#TP�En���6/�������b����3{Q�������E�M���M���V?7���v��Z��W���5�������l������m��@���w�"�[����_=�M�zq5���O�Uk�I$�2Wn�6EK7H�	�N��&���9������l�2+�l�]�"�TH�-�o���$��^�Uz	�`v��~�WL���Y2�Z� �����~=c�s�-{��B�v�q+--;(�F$������u ��w�$C_N
|
�\�2���G�~���t��0+���������#)L"0����;�j�m�gAhW��0�<[�������{J���7<�N�����L������ ��/���q��U\���3G����Wv�A����(}���m�X�'J:�WDe���8������ld�����LO��+�Z���!�������~����!�F!�SMB������Pc�p:�j��N7v�p�M�O�:��y�;�q��)����6G����GK0�t�H7	K-��M��&m�K��3����H	���pS��:�+<@vWtg+����pD��oo6�����Zx�T���p�s���q�j�6��hN��fP���wel|�	F��W�S[�e@�Qne�@U�9�+��R!�W�;i�4M�fS^93�MU����*u�\T:y07p

�\�������P!>��K����3H�����p>-.))�M\��:Q
���,$��R_��x����:��
	���1�VM�?	�����������h�?�0	?�j���C)��:R`Xm8N��I���.�S���h��j��	wW6��:����r�����)LgF�K4Q�V��~xI�?��4T��|�����9��R3:���!������I
�������F�����-��d�C��c�+�ugY��~�*�������iB�n��)�	�t�9��	Eu�����n��aA�
�@�-�+! Y"�I�%~Y^�l4�0x(��4�)�����w�q[��K}
Dg����$��,�J,;����R��m�:Cr$��Hi[m�����R�������L����>7��=q���
�+t��a��H6��u�4ngK�L����#�,G�-�z{��/�S�533s3��G���-X�N��1�f�*�:XT���#V���&���X��cp��+�:7���?$��q���A(o��)��N0h������|75	�eE���1=|�����M�������l2��������3�R�w'���M���������j��q���^�"����{*�(����o������%�W�V�Q��/�%�7o�m]Z?�Wg	����'�(������'@�Y9�?�t+���vt�a��/����}����{M���}rQ�f�B��p��x��e���
uo���z��W��Z�1�U��������5�C��\��u�|������Y��r�����xm��S��b�����S�G�YH���uM���2M�MIrN���;e��
z�/��7�(�����m|�����_��yj�q:��P�yJ��]���|�~5x��m��Y�2����p���eF���z��@�?���2��q`����)? �������M��-�n9D�
!������>|�����3�9<�7�����N���t�P>A���T��u�}�B	�zc��
Z��\�,v`4�NI�M��~j|C8��P�I���T��F%	���G���]"��0�������e��7���Lw'��b��3Mr�J�f>!9��slt��Bq�V&C��XCh�����}����O�p��!f��C0�>58?��������M%��%��"���LZ�mM�����&i��2�N]$�)h�Z��&�0��`�9EO��t��	���f���,Lf���l<Aq�t�x�N�%�P{c4���& ������b8�Cy�KJ��NI8#�l�j�����6S����$�
�������un}��ux�c���P������o�B�s��N���{,A��X_e���
>� �w�>X��bf�"�bS�I��ov�L�;9�	����q��z��&��l4OI��C�/���Z�oc�b�j�D�`�jB6����h�J��)���Uc{����g��k���`B������#��U�,�}�7��i���o}����������(��kT�
�vG}+����O:L�����n0d'!E�������)�FR��A����N7.��1q��[C��������[;|L�]y����C�<z����ff�]���4�V�4�p"z<�FR��n	����j
`��-G�a�xp`Mb�f����R����R�]���fBXV����z@���b�L�����������A�u��)z�C�%H�@+6���sC��v�����������~���a?�4��^����	u]~B��f������7�t&�����Y?|C-������"�����@t����:}cu�#������6)�`���S�y��V�����O@�x�t_���s�o��q�Kb�OC��#y�
���'�X�Mv&R�L��i��W'ZL��r����p���u<���I����,|�\�p��a�9|'�<K�lk.��A�MWf����O�� ���(��@cg���?�ut���Z�����t���S�q����g�`���:�HaZ;�����AT��a���j����YBSu9��:�����m�1	�Yig$�Cj:��i
n�$���������|R�e�Pq����(���[Rx8������@�P�E��d:�_��M��g������t�Q�"�B��{t,��'�$L�7���=����@�#�Z?������|3pk����x��E�xof���o�2	����i���#v����|2G�/6M��L�c�J�buI|���SN.��@�OQ88F��������B�gG��B����������%e���-�E D�{����<@�V��0�7�X� *��sXja��J�pL�)�0�s@o��'B��X��7%^�MEN_���?�P��2���CH��i�<�Tp������h���QTB�:ez�~���*;��5��������>���G��P�p�H�N�;_����:����_�6��0�������w��{
z=R���Y��*���4z%�g�%V�-*L>�i0�!�H�kA05C,���y3��~����{���3����c���E����M;�y�F�=V�J�t3\
W��K�D�
��
�(d�g�%ML�4�a���d�����2@����'6��6	�u0�
�Cy9����wL?�5����7�����7/�E��8�<m��rl��%YGG``W����cf ,��@�na��H<��T���FH�u������b�z9���U��e��,��=|c>�J�N��2�5`��1\F(������
ky��|Lq){`H�IPQT�c��f��*�7]&F�\�A����1'����B��t����=��)���R��	�#�5���m]Q���D#���E��i�g2 O��AMA��{2���k<��^�@iv�wV�pT5�/�g.�)
j���������~�����d"�����u������u+.���e�}�N6A�����	�}�<���[�^t8�Uu�@�%�5��S�a,��xh��q^�w�E>
{�����^�Fhi��������&s��9������������\�o����5��1Sp[�t	��V�yk�O�|dt�.�����
�q�ug� �w9EGR#���j;9U��8���g�S�Z������/qBku���7����B��\��T:;RQ��w���;������t*Ms����_�w=R���
yEs��q�.���n���/R����%zY~��	��I4�����@!�a�s^�W�{�_���7��Y����`���hjw��G��=8�5�Z6H����}Va�U���gr;F���A�E�������B3���.��~&�~Mh����Z�������������*�)�W�O$+���Fd�apt��,~L1l� %�*�GxQ��
�y����	Ff�����v�IqQ[�O��SJi��q�T����5F���/Sq��=�L�G�r�����1�26z�����V��B'Nc��N���C=�F�x��>�(E�����Anp�O�L��8.	W�t��Q�cU���p��*dQ
�+�%�����4Q���8&�	�-�^YX������/�.#�(-����^������Q0f��5���L���1PmUe ��cC��7*OE�_JXc�����"�)����/w�2`�=��1|]���X\�](�x�V�ex(z}N�P���<�'���#
5l�-|�i��T���9��)*$Q:4�e0�sh��@L
��T=���;����	��O�~�w�	������4B�@4P|M`�M��1M��rk�����+���4��O���DTX�!i����	7��m����E�P�M��j��\e�r��g�V��r�}t�iy+�C+����1��V�4�-co�G2���p�s�*�N�6Z7��-H����c3"�����h�H�:���:�GJL�YT���!��,��,�C���C�:� #�5xg����f����hM8��o3�4*����F(n2b���Xa<���LN97���D�O�|�|�4j�q^�"�@�r��� �P�h\���������.qfJSa,8K��3�J.i���?��p2`�����?����������>��a�������z.j�0%g
8UrD�I�����{�F7I��������p����0�)&�D�Y
[B�5R�a����j��Q�tK��L,;��-S���)��i�b�����|���B^��pg�p	����.$�,�x���*nm����e��L����o��&:�f����mb`���0#������RF0���4$s'���
�Y��4vj+d7m��	�o����a$Z
���7B�YP���T��c������D_�`���h��\	��"���=�,2���S�Ii�{7�w�g��LRC�i�)�mn��������#u�K����,!�Vtk.@u����5����������!�z>���`Rn�	1�}��o�aA- d�P:/b�KXi�/��+E4����r���$8
��[�{,�?jU>��;f&Z�j�����rr�8�zc&��H�w�y| \h��=us�AeD��<�wgk8���g��������{������Y���Z�l|�m�W=7!����[t�f���!'����L����O	�R�gW��������8(}���!yBS��/��}l������%h��]�^�Y���Q�Gc��`���c�6������j���zct	�E3eH�/������n�[
D,��IoW�T��@����4����)O
Ce3 L�����'0<67M�T�����M�6`��I�z��&�p�AH�CH����,E:%�K!��1��~� �p_���f���h\C�L�Y|D�}��b�\z[��!�dM3x}=��"�)���k~����
�t�������z �D}���+����}u[�t-0�|WTp<|vb��jj"��c�������o�`*�,�M��C�If� ����_�g��]8�D�P(s�y�x�ABw�p�w+	���R�������S@<��834l�C���P��g�_���!��M�:��)���<�����A�XmA���!�����=�4j�i�(���F���5}��;)yG)���Z85�����@�,E�g�o1���p��"�@��}�	=���}(���V8E��)dn&�`�7���=��~��Q[�+����^��3?���<4�Me���}�J�a�a4Rh������c:W#��~�@�qi������������R���O��:�;�s�d�:�d��l s\�_i\5�����A��2��r��P����on��	�@MY0��
s��2{�f��z�/�O>��}�
�{�}��d��/�G��;�^b����`i�8;�Z�[�$����r�$/R��hi�A���	j�YBy1�
��@��	M���}W���#���*�xze���5e����~������������;�bA/�}���b�1=����:�z`�,���M�c�98���@J����
��T_�bc���"���F�� �)��e�=V��D\��r�bt`<�2|��+"�X���B��yu��c#>^#��x���a������&FAV���6����y:Y��iF���i�-�>"zMor9��+@F�3�P_���s|��L��\U��8����c;��3E��n����J�Q�i�o�r��E'B�4����~X�07=/�]D��`���+�����"q�n�@������������A}7]��DKJ)���h��+������*5��#_�����P�u�8x�3��gB����m��r�����VVP_�o���1�j��XeA��Rk�r/*��@���I�9�O����&���K��m��`��e,��E��%��2�h�������n��4�Y�S��[�'�(��R�O��t�PMG,��\��7l*�y��NQa���S8�w�O�����������������m����&�����5�m�]f�������%���Y�s~6U/�f/5Om�w�*��9��Dn�]gn0�����r���]�0P��p#`
�i��]�a>{�����o�{����\��%q��������rr��d������[N8Z��d�M��{�~���Mv6wv��gpIM�@N��m�9���_��\^�5LQ6�{n"�l����J�}e%	Xl����(� �����heI�K�_UAO �AF�`�^�<9:=�q��*[,�vw�{;�^����zi�J-Z&�F��-X$��#W)��@����,��s�KS9;�����n7�A0wp�=W@
�-uv��T=���f�~:R8iZ*Kx�m�x���=�\����~����s�XR�gu/������Jw����N���������0��q����d�t �t�z������F�	�f�0�p�g?��v�@�b�s!���g��%E{�Hy	��]D��r��>����q�I���Z�~8��VM�?�P���3?�i�a
��fz�)���-|N��8������hg�f.y1�tY���b*����z7r�����1o�t{cc3��Z�n��-�����4W��o 9�?�M��z���8��{)��b�������{���]%��#H����<�>��hO~w�;�L��d����Ay}���)�kY���sM��6Zd�����
�q2�Y���B�bc�B�%�L�� {"����t�zaw�����z�5:#�i�������������N<y�D���^��K��e������9 T*X�/�����#�"�u��M���-����s����X���Z[kkZ}��|��v��.�d�D)���QQ]&�k`�������&���l��_�����h�����?��m�������������n����N���?�7�<�������F�qq�������5l�uv/���docg���%�
-khv�4�V��n?����^�5h��:�\iA�o��`��G3����KP��g-M��i�"}���AC�S�������n<��~��/��nn�t���C��W�g?�����(�I���]�\Rjkp��)��![�&x��%[��Q�r���J�tU�����������`s?�����5u��������S��Oa{���'�-��B�7^Sm�,��5;��W,Ad���+s<_�+������f���}������G��Z���^W�.�U�Ze�+�
$�p����y�����mk0'�n����i��N'����M������[Z
���/�����8�`����h���0
Q���^{Z&�!o��P7��C1���|l�~<��>W���<K�%�\���PX����p<!�%
�i-���Y��iK�8����O/�Q�}�`��O?��5g��Y0#Vl�c�s�v���<�Sw]>���Y��/i�%e�_�-,�
e�p���I>�������WG����<�|������o������>�!�z
���8K0
�1D��+�����7���1(xv�tvv������eV�H�Z.�@���r�_�|�����������^$|p
�`OV��3���je��s�'�����kt":���[q*b�P�����e���bYI��U�[r��(Z����w���C�����>�����������-}�mnl6:��z>Kg��~o��i�yqL��tL��	|���;qL�k���fCh��!�T!!C���7T�O{�9P���+�fu@@H�c~I>\�s���Y\�*|&?4�5U�Noz�N�s��G��� *9%�����mI��U}�}�l�t!B�qp�6
2�t.��j32�Z����[[����`�z��u������7G���q��>����6���4L�8�)�O>
>j�e�9�S7�a,[6�f�!1I����Hte��;y�OS�U��2�����J�O���X-W�p�\-x`�.M�R��b
�p{	��;���?:�O�9}�Z8a��R�k�^�Z���b��m��pD)gl#pg��idD�x����8�i�[��*�!�4W�`���l�_{�,��������9�;��&������V��p����P_r��MA��3���?�^(��D���,ymt;����V����J7�b�+�F^���B�s
	]2�f��`�)��U� ���7�T��z`���)GR�UY���0�lQ�l�7N���)��AL�U����d��o�o�|'ku�������3��+fX����[W��Z@�A)�W'����-D|+Q����M��{�.��A���x��e�^+�n���5@�������>�_��|���N	��Y�<��L�9���1Z0��@V�M�Q��$/�~JDI����G��j�|Ud�7R*�	i_���B4��k��k����r����s���?adr��������m���������? ����pqDW0�7�����P~3H��,a`'6f�;9:��8���*�=!Hb��>p-�%rcU���JC����7�(���0�2t.,�g�9I�i���z����A����������j4J�^M�j~��0,,�(Z���]0M/��dj�
�����t������h�0�sJI
�H|@��|
g!������3��9�d���3�+����W������S�[!#Ng�q�g���2���b.�0k�$��
�A,/���~��	���%.��K3��*�/n	�
���!�_k�
�{A{2
��(�-S���������)��]l1l�= T�dC���!
�J�
=�|(#�/��`�����[��	
�+2�r"2{<R��p�@1�9��h&*�T��t$C9�P�t"�;�x�KN2C��&U� X�*�3S��&nZn���5(���rWj���E�W
4)P����R .�k3H��4D���0���Q.&�����1�Zg�"�#�����Rd
�+$6x��
]!�J\���Gzp1 �l����W�
�+.���t�d����k;����8��
�^�_5���
B���	��EE9�x���Q�O���3�A�<��_�X����������D%�MTI2�,w����'�i}5��G��5���W��u:��L�uuP�0:���w�����s���%{<�A��\+d��c@��
���1�����Te,�)�����'���[�:f{��@Z��Z�s�'p���	��}��/��fd4�3y��u�'<�Oyu�S��ey��*.�d2�}�6W^5n?���'��S�-^�,yW�Fi��}V�T�����o�W���oi�T�D�?eQ������o!�p�7�9���mw�e���t�|=P@���h�����W����w^�u�i���!���0j��`���^7���o�*8f&�d���� �n���E���6�?r�N��c�������P��tas����_�5�,J�O���
:����������g����0��e�WN���C_���y,�(���
	���nm/A����l����z�e�����
��Rh�	2~��E3LF��s+�\��S������U	���N����M/u��%Epp�x�1�
��SJ�Iku�C��c����a�A�?NV������_C���v���~?���Mi�Jp[$�K�H�cX>�<aL�l�!%��C�)��M�CB���a@'=$�P��W8�|LnZ�n`��*�gNw9�v�WWz���k���zzATu���@8�YQ�����O�����������(m��X��"��������`���i��
���p���X���\9E��1���Gz"������M���2X&�UN�JA��������P��l�K���������b���Hi"@P����c0�����!5�2��SYyZ����u��j2�SM���n{�O�,���d�-�`�#��J��L������84zqXx	�+t��}HO��=V�qy�>!r,�o;�`&��G��1eT�����7D�xxfz�?0����{�P*�1h��-�_h]i���Lp�z�7���0�����>�&T���%��Q�lC>�H�
�������U������E �tM�Z�����=�JI�WI��R�}�}�! G	n��f���	�L�s���������L'1��N����e�5�=!E����=���%��cg3��4�|���qI��h�+�E�
�A�;�
�Mw�4H��
�-iG?��n�2��Q������1��9��2��o>��8���m:�5����4��b'9���/,����ot���e�8���M[���m������V����7/�M�������?�8��oi�/�H+&?�F��� E�9����0�-��QF��h��Mw�l��-�e��<o�Ib+�`��������f�d�|���3�.�����\+�{1�k$����$X/�Jf�Y��Vl�Z�
|!Cn�R%K��U�������|u������+�'�m���$�X��]J�oF�'79�����]�x���)0���J�����������e��������)��=��Bo	��e����!��:>�c�$[{I��j'��dk)�X��W�^)|+76w;�N���T��lr�1�">GE2����	 ��Y38�@�m������h�A]�\H�z�*��G��{]ujk���2^YU|�~��q���v���#D�]�F?:A�C������
�b�?�=�^�`��b��c�}�"�}a~��	
�rD�a"9���8W����+M�&H����t�ON=��c*��,�\���(���(Q#6.�����jt�5��A�7�pc]�K~nf��t8'�k
tS��ux��R��������V`u��X���=���?P]�"w�gG)���|et9�k��$�nl����r�4t���qK���
�5UX(0��nPu�P�ju�������B�mT�v)T�m@�TR�u���
�����.�^w������:��)�#��r�����4�\�4WVB �q������1��fi�E�+�����J`��2hc����mov/��h�&�z�?��Bp��Y]����d��[�q-l�+0\���������
]�����@u+��g�Y}V��g�Y}V��g�Y}V��g�Y}V��g�Y}V��g�Y}V��g�Y}V��g�Y}V��g�Y}n��_���5
#15Petr Jelinek
petr@2ndquadrant.com
In reply to: Tomas Vondra (#13)
Re: PATCH: two slab-like memory allocators

On 05/10/16 03:11, Tomas Vondra wrote:

On 10/04/2016 09:44 PM, John Gorman wrote:

Remind me again why we cannot realloc in place for sizes
smaller than chunkSize? GenSlab is happy to allocate smaller
sizes and put them into the fixed size chunks.

Maybe SlabAlloc can be happy with sizes up to chunkSize.

if (size <= set->chunkSize)
return MemoryContextAlloc(set->slab, size);
else
return MemoryContextAlloc(set->aset, size);

That'd be certainly possible, but it seems a bit strange as the whole
Slab is based on the idea that all chunks have the same size. Moreover,
people usually realloc() to a larger chunk, so it does not really fix
anything in practice.

In GenSlab, the situation is more complicated. But I don't like the
coupling / moving chunks between contexts, etc.

If realloc() support is a hard requirement, it immediately rules out
SlabContext() as an independent memory context. Which seems stupid, as
independent Slab contexts are quite useful for reorderbuffer use case.

For GenSlab the situation is less clear, as there probably are ways to
make it work, but I'd vote to keep it simple for now, and simply do
elog(ERROR) in the realloc() methods - both for Slab and GenSlab. The
current use case (reorderbuffer) does not need that, and it seems like a
can of worms to me.

Hmm, so this in practice means that the caller still has to know the
details of what chunks go where. I would prefer if the realloc just
failed always and "don't do realloc on GenSlab" would be part of spec of
hat context, the randomness that you described originally is the main
problem IMHO. Maybe you could add new "constructor" function for Aset
that would create Aset which can't realloc for use inside the GenSlab?

Alternative would be of course having the individual API calls behind
Aset and Slab exported and used by GenSlab directly instead of using
child contexts. Then all the calls would go to GenSlab which could
decide what to do (and move the chunks between the allocators).

But honestly given the usecases for GenSlab, I would at the moment
prefer just to have predictable error as it can be done more cleanly and
nobody needs the functionality so far, it can be revisited once we
actually do need it.

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#16John Gorman
johngorman2@gmail.com
In reply to: Tomas Vondra (#13)
Re: PATCH: two slab-like memory allocators

On Tue, Oct 4, 2016 at 10:11 PM, Tomas Vondra

For GenSlab the situation is less clear, as there probably are ways to make

it work, but I'd vote to keep it simple for now, and simply do elog(ERROR)
in the realloc() methods - both for Slab and GenSlab. The current use case
(reorderbuffer) does not need that, and it seems like a can of worms to me.

Good plan. Realloc can be added later if there is an actual use case.

#17Robert Haas
robertmhaas@gmail.com
In reply to: Tomas Vondra (#14)
Re: PATCH: two slab-like memory allocators

On Wed, Oct 5, 2016 at 12:22 AM, Tomas Vondra
<tomas.vondra@2ndquadrant.com> wrote:

attached is v3 of the patches, with a few minor fixes in Slab, and much
larger fixes in GenSlab.

Slab (minor fixes)
------------------
- Removed the unnecessary memset() of new blocks in SlabAlloc(), although we
still need to zero the free bitmap at the end of the block.
- Renamed minFreeCount to minFreeChunks, added a few comments explaining
why/how firstFreeChunk and minFreeChunks are maintained.
- Fixed / improved a bunch of additional comments, based on feedback.

I had a look at 0001 today, but it seems to me that it still needs
work. It's still got a lot of remnants of where you've
copy-and-pasted aset.c. I dispute this allegation:

+ * SlabContext is our standard implementation of MemoryContext.

And all of this is just a direct copy-paste; I don't really want two copies:

+ *  When running under Valgrind, we want a NOACCESS memory region both before
+ *  and after the allocation.  The chunk header is tempting as the preceding
+ *  region, but mcxt.c expects to able to examine the standard chunk header
+ *  fields.  Therefore, we use, when available, the requested_size field and
+ *  any subsequent padding.  requested_size is made NOACCESS before returning
+ *  a chunk pointer to a caller.  However, to reduce client request traffic,
+ *  it is kept DEFINED in chunks on the free list.

And then there's this:

+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+            fprintf(stderr, "AllocFree: %s: %p, %d\n", \
+                (_cxt)->header.name, (_chunk), (_chunk)->size)
+#define SlabAllocInfo(_cxt, _chunk) \
+            fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \
+                (_cxt)->header.name, (_chunk), (_chunk)->size)

Well, it's pretty stupid that AllocSetAlloc is reporting it's name as
AllocAlloc, a think that, as far as I can tell, is not real. But
having this new context type also pretend to be AllocAlloc is even
dumber.

+static void
+randomize_mem(char *ptr, size_t size)
+{
+    static int  save_ctr = 1;
+    size_t      remaining = size;
+    int         ctr;
+
+    ctr = save_ctr;
+    VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+    while (remaining-- > 0)
+    {
+        *ptr++ = ctr;
+        if (++ctr > 251)
+            ctr = 1;
+    }
+    VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
+    save_ctr = ctr;
+}
+#endif   /* RANDOMIZE_ALLOCATED_MEMORY */

Another copy of this doesn't seem like a good idea, either.

More broadly, I'm not sure I like this design very much. The whole
point of a slab context is that all of the objects are the same size.
I wouldn't find it too difficult to support this patch if we were
adding an allocator for fixed-size objects that was then being used to
allocate objects which are fixed size. However, what we seem to be
doing is creating an allocator for fixed-size objects and then using
it for variable-size tuples. That's really pretty weird. Isn't the
root of this problem that aset.c is utterly terrible at handling large
number of allocations? Maybe we should try to attack that problem
more directly.

On a related note, the autodestruct thing is a weird hack that's only
necessary because of the hijinks already discussed in the previous
paragraph. The context has no fixed lifetime; we're just trying to
find a way of coping with possibly-shifting tuple sizes over time.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#18Petr Jelinek
petr@2ndquadrant.com
In reply to: Robert Haas (#17)
Re: PATCH: two slab-like memory allocators

On 18/10/16 22:25, Robert Haas wrote:

On Wed, Oct 5, 2016 at 12:22 AM, Tomas Vondra
<tomas.vondra@2ndquadrant.com> wrote:

attached is v3 of the patches, with a few minor fixes in Slab, and much
larger fixes in GenSlab.

Slab (minor fixes)
------------------
- Removed the unnecessary memset() of new blocks in SlabAlloc(), although we
still need to zero the free bitmap at the end of the block.
- Renamed minFreeCount to minFreeChunks, added a few comments explaining
why/how firstFreeChunk and minFreeChunks are maintained.
- Fixed / improved a bunch of additional comments, based on feedback.

I had a look at 0001 today, but it seems to me that it still needs
work. It's still got a lot of remnants of where you've
copy-and-pasted aset.c. I dispute this allegation:

+ * SlabContext is our standard implementation of MemoryContext.

Are you looking at old version of the patch? I complained about this as
well and Tomas has changed that.

And then there's this:

+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+            fprintf(stderr, "AllocFree: %s: %p, %d\n", \
+                (_cxt)->header.name, (_chunk), (_chunk)->size)
+#define SlabAllocInfo(_cxt, _chunk) \
+            fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \
+                (_cxt)->header.name, (_chunk), (_chunk)->size)

Well, it's pretty stupid that AllocSetAlloc is reporting it's name as
AllocAlloc, a think that, as far as I can tell, is not real. But
having this new context type also pretend to be AllocAlloc is even
dumber.

You are definitely looking at old version.

More broadly, I'm not sure I like this design very much. The whole
point of a slab context is that all of the objects are the same size.
I wouldn't find it too difficult to support this patch if we were
adding an allocator for fixed-size objects that was then being used to
allocate objects which are fixed size. However, what we seem to be
doing is creating an allocator for fixed-size objects and then using
it for variable-size tuples. That's really pretty weird. Isn't the
root of this problem that aset.c is utterly terrible at handling large
number of allocations? Maybe we should try to attack that problem
more directly.

It's used for TXNs which are fixed and some tuples (there is assumption
that the decoded tuples have more or less normal distribution).

I agree though that the usability beyond the ReoderBuffer is limited
because everything that will want to use it for part of allocations will
get much more complicated by the fact that it will have to use two
different allocators.

I was wondering if rather than trying to implement new allocator we
should maybe implement palloc_fixed which would use some optimized
algorithm for fixed sized objects in our current allocator. The
advantage of that would be that we could for example use that for things
like ListCell easily (memory management of which I see quite often in
profiles).

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#19Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Petr Jelinek (#18)
Re: PATCH: two slab-like memory allocators

On 10/19/2016 12:27 AM, Petr Jelinek wrote:

On 18/10/16 22:25, Robert Haas wrote:

On Wed, Oct 5, 2016 at 12:22 AM, Tomas Vondra
<tomas.vondra@2ndquadrant.com> wrote:

attached is v3 of the patches, with a few minor fixes in Slab, and much
larger fixes in GenSlab.

Slab (minor fixes)
------------------
- Removed the unnecessary memset() of new blocks in SlabAlloc(), although we
still need to zero the free bitmap at the end of the block.
- Renamed minFreeCount to minFreeChunks, added a few comments explaining
why/how firstFreeChunk and minFreeChunks are maintained.
- Fixed / improved a bunch of additional comments, based on feedback.

I had a look at 0001 today, but it seems to me that it still needs
work. It's still got a lot of remnants of where you've
copy-and-pasted aset.c. I dispute this allegation:

+ * SlabContext is our standard implementation of MemoryContext.

Are you looking at old version of the patch? I complained about this as
well and Tomas has changed that.

And then there's this:

+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+            fprintf(stderr, "AllocFree: %s: %p, %d\n", \
+                (_cxt)->header.name, (_chunk), (_chunk)->size)
+#define SlabAllocInfo(_cxt, _chunk) \
+            fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \
+                (_cxt)->header.name, (_chunk), (_chunk)->size)

Well, it's pretty stupid that AllocSetAlloc is reporting it's name as
AllocAlloc, a think that, as far as I can tell, is not real. But
having this new context type also pretend to be AllocAlloc is even
dumber.

You are definitely looking at old version.

Yeah.

More broadly, I'm not sure I like this design very much. The whole
point of a slab context is that all of the objects are the same size.
I wouldn't find it too difficult to support this patch if we were
adding an allocator for fixed-size objects that was then being used to
allocate objects which are fixed size. However, what we seem to be
doing is creating an allocator for fixed-size objects and then using
it for variable-size tuples. That's really pretty weird. Isn't the
root of this problem that aset.c is utterly terrible at handling large
number of allocations? Maybe we should try to attack that problem
more directly.

It's used for TXNs which are fixed and some tuples (there is
assumption that the decoded tuples have more or less normal
distribution).

Yeah. There are three contexts in reorder buffers:

- changes (fixed size)
- txns (fixed size)
- tuples (variable size)

The first two work perfectly fine with Slab.

The last one (tuples) is used to allocate variable-sized bits, so I've
tried to come up with something smart - a sequence of Slabs + overflow
AllocSet. I agree that in hindsight it's a bit strange, and that the
"generational" aspect is the key aspect here - i.e. it might be possible
to implement a memory context that allocates variable-length chunks but
still segregates them into generations. That is, don't build this on top
of Slab. That would also fix the issue with two allocators in GenSlab.
I'll think about this.

I agree though that the usability beyond the ReoderBuffer is limited
because everything that will want to use it for part of allocations will
get much more complicated by the fact that it will have to use two
different allocators.

It wasn't my (primary) goal to provide allocators usable outside
ReorderBuffer. I've intended to show that perhaps using AllocSet and
then trying to compensate for the pfree() issues is the wrong direction,
and that perhaps different allocation strategy (exploiting the
ReorderBuffer specifics) would work much better. And I think the two
allocators show prove that.

I was wondering if rather than trying to implement new allocator we
should maybe implement palloc_fixed which would use some optimized
algorithm for fixed sized objects in our current allocator. The
advantage of that would be that we could for example use that for things
like ListCell easily (memory management of which I see quite often in
profiles).

I don't see how inveting palloc_fixed() solves any of the problems, and
I think it's going to be much more complicated than you think. The idea
of injecting this into AllocSet seems like a dead-end to me, as the code
is already complex enough and it's likely to cause regressions no matter
what you do.

I prefer the idea of implementing separate specialized memory contexts.
If the bar is moved to "implement palloc_fixed()" or something like
that, someone else will have to do that - I'm not all that interested in
ReorderBuffer (this was the first time I actually saw that code), so my
motivation to spend much more time on this is rather small.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#20Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Tomas Vondra (#19)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

On 10/19/2016 02:51 PM, Tomas Vondra wrote:

...

Yeah. There are three contexts in reorder buffers:

- changes (fixed size)
- txns (fixed size)
- tuples (variable size)

The first two work perfectly fine with Slab.

The last one (tuples) is used to allocate variable-sized bits, so I've
tried to come up with something smart - a sequence of Slabs + overflow
AllocSet. I agree that in hindsight it's a bit strange, and that the
"generational" aspect is the key aspect here - i.e. it might be possible
to implement a memory context that allocates variable-length chunks but
still segregates them into generations. That is, don't build this on top
of Slab. That would also fix the issue with two allocators in GenSlab.
I'll think about this.

And here is a fairly complete prototype of this idea, adding "Gen"
generational memory context based only on the "similar lifespan"
assumption (and abandoning the fixed-size assumption). It's much simpler
than GenSlab (which it's supposed to replace), and abandoning the idea
of composing two memory contexts also fixed the warts with some of the
API methods (e.g. repalloc).

I've also been thinking that perhaps "Gen" would be useful for all three
contexts in ReorderBuffer - so I've done a quick test comparing the
various combinations (using the test1() function used before).

master slabs slabs+genslab slabs+gen gens
----------------------------------------------------------------
50k 18700 210 220 190 190
100k 160000 380 470 350 350
200k N/A 750 920 740 679
500k N/A 2250 2240 1790 1740
1000k N/A 4600 5000 3910 3700

Where:

* master - 23ed2ba812117
* slabs - all three contexts use Slab (patch 0001)
* slabs+genslab - third context is GenSlab (patch 0002)
* slabs+gen - third context is the new Gen (patch 0003)
* gens - all three contexts use Gen

The results are a bit noisy, but I think it's clear the new Gen context
performs well - it actually seems a bit faster than GenSlab, and using
only Gen for all three contexts does not hurt peformance.

This is most likely due to the trivial (practically absent) freespace
management in Gen context, compared to both Slab and GenSlab. So the
speed is not the only criteria - I haven't measured memory consumption,
but I'm pretty sure there are cases where Slab consumes much less memory
than Gen, thanks to reusing free space.

I'd say throwing away GenSlab and keeping Slab+Gen is the way to go.

There's still a fair bit of work on this, particularly implementing the
missing API methods in Gen - GenCheck() and GenStats(). As Robert
pointed out, there's also quite a bit of duplicated code between the
different memory contexts (randomization and valgrind-related), so this
needs to be moved to a shared place.

I'm also thinking that we need better names, particularly for the Gen
allocator. It's supposed to mean Generational, although there are no
explicit generations anymore. Slab is probably OK - it does not match
any of the existing kernel slab allocators exactly, but it follows the
same principles, which is what matters.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

slab-allocators-v4.tgzapplication/x-compressed-tar; name=slab-allocators-v4.tgzDownload
#21Robert Haas
robertmhaas@gmail.com
In reply to: Petr Jelinek (#18)
Re: PATCH: two slab-like memory allocators

On Tue, Oct 18, 2016 at 6:27 PM, Petr Jelinek <petr@2ndquadrant.com> wrote:

I agree though that the usability beyond the ReoderBuffer is limited
because everything that will want to use it for part of allocations will
get much more complicated by the fact that it will have to use two
different allocators.

I was wondering if rather than trying to implement new allocator we
should maybe implement palloc_fixed which would use some optimized
algorithm for fixed sized objects in our current allocator. The
advantage of that would be that we could for example use that for things
like ListCell easily (memory management of which I see quite often in
profiles).

The sb_alloc allocator I proposed a couple of years ago would work
well for this case, I think.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#22Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Robert Haas (#21)
Re: PATCH: two slab-like memory allocators

On 10/20/2016 04:43 PM, Robert Haas wrote:

On Tue, Oct 18, 2016 at 6:27 PM, Petr Jelinek <petr@2ndquadrant.com> wrote:

I agree though that the usability beyond the ReoderBuffer is limited
because everything that will want to use it for part of allocations will
get much more complicated by the fact that it will have to use two
different allocators.

I was wondering if rather than trying to implement new allocator we
should maybe implement palloc_fixed which would use some optimized
algorithm for fixed sized objects in our current allocator. The
advantage of that would be that we could for example use that for things
like ListCell easily (memory management of which I see quite often in
profiles).

The sb_alloc allocator I proposed a couple of years ago would work
well for this case, I think.

Maybe, but it does not follow the Memory Context design at all, if I
understand it correctly. I was willing to give it a spin anyway and see
how it compares to the two other allocators, but this is a significant
paradigm shift and certainly much larger step than what I proposed.

I'm not even sure it's possible to implement a MemoryContext based on
the same ideas as sb_alloc(), because one of the important points of
sb_alloc design seems to be throwing away the chunk header. While that
may be possible, it would certainly affect the whole tree (not just the
reorderbuffer bit), and it'd require way more work.

Moreover, the two allocators I proposed significantly benefit from the
"same lifespan" assumption. I don't think sb_alloc can do that.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#23Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Tomas Vondra (#22)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

On 10/22/2016 08:30 PM, Tomas Vondra wrote:

On 10/20/2016 04:43 PM, Robert Haas wrote:

...

The sb_alloc allocator I proposed a couple of years ago would work
well for this case, I think.

Maybe, but it does not follow the Memory Context design at all, if I
understand it correctly. I was willing to give it a spin anyway and see
how it compares to the two other allocators, but this is a significant
paradigm shift and certainly much larger step than what I proposed.

I'm not even sure it's possible to implement a MemoryContext based on
the same ideas as sb_alloc(), because one of the important points of
sb_alloc design seems to be throwing away the chunk header. While that
may be possible, it would certainly affect the whole tree (not just the
reorderbuffer bit), and it'd require way more work.

Moreover, the two allocators I proposed significantly benefit from the
"same lifespan" assumption. I don't think sb_alloc can do that.

I've given the sb_alloc patch another try - essentially hacking it into
reorderbuffer, ignoring the issues mentioned yesterday. And yes, it's
faster than the allocators discussed in this thread. Based on a few very
quick tests on my laptop, the difference is usually ~5-10%.

That might seem like a significant improvement, but it's negligible
compared to the "master -> slab/gen" improvement, which improves
performance by orders of magnitude (at least for the tested cases).

Moreover, the slab/gen allocators proposed here seem like a better fit
for reorderbuffer, e.g. because they release memory. I haven't looked at
sb_alloc too closely, but I think it behaves more like AllocSet in this
regard (i.e. keeping the memory indefinitely).

FWIW I'm not making any conclusions about sb_alloc benefits outside
reorderbuffer.c - it might easily be worth pursuing, no doubt about
that. The amount of remaining work however seems quite high, though.

Attached is the modified sb_alloc patch that I used - it's mostly v1
with removed uses in nbtree etc. FWIW the patch does not implement
sb_destroy_private_allocator (it's only defined in the header), which
seems like a bug.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

sballoc-v2-tomas.patchtext/x-diff; name=sballoc-v2-tomas.patchDownload
diff --git a/contrib/test_freepage/Makefile b/contrib/test_freepage/Makefile
new file mode 100644
index 0000000..b482fe9
--- /dev/null
+++ b/contrib/test_freepage/Makefile
@@ -0,0 +1,17 @@
+# contrib/test_freepage/Makefile
+
+MODULES = test_freepage
+
+EXTENSION = test_freepage
+DATA = test_freepage--1.0.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/test_freepage
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/test_freepage/test_freepage--1.0.sql b/contrib/test_freepage/test_freepage--1.0.sql
new file mode 100644
index 0000000..5d3191e
--- /dev/null
+++ b/contrib/test_freepage/test_freepage--1.0.sql
@@ -0,0 +1,15 @@
+/* contrib/test_freepage/test_freepage--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_freepage" to load this file. \quit
+
+CREATE FUNCTION init(size pg_catalog.int8) RETURNS pg_catalog.void
+	AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+CREATE FUNCTION get(pages pg_catalog.int8) RETURNS pg_catalog.int8
+	AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+CREATE FUNCTION inquire_largest() RETURNS pg_catalog.int8
+	AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+CREATE FUNCTION put(first_page pg_catalog.int8, npages pg_catalog.int8)
+	RETURNS pg_catalog.void AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+CREATE FUNCTION dump() RETURNS pg_catalog.text
+    AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
diff --git a/contrib/test_freepage/test_freepage.c b/contrib/test_freepage/test_freepage.c
new file mode 100644
index 0000000..074cf56
--- /dev/null
+++ b/contrib/test_freepage/test_freepage.c
@@ -0,0 +1,113 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_freepage.c
+ *		Test harness code for free page manager.
+ *
+ * Copyright (C) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		contrib/test_freepage/test_freepage.c
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "utils/builtins.h"
+#include "utils/freepage.h"
+
+PG_MODULE_MAGIC;
+PG_FUNCTION_INFO_V1(init);
+PG_FUNCTION_INFO_V1(get);
+PG_FUNCTION_INFO_V1(inquire_largest);
+PG_FUNCTION_INFO_V1(put);
+PG_FUNCTION_INFO_V1(dump);
+
+Datum		init(PG_FUNCTION_ARGS);
+Datum		get(PG_FUNCTION_ARGS);
+Datum		inquire_largest(PG_FUNCTION_ARGS);
+Datum		put(PG_FUNCTION_ARGS);
+Datum		dump(PG_FUNCTION_ARGS);
+
+char *space;
+FreePageManager *fpm;
+
+Datum
+init(PG_FUNCTION_ARGS)
+{
+	int64 size = PG_GETARG_INT64(0);
+	Size	first_usable_page;
+	Size	total_pages;
+
+	if (size <= 0 || size % FPM_PAGE_SIZE != 0)
+		elog(ERROR, "bad size");
+
+	if (space != NULL)
+	{
+		free(space);
+		space = NULL;
+		fpm = NULL;
+	}
+
+	space = malloc(size);
+	if (space == NULL)
+		elog(ERROR, "malloc failed: %m");
+
+	fpm = (FreePageManager *) space;
+	FreePageManagerInitialize(fpm, space, NULL, false);
+
+	first_usable_page = sizeof(FreePageManager) / FPM_PAGE_SIZE +
+		(sizeof(FreePageManager) % FPM_PAGE_SIZE == 0 ? 0 : 1);
+	total_pages = size / FPM_PAGE_SIZE;
+
+	FreePageManagerPut(fpm, first_usable_page,
+					   total_pages - first_usable_page);
+
+	PG_RETURN_VOID();
+}
+
+Datum
+get(PG_FUNCTION_ARGS)
+{
+	int64 npages = PG_GETARG_INT64(0);
+	Size first_page;
+
+	if (fpm == NULL)
+		PG_RETURN_NULL();
+
+	if (!FreePageManagerGet(fpm, npages, &first_page))
+		PG_RETURN_NULL();
+
+	PG_RETURN_INT64(first_page);
+}
+
+Datum
+inquire_largest(PG_FUNCTION_ARGS)
+{
+	if (fpm == NULL)
+		PG_RETURN_NULL();
+
+	PG_RETURN_INT64(FreePageManagerInquireLargest(fpm));
+}
+
+Datum
+put(PG_FUNCTION_ARGS)
+{
+	int64 first_page = PG_GETARG_INT64(0);
+	int64 npages = PG_GETARG_INT64(1);
+
+	FreePageManagerPut(fpm, first_page, npages);
+
+	PG_RETURN_VOID();
+}
+
+Datum
+dump(PG_FUNCTION_ARGS)
+{
+	if (fpm == NULL)
+		PG_RETURN_NULL();
+
+	PG_RETURN_TEXT_P(cstring_to_text(FreePageManagerDump(fpm)));
+}
diff --git a/contrib/test_freepage/test_freepage.control b/contrib/test_freepage/test_freepage.control
new file mode 100644
index 0000000..fca4cd9
--- /dev/null
+++ b/contrib/test_freepage/test_freepage.control
@@ -0,0 +1,4 @@
+comment = 'Test code for shared memory message queues'
+default_version = '1.0'
+module_pathname = '$libdir/test_freepage'
+relocatable = true
diff --git a/contrib/test_sballoc/Makefile b/contrib/test_sballoc/Makefile
new file mode 100644
index 0000000..880bccb
--- /dev/null
+++ b/contrib/test_sballoc/Makefile
@@ -0,0 +1,17 @@
+# contrib/test_sballoc/Makefile
+
+MODULES = test_sballoc
+
+EXTENSION = test_sballoc
+DATA = test_sballoc--1.0.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/test_sballoc
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/test_sballoc/test_sballoc--1.0.sql b/contrib/test_sballoc/test_sballoc--1.0.sql
new file mode 100644
index 0000000..1cf8a5a
--- /dev/null
+++ b/contrib/test_sballoc/test_sballoc--1.0.sql
@@ -0,0 +1,20 @@
+/* contrib/test_sballoc/test_sballoc--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_sballoc" to load this file. \quit
+
+CREATE FUNCTION alloc(size pg_catalog.int8, count pg_catalog.int8)
+    RETURNS pg_catalog.void
+	AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+CREATE FUNCTION alloc_with_palloc(size pg_catalog.int8, count pg_catalog.int8)
+    RETURNS pg_catalog.void
+	AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+CREATE FUNCTION alloc_list(size pg_catalog.int8, count pg_catalog.int8)
+    RETURNS pg_catalog.void
+	AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+CREATE FUNCTION alloc_list_with_palloc(size pg_catalog.int8, count pg_catalog.int8)
+    RETURNS pg_catalog.void
+	AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
diff --git a/contrib/test_sballoc/test_sballoc.c b/contrib/test_sballoc/test_sballoc.c
new file mode 100644
index 0000000..38c03da
--- /dev/null
+++ b/contrib/test_sballoc/test_sballoc.c
@@ -0,0 +1,144 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_sballoc.c
+ *		Test harness code for superblock allocator.
+ *
+ * Copyright (C) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		contrib/test_sballoc/test_sballoc.c
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "utils/memutils.h"
+#include "utils/sb_alloc.h"
+#include "utils/sb_region.h"
+
+typedef struct llnode
+{
+	struct llnode *next;
+} llnode;
+
+PG_MODULE_MAGIC;
+PG_FUNCTION_INFO_V1(alloc);
+PG_FUNCTION_INFO_V1(alloc_with_palloc);
+PG_FUNCTION_INFO_V1(alloc_list);
+PG_FUNCTION_INFO_V1(alloc_list_with_palloc);
+
+Datum
+alloc(PG_FUNCTION_ARGS)
+{
+	int64 size = PG_GETARG_INT64(0);
+	int64 count = PG_GETARG_INT64(1);
+	int64 i;
+	int64 *p;
+	sb_allocator *a;
+
+	a = sb_create_private_allocator();
+	for (i = 0; i < count; ++i)
+	{
+		p = sb_alloc(a, size, 0);
+		*p = i;
+	}
+	sb_reset_allocator(a);
+	sb_dump_regions();
+
+	PG_RETURN_VOID();
+}
+
+Datum
+alloc_with_palloc(PG_FUNCTION_ARGS)
+{
+	int64 size = PG_GETARG_INT64(0);
+	int64 count = PG_GETARG_INT64(1);
+	int64 i;
+	int64 *p;
+	MemoryContext context;
+
+	context = AllocSetContextCreate(CurrentMemoryContext,
+   								    "alloc_with_palloc test",
+								    ALLOCSET_DEFAULT_MINSIZE,
+								    ALLOCSET_DEFAULT_INITSIZE,
+								    ALLOCSET_DEFAULT_MAXSIZE);
+	for (i = 0; i < count; ++i)
+	{
+		p = MemoryContextAlloc(context, size);
+		*p = i;
+	}
+	MemoryContextStats(context);
+	MemoryContextDelete(context);
+
+	PG_RETURN_VOID();
+}
+
+Datum
+alloc_list(PG_FUNCTION_ARGS)
+{
+	int64 size = PG_GETARG_INT64(0);
+	int64 count = PG_GETARG_INT64(1);
+	int64 i;
+	llnode *h = NULL;
+	llnode *p;
+	sb_allocator *a;
+
+	if (size < sizeof(llnode))
+		elog(ERROR, "size too small");
+
+	a = sb_create_private_allocator();
+	for (i = 0; i < count; ++i)
+	{
+		p = sb_alloc(a, size, 0);
+		p->next = h;
+		h = p;
+	}
+	while (h != NULL)
+	{
+		p = h->next;
+		sb_free(h);
+		h = p;
+	}
+	sb_dump_regions();
+	sb_reset_allocator(a);
+
+	PG_RETURN_VOID();
+}
+
+Datum
+alloc_list_with_palloc(PG_FUNCTION_ARGS)
+{
+	int64 size = PG_GETARG_INT64(0);
+	int64 count = PG_GETARG_INT64(1);
+	int64 i;
+	llnode *h = NULL;
+	llnode *p;
+	MemoryContext context;
+
+	if (size < sizeof(llnode))
+		elog(ERROR, "size too small");
+
+	context = AllocSetContextCreate(CurrentMemoryContext,
+   								    "alloc_list_with_palloc test",
+								    ALLOCSET_DEFAULT_MINSIZE,
+								    ALLOCSET_DEFAULT_INITSIZE,
+								    ALLOCSET_DEFAULT_MAXSIZE);
+	for (i = 0; i < count; ++i)
+	{
+		p = MemoryContextAlloc(context, size);
+		p->next = h;
+		h = p;
+	}
+	while (h != NULL)
+	{
+		p = h->next;
+		pfree(h);
+		h = p;
+	}
+	MemoryContextStats(context);
+	MemoryContextDelete(context);
+
+	PG_RETURN_VOID();
+}
diff --git a/contrib/test_sballoc/test_sballoc.control b/contrib/test_sballoc/test_sballoc.control
new file mode 100644
index 0000000..58f61c0
--- /dev/null
+++ b/contrib/test_sballoc/test_sballoc.control
@@ -0,0 +1,4 @@
+comment = 'Test code for shared memory message queues'
+default_version = '1.0'
+module_pathname = '$libdir/test_sballoc'
+relocatable = true
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 6ad7e7d..4eb4377 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -71,6 +71,7 @@
 #include "utils/memutils.h"
 #include "utils/rel.h"
 #include "utils/relfilenodemap.h"
+#include "utils/sb_alloc.h"
 #include "utils/tqual.h"
 
 
@@ -149,17 +150,6 @@ typedef struct ReorderBufferDiskChange
  */
 static const Size max_changes_in_memory = 4096;
 
-/*
- * We use a very simple form of a slab allocator for frequently allocated
- * objects, simply keeping a fixed number in a linked list when unused,
- * instead pfree()ing them. Without that in many workloads aset.c becomes a
- * major bottleneck, especially when spilling to disk while decoding batch
- * workloads.
- */
-static const Size max_cached_changes = 4096 * 2;
-static const Size max_cached_tuplebufs = 4096 * 2;		/* ~8MB */
-static const Size max_cached_transactions = 512;
-
 
 /* ---------------------------------------
  * primary reorderbuffer support routines
@@ -240,6 +230,7 @@ ReorderBufferAllocate(void)
 	memset(&hash_ctl, 0, sizeof(hash_ctl));
 
 	buffer->context = new_ctx;
+	buffer->allocator = sb_create_private_allocator();
 
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
@@ -251,19 +242,12 @@ ReorderBufferAllocate(void)
 	buffer->by_txn_last_xid = InvalidTransactionId;
 	buffer->by_txn_last_txn = NULL;
 
-	buffer->nr_cached_transactions = 0;
-	buffer->nr_cached_changes = 0;
-	buffer->nr_cached_tuplebufs = 0;
-
 	buffer->outbuf = NULL;
 	buffer->outbufsize = 0;
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
 	dlist_init(&buffer->toplevel_by_lsn);
-	dlist_init(&buffer->cached_transactions);
-	dlist_init(&buffer->cached_changes);
-	slist_init(&buffer->cached_tuplebufs);
 
 	return buffer;
 }
@@ -281,6 +265,9 @@ ReorderBufferFree(ReorderBuffer *rb)
 	 * memory context.
 	 */
 	MemoryContextDelete(context);
+
+	/* And we also destroy the private sb allocator instance. */
+	// sb_destroy_private_allocator(rb->allocator);
 }
 
 /*
@@ -291,19 +278,8 @@ ReorderBufferGetTXN(ReorderBuffer *rb)
 {
 	ReorderBufferTXN *txn;
 
-	/* check the slab cache */
-	if (rb->nr_cached_transactions > 0)
-	{
-		rb->nr_cached_transactions--;
-		txn = (ReorderBufferTXN *)
-			dlist_container(ReorderBufferTXN, node,
-							dlist_pop_head_node(&rb->cached_transactions));
-	}
-	else
-	{
-		txn = (ReorderBufferTXN *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN));
-	}
+	txn = (ReorderBufferTXN *)sb_alloc(rb->allocator,
+									   sizeof(ReorderBufferTXN), 0);
 
 	memset(txn, 0, sizeof(ReorderBufferTXN));
 
@@ -344,18 +320,7 @@ ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
 		txn->invalidations = NULL;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_transactions < max_cached_transactions)
-	{
-		rb->nr_cached_transactions++;
-		dlist_push_head(&rb->cached_transactions, &txn->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN));
-		VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node));
-	}
-	else
-	{
-		pfree(txn);
-	}
+	sb_free(txn);
 }
 
 /*
@@ -367,18 +332,8 @@ ReorderBufferGetChange(ReorderBuffer *rb)
 	ReorderBufferChange *change;
 
 	/* check the slab cache */
-	if (rb->nr_cached_changes)
-	{
-		rb->nr_cached_changes--;
-		change = (ReorderBufferChange *)
-			dlist_container(ReorderBufferChange, node,
-							dlist_pop_head_node(&rb->cached_changes));
-	}
-	else
-	{
-		change = (ReorderBufferChange *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange));
-	}
+	change = (ReorderBufferChange *)sb_alloc(rb->allocator,
+											 sizeof(ReorderBufferChange), 0);
 
 	memset(change, 0, sizeof(ReorderBufferChange));
 	return change;
@@ -434,18 +389,7 @@ ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change)
 			break;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_changes < max_cached_changes)
-	{
-		rb->nr_cached_changes++;
-		dlist_push_head(&rb->cached_changes, &change->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange));
-		VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node));
-	}
-	else
-	{
-		pfree(change);
-	}
+	sb_free(change);
 }
 
 
@@ -461,42 +405,11 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/*
-	 * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-	 * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-	 * generated for oldtuples can be bigger, as they don't have out-of-line
-	 * toast columns.
-	 */
-	if (alloc_len < MaxHeapTupleSize)
-		alloc_len = MaxHeapTupleSize;
-
-
-	/* if small enough, check the slab cache */
-	if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs--;
-		tuple = slist_container(ReorderBufferTupleBuf, node,
-								slist_pop_head_node(&rb->cached_tuplebufs));
-		Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-		memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-		VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-		memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
-	}
-	else
-	{
-		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->context,
-							   sizeof(ReorderBufferTupleBuf) +
-							   MAXIMUM_ALIGNOF + alloc_len);
-		tuple->alloc_tuple_size = alloc_len;
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-	}
+	tuple = (ReorderBufferTupleBuf *)sb_alloc(rb->allocator,
+											  sizeof(ReorderBufferTupleBuf) +
+											  MAXIMUM_ALIGNOF + alloc_len, 0);
+	tuple->alloc_tuple_size = alloc_len;
+	tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
 
 	return tuple;
 }
@@ -511,20 +424,7 @@ void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
 	/* check whether to put into the slab cache, oversized tuples never are */
-	if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-		rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs++;
-		slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-	}
-	else
-	{
-		pfree(tuple);
-	}
+	sb_free(tuple);
 }
 
 /*
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index b2403e1..c318a73 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o mcxt.o portalmem.o
+OBJS = aset.o freepage.o mcxt.o portalmem.o sb_alloc.o sb_map.o sb_region.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c
new file mode 100644
index 0000000..0fdd758
--- /dev/null
+++ b/src/backend/utils/mmgr/freepage.c
@@ -0,0 +1,1778 @@
+/*-------------------------------------------------------------------------
+ *
+ * freepage.c
+ *	  Management of free memory pages.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/freepage.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "lib/stringinfo.h"
+#include "miscadmin.h"
+#include "utils/sb_region.h"
+
+/* Magic numbers to identify various page types */
+#define FREE_PAGE_SPAN_LEADER_MAGIC		0xea4020f0
+#define FREE_PAGE_LEAF_MAGIC            0x98eae728
+#define FREE_PAGE_INTERNAL_MAGIC        0x19aa32c9
+
+/* Doubly linked list of spans of free pages; stored in first page of span. */
+struct FreePageSpanLeader
+{
+	int		magic;				/* always FREE_PAGE_SPAN_LEADER_MAGIC */
+	Size	npages;				/* number of pages in span */
+	relptr(FreePageSpanLeader)	prev;
+	relptr(FreePageSpanLeader)	next;
+};
+
+/* Common header for btree leaf and internal pages. */
+typedef struct FreePageBtreeHeader
+{
+	int		magic;		/* FREE_PAGE_LEAF_MAGIC or FREE_PAGE_INTERNAL_MAGIC */
+	Size	nused;		/* number of items used */
+	relptr(FreePageBtree) parent;	/* uplink */
+} FreePageBtreeHeader;
+
+/* Internal key; points to next level of btree. */
+typedef struct FreePageBtreeInternalKey
+{
+	Size	first_page;				/* low bound for keys on child page */
+	relptr(FreePageBtree) child;	/* downlink */
+} FreePageBtreeInternalKey;
+
+/* Leaf key; no payload data. */
+typedef struct FreePageBtreeLeafKey
+{
+	Size	first_page;				/* first page in span */
+	Size	npages;					/* number of pages in span */
+} FreePageBtreeLeafKey;
+
+/* Work out how many keys will fit on a page. */
+#define FPM_ITEMS_PER_INTERNAL_PAGE \
+	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
+		sizeof(FreePageBtreeInternalKey))
+#define FPM_ITEMS_PER_LEAF_PAGE \
+	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
+		sizeof(FreePageBtreeLeafKey))
+
+/* A btree page of either sort */
+struct FreePageBtree
+{
+	FreePageBtreeHeader	hdr;
+	union
+	{
+		FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
+		FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
+	} u;
+};
+
+/* Results of a btree search */
+typedef struct FreePageBtreeSearchResult
+{
+	FreePageBtree  *page;
+	Size			index;
+	bool			found;
+	unsigned		split_pages;
+} FreePageBtreeSearchResult;
+
+/* Helper functions */
+static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
+					FreePageBtree *btp);
+static Size FreePageBtreeCleanup(FreePageManager *fpm);
+static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
+							 FreePageBtree *btp);
+static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
+							  FreePageBtree *btp);
+static Size FreePageBtreeFirstKey(FreePageBtree *btp);
+static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm);
+static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp,
+							Size index, Size first_page, FreePageBtree *child);
+static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index,
+						Size first_page, Size npages);
+static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno);
+static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
+					Size index);
+static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp);
+static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
+					FreePageBtreeSearchResult *result);
+static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
+static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
+static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm,
+					   FreePageBtree *btp);
+static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp);
+static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
+						 FreePageBtree *parent, int level, StringInfo buf);
+static void FreePageManagerDumpSpans(FreePageManager *fpm,
+						 FreePageSpanLeader *span, Size expected_pages,
+						 StringInfo buf);
+static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
+						   Size *first_page);
+static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
+						   Size npages, bool soft);
+static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
+static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
+					   Size npages);
+
+/*
+ * Initialize a new, empty free page manager.
+ *
+ * 'fpm' should reference caller-provided memory large enough to contain a
+ * FreePageManager.  We'll initialize it here.
+ *
+ * 'base' is the address to which all pointers are relative.  When managing
+ * a dynamic shared memory segment, it should normally be the base of the
+ * segment.  When managing backend-private memory, it can be either NULL or,
+ * if managing a single contiguous extent of memory, the start of that extent.
+ *
+ * 'lock' is the lock to be used to synchronize access to this FreePageManager.
+ * It can be NULL if synchronization is not required, either because we're
+ * managing backend-private memory or because we're managing shared memory but
+ * synchronization is caller-provided or not required.  (For example, if only
+ * one process is allocating and freeing memory, locking isn't needed.)
+ *
+ * 'lock_address_is_fixed' should be false if the LWLock to be used for
+ * synchronization is stored in the same dynamic shared memory segment as
+ * the managed region, and true if it is stored in the main shared memory
+ * segment.  Storing the LWLock in some other dynamic shared memory segment
+ * isn't supported.  This is ignored when lock is NULL.
+ */
+void
+FreePageManagerInitialize(FreePageManager *fpm, char *base, LWLock *lock,
+						  bool lock_address_is_fixed)
+{
+	Size	f;
+
+	relptr_store(base, fpm->self, fpm);
+	relptr_store(base, fpm->lock, lock);
+	fpm->lock_address_is_fixed = lock_address_is_fixed;
+	relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+	relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL);
+	fpm->btree_depth = 0;
+	fpm->btree_recycle_count = 0;
+	fpm->singleton_first_page = 0;
+	fpm->singleton_npages = 0;
+	fpm->largest_reported_chunk = 0;
+
+	for (f = 0; f < FPM_NUM_FREELISTS; f++)
+		relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
+}
+
+/*
+ * Allocate a run of pages of the given length from the free page manager.
+ * The return value indicates whether we were able to satisfy the request;
+ * if true, the first page of the allocation is stored in *first_page.
+ */
+bool
+FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
+{
+	LWLock *lock = fpm_lock(fpm);
+	bool	result;
+	Size	contiguous_pages;
+
+	if (lock != NULL)
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+	result = FreePageManagerGetInternal(fpm, npages, first_page);
+
+	/*
+	 * It's a bit counterintuitive, but allocating pages can actually create
+	 * opportunities for cleanup that create larger ranges.  We might pull
+	 * a key out of the btree that enables the item at the head of the btree
+	 * recycle list to be inserted; and then if there are more items behind it
+	 * one of those might cause two currently-separated ranges to merge,
+	 * creating a single range of contiguous pages larger than any that existed
+	 * previously.  It might be worth trying to improve the cleanup algorithm
+	 * to avoid such corner cases, but for now we just notice the condition
+	 * and do the appropriate reporting.
+	 *
+	 * Reporting is only needed for backend-private regions, so we can skip
+	 * it when locking is in use, or if we discover that the region has an
+	 * associated dynamic shared memory segment.
+	 */
+	contiguous_pages = FreePageBtreeCleanup(fpm);
+	if (lock == NULL && contiguous_pages > fpm->largest_reported_chunk)
+	{
+		sb_region *region = sb_lookup_region(fpm);
+
+		if (region != NULL && region->seg == NULL)
+		{
+			sb_report_contiguous_freespace(region, contiguous_pages);
+			fpm->largest_reported_chunk = contiguous_pages;
+		}
+		else
+		{
+			/* There's no containing region, so try to avoid future work. */
+			fpm->largest_reported_chunk = (Size) -1;
+		}
+	}
+
+	if (lock != NULL)
+		LWLockRelease(lock);
+
+	return result;
+}
+
+/*
+ * Return the size of the largest run of pages that the user could
+ * succesfully get.  (If this value subsequently increases, it will trigger
+ * a callback to sb_report_contiguous_freespace.)
+ */
+Size
+FreePageManagerInquireLargest(FreePageManager *fpm)
+{
+	LWLock *lock = fpm_lock(fpm);
+	char   *base = fpm_segment_base(fpm);
+	Size	largest = 0;
+
+	if (lock != NULL)
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+
+	if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1]))
+	{
+		FreePageSpanLeader *candidate;
+
+		candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]);
+		do
+		{
+			if (candidate->npages > largest)
+				largest = candidate->npages;
+			candidate = relptr_access(base, candidate->next);
+		} while (candidate != NULL);
+	}
+	else
+	{
+		Size	f = FPM_NUM_FREELISTS - 1;
+
+		do
+		{
+			--f;
+			if (!relptr_is_null(fpm->freelist[f]))
+				largest = f + 1;
+		} while (f > 0);
+	}
+
+	fpm->largest_reported_chunk = largest;
+
+	if (lock != NULL)
+		LWLockRelease(lock);
+
+	return largest;
+}
+
+/*
+ * Transfer a run of pages to the free page manager.  (If the number of
+ * contiguous pages now available is larger than it was previously, then
+ * we attempt to report this to the sb_region module.)
+ */
+void
+FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
+{
+	LWLock *lock = fpm_lock(fpm);
+	Size	contiguous_pages;
+	Assert(npages > 0);
+
+	/* Acquire lock (if there is one). */
+	if (lock != NULL)
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+
+	/* Record the new pages. */
+	contiguous_pages =
+		FreePageManagerPutInternal(fpm, first_page, npages, false);
+
+	/*
+	 * If the new range we inserted into the page manager was contiguous
+	 * with an existing range, it may have opened up cleanup opportunities.
+	 */
+	if (contiguous_pages > npages)
+	{
+		Size	cleanup_contiguous_pages;
+
+		cleanup_contiguous_pages = FreePageBtreeCleanup(fpm);
+		if (cleanup_contiguous_pages > contiguous_pages)
+			contiguous_pages = cleanup_contiguous_pages;
+	}
+
+	/*
+	 * If we now have more contiguous pages available than previously
+	 * reported, attempt to notify sb_region system.
+	 *
+	 * Reporting is only needed for backend-private regions, so we can skip
+	 * it when locking is in use, or if we discover that the region has an
+	 * associated dynamic shared memory segment.
+	 */
+	if (lock == NULL && contiguous_pages > fpm->largest_reported_chunk)
+	{
+		sb_region *region = sb_lookup_region(fpm);
+
+		if (region != NULL && region->seg == NULL)
+		{
+			fpm->largest_reported_chunk = contiguous_pages;
+			sb_report_contiguous_freespace(region, contiguous_pages);
+		}
+		else
+		{
+			/* There's no containing region, so try to avoid future work. */
+			fpm->largest_reported_chunk = (Size) -1;
+		}
+	}
+
+	/* Release lock (if there is one). */
+	if (lock != NULL)
+		LWLockRelease(lock);
+}
+
+/*
+ * Produce a debugging dump of the state of a free page manager.
+ */
+char *
+FreePageManagerDump(FreePageManager *fpm)
+{
+	LWLock *lock = fpm_lock(fpm);
+	char *base = fpm_segment_base(fpm);
+	StringInfoData	buf;
+	FreePageSpanLeader *recycle;
+	bool	dumped_any_freelist = false;
+	Size	f;
+
+	/* Initialize output buffer. */
+	initStringInfo(&buf);
+
+	/* Acquire lock (if there is one). */
+	if (lock != NULL)
+		LWLockAcquire(lock, LW_SHARED);
+
+	/* Dump general stuff. */
+	appendStringInfo(&buf, "metadata: self %zu lock %zu fixed %c\n",
+					 fpm->self.relptr_off, fpm->lock.relptr_off,
+					 fpm->lock_address_is_fixed ? 't' : 'f');
+
+	/* Dump btree. */
+	if (fpm->btree_depth > 0)
+	{
+		FreePageBtree *root;
+
+		appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth);
+		root = relptr_access(base, fpm->btree_root);
+		FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf);
+	}
+	else if (fpm->singleton_npages > 0)
+	{
+		appendStringInfo(&buf, "singleton: %zu(%zu)\n",
+						 fpm->singleton_first_page, fpm->singleton_npages);
+	}
+
+	/* Dump btree recycle list. */
+	recycle = relptr_access(base, fpm->btree_recycle);
+	if (recycle != NULL)
+	{
+		appendStringInfo(&buf, "btree recycle:");
+		FreePageManagerDumpSpans(fpm, recycle, 1, &buf);
+	}
+
+	/* Dump free lists. */
+	for (f = 0; f < FPM_NUM_FREELISTS; ++f)
+	{
+		FreePageSpanLeader *span;
+
+		if (relptr_is_null(fpm->freelist[f]))
+			continue;
+		if (!dumped_any_freelist)
+		{
+			appendStringInfo(&buf, "freelists:\n");
+			dumped_any_freelist = true;
+		}
+		appendStringInfo(&buf, "  %zu:", f + 1);
+		span = relptr_access(base, fpm->freelist[f]);
+		FreePageManagerDumpSpans(fpm, span, f + 1, &buf);
+	}
+
+	/* Release lock (if there is one). */
+	if (lock != NULL)
+		LWLockRelease(lock);
+
+	/* And return result to caller. */
+	return buf.data;
+}
+
+
+/*
+ * The first_page value stored at index zero in any non-root page must match
+ * the first_page value stored in its parent at the index which points to that
+ * page.  So when the value stored at index zero in a btree page changes, we've
+ * got to walk up the tree adjusting ancestor keys until we reach an ancestor
+ * where that key isn't index zero.  This function should be called after
+ * updating the first key on the target page; it will propagate the change
+ * upward as far as needed.
+ *
+ * We assume here that the first key on the page has not changed enough to
+ * require changes in the ordering of keys on its ancestor pages.  Thus,
+ * if we search the parent page for the first key greater than or equal to
+ * the first key on the current page, the downlink to this page will be either
+ * the exact index returned by the search (if the first key decreased)
+ * or one less (if the first key increased).
+ */
+static void
+FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp)
+{
+	char *base = fpm_segment_base(fpm);
+	Size	first_page;
+	FreePageBtree *parent;
+	FreePageBtree *child;
+
+	/* This might be either a leaf or an internal page. */
+	Assert(btp->hdr.nused > 0);
+	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+	{
+		Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
+		first_page = btp->u.leaf_key[0].first_page;
+	}
+	else
+	{
+		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
+		first_page = btp->u.internal_key[0].first_page;
+	}
+	child = btp;
+
+	/* Loop until we find an ancestor that does not require adjustment. */
+	for (;;)
+	{
+		Size	s;
+
+		parent = relptr_access(base, child->hdr.parent);
+		if (parent == NULL)
+			break;
+		s = FreePageBtreeSearchInternal(parent, first_page);
+
+		/* Key is either at index s or index s-1; figure out which. */
+		if (s >= parent->hdr.nused)
+		{
+			Assert(s == parent->hdr.nused);
+			--s;
+		}
+		else
+		{
+			FreePageBtree *check;
+
+			check = relptr_access(base, parent->u.internal_key[s].child);
+			if (check != child)
+			{
+				Assert(s > 0);
+				--s;
+			}
+		}
+
+#ifdef USE_ASSERT_CHECKING
+		/* Debugging double-check. */
+		if (assert_enabled)
+		{
+			FreePageBtree *check;
+
+			check = relptr_access(base, parent->u.internal_key[s].child);
+			Assert(s < parent->hdr.nused);
+			Assert(child == check);
+		}
+#endif
+
+		/* Update the parent key. */
+		parent->u.internal_key[s].first_page = first_page;
+
+		/*
+		 * If this is the first key in the parent, go up another level;
+		 * else done.
+		 */
+		if (s > 0)
+			break;
+		child = parent;
+	}
+}
+
+/*
+ * Attempt to reclaim space from the free-page btree.  The return value is
+ * the largest range of contiguous pages created by the cleanup operation.
+ */
+static Size
+FreePageBtreeCleanup(FreePageManager *fpm)
+{
+	char *base = fpm_segment_base(fpm);
+	Size	max_contiguous_pages = 0;
+
+	/* Attempt to shrink the depth of the btree. */
+	while (!relptr_is_null(fpm->btree_root))
+	{
+		FreePageBtree *root = relptr_access(base, fpm->btree_root);
+
+		/* If the root contains only one key, reduce depth by one. */
+		if (root->hdr.nused == 1)
+		{
+			/* Shrink depth of tree by one. */
+			Assert(fpm->btree_depth > 0);
+			--fpm->btree_depth;
+			if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+			{
+				/* If root is a leaf, convert only entry to singleton range. */
+				relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+				fpm->singleton_first_page = root->u.leaf_key[0].first_page;
+				fpm->singleton_npages = root->u.leaf_key[0].npages;
+			}
+			else
+			{
+				FreePageBtree *newroot;
+
+				/* If root is an internal page, make only child the root. */
+				Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+				relptr_copy(fpm->btree_root, root->u.internal_key[0].child);
+				newroot = relptr_access(base, fpm->btree_root);
+				relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL);
+			}
+			FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root));
+		}
+		else if (root->hdr.nused == 2 &&
+				 root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		{
+			Size	end_of_first;
+			Size	start_of_second;
+
+			end_of_first = root->u.leaf_key[0].first_page +
+				root->u.leaf_key[0].npages;
+			start_of_second = root->u.leaf_key[1].first_page;
+
+			if (end_of_first + 1 == start_of_second)
+			{
+				Size	root_page = fpm_pointer_to_page(base, root);
+
+				if (end_of_first == root_page)
+				{
+					FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page);
+					FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page);
+					fpm->singleton_first_page = root->u.leaf_key[0].first_page;
+					fpm->singleton_npages = root->u.leaf_key[0].npages +
+						root->u.leaf_key[1].npages + 1;
+					fpm->btree_depth = 0;
+					relptr_store(base, fpm->btree_root,
+								 (FreePageBtree *) NULL);
+					FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+										   fpm->singleton_npages);
+					Assert(max_contiguous_pages == 0);
+					max_contiguous_pages = fpm->singleton_npages;
+				}
+			}
+
+			/* Whether it worked or not, it's time to stop. */
+			break;
+		}
+		else
+		{
+			/* Nothing more to do.  Stop. */
+			break;
+		}
+	}
+
+	/*
+	 * Attempt to free recycled btree pages.  We skip this if releasing
+	 * the recycled page would require a btree page split, because the page
+	 * we're trying to recycle would be consumed by the split, which would
+	 * be counterproductive.
+	 *
+	 * We also currently only ever attempt to recycle the first page on the
+	 * list; that could be made more aggressive, but it's not clear that the
+	 * complexity would be worthwhile.
+	 */
+	while (fpm->btree_recycle_count > 0)
+	{
+		FreePageBtree *btp;
+		Size	first_page;
+		Size	contiguous_pages;
+
+		btp = FreePageBtreeGetRecycled(fpm);
+		first_page = fpm_pointer_to_page(base, btp);
+		contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
+		if (contiguous_pages == 0)
+		{
+			FreePageBtreeRecycle(fpm, first_page);
+			break;
+		}
+		else
+		{
+			if (contiguous_pages > max_contiguous_pages)
+				max_contiguous_pages = contiguous_pages;
+		}
+	}
+
+	return max_contiguous_pages;
+}
+
+/*
+ * Consider consolidating the given page with its left or right sibling,
+ * if it's fairly empty.
+ */
+static void
+FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp)
+{
+	char *base = fpm_segment_base(fpm);
+	FreePageBtree *np;
+	Size	max;
+
+	/*
+	 * We only try to consolidate pages that are less than a third full.
+	 * We could be more aggressive about this, but that might risk performing
+	 * consolidation only to end up splitting again shortly thereafter.  Since
+	 * the btree should be very small compared to the space under management,
+	 * our goal isn't so much to ensure that it always occupies the absolutely
+	 * smallest possible number of pages as to reclaim pages before things get
+	 * too egregiously out of hand.
+	 */
+	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		max = FPM_ITEMS_PER_LEAF_PAGE;
+	else
+	{
+		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		max = FPM_ITEMS_PER_INTERNAL_PAGE;
+	}
+	if (btp->hdr.nused >= max / 3)
+		return;
+
+	/*
+	 * If we can fit our right sibling's keys onto this page, consolidate.
+	 */
+	np = FreePageBtreeFindRightSibling(base, btp);
+	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
+	{
+		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		{
+			memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0],
+				   sizeof(FreePageBtreeLeafKey) * np->hdr.nused);
+			btp->hdr.nused += np->hdr.nused;
+		}
+		else
+		{
+			memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0],
+				   sizeof(FreePageBtreeInternalKey) * np->hdr.nused);
+			btp->hdr.nused += np->hdr.nused;
+			FreePageBtreeUpdateParentPointers(base, btp);
+		}
+		FreePageBtreeRemovePage(fpm, np);
+		return;
+	}
+
+	/*
+	 * If we can fit our keys onto our left sibling's page, consolidate.
+	 * In this case, we move our keys onto the other page rather than visca
+	 * versa, to avoid having to adjust ancestor keys.
+	 */
+	np = FreePageBtreeFindLeftSibling(base, btp);
+	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
+	{
+		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		{
+			memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0],
+				   sizeof(FreePageBtreeLeafKey) * btp->hdr.nused);
+			np->hdr.nused += btp->hdr.nused;
+		}
+		else
+		{
+			memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0],
+				   sizeof(FreePageBtreeInternalKey) * btp->hdr.nused);
+			np->hdr.nused += btp->hdr.nused;
+			FreePageBtreeUpdateParentPointers(base, np);
+		}
+		np->hdr.nused += btp->hdr.nused;
+		FreePageBtreeRemovePage(fpm, btp);
+		return;
+	}
+}
+
+/*
+ * Find the passed page's left sibling; that is, the page at the same level
+ * of the tree whose keyspace immediately precedes ours.
+ */
+static FreePageBtree *
+FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp)
+{
+	FreePageBtree *p = btp;
+	int		levels = 0;
+
+	/* Move up until we can move left. */
+	for (;;)
+	{
+		Size	first_page;
+		Size	index;
+
+		first_page = FreePageBtreeFirstKey(p);
+		p = relptr_access(base, p->hdr.parent);
+
+		if (p == NULL)
+			return NULL;		/* we were passed the rightmost page */
+
+		index = FreePageBtreeSearchInternal(p, first_page);
+		if (index > 0)
+		{
+			Assert(p->u.internal_key[index].first_page == first_page);
+			p = relptr_access(base, p->u.internal_key[index - 1].child);
+			break;
+		}
+		Assert(index == 0);
+		++levels;
+	}
+
+	/* Descend left. */
+	while (levels > 0)
+	{
+		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child);
+		--levels;
+	}
+	Assert(p->hdr.magic == btp->hdr.magic);
+
+	return p;
+}
+
+/*
+ * Find the passed page's right sibling; that is, the page at the same level
+ * of the tree whose keyspace immediately follows ours.
+ */
+static FreePageBtree *
+FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp)
+{
+	FreePageBtree *p = btp;
+	int		levels = 0;
+
+	/* Move up until we can move right. */
+	for (;;)
+	{
+		Size	first_page;
+		Size	index;
+
+		first_page = FreePageBtreeFirstKey(p);
+		p = relptr_access(base, p->hdr.parent);
+
+		if (p == NULL)
+			return NULL;		/* we were passed the rightmost page */
+
+		index = FreePageBtreeSearchInternal(p, first_page);
+		if (index < p->hdr.nused - 1)
+		{
+			Assert(p->u.internal_key[index].first_page == first_page);
+			p = relptr_access(base, p->u.internal_key[index + 1].child);
+			break;
+		}
+		Assert(index == p->hdr.nused - 1);
+		++levels;
+	}
+
+	/* Descend left. */
+	while (levels > 0)
+	{
+		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		p = relptr_access(base, p->u.internal_key[0].child);
+		--levels;
+	}
+	Assert(p->hdr.magic == btp->hdr.magic);
+
+	return p;
+}
+
+/*
+ * Get the first key on a btree page.
+ */
+static Size
+FreePageBtreeFirstKey(FreePageBtree *btp)
+{
+	Assert(btp->hdr.nused > 0);
+
+	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		return btp->u.leaf_key[0].first_page;
+	else
+	{
+		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		return btp->u.internal_key[0].first_page;
+	}
+}
+
+/*
+ * Get a page from the btree recycle list for use as a btree page.
+ */
+static FreePageBtree *
+FreePageBtreeGetRecycled(FreePageManager *fpm)
+{
+	char *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle);
+	FreePageSpanLeader *newhead;
+
+	Assert(victim != NULL);
+	newhead = relptr_access(base, victim->next);
+	if (newhead != NULL)
+		relptr_copy(newhead->prev, victim->prev);
+	relptr_store(base, fpm->btree_recycle, newhead);
+	Assert(fpm_pointer_is_page_aligned(base, victim));
+	fpm->btree_recycle_count--;
+	return (FreePageBtree *) victim;
+}
+
+/*
+ * Insert an item into an internal page.
+ */
+static void
+FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index,
+							Size first_page, FreePageBtree *child)
+{
+	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+	Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
+	Assert(index <= btp->hdr.nused);
+	memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index],
+			sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index));
+	btp->u.internal_key[index].first_page = first_page;
+	relptr_store(base, btp->u.internal_key[index].child, child);
+	++btp->hdr.nused;
+}
+
+/*
+ * Insert an item into a leaf page.
+ */
+static void
+FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page,
+						Size npages)
+{
+	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+	Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
+	Assert(index <= btp->hdr.nused);
+	memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index],
+			sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
+	btp->u.leaf_key[index].first_page = first_page;
+	btp->u.leaf_key[index].npages = npages;
+	++btp->hdr.nused;
+}
+
+/*
+ * Put a page on the btree recycle list.
+ */
+static void
+FreePageBtreeRecycle(FreePageManager *fpm, Size pageno)
+{
+	char *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle);
+	FreePageSpanLeader *span;
+
+	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
+	span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
+	span->npages = 1;
+	relptr_store(base, span->next, head);
+	relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
+	if (head != NULL)
+		relptr_store(base, head->prev, span);
+	relptr_store(base, fpm->btree_recycle, span);
+	fpm->btree_recycle_count++;
+}
+
+/*
+ * Remove an item from the btree at the given position on the given page.
+ */
+static void
+FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
+{
+	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+	Assert(index < btp->hdr.nused);
+
+	/* When last item is removed, extirpate entire page from btree. */
+	if (btp->hdr.nused == 1)
+	{
+		FreePageBtreeRemovePage(fpm, btp);
+		return;
+	}
+
+	/* Physically remove the key from the page. */
+	--btp->hdr.nused;
+	if (index < btp->hdr.nused)
+		memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
+				sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
+
+	/* If we just removed the first key, adjust ancestor keys. */
+	if (index == 0)
+		FreePageBtreeAdjustAncestorKeys(fpm, btp);
+
+	/* Consider whether to consolidate this page with a sibling. */
+	FreePageBtreeConsolidate(fpm, btp);
+}
+
+/*
+ * Remove a page from the btree.  Caller is responsible for having relocated
+ * any keys from this page that are still wanted.  The page is placed on the
+ * recycled list.
+ */
+static void
+FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp)
+{
+	char *base = fpm_segment_base(fpm);
+	FreePageBtree *parent;
+	Size	index;
+	Size	first_page;
+
+	for (;;)
+	{
+		/* Find parent page. */
+		parent = relptr_access(base, btp->hdr.parent);
+		if (parent == NULL)
+		{
+			/* We are removing the root page. */
+			relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+			fpm->btree_depth = 0;
+			Assert(fpm->singleton_first_page == 0);
+			Assert(fpm->singleton_npages == 0);
+			return;
+		}
+
+		/*
+		 * If the parent contains only one item, we need to remove it as
+		 * well.
+		 */
+		if (parent->hdr.nused > 1)
+			break;
+		FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
+		btp = parent;
+	}
+
+	/* Find and remove the downlink. */
+	first_page = FreePageBtreeFirstKey(btp);
+	if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+	{
+		index = FreePageBtreeSearchLeaf(parent, first_page);
+		Assert(index < parent->hdr.nused);
+		if (index < parent->hdr.nused - 1)
+			memmove(&parent->u.leaf_key[index],
+					&parent->u.leaf_key[index + 1],
+					sizeof(FreePageBtreeLeafKey)
+						* (parent->hdr.nused - index - 1));
+	}
+	else
+	{
+		index = FreePageBtreeSearchInternal(parent, first_page);
+		Assert(index < parent->hdr.nused);
+		if (index < parent->hdr.nused - 1)
+			memmove(&parent->u.internal_key[index],
+					&parent->u.internal_key[index + 1],
+					sizeof(FreePageBtreeInternalKey)
+					* (parent->hdr.nused - index - 1));
+	}
+	parent->hdr.nused--;
+	Assert(parent->hdr.nused > 0);
+
+	/* Recycle the page. */
+	FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
+
+	/* Adjust ancestor keys if needed. */
+	if (index == 0)
+		FreePageBtreeAdjustAncestorKeys(fpm, parent);
+
+	/* Consider whether to consolidate the parent with a sibling. */
+	FreePageBtreeConsolidate(fpm, parent);
+}
+
+/*
+ * Search the btree for an entry for the given first page and initialize
+ * *result with the results of the search.  result->page and result->index
+ * indicate either the position of an exact match or the position at which
+ * the new key should be inserted.  result->found is true for an exact match,
+ * otherwise false.  result->split_pages will contain the number of additional
+ * btree pages that will be needed when performing a split to insert a key.
+ * Except as described above, the contents of fields in the result object are
+ * undefined on return.
+ */
+static void
+FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
+					FreePageBtreeSearchResult *result)
+{
+	char *base = fpm_segment_base(fpm);
+	FreePageBtree *btp = relptr_access(base, fpm->btree_root);
+	Size	index;
+
+	result->split_pages = 1;
+
+	/* If the btree is empty, there's nothing to find. */
+	if (btp == NULL)
+	{
+		result->page = NULL;
+		result->found = false;
+		return;
+	}
+
+	/* Descend until we hit a leaf. */
+	while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+	{
+		FreePageBtree *child;
+
+		index = FreePageBtreeSearchInternal(btp, first_page);
+
+		/*
+		 * If the index is 0, we're not going to find it, but we keep
+		 * descending anyway so that we can find the insertion point.
+		 */
+		if (index > 0)
+			--index;
+
+		/* Track required split depth for leaf insert. */
+		if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE)
+		{
+			Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
+			result->split_pages++;
+		}
+		else
+			result->split_pages = 0;
+
+		/* Descend to appropriate child page. */
+		Assert(index < btp->hdr.nused);
+		child = relptr_access(base, btp->u.internal_key[index].child);
+		Assert(relptr_access(base, child->hdr.parent) == btp);
+		btp = child;
+	}
+
+	/* Track required split depth for leaf insert. */
+	if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE)
+	{
+		Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
+		result->split_pages++;
+	}
+	else
+		result->split_pages = 0;
+
+	/* Search leaf page. */
+	index = FreePageBtreeSearchLeaf(btp, first_page);
+
+	/* Assemble results. */
+	result->page = btp;
+	result->index = index;
+	result->found = index < btp->hdr.nused &&
+		first_page == btp->u.leaf_key[index].first_page;
+}
+
+/*
+ * Search an internal page for the first key greater than or equal to a given
+ * page number.  Returns the index of that key, or one greater than the number
+ * of keys on the page if none.
+ */
+static Size
+FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
+{
+	Size	low = 0;
+	Size	high = btp->hdr.nused;
+
+	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+	Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE);
+
+	while (low < high)
+	{
+		Size	mid = (low + high) / 2;
+		Size	val = btp->u.internal_key[mid].first_page;
+
+		if (first_page == val)
+			return mid;
+		else if (first_page < val)
+			high = mid;
+		else
+			low = mid + 1;
+	}
+
+	return low;
+}
+
+/*
+ * Search a leaf page for the first key greater than or equal to a given
+ * page number.  Returns the index of that key, or one greater than the number
+ * of keys on the page if none.
+ */
+static Size
+FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
+{
+	Size	low = 0;
+	Size	high = btp->hdr.nused;
+
+	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+	Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE);
+
+	while (low < high)
+	{
+		Size	mid = (low + high) / 2;
+		Size	val = btp->u.leaf_key[mid].first_page;
+
+		if (first_page == val)
+			return mid;
+		else if (first_page < val)
+			high = mid;
+		else
+			low = mid + 1;
+	}
+
+	return low;
+}
+
+/*
+ * Allocate a new btree page and move half the keys from the provided page
+ * to the new page.  Caller is responsible for making sure that there's a
+ * page available from fpm->btree_recycle.  Returns a pointer to the new page,
+ * to which caller must add a downlink.
+ */
+static FreePageBtree *
+FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp)
+{
+	FreePageBtree *newsibling;
+
+	newsibling = FreePageBtreeGetRecycled(fpm);
+	newsibling->hdr.magic = btp->hdr.magic;
+	newsibling->hdr.nused = btp->hdr.nused / 2;
+	relptr_copy(newsibling->hdr.parent, btp->hdr.parent);
+	btp->hdr.nused -= newsibling->hdr.nused;
+
+	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		memcpy(&newsibling->u.leaf_key,
+			   &btp->u.leaf_key[btp->hdr.nused],
+			   sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused);
+	else
+	{
+		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		memcpy(&newsibling->u.internal_key,
+			   &btp->u.internal_key[btp->hdr.nused],
+			   sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused);
+		FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling);
+	}
+
+	return newsibling;
+}
+
+/*
+ * When internal pages are split or merged, the parent pointers of their
+ * children must be updated.
+ */
+static void
+FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp)
+{
+	Size	i;
+
+	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+	for (i = 0; i < btp->hdr.nused; ++i)
+	{
+		FreePageBtree *child;
+
+		child = relptr_access(base, btp->u.internal_key[i].child);
+		relptr_store(base, child->hdr.parent, btp);
+	}
+}	
+
+/*
+ * Debugging dump of btree data.
+ */
+static void
+FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
+						 FreePageBtree *parent, int level, StringInfo buf)
+{
+	char   *base = fpm_segment_base(fpm);
+	Size	pageno = fpm_pointer_to_page(base, btp);
+	Size	index;
+	FreePageBtree *check_parent;
+
+	check_stack_depth();
+	check_parent = relptr_access(base, btp->hdr.parent);
+	appendStringInfo(buf, "  %zu@%d %c", pageno, level,
+					 btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l');
+	if (parent != check_parent)
+		appendStringInfo(buf, " [actual parent %zu, expected %zu]",
+						 fpm_pointer_to_page(base, check_parent),
+						 fpm_pointer_to_page(base, parent));
+	appendStringInfoChar(buf, ':');
+	for (index = 0; index < btp->hdr.nused; ++index)
+	{
+		if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+			appendStringInfo(buf, " %zu->%zu",
+				 btp->u.internal_key[index].first_page,
+				 btp->u.internal_key[index].child.relptr_off / FPM_PAGE_SIZE);
+		else
+			appendStringInfo(buf, " %zu(%zu)",
+				 btp->u.leaf_key[index].first_page,
+				 btp->u.leaf_key[index].npages);
+	}
+	appendStringInfo(buf, "\n");
+
+	if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+	{
+		for (index = 0; index < btp->hdr.nused; ++index)
+		{
+			FreePageBtree *child;
+
+			child = relptr_access(base, btp->u.internal_key[index].child);
+			FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf);
+		}
+	}
+}
+
+/*
+ * Debugging dump of free-span data.
+ */
+static void
+FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span,
+						 Size expected_pages, StringInfo buf)
+{
+	char   *base = fpm_segment_base(fpm);
+
+	while (span != NULL)
+	{
+		if (span->npages != expected_pages)
+			appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span),
+							 span->npages);
+		else
+			appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span));
+		span = relptr_access(base, span->next);
+	}
+
+	appendStringInfo(buf, "\n");
+}
+
+/*
+ * Like FreePageManagerGet, this function allocates a run of pages of the
+ * given length from the free page manager, but without taking and releasing
+ * the lock.  The caller is responsible for making sure the lock is already
+ * held.
+ */
+static bool
+FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page)
+{
+	char *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *victim = NULL;
+	FreePageSpanLeader *prev;
+	FreePageSpanLeader *next;
+	FreePageBtreeSearchResult result;
+	Size	victim_page = 0;		/* placate compiler */
+	Size	f;
+
+	/*
+	 * Search for a free span.
+	 *
+	 * Right now, we use a simple best-fit policy here, but it's possible for
+	 * this to result in memory fragmentation if we're repeatedly asked to
+	 * allocate chunks just a little smaller than what we have available.
+	 * Hopefully, this is unlikely, because we expect most requests to be
+	 * single pages or superblock-sized chunks -- but no policy can be optimal
+	 * under all circumstances unless it has knowledge of future allocation
+	 * patterns.
+	 */
+	for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
+	{
+		/* Skip empty freelists. */
+		if (relptr_is_null(fpm->freelist[f]))
+			continue;
+
+		/*
+		 * All of the freelists except the last one contain only items of a
+		 * single size, so we just take the first one.  But the final free
+		 * list contains everything too big for any of the other lists, so
+		 * we need to search the list.
+		 */
+		if (f < FPM_NUM_FREELISTS - 1)
+			victim = relptr_access(base, fpm->freelist[f]);
+		else
+		{
+			FreePageSpanLeader *candidate;
+
+			candidate = relptr_access(base, fpm->freelist[f]);
+			do
+			{
+				if (candidate->npages >= npages && (victim == NULL ||
+					victim->npages > candidate->npages))
+				{
+					victim = candidate;
+					if (victim->npages == npages)
+						break;
+				}
+				candidate = relptr_access(base, candidate->next);
+			} while (candidate != NULL);
+		}
+		break;
+	}
+
+	/* If we didn't find an allocatable span, return failure. */
+	if (victim == NULL)
+		return false;
+
+	/* Remove span from free list. */
+	Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
+	prev = relptr_access(base, victim->prev);
+	next = relptr_access(base, victim->next);
+	if (prev != NULL)
+		relptr_copy(prev->next, victim->next);
+	else
+		relptr_copy(fpm->freelist[f], victim->next);
+	if (next != NULL)
+		relptr_copy(next->prev, victim->prev);
+	victim_page = fpm_pointer_to_page(base, victim);
+
+	/*
+	 * If we haven't initialized the btree yet, the victim must be the single
+	 * span stored within the FreePageManager itself.  Otherwise, we need
+	 * to update the btree.
+	 */
+	if (relptr_is_null(fpm->btree_root))
+	{
+		Assert(victim_page == fpm->singleton_first_page);
+		Assert(victim->npages = fpm->singleton_npages);
+		Assert(victim->npages >= npages);
+		fpm->singleton_first_page += npages;
+		fpm->singleton_npages -= npages;
+		if (fpm->singleton_npages > 0)
+			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+								   fpm->singleton_npages);
+	}
+	else
+	{
+		/*
+		 * If the span we found is exactly the right size, remove it from the
+		 * btree completely.  Otherwise, adjust the btree entry to reflect the
+		 * still-unallocated portion of the span, and put that portion on the
+		 * appropriate free list.
+		 */
+		FreePageBtreeSearch(fpm, victim_page, &result);
+		Assert(result.found);
+		if (victim->npages == npages)
+			FreePageBtreeRemove(fpm, result.page, result.index);
+		else
+		{
+			FreePageBtreeLeafKey *key;
+
+			/* Adjust btree to reflect remaining pages. */
+			Assert(victim->npages > npages);
+			key = &result.page->u.leaf_key[result.index];
+			Assert(key->npages == victim->npages);
+			key->first_page += npages;
+			key->npages -= npages;
+			if (result.index == 0)
+				FreePageBtreeAdjustAncestorKeys(fpm, result.page);
+
+			/* Put the unallocated pages back on the appropriate free list. */
+			FreePagePushSpanLeader(fpm, victim_page + npages,
+								   victim->npages - npages);
+		}
+	}
+
+	/* Return results to caller. */
+	*first_page = fpm_pointer_to_page(base, victim);
+	return true;
+}
+
+/*
+ * Put a range of pages into the btree and freelists, consolidating it with
+ * existing free spans just before and/or after it.  If 'soft' is true,
+ * only perform the insertion if it can be done without allocating new btree
+ * pages; if false, do it always.  Returns 0 if the soft flag caused the
+ * insertion to be skipped, or otherwise the size of the contiguous span
+ * created by the insertion.  This may be larger than npages if we're able
+ * to consolidate with an adjacent range.
+ */
+static Size
+FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
+						   bool soft)
+{
+	char *base = fpm_segment_base(fpm);
+	FreePageBtreeSearchResult result;
+	FreePageBtreeLeafKey *prevkey = NULL;
+	FreePageBtreeLeafKey *nextkey = NULL;
+	FreePageBtree *np;
+	Size	nindex;
+	Assert(npages > 0);
+
+	/* We can store a single free span without initializing the btree. */
+	if (fpm->btree_depth == 0)
+	{
+		if (fpm->singleton_npages == 0)
+		{
+			/* Don't have a span yet; store this one. */
+			fpm->singleton_first_page = first_page;
+			fpm->singleton_npages = npages;
+			FreePagePushSpanLeader(fpm, first_page, npages);
+			return fpm->singleton_npages;
+		}
+		else if (fpm->singleton_first_page + fpm->singleton_npages ==
+					first_page)
+		{
+			/* New span immediately follows sole existing span. */
+			fpm->singleton_npages += npages;
+			FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
+			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+								   fpm->singleton_npages);
+			return fpm->singleton_npages;
+		}
+		else if (first_page + npages == fpm->singleton_first_page)
+		{
+			/* New span immediately precedes sole existing span. */
+			FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
+			fpm->singleton_first_page = first_page;
+			fpm->singleton_npages += npages;
+			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+								   fpm->singleton_npages);
+			return fpm->singleton_npages;
+		}
+		else
+		{
+			/* Not contiguous; we need to initialize the btree. */
+			Size	root_page;
+			FreePageBtree *root;
+
+			if (!relptr_is_null(fpm->btree_recycle))
+				root = FreePageBtreeGetRecycled(fpm);
+			else if (FreePageManagerGetInternal(fpm, 1, &root_page))
+				root = (FreePageBtree *) fpm_page_to_pointer(base, root_page);
+			else
+			{
+				/* We'd better be able to get a page from the existing range. */
+				elog(FATAL, "free page manager btree is corrupt");
+			}
+
+			/* Create the btree and move the preexisting range into it. */
+			root->hdr.magic = FREE_PAGE_LEAF_MAGIC;
+			root->hdr.nused = 1;
+			relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL);
+			root->u.leaf_key[0].first_page = fpm->singleton_first_page;
+			root->u.leaf_key[0].npages = fpm->singleton_npages;
+			relptr_store(base, fpm->btree_root, root);
+			fpm->singleton_first_page = 0;
+			fpm->singleton_npages = 0;
+			fpm->btree_depth = 1;
+
+			/*
+			 * Corner case: it may be that the btree root took the very last
+			 * free page.  In that case, the sole btree entry covers a zero
+			 * page run, which is invalid.  Overwrite it with the entry we're
+			 * trying to insert and get out.
+			 */
+			if (root->u.leaf_key[0].npages == 0)
+			{
+				root->u.leaf_key[0].first_page = first_page;
+				root->u.leaf_key[0].npages = npages;
+				return npages;
+			}
+
+			/* Fall through to insert the new key. */
+		}
+	}
+
+	/* Search the btree. */
+	FreePageBtreeSearch(fpm, first_page, &result);
+	Assert(!result.found);
+	if (result.index > 0)
+		prevkey = &result.page->u.leaf_key[result.index - 1];
+	if (result.index < result.page->hdr.nused)
+	{
+		np = result.page;
+		nindex = result.index;
+		nextkey = &result.page->u.leaf_key[result.index];
+	}
+	else
+	{
+		np = FreePageBtreeFindRightSibling(base, result.page);
+		nindex = 0;
+		if (np != NULL)
+			nextkey = &np->u.leaf_key[0];
+	}
+
+	/* Consolidate with the previous entry if possible. */
+	if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
+	{
+		bool	remove_next = false;
+		Size	result;
+
+		Assert(prevkey->first_page + prevkey->npages == first_page);
+		prevkey->npages = (first_page - prevkey->first_page) + npages;
+
+		/* Check whether we can *also* consolidate with the following entry. */
+		if (nextkey != NULL &&
+			prevkey->first_page + prevkey->npages >= nextkey->first_page)
+		{
+			Assert(prevkey->first_page + prevkey->npages ==
+					nextkey->first_page);
+			prevkey->npages = (nextkey->first_page - prevkey->first_page)
+				+ nextkey->npages;
+			FreePagePopSpanLeader(fpm, nextkey->first_page);
+			remove_next = true;
+		}
+
+		/* Put the span on the correct freelist and save size. */
+		FreePagePopSpanLeader(fpm, prevkey->first_page);
+		FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
+		result = prevkey->npages;
+
+		/*
+		 * If we consolidated with both the preceding and following entries,
+		 * we must remove the following entry.  We do this last, because
+		 * removing an element from the btree may invalidate pointers we hold
+		 * into the current data structure.
+		 *
+		 * NB: The btree is technically in an invalid state a this point
+		 * because we've already updated prevkey to cover the same key space
+		 * as nextkey.  FreePageBtreeRemove() shouldn't notice that, though.
+		 */
+		if (remove_next)
+			FreePageBtreeRemove(fpm, np, nindex);
+
+		return result;
+	}
+
+	/* Consolidate with the next entry if possible. */
+	if (nextkey != NULL && first_page + npages >= nextkey->first_page)
+	{
+		Size	newpages;
+
+		/* Compute new size for span. */
+		Assert(first_page + npages == nextkey->first_page);
+		newpages = (nextkey->first_page - first_page) + nextkey->npages;
+
+		/* Put span on correct free list. */
+		FreePagePopSpanLeader(fpm, nextkey->first_page);
+		FreePagePushSpanLeader(fpm, first_page, newpages);
+
+		/* Update key in place. */
+		nextkey->first_page = first_page;
+		nextkey->npages = newpages;
+
+		/* If reducing first key on page, ancestors might need adjustment. */
+		if (nindex == 0)
+			FreePageBtreeAdjustAncestorKeys(fpm, np);
+
+		return nextkey->npages;
+	}
+
+	/* Split leaf page and as many of its ancestors as necessary. */
+	if (result.split_pages > 0)
+	{
+		/*
+		 * NB: We could consider various coping strategies here to avoid a
+		 * split; most obviously, if np != result.page, we could target that
+		 * page instead.   More complicated shuffling strategies could be
+		 * possible as well; basically, unless every single leaf page is 100%
+		 * full, we can jam this key in there if we try hard enough.  It's
+		 * unlikely that trying that hard is worthwhile, but it's possible
+		 * we might need to make more than no effort.  For now, we just do
+		 * the easy thing, which is nothing.
+		 */
+
+		/* If this is a soft insert, it's time to give up. */
+		if (soft)
+			return 0;
+
+		/* Check whether we need to allocate more btree pages to split. */
+		if (result.split_pages > fpm->btree_recycle_count)
+		{
+			Size	pages_needed;
+			Size	recycle_page;
+			Size	i;
+
+			/*
+			 * Allocate the required number of pages and split each one in
+			 * turn.  This should never fail, because if we've got enough spans
+			 * of free pages kicking around that we need additional storage
+			 * space just to remember them all, then we should certainly have
+			 * enough to expand the btree, which should only ever use a tiny
+			 * number of pages compared to the number under management.  If
+			 * it does, something's badly screwed up.
+			 */
+			pages_needed = result.split_pages - fpm->btree_recycle_count;
+			for (i = 0; i < pages_needed; ++i)
+			{
+				if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
+					elog(FATAL, "free page manager btree is corrupt");
+				FreePageBtreeRecycle(fpm, recycle_page);
+			}
+
+			/*
+			 * The act of allocating pages to recycle may have invalidated
+			 * the results of our previous btree reserch, so repeat it.
+			 * (We could recheck whether any of our split-avoidance strategies
+			 * that were not viable before now are, but it hardly seems
+			 * worthwhile, so we don't bother. Consolidation can't be possible
+			 * now if it wasn't previously.)
+			 */
+			FreePageBtreeSearch(fpm, first_page, &result);
+
+			/*
+			 * The act of allocating pages for use in constructing our btree
+			 * should never cause any page to become more full, so the new
+			 * split depth should be no greater than the old one, and perhaps
+			 * less if we fortutiously allocated a chunk that freed up a
+			 * slot on the page we need to update.
+			 */
+			Assert(result.split_pages <= fpm->btree_recycle_count);
+		}
+
+		/* If we still need to perform a split, do it. */
+		if (result.split_pages > 0)
+		{
+			FreePageBtree	*split_target = result.page;
+			FreePageBtree   *child = NULL;
+			Size	key = first_page;
+
+			for (;;)
+			{
+				FreePageBtree *newsibling;
+				FreePageBtree *parent;
+
+				/* Identify parent page, which must receive downlink. */
+				parent = relptr_access(base, split_target->hdr.parent);
+
+				/* Split the page - downlink not added yet. */
+				newsibling = FreePageBtreeSplitPage(fpm, split_target);
+
+				/*
+				 * At this point in the loop, we're always carrying a pending
+				 * insertion.  On the first pass, it's the actual key we're
+				 * trying to insert; on subsequent passes, it's the downlink
+				 * that needs to be added as a result of the split performed
+				 * during the previous loop iteration.  Since we've just split
+				 * the page, there's definitely room on one of the two
+				 * resulting pages.
+				 */
+				if (child == NULL)
+				{
+					Size	index;
+					FreePageBtree *insert_into;
+
+					insert_into = key < newsibling->u.leaf_key[0].first_page ?
+						split_target : newsibling;
+					index = FreePageBtreeSearchLeaf(insert_into, key);
+					FreePageBtreeInsertLeaf(insert_into, index, key, npages);
+					if (index == 0 && insert_into == split_target)
+						FreePageBtreeAdjustAncestorKeys(fpm, split_target);
+				}
+				else
+				{
+					Size	index;
+					FreePageBtree *insert_into;
+
+					insert_into =
+						key < newsibling->u.internal_key[0].first_page ?
+						split_target : newsibling;
+					index = FreePageBtreeSearchInternal(insert_into, key);
+					FreePageBtreeInsertInternal(base, insert_into, index,
+												key, child);
+					relptr_store(base, child->hdr.parent, insert_into);
+					if (index == 0 && insert_into == split_target)
+						FreePageBtreeAdjustAncestorKeys(fpm, split_target);
+				}
+
+				/* If the page we just split has no parent, split the root. */
+				if (parent == NULL)
+				{
+					FreePageBtree *newroot;
+
+					newroot = FreePageBtreeGetRecycled(fpm);
+					newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC;
+					newroot->hdr.nused = 2;
+					relptr_store(base, newroot->hdr.parent,
+								 (FreePageBtree *) NULL);
+					newroot->u.internal_key[0].first_page =
+						FreePageBtreeFirstKey(split_target);
+					relptr_store(base, newroot->u.internal_key[0].child,
+						split_target);
+					relptr_store(base, split_target->hdr.parent, newroot);
+					newroot->u.internal_key[1].first_page =
+						FreePageBtreeFirstKey(newsibling);
+					relptr_store(base, newroot->u.internal_key[1].child,
+						newsibling);
+					relptr_store(base, newsibling->hdr.parent, newroot);
+					relptr_store(base, fpm->btree_root, newroot);
+					fpm->btree_depth++;
+
+					break;
+				}
+
+				/* If the parent page isn't full, insert the downlink. */
+				key = newsibling->u.internal_key[0].first_page;
+				if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE)
+				{
+					Size	index;
+
+					index = FreePageBtreeSearchInternal(parent, key);
+					FreePageBtreeInsertInternal(base, parent, index,
+												key, newsibling);
+					relptr_store(base, newsibling->hdr.parent, parent);
+					if (index == 0)
+						FreePageBtreeAdjustAncestorKeys(fpm, parent);
+					break;
+				}
+
+				/* The parent also needs to be split, so loop around. */
+				child = newsibling;
+				split_target = parent;
+			}
+
+			/*
+			 * The loop above did the insert, so just need to update the
+			 * free list, and we're done.
+			 */
+			FreePagePushSpanLeader(fpm, first_page, npages);
+
+			return npages;
+		}
+	}
+
+	/* Physically add the key to the page. */
+	Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE);
+	FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages);
+
+	/* If new first key on page, ancestors might need adjustment. */
+	if (result.index == 0)
+		FreePageBtreeAdjustAncestorKeys(fpm, result.page);
+
+	/* Put it on the free list. */
+	FreePagePushSpanLeader(fpm, first_page, npages);
+
+	return npages;
+}
+
+/*
+ * Remove a FreePageSpanLeader from the linked-list that contains it, either
+ * because we're changing the size of the span, or because we're allocating it.
+ */
+static void
+FreePagePopSpanLeader(FreePageManager *fpm, Size pageno)
+{
+	char *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *span;
+	FreePageSpanLeader *next;
+	FreePageSpanLeader *prev;
+
+	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
+
+	next = relptr_access(base, span->next);
+	prev = relptr_access(base, span->prev);
+	if (next != NULL)
+		relptr_copy(next->prev, span->prev);
+	if (prev != NULL)
+		relptr_copy(prev->next, span->next);
+	else
+	{
+		Size	f = Min(span->npages, FPM_NUM_FREELISTS) - 1;
+
+		Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE);
+		relptr_copy(fpm->freelist[f], span->next);
+	}
+}
+
+/*
+ * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
+ */
+static void
+FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages)
+{
+	char   *base = fpm_segment_base(fpm);
+	Size	f = Min(npages, FPM_NUM_FREELISTS) - 1;
+	FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]);
+	FreePageSpanLeader *span;
+
+	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page);
+	span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
+	span->npages = npages;
+	relptr_store(base, span->next, head);
+	relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
+	if (head != NULL)
+		relptr_store(base, head->prev, span);
+	relptr_store(base, fpm->freelist[f], span);
+}
diff --git a/src/backend/utils/mmgr/sb_alloc.c b/src/backend/utils/mmgr/sb_alloc.c
new file mode 100644
index 0000000..ace9e56
--- /dev/null
+++ b/src/backend/utils/mmgr/sb_alloc.c
@@ -0,0 +1,861 @@
+/*-------------------------------------------------------------------------
+ *
+ * sb_alloc.c
+ *	  Superblock-based memory allocator.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/utils/mmgr/sb_alloc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "utils/sb_region.h"
+
+/*
+ * Metadata for an ordinary superblock, a large memory allocation, or a "span
+ * of spans".
+ *
+ * For ordinary superblocks and large memory allocations, span objects are
+ * stored out-of-line; that is, the span object is not stored within the
+ * span itself.  Ordinary superblocks are all of size SB_SUPERBLOCK_SIZE,
+ * and size_class indicates the size of object they contain.  Large memory
+ * spans contain just enough pages to store the object, and size_class
+ * is SB_SCLASS_SPAN_LARGE; ninitialized, nused, and firstfree are all unused,
+ * as the whole span consists of a single object.
+ * 
+ * For a "span of spans", the span object is stored "inline".  The allocation
+ * is always exactly one page, and the sb_span object is located at the
+ * beginning of that page.  This makes it easy to free a span: just find the
+ * start of the containing page, and there's the sb_span to which it needs to
+ * be returned.  The size class will be SB_SPAN_OF_SPANS, and the remaining
+ * fields are used just as they would be in an ordinary superblock.  We can't
+ * allocate spans out of ordinary superblocks because creating an ordinary
+ * superblock requires us to be able to allocate a span *first*.  Doing it
+ * this way avoids that circularity.
+ */
+struct sb_span
+{
+	relptr(sb_heap) parent;		/* Containing heap. */
+	relptr(sb_span) prevspan;	/* Previous span. */
+	relptr(sb_span) nextspan;	/* Next span. */
+	relptr(char)	start;		/* Starting address. */
+	Size		npages;			/* Length of span in pages. */
+	uint16		size_class;		/* Size class. */
+	uint16		ninitialized;	/* Maximum number of objects ever allocated. */
+	uint16		nallocatable;	/* Number of objects currently allocatable. */
+	uint16		firstfree;		/* First object on free list. */
+	uint16		nmax;			/* Maximum number of objects ever possible. */
+	uint16		fclass;			/* Current fullness class. */
+};
+
+#define SB_SPAN_NOTHING_FREE		((uint16) -1)
+#define SB_SUPERBLOCK_SIZE			(SB_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
+
+/*
+ * Small allocations are handled by dividing a relatively large chunk of
+ * memory called a superblock into many small objects of equal size.  The
+ * chunk sizes are defined by the following array.  Larger size classes are
+ * spaced more widely than smaller size classes.  We fudge the spacing for
+ * size classes >1k to avoid space wastage: based on the knowledge that we
+ * plan to allocate 64k superblocks, we bump the maximum object size up
+ * to the largest multiple of 8 bytes that still lets us fit the same
+ * number of objects into one superblock.
+ *
+ * NB: Because of this fudging, if the size of a superblock is ever changed,
+ * these size classes should be reworked to be optimal for the new size.
+ *
+ * NB: The optimal spacing for size classes, as well as the size of the
+ * superblocks themselves, is not a question that has one right answer.
+ * Some allocators (such as tcmalloc) use more closely-spaced size classes
+ * than we do here, while others (like aset.c) use more widely-spaced classes.
+ * Spacing the classes more closely avoids wasting memory within individual
+ * chunks, but also means a larger number of potentially-unfilled superblocks.
+ * This system is really only suitable for allocating relatively large amounts
+ * of memory, where the unfilled superblocks will be a small percentage of
+ * the total allocations.
+ */
+static const uint16 sb_size_classes[] = {
+	sizeof(sb_span), 0,				/* special size classes */
+	8, 16, 24, 32, 40, 48, 56, 64,	/* 8 classes separated by 8 bytes */
+	80, 96, 112, 128,				/* 4 classes separated by 16 bytes */
+	160, 192, 224, 256,				/* 4 classes separated by 32 bytes */
+	320, 384, 448, 512,				/* 4 classes separated by 64 bytes */
+	640, 768, 896, 1024,			/* 4 classes separated by 128 bytes */
+	1280, 1560, 1816, 2048,			/* 4 classes separated by ~256 bytes */
+	2616, 3120, 3640, 4096,			/* 4 classes separated by ~512 bytes */
+	5456, 6552, 7280, 8192			/* 4 classes separated by ~1024 bytes */
+};
+
+/*
+ * The following lookup table is used to map the size of small objects
+ * (less than 1kB) onto the corresponding size class.  To use this table,
+ * round the size of the object up to the next multiple of 8 bytes, and then
+ * index into this array.
+ */
+static char sb_size_class_map[] = {
+	2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+	14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
+	18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+	20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
+	22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+	23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
+};
+#define SB_SIZE_CLASS_MAP_QUANTUM	8
+
+/* Special size classes. */
+#define SB_SCLASS_SPAN_OF_SPANS			0
+#define SB_SCLASS_SPAN_LARGE			1
+#define SB_NUM_SIZE_CLASSES				lengthof(sb_size_classes)
+
+/* Helper functions. */
+static char *sb_alloc_guts(char *base, sb_region *region,
+			  sb_allocator *a, int size_class);
+static bool sb_ensure_active_superblock(char *base, sb_region *region,
+							sb_allocator *a, sb_heap *heap,
+							int size_class);
+static void sb_init_span(char *base, sb_span *span, sb_heap *heap,
+			 char *ptr, Size npages, uint16 size_class);
+static void sb_out_of_memory_error(sb_allocator *a);
+static bool sb_transfer_first_span(char *base, sb_heap *heap,
+					   int fromclass, int toclass);
+static void sb_unlink_span(char *base, sb_heap *heap, sb_span *span);
+
+/*
+ * Create a backend-private allocator.
+ */
+sb_allocator *
+sb_create_private_allocator(void)
+{
+	Size	allocator_size;
+	int		heapno;
+	int		fclass;
+	sb_allocator *a;
+	char   *base = NULL;
+
+	allocator_size = offsetof(sb_allocator, heaps);
+	allocator_size += sizeof(sb_heap) * SB_NUM_SIZE_CLASSES;
+	a = malloc(allocator_size);
+	if (a == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	a->private = true;
+	for (heapno = 0; heapno < SB_NUM_SIZE_CLASSES; ++heapno)
+	{
+		sb_heap *heap = &a->heaps[heapno];
+
+		relptr_store(base, heap->lock, (LWLock *) NULL);
+		for (fclass = 0; fclass < SB_FULLNESS_CLASSES; ++fclass)
+			relptr_store(base, heap->spans[fclass], (sb_span *) NULL);
+	}
+
+	return a;
+}
+
+/*
+ * Allocate memory.
+ */
+void *
+sb_alloc(sb_allocator *a, Size size, int flags)
+{
+	sb_region *region = NULL;
+	char *base = NULL;
+	uint16	size_class;
+	char   *result;
+
+	Assert(size > 0);
+
+	/*
+	 * For shared memory allocation, pointers are relative to the start of the
+	 * region, so finding out that information is essential.  For
+	 * backend-private memory allocation, allocators aren't uniquely tied to
+	 * a region; we'll only need to grab a region if we can't allocate out of
+	 * an existing superblock.
+	 */
+	if (!a->private)
+	{
+		region = sb_lookup_region(a);
+		if (region == NULL)
+			elog(ERROR, "sb_region not found");
+		base = region->region_start;
+	}
+
+	/* If it's too big for a superblock, just grab a raw run of pages. */
+	if (size > sb_size_classes[lengthof(sb_size_classes) - 1])
+	{
+		Size	npages = fpm_size_to_pages(size);
+		Size	first_page;
+		sb_span *span;
+		sb_heap *heap = &a->heaps[SB_SCLASS_SPAN_LARGE];
+		LWLock *lock = relptr_access(base, heap->lock);
+		void *ptr;
+
+		/* Obtain a span object. */
+		span = (sb_span *) sb_alloc_guts(base, region, a,
+										 SB_SCLASS_SPAN_OF_SPANS);
+		if (span == NULL)
+		{
+			if ((flags & SB_ALLOC_SOFT_FAIL) == 0)
+				sb_out_of_memory_error(a);
+			return NULL;
+		}
+
+		/* Find a region from which to allocate. */
+		if (region == NULL)
+			region = sb_private_region_for_allocator(npages);
+
+		/* Here's where we try to perform the actual allocation. */
+		if (region == NULL ||
+			!FreePageManagerGet(region->fpm, npages, &first_page))
+		{
+			/* XXX. Free the span. */
+			if ((flags & SB_ALLOC_SOFT_FAIL) == 0)
+				sb_out_of_memory_error(a);
+			return NULL;
+		}
+		ptr = fpm_page_to_pointer(fpm_segment_base(region->fpm), first_page);
+
+		/* Initialize span and pagemap. */
+		if (lock != NULL)
+			LWLockAcquire(lock, LW_EXCLUSIVE);
+		sb_init_span(base, span, heap, ptr, npages, SB_SCLASS_SPAN_LARGE);
+		if (lock != NULL)
+			LWLockRelease(lock);
+		sb_map_set(region->pagemap, first_page, span);
+
+		return ptr;
+	}
+
+	/* Map allocation to a size class. */
+	if (size < lengthof(sb_size_class_map) * SB_SIZE_CLASS_MAP_QUANTUM)
+	{
+		int	mapidx;
+
+		mapidx = ((size + SB_SIZE_CLASS_MAP_QUANTUM - 1) /
+					SB_SIZE_CLASS_MAP_QUANTUM) - 1;
+		size_class = sb_size_class_map[mapidx];
+	}
+	else
+	{
+		uint16	min = sb_size_class_map[lengthof(sb_size_class_map) - 1];
+		uint16	max = lengthof(sb_size_classes) - 1;
+
+		while (min < max)
+		{
+			uint16	mid = (min + max) / 2;
+			uint16	class_size = sb_size_classes[mid];
+
+			if (class_size < size)
+				min = mid + 1;
+			else
+				max = mid;
+		}
+
+		size_class = min;
+	}
+	Assert(size <= sb_size_classes[size_class]);
+	Assert(size_class == 0 || size > sb_size_classes[size_class - 1]);
+
+	/* Attempt the actual allocation. */
+	result = sb_alloc_guts(base, region, a, size_class);
+	if (result == NULL && (flags & SB_ALLOC_SOFT_FAIL) == 0)
+		sb_out_of_memory_error(a);
+	return result;		
+}
+
+/*
+ * Free memory allocated via sb_alloc.
+ */
+void
+sb_free(void *ptr)
+{
+	sb_region *region;
+	char   *fpm_base;
+	char   *base = NULL;
+	sb_span *span;
+	LWLock *lock = NULL;
+	char   *superblock;
+	Size	pageno;
+	Size	obsize;
+	uint16	size_class;
+
+	/* Locate the containing superblock. */
+	region = sb_lookup_region(ptr);
+	fpm_base = fpm_segment_base(region->fpm);
+	pageno = fpm_pointer_to_page(fpm_base, ptr);
+	span = sb_map_get(region->pagemap, pageno);
+
+	/*
+	 * If this is a shared-memory region, we might need locking.  If so,
+	 * lock the heap.
+	 */
+	if (region->seg != NULL)
+	{
+		sb_heap *heap = relptr_access(fpm_base, span->parent);
+		base = fpm_base;
+		lock = relptr_access(fpm_base, heap->lock);
+		if (lock != NULL)
+			LWLockAcquire(lock, LW_EXCLUSIVE);
+	}
+
+	/* Compute the object size. */
+	size_class = span->size_class;
+	obsize = sb_size_classes[size_class];
+
+	/* If it's a large object, free the entire span. */
+	if (size_class == SB_SCLASS_SPAN_LARGE)
+	{
+		sb_heap *heap = relptr_access(base, span->parent);
+		Size	first_page;
+
+		sb_unlink_span(base, heap, span);
+		first_page = fpm_pointer_to_page(fpm_base,
+										 relptr_access(base, span->start));
+		FreePageManagerPut(region->fpm, first_page, span->npages);
+		sb_free(span);
+
+		/* We're done, but must release any lock first. */
+		if (lock != NULL)
+			LWLockRelease(lock);
+	}
+
+	/* Put the object on the superblock's freelist. */
+	superblock = relptr_access(base, span->start);
+	Assert(((char *) ptr) >= superblock);
+	Assert(((char *) ptr) < superblock + SB_SUPERBLOCK_SIZE);
+	Assert((((char *) ptr) - superblock) % obsize == 0);
+	* (Size *) ptr = span->firstfree;
+	span->firstfree = (((char *) ptr) - superblock) / obsize;
+	span->nallocatable++;
+
+	if (span->nallocatable == 1 && span->fclass == SB_FULLNESS_CLASSES - 1)
+	{
+		sb_heap *heap = relptr_access(base, span->parent);
+		sb_span *new_nextspan;
+
+		/*
+		 * The superblock is completely full and is located in the
+		 * highest-numbered fullness class, which is never scanned for free
+		 * chunks.  We must move it to the next-lower fullness class.
+		 */
+
+		sb_unlink_span(base, heap, span);
+		span->fclass = SB_FULLNESS_CLASSES - 2;
+		relptr_copy(span->nextspan, heap->spans[SB_FULLNESS_CLASSES - 2]);
+		relptr_store(base, span->prevspan, (sb_span *) NULL);
+		new_nextspan = relptr_access(base,
+									 heap->spans[SB_FULLNESS_CLASSES - 2]);
+		if (new_nextspan != NULL)
+			relptr_store(base, new_nextspan->prevspan, span);
+		relptr_store(base, heap->spans[SB_FULLNESS_CLASSES - 2], span);
+	}
+	else if (span->nallocatable == span->nmax && (span->fclass != 1 ||
+		!relptr_is_null(span->prevspan)))
+	{
+		sb_heap *heap = relptr_access(base, span->parent);
+		Size	first_page;
+
+		/*
+		 * This entire superblock is free, and it's not the active superblock
+		 * for this size class.  Return the memory to the free page manager.
+		 * We don't do this for the active superblock to prevent hysteresis:
+		 * if we repeatedly allocate and free the only chunk in the active
+		 * superblock, it will be very inefficient if we deallocate and
+		 * reallocate the superblock every time.
+		 */
+		sb_unlink_span(base, heap, span);
+		first_page = fpm_pointer_to_page(fpm_base,
+										 relptr_access(base, span->start));
+		FreePageManagerPut(region->fpm, first_page, span->npages);
+
+		/*
+		 * Span-of-spans superblocks store the span which describes them
+		 * within the superblock itself, so freeing the storage implicitly
+		 * frees the descriptor also.  If this is a superblock of any other
+		 * type, we need to separately free the span object also.
+		 */
+		if (size_class != SB_SCLASS_SPAN_OF_SPANS)
+			sb_free(span);
+	}
+
+	/* If we locked the heap, release the lock. */
+	if (lock != NULL)
+		LWLockRelease(lock);
+}
+
+/*
+ * Return the size of the chunk that will be used to satisfy a given
+ * allocation.
+ */
+Size
+sb_alloc_space(Size size)
+{
+	uint16	size_class;
+
+	/* Large objects allocate full pages. */
+	if (size > sb_size_classes[lengthof(sb_size_classes) - 1])
+		return FPM_PAGE_SIZE * fpm_size_to_pages(size);
+
+	/* Map request size to a size class. */
+	if (size < lengthof(sb_size_class_map) * SB_SIZE_CLASS_MAP_QUANTUM)
+	{
+		int	mapidx;
+
+		mapidx = ((size + SB_SIZE_CLASS_MAP_QUANTUM - 1) /
+					SB_SIZE_CLASS_MAP_QUANTUM) - 1;
+		size_class = sb_size_class_map[mapidx];
+	}
+	else
+	{
+		uint16	min = sb_size_class_map[lengthof(sb_size_class_map) - 1];
+		uint16	max = lengthof(sb_size_classes) - 1;
+		while (min < max)
+		{
+			uint16	mid = (min + max) / 2;
+			uint16	class_size = sb_size_classes[mid];
+
+			if (class_size < size)
+				min = mid + 1;
+			else
+				max = mid;
+		}
+		size_class = min;
+	}
+
+	return sb_size_classes[size_class];
+}
+
+/*
+ * Return the size of the chunk used to satisfy a given allocation.
+ *
+ * This is roughly an analogue of GetMemoryChunkSpace, but it's hard to make
+ * a precisely fair comparison.  Unlike MemoryContextAlloc/AllocSetAlloc,
+ * there's no bookkeeping overhead associated with any single allocation;
+ * the only thing we can really reflect here is the fact that allocations
+ * will be rounded up to the next larger size class (or, for large allocations,
+ * to a full FPM page).  The storage overhead of the sb_span, sb_map,
+ * sb_region, and FreePageManager structures is typically spread across
+ * enough small allocations to make reflecting those costs here difficult.
+ *
+ * On the other hand, we also hope that the overhead in question is small
+ * enough not to matter.  The system malloc is not without bookkeeping
+ * overhead of its own.
+ */
+Size
+sb_chunk_space(void *ptr)
+{
+	sb_region *region;
+	char   *fpm_base;
+	sb_span *span;
+	Size	pageno;
+	uint16	size_class;
+
+	/* Locate the containing superblock. */
+	region = sb_lookup_region(ptr);
+	fpm_base = fpm_segment_base(region->fpm);
+	pageno = fpm_pointer_to_page(fpm_base, ptr);
+	span = sb_map_get(region->pagemap, pageno);
+
+	/* Work out the size of the allocation. */	
+	size_class = span->size_class;
+	if (span->size_class == SB_SCLASS_SPAN_LARGE)
+		return FPM_PAGE_SIZE * span->npages;
+	else
+		return sb_size_classes[size_class];
+}
+
+/*
+ * Free all memory used by an allocator.
+ *
+ * NB: It's not safe to do this while the allocator is in use!
+ */
+void
+sb_reset_allocator(sb_allocator *a)
+{
+	char *base = NULL;
+	int heapno;
+
+	/*
+	 * For shared memory allocation, pointers are relative to the start of the
+	 * region.
+	 */
+	if (!a->private)
+	{
+		sb_region *region = sb_lookup_region(a);
+		if (region == NULL)
+			elog(ERROR, "sb_region not found");
+		base = region->region_start;
+	}
+
+	/*
+	 * Iterate through heaps back to front.  We do it this way so that
+	 * spans-of-spans are freed last.
+	 */
+	for (heapno = SB_NUM_SIZE_CLASSES - 1; heapno >= 0; --heapno)
+	{
+		sb_heap *heap = &a->heaps[heapno];
+		int		fclass;
+
+		for (fclass = 0; fclass < SB_FULLNESS_CLASSES; ++fclass)
+		{
+			sb_region *region;
+			char *superblock;
+			sb_span *span;
+
+			span = relptr_access(base, heap->spans[fclass]);
+			while (span != NULL)
+			{
+				Size	offset;
+				sb_span *nextspan;
+
+				superblock = relptr_access(base, span->start);
+				nextspan = relptr_access(base, span->nextspan);
+				region = sb_lookup_region(superblock);
+				Assert(region != NULL);
+				offset = superblock - fpm_segment_base(region->fpm);
+				Assert(offset % FPM_PAGE_SIZE == 0);
+				FreePageManagerPut(region->fpm, offset / FPM_PAGE_SIZE,
+								   span->npages);
+				span = nextspan;
+			}
+		}
+	}
+}
+
+/*
+ * Allocate an object of the requested size class from the given allocator.
+ * If necessary, steal or create another superblock.
+ */
+static char *
+sb_alloc_guts(char *base, sb_region *region, sb_allocator *a, int size_class)
+{
+	sb_heap *heap = &a->heaps[size_class];
+	LWLock *lock = relptr_access(base, heap->lock);
+	sb_span *active_sb;
+	char   *superblock;
+	char   *result;
+	Size	obsize;
+
+	/* If locking is in use, acquire the lock. */
+	if (lock != NULL)
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+
+	/*
+	 * If there's no active superblock, we must successfully obtain one or
+	 * fail the request.
+	 */
+	if (relptr_is_null(heap->spans[1])
+		&& !sb_ensure_active_superblock(base, region, a, heap, size_class))
+	{
+		if (lock != NULL)
+			LWLockRelease(lock);
+		return NULL;
+	}
+	Assert(!relptr_is_null(heap->spans[1]));
+
+	/*
+	 * There should be a superblock in fullness class 1 at this point, and
+	 * it should never be completely full.  Thus we can either pop the
+	 * free list or, failing that, initialize a new object.
+	 */
+	active_sb = relptr_access(base, heap->spans[1]);
+	Assert(active_sb != NULL && active_sb->nallocatable > 0);
+	superblock = relptr_access(base, active_sb->start);
+	Assert(size_class < SB_NUM_SIZE_CLASSES);
+	obsize = sb_size_classes[size_class];
+	if (active_sb->firstfree != SB_SPAN_NOTHING_FREE)
+	{
+		result = superblock + active_sb->firstfree * obsize;
+		active_sb->firstfree = * (Size *) result;
+	}
+	else
+	{
+		result = superblock + active_sb->ninitialized * obsize;
+		++active_sb->ninitialized;
+	}
+	--active_sb->nallocatable;
+
+	/* If it's now full, move it to the highest-numbered fullness class. */
+	if (active_sb->nallocatable == 0)
+		sb_transfer_first_span(base, heap, 1, SB_FULLNESS_CLASSES - 1);
+
+	/* We're all done.  Release the lock. */
+	if (lock != NULL)
+		LWLockRelease(lock);
+
+	return result;
+}
+
+/*
+ * Ensure an active (i.e. fullness class 1) superblock, unless all existing
+ * superblocks are completely full and no more can be allocated.
+ *
+ * Fullness classes K of 0..N is loosely intended to represent superblocks
+ * whose utilization percentage is at least K/N, but we only enforce this
+ * rigorously for the highest-numbered fullness class, which always contains
+ * exactly those blocks that are completely full.  It's otherwise acceptable
+ * for a superblock to be in a higher-numbered fullness class than the one
+ * to which it logically belongs.  In addition, the active superblock, which
+ * is always the first block in fullness class 1, is permitted to have a
+ * higher allocation percentage than would normally be allowable for that
+ * fullness class; we don't move it until it's completely full, and then
+ * it goes to the highest-numbered fullness class.
+ *
+ * It might seem odd that the active superblock is the head of fullness class
+ * 1 rather than fullness class 0, but experience with other allocators has
+ * shown that it's usually better to allocate from a superblock that's
+ * moderately full rather than one that's nearly empty.  Insofar as is
+ * reasonably possible, we want to avoid performing new allocations in a
+ * superblock that would otherwise become empty soon.
+ */
+static bool
+sb_ensure_active_superblock(char *base, sb_region *region, sb_allocator *a,
+							sb_heap *heap, int size_class)
+{
+	Size	obsize = sb_size_classes[size_class];
+	Size	nmax;
+	int		fclass;
+	sb_span *span = NULL;
+	Size	npages = 1;
+	Size	first_page;
+	Size	i;
+	void   *ptr;
+
+	/*
+	 * Compute the number of objects that will fit in a superblock of this
+	 * size class.  Span-of-spans superblocks are just a single page, and the
+	 * first object isn't available for use because it describes the
+	 * span-of-spans itself.
+	 */
+	if (size_class == SB_SCLASS_SPAN_OF_SPANS)
+		nmax = FPM_PAGE_SIZE / obsize - 1;
+	else
+ 		nmax = SB_SUPERBLOCK_SIZE / obsize;
+
+	/*
+	 * If fullness class 1 is empty, try to find something to put in it by
+	 * scanning higher-numbered fullness classes (excluding the last one,
+	 * whose blocks are certain to all be completely full).
+	 */
+	for (fclass = 2; fclass < SB_FULLNESS_CLASSES - 1; ++fclass)
+	{
+		sb_span *span;
+
+		span = relptr_access(base, heap->spans[fclass]);
+		while (span != NULL)
+		{
+			int		tfclass;
+			sb_span *nextspan;
+			sb_span *prevspan;
+
+			/* Figure out what fullness class should contain this. */
+			tfclass = (nmax - span->nallocatable)
+				* (SB_FULLNESS_CLASSES - 1) / nmax;
+
+			/* Look up next span. */
+			nextspan = relptr_access(base, span->nextspan);
+
+			/*
+			 * If utilization has dropped enough that this now belongs in
+			 * some other fullness class, move it there.
+			 */
+			if (tfclass < fclass)
+			{
+				prevspan = relptr_access(base, span->prevspan);
+
+				relptr_copy(span->nextspan, heap->spans[tfclass]);
+				relptr_store(base, span->prevspan, (sb_span *) NULL);
+				if (nextspan != NULL)
+					relptr_copy(nextspan->prevspan, span->prevspan);
+				if (prevspan != NULL)
+					relptr_copy(prevspan->nextspan, span->nextspan);
+				else
+					relptr_copy(heap->spans[fclass], span->nextspan);
+				span->fclass = tfclass;
+			}
+
+			/* Advance to next span on list. */
+			span = nextspan;
+		}
+
+		/* Stop now if we found a suitable superblock. */
+		if (!relptr_is_null(heap->spans[1]))
+			return true;
+	}
+
+	/*
+	 * If there are no superblocks that properly belong in fullness class 1,
+	 * pick one from some other fullness class and move it there anyway, so
+	 * that we have an allocation target.  Our last choice is to transfer a
+	 * superblock that's almost empty (and might become completely empty soon
+	 * if left alone), but even that is better than failing, which is what we
+	 * must do if there are no superblocks at all with freespace.
+	 */
+	Assert(relptr_is_null(heap->spans[1]));
+	for (fclass = 2; fclass < SB_FULLNESS_CLASSES - 1; ++fclass)
+		if (sb_transfer_first_span(base, heap, fclass, 1))
+			return true;
+	if (relptr_is_null(heap->spans[1]) &&
+		sb_transfer_first_span(base, heap, 0, 1))
+			return true;
+
+	/*
+	 * Get an sb_span object to describe the new superblock... unless
+	 * this allocation is for an sb_span object, in which case that's
+	 * surely not going to work.  We handle that case by storing the
+	 * sb_span describing an sb_span superblock inline.
+	 */
+	if (size_class != SB_SCLASS_SPAN_OF_SPANS)
+	{
+		sb_region *span_region = a->private ? NULL : region;
+
+		span = (sb_span *) sb_alloc_guts(base, span_region, a,
+										 SB_SCLASS_SPAN_OF_SPANS);
+		if (span == NULL)
+			return false;
+		npages = SB_PAGES_PER_SUPERBLOCK;
+	}
+
+	/* Find a region from which to allocate the superblock. */
+	if (region == NULL)
+	{
+		Assert(a->private);
+		region = sb_private_region_for_allocator(npages);
+	}
+
+	/* Try to allocate the actual superblock. */
+	if (region == NULL ||
+		!FreePageManagerGet(region->fpm, npages, &first_page))
+	{
+		/* XXX. Free the span, if any. */
+		return false;
+	}
+	ptr = fpm_page_to_pointer(fpm_segment_base(region->fpm), first_page);
+
+	/*
+	 * If this is a span-of-spans, carve the descriptor right out of
+	 * the allocated space.
+	 */
+	if (size_class == SB_SCLASS_SPAN_OF_SPANS)
+		span = (sb_span *) ptr;
+
+	/* Initialize span and pagemap. */
+	sb_init_span(base, span, heap, ptr, npages, size_class);
+	for (i = 0; i < npages; ++i)
+		sb_map_set(region->pagemap, first_page + i, span);
+
+	return true;
+}
+
+/*
+ * Add a new span to fullness class 1 of the indicated heap.
+ */
+static void
+sb_init_span(char *base, sb_span *span, sb_heap *heap, char *ptr,
+			 Size npages, uint16 size_class)
+{
+	sb_span *head = relptr_access(base, heap->spans[1]);
+	Size	obsize = sb_size_classes[size_class];
+
+	if (head != NULL)
+		relptr_store(base, head->prevspan, span);
+	relptr_store(base, span->parent, heap);
+	relptr_store(base, span->nextspan, head);
+	relptr_store(base, span->prevspan, (sb_span *) NULL);
+	relptr_store(base, heap->spans[1], span);
+	relptr_store(base, span->start, ptr);
+	span->npages = npages;
+	span->size_class = size_class;
+	span->ninitialized = 0;
+	if (size_class == SB_SCLASS_SPAN_OF_SPANS)
+	{
+		/*
+		 * A span-of-spans contains its own descriptor, so mark one object
+		 * as initialized and reduce the count of allocatable objects by one.
+		 * Doing this here has the side effect of also reducing nmax by one,
+		 * which is important to make sure we free this object at the correct
+		 * time.
+		 */
+		span->ninitialized = 1;
+ 		span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
+	}
+	else if (size_class != SB_SCLASS_SPAN_LARGE)
+ 		span->nallocatable = SB_SUPERBLOCK_SIZE / obsize;
+	span->firstfree = SB_SPAN_NOTHING_FREE;
+	span->nmax = span->nallocatable;
+	span->fclass = 1;
+}
+
+/*
+ * Report an out-of-memory condition.
+ */
+static void
+sb_out_of_memory_error(sb_allocator *a)
+{
+	if (a->private)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	else
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of shared memory")));
+}
+
+/*
+ * Transfer the first span in one fullness class to the head of another
+ * fullness class.
+ */
+static bool
+sb_transfer_first_span(char *base, sb_heap *heap, int fromclass, int toclass)
+{
+	sb_span *span;
+	sb_span *nextspan;
+
+	/* Can't do it if source list is empty. */
+	span = relptr_access(base, heap->spans[fromclass]);
+	if (span == NULL)
+		return false;
+
+	/* Remove span from source list. */
+	nextspan = relptr_access(base, span->nextspan);
+	relptr_store(base, heap->spans[fromclass], nextspan);
+	if (nextspan != NULL)
+		relptr_store(base, nextspan->prevspan, (sb_span *) NULL);
+
+	/* Add span to target list. */
+	relptr_copy(span->nextspan, heap->spans[toclass]);
+	relptr_store(base, heap->spans[toclass], span);
+	nextspan = relptr_access(base, span->nextspan);
+	if (nextspan != NULL)
+		relptr_store(base, nextspan->prevspan, span);
+	span->fclass = toclass;
+
+	return true;
+}
+
+/*
+ * Remove span from current list.
+ */
+static void
+sb_unlink_span(char *base, sb_heap *heap, sb_span *span)
+{
+	sb_span *nextspan = relptr_access(base, span->nextspan);
+	sb_span *prevspan = relptr_access(base, span->prevspan);
+
+	relptr_store(base, span->prevspan, (sb_span *) NULL);
+	if (nextspan != NULL)
+		relptr_copy(nextspan->prevspan, span->prevspan);
+	if (prevspan != NULL)
+		relptr_copy(prevspan->nextspan, span->nextspan);
+	else
+		relptr_copy(heap->spans[span->fclass], span->nextspan);
+}
diff --git a/src/backend/utils/mmgr/sb_map.c b/src/backend/utils/mmgr/sb_map.c
new file mode 100644
index 0000000..7c629df
--- /dev/null
+++ b/src/backend/utils/mmgr/sb_map.c
@@ -0,0 +1,137 @@
+/*-------------------------------------------------------------------------
+ *
+ * sb_map.c
+ *	  Superblock allocator page-mapping infrastructure.
+ *
+ * The superblock allocator does not store metadata with each chunk, and
+ * therefore needs a way to find the metadata given only the pointer
+ * address.  The first step is to translate the pointer address to a
+ * an offset relative to some base address, from which a page number
+ * can be calculated.  Then, this module is reponsible for mapping the
+ * page number to an offset with the chunk where the associated span
+ * object is stored.  We do this in the simplest possible way: one big
+ * array.
+ *
+ * Span metadata is stored within the same chunk of memory as the span
+ * itself.  Therefore, we can assume that the offset is less than 4GB
+ * whenever we're managing less than 4GB of pages, and use 4 byte
+ * offsets.  When we're managing more than 4GB of pages, we use 8 byte
+ * offsets.  (This could probably be optimized; for example, we could use
+ * 6 byte offsets for allocation sizes up to 256TB; also, if we assumed
+ * that the span object must itself be 2, 4, or 8 byte aligned, we could
+ * extend the cutoff point for offsets of any given length by a similar
+ * multiple.  It's not clear that the extra math would be worthwhile.)
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/utils/mmgr/sb_map.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "storage/shmem.h"
+#include "utils/freepage.h"
+#include "utils/sb_map.h"
+
+const uint64 maxpages_4b = UINT64CONST(0x100000000) / FPM_PAGE_SIZE;
+
+struct sb_map
+{
+	relptr(sb_map) self;
+	Size	offset;
+	Size	npages;
+	bool	use64;
+};
+
+/* Map layout for segments less than 4GB. */
+typedef struct sb_map32
+{
+	sb_map	hdr;
+	uint32	map[FLEXIBLE_ARRAY_MEMBER];
+} sb_map32;
+
+/* Map layout for segments less than 8GB. */
+typedef struct sb_map64
+{
+	sb_map	hdr;
+	uint64	map[FLEXIBLE_ARRAY_MEMBER];
+} sb_map64;
+
+#define sb_map_base(m) \
+	(((char *) m) - m->self.relptr_off)
+
+/*
+ * Compute the amount of space required for an sb_map covering a given
+ * number of pages.  Note that for shared memory (i.e. when base != NULL),
+ * we assume that the pointers will always point to addresses within that
+ * same segment, but for backend-private memory that might not be the case.
+ */
+Size
+sb_map_size(char *base, Size npages)
+{
+	Size	map_bytes;
+
+	if (sizeof(Size) <= 4 || (base != NULL && npages < maxpages_4b))
+		map_bytes = add_size(offsetof(sb_map32, map),
+							 mul_size(npages, sizeof(uint32)));
+	else
+		map_bytes = add_size(offsetof(sb_map64, map),
+							 mul_size(npages, sizeof(uint64)));
+
+	return map_bytes;
+}
+
+/*
+ * Initialize an sb_map.  Storage is provided by the caller.  Note that we
+ * don't zero the array; the caller shouldn't try to get a value that hasn't
+ * been set.
+ */
+void
+sb_map_initialize(sb_map *m, char *base, Size offset, Size npages)
+{
+	relptr_store(base, m->self, m);
+	m->offset = offset;
+	m->npages = npages;
+	if (sizeof(Size) <= 4 || (base != NULL && npages < maxpages_4b))
+		m->use64 = false;
+	else
+		m->use64 = true;
+}
+
+/*
+ * Store a value into an sb_map.
+ */
+void
+sb_map_set(sb_map *m, Size pageno, void *ptr)
+{
+	char   *base = sb_map_base(m);
+	Assert(pageno >= m->offset);
+	pageno -= m->offset;
+	Assert(pageno < m->npages);
+
+	if (m->use64)
+		((sb_map64 *) m)->map[pageno] = (uint64) (((char *) ptr) - base);
+	else
+		((sb_map32 *) m)->map[pageno] = (uint32) (((char *) ptr) - base);
+}
+
+/*
+ * Get a value from an sb_map.  Getting a value not previously stored will
+ * produce an undefined result, so don't do that.
+ */
+void *
+sb_map_get(sb_map *m, Size pageno)
+{
+	char   *base = sb_map_base(m);
+	Assert(pageno >= m->offset);
+	pageno -= m->offset;
+	Assert(pageno < m->npages);
+
+	if (m->use64)
+		return base + ((sb_map64 *) m)->map[pageno];
+	else
+		return base + ((sb_map32 *) m)->map[pageno];
+}
diff --git a/src/backend/utils/mmgr/sb_region.c b/src/backend/utils/mmgr/sb_region.c
new file mode 100644
index 0000000..1c51563
--- /dev/null
+++ b/src/backend/utils/mmgr/sb_region.c
@@ -0,0 +1,744 @@
+/*-------------------------------------------------------------------------
+ *
+ * sb_region.c
+ *	  Superblock allocator memory region manager.
+ *
+ * The superblock allocator operates on ranges of pages managed by a
+ * FreePageManager and reverse-mapped by an sb_map.  When it's asked to
+ * free an object, it just gets a pointer address; our job is to figure
+ * out which page range contains that object and locate the
+ * FreePageManager, sb_map, and other metadata that the superblock
+ * allocator will need to do its thing.  Moreover, when allocating an
+ * object, the caller is only required to provide the superblock allocator
+ * with a pointer to the sb_allocator object, which could be in either
+ * shared or backend-private memory; our job again is to know which it
+ * is and provide pointers to the appropriate supporting data structures.
+ * To do all this, we have to keep track of where all dynamic shared memory
+ * segments configured for memory allocation are located; and we also have
+ * to keep track of where all chunks of memory obtained from the operating
+ * system for backend-private allocations are located.
+ *
+ * On a 32-bit system, the number of chunks can never get very big, so
+ * we just store them all in a single array and use binary search for
+ * lookups.  On a 64-bit system, this might get dicey, so we maintain
+ * one such array for every 4GB of address space; chunks that span a 4GB
+ * boundary require multiple entries.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/utils/mmgr/sb_region.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/sb_region.h"
+
+/*
+ * On 64-bit systems, we use a two-level radix tree to find the data for
+ * the relevant 4GB range.  The radix tree is deliberately unbalanced, with
+ * more entries at the first level than at the second level.  We expect this
+ * to save memory, because the first level has a cache, and the full array
+ * is only instantiated if the cache overflows.  Since each L2 entry
+ * covers 2^44 bytes of address space (16TB), we expect overflows of the
+ * four-entry cache to happen essentially never.
+ */
+#define SB_LOOKUP_ROOT_BITS			20
+#define SB_LOOKUP_ROOT_ENTRIES		(1 << SB_LOOKUP_ROOT_BITS)
+#define SB_LOOKUP_ROOT_CACHE_SIZE	4
+#define SB_LOOKUP_L2_BITS			12
+#define SB_LOOKUP_L2_ENTRIES		(1 << SB_LOOKUP_L2_BITS)
+
+/* Lookup data for a 4GB range of address space. */
+typedef struct
+{
+	int		nused;
+	int		nallocated;
+	sb_region **region;
+} sb_lookup_leaf;
+
+/* Lookup data for a 16TB range of address space, direct mapped. */
+typedef struct
+{
+	sb_lookup_leaf *leaf[SB_LOOKUP_L2_ENTRIES];
+} sb_lookup_l2;
+
+/* Lookup data for an entire 64-bit address space. */
+typedef struct
+{
+	uint32	cache_key[SB_LOOKUP_ROOT_CACHE_SIZE];
+	sb_lookup_l2 *cache_value[SB_LOOKUP_ROOT_CACHE_SIZE];
+	sb_lookup_l2 **l2;
+} sb_lookup_root;
+
+/* Toplevel address lookup structure. */
+#if SIZEOF_SIZE_T > 4
+static sb_lookup_root lookup_root;
+#else
+static sb_lookup_leaf lookup_root_leaf;
+#endif
+
+/*
+ * Backend-private chunks binned by maximum contiguous freespace.  Lists are
+ * doubly-linked using fl_node.  List 0 contains regions with no internal
+ * no free pages at all.  List I, for I>0, contains regions where the number
+ * of contiguous free pages is no larger than 2^(I-1), except for the last
+ * list which contains everything with too many pages for any other list.
+ * A region may be on a higher-numbered list than where it actually belongs,
+ * but it cannot be any lower.  Thus it's safe to assume that searching
+ * lower-numbered lists is always pointless, but higher-numbered lists may
+ * contain regions that can't actually satisfy a requested allocation.
+ */
+#define NUM_PRIVATE_FREELISTS	16
+static dlist_head private_freelist[NUM_PRIVATE_FREELISTS];
+
+/*
+ * Constants to set the size of backend-private regions.  Superblocks are
+ * 16 pages each (64kB), and we want a number of superblocks to fit inside
+ * each region, so these need to be pretty good-sized.  The actual
+ * allocations will be a bit larger than the values indicated here, because
+ * we add a bit of space for bookkeeping.  These values are in units of
+ * FPM_PAGE_SIZE.
+ */
+#define SB_REGION_INITSIZE		(16 * SB_PAGES_PER_SUPERBLOCK)
+#define SB_REGION_MAXSIZE		((64 * 1024 * 1024) / FPM_PAGE_SIZE)
+
+static Size sb_private_pages_allocated = 0;
+static Size sb_private_bytes_allocated = 0;
+static Size sb_peak_private_bytes_allocated = 0;
+
+/* Static functions. */
+static bool sb_adjust_lookup(sb_region *region, bool insert);
+static bool sb_adjust_lookup_leaf(sb_lookup_leaf *leaf, sb_region *region,
+					  bool insert);
+static void sb_dump_regions_leaf(sb_region *last_region, sb_lookup_leaf *leaf);
+#if SIZEOF_SIZE_T > 4
+static sb_lookup_leaf *sb_find_leaf(Size highbits, bool insert);
+#endif
+static void *system_calloc(Size count, Size s);
+static void system_free(void *p, Size s);
+static void *system_malloc(Size s);
+
+/*
+ * Dump debugging information for sb_region objects.
+ */
+void
+sb_dump_regions(void)
+{
+#if SIZEOF_SIZE_T > 4
+	sb_region *last_region = NULL;
+
+	if (lookup_root.l2 != NULL)
+	{
+		int i;
+		int j;
+
+		for (i = 0; i < SB_LOOKUP_ROOT_ENTRIES; ++i)
+		{
+			sb_lookup_l2 *l2 = lookup_root.l2[i];
+
+			if (l2 == NULL)
+				continue;
+			for (j = 0; j < SB_LOOKUP_L2_ENTRIES; ++j)
+			{
+				sb_lookup_leaf *leaf = l2->leaf[j];
+
+				if (leaf != NULL)
+				{
+					sb_dump_regions_leaf(last_region, leaf);
+					last_region = leaf->region[leaf->nused - 1];
+				}
+			}
+		}
+	}
+	else
+	{
+		bool	first = true;
+		Size	highbits = 0;
+
+		for (;;)
+		{
+			int		i;
+			int		j;
+			int		n = -1;
+			sb_lookup_l2 *l2;
+
+			/* Find next L2 entry to visit. */
+			for (i = 0; i < SB_LOOKUP_ROOT_CACHE_SIZE; ++i)
+			{
+				if (lookup_root.cache_value[i] != NULL &&
+					(first || lookup_root.cache_key[i] > highbits))
+					n = i;
+			}
+			if (n == -1)
+				break;
+			first = false;
+			highbits = lookup_root.cache_key[n];
+
+			/* Dump this L2 entry. */
+			l2 = lookup_root.cache_value[n];
+			for (j = 0; j < SB_LOOKUP_L2_ENTRIES; ++j)
+			{
+				sb_lookup_leaf *leaf = l2->leaf[j];
+
+				if (leaf != NULL)
+				{
+					sb_dump_regions_leaf(last_region, leaf);
+					last_region = leaf->region[leaf->nused - 1];
+				}
+			}
+		}
+	}
+#else
+	sb_dump_regions_leaf(NULL, lookup_root_leaf);
+#endif
+
+	fprintf(stderr, "== overall statistics ==\n");
+	fprintf(stderr, "private bytes now: %zu, peak %zu\n",
+		sb_private_bytes_allocated,
+		Max(sb_private_bytes_allocated, sb_peak_private_bytes_allocated));
+}
+
+/*
+ * Find the region to which a pointer belongs.
+ */
+sb_region *
+sb_lookup_region(void *ptr)
+{
+	Size p = (Size) ptr;
+	sb_lookup_leaf *leaf = NULL;
+	int		high, low;
+
+	/*
+	 * If this is a 64-bit system, locate the lookup table that pertains
+	 * to the upper 32 bits of ptr.  On a 32-bit system, there's only one
+	 * lookup table.
+	 */
+#if SIZEOF_SIZE_T > 4
+	{
+		Size	highbits = p >> 32;
+		static Size last_highbits = 0;
+		static sb_lookup_leaf *last_leaf = NULL;
+
+		/* Quick test to see if we're in same range as before. */
+		if (last_highbits == highbits && last_leaf != NULL)
+			leaf = last_leaf;
+		else
+		{
+			leaf = sb_find_leaf(highbits, false);
+
+			/* No lookup table for this 4GB range?  OK, no matching region. */
+			if (leaf == NULL)
+				return NULL;
+
+			/* Remember results of this lookup for next time. */
+			last_highbits = highbits;
+			last_leaf = leaf;
+		}
+	}
+#else
+	leaf = &lookup_root_leaf;
+#endif
+
+	/* Now we use binary search on the sb_lookup_leaf. */
+	high = leaf->nused;
+	low = 0;
+	while (low < high)
+	{
+		int mid;
+		sb_region *region;
+
+		mid = (high + low) / 2;
+		region = leaf->region[mid];
+		if (region->region_start > (char *) ptr)
+			high = mid;
+		else if (region->region_start + region->region_size < (char *) ptr)
+			low = mid + 1;
+		else
+			return region;
+	}
+
+	return NULL;
+}
+
+/*
+ * When a backend-private sb_allocator needs more memory, it calls this
+ * function.  We search the existing backend-private regions for one capable
+ * of satisfying the request; if none found, we must create a new region.
+ */
+sb_region *
+sb_private_region_for_allocator(Size npages)
+{
+	int freelist = Min(fls(npages), NUM_PRIVATE_FREELISTS);
+	Size	new_region_net_pages;
+	Size	metadata_bytes;
+	char   *region_start;
+	Size	region_size;
+	sb_region *region;
+
+	Assert(npages > 0);
+
+	while (freelist < NUM_PRIVATE_FREELISTS)
+	{
+		dlist_mutable_iter	iter;
+		Size		threshold = 1 << (freelist - 1);
+
+		dlist_foreach_modify(iter, &private_freelist[freelist])
+		{
+			sb_region  *region;
+			Size	largest;
+
+			region = dlist_container(sb_region, fl_node, iter.cur);
+
+			/*
+			 * Quickly skip regions which appear to have enough space to
+			 * belong on this freelist but which don't have enough space to
+			 * satisfy the request, to avoid probing every region on the list
+			 * for its exact free space on every trip through.
+			 */
+			if (region->contiguous_pages >= threshold &&
+				region->contiguous_pages < npages)
+				continue;
+
+			/*
+			 * We're going to either use this region or move it to a
+			 * lower-numbered freelist or both, so determine the precise size
+			 * of the largest remaining run of pages.
+			 */
+			largest = FreePageManagerInquireLargest(region->fpm);
+			region->contiguous_pages = largest;
+
+			/*
+			 * The region we're examining not only doesn't have enough
+			 * contiguous freespace to satisfy this allocation, but it
+			 * doesn't even belong in this bucket.  Move it to the right place.
+			 */
+			if (largest < threshold)
+			{
+				int	new_freelist = Min(fls(largest), NUM_PRIVATE_FREELISTS);
+
+				dlist_delete(iter.cur);
+				dlist_push_head(&private_freelist[new_freelist],
+								&region->fl_node);
+			}
+
+			/*
+			 * If the region is big enough, use it.  For larger allocations
+			 * this might be suboptimal, because we might carve space out of a
+			 * chunk that's bigger than we really need rather than locating
+			 * the best fit across all chunks.  It shouldn't be too far off,
+			 * though, because chunks with way more contiguous space available
+			 * will be on a higher-numbered freelist.
+			 *
+			 * NB: For really large backend-private allocations, it's probably
+			 * better to malloc() directly than go through this machinery.
+			 */
+			if (largest >= npages)
+				return region;
+		}
+
+		/* Try next freelist. */
+		++freelist;
+	}
+
+	/*
+	 * There is no existing backend-private region with enough freespace
+	 * to satisfy the request, so we'll need to create a new one.  First
+	 * step is to figure out how many pages we should try to obtain.
+	 */
+	for (new_region_net_pages = SB_REGION_INITSIZE;
+		 new_region_net_pages < sb_private_pages_allocated &&
+		 new_region_net_pages < SB_REGION_MAXSIZE; new_region_net_pages *= 2)
+		;
+	if (new_region_net_pages < npages)
+		new_region_net_pages = npages;
+
+	/* Try to allocate space from the operating system. */
+	for (;;)
+	{
+		/*
+		 * Compute space required for metadata and determine raw allocation
+		 * size.
+		 */
+		metadata_bytes = MAXALIGN(sizeof(sb_region));
+		metadata_bytes += MAXALIGN(sizeof(FreePageManager));
+		metadata_bytes += MAXALIGN(sb_map_size(NULL, new_region_net_pages));
+		if (metadata_bytes % FPM_PAGE_SIZE != 0)
+			metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+		region_size = new_region_net_pages * FPM_PAGE_SIZE + metadata_bytes;
+
+		/* Try to allocate memory. */
+		region_start = system_malloc(region_size);
+		if (region_start != NULL)
+			break;
+
+		/* Too big; if possible, loop and try a smaller allocation. */
+		if (new_region_net_pages == npages)
+			return NULL;
+		new_region_net_pages = Max(new_region_net_pages / 2, npages);
+	}
+
+	/*
+	 * Initialize region object.
+	 *
+	 * NB: We temporarily set region->contiguous_pages to a value one more
+	 * than the actual number.  This is because calling FreePageManagerPut
+	 * will provoke a callback to sb_report_contiguous_freespace, which we
+	 * want to exit quickly and, in particular, without deallocating the
+	 * region.
+	 */
+	region = (sb_region *) region_start;
+	region->region_start = region_start;
+	region->region_size = region_size;
+	region->usable_pages = new_region_net_pages;
+	sb_private_pages_allocated += region->usable_pages;
+	region->seg = NULL;
+	region->allocator = NULL;
+	region->fpm = (FreePageManager *)
+		(region_start + MAXALIGN(sizeof(sb_region)));
+	region->pagemap = (sb_map *)
+		(((char *) region->fpm) + MAXALIGN(sizeof(FreePageManager)));
+	region->contiguous_pages = new_region_net_pages + 1;
+
+	/* Initialize supporting data structures. */
+	FreePageManagerInitialize(region->fpm, region->region_start, NULL, false);
+	FreePageManagerPut(region->fpm, metadata_bytes / FPM_PAGE_SIZE,
+					   new_region_net_pages);
+	sb_map_initialize(region->pagemap, NULL, metadata_bytes / FPM_PAGE_SIZE,
+					  new_region_net_pages);
+	region->contiguous_pages = new_region_net_pages; /* Now fix the value. */
+	freelist = Min(fls(new_region_net_pages), NUM_PRIVATE_FREELISTS);
+	dlist_push_head(&private_freelist[freelist], &region->fl_node);
+	sb_adjust_lookup(region, true);
+
+	/* Time to rock and roll. */
+	return region;
+}
+
+/*
+ * When a free page manager detects that the maximum contiguous freespace in
+ * a backend-private region has increased, it calls this function.  Our job
+ * is to free the region completely if there are no remaining allocatons,
+ * and otherwise to 
+ */
+void
+sb_report_contiguous_freespace(sb_region *region, Size npages)
+{
+	int		old_freelist;
+	int		new_freelist;
+
+	/* This should only be called for private regions. */
+	Assert(region->seg == NULL);
+	Assert(region->allocator == NULL);
+
+	/*
+	 * If there have been allocations from the region since the last report,
+	 * it's possible that the number of pages reported is less than what we
+	 * already know about.  In that case, exit quickly; else update our
+	 * cached value.
+	 */
+	if (npages < region->contiguous_pages)
+		return;
+
+	/*
+	 * If the entire region is free, deallocate it.  The sb_region,
+	 * FreePageManager, and sb_map for the region is stored within it, so
+	 * they all go away when we free the managed space.
+	 */
+	if (npages == region->usable_pages)
+	{
+		char   *region_start = region->region_start;
+		Size	region_size = region->region_size;
+
+		/* Pull the region out of the lookup table. */
+		sb_adjust_lookup(region, false);
+
+		/* Remove the region object from the private freelist. */
+		dlist_delete(&region->fl_node);
+
+		/* Decrement count of private pages allocated. */
+		Assert(sb_private_pages_allocated >= region->usable_pages);
+		sb_private_pages_allocated -= region->usable_pages;
+
+		/* Return the managed space to the operating system. */
+		system_free(region_start, region_size);
+		return;
+	}
+
+	/* If necessary, move the region to a higher-numbered freelist. */
+	old_freelist = Min(fls(region->contiguous_pages), NUM_PRIVATE_FREELISTS);
+	new_freelist = Min(fls(npages), NUM_PRIVATE_FREELISTS);
+	if (new_freelist > old_freelist)
+	{
+		dlist_delete(&region->fl_node);
+		dlist_push_head(&private_freelist[new_freelist], &region->fl_node);
+	}
+
+	/* Record the reported value for future calls to this function. */
+	region->contiguous_pages = npages;
+}
+
+/*
+ * Insert a region into, or delete a region from, the address-based lookup
+ * tables.  Returns true on success and false if we fail due to memory
+ * exhaustion; delete always succeeds.
+ */
+static bool
+sb_adjust_lookup(sb_region *region, bool insert)
+{
+	bool	ok = true;
+
+	/*
+	 * If this is a 64-bit system, we need to loop over all of the relevant
+	 * tables and update each one.  On a 32-bit system, there's only one table
+	 * and we simply update that.
+	 */
+#if SIZEOF_SIZE_T > 4
+	Size	tabstart;
+	Size	tabstop;
+	Size	i;
+
+	tabstart = ((Size) region->region_start) >> 32;
+	tabstop = ((Size) region->region_start + region->region_size - 1) >> 32;
+
+	for (i = tabstart; i <= tabstop; ++i)
+	{
+		sb_lookup_leaf *leaf = sb_find_leaf(i, insert);
+
+		/*
+		 * Finding the leaf might fail if we're inserting and can't allocate
+		 * memory for a new lookup table.  Even if we get the leaf, inserting
+		 * the new region pointer into it might also fail for lack of memory.
+		 */
+		Assert(insert || leaf != NULL);
+		if (leaf == NULL)
+			ok = false;
+		else
+			ok = sb_adjust_lookup_leaf(leaf, region, insert);
+
+		if (!ok)
+		{
+			/* We ran out of memory; back out changes already made. */
+			ok = false;
+			tabstop = i - 1;
+			for (i = tabstart; i <= tabstop; ++i)
+				sb_adjust_lookup_leaf(sb_find_leaf(i, false), region, false);
+			break;
+		}
+	}
+#else
+	ok = sb_adjust_lookup_leaf(&lookup_root_leaf, region, insert);
+#endif
+
+	return ok;
+}
+
+/*
+ * Insert a region into, or remove a region from, a particular sb_lookup_leaf.
+ * Returns true on success and false if we fail due to memory exhaustion;
+ * delete always succeeds.
+ */
+static bool
+sb_adjust_lookup_leaf(sb_lookup_leaf *leaf, sb_region *region, bool insert)
+{
+	int		high, low;
+
+	/* If we're inserting, we might need to allocate more space. */
+	if (insert && leaf->nused >= leaf->nallocated)
+	{
+		Size	newsize;
+		sb_region **newtab;
+
+		newsize = leaf->nallocated == 0 ? 16 : leaf->nallocated * 2;
+		newtab = system_malloc(sizeof(sb_region *) * newsize);
+		if (newtab == NULL)
+			return false;
+		if (leaf->nused > 0)
+			memcpy(newtab, leaf->region, sizeof(sb_region *) * leaf->nused);
+		if (leaf->region != NULL)
+			system_free(leaf->region, sizeof(sb_region *) * leaf->nallocated);
+		leaf->nallocated = newsize;
+		leaf->region = newtab;
+	}
+
+	/* Use binary search on the sb_lookup_leaf. */
+	high = leaf->nused;
+	low = 0;
+	while (low < high)
+	{
+		int mid;
+		sb_region *candidate;
+
+		mid = (high + low) / 2;
+		candidate = leaf->region[mid];
+		if (candidate->region_start > region->region_start)
+			high = mid;
+		else if (candidate->region_start < region->region_start)
+			low = mid + 1;
+		else
+			low = high = mid;
+	}
+
+	/* Really do it. */
+	if (insert)
+	{
+		Assert(low == leaf->nused || 
+				leaf->region[low]->region_start > region->region_start);
+		if (low < leaf->nused)
+			memmove(&leaf->region[low + 1], &leaf->region[low],
+					sizeof(sb_region *) * (leaf->nused - low));
+		leaf->region[low] = region;
+		++leaf->nused;
+	}
+	else
+	{
+		Assert(leaf->region[low] == region);
+		if (low < leaf->nused - 1)
+			memmove(&leaf->region[low], &leaf->region[low + 1],
+					sizeof(sb_region *) * (leaf->nused - low - 1));
+		--leaf->nused;		
+	}
+
+	return true;
+}
+
+/*
+ * Dump debugging information for the regions covered by a single
+ * sb_lookup_leaf.  Skip the first one if it's the same as last_region.
+ */
+static void
+sb_dump_regions_leaf(sb_region *last_region, sb_lookup_leaf *leaf)
+{
+	int i;
+
+	for (i = 0; i < leaf->nused; ++i)
+	{
+		sb_region *region = leaf->region[i];
+
+		if (i == 0 && region == last_region)
+			continue;
+		fprintf(stderr, "== region at %p [%zu bytes, %zu usable pages] ==\n",
+				region->region_start, region->region_size,
+				region->usable_pages);
+		fprintf(stderr, "%s\n\n", FreePageManagerDump(region->fpm));
+	}
+}
+
+#if SIZEOF_SIZE_T > 4
+static sb_lookup_leaf *
+sb_find_leaf(Size highbits, bool insert)
+{
+	Size	rootbits;
+	sb_lookup_l2 *l2 = NULL;
+	sb_lookup_leaf **leafptr;
+	int	i;
+	int unused = -1;
+
+	rootbits = (highbits >> SB_LOOKUP_L2_BITS) & (SB_LOOKUP_ROOT_ENTRIES - 1);
+
+	/* Check for L2 entry in toplevel cache. */
+	for (i = 0; i < SB_LOOKUP_ROOT_CACHE_SIZE; ++i)
+	{
+		if (lookup_root.cache_value[i] == NULL)
+			unused = i;
+		else if (lookup_root.cache_key[i] == rootbits)
+			l2 = lookup_root.cache_value[i];
+	}
+
+	/* If no hit, check the full L2 loookup table, if it's been initialized. */
+	if (l2 == NULL && lookup_root.l2 != NULL)
+	{
+		rootbits &= SB_LOOKUP_ROOT_ENTRIES - 1;
+		l2 = lookup_root.l2[rootbits];
+
+	 	/* Pull entry into cache. */
+		if (l2 != NULL)
+		{
+			/*
+			 * No need to be smart about replacement policy; we expect to
+			 * arrive here virtually never.
+			 */
+			i = highbits % SB_LOOKUP_ROOT_CACHE_SIZE;
+			lookup_root.cache_key[i] = highbits;
+			lookup_root.cache_value[i] = l2;
+		}
+	}
+
+	/* If no L2 entry found, create one if inserting else give up. */
+	if (l2 == NULL)
+	{
+		if (!insert)
+			return NULL;
+		l2 = system_calloc(1, sizeof(sb_lookup_l2));
+		if (l2 == NULL)
+			return NULL;
+		if (unused != -1)
+		{
+			lookup_root.cache_key[unused] = rootbits;
+			lookup_root.cache_value[unused] = l2;
+		}
+		else if (lookup_root.l2 != NULL)
+			lookup_root.l2[rootbits] = l2;
+		else
+		{
+			lookup_root.l2 = system_calloc(SB_LOOKUP_ROOT_ENTRIES,
+									sizeof(sb_lookup_l2 *));
+			if (lookup_root.l2 == NULL)
+			{
+				system_free(l2, sizeof(sb_lookup_l2));
+				return NULL;
+			}
+			for (i = 0; i < SB_LOOKUP_ROOT_CACHE_SIZE; ++i)
+				lookup_root.l2[lookup_root.cache_key[i]] =
+					lookup_root.cache_value[i];
+		}
+	}
+
+	/* Find slot for entry, and try to initialize it if needed. */
+	leafptr = &l2->leaf[highbits & (SB_LOOKUP_L2_ENTRIES - 1)];
+	if (insert && *leafptr == NULL)
+		*leafptr = system_calloc(1, sizeof(sb_lookup_leaf));
+
+	return *leafptr;
+}
+#endif
+
+/*
+ * calloc() wrapper, to track bytes allocated.
+ */
+static void *
+system_calloc(Size count, Size s)
+{
+	void *p = calloc(count, s);
+
+	if (p != NULL)
+		sb_private_bytes_allocated += count * s;
+	return p;
+}
+
+/*
+ * free() wrapper, to track bytes allocated.
+ */
+static void
+system_free(void *p, Size s)
+{
+	free(p);
+	if (sb_private_bytes_allocated > sb_peak_private_bytes_allocated)
+		sb_peak_private_bytes_allocated = sb_private_bytes_allocated;
+	sb_private_bytes_allocated -= s;
+}
+
+/*
+ * malloc() wrapper, to track bytes allocated.
+ */
+static void *
+system_malloc(Size s)
+{
+	void *p = malloc(s);
+
+	if (p != NULL)
+		sb_private_bytes_allocated += s;
+	return p;
+}
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 9e209ae..1983d0f 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -14,6 +14,7 @@
 #include "storage/sinval.h"
 #include "utils/hsearch.h"
 #include "utils/relcache.h"
+#include "utils/sb_alloc.h"
 #include "utils/snapshot.h"
 #include "utils/timestamp.h"
 
@@ -326,31 +327,10 @@ struct ReorderBuffer
 	void	   *private_data;
 
 	/*
-	 * Private memory context.
+	 * Private memory context and allocator.
 	 */
 	MemoryContext context;
-
-	/*
-	 * Data structure slab cache.
-	 *
-	 * We allocate/deallocate some structures very frequently, to avoid bigger
-	 * overhead we cache some unused ones here.
-	 *
-	 * The maximum number of cached entries is controlled by const variables
-	 * on top of reorderbuffer.c
-	 */
-
-	/* cached ReorderBufferTXNs */
-	dlist_head	cached_transactions;
-	Size		nr_cached_transactions;
-
-	/* cached ReorderBufferChanges */
-	dlist_head	cached_changes;
-	Size		nr_cached_changes;
-
-	/* cached ReorderBufferTupleBufs */
-	slist_head	cached_tuplebufs;
-	Size		nr_cached_tuplebufs;
+	sb_allocator *allocator;
 
 	XLogRecPtr	current_restart_decoding_lsn;
 
diff --git a/src/include/utils/freepage.h b/src/include/utils/freepage.h
new file mode 100644
index 0000000..dd905d7
--- /dev/null
+++ b/src/include/utils/freepage.h
@@ -0,0 +1,101 @@
+/*-------------------------------------------------------------------------
+ *
+ * freepage.h
+ *	  Management of page-organized free memory.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/freepage.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef FREEPAGE_H 
+#define FREEPAGE_H
+
+#include "storage/lwlock.h"
+#include "utils/relptr.h"
+
+/* Forward declarations. */
+typedef struct FreePageSpanLeader FreePageSpanLeader;
+typedef struct FreePageBtree FreePageBtree;
+typedef struct FreePageManager FreePageManager;
+
+/*
+ * PostgreSQL normally uses 8kB pages for most things, but many common
+ * architecture/operating system pairings use a 4kB page size for memory
+ * allocation, so we do that here also.  We assume that a large allocation
+ * is likely to begin on a page boundary; if not, we'll discard bytes from
+ * the beginning and end of the object and use only the middle portion that
+ * is properly aligned.  This works, but is not ideal, so it's best to keep
+ * this conservatively small.  There don't seem to be any common architectures
+ * where the page size is less than 4kB, so this should be good enough; also,
+ * making it smaller would increase the space consumed by the address space
+ * map, which also uses this page size.
+ */
+#define FPM_PAGE_SIZE			4096
+
+/*
+ * Each freelist except for the last contains only spans of one particular
+ * size.  Everything larger goes on the last one.  In some sense this seems
+ * like a waste since most allocations are in a few common sizes, but it
+ * means that small allocations can simply pop the head of the relevant list
+ * without needing to worry about whether the object we find there is of
+ * precisely the correct size (because we know it must be).
+ */
+#define FPM_NUM_FREELISTS		129
+
+/* Everything we need in order to manage free pages (see freepage.c) */
+struct FreePageManager
+{
+	relptr(FreePageManager)  self;
+	relptr(LWLock)  lock;
+	bool			lock_address_is_fixed;
+	relptr(FreePageBtree)   btree_root;
+	relptr(FreePageSpanLeader)	btree_recycle;
+	unsigned		btree_depth;
+	unsigned		btree_recycle_count;
+	Size			singleton_first_page;
+	Size			singleton_npages;
+	Size			largest_reported_chunk;
+	relptr(FreePageSpanLeader)  freelist[FPM_NUM_FREELISTS];
+};
+
+/* Macros to convert between page numbers (expressed as Size) and pointers. */
+#define fpm_page_to_pointer(base, page)	\
+	(AssertVariableIsOfTypeMacro(page, Size), \
+	 (base) + FPM_PAGE_SIZE * (page))
+#define fpm_pointer_to_page(base, ptr)		\
+	(((Size) (((char *) (ptr)) - (base))) / FPM_PAGE_SIZE)
+
+/* Macro to convert an allocation size to a number of pages. */
+#define fpm_size_to_pages(sz) \
+	(TYPEALIGN((sz), FPM_PAGE_SIZE))
+
+/* Macros to check alignment of absolute and relative pointers. */
+#define fpm_pointer_is_page_aligned(base, ptr)		\
+	(((Size) (((char *) (ptr)) - (base))) % FPM_PAGE_SIZE == 0)
+#define fpm_relptr_is_page_aligned(base, relptr)		\
+	((relptr).relptr_off % FPM_PAGE_SIZE == 0)
+
+/* Macro to find base address of the segment containing a FreePageManager. */
+#define fpm_segment_base(fpm)	\
+	(((char *) fpm) - fpm->self.relptr_off)
+
+/* Macro to find the lwlock for the FreePageManager. */
+#define fpm_lock(fpm) \
+	(relptr_access((fpm)->lock_address_is_fixed ? NULL : \
+		fpm_segment_base(fpm), (fpm)->lock))
+
+/* Functions to manipulate the free page map. */
+extern void FreePageManagerInitialize(FreePageManager *fpm, char *base,
+						  LWLock *lock, bool lock_address_is_fixed);
+extern bool FreePageManagerGet(FreePageManager *fpm, Size npages,
+						Size *first_page);
+extern void FreePageManagerPut(FreePageManager *fpm, Size first_page,
+						Size npages);
+extern Size FreePageManagerInquireLargest(FreePageManager *fpm);
+extern char *FreePageManagerDump(FreePageManager *fpm);
+
+#endif   /* FREEPAGE_H */
diff --git a/src/include/utils/relptr.h b/src/include/utils/relptr.h
new file mode 100644
index 0000000..46281cf
--- /dev/null
+++ b/src/include/utils/relptr.h
@@ -0,0 +1,43 @@
+/*-------------------------------------------------------------------------
+ *
+ * relptr.h
+ *	  This file contains basic declarations for relative pointers.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/relptr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef RELPTR_H
+#define RELPTR_H
+
+/*
+ * Relative pointers are intended to be used when storing an address that may
+ * be relative either to the base of the processes address space or some
+ * dynamic shared memory segment mapped therein.
+ *
+ * The idea here is that you declare a relative pointer as relptr(type)
+ * and then use relptr_access to dereference it and relptr_store to change
+ * it.  The use of a union here is a hack, because what's stored in the
+ * relptr is always a Size, never an actual pointer.  But including a pointer
+ * in the union allows us to use stupid macro tricks to provide some measure
+ * of type-safety.
+ */
+#define relptr(type)     union { type *relptr_type; Size relptr_off; }
+#define relptr_access(base, rp) \
+	(AssertVariableIsOfTypeMacro(base, char *), \
+	 (__typeof__((rp).relptr_type)) ((rp).relptr_off == 0 ? NULL : \
+		(base + (rp).relptr_off)))
+#define relptr_is_null(rp) \
+	((rp).relptr_off == 0)
+#define relptr_store(base, rp, val) \
+	(AssertVariableIsOfTypeMacro(base, char *), \
+	 AssertVariableIsOfTypeMacro(val, __typeof__((rp).relptr_type)), \
+	 (rp).relptr_off = ((val) == NULL ? 0 : ((char *) (val)) - (base)))
+#define relptr_copy(rp1, rp2) \
+	((rp1).relptr_off = (rp2).relptr_off)
+
+#endif   /* RELPTR_H */
diff --git a/src/include/utils/sb_alloc.h b/src/include/utils/sb_alloc.h
new file mode 100644
index 0000000..07b6a57
--- /dev/null
+++ b/src/include/utils/sb_alloc.h
@@ -0,0 +1,79 @@
+/*-------------------------------------------------------------------------
+ *
+ * sb_alloc.h
+ *	  Superblock-based memory allocator.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/sb_alloc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SB_ALLOC_H
+#define SB_ALLOC_H
+
+#include "storage/lwlock.h"
+#include "utils/relptr.h"
+
+typedef struct sb_span sb_span;
+
+/*
+ * Superblocks are binned by how full they are.  Generally, each fullness
+ * class corresponds to one quartile, but the superblock being used for
+ * allocations is always at the head of the list for fullness class 1,
+ * regardless of how full it really is.
+ *
+ * For large objects, we just stick all of the allocations in fullness class
+ * 0. Since we can just return the space directly to the free page manager,
+ * we don't really need them on a list at all, except that if someone wants
+ * to bulk release everything allocated using this sb_allocator, we have no
+ * other way of finding them.
+ */
+#define SB_FULLNESS_CLASSES		4
+
+/*
+ * An sb_heap represents a set of allocations of a given size class.
+ * There can be multiple heaps for the same size class for contention
+ * avoidance.
+ */
+typedef struct sb_heap
+{
+	relptr(LWLock)	lock;
+	relptr(sb_span) spans[SB_FULLNESS_CLASSES];
+} sb_heap;
+
+/*
+ * An sb_allocator is basically just a group of heaps, one per size class.
+ * If locking is required, then we've also got an array of LWLocks, one per
+ * heap.
+ */
+typedef struct sb_allocator
+{
+	bool	private;
+	relptr(LWLock) locks;
+	sb_heap	heaps[FLEXIBLE_ARRAY_MEMBER];
+} sb_allocator;
+
+/* Pages per superblock (in units of FPM_PAGE_SIZE). */
+#define SB_PAGES_PER_SUPERBLOCK		16
+
+/* Allocation options. */
+#define SB_ALLOC_HUGE				0x0001		/* allow >=1GB */
+#define SB_ALLOC_SOFT_FAIL			0x0002		/* return NULL if no mem */
+
+/* Functions to manipulate allocators. */
+extern sb_allocator *sb_create_private_allocator(void);
+extern void sb_reset_allocator(sb_allocator *a);
+extern void sb_destroy_private_allocator(sb_allocator *a);
+
+/* Functions to allocate and free memory. */
+extern void *sb_alloc(sb_allocator *a, Size, int flags);
+extern void sb_free(void *ptr);
+
+/* Reporting functions. */
+extern Size sb_alloc_space(Size size);
+extern Size sb_chunk_space(void *ptr);
+
+#endif		/* SB_ALLOC_H */
diff --git a/src/include/utils/sb_map.h b/src/include/utils/sb_map.h
new file mode 100644
index 0000000..519bf52
--- /dev/null
+++ b/src/include/utils/sb_map.h
@@ -0,0 +1,24 @@
+/*-------------------------------------------------------------------------
+ *
+ * sb_map.h
+ *	  Superblock allocator page-mapping infrastructure.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/sb_map.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SB_MAP_H
+#define SB_MAP_H
+
+typedef struct sb_map sb_map;
+
+extern Size sb_map_size(char *base, Size npages);
+extern void sb_map_initialize(sb_map *, char *base, Size offset, Size npages);
+extern void sb_map_set(sb_map *, Size pageno, void *ptr);
+extern void *sb_map_get(sb_map *, Size pageno);
+
+#endif /* SB_MAP_H */
diff --git a/src/include/utils/sb_region.h b/src/include/utils/sb_region.h
new file mode 100644
index 0000000..5bb01f3
--- /dev/null
+++ b/src/include/utils/sb_region.h
@@ -0,0 +1,68 @@
+/*-------------------------------------------------------------------------
+ *
+ * sb_region.h
+ *	  Superblock allocator memory region manager.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/sb_region.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SB_REGION_H
+#define SB_REGION_H
+
+#include "lib/ilist.h"
+#include "storage/dsm.h"
+#include "storage/shm_toc.h"
+#include "utils/freepage.h"
+#include "utils/sb_alloc.h"
+#include "utils/sb_map.h"
+
+/*
+ * An sb_region is a backend-private object used to track allocatable regions
+ * of memory, either backend-private or shared.
+ */
+typedef struct sb_region
+{
+	char *region_start;			/* Address of region. */
+	Size region_size;			/* Number of bytes in region. */
+	Size usable_pages;			/* Number of usable pages in region. */
+	dsm_segment *seg;			/* If not backend-private, DSM handle. */
+	sb_allocator *allocator;	/* If not backend-private, shared allocator. */
+	FreePageManager *fpm;		/* Free page manager for region (if any). */
+	sb_map *pagemap;			/* Page map for region (if any). */
+	Size contiguous_pages;		/* Last reported contiguous free pages. */
+	dlist_node fl_node;			/* Freelist links. */
+} sb_region;
+
+/*
+ * An sb_shared_region is a shared-memory object containing the information
+ * necessary to set up an sb_region object for an individual backend.
+ */
+typedef struct sb_shared_region
+{
+	relptr(FreePageManager) fpm;
+	relptr(sb_map) pagemap;
+	relptr(sb_allocator) allocator;
+	int	lwlock_tranche_id;
+	char lwlock_tranche_name[FLEXIBLE_ARRAY_MEMBER];
+} sb_shared_region;
+
+/* Public API. */
+extern sb_shared_region *sb_create_shared_region(dsm_segment *seg,
+						shm_toc *toc, Size size,
+						int lwlock_tranche_id,
+						char *lwlock_tranche_name);
+extern sb_allocator *sb_attach_shared_region(dsm_segment *,
+						sb_shared_region *);
+extern void sb_dump_regions(void);
+
+/* For internal use by cooperating modules. */
+extern sb_region *sb_lookup_region(void *);
+extern sb_region *sb_private_region_for_allocator(Size npages);
+extern void sb_report_contiguous_freespace(sb_region *, Size npages);
+
+#endif		/* SB_REGION_H */
#24Petr Jelinek
petr@2ndquadrant.com
In reply to: Tomas Vondra (#23)
Re: PATCH: two slab-like memory allocators

On 23/10/16 16:26, Tomas Vondra wrote:

On 10/22/2016 08:30 PM, Tomas Vondra wrote:

On 10/20/2016 04:43 PM, Robert Haas wrote:

...

The sb_alloc allocator I proposed a couple of years ago would work
well for this case, I think.

Maybe, but it does not follow the Memory Context design at all, if I
understand it correctly. I was willing to give it a spin anyway and see
how it compares to the two other allocators, but this is a significant
paradigm shift and certainly much larger step than what I proposed.

I'm not even sure it's possible to implement a MemoryContext based on
the same ideas as sb_alloc(), because one of the important points of
sb_alloc design seems to be throwing away the chunk header. While that
may be possible, it would certainly affect the whole tree (not just the
reorderbuffer bit), and it'd require way more work.

Moreover, the two allocators I proposed significantly benefit from the
"same lifespan" assumption. I don't think sb_alloc can do that.

I've given the sb_alloc patch another try - essentially hacking it into
reorderbuffer, ignoring the issues mentioned yesterday. And yes, it's
faster than the allocators discussed in this thread. Based on a few very
quick tests on my laptop, the difference is usually ~5-10%.

That might seem like a significant improvement, but it's negligible
compared to the "master -> slab/gen" improvement, which improves
performance by orders of magnitude (at least for the tested cases).

Moreover, the slab/gen allocators proposed here seem like a better fit
for reorderbuffer, e.g. because they release memory. I haven't looked at
sb_alloc too closely, but I think it behaves more like AllocSet in this
regard (i.e. keeping the memory indefinitely).

For reorderbuffer, from what I've seen in practice, I'd prefer proper
freeing to 5% performance gain as I seen walsenders taking GBs of memory
dues to reoderbuffer allocations that are never properly freed.

About your actual patch. I do like both the Slab and the Gen allocators
and think that we should proceed with them for the moment. You
definitely need to rename the Gen one (don't ask me to what though) as
it sounds like "generic" and do some finishing touches but I think it's
the way to go. I don't see any point in GenSlab anymore.

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#25Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Petr Jelinek (#24)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

On 10/23/2016 05:26 PM, Petr Jelinek wrote:

On 23/10/16 16:26, Tomas Vondra wrote:

On 10/22/2016 08:30 PM, Tomas Vondra wrote:
...
Moreover, the slab/gen allocators proposed here seem like a better
fit for reorderbuffer, e.g. because they release memory. I haven't
looked at sb_alloc too closely, but I think it behaves more like
AllocSet in this regard (i.e. keeping the memory indefinitely).

For reorderbuffer, from what I've seen in practice, I'd prefer
proper freeing to 5% performance gain as I seen walsenders taking GBs
of memory dues to reoderbuffer allocations that are never properly
freed.

Right.

About your actual patch. I do like both the Slab and the Gen allocators
and think that we should proceed with them for the moment. You
definitely need to rename the Gen one (don't ask me to what though) as
it sounds like "generic" and do some finishing touches but I think it's
the way to go. I don't see any point in GenSlab anymore.

Attached is a v5 of the patch that does this i.e. throws away the
GenSlab allocator and modifies reorderbuffer in two steps.

First (0001) it adds Slab allocator for TXN/Change allocations, and
keeps the local slab cache for TupleBuf allocations (with a separate
AllocSet context).

Then (in 0002) it adds the Gen allocator for TupleBuf, removing the last
bits of the local slab cache.

I do think this version is is as simple as it gets - there's not much
more we could simplify / remove.

The main issue that bugs me is the name of the Gen allocator, but I
don't have a good naming ideas :( The basic characteristics of Gen is
that it does not reuse space released by pfree(), relying on the fact
that the whole block will become free. That should be reflected in the
name somehow, I guess.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

slab-allocators-v5.tgzapplication/x-compressed-tar; name=slab-allocators-v5.tgzDownload
�;�X�[�w�F��W�+j<;i0~��L������^�N����RI��1����{UR	����/�I�H��u�>~�Q��nw�i��nD��EI+v3o����������;��m�/|�tv�w��l���ps{�wv��{���W�a�g�fn"���������s�DS�������y�vvw������p���w��}�=WQ(�2�}�n��:�"�CqM�T��~���2��>������E�w�37�0n&��7^�;{��}�����6���g�_���|<�;� :[�	4R�i��l6�"M��������Jd(x��p+���5�{Q��d8�d�����x�����L��t:N���{9R���)�����
O?�;��5��B/��r+�|	�����&+�-qR���4=�#����V�����}��������������
o��c�;���-T^�+��F�{�����zUx��S������</�j�W��h6�*���Z��KgTU��G����r�o�v��Q�`W��&��\T�(�'�/���=��
��/�`[�<��<3�W��(��� �p�������i�U����F�FY�Pz���i,=����E� P�Xd��Uzw�c|�E>�b� Z�(�"�O[x�Um�a��&�7|in`z8zvF6�	"(��T�6��^���b��a�zdp@b�+��WPh��C��5uA�%��t�Q��$C
eJ:��t�=��n�u�]��-�9�9��o��)���T�X�� �������A8�F���7(F��#N�������90�|6Ni��}�y|28;�8�ty78��9�a���rnIA���Vb�)��G���V�~1Ow?_
���nf��
	�/�Wp�vTKl_+��������2_O���{w/�:����2q�	p����	�������a�5V�Y�4zY8'��f�y�D�dtGl��Ywo,�%��e>\�7�������
�_��h��3�?]^��7�1a����G+G������XP�C�A���5g�D�����~�(�Y��b���3�iX����h|+��Y�l��A�Pe�osC���@V0���^5n��^�%Ef�>C����\&2�%�6-�������=kYJ�eFX�J��A`������M0�g`���DrZH<�W�H���Zy'�u��U�k6QH6�'�#6�DE]�xO��s�c�����(L���a�[db��HM�?��p�CW�5���(�;�����x�~c=}��]��Tf5��������wv�x��N�S���[2��f�������l�S�m��1 ���A�f�`��2�X�����������F��Q����g��8�[� �i�����������W�W�O�kz��g��g
c�t3�X(�Q,����Q"%�m��#����U�������>8�x��X�����p��j���9���/.^�<�K!���R��b��`H���J_\�0)��/�gA���U�9&V[@c���q�P�t����YgoKPS65��r����.&��nE����N�H@��Z01:x�H��{��R��g�E���-�M��&Vm!�
�,������4UC(rcP�N���0�1RY�����@B�*\AY��F�7_
��5��=����G2�����J�`~��+��Jz�U��/��G������'Q�l�un�l�/�*��?B�*z�:����P���>]
�/{��o.D�}���|�w|�����|������������3����e����v�oF�{;�h<���E��$]O������VN�`���L>k���U��s��j��:�a�l��h��7'��GA��"1��`��
Z[Q����+(������.�h:�����U��]��2�}?(���h�
���i��v���$�m��a+�����Q?mk�r������|e�����B�_�'��Q���Z�2��V>���p���Y4%j
~2���r\ ��q(}1z(3�":��v���R����^,CC>F���Q�H�x���W����l����\��dV���(����$��9�����]��wz|�����}^��;����s��n������m���pH�B����xp�R"�����u�M\ �HH�2$�t�)j���Y�p����1��(���(v��lH�Q\�s�I��@�>��Mf�=�_�G��!��+�����`�
`7(��Cl�!� q���J�Q!�v�w:� ���gVC�B�I���
�	D��5A
d��1>�������%�&�H��qS�A��j��lB��Q��l:��72�pEK���H=J���9d"�1�)h��N`XeS7P�x�(��D�hM�}7������
�,Q
�F��,�+0�\+�%$�t�\0 �0#a����$
��3�nL�1�
���
�n��!r.���s��Fx2�l��C��@G+�cW/�v�5MNR+HD�1���w� �5���h"Vh�b|���4a(���/F"� G�<�(&�Y���8������#��{,�t��<��[��w�����M�9�'\H7�&h0��h���Y��&r� ���'Y��t�[2B!$�2��$�%�IM)���L������ ���4�X��E\��FPK�8��[
'e��B�0^>
_���y��%WQ������u�#��pm-],�O�UL��� A�$%��g�.@K�?t���KH\SE���,z����r�l��X+��,�0��A�b�8�R���5[x$�,���OR�T�P���B���n
��
������v\�1d�H��l����k ��P
���#�cG�&cF���y4+���2	E��NU8�kA���H���S� Kq�rLa�������vpq{~~�������cz#{���IR6��Z���x����'P������oR�Z�W��H�N�h:
'�z��P�����Lk��xpzs}w�������������
I2t��{�6�)�\�)6�"v����������78���ks��!BM��.��;Bkt�
0�w3��k1��E:Q(uxb�&�I��(L�Z�|�A{����Wp�oFi�(�@�#+��q��[(p[�ZpYg+.���0
�nS�`��}�
�Ao�����b@�0�GMeZ5�&iA#u������:6��H�	�=H��]���m
�M�T�-�������@�`r��WH���E�#7�X�>Xg�u�"a=$�!X�%2A�f�NL��`�&eg�����x��S�4��O�$)��*O��?�����
���L2P(P�cZ�v>���0O�(�����H�(u{���OsM0��Tk���:��S�����<:�v_g���?�{�4^�i��#����(����Q6kR�S�/�o�OO��}0�#���%�@�h�>g�p���^�^@��=��i9�i��51�P��|BQ���b&1��~�#�����8�P2�Sf#!~M:��]��
��#J�
������)���tb|�*��Y��B!��%���">3�V��@v2������H�R6+LG�u�Cns/�L�V��rJM��X�l!,�~��s�+�a��A�.j���2��Yh�H��e��z��[c�8����U*u�[R��-T�I�������A�dNS@P�G&N��U�:L^���(��<����;�<+��_B!�����r��\3��k-TP�Ef�����-|������s7 mM6����4^l�|�_&��0�S
ju���n��T���faMw1:����l��4�����g&R6f&bo��;PK����s)��n��HOW|�tr�;��������(���������i�bC�u���2��
8G��@����.d�����y^�P����R���_K2�<�-�>�=�Y2�)�]l���`�A�H&Q*+H�0	�<b�s�
n~d�$��J��J�	����t�	����}����X^7��Q�����>�R{+������#������$���J�o�G���+pT#�8���b�&�Tb	G�=%sz�!�G��a$�������5���b`yC�u��E��<-2T/kq�T�b�h���2��w����������wBM��Oh�\62�T]L��Q��{���f�X8���T��7*V�����2���b��Fu���m1G���I�l��c����g	�
y������E���0ohEs�i4��@@p�j;����aM��{�c�RE7m��F�B�:\���u�����.4�{�q�<�uk���6-��]���g��JE&�:�e�D@����e�@Q�9��k����sM]��*����^93��j��M�y+eo
.�,��/�v���RvC�A����f��<K!�2�*%1?A��4��D7R���b�^�.	�@�����OY5W��wN�6L+|�H��`�����������X�z�c7�0J�03b[�����'��a����L"~.��8�V�c����3R�FVA2� KV\��y��������q�&���
�����-��YJ��n����Ip����.G�e�����������0��1�&�b���$�#��-��Y�Q�2����?�����^�#��m7���G#�TpYzEnM�'x�<�����J�|���K\�Y
<�N�!~jEVX�����Dl�iL]4���R�o�a�L�_&���q&�+����N^f�~�/O��1U:{=��� ��L35���I(R?���)�o@������L��u��v�����-���*/0����W�Y5�V�+[�~��
Z�Ok<1���]����T<�(
�Q��������u��2�8�V��T�/�����F����,�)�G/a�23�x����������m�A%J��EF���A����$�8��������;���|�&i.���G��/�$�UI��\���$����\z��x/��q���[�t�IgD/��������w}qSF,�>&�����2 p�������f��	7��C���������p���+D��|�EQ+�j�5M���|G�\����e������R�`�p^��m��"Vui���bY>��2"��,�<J��^m�a&����1%K��1���:�;��9�OA�4���M<=�=%b��e������
Rl��2�R�������g;����CM��`�c�'X/n�C������5���������������}���l���tC\����[�7
�y�/�"����y�U��=�>�����q��n��	�.�����X��,�'tl��@AM7����XH`��^���>A��<���6<��%7��������@��lBe(�$��J��\r���.��L����,@�A���Gu
��
�������iCo�uV���
�E�ot�����~��-2i�yMF'|���{F?xzO���cn��������q����q�*Px�ya~Gk/�_R��V�A��1��A�^��$I�o�6���S�����L�\���J�n�&<����Y�) �����L�X������|�������a���H��
6 �K���
Wa������<��:F��j��O��_���
�2V{�:l�;��U�M�QY����5@]�W�
\����v�����:=|��3a�����B�z,��59 |����7���O~���"���vRXxN��;��E&�����^��bn6�����=����d�����R,l-���QC4Q2��m6��=�����K��&tiP�c��(�����_1��l�����v(�(B�(�t�~��6}���.�����r����i�;V�����s^��A}?�)&8���%>������������>�3v�pE������f����8��dt�dI.�dq�HK����?��j!e;==����,�����K�`�z��[��mi��4m�0��
v�9p�e������3����aY'�}��e���I�+������c������~��ff�U���4�V���*���S���.d����U�C��]z��nTF�W��D�}���AC���.��+�8���j&�]T?�o�M*_��e=�����;Y�6V�:~����1S���������b97�&k�����}�\�%�Px����T#�G�&�@Z��R����t]~B��f��3Mu�#�Kj��%@M��9�~��f �9;TE�57��m��=���m�j����^�2���m��
���lz�y z��O��'�v<u�/��n���=2O���%1��!�)�<�����*L,�7
���<C?q��O��*7���M�&s� �`w
�m/�qI�h�����n�3���mjg
��&s������M���������Z��3x��:�G�lL�~�
�x�9�S���Z�����(u}o��d�0����.��,� jyh��h��r�s������L>�N;m�!�a�y��������:�	�*�����[��4s�'KA*�N�)n�D^��XX���v'�e#T���#��-���,[�l\�&�W������Y��K�h���W��k)Ln���o�$��	�0��-9��I�e�jK���sk�N~C^;����C/
�N-��G��++��P�T��G�Kp��CW����ZU�n������i��\������dp���ps�������� n|�4�����d�����&p7 
�����[#��P����Vo��qR�
��G��N`m�����)����No[�'B���~�G#^�E�W�Z.L?���������s+97�����
6<��L~��Gn,q�wA����Q�B����$/�����<��x&�w:�AQ��Sg;�[�"��v�9	��b6��4�vlJKh;��{�5��@�O�GdU�������<��+��[T{\'��U�`��g�&j�v�F������[�8��s���yt5�KU�X�Z�������l����\�U�n�K�J�s��(�]!����n�����&7L�P%l����^h�4��c�X���y��$�C��GGj��A{�5;���7����Wo_6������y�����	�,��
�1"5Y�u���Xe<	��&g�NT���{��iX���U��KF7��]���sW5J���pe�T��Ge�E�*B3�z�?�l%$N/|�'y��z���bb���.��hh���[��X��7n�L�6����C��
������t
������)`���R��	]#��^��^S����F�A�=�D�dP��e���0�+R����k���^�`ev�AoV�pT��_��4����Rc�0���:�e4R�`/�������Cw]���9���V��e�=�SG�C�C!�MB|�7��2���/��X5�9g	4��Q|�t+�;�TG4�q^����1|�=BT�%�r/��yw`�6��4�\�7�6tO���57R����k��������������6�|���]�V��Wm(��o9[����A��b����4 ��c!�
����u���X����Z���L�jH�fF�K31q�tv�����9�p�������������m���H�{R6�-���uYn��vgV}�:�7������{�g�0���I���td����������b5��5
g�?t�����n�w��R�Qc��7�k9�=����XmY��d.�c�{�8.{���h~n1�S��X�p���3���B'o���6E��Y*KAK
��V�������d%���C�ks G���A��u�����(�f���,��1p�d1)w�ff��i�q��5�uR\���S���R|G�F9����y��#S�����8\������g����!k$V�F_����.�6sc5#SA��ilv�+{�6�PO�*�����,J���t4Hh��n��jG�|�������KA��u9��4`�|�Y�J���b]���.����E�/�M�P�����F�U��f�rA`1�����q�J���,yKb>Og� ~�.�Wc�z��`�s����{�HE�_J�d��p���)������}`0�K\�����^�c,&fJH���o�\h�C����J��8���o>��h��w�mD���R9jD�|^N�P!���!.����
�05�����C>���J��'Ho`>a����&X�*vo����@�5��v��F����"V�#K�u��j�R��nH-�V���Is����p��������YuP�t�<��f�HY�����1�f���'�`V����J~�3i,����;�c����%n+�u����?��>(������� q{���0�fp<��t��h�H�G��:�GJL�YT��Z�A�y`.Y�6:� ��5xo����f����h]����2�4*�>��F�n��{=��xjM��$}��Ix"��`>Y=�D�-�8/U
} g�(�P�P�h�|����Z�p^W83��K��J'���F��$f����i4��0��#]<+�Hg�S�;j(����R�E[m
 �d��@C���x����?3�I"��O��x�*l�+������e�ZYg)l		�HQ�^I�j�F�(D�����:�t���LIc�� �����U%�z�`9�*y�����k�%�k;�t!���9��U���;��E���#�iC�=�=<c=�i�kV]���&�M��A�[��Y��)eS
-�CH�E��~��C�;����x�j���7�a���a$Z
nI����,0
��I*�|������G,����cpQ��^4jX�Z������,2�L���W��
����m�$5��u#��|����%)7�1���D�Va����jZB,���\��2O���
w��G�?C|�b�	���j�5��%C��Z@���t^�H���x_����� '�j����-5sl������U�.h��Lt���3��\n���LJ��z����q��
�T0��O��@o6����a7���<���/��R�y���sl�������G�+]�[�A����@�c�
�0����z���
���j���zk�hl��qr�MQ2�����t���&�)a���1A�Z����F����'����5Y|��	yO(H&���������6`��K�p�&�ja4�"G����B�B<G��S��~��W��A�V���	f��h<�L�IxL�k��b�Vz[�m!�dS37C=�����&��@�
����H���k�-Kkb7a�i�N������R�=����`���(�<v��uI�L,�F������@���'�r0�N�cm�����`p����3@�l����'�d�	o�`��'�����t��^��(�/��WSH7��47(*l�B���P���0E�.����tJ���_�{����5��"�Q�0N,�F��8C���`��z��m�C!��j������{��� a�6J`��e��b����g�u��1���T@�l
���h!zq!�\P�i�����SF `J�xc�
z��N���^(���^N���47�%��<4��d�[��m�J�a�a<Mh����U��97��n�@�qi����������8�6�]�`�
D(����(�4����>@�������3$�4����p��V<��~�on��	�WP�Q`b�~�����C������Eq��B�~p�
-�2������P)�Y�����!#K�.i����P��x:�u|�p�|u��^�6�ZY]���F}���������O]%D��t�
�# ��������A��)&ZAK,��� ��x�C���?�Q<65�~��m/��c�(�x����
�A����uP.V�D	"
���5`l�1�N�v�"\X��rA�������a#>-['��4E������b>N�@K����Yah�t��zl��'�10e�O�N u�d2H/9}�@xF�JP:�R�s�g/<��rUI�s�1�$��a���,�1�Oh.�Nd�V�:Yy��}�?�Cv�.!����?*�
��������I\�g�u��:�N��9�p�na�!����cL���4d����.���b�j��{������{�u�bl��6�^��WFr�%^Xx^�����u��C
�{�?���<������>B�����Y�{��$m_��d5�	G�8&F��i�qn�Y4��D�~�YsE����� �y�X��,�t�#^]dX�l����q�2p����>Q��@�V]�*�xzV��C�j:b������a�{���p4~�f��S�3��������0*��O�Rc�v8�OlE���I���3�J��Z'4���lpu&P�������5(��Uk]i��ZJ?a�D���@�n��������S���*p�1:"����2��F������n��}�$�;,��yX���_��q���-#�*��^!3%�)���w���?��8ud�c�C4�����a?���?(��l����,�)� {(�W
�w���������]�������+�������5�8�O����ngp�p--���CE"4�t��{����7��WX����W����?t���`p����e�k����E���*��h�x�E�z�qi���}q�X��zm|���=�bC�c8�g'?�Q��0���_�*�4-�=���Y=sZ�s�L�7��`
��<���Y=����(�v��^\=�um���4f�������1�./%���R���L#Z�_X�,�RnA���UFYSU���� �pvn����1��
^��=?*<��Lk���7�9�w)/����s*����m�l�k�l����.2J=P-�� �����3��ko��)���D��3HqS�W����nh��I���An=������������%6�#Bh�4����I����/�����=��e�p�p��a�t���8�����_�V�>8;�c>�����N��v�����k`����,�D��H���>l�k�l�g�D�gM�8����d��Qo�&��}y��Ew�g��'w�78�&E��1$�� �G�1g=yV���H@���e�4�n7,��������wxCi!#�5������@�[m}�����������
t�}�T_YO_��|r������O�<������.^��'O�������C����m��;a���N~��iyQW��K�rv��z������������z�^�sO�$w���;���������n��������������v�������o�j*����^�Y���R��8�/W�����f�v�����p��`{;���w����A��?���=����~��������T_��:4�P��-F�{:e�z4�o�]��v8�jB�x]3$�C�q�������3����������Z�����,��z�������U������9b�����%��\�B����;�?��bX+�*r��}f�������ZYSwT�������!V|*����=��J���NE��j\sK���u����������������sf�������=��3�hi��`�bI�����b8p���.�z�������s]���E�p���w��E��.m:F�.����9lo����4	�$���I�������p���>.�j�w��WG�9A���ZCff���|�+�r� ��}�)���)"�7@�C�(�{n��M25Zi����
��5�����3h�q�']���
Gs���:���i4ct	c�����M���<������n*�T~�[�M?3`����U��1���W�.cl���_5�R1�������S���{��{���H\Q��S���$�\�b9O8%������d�����K��}���n�c0F�n�{4����mQ��<��D!��F�<r���rw_��7z���7�0�!�|���������2�/O��zq�+��`����O+z�
z�o��J9�&iB�E�;���{���B=�gN�>��\Q�P���"|{:z:�=�{;H����,�o3���9�H R��&X��)F�{�;������:rC�H�d8����v����W��3��<�by:7��|�O]��)��"�k�x��������|:C��s= ]�	���\8^V�a�R����;�����-v������xn������l�����sM�Pw��T����G�����y��EJ<w��P0k���c��Rb�C��Z����Y������I>���j�,�u�tW�L�1'��~Pc����������>�Y/f�UnWM�zkt��Ms�R�$���d
�}��N`dq3����&�	9�aTq��K�E ���.�l�F�{�,}m�YP�g��]��U��'�S�����*����F���MX��(R�Y:C��9<m<�z���D`����44���'�0��n!����7OF����z���?��':lZg[��D�J]�y�Ve+l14���������\��iV[I�8�������c,m��5;����h��evP��Q����5��D��n[9���&�Z
E�5|�����h����e��4���w�+x�Yk������ZmZ+OBk�9h�6�O�q��O����i�5+���#%�g]_���7A��u�T,u}]������.q�0��
�.�1�������b�\��f�����D5��WM������y�R��f�<�S��2�����+�����U����p��)9�U�6V~�����7}8�*�e
�������o�/����hkwTb��k���(+�J�>���l`\F	fs*��F���"��X��i����Vb�/
Y���0��R�p�H_}��o�t��������7�S����t���R]qvD��v�	�@��;�Zi�HY9�����i\a���$�|��=<����"\��Q�as�������a�z��^�6h=��W0���
XA_`��eV�@�-�S�lt�]��O_"p�.�y:	�[6�/��y��=�^�RK�'�d��DKL��^���T?��[yj���n����2����������	�s�������6t������g�?����z���]��E1���sd�9D�'M�8�
?#q�C���bc(��C�X[��v��c@�`p����wl�1�U�d7)�=�&'�9`�Y��������P���c[H��.������r����{/�&�a�b���A"�Q�;R��Ywb9�F����Yz=U�&�b��p��o|g4���	�7S��$�F�����v��HT�R����j��mU�M���k���@������brt���b����N�zE���bT�b3��H��F�Y�����{�fJ�Ao��h��5A������p?*��)�1�4���rsh��r��Y&��h�/�hH/"������'���8(;E3�$sJ�jG�]��E����l��z��`tl;0�G	������2�&�]x���"�O��/w����APi��1�\J:gA3[�*��4��:��,Z���B��Db�87�^�GN�s�VO��`}�d�������%���6��L���q�C�ampp;%��F ���I���%p��j�s���g�j������_� �|z��l��H��W������NX�$�.��|%����v�^���	S����M���9�����
_mGW�b����wh��A�X��Z�o�72���I\�M���Ql���oG�U���H�����@�cg��T�H;h������.��I��{����#�#���&�04�$0�!��8�7���9q~��s�����&7g&�SR�'�!��r��I��`�o!�&��A���Yo}�-��pX�`-�^`�����+3I����Y��JG@�����g2hb�f��8Gx#}C`_�mY���=M-�w�"D�c�m
^~%�hW�-����f�0ML�� �����Y����2��B�Zlj�	���p���~�0e]sw��wRy58'��$N��K8�]�>��^�� ���`���l��*o��V�9�a~��7��f{�V��
Bu��1KU��E���dka?�rI��jpn<�
�#0d��e�1o)�a�6c�'�yv�����Sp���0��&�T�hQV��Z�-U�-��|�7	�"#�V�0s���k��!���/o�8���N|�s���@K�0d>��9�!w%;@�`�X�Y:�T��wq�<o��n�����z���vQ�Uf�hy�q2D��l�n���l��C7^z�5�/��A�������1wl"��T<Y������tI1��*P�H�\jk�)�-��0�E6�G^��#�H���dtp	c��c>K$�p��e9��k�'�;�c@����OBf�@����m`Z'�<���c�-�@��Yy2�4,?�lXt^,9�|��������{��Od�P��b�v����bJNz�-�P�>��D��2�dV��u���W��=*��SQ����ID[,\�0�X�6����s�?������]\�U���w�Rxq��:0��ni��g�!2\���~�����8u4t��y����m���a������%���o���[eS���1�R��6S&�$����&�'@���=�R��)h�m�O����O_>{�&�g}>��qNA��HS��+<�pZ�9��PE\|
ixE
n��e��E?`���_�'�P^51���W����m23r_�������E�9��$���7f�i�`8�s�uXg��U�v�>���f��xT|����.���`<~����tA�������?�5t�-.Kq�K�8�k����UF�e��<�?}c��Cn�����Z�� td��J��!*���C\�����=_X�����?C+���d(��bz�r lP|hXKH��*����#��	�6GGm�\!�W�����A:��I��}k�b9�;�d�J�<&�+U�.�t��A4H����#�����6�������:'7����I��:����D.�LK����be���C���f�EdCd-P�#��|b\v.w��$�������M��@�!���%1iq���T�<� fQ��6,$��J��aJ���4�����H��4��	���m��lL��RhH?��=�et���� �����" ��9�eu�PH�0�3�+�2
���*��(��-����3"t�����0:��Z�������f����]�5�?;!�N��DAalc5=`��%
����P���
�N��e*3�
Q�������
��5��U��A m����Ec����1l�p��t���,�����o�9u�r��v������(Uh�F|��H#z��M��lv��`2?�_S^:j	��k�%bg���7��ns7c����h?���QD���moBd$7��?`b]�G.K?����/.�m����v1�q2�<�A��5�89�@,�E|~B��u�%e#�-�����Ir������2���
�@r��vv����{V5�?"�>p�������QB������{*���s������b�K}T4S����:8�kt����`���a����R;w�`����DnT���#]#�[��>�L|SJ�Q��%� ��eH�
��,�{�$�2�:�����`���������f����&o����������-
��p�f�N���-�o��5yh����A���{�V���!o����m
|���, ������6�3�Q6n�F������
��&����x����Q����(���W�f�%�����\��h!���g���4#�����P�sw�2?�����|�<���,�0SrA-���C�"�$��a�hD*}�����5#���\���7���Wo_6 P�����<T���	f" ��"v&���$`����"����Pi%[r���M���	��!}�W��^��l�g�k_&��bon�����P��g3�w���aiHrn�K����C�Z+��P_�z��4fPn�iYV.*P��D�
F�c"s�1H�s6��rM3�^�
����*e�������U�L��'�3e�����Xv�h��`�N�lTt���6�����.~�Eb��	�(���M5#�\2C�U^VR[sk��f���N�y�����t�%gi�O��XH�x��s-H�Xj�Y|r���$z���x�.��*���T�5�z!�G��\��]"*�%���lw��fP����Oc�Op�2��3���L��QSh9��$�}�����9"�s|������b��JFo������#�"C��.��3��e�
��[�l�J0IH��]bZ������3�*����5p�B=3�LT+�����p�2-�m��4g�U���yao/��w��)�����Y|�G�'G�Re�=�������OK;�'���;��V@-������B2����O��������*o���X��h�5"�[���+���iH"��X~6��
�X)���l�#3H3��!�#H�p����������B>�i'!4�IP9���r��2m�<�9!r����[�M|��W Ec�2?���z!�i��{|\{��\zk�����R�!1���FnA3��1`�<�t1���#y,�����u~����NmQ�0�o�������O�}7f���Y��
$s�r���ZpiL�K#��$�{�}�;��G4J�!�I�bU�����.���,�]������K�ei��&����;$���K��1Z|���S������K�ie���%?���3Nw��)�g�}�������L*'8�m�r>�K��dDf�.��L�]����?j7�%�_jeazG<�pwmD�IvI�n._���4���}s
x|�L�R����o�2���m�;��: ^2�������<P�~�E���3����������T���&�]r���3��;�A"p��{�|q���������]|4!��*����$�� ����I�k�����S���Z9�y�`�ik�tW��VO��D��9����G�K��^a<��������Z�_��9�_�Gh������h�G������hSS�������~��sb�0�
��b0-�q�����a�����wW�����!�@��|N�
�h�d����E����Q	���n71<��_u�����A� �!��RS?��!�7G���9{�]Y�O�����7�<DKu�o)58/+!H}^��
�^��,�����@��X���6�\�0v���  �Xe!T�KU����l��S�"�Bn�����n�����	�?D�=����iQ��$�mv��y
��R�4���:w����xgCdIZEk�^�5L�9+H�c����hmI����X�?���jR��6����^	8MH�6����b7�,�����1l�9Up�������dn��+�>��?��VH{���eD��2��b�4�vf��U�%�W�IwF9����2\����9F��25�����V��� �L�Q�a1���y1����;R,_(����\��3�[��34�	�Q��U�L�?���%s�B|�����,�l��h���8��#t����,��
���/�73��8�O���'DW1�sp{9n�;E�=mi�2�<�����d��]�u4����q���ZG�l�}.���2���o���D*7�Z��{��r8eo`��a��n�YPUz2��m�HK^�G`�cK$.������� ����;?.f�T�Ah�j��:b��

�f ���b�'��Y��.B��0��.���(���~�s�1���<�
����C�����mvT�T���
��6t����eq2[�o������k��R�h���S��9�
	����=4�3��<��e�|�`��"�����0-P�E��?s��p'*Ix��i�����%{�LF����a{���N}�T:��xQ��[r�,Q�Y��B/����&e�o�v�\S�;��bs����)�A�|���C�B��K�x�,�.��k�;&�_6�>����?3�fu�J�AznR8��^���v����J0��B-�!��4w��E��V�������+!'���`����V�iR��<A�f��g
$�>������6r�s3����<R~�[
CXj*�����
�\���?&2�����1�`���m�,�m/�����\9�k��}�G;�b*���T���������U]S��D���-',���vw���pi���������{�9a��
V��`!=�{�G��Ia���E�+M~�ge��o�{��>WiM"��C��U���I�
����J�����`������@�9�����4���3��@�K!�0�=��N�c� ���(K@���1���rf����23j�>3jIzW�P/�K��V�q�^����M���Q��@"/�%.��u���v���������;�
�������:C�S?3����[��\�i�.��wH��?�P���l�_?_?_?_?_?_?_?_?_?_?_?_?_?����`���
#26Jim Nasby
Jim.Nasby@BlueTreble.com
In reply to: Tomas Vondra (#8)
Re: PATCH: two slab-like memory allocators

On 10/1/16 7:34 PM, Tomas Vondra wrote:

+ /* otherwise add it to the proper freelist bin */
Looks like something went missing... :)

Ummm? The patch contains this:

+    /* otherwise add it to the proper freelist bin */
+    if (set->freelist[block->nfree])
+        set->freelist[block->nfree]->prev = block;
+
+    block->next = set->freelist[block->nfree];
+    set->freelist[block->nfree] = block;

Which does exactly the thing it should do. Or what is missing?

What's confusing is the "otherwise" right at the beginning of the function:

+static void
+add_to_freelist(Slab set, SlabBlock block)
+{
+	/* otherwise add it to the proper freelist bin */
+	if (set->freelist[block->nfree])
+		set->freelist[block->nfree]->prev = block;
+
+	block->next = set->freelist[block->nfree];
+	set->freelist[block->nfree] = block;
+}

Otherwise what? What's the other option?

(Haven't looked at the newer patch, so maybe this isn't an issue anymore.)
--
Jim Nasby, Data Architect, Blue Treble Consulting, Austin TX
Experts in Analytics, Data Architecture and PostgreSQL
Data in Trouble? Get it in Treble! http://BlueTreble.com
855-TREBLE2 (855-873-2532) mobile: 512-569-9461

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#27Jim Nasby
Jim.Nasby@BlueTreble.com
In reply to: Tomas Vondra (#25)
Re: PATCH: two slab-like memory allocators

On 10/25/16 4:48 PM, Tomas Vondra wrote:

The main issue that bugs me is the name of the Gen allocator, but I
don't have a good naming ideas :( The basic characteristics of Gen is
that it does not reuse space released by pfree(), relying on the fact
that the whole block will become free. That should be reflected in the
name somehow, I guess.

OneTime? OneUse? OneShot? AllocOnce?

OneHitWonder? ;P
--
Jim Nasby, Data Architect, Blue Treble Consulting, Austin TX
Experts in Analytics, Data Architecture and PostgreSQL
Data in Trouble? Get it in Treble! http://BlueTreble.com
855-TREBLE2 (855-873-2532) mobile: 512-569-9461

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#28Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#25)
Re: PATCH: two slab-like memory allocators

Hi,

Subject: [PATCH 1/2] slab allocator

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 6ad7e7d..520f295 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c

I'd rather have that in a separate followup commit...

+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/slab.c
+ *
+ *
+ *	The constant allocation size allows significant simplification and various
+ *	optimizations that are not possible in AllocSet. Firstly, we can get rid
+ *	of the doubling and carve the blocks into chunks of exactly the right size
+ *	(plus alignment), not wasting memory.

References to AllocSet aren't necessarily a good idea, they'll quite
possibly get out of date. The argument can be quite easily be made
without referring to a concrete reference to behaviour elsewhere.

+ *
+ *	At the context level, we use 'freelist' array to track blocks grouped by
+ *	number of free chunks. For example freelist[0] is a list of completely full
+ *	blocks, freelist[1] is a block with a single free chunk, etc.

Hm. Those arrays are going to be quite large for small allocations w/
big blocks (an imo sensible combination). Maybe it'd make more sense to
model it as a linked list of blocks? Full blocks are at one end, empty
ones at the other?

+ *	About MEMORY_CONTEXT_CHECKING:
+ *
+ *	Since we usually round request sizes up to the next power of 2, there
+ *	is often some unused space immediately after a requested data
area.

I assume the "round up" stuff is copy-paste?

+ *	Thus, if someone makes the common error of writing past what they've
+ *	requested, the problem is likely to go unnoticed ... until the day when
+ *	there *isn't* any wasted space, perhaps because of different memory
+ *	alignment on a new platform, or some other effect.  To catch this sort
+ *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ *	the requested space whenever the request is less than the actual chunk
+ *	size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *
+ *	About USE_VALGRIND and Valgrind client requests:
+ *
+ *	Valgrind provides "client request" macros that exchange information with
+ *	the host Valgrind (if any).  Under !USE_VALGRIND, memdebug.h stubs out
+ *	currently-used macros.
+ *
+ *	When running under Valgrind, we want a NOACCESS memory region both before
+ *	and after the allocation.  The chunk header is tempting as the preceding
+ *	region, but mcxt.c expects to able to examine the standard chunk header
+ *	fields.  Therefore, we use, when available, the requested_size field and
+ *	any subsequent padding.  requested_size is made NOACCESS before returning
+ *	a chunk pointer to a caller.  However, to reduce client request traffic,
+ *	it is kept DEFINED in chunks on the free list.
+ *
+ *	The rounded-up capacity of the chunk usually acts as a post-allocation
+ *	NOACCESS region.  If the request consumes precisely the entire chunk,
+ *	there is no such region; another chunk header may immediately follow.  In
+ *	that case, Valgrind will not detect access beyond the end of the chunk.
+ *
+ *	See also the cooperating Valgrind client requests in mcxt.c.

I think we need a preliminary patch moving a lot of this into something
like mcxt_internal.h. Copying this comment, and a lot of the utility
functions, into every memory context implementation is a bad pattern.

+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;

Can we please not continue hiding pointers behind typedefs? It's a bad
pattern, and that it's fairly widely used isn't a good excuse to
introduce further usages of it.

+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+	/* Allocation parameters for this context: */
+	Size		chunkSize;		/* chunk size */
+	Size		fullChunkSize;	/* chunk size including header and alignment */
+	Size		blockSize;		/* block size */
+	int			chunksPerBlock;	/* number of chunks per block */
+	int			minFreeChunks;	/* min number of free chunks in any block */
+	int			nblocks;		/* number of blocks allocated */
+	/* Info about storage allocated in this context: */
+	SlabBlock	freelist[1];	/* free lists (block-level) */

I assume this is a variable-length array? If so, that a) should be
documented b) use FLEXIBLE_ARRAY_MEMBER as length - not doing so
actually will cause compiler warnings and potential misoptimizations.

+/*
+ * SlabBlockData
+ *		Structure of a single block in SLAB allocator.
+ *
+ * slab: context owning this block

What do we need this for?

+ * prev, next: used for doubly-linked list of blocks in global freelist

I'd prefer using an embedded list here (cf. ilist.h).

+/*
+ * SlabChunk
+ *		The prefix of each piece of memory in an SlabBlock
+ *
+ * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h.
+ * However it's possible to fields in front of the StandardChunkHeader fields,
+ * which is used to add pointer to the block.
+ */

Wouldn't that be easier to enforce - particularly around alignment
requirements - by embedding a StandardChunkHeader here? That'd also
avoid redundancies.

+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabFree: %s: %p, %d\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define SlabAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabAlloc: %s: %p, %d\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define SlabFreeInfo(_cxt, _chunk)
+#define SlabAllocInfo(_cxt, _chunk)
+#endif

Do we really have to copy that stuff from aset.c? Obviously no-one uses
that, since it doesn't even compile cleanly if HAVE_ALLOCINFO is
defined:
/home/andres/src/postgresql/src/backend/utils/mmgr/aset.c:302:20: warning: format ‘%d’ expects argument of type ‘int’, but argument 5 has type ‘Size {aka long unsigned int}’ [-Wformat=]
fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \

+#ifdef CLOBBER_FREED_MEMORY
+
+/* Wipe freed memory for debugging purposes */
+static void
+wipe_mem(void *ptr, size_t size)
+#ifdef MEMORY_CONTEXT_CHECKING
+static void
+set_sentinel(void *base, Size offset)
+
+static bool
+sentinel_ok(const void *base, Size offset)
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+static void
+randomize_mem(char *ptr, size_t size)

Let's move these into an mcxt_internal.h or mcxt_impl.h or such, as
static inlines.

+MemoryContext
+SlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize)
+{
+	int		chunksPerBlock;
+	Size	fullChunkSize;
+	Slab	set;
+
+	/* chunk, including SLAB header (both addresses nicely aligned) */
+	fullChunkSize = MAXALIGN(sizeof(SlabChunkData) + MAXALIGN(chunkSize));
+	/* make sure the block can store at least one chunk (plus a bitmap) */
+	if (blockSize - sizeof(SlabChunkData) < fullChunkSize + 1)
+		elog(ERROR, "block size %ld for slab is too small for chunks %ld",
+					blockSize, chunkSize);

I assume the 1 is the bitmap size?

+	/* so how many chunks can we fit into a block, including header and bitmap? */
+	chunksPerBlock
+		=  (8 * (blockSize - sizeof(SlabBlockData)) - 7) / (8 * fullChunkSize + 1);

I'm slightly drunk due to bad airline wine, but right now that seems a
bit odd and/or documentation worthy. I understand the (xxx + 7) / 8
pattern elsewhere, but right now I can't follow the - 7.

+/*
+ * SlabAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - SLAB_BLOCKHDRSZ - SLAB_CHUNKHDRSZ
+ * All callers use a much-lower limit.

That seems like a meaningless comment in the context of a slab allocator
with a fixed size.

+	/*
+	 * If there are no free chunks in any existing block, create a new block
+	 * and put it to the last freelist bucket.
+	 *
+	 * (set->minFreeChunks == 0) means there are no blocks with free chunks,
+	 * thanks to how minFreeChunks is updated at the end of SlabAlloc().
+	 */
+	if (set->minFreeChunks == 0)
+	{
+		block = (SlabBlock)malloc(set->blockSize);

Space after cast - maybe run pgindent over the file before submission?
Doing that manually helps to avoid ugly damange by the per-release run
later. I'm pretty sure there'll be a significant number of changes.

+	if (block->nfree == 0)
+		block->firstFreeChunk = set->chunksPerBlock;
+	else
+	{
+		/* look for the next free chunk in the block, after the first one */
+		while ((++block->firstFreeChunk) < set->chunksPerBlock)
+		{
+			int byte = block->firstFreeChunk / 8;
+			int bit  = block->firstFreeChunk % 8;
+
+			/* stop when you find 0 (unused chunk) */
+			if (! (block->bitmapptr[byte] & (0x01 << bit)))
+				break;
+		}

I haven't profiled (or even compiled) this patchset yet, but FWIW, in
the tuple deforming code, I could measure a noticeable speedup by
accessing bitmap-bytes in the native word-size, rather than char. I'm
*NOT* saying you should do that, but if this ever shows up as a
bottlneck, it might be worthwhile to optimize.

+	/*
+	 * And finally update minFreeChunks, i.e. the index to the block with the
+	 * lowest number of free chunks. We only need to do that when the block
+	 * got full (otherwise we know the current block is the right one).
+	 * We'll simply walk the freelist until we find a non-empty entry.
+	 */
+	if (set->minFreeChunks == 0)
+		for (idx = 1; idx <= set->chunksPerBlock; idx++)
+			if (set->freelist[idx])
+			{
+				set->minFreeChunks = idx;
+				break;
+			}

Yuck. This definitely needs braces.

Regards,

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#29Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#28)
Re: PATCH: two slab-like memory allocators

On 11/12/2016 08:12 PM, Andres Freund wrote:

Hi,

Subject: [PATCH 1/2] slab allocator

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 6ad7e7d..520f295 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c

I'd rather have that in a separate followup commit...

+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/slab.c
+ *
+ *
+ *	The constant allocation size allows significant simplification and various
+ *	optimizations that are not possible in AllocSet. Firstly, we can get rid
+ *	of the doubling and carve the blocks into chunks of exactly the right size
+ *	(plus alignment), not wasting memory.

References to AllocSet aren't necessarily a good idea, they'll quite
possibly get out of date. The argument can be quite easily be made
without referring to a concrete reference to behaviour elsewhere.

Yeah, that's probably true.

+ *
+ *	At the context level, we use 'freelist' array to track blocks grouped by
+ *	number of free chunks. For example freelist[0] is a list of completely full
+ *	blocks, freelist[1] is a block with a single free chunk, etc.

Hm. Those arrays are going to be quite large for small allocations w/
big blocks (an imo sensible combination). Maybe it'd make more sense to
model it as a linked list of blocks? Full blocks are at one end, empty
ones at the other?

So there'd be one huge list of blocks, sorted by the number of empty
chunks? Hmm, that might work I guess.

I don't think the combination of large blocks with small allocations is
particularly sensible, though - what exactly would be the benefit of
such combination? I would even consider enforcing some upper limit on
the number of chunks per block - say, 256, for example.

+ *	About MEMORY_CONTEXT_CHECKING:
+ *
+ *	Since we usually round request sizes up to the next power of 2, there
+ *	is often some unused space immediately after a requested data
area.

I assume the "round up" stuff is copy-paste?

Yeah, sorry about that.

+ *	Thus, if someone makes the common error of writing past what they've
+ *	requested, the problem is likely to go unnoticed ... until the day when
+ *	there *isn't* any wasted space, perhaps because of different memory
+ *	...
+ *
+ *	See also the cooperating Valgrind client requests in mcxt.c.

I think we need a preliminary patch moving a lot of this into something
like mcxt_internal.h. Copying this comment, and a lot of the utility
functions, into every memory context implementation is a bad pattern.

Yes, I planned to do that for the next version of patch. Laziness.

+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;

Can we please not continue hiding pointers behind typedefs? It's a bad
pattern, and that it's fairly widely used isn't a good excuse to
introduce further usages of it.

Why is it a bad pattern?

+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+	/* Allocation parameters for this context: */
+	Size		chunkSize;		/* chunk size */
+	Size		fullChunkSize;	/* chunk size including header and alignment */
+	Size		blockSize;		/* block size */
+	int			chunksPerBlock;	/* number of chunks per block */
+	int			minFreeChunks;	/* min number of free chunks in any block */
+	int			nblocks;		/* number of blocks allocated */
+	/* Info about storage allocated in this context: */
+	SlabBlock	freelist[1];	/* free lists (block-level) */

I assume this is a variable-length array? If so, that a) should be
documented b) use FLEXIBLE_ARRAY_MEMBER as length - not doing so
actually will cause compiler warnings and potential misoptimizations.

Will fix, thanks.

+/*
+ * SlabBlockData
+ *		Structure of a single block in SLAB allocator.
+ *
+ * slab: context owning this block

What do we need this for?

You're right the pointer to the owning context is unnecessary - there's
nothing like "standard block header" and we already have the pointer in
the standard chunk header. But maybe keeping the pointer at least with
MEMORY_CONTEXT_CHECKING would be a good idea?

+ * prev, next: used for doubly-linked list of blocks in global freelist

I'd prefer using an embedded list here (cf. ilist.h).

Makes sense.

+/*
+ * SlabChunk
+ *		The prefix of each piece of memory in an SlabBlock
+ *
+ * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h.
+ * However it's possible to fields in front of the StandardChunkHeader fields,
+ * which is used to add pointer to the block.
+ */

Wouldn't that be easier to enforce - particularly around alignment
requirements - by embedding a StandardChunkHeader here? That'd also
avoid redundancies.

Also makes sense.

+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabFree: %s: %p, %d\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define SlabAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabAlloc: %s: %p, %d\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define SlabFreeInfo(_cxt, _chunk)
+#define SlabAllocInfo(_cxt, _chunk)
+#endif

Do we really have to copy that stuff from aset.c? Obviously no-one uses
that, since it doesn't even compile cleanly if HAVE_ALLOCINFO is
defined:
/home/andres/src/postgresql/src/backend/utils/mmgr/aset.c:302:20: warning: format ‘%d’ expects argument of type ‘int’, but argument 5 has type ‘Size {aka long unsigned int}’ [-Wformat=]
fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \

I don't really care. Sure, we should fix the warning, but not supporting
HAVE_ALLOCINFO in the new allocator(s) seems wrong - we should either
support it everywhere, or we should rip it out. That's not the purpose
of this patch, though.

+#ifdef CLOBBER_FREED_MEMORY
+
+/* Wipe freed memory for debugging purposes */
+static void
+wipe_mem(void *ptr, size_t size)
+#ifdef MEMORY_CONTEXT_CHECKING
+static void
+set_sentinel(void *base, Size offset)
+
+static bool
+sentinel_ok(const void *base, Size offset)
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+static void
+randomize_mem(char *ptr, size_t size)

Let's move these into an mcxt_internal.h or mcxt_impl.h or such, as
static inlines.

Yes, next to the valgrind stuff.

+MemoryContext
+SlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize)
+{
+	int		chunksPerBlock;
+	Size	fullChunkSize;
+	Slab	set;
+
+	/* chunk, including SLAB header (both addresses nicely aligned) */
+	fullChunkSize = MAXALIGN(sizeof(SlabChunkData) + MAXALIGN(chunkSize));
+	/* make sure the block can store at least one chunk (plus a bitmap) */
+	if (blockSize - sizeof(SlabChunkData) < fullChunkSize + 1)
+		elog(ERROR, "block size %ld for slab is too small for chunks %ld",
+					blockSize, chunkSize);

I assume the 1 is the bitmap size?

Yes, the smallest bitmap is 1 byte.

+	/* so how many chunks can we fit into a block, including header and bitmap? */
+	chunksPerBlock
+		=  (8 * (blockSize - sizeof(SlabBlockData)) - 7) / (8 * fullChunkSize + 1);

I'm slightly drunk due to bad airline wine, but right now that seems a
bit odd and/or documentation worthy. I understand the (xxx + 7) / 8
pattern elsewhere, but right now I can't follow the - 7.

We need all the bits (header, chunks and bitmap) to fit onto the block,
so this needs to hold:

blockSize >= sizeof(SlabBlockData) +
chunksPerBlock * fullChunkSize +
(chunksPerBlock + 7) / 8

solve for 'chunksPerBlock' and you'll get the above formula. Moving the
7 to the other side of the inequality is the reason for the minus.

But documenting this is probably a good idea.

+/*
+ * SlabAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - SLAB_BLOCKHDRSZ - SLAB_CHUNKHDRSZ
+ * All callers use a much-lower limit.

That seems like a meaningless comment in the context of a slab allocator
with a fixed size.

Why? It might be worth moving this to SlabContextCreate though.

+	/*
+	 * If there are no free chunks in any existing block, create a new block
+	 * and put it to the last freelist bucket.
+	 *
+	 * (set->minFreeChunks == 0) means there are no blocks with free chunks,
+	 * thanks to how minFreeChunks is updated at the end of SlabAlloc().
+	 */
+	if (set->minFreeChunks == 0)
+	{
+		block = (SlabBlock)malloc(set->blockSize);

Space after cast - maybe run pgindent over the file before submission?
Doing that manually helps to avoid ugly damange by the per-release run
later. I'm pretty sure there'll be a significant number of changes.

Will do.

+	if (block->nfree == 0)
+		block->firstFreeChunk = set->chunksPerBlock;
+	else
+	{
+		/* look for the next free chunk in the block, after the first one */
+		while ((++block->firstFreeChunk) < set->chunksPerBlock)
+		{
+			int byte = block->firstFreeChunk / 8;
+			int bit  = block->firstFreeChunk % 8;
+
+			/* stop when you find 0 (unused chunk) */
+			if (! (block->bitmapptr[byte] & (0x01 << bit)))
+				break;
+		}

I haven't profiled (or even compiled) this patchset yet, but FWIW, in
the tuple deforming code, I could measure a noticeable speedup by
accessing bitmap-bytes in the native word-size, rather than char. I'm
*NOT* saying you should do that, but if this ever shows up as a
bottlneck, it might be worthwhile to optimize.

OK, will try, although I don't expect this branch to be very hot.

+	/*
+	 * And finally update minFreeChunks, i.e. the index to the block with the
+	 * lowest number of free chunks. We only need to do that when the block
+	 * got full (otherwise we know the current block is the right one).
+	 * We'll simply walk the freelist until we find a non-empty entry.
+	 */
+	if (set->minFreeChunks == 0)
+		for (idx = 1; idx <= set->chunksPerBlock; idx++)
+			if (set->freelist[idx])
+			{
+				set->minFreeChunks = idx;
+				break;
+			}

Yuck. This definitely needs braces.

OK ;-)

thanks

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#30Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Tomas Vondra (#29)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

Attached is v6 of the patch series, fixing most of the points:

* common bits (valgrind/randomization/wipe) moved to memdebug.h/c

Instead of introducing a new header file, I've added the prototypes to
memdebug.h (which was already used for the valgrind stuff anyway), and
the implementations to a new memdebug.c file. Not sure what you meant by
"static inlines" though.

So the patch series now has three parts - 0001 with memdebug stuff, 0002
with slab and 0003 with gen (still a poor name).

* removing AllocSet references from both new memory contexts

* using FLEXIBLE_ARRAY_ELEMENT in SlabContext

* using dlist instead of the custom linked list

I've however kept SlabContext->freelist as an array, because there may
be many blocks with the same number of free chunks, in which case moving
the block in the list would be expensive. This way it's simply
dlist_delete + dlist_push.

* use StandardChunkHeader instead of the common fields

* removing pointer to context from block header for both contexts

* fix format in FreeInfo/AllocInfo (including for AllocSet)

* improved a bunch of comments (bitmap size, chunksPerBlock formula)

* did a pgindent run on the patch

* implement the missing methods in Gen (Stats/Check)

* fix a few minor bugs in both contexts

I haven't done anything with hiding pointers behind typedefs, because I
don't know what's so wrong about that.

I also haven't done anything with the bitmap access in SlabAlloc - I
haven't found any reasonable case when it would be measurable, and I
don't expect this to be even measurable in practice.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

slab-allocators-v6.tgzapplication/x-compressed-tar; name=slab-allocators-v6.tgzDownload
#31Petr Jelinek
petr@2ndquadrant.com
In reply to: Tomas Vondra (#30)
Re: PATCH: two slab-like memory allocators

On 15/11/16 01:44, Tomas Vondra wrote:

Attached is v6 of the patch series, fixing most of the points:

* common bits (valgrind/randomization/wipe) moved to memdebug.h/c

Instead of introducing a new header file, I've added the prototypes to
memdebug.h (which was already used for the valgrind stuff anyway), and
the implementations to a new memdebug.c file. Not sure what you meant by
"static inlines" though.

I think Andres wanted to put the implementation to the static inline
functions directly in the header (see parts of pg_list or how atomics
work), but I guess you way works too.

I've however kept SlabContext->freelist as an array, because there may
be many blocks with the same number of free chunks, in which case moving
the block in the list would be expensive. This way it's simply
dlist_delete + dlist_push.

+1

I get mxact isolation test failures in test_decoding with this version
of patch:
  step s0w: INSERT INTO do_write DEFAULT VALUES;
+ WARNING:  problem in slab TXN: number of free chunks 33 in block
0x22beba0 does not match bitmap 34
  step s0start: SELECT data FROM
pg_logical_slot_get_changes('isolation_slot', NULL, NULL,
'include-xids', 'false');
  data
and
  step s0alter: ALTER TABLE do_write ADD column ts timestamptz;
  step s0w: INSERT INTO do_write DEFAULT VALUES;
+ WARNING:  problem in slab TXN: number of free chunks 33 in block
0x227c3f0 does not match bitmap 34
  step s0start: SELECT data FROM
pg_logical_slot_get_changes('isolation_slot', NULL, NULL,
'include-xids', 'false');
  data

Also, let's just rename the Gen to Generation.

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#32Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Petr Jelinek (#31)
Re: PATCH: two slab-like memory allocators

On 11/27/2016 07:25 PM, Petr Jelinek wrote:

On 15/11/16 01:44, Tomas Vondra wrote:

Attached is v6 of the patch series, fixing most of the points:

* common bits (valgrind/randomization/wipe) moved to memdebug.h/c

Instead of introducing a new header file, I've added the prototypes to
memdebug.h (which was already used for the valgrind stuff anyway), and
the implementations to a new memdebug.c file. Not sure what you meant by
"static inlines" though.

I think Andres wanted to put the implementation to the static inline
functions directly in the header (see parts of pg_list or how atomics
work), but I guess you way works too.

I see. Well turning that into static inlines just like in pg_list is
possible. I guess the main reason is performance - for pg_list that
probably makes sense, but the memory randomization/valgrind stuff is
only ever used for debugging, which already does a lot of expensive
stuff anyway.

I've however kept SlabContext->freelist as an array, because there may
be many blocks with the same number of free chunks, in which case moving
the block in the list would be expensive. This way it's simply
dlist_delete + dlist_push.

+1

I get mxact isolation test failures in test_decoding with this version
of patch:
step s0w: INSERT INTO do_write DEFAULT VALUES;
+ WARNING:  problem in slab TXN: number of free chunks 33 in block
0x22beba0 does not match bitmap 34
step s0start: SELECT data FROM
pg_logical_slot_get_changes('isolation_slot', NULL, NULL,
'include-xids', 'false');
data
and
step s0alter: ALTER TABLE do_write ADD column ts timestamptz;
step s0w: INSERT INTO do_write DEFAULT VALUES;
+ WARNING:  problem in slab TXN: number of free chunks 33 in block
0x227c3f0 does not match bitmap 34
step s0start: SELECT data FROM
pg_logical_slot_get_changes('isolation_slot', NULL, NULL,
'include-xids', 'false');
data

D'oh! I believe this is a simple thinko in SlabCheck, which iterates
over chunks like this:

for (j = 0; j <= slab->chunksPerBlock; j++)
...

which is of course off-by-one error (and the 33 vs. 34 error message is
consistent with this theory).

Also, let's just rename the Gen to Generation.

OK.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#33Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#29)
Re: PATCH: two slab-like memory allocators

Hi,

+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;

Can we please not continue hiding pointers behind typedefs? It's a bad
pattern, and that it's fairly widely used isn't a good excuse to
introduce further usages of it.

Why is it a bad pattern?

It hides what is passed by reference, and what by value, and it makes it
a guessing game whether you need -> or . since you don't know whether
it's a pointer or the actual object. All to save a * in parameter and
variable declaration?...

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#34Petr Jelinek
petr@2ndquadrant.com
In reply to: Andres Freund (#33)
Re: PATCH: two slab-like memory allocators

On 27/11/16 21:47, Andres Freund wrote:

Hi,

+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;

Can we please not continue hiding pointers behind typedefs? It's a bad
pattern, and that it's fairly widely used isn't a good excuse to
introduce further usages of it.

Why is it a bad pattern?

It hides what is passed by reference, and what by value, and it makes it
a guessing game whether you need -> or . since you don't know whether
it's a pointer or the actual object. All to save a * in parameter and
variable declaration?...

FWIW I don't like that pattern either although it's used in many parts
of our code-base.

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#35Andres Freund
andres@anarazel.de
In reply to: Petr Jelinek (#34)
Re: PATCH: two slab-like memory allocators

On 2016-11-27 22:21:49 +0100, Petr Jelinek wrote:

On 27/11/16 21:47, Andres Freund wrote:

Hi,

+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;

Can we please not continue hiding pointers behind typedefs? It's a bad
pattern, and that it's fairly widely used isn't a good excuse to
introduce further usages of it.

Why is it a bad pattern?

It hides what is passed by reference, and what by value, and it makes it
a guessing game whether you need -> or . since you don't know whether
it's a pointer or the actual object. All to save a * in parameter and
variable declaration?...

FWIW I don't like that pattern either although it's used in many parts
of our code-base.

But relatively few new ones, most of it is pretty old.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#36Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#35)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

Dne 11/27/2016 v 11:02 PM Andres Freund napsal(a):

On 2016-11-27 22:21:49 +0100, Petr Jelinek wrote:

On 27/11/16 21:47, Andres Freund wrote:

Hi,

+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;

Can we please not continue hiding pointers behind typedefs? It's a bad
pattern, and that it's fairly widely used isn't a good excuse to
introduce further usages of it.

Why is it a bad pattern?

It hides what is passed by reference, and what by value, and it makes it
a guessing game whether you need -> or . since you don't know whether
it's a pointer or the actual object. All to save a * in parameter and
variable declaration?...

FWIW I don't like that pattern either although it's used in many
parts of our code-base.

But relatively few new ones, most of it is pretty old.

I do agree it's not particularly pretty pattern, but in this case it's
fairly isolated in the mmgr sources, and I quite value the consistency
in this part of the code (i.e. that aset.c, slab.c and generation.c all
use the same approach). So I haven't changed this.

The attached v7 fixes the off-by-one error in slab.c, causing failures
in test_decoding isolation tests, and renames Gen to Generation, as
proposed by Petr.

regards
Tomas

Attachments:

slab-allocators-v7.tgzapplication/x-compressed-tar; name=slab-allocators-v7.tgzDownload
#37Haribabu Kommi
kommi.haribabu@gmail.com
In reply to: Tomas Vondra (#36)
Re: PATCH: two slab-like memory allocators

On Thu, Dec 1, 2016 at 1:26 PM, Tomas Vondra <tomas.vondra@2ndquadrant.com>
wrote:

Dne 11/27/2016 v 11:02 PM Andres Freund napsal(a):

On 2016-11-27 22:21:49 +0100, Petr Jelinek wrote:

On 27/11/16 21:47, Andres Freund wrote:

Hi,

+typedef struct SlabBlockData *SlabBlock; /* forward

reference */
+typedef struct SlabChunkData *SlabChunk;

Can we please not continue hiding pointers behind typedefs? It's a bad
pattern, and that it's fairly widely used isn't a good excuse to
introduce further usages of it.

Why is it a bad pattern?

It hides what is passed by reference, and what by value, and it makes it
a guessing game whether you need -> or . since you don't know whether
it's a pointer or the actual object. All to save a * in parameter and
variable declaration?...

FWIW I don't like that pattern either although it's used in many
parts of our code-base.

But relatively few new ones, most of it is pretty old.

I do agree it's not particularly pretty pattern, but in this case it's
fairly isolated in the mmgr sources, and I quite value the consistency in
this part of the code (i.e. that aset.c, slab.c and generation.c all use
the same approach). So I haven't changed this.

The attached v7 fixes the off-by-one error in slab.c, causing failures in
test_decoding isolation tests, and renames Gen to Generation, as proposed
by Petr.

Moved to next CF with same status (needs review).

Regards,
Hari Babu
Fujitsu Australia

#38Petr Jelinek
petr.jelinek@2ndquadrant.com
In reply to: Tomas Vondra (#36)
Re: PATCH: two slab-like memory allocators

On 01/12/16 03:26, Tomas Vondra wrote:

Dne 11/27/2016 v 11:02 PM Andres Freund napsal(a):

On 2016-11-27 22:21:49 +0100, Petr Jelinek wrote:

On 27/11/16 21:47, Andres Freund wrote:

Hi,

+typedef struct SlabBlockData *SlabBlock;        /* forward
reference */
+typedef struct SlabChunkData *SlabChunk;

Can we please not continue hiding pointers behind typedefs? It's a
bad
pattern, and that it's fairly widely used isn't a good excuse to
introduce further usages of it.

Why is it a bad pattern?

It hides what is passed by reference, and what by value, and it
makes it
a guessing game whether you need -> or . since you don't know whether
it's a pointer or the actual object. All to save a * in parameter and
variable declaration?...

FWIW I don't like that pattern either although it's used in many
parts of our code-base.

But relatively few new ones, most of it is pretty old.

I do agree it's not particularly pretty pattern, but in this case it's
fairly isolated in the mmgr sources, and I quite value the consistency
in this part of the code (i.e. that aset.c, slab.c and generation.c all
use the same approach). So I haven't changed this.

The attached v7 fixes the off-by-one error in slab.c, causing failures
in test_decoding isolation tests, and renames Gen to Generation, as
proposed by Petr.

I'd be happy with this patch now (as in committer ready) except that it
does have some merge conflicts after the recent commits, so rebase is
needed.

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#39Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Petr Jelinek (#38)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

On 12/12/2016 05:05 AM, Petr Jelinek wrote:

I'd be happy with this patch now (as in committer ready) except that it
does have some merge conflicts after the recent commits, so rebase is
needed.

Attached is a rebased version of the patch, resolving the Makefile merge
conflicts.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

slab-allocators-v7.tgzapplication/x-compressed-tar; name=slab-allocators-v7.tgzDownload
#40Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Tomas Vondra (#39)
3 attachment(s)
Re: PATCH: two slab-like memory allocators

On 12/12/2016 11:39 PM, Tomas Vondra wrote:

On 12/12/2016 05:05 AM, Petr Jelinek wrote:

I'd be happy with this patch now (as in committer ready) except that it
does have some merge conflicts after the recent commits, so rebase is
needed.

Attached is a rebased version of the patch, resolving the Makefile merge
conflicts.

Meh, managed to rebase a wrong branch, missing fix to the off-by-one
error (fixed v6). Attached is v8, hopefully the correct one.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

0003-generational-context-v8.patchbinary/octet-stream; name=0003-generational-context-v8.patchDownload
From 45d781c581efc54070f26f64972bdb10ddeb7954 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Thu, 1 Dec 2016 01:42:32 +0100
Subject: [PATCH 3/3] generational context

---
 src/backend/replication/logical/reorderbuffer.c |  78 +--
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/gen.c                    | 758 ++++++++++++++++++++++++
 src/backend/utils/mmgr/generation.c             | 758 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   4 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |  14 -
 src/include/utils/memutils.h                    |   5 +
 8 files changed, 1536 insertions(+), 84 deletions(-)
 create mode 100644 src/backend/utils/mmgr/gen.c
 create mode 100644 src/backend/utils/mmgr/generation.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index caadc07..c62e786 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -149,15 +149,6 @@ typedef struct ReorderBufferDiskChange
  */
 static const Size max_changes_in_memory = 4096;
 
-/*
- * We use a very simple form of a slab allocator for frequently allocated
- * objects, simply keeping a fixed number in a linked list when unused,
- * instead pfree()ing them. Without that in many workloads aset.c becomes a
- * major bottleneck, especially when spilling to disk while decoding batch
- * workloads.
- */
-static const Size max_cached_tuplebufs = 4096 * 2;		/* ~8MB */
-
 /* ---------------------------------------
  * primary reorderbuffer support routines
  * ---------------------------------------
@@ -248,11 +239,9 @@ ReorderBufferAllocate(void)
 											SLAB_DEFAULT_BLOCK_SIZE,
 											sizeof(ReorderBufferTXN));
 
-	buffer->tup_context = AllocSetContextCreate(new_ctx,
-									"TupleBuf",
-									ALLOCSET_DEFAULT_MINSIZE,
-									ALLOCSET_DEFAULT_INITSIZE,
-									ALLOCSET_DEFAULT_MAXSIZE);
+	buffer->tup_context = GenerationContextCreate(new_ctx,
+										   "Tuples",
+										   SLAB_LARGE_BLOCK_SIZE);
 
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
@@ -264,15 +253,12 @@ ReorderBufferAllocate(void)
 	buffer->by_txn_last_xid = InvalidTransactionId;
 	buffer->by_txn_last_txn = NULL;
 
-	buffer->nr_cached_tuplebufs = 0;
-
 	buffer->outbuf = NULL;
 	buffer->outbufsize = 0;
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
 	dlist_init(&buffer->toplevel_by_lsn);
-	slist_init(&buffer->cached_tuplebufs);
 
 	return buffer;
 }
@@ -425,42 +411,12 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/*
-	 * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-	 * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-	 * generated for oldtuples can be bigger, as they don't have out-of-line
-	 * toast columns.
-	 */
-	if (alloc_len < MaxHeapTupleSize)
-		alloc_len = MaxHeapTupleSize;
-
-
-	/* if small enough, check the slab cache */
-	if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs--;
-		tuple = slist_container(ReorderBufferTupleBuf, node,
-								slist_pop_head_node(&rb->cached_tuplebufs));
-		Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-		memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-		VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-		memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
-	}
-	else
-	{
-		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->tup_context,
-							   sizeof(ReorderBufferTupleBuf) +
-							   MAXIMUM_ALIGNOF + alloc_len);
-		tuple->alloc_tuple_size = alloc_len;
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-	}
+	tuple = (ReorderBufferTupleBuf *)
+		MemoryContextAlloc(rb->tup_context,
+						   sizeof(ReorderBufferTupleBuf) +
+						   MAXIMUM_ALIGNOF + alloc_len);
+	tuple->alloc_tuple_size = alloc_len;
+	tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
 
 	return tuple;
 }
@@ -474,21 +430,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
-	/* check whether to put into the slab cache, oversized tuples never are */
-	if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-		rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs++;
-		slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-	}
-	else
-	{
-		pfree(tuple);
-	}
+	pfree(tuple);
 }
 
 /*
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index cd0e803..7263399 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
+OBJS = aset.o generation.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/gen.c b/src/backend/utils/mmgr/gen.c
new file mode 100644
index 0000000..669a512
--- /dev/null
+++ b/src/backend/utils/mmgr/gen.c
@@ -0,0 +1,758 @@
+/*-------------------------------------------------------------------------
+ *
+ * generation.c
+ *	  Generational allocator definitions.
+ *
+ * Generation is a custom MemoryContext implementation designed for cases of
+ * chunks with similar lifespan.
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/Generation.c
+ *
+ *
+ *	This memory context is based on the assumption that the allocated chunks
+ *	have similar lifespan, i.e. that chunks allocated close from each other
+ *	(by time) will also be freed in close proximity, and mostly in the same
+ *	order. This is typical for various queue-like use cases, i.e. when tuples
+ *	are constructed, processed and then thrown away.
+ *
+ *	The memory context uses a very simple approach to free space management.
+ *	Instead of a complex global freelist, each block tracks a number
+ *	of allocated and freed chunks. The space release by freed chunks is not
+ *	reused, and once all chunks are freed (i.e. when nallocated == nfreed),
+ *	the whole block is thrown away. When the allocated chunks have similar
+ *	lifespan, this works very well and is extremely cheap.
+ *
+ *	The current implementation only uses a fixed block size - maybe it should
+ *	adapt a min/max block size range, and grow the blocks automatically.
+ *	It already uses dedicated blocks for oversized chunks.
+ *
+ *	XXX It might be possible to improve this by keeping a small freelist for
+ *	only a small number of recent blocks, but it's not clear it's worth the
+ *	additional complexity.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"
+
+
+#define Generation_BLOCKHDRSZ	MAXALIGN(sizeof(GenerationBlockData))
+#define Generation_CHUNKHDRSZ	MAXALIGN(sizeof(GenerationChunkData))
+
+/* Portion of Generation_CHUNKHDRSZ examined outside Generation.c. */
+#define Generation_CHUNK_PUBLIC	\
+	(offsetof(GenerationChunkData, size) + sizeof(Size))
+
+/* Portion of Generation_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunkData, requested_size) + sizeof(Size))
+#else
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunkData, size) + sizeof(Size))
+#endif
+
+typedef struct GenerationBlockData *GenerationBlock;	/* forward reference */
+typedef struct GenerationChunkData *GenerationChunk;
+
+typedef void *GenerationPointer;
+
+/*
+ * GenerationContext is a simple memory context not reusing allocated chunks, and
+ * freeing blocks once all chunks are freed.
+ */
+typedef struct GenerationContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+
+	/* Generationerational context parameters */
+	Size		blockSize;		/* block size */
+
+	GenerationBlock	block;			/* current (most recently allocated) block */
+	dlist_head	blocks;			/* list of blocks */
+
+}	GenerationContext;
+
+typedef GenerationContext *Generation;
+
+/*
+ * GenerationBlockData
+ *		A GenerationBlock is the unit of memory that is obtained by Generation.c
+ *		from malloc().  It contains one or more GenerationChunks, which are
+ *		the units requested by palloc() and freed by pfree().  GenerationChunks
+ *		cannot be returned to malloc() individually, instead pfree()
+ *		updates a free counter on a block and when all chunks on a block
+ *		are freed the whole block is returned to malloc().
+ *
+ *		GenerationBlockData is the header data for a block --- the usable space
+ *		within the block begins at the next alignment boundary.
+ */
+typedef struct GenerationBlockData
+{
+	dlist_node	node;			/* doubly-linked list */
+	int			nchunks;		/* number of chunks in the block */
+	int			nfree;			/* number of free chunks */
+	char	   *freeptr;		/* start of free space in this block */
+	char	   *endptr;			/* end of space in this block */
+}	GenerationBlockData;
+
+/*
+ * GenerationChunk
+ *		The prefix of each piece of memory in an GenerationBlock
+ */
+typedef struct GenerationChunkData
+{
+	/* block owning this chunk */
+	void	   *block;
+
+	/* include StandardChunkHeader because mcxt.c expects that */
+	StandardChunkHeader header;
+
+}	GenerationChunkData;
+
+
+/*
+ * GenerationIsValid
+ *		True iff set is valid allocation set.
+ */
+#define GenerationIsValid(set) PointerIsValid(set)
+
+#define GenerationPointerGetChunk(ptr) \
+					((GenerationChunk)(((char *)(ptr)) - Generation_CHUNKHDRSZ))
+#define GenerationChunkGetPointer(chk) \
+					((GenerationPointer)(((char *)(chk)) + Generation_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Generation contexts.
+ */
+static void *GenerationAlloc(MemoryContext context, Size size);
+static void GenerationFree(MemoryContext context, void *pointer);
+static void *GenerationRealloc(MemoryContext context, void *pointer, Size size);
+static void GenerationInit(MemoryContext context);
+static void GenerationReset(MemoryContext context);
+static void GenerationDelete(MemoryContext context);
+static Size GenerationGetChunkSpace(MemoryContext context, void *pointer);
+static bool GenerationIsEmpty(MemoryContext context);
+static void GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void GenerationCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Generation contexts.
+ */
+static MemoryContextMethods GenerationMethods = {
+	GenerationAlloc,
+	GenerationFree,
+	GenerationRealloc,
+	GenerationInit,
+	GenerationReset,
+	GenerationDelete,
+	GenerationGetChunkSpace,
+	GenerationIsEmpty,
+	GenerationStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,GenerationCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define GenerationFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationFree: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define GenerationAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationAlloc: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define GenerationFreeInfo(_cxt, _chunk)
+#define GenerationAllocInfo(_cxt, _chunk)
+#endif
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenerationContextCreate
+ *		Create a new Generation context.
+ */
+MemoryContext
+GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize)
+{
+	Generation			set;
+
+	/*
+	 * First, validate allocation parameters.  (If we're going to throw an
+	 * error, we should do so before the context is created, not after.)  We
+	 * somewhat arbitrarily enforce a minimum 1K block size, mostly because
+	 * that's what AllocSet does.
+	 */
+	if (blockSize != MAXALIGN(blockSize) ||
+		blockSize < 1024 ||
+		!AllocHugeSizeIsValid(blockSize))
+		elog(ERROR, "invalid blockSize for memory context: %zu",
+			 blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (Generation) MemoryContextCreate(T_GenerationContext,
+									sizeof(GenerationContext),
+									&GenerationMethods,
+									parent,
+									name);
+
+	set->blockSize = blockSize;
+	set->block = NULL;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * GenerationInit
+ *		Context-type-specific initialization routine.
+ */
+static void
+GenerationInit(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+
+	dlist_init(&set->blocks);
+}
+
+/*
+ * GenerationReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+GenerationReset(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+	dlist_mutable_iter miter;
+
+	AssertArg(GenerationIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	GenerationCheck(context);
+#endif
+
+	dlist_foreach_modify(miter, &set->blocks)
+	{
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, miter.cur);
+
+		dlist_delete(miter.cur);
+
+		/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+		wipe_mem(block, set->blockSize);
+#endif
+
+		free(block);
+	}
+
+	set->block = NULL;
+
+	Assert(dlist_is_empty(&set->blocks));
+}
+
+/*
+ * GenerationDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call GenerationReset() which does all the
+ *		dirty work.
+ */
+static void
+GenerationDelete(MemoryContext context)
+{
+	/* just reset (although not really necessary) */
+	GenerationReset(context);
+}
+
+/*
+ * GenerationAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - Generation_BLOCKHDRSZ - Generation_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+GenerationAlloc(MemoryContext context, Size size)
+{
+	Generation			set = (Generation) context;
+	GenerationBlock	block;
+	GenerationChunk	chunk;
+	Size		chunk_size = MAXALIGN(size);
+
+	/* is it an over-sized chunk? if yes, allocate special block */
+	if (chunk_size > set->blockSize / 8)
+	{
+		Size		blksize = chunk_size + Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ;
+
+		block = (GenerationBlock) malloc(blksize);
+		if (block == NULL)
+			return NULL;
+
+		/* block with a single (used) chunk */
+		block->nchunks = 1;
+		block->nfree = 0;
+
+		/* the block is completely full */
+		block->freeptr = block->endptr = ((char *) block) + blksize;
+
+		chunk = (GenerationChunk) (((char *) block) + Generation_BLOCKHDRSZ);
+		chunk->header.context = (MemoryContext) set;
+		chunk->header.size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+		/* Valgrind: Will be made NOACCESS below. */
+		chunk->header.requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < chunk_size)
+			set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* fill the allocated space with junk */
+		randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+		/* add the block to the list of allocated blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		GenerationAllocInfo(set, chunk);
+
+		/*
+		 * Chunk header public fields remain DEFINED.  The requested
+		 * allocation itself can be NOACCESS or UNDEFINED; our caller will
+		 * soon make it UNDEFINED.  Make extra space at the end of the chunk,
+		 * if any, NOACCESS.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS((char *) chunk + Generation_CHUNK_PUBLIC,
+							 chunk_size + Generation_CHUNKHDRSZ - Generation_CHUNK_PUBLIC);
+
+		return GenerationChunkGetPointer(chunk);
+	}
+
+	/*
+	 * Not an over-sized chunk. Is there enough space on the current block? If
+	 * not, allocate a new "regular" block.
+	 */
+	block = set->block;
+
+	if ((block == NULL) ||
+		(block->endptr - block->freeptr) < Generation_CHUNKHDRSZ + chunk_size)
+	{
+		Size		blksize = set->blockSize;
+
+		block = (GenerationBlock) malloc(blksize);
+
+		if (block == NULL)
+			return NULL;
+
+		block->nchunks = 0;
+		block->nfree = 0;
+
+		block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ;
+		block->endptr = ((char *) block) + blksize;
+
+		/* Mark unallocated space NOACCESS. */
+		VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+								   blksize - Generation_BLOCKHDRSZ);
+
+		/* add it to the doubly-linked list of blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		/* and also use it as the current allocation block */
+		set->block = block;
+	}
+
+	/* we're supposed to have a block with enough free space now */
+	Assert(block != NULL);
+	Assert((block->endptr - block->freeptr) >= Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk = (GenerationChunk) block->freeptr;
+
+	block->nchunks += 1;
+	block->freeptr += (Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk->block = block;
+
+	chunk->header.context = (MemoryContext) set;
+	chunk->header.size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Valgrind: Free list requested_size should be DEFINED. */
+	chunk->header.requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+							   sizeof(chunk->header.requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->header.size)
+		set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+	GenerationAllocInfo(set, chunk);
+	return GenerationChunkGetPointer(chunk);
+}
+
+/*
+ * GenerationFree
+ *		Update number of chunks on the block, and if all chunks on the block
+ *		are freeed then discard the block.
+ */
+static void
+GenerationFree(MemoryContext context, void *pointer)
+{
+	Generation			set = (Generation) context;
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+	GenerationBlock	block = chunk->block;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < chunk->header.size)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->header.size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->header.requested_size = 0;
+#endif
+
+	block->nfree += 1;
+
+	Assert(block->nchunks > 0);
+	Assert(block->nfree <= block->nchunks);
+
+	/* If there are still allocated chunks on the block, we're done. */
+	if (block->nfree < block->nchunks)
+		return;
+
+	/*
+	 * The block is empty, so let's get rid of it. First remove it from the
+	 * list of blocks, then return it to malloc().
+	 */
+	dlist_delete(&block->node);
+
+	/* Also make sure the block is not marked as the current block. */
+	if (set->block == block)
+		set->block = NULL;
+
+	free(block);
+}
+
+/*
+ * GenerationRealloc
+ *		When handling repalloc, we simply allocate a new chunk, copy the data
+ *		and discard the old one. The only exception is when the new size fits
+ *		into the old chunk - in that case we just update chunk header.
+ */
+static void *
+GenerationRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Generation			set = (Generation) context;
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+	Size		oldsize = chunk->header.size;
+	GenerationPointer	newPointer;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < oldsize)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+	/*
+	 * Maybe the allocated area already is >= the new size.  (In particular,
+	 * we always fall out here if the requested size is a decrease.)
+	 *
+	 * This memory context is not use the power-of-2 chunk sizing and instead
+	 * carves the chunks to be as small as possible, so most repalloc() calls
+	 * will end up in the palloc/memcpy/pfree branch.
+	 *
+	 * XXX Perhaps we should annotate this condition with unlikely()?
+	 */
+	if (oldsize >= size)
+	{
+#ifdef MEMORY_CONTEXT_CHECKING
+		Size		oldrequest = chunk->header.requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* We can only fill the extra space if we know the prior request */
+		if (size > oldrequest)
+			randomize_mem((char *) pointer + oldrequest,
+						  size - oldrequest);
+#endif
+
+		chunk->header.requested_size = size;
+		VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+								   sizeof(chunk->header.requested_size));
+
+		/*
+		 * If this is an increase, mark any newly-available part UNDEFINED.
+		 * Otherwise, mark the obsolete part NOACCESS.
+		 */
+		if (size > oldrequest)
+			VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+										size - oldrequest);
+		else
+			VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+									   oldsize - size);
+
+		/* set mark to catch clobber of "unused" space */
+		if (size < oldsize)
+			set_sentinel(pointer, size);
+#else							/* !MEMORY_CONTEXT_CHECKING */
+
+		/*
+		 * We don't have the information to determine whether we're growing
+		 * the old request or shrinking it, so we conservatively mark the
+		 * entire new allocation DEFINED.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize);
+		VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+		return pointer;
+	}
+
+	/* allocate new chunk */
+	newPointer = GenerationAlloc((MemoryContext) set, size);
+
+	/* leave immediately if request was not completed */
+	if (newPointer == NULL)
+		return NULL;
+
+	/*
+	 * GenerationSetAlloc() just made the region NOACCESS.  Change it to UNDEFINED
+	 * for the moment; memcpy() will then transfer definedness from the old
+	 * allocation to the new.  If we know the old allocation, copy just that
+	 * much.  Otherwise, make the entire old chunk defined to avoid errors as
+	 * we copy the currently-NOACCESS trailing bytes.
+	 */
+	VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+	oldsize = chunk->header.requested_size;
+#else
+	VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+	/* transfer existing data (certain to fit) */
+	memcpy(newPointer, pointer, oldsize);
+
+	/* free old chunk */
+	GenerationFree((MemoryContext) set, pointer);
+
+	return newPointer;
+}
+
+/*
+ * GenerationGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+GenerationGetChunkSpace(MemoryContext context, void *pointer)
+{
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+
+	return chunk->header.size + Generation_CHUNKHDRSZ;
+}
+
+/*
+ * GenerationIsEmpty
+ *		Is an Generation empty of any allocated space?
+ */
+static bool
+GenerationIsEmpty(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+
+	return dlist_is_empty(&set->blocks);
+}
+
+/*
+ * GenerationStats
+ *		Compute stats about memory consumption of an Generation.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Generation into *totals.
+ *
+ * XXX freespace only accounts for empty space at the end of the block, not
+ * space of freed chunks (which is unknown).
+ */
+static void
+GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals)
+{
+	Generation			set = (Generation) context;
+
+	Size		nblocks = 0;
+	Size		nchunks = 0;
+	Size		nfreechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+
+	dlist_iter	iter;
+
+	dlist_foreach(iter, &set->blocks)
+	{
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, iter.cur);
+
+		nblocks++;
+		nchunks += block->nchunks;
+		nfreechunks += block->nfree;
+		totalspace += set->blockSize;
+		freespace += (block->endptr - block->freeptr);
+	}
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"Generation: %s: %zu total in %zd blocks (%zd chunks); %zu free (%zd chunks); %zu used\n",
+				set->header.name, totalspace, nblocks, nchunks, freespace,
+				nfreechunks, totalspace - freespace);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += nfreechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenerationCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+GenerationCheck(MemoryContext context)
+{
+	Generation			Generation = (Generation) context;
+	char	   *name = Generation->header.name;
+	dlist_iter	iter;
+
+	/* walk all blocks in this context */
+	dlist_foreach(iter, &Generation->blocks)
+	{
+		int			nfree,
+					nchunks;
+		char	   *ptr;
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, iter.cur);
+
+		/* We can't free more chunks than allocated. */
+		if (block->nfree <= block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p exceeds %d allocated",
+				 name, block->nfree, block, block->nchunks);
+
+		/* Now walk through the chunks and count them. */
+		nfree = 0;
+		nchunks = 0;
+		ptr = ((char *) block) + Generation_BLOCKHDRSZ;
+
+		while (ptr < block->freeptr)
+		{
+			GenerationChunk	chunk = (GenerationChunk)ptr;
+
+			/* move to the next chunk */
+			ptr += (chunk->header.size + Generation_CHUNKHDRSZ);
+
+			/* chunks have both block and context pointers, so check both */
+			if (chunk->block != block)
+				elog(WARNING, "problem in Generation %s: bogus block link in block %p, chunk %p",
+					 name, block, chunk);
+
+			if (chunk->header.context != (MemoryContext) Generation)
+				elog(WARNING, "problem in Generation %s: bogus context link in block %p, chunk %p",
+					 name, block, chunk);
+
+			nchunks += 1;
+
+			/* if requested_size==0, the chunk was freed */
+			if (chunk->header.requested_size > 0)
+			{
+				/* if the chunk was not freed, we can trigger valgrind checks */
+				VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+									   sizeof(chunk->header.requested_size));
+
+				/* we're in a no-freelist branch */
+				VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+									   sizeof(chunk->header.requested_size));
+
+				/* now make sure the chunk size is correct */
+				if (chunk->header.size != MAXALIGN(chunk->header.requested_size))
+					elog(WARNING, "problem in Generation %s: bogus chunk size in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* there might be sentinel (thanks to alignment) */
+				if (chunk->header.requested_size < chunk->header.size &&
+					!sentinel_ok(chunk, Generation_CHUNKHDRSZ + chunk->header.requested_size))
+					elog(WARNING, "problem in Generation %s: detected write past chunk end in block %p, chunk %p",
+						 name, block, chunk);
+			}
+			else
+				nfree += 1;
+		}
+
+		/*
+		 * Make sure we got the expected number of allocated and free chunks
+		 * (as tracked in the block header).
+		 */
+		if (nchunks != block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of allocated chunks %d in block %p does not match header %d",
+				 name, nchunks, block, block->nchunks);
+
+		if (nfree != block->nfree)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p does not match header %d",
+				 name, nfree, block, block->nfree);
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c
new file mode 100644
index 0000000..67c7b8b
--- /dev/null
+++ b/src/backend/utils/mmgr/generation.c
@@ -0,0 +1,758 @@
+/*-------------------------------------------------------------------------
+ *
+ * generation.c
+ *	  Generational allocator definitions.
+ *
+ * Generation is a custom MemoryContext implementation designed for cases of
+ * chunks with similar lifespan.
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/Generation.c
+ *
+ *
+ *	This memory context is based on the assumption that the allocated chunks
+ *	have similar lifespan, i.e. that chunks allocated close from each other
+ *	(by time) will also be freed in close proximity, and mostly in the same
+ *	order. This is typical for various queue-like use cases, i.e. when tuples
+ *	are constructed, processed and then thrown away.
+ *
+ *	The memory context uses a very simple approach to free space management.
+ *	Instead of a complex global freelist, each block tracks a number
+ *	of allocated and freed chunks. The space release by freed chunks is not
+ *	reused, and once all chunks are freed (i.e. when nallocated == nfreed),
+ *	the whole block is thrown away. When the allocated chunks have similar
+ *	lifespan, this works very well and is extremely cheap.
+ *
+ *	The current implementation only uses a fixed block size - maybe it should
+ *	adapt a min/max block size range, and grow the blocks automatically.
+ *	It already uses dedicated blocks for oversized chunks.
+ *
+ *	XXX It might be possible to improve this by keeping a small freelist for
+ *	only a small number of recent blocks, but it's not clear it's worth the
+ *	additional complexity.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"
+
+
+#define Generation_BLOCKHDRSZ	MAXALIGN(sizeof(GenerationBlockData))
+#define Generation_CHUNKHDRSZ	MAXALIGN(sizeof(GenerationChunkData))
+
+/* Portion of Generation_CHUNKHDRSZ examined outside Generation.c. */
+#define Generation_CHUNK_PUBLIC	\
+	(offsetof(GenerationChunkData, size) + sizeof(Size))
+
+/* Portion of Generation_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunkData, requested_size) + sizeof(Size))
+#else
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunkData, size) + sizeof(Size))
+#endif
+
+typedef struct GenerationBlockData *GenerationBlock;	/* forward reference */
+typedef struct GenerationChunkData *GenerationChunk;
+
+typedef void *GenerationPointer;
+
+/*
+ * GenerationContext is a simple memory context not reusing allocated chunks, and
+ * freeing blocks once all chunks are freed.
+ */
+typedef struct GenerationContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+
+	/* Generationerational context parameters */
+	Size		blockSize;		/* block size */
+
+	GenerationBlock	block;			/* current (most recently allocated) block */
+	dlist_head	blocks;			/* list of blocks */
+
+}	GenerationContext;
+
+typedef GenerationContext *Generation;
+
+/*
+ * GenerationBlockData
+ *		A GenerationBlock is the unit of memory that is obtained by Generation.c
+ *		from malloc().  It contains one or more GenerationChunks, which are
+ *		the units requested by palloc() and freed by pfree().  GenerationChunks
+ *		cannot be returned to malloc() individually, instead pfree()
+ *		updates a free counter on a block and when all chunks on a block
+ *		are freed the whole block is returned to malloc().
+ *
+ *		GenerationBlockData is the header data for a block --- the usable space
+ *		within the block begins at the next alignment boundary.
+ */
+typedef struct GenerationBlockData
+{
+	dlist_node	node;			/* doubly-linked list */
+	int			nchunks;		/* number of chunks in the block */
+	int			nfree;			/* number of free chunks */
+	char	   *freeptr;		/* start of free space in this block */
+	char	   *endptr;			/* end of space in this block */
+}	GenerationBlockData;
+
+/*
+ * GenerationChunk
+ *		The prefix of each piece of memory in an GenerationBlock
+ */
+typedef struct GenerationChunkData
+{
+	/* block owning this chunk */
+	void	   *block;
+
+	/* include StandardChunkHeader because mcxt.c expects that */
+	StandardChunkHeader header;
+
+}	GenerationChunkData;
+
+
+/*
+ * GenerationIsValid
+ *		True iff set is valid allocation set.
+ */
+#define GenerationIsValid(set) PointerIsValid(set)
+
+#define GenerationPointerGetChunk(ptr) \
+					((GenerationChunk)(((char *)(ptr)) - Generation_CHUNKHDRSZ))
+#define GenerationChunkGetPointer(chk) \
+					((GenerationPointer)(((char *)(chk)) + Generation_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Generation contexts.
+ */
+static void *GenerationAlloc(MemoryContext context, Size size);
+static void GenerationFree(MemoryContext context, void *pointer);
+static void *GenerationRealloc(MemoryContext context, void *pointer, Size size);
+static void GenerationInit(MemoryContext context);
+static void GenerationReset(MemoryContext context);
+static void GenerationDelete(MemoryContext context);
+static Size GenerationGetChunkSpace(MemoryContext context, void *pointer);
+static bool GenerationIsEmpty(MemoryContext context);
+static void GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void GenerationCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Generation contexts.
+ */
+static MemoryContextMethods GenerationMethods = {
+	GenerationAlloc,
+	GenerationFree,
+	GenerationRealloc,
+	GenerationInit,
+	GenerationReset,
+	GenerationDelete,
+	GenerationGetChunkSpace,
+	GenerationIsEmpty,
+	GenerationStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,GenerationCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define GenerationFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationFree: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define GenerationAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationAlloc: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define GenerationFreeInfo(_cxt, _chunk)
+#define GenerationAllocInfo(_cxt, _chunk)
+#endif
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenerationContextCreate
+ *		Create a new Generation context.
+ */
+MemoryContext
+GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize)
+{
+	Generation			set;
+
+	/*
+	 * First, validate allocation parameters.  (If we're going to throw an
+	 * error, we should do so before the context is created, not after.)  We
+	 * somewhat arbitrarily enforce a minimum 1K block size, mostly because
+	 * that's what AllocSet does.
+	 */
+	if (blockSize != MAXALIGN(blockSize) ||
+		blockSize < 1024 ||
+		!AllocHugeSizeIsValid(blockSize))
+		elog(ERROR, "invalid blockSize for memory context: %zu",
+			 blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (Generation) MemoryContextCreate(T_GenerationContext,
+									sizeof(GenerationContext),
+									&GenerationMethods,
+									parent,
+									name);
+
+	set->blockSize = blockSize;
+	set->block = NULL;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * GenerationInit
+ *		Context-type-specific initialization routine.
+ */
+static void
+GenerationInit(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+
+	dlist_init(&set->blocks);
+}
+
+/*
+ * GenerationReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+GenerationReset(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+	dlist_mutable_iter miter;
+
+	AssertArg(GenerationIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	GenerationCheck(context);
+#endif
+
+	dlist_foreach_modify(miter, &set->blocks)
+	{
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, miter.cur);
+
+		dlist_delete(miter.cur);
+
+		/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+		wipe_mem(block, set->blockSize);
+#endif
+
+		free(block);
+	}
+
+	set->block = NULL;
+
+	Assert(dlist_is_empty(&set->blocks));
+}
+
+/*
+ * GenerationDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call GenerationReset() which does all the
+ *		dirty work.
+ */
+static void
+GenerationDelete(MemoryContext context)
+{
+	/* just reset (although not really necessary) */
+	GenerationReset(context);
+}
+
+/*
+ * GenerationAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - Generation_BLOCKHDRSZ - Generation_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+GenerationAlloc(MemoryContext context, Size size)
+{
+	Generation			set = (Generation) context;
+	GenerationBlock	block;
+	GenerationChunk	chunk;
+	Size		chunk_size = MAXALIGN(size);
+
+	/* is it an over-sized chunk? if yes, allocate special block */
+	if (chunk_size > set->blockSize / 8)
+	{
+		Size		blksize = chunk_size + Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ;
+
+		block = (GenerationBlock) malloc(blksize);
+		if (block == NULL)
+			return NULL;
+
+		/* block with a single (used) chunk */
+		block->nchunks = 1;
+		block->nfree = 0;
+
+		/* the block is completely full */
+		block->freeptr = block->endptr = ((char *) block) + blksize;
+
+		chunk = (GenerationChunk) (((char *) block) + Generation_BLOCKHDRSZ);
+		chunk->header.context = (MemoryContext) set;
+		chunk->header.size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+		/* Valgrind: Will be made NOACCESS below. */
+		chunk->header.requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < chunk_size)
+			set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* fill the allocated space with junk */
+		randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+		/* add the block to the list of allocated blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		GenerationAllocInfo(set, chunk);
+
+		/*
+		 * Chunk header public fields remain DEFINED.  The requested
+		 * allocation itself can be NOACCESS or UNDEFINED; our caller will
+		 * soon make it UNDEFINED.  Make extra space at the end of the chunk,
+		 * if any, NOACCESS.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS((char *) chunk + Generation_CHUNK_PUBLIC,
+							 chunk_size + Generation_CHUNKHDRSZ - Generation_CHUNK_PUBLIC);
+
+		return GenerationChunkGetPointer(chunk);
+	}
+
+	/*
+	 * Not an over-sized chunk. Is there enough space on the current block? If
+	 * not, allocate a new "regular" block.
+	 */
+	block = set->block;
+
+	if ((block == NULL) ||
+		(block->endptr - block->freeptr) < Generation_CHUNKHDRSZ + chunk_size)
+	{
+		Size		blksize = set->blockSize;
+
+		block = (GenerationBlock) malloc(blksize);
+
+		if (block == NULL)
+			return NULL;
+
+		block->nchunks = 0;
+		block->nfree = 0;
+
+		block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ;
+		block->endptr = ((char *) block) + blksize;
+
+		/* Mark unallocated space NOACCESS. */
+		VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+								   blksize - Generation_BLOCKHDRSZ);
+
+		/* add it to the doubly-linked list of blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		/* and also use it as the current allocation block */
+		set->block = block;
+	}
+
+	/* we're supposed to have a block with enough free space now */
+	Assert(block != NULL);
+	Assert((block->endptr - block->freeptr) >= Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk = (GenerationChunk) block->freeptr;
+
+	block->nchunks += 1;
+	block->freeptr += (Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk->block = block;
+
+	chunk->header.context = (MemoryContext) set;
+	chunk->header.size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Valgrind: Free list requested_size should be DEFINED. */
+	chunk->header.requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+							   sizeof(chunk->header.requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->header.size)
+		set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+	GenerationAllocInfo(set, chunk);
+	return GenerationChunkGetPointer(chunk);
+}
+
+/*
+ * GenerationFree
+ *		Update number of chunks on the block, and if all chunks on the block
+ *		are freeed then discard the block.
+ */
+static void
+GenerationFree(MemoryContext context, void *pointer)
+{
+	Generation			set = (Generation) context;
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+	GenerationBlock	block = chunk->block;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < chunk->header.size)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->header.size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->header.requested_size = 0;
+#endif
+
+	block->nfree += 1;
+
+	Assert(block->nchunks > 0);
+	Assert(block->nfree <= block->nchunks);
+
+	/* If there are still allocated chunks on the block, we're done. */
+	if (block->nfree < block->nchunks)
+		return;
+
+	/*
+	 * The block is empty, so let's get rid of it. First remove it from the
+	 * list of blocks, then return it to malloc().
+	 */
+	dlist_delete(&block->node);
+
+	/* Also make sure the block is not marked as the current block. */
+	if (set->block == block)
+		set->block = NULL;
+
+	free(block);
+}
+
+/*
+ * GenerationRealloc
+ *		When handling repalloc, we simply allocate a new chunk, copy the data
+ *		and discard the old one. The only exception is when the new size fits
+ *		into the old chunk - in that case we just update chunk header.
+ */
+static void *
+GenerationRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Generation			set = (Generation) context;
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+	Size		oldsize = chunk->header.size;
+	GenerationPointer	newPointer;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < oldsize)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+	/*
+	 * Maybe the allocated area already is >= the new size.  (In particular,
+	 * we always fall out here if the requested size is a decrease.)
+	 *
+	 * This memory context is not use the power-of-2 chunk sizing and instead
+	 * carves the chunks to be as small as possible, so most repalloc() calls
+	 * will end up in the palloc/memcpy/pfree branch.
+	 *
+	 * XXX Perhaps we should annotate this condition with unlikely()?
+	 */
+	if (oldsize >= size)
+	{
+#ifdef MEMORY_CONTEXT_CHECKING
+		Size		oldrequest = chunk->header.requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* We can only fill the extra space if we know the prior request */
+		if (size > oldrequest)
+			randomize_mem((char *) pointer + oldrequest,
+						  size - oldrequest);
+#endif
+
+		chunk->header.requested_size = size;
+		VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+								   sizeof(chunk->header.requested_size));
+
+		/*
+		 * If this is an increase, mark any newly-available part UNDEFINED.
+		 * Otherwise, mark the obsolete part NOACCESS.
+		 */
+		if (size > oldrequest)
+			VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+										size - oldrequest);
+		else
+			VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+									   oldsize - size);
+
+		/* set mark to catch clobber of "unused" space */
+		if (size < oldsize)
+			set_sentinel(pointer, size);
+#else							/* !MEMORY_CONTEXT_CHECKING */
+
+		/*
+		 * We don't have the information to determine whether we're growing
+		 * the old request or shrinking it, so we conservatively mark the
+		 * entire new allocation DEFINED.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize);
+		VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+		return pointer;
+	}
+
+	/* allocate new chunk */
+	newPointer = GenerationAlloc((MemoryContext) set, size);
+
+	/* leave immediately if request was not completed */
+	if (newPointer == NULL)
+		return NULL;
+
+	/*
+	 * GenerationSetAlloc() just made the region NOACCESS.  Change it to UNDEFINED
+	 * for the moment; memcpy() will then transfer definedness from the old
+	 * allocation to the new.  If we know the old allocation, copy just that
+	 * much.  Otherwise, make the entire old chunk defined to avoid errors as
+	 * we copy the currently-NOACCESS trailing bytes.
+	 */
+	VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+	oldsize = chunk->header.requested_size;
+#else
+	VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+	/* transfer existing data (certain to fit) */
+	memcpy(newPointer, pointer, oldsize);
+
+	/* free old chunk */
+	GenerationFree((MemoryContext) set, pointer);
+
+	return newPointer;
+}
+
+/*
+ * GenerationGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+GenerationGetChunkSpace(MemoryContext context, void *pointer)
+{
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+
+	return chunk->header.size + Generation_CHUNKHDRSZ;
+}
+
+/*
+ * GenerationIsEmpty
+ *		Is an Generation empty of any allocated space?
+ */
+static bool
+GenerationIsEmpty(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+
+	return dlist_is_empty(&set->blocks);
+}
+
+/*
+ * GenerationStats
+ *		Compute stats about memory consumption of an Generation.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Generation into *totals.
+ *
+ * XXX freespace only accounts for empty space at the end of the block, not
+ * space of freed chunks (which is unknown).
+ */
+static void
+GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals)
+{
+	Generation			set = (Generation) context;
+
+	Size		nblocks = 0;
+	Size		nchunks = 0;
+	Size		nfreechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+
+	dlist_iter	iter;
+
+	dlist_foreach(iter, &set->blocks)
+	{
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, iter.cur);
+
+		nblocks++;
+		nchunks += block->nchunks;
+		nfreechunks += block->nfree;
+		totalspace += set->blockSize;
+		freespace += (block->endptr - block->freeptr);
+	}
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"Generation: %s: %zu total in %zd blocks (%zd chunks); %zu free (%zd chunks); %zu used\n",
+				set->header.name, totalspace, nblocks, nchunks, freespace,
+				nfreechunks, totalspace - freespace);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += nfreechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenerationCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+GenerationCheck(MemoryContext context)
+{
+	Generation	gen = (Generation) context;
+	char	   *name = gen->header.name;
+	dlist_iter	iter;
+
+	/* walk all blocks in this context */
+	dlist_foreach(iter, &gen->blocks)
+	{
+		int			nfree,
+					nchunks;
+		char	   *ptr;
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, iter.cur);
+
+		/* We can't free more chunks than allocated. */
+		if (block->nfree <= block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p exceeds %d allocated",
+				 name, block->nfree, block, block->nchunks);
+
+		/* Now walk through the chunks and count them. */
+		nfree = 0;
+		nchunks = 0;
+		ptr = ((char *) block) + Generation_BLOCKHDRSZ;
+
+		while (ptr < block->freeptr)
+		{
+			GenerationChunk	chunk = (GenerationChunk)ptr;
+
+			/* move to the next chunk */
+			ptr += (chunk->header.size + Generation_CHUNKHDRSZ);
+
+			/* chunks have both block and context pointers, so check both */
+			if (chunk->block != block)
+				elog(WARNING, "problem in Generation %s: bogus block link in block %p, chunk %p",
+					 name, block, chunk);
+
+			if (chunk->header.context != (MemoryContext) gen)
+				elog(WARNING, "problem in Generation %s: bogus context link in block %p, chunk %p",
+					 name, block, chunk);
+
+			nchunks += 1;
+
+			/* if requested_size==0, the chunk was freed */
+			if (chunk->header.requested_size > 0)
+			{
+				/* if the chunk was not freed, we can trigger valgrind checks */
+				VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+									   sizeof(chunk->header.requested_size));
+
+				/* we're in a no-freelist branch */
+				VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+									   sizeof(chunk->header.requested_size));
+
+				/* now make sure the chunk size is correct */
+				if (chunk->header.size != MAXALIGN(chunk->header.requested_size))
+					elog(WARNING, "problem in Generation %s: bogus chunk size in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* there might be sentinel (thanks to alignment) */
+				if (chunk->header.requested_size < chunk->header.size &&
+					!sentinel_ok(chunk, Generation_CHUNKHDRSZ + chunk->header.requested_size))
+					elog(WARNING, "problem in Generation %s: detected write past chunk end in block %p, chunk %p",
+						 name, block, chunk);
+			}
+			else
+				nfree += 1;
+		}
+
+		/*
+		 * Make sure we got the expected number of allocated and free chunks
+		 * (as tracked in the block header).
+		 */
+		if (nchunks != block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of allocated chunks %d in block %p does not match header %d",
+				 name, nchunks, block, block->nchunks);
+
+		if (nfree != block->nfree)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p does not match header %d",
+				 name, nfree, block, block->nfree);
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index 92a7478..f3f9939 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,8 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
+	 (IsA((context), AllocSetContext) || \
+	  IsA((context), SlabContext) || \
+	  IsA((context), GenerationContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index d910236..aba3215 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -276,6 +276,7 @@ typedef enum NodeTag
 	T_MemoryContext = 600,
 	T_AllocSetContext,
 	T_SlabContext,
+	T_GenerationContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index b8f2f0e..cfa0572 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -337,20 +337,6 @@ struct ReorderBuffer
 	MemoryContext txn_context;
 	MemoryContext tup_context;
 
-	/*
-	 * Data structure slab cache.
-	 *
-	 * We allocate/deallocate some structures very frequently, to avoid bigger
-	 * overhead we cache some unused ones here.
-	 *
-	 * The maximum number of cached entries is controlled by const variables
-	 * on top of reorderbuffer.c
-	 */
-
-	/* cached ReorderBufferTupleBufs */
-	slist_head	cached_tuplebufs;
-	Size		nr_cached_tuplebufs;
-
 	XLogRecPtr	current_restart_decoding_lsn;
 
 	/* buffer for disk<->memory conversions */
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 7308845..e7d51ca 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -141,6 +141,11 @@ extern MemoryContext SlabContextCreate(MemoryContext parent,
 				  Size blockSize,
 				  Size chunkSize);
 
+/* gen.c */
+extern MemoryContext GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
-- 
2.5.5

0001-move-common-bits-to-memdebug-v8.patchbinary/octet-stream; name=0001-move-common-bits-to-memdebug-v8.patchDownload
From c32b50b6b1b03ad20ffec4764944aa6db2a8f654 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Wed, 30 Nov 2016 15:26:59 +0100
Subject: [PATCH 1/3] move common bits to memdebug

---
 src/backend/utils/mmgr/Makefile   |   2 +-
 src/backend/utils/mmgr/aset.c     | 115 +--------------------------------
 src/backend/utils/mmgr/memdebug.c | 131 ++++++++++++++++++++++++++++++++++++++
 src/include/utils/memdebug.h      |  22 +++++++
 4 files changed, 156 insertions(+), 114 deletions(-)
 create mode 100644 src/backend/utils/mmgr/memdebug.c

diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index 1842bae..fc5f793 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o portalmem.o
+OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index f44e467..412674b 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -41,46 +41,6 @@
  *	chunks as chunks.  Anything "large" is passed off to malloc().  Change
  *	the number of freelists to change the small/large boundary.
  *
- *
- *	About CLOBBER_FREED_MEMORY:
- *
- *	If this symbol is defined, all freed memory is overwritten with 0x7F's.
- *	This is useful for catching places that reference already-freed memory.
- *
- *	About MEMORY_CONTEXT_CHECKING:
- *
- *	Since we usually round request sizes up to the next power of 2, there
- *	is often some unused space immediately after a requested data area.
- *	Thus, if someone makes the common error of writing past what they've
- *	requested, the problem is likely to go unnoticed ... until the day when
- *	there *isn't* any wasted space, perhaps because of different memory
- *	alignment on a new platform, or some other effect.  To catch this sort
- *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
- *	the requested space whenever the request is less than the actual chunk
- *	size, and verifies that the byte is undamaged when the chunk is freed.
- *
- *
- *	About USE_VALGRIND and Valgrind client requests:
- *
- *	Valgrind provides "client request" macros that exchange information with
- *	the host Valgrind (if any).  Under !USE_VALGRIND, memdebug.h stubs out
- *	currently-used macros.
- *
- *	When running under Valgrind, we want a NOACCESS memory region both before
- *	and after the allocation.  The chunk header is tempting as the preceding
- *	region, but mcxt.c expects to able to examine the standard chunk header
- *	fields.  Therefore, we use, when available, the requested_size field and
- *	any subsequent padding.  requested_size is made NOACCESS before returning
- *	a chunk pointer to a caller.  However, to reduce client request traffic,
- *	it is kept DEFINED in chunks on the free list.
- *
- *	The rounded-up capacity of the chunk usually acts as a post-allocation
- *	NOACCESS region.  If the request consumes precisely the entire chunk,
- *	there is no such region; another chunk header may immediately follow.  In
- *	that case, Valgrind will not detect access beyond the end of the chunk.
- *
- *	See also the cooperating Valgrind client requests in mcxt.c.
- *
  *-------------------------------------------------------------------------
  */
 
@@ -296,10 +256,10 @@ static const unsigned char LogTable256[256] =
  */
 #ifdef HAVE_ALLOCINFO
 #define AllocFreeInfo(_cxt, _chunk) \
-			fprintf(stderr, "AllocFree: %s: %p, %d\n", \
+			fprintf(stderr, "AllocFree: %s: %p, %lu\n", \
 				(_cxt)->header.name, (_chunk), (_chunk)->size)
 #define AllocAllocInfo(_cxt, _chunk) \
-			fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \
+			fprintf(stderr, "AllocAlloc: %s: %p, %lu\n", \
 				(_cxt)->header.name, (_chunk), (_chunk)->size)
 #else
 #define AllocFreeInfo(_cxt, _chunk)
@@ -345,77 +305,6 @@ AllocSetFreeIndex(Size size)
 	return idx;
 }
 
-#ifdef CLOBBER_FREED_MEMORY
-
-/* Wipe freed memory for debugging purposes */
-static void
-wipe_mem(void *ptr, size_t size)
-{
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
-	memset(ptr, 0x7F, size);
-	VALGRIND_MAKE_MEM_NOACCESS(ptr, size);
-}
-#endif
-
-#ifdef MEMORY_CONTEXT_CHECKING
-static void
-set_sentinel(void *base, Size offset)
-{
-	char	   *ptr = (char *) base + offset;
-
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr, 1);
-	*ptr = 0x7E;
-	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
-}
-
-static bool
-sentinel_ok(const void *base, Size offset)
-{
-	const char *ptr = (const char *) base + offset;
-	bool		ret;
-
-	VALGRIND_MAKE_MEM_DEFINED(ptr, 1);
-	ret = *ptr == 0x7E;
-	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
-
-	return ret;
-}
-#endif
-
-#ifdef RANDOMIZE_ALLOCATED_MEMORY
-
-/*
- * Fill a just-allocated piece of memory with "random" data.  It's not really
- * very random, just a repeating sequence with a length that's prime.  What
- * we mainly want out of it is to have a good probability that two palloc's
- * of the same number of bytes start out containing different data.
- *
- * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
- * we fill it.  Filling the region makes it DEFINED, so make it UNDEFINED
- * again afterward.  Whether to finally make it UNDEFINED or NOACCESS is
- * fairly arbitrary.  UNDEFINED is more convenient for AllocSetRealloc(), and
- * other callers have no preference.
- */
-static void
-randomize_mem(char *ptr, size_t size)
-{
-	static int	save_ctr = 1;
-	size_t		remaining = size;
-	int			ctr;
-
-	ctr = save_ctr;
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
-	while (remaining-- > 0)
-	{
-		*ptr++ = ctr;
-		if (++ctr > 251)
-			ctr = 1;
-	}
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
-	save_ctr = ctr;
-}
-#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
-
 
 /*
  * Public routines
diff --git a/src/backend/utils/mmgr/memdebug.c b/src/backend/utils/mmgr/memdebug.c
new file mode 100644
index 0000000..ff1db78
--- /dev/null
+++ b/src/backend/utils/mmgr/memdebug.c
@@ -0,0 +1,131 @@
+/*-------------------------------------------------------------------------
+ *
+ * memdebug.c
+ *	  Declarations used in memory context implementations, not part of the
+ *	  public API of the memory management subsystem.
+ *
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/utils/memdebug.c
+ *
+ *
+ *	About CLOBBER_FREED_MEMORY:
+ *
+ *	If this symbol is defined, all freed memory is overwritten with 0x7F's.
+ *	This is useful for catching places that reference already-freed memory.
+ *
+ *	About MEMORY_CONTEXT_CHECKING:
+ *
+ *	Since we usually round request sizes up to the next power of 2, there
+ *	is often some unused space immediately after a requested data area.
+ *	Thus, if someone makes the common error of writing past what they've
+ *	requested, the problem is likely to go unnoticed ... until the day when
+ *	there *isn't* any wasted space, perhaps because of different memory
+ *	alignment on a new platform, or some other effect.  To catch this sort
+ *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ *	the requested space whenever the request is less than the actual chunk
+ *	size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *
+ *	About USE_VALGRIND and Valgrind client requests:
+ *
+ *	Valgrind provides "client request" macros that exchange information with
+ *	the host Valgrind (if any).  Under !USE_VALGRIND, memdebug.h stubs out
+ *	currently-used macros.
+ *
+ *	When running under Valgrind, we want a NOACCESS memory region both before
+ *	and after the allocation.  The chunk header is tempting as the preceding
+ *	region, but mcxt.c expects to able to examine the standard chunk header
+ *	fields.  Therefore, we use, when available, the requested_size field and
+ *	any subsequent padding.  requested_size is made NOACCESS before returning
+ *	a chunk pointer to a caller.  However, to reduce client request traffic,
+ *	it is kept DEFINED in chunks on the free list.
+ *
+ *	The rounded-up capacity of the chunk usually acts as a post-allocation
+ *	NOACCESS region.  If the request consumes precisely the entire chunk,
+ *	there is no such region; another chunk header may immediately follow.  In
+ *	that case, Valgrind will not detect access beyond the end of the chunk.
+ *
+ *	See also the cooperating Valgrind client requests in mcxt.c.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+
+
+#ifdef CLOBBER_FREED_MEMORY
+
+/* Wipe freed memory for debugging purposes */
+void
+wipe_mem(void *ptr, size_t size)
+{
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	memset(ptr, 0x7F, size);
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, size);
+}
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+void
+set_sentinel(void *base, Size offset)
+{
+	char	   *ptr = (char *) base + offset;
+
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, 1);
+	*ptr = 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+}
+
+bool
+sentinel_ok(const void *base, Size offset)
+{
+	const char *ptr = (const char *) base + offset;
+	bool		ret;
+
+	VALGRIND_MAKE_MEM_DEFINED(ptr, 1);
+	ret = *ptr == 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+
+	return ret;
+}
+#endif
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+/*
+ * Fill a just-allocated piece of memory with "random" data.  It's not really
+ * very random, just a repeating sequence with a length that's prime.  What
+ * we mainly want out of it is to have a good probability that two palloc's
+ * of the same number of bytes start out containing different data.
+ *
+ * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
+ * we fill it.  Filling the region makes it DEFINED, so make it UNDEFINED
+ * again afterward.  Whether to finally make it UNDEFINED or NOACCESS is
+ * fairly arbitrary.  UNDEFINED is more convenient for SlabRealloc(), and
+ * other callers have no preference.
+ */
+void
+randomize_mem(char *ptr, size_t size)
+{
+	static int	save_ctr = 1;
+	size_t		remaining = size;
+	int			ctr;
+
+	ctr = save_ctr;
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	while (remaining-- > 0)
+	{
+		*ptr++ = ctr;
+		if (++ctr > 251)
+			ctr = 1;
+	}
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
+	save_ctr = ctr;
+}
+
+#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
diff --git a/src/include/utils/memdebug.h b/src/include/utils/memdebug.h
index 96b5baf..6e40314 100644
--- a/src/include/utils/memdebug.h
+++ b/src/include/utils/memdebug.h
@@ -31,4 +31,26 @@
 #define VALGRIND_MEMPOOL_CHANGE(context, optr, nptr, size)	do {} while (0)
 #endif
 
+
+#ifdef CLOBBER_FREED_MEMORY
+
+/* Wipe freed memory for debugging purposes */
+void		wipe_mem(void *ptr, size_t size);
+
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+void		set_sentinel(void *base, Size offset);
+bool		sentinel_ok(const void *base, Size offset);
+
+#endif
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+void		randomize_mem(char *ptr, size_t size);
+
+#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
+
+
 #endif   /* MEMDEBUG_H */
-- 
2.5.5

0002-slab-allocator-v8.patchbinary/octet-stream; name=0002-slab-allocator-v8.patchDownload
From 43aaabf70b979b172fd659ef4d0ef129fd78d72d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Wed, 30 Nov 2016 15:36:23 +0100
Subject: [PATCH 2/3] slab allocator

---
 src/backend/replication/logical/reorderbuffer.c |  82 +--
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/slab.c                   | 803 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   2 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |  15 +-
 src/include/utils/memutils.h                    |   9 +
 7 files changed, 845 insertions(+), 69 deletions(-)
 create mode 100644 src/backend/utils/mmgr/slab.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index fa84bd8..caadc07 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -156,10 +156,7 @@ static const Size max_changes_in_memory = 4096;
  * major bottleneck, especially when spilling to disk while decoding batch
  * workloads.
  */
-static const Size max_cached_changes = 4096 * 2;
 static const Size max_cached_tuplebufs = 4096 * 2;		/* ~8MB */
-static const Size max_cached_transactions = 512;
-
 
 /* ---------------------------------------
  * primary reorderbuffer support routines
@@ -241,6 +238,22 @@ ReorderBufferAllocate(void)
 
 	buffer->context = new_ctx;
 
+	buffer->change_context = SlabContextCreate(new_ctx,
+											   "Change",
+											   SLAB_DEFAULT_BLOCK_SIZE,
+											   sizeof(ReorderBufferChange));
+
+	buffer->txn_context = SlabContextCreate(new_ctx,
+											"TXN",
+											SLAB_DEFAULT_BLOCK_SIZE,
+											sizeof(ReorderBufferTXN));
+
+	buffer->tup_context = AllocSetContextCreate(new_ctx,
+									"TupleBuf",
+									ALLOCSET_DEFAULT_MINSIZE,
+									ALLOCSET_DEFAULT_INITSIZE,
+									ALLOCSET_DEFAULT_MAXSIZE);
+
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
 	hash_ctl.hcxt = buffer->context;
@@ -251,8 +264,6 @@ ReorderBufferAllocate(void)
 	buffer->by_txn_last_xid = InvalidTransactionId;
 	buffer->by_txn_last_txn = NULL;
 
-	buffer->nr_cached_transactions = 0;
-	buffer->nr_cached_changes = 0;
 	buffer->nr_cached_tuplebufs = 0;
 
 	buffer->outbuf = NULL;
@@ -261,8 +272,6 @@ ReorderBufferAllocate(void)
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
 	dlist_init(&buffer->toplevel_by_lsn);
-	dlist_init(&buffer->cached_transactions);
-	dlist_init(&buffer->cached_changes);
 	slist_init(&buffer->cached_tuplebufs);
 
 	return buffer;
@@ -291,19 +300,8 @@ ReorderBufferGetTXN(ReorderBuffer *rb)
 {
 	ReorderBufferTXN *txn;
 
-	/* check the slab cache */
-	if (rb->nr_cached_transactions > 0)
-	{
-		rb->nr_cached_transactions--;
-		txn = (ReorderBufferTXN *)
-			dlist_container(ReorderBufferTXN, node,
-							dlist_pop_head_node(&rb->cached_transactions));
-	}
-	else
-	{
-		txn = (ReorderBufferTXN *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN));
-	}
+	txn = (ReorderBufferTXN *)
+		MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
 
 	memset(txn, 0, sizeof(ReorderBufferTXN));
 
@@ -344,18 +342,7 @@ ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
 		txn->invalidations = NULL;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_transactions < max_cached_transactions)
-	{
-		rb->nr_cached_transactions++;
-		dlist_push_head(&rb->cached_transactions, &txn->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN));
-		VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node));
-	}
-	else
-	{
-		pfree(txn);
-	}
+	pfree(txn);
 }
 
 /*
@@ -366,19 +353,8 @@ ReorderBufferGetChange(ReorderBuffer *rb)
 {
 	ReorderBufferChange *change;
 
-	/* check the slab cache */
-	if (rb->nr_cached_changes)
-	{
-		rb->nr_cached_changes--;
-		change = (ReorderBufferChange *)
-			dlist_container(ReorderBufferChange, node,
-							dlist_pop_head_node(&rb->cached_changes));
-	}
-	else
-	{
-		change = (ReorderBufferChange *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange));
-	}
+	change = (ReorderBufferChange *)
+		MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
 
 	memset(change, 0, sizeof(ReorderBufferChange));
 	return change;
@@ -434,21 +410,9 @@ ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change)
 			break;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_changes < max_cached_changes)
-	{
-		rb->nr_cached_changes++;
-		dlist_push_head(&rb->cached_changes, &change->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange));
-		VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node));
-	}
-	else
-	{
-		pfree(change);
-	}
+	pfree(change);
 }
 
-
 /*
  * Get an unused, possibly preallocated, ReorderBufferTupleBuf fitting at
  * least a tuple of size tuple_len (excluding header overhead).
@@ -491,7 +455,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 	else
 	{
 		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->context,
+			MemoryContextAlloc(rb->tup_context,
 							   sizeof(ReorderBufferTupleBuf) +
 							   MAXIMUM_ALIGNOF + alloc_len);
 		tuple->alloc_tuple_size = alloc_len;
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index fc5f793..cd0e803 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o
+OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
new file mode 100644
index 0000000..bf04891
--- /dev/null
+++ b/src/backend/utils/mmgr/slab.c
@@ -0,0 +1,803 @@
+/*-------------------------------------------------------------------------
+ *
+ * slab.c
+ *	  SLAB allocator definitions.
+ *
+ * SLAB is a custom MemoryContext implementation designed for cases of
+ * equally-sized objects.
+ *
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/slab.c
+ *
+ *
+ *	The constant allocation size allows significant simplification and various
+ *	optimizations. Firstly, we can get rid of the doubling and carve the blocks
+ *	into chunks of exactly the right size (plus alignment), not wasting memory.
+ *
+ *	The information about free chunks is maintained both at the block level and
+ *	global (context) level. This is possible as the chunk size (and thus also
+ *	the number of chunks per block) is fixed.
+ *
+ *	Each block includes a simple bitmap tracking which chunks are used/free.
+ *	This makes it trivial to check if all chunks on the block are free, and
+ *	eventually free the whole block (which is almost impossible with a global
+ *	freelist of chunks, storing chunks from all blocks).
+ *
+ *	At the context level, we use 'freelist' to track blocks ordered by number
+ *	of free chunks, starting with blocks having a single allocated chunk, and
+ *	with completely full blocks on the tail.
+ *
+ *	This also allows various optimizations - for example when searching for
+ *	free chunk, we the allocator reuses space from the most full blocks first,
+ *	in the hope that some of the less full blocks will get completely empty
+ *	(and returned back to the OS).
+ *
+ *	For each block, we maintain pointer to the first free chunk - this is quite
+ *	cheap and allows us to skip all the preceding used chunks, eliminating
+ *	a significant number of lookups in many common usage patters. In the worst
+ *	case this performs as if the pointer was not maintained.
+ *
+ *	We cache indexes of the first empty chunk on each block (firstFreeChunk),
+ *	and freelist index for blocks with least free chunks (minFreeChunks), so
+ *	that we don't have to search the freelist and block on every SlabAlloc()
+ *	call, which is quite expensive.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"
+
+
+#define SLAB_BLOCKHDRSZ MAXALIGN(sizeof(SlabBlockData))
+#define SLAB_CHUNKHDRSZ MAXALIGN(sizeof(SlabChunkData))
+
+/* Portion of SLAB_CHUNKHDRSZ examined outside slab.c. */
+#define SLAB_CHUNK_PUBLIC	\
+	(offsetof(SlabChunkData, size) + sizeof(Size))
+
+/* Portion of SLAB_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define SLAB_CHUNK_USED \
+	(offsetof(SlabChunkData, requested_size) + sizeof(Size))
+#else
+#define SLAB_CHUNK_USED \
+	(offsetof(SlabChunkData, size) + sizeof(Size))
+#endif
+
+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;
+
+/*
+ * SlabPointer
+ *		Aligned pointer which may be a member of an allocation set.
+ */
+typedef void *SlabPointer;
+
+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+	/* Allocation parameters for this context: */
+	Size		chunkSize;		/* chunk size */
+	Size		fullChunkSize;	/* chunk size including header and alignment */
+	Size		blockSize;		/* block size */
+	int			chunksPerBlock; /* number of chunks per block */
+	int			minFreeChunks;	/* min number of free chunks in any block */
+	int			nblocks;		/* number of blocks allocated */
+	/* blocks with free space, grouped by number of free chunks: */
+	dlist_head	freelist[FLEXIBLE_ARRAY_MEMBER];
+}	SlabContext;
+
+typedef SlabContext *Slab;
+
+/*
+ * SlabBlockData
+ *		Structure of a single block in SLAB allocator.
+ *
+ * node: doubly-linked list of blocks in global freelist
+ * nfree: number of free chunks in this block
+ * firstFreeChunk: pointer to the first free chunk
+ * bitmapptr: pointer to the free bitmap (tracking free chunks)
+ */
+typedef struct SlabBlockData
+{
+	dlist_node	node;			/* doubly-linked list */
+	int			nfree;			/* number of free chunks */
+	int			firstFreeChunk; /* index of the first free chunk in the block */
+	char	   *bitmapptr;		/* pointer to free bitmap */
+}	SlabBlockData;
+
+/*
+ * SlabChunk
+ *		The prefix of each piece of memory in an SlabBlock
+ */
+typedef struct SlabChunkData
+{
+	/* block owning this chunk */
+	void	   *block;
+
+	/* include StandardChunkHeader because mcxt.c expects that */
+	StandardChunkHeader header;
+
+}	SlabChunkData;
+
+
+/*
+ * SlabIsValid
+ *		True iff set is valid allocation set.
+ */
+#define SlabIsValid(set) PointerIsValid(set)
+
+#define SlabPointerGetChunk(ptr)	\
+					((SlabChunk)(((char *)(ptr)) - SLAB_CHUNKHDRSZ))
+#define SlabChunkGetPointer(chk)	\
+					((SlabPointer)(((char *)(chk)) + SLAB_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Slab contexts.
+ */
+static void *SlabAlloc(MemoryContext context, Size size);
+static void SlabFree(MemoryContext context, void *pointer);
+static void *SlabRealloc(MemoryContext context, void *pointer, Size size);
+static void SlabInit(MemoryContext context);
+static void SlabReset(MemoryContext context);
+static void SlabDelete(MemoryContext context);
+static Size SlabGetChunkSpace(MemoryContext context, void *pointer);
+static bool SlabIsEmpty(MemoryContext context);
+static void SlabStats(MemoryContext context, int level, bool print,
+		  MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void SlabCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Slab contexts.
+ */
+static MemoryContextMethods SlabMethods = {
+	SlabAlloc,
+	SlabFree,
+	SlabRealloc,
+	SlabInit,
+	SlabReset,
+	SlabDelete,
+	SlabGetChunkSpace,
+	SlabIsEmpty,
+	SlabStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,SlabCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabFree: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define SlabAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabAlloc: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define SlabFreeInfo(_cxt, _chunk)
+#define SlabAllocInfo(_cxt, _chunk)
+#endif
+
+
+/*
+ * SlabContextCreate
+ *		Create a new Slab context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (for debugging --- string will be copied)
+ * blockSize: allocation block size
+ * chunkSize: allocation chunk size
+ *
+ * The chunkSize may not exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - SLAB_BLOCKHDRSZ - SLAB_CHUNKHDRSZ
+ *
+ */
+MemoryContext
+SlabContextCreate(MemoryContext parent,
+				  const char *name,
+				  Size blockSize,
+				  Size chunkSize)
+{
+	int			chunksPerBlock;
+	Size		fullChunkSize;
+	Slab		set;
+
+	/* chunk, including SLAB header (both addresses nicely aligned) */
+	fullChunkSize = MAXALIGN(sizeof(SlabChunkData) + MAXALIGN(chunkSize));
+
+	/*
+	 * Make sure the block can store at least one chunk (plus 1 byte as the
+	 * smallest possible bitmap)
+	 */
+	if (blockSize - sizeof(SlabBlockData) < fullChunkSize + 1)
+		elog(ERROR, "block size %ld for slab is too small for chunks %ld",
+			 blockSize, chunkSize);
+
+	/*
+	 * Compute number of chunks per block, including header and bitmap. The
+	 * block also includes block header and bitmap, so we need this inequality
+	 * to hold:
+	 *
+	 * blockSize >= sizeof(SlabBlockData) + chunksPerBlock * fullChunkSize +
+	 * (chunksPerBlock + 7) / 8
+	 *
+	 * By solving for chunksPerBlock, we get the following formula.
+	 */
+	chunksPerBlock
+		= (8 * (blockSize - sizeof(SlabBlockData)) - 7) / (8 * fullChunkSize + 1);
+
+	/* if we can't fit at least one chunk into the block, we're hosed */
+	Assert(chunksPerBlock > 0);
+
+	/* make sure the chunks (and bitmap) actually fit on the block	*/
+	Assert(fullChunkSize * chunksPerBlock + ((chunksPerBlock + 7) / 8) + sizeof(SlabBlockData) <= blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (Slab) MemoryContextCreate(T_SlabContext,
+									 (offsetof(SlabContext, freelist) + sizeof(dlist_head) * (chunksPerBlock + 1)),
+									 &SlabMethods,
+									 parent,
+									 name);
+
+	set->blockSize = blockSize;
+	set->chunkSize = chunkSize;
+	set->fullChunkSize = fullChunkSize;
+	set->chunksPerBlock = chunksPerBlock;
+	set->nblocks = 0;
+	set->minFreeChunks = 0;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * SlabInit
+ *		Context-type-specific initialization routine.
+ */
+static void
+SlabInit(MemoryContext context)
+{
+	int			i;
+	Slab		set = (Slab) context;
+
+	/* initialize the freelist slots */
+	for (i = 0; i < (set->chunksPerBlock + 1); i++)
+		dlist_init(&set->freelist[i]);
+}
+
+/*
+ * SlabReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+SlabReset(MemoryContext context)
+{
+	int			i;
+	Slab		set = (Slab) context;
+
+	AssertArg(SlabIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	SlabCheck(context);
+#endif
+
+	/* walk over freelists and free the blocks */
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		dlist_mutable_iter miter;
+
+		dlist_foreach_modify(miter, &set->freelist[i])
+		{
+			SlabBlock	block = dlist_container(SlabBlockData, node, miter.cur);
+
+			dlist_delete(miter.cur);
+
+			/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+			wipe_mem(block, set->blockSize);
+#endif
+			free(block);
+			set->nblocks--;
+		}
+	}
+
+	set->minFreeChunks = 0;
+
+	Assert(set->nblocks == 0);
+}
+
+/*
+ * SlabDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call SlabReset().
+ */
+static void
+SlabDelete(MemoryContext context)
+{
+	/* just reset the context */
+	SlabReset(context);
+}
+
+/*
+ * SlabAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ */
+static void *
+SlabAlloc(MemoryContext context, Size size)
+{
+	Slab		set = (Slab) context;
+	SlabBlock	block;
+	SlabChunk	chunk;
+	int			idx;
+
+	AssertArg(SlabIsValid(set));
+
+	Assert((set->minFreeChunks >= 0) && (set->minFreeChunks < set->chunksPerBlock));
+
+	/* make sure we only allow correct request size */
+	if (size != set->chunkSize)
+		elog(ERROR, "unexpected alloc chunk size %ld (expected %ld)",
+			 size, set->chunkSize);
+
+	/*
+	 * If there are no free chunks in any existing block, create a new block
+	 * and put it to the last freelist bucket.
+	 *
+	 * (set->minFreeChunks == 0) means there are no blocks with free chunks,
+	 * thanks to how minFreeChunks is updated at the end of SlabAlloc().
+	 */
+	if (set->minFreeChunks == 0)
+	{
+		block = (SlabBlock) malloc(set->blockSize);
+
+		if (block == NULL)
+			return NULL;
+
+		block->nfree = set->chunksPerBlock;
+		block->firstFreeChunk = 0;
+
+		/* the free bitmap is placed at the end */
+		block->bitmapptr
+			= ((char *) block) + set->blockSize - ((set->chunksPerBlock + 7) / 8);
+
+		/* we need to reset the free bitmap */
+		memset(block->bitmapptr, 0, ((set->chunksPerBlock + 7) / 8));
+
+		/*
+		 * And add it to the last freelist with all chunks empty.
+		 *
+		 * XXX We know there are no blocks in the freelist, otherwise we
+		 * wouldn't need a new block.
+		 */
+		Assert(dlist_is_empty(&set->freelist[set->chunksPerBlock]));
+
+		dlist_push_head(&set->freelist[set->chunksPerBlock], &block->node);
+
+		set->minFreeChunks = set->chunksPerBlock;
+		set->nblocks += 1;
+	}
+
+	/* grab the block from the freelist (even the new block is there) */
+	block = dlist_head_element(SlabBlockData, node,
+							   &set->freelist[set->minFreeChunks]);
+
+	/* make sure we actually got a valid block, with matching nfree */
+	Assert(block != NULL);
+	Assert(set->minFreeChunks == block->nfree);
+	Assert(block->nfree > 0);
+
+	Assert((char *) block < block->bitmapptr);
+	Assert((char *) block + set->blockSize > block->bitmapptr);
+
+	/* we know index of the first free chunk in the block */
+	idx = block->firstFreeChunk;
+
+	/* make sure the chunk index is valid, and that it's marked as empty */
+	Assert((idx >= 0) && (idx < set->chunksPerBlock));
+	Assert(!((block->bitmapptr[idx / 8] & (0x01 << (idx % 8)))));
+
+	/* mark the chunk as used (set 1 to the bit) */
+	block->bitmapptr[idx / 8] |= (0x01 << (idx % 8));
+
+	/* compute the chunk location block start (after the block header) */
+	chunk = (SlabChunk) ((char *) block + sizeof(SlabBlockData)
+						 + (idx * set->fullChunkSize));
+
+	/*
+	 * Update the block nfree count, and also the minFreeChunks as we've
+	 * decreased nfree for a block with the minimum number of free chunks
+	 * (because that's how we chose the block).
+	 */
+	block->nfree--;
+	set->minFreeChunks = block->nfree;
+
+	/*
+	 * We need to update index of the next free chunk on the block. If we used
+	 * the last free chunk on this block, set it to chunksPerBlock (which is
+	 * not a valid chunk index). Otherwise look for the next chunk - we know
+	 * that it has to be above the current firstFreeChunk value, thanks to how
+	 * we maintain firstFreeChunk here and in SlabFree().
+	 */
+	if (block->nfree == 0)
+		block->firstFreeChunk = set->chunksPerBlock;
+	else
+	{
+		/* look for the next free chunk in the block, after the first one */
+		while ((++block->firstFreeChunk) < set->chunksPerBlock)
+		{
+			int			byte = block->firstFreeChunk / 8;
+			int			bit = block->firstFreeChunk % 8;
+
+			/* stop when you find 0 (unused chunk) */
+			if (!(block->bitmapptr[byte] & (0x01 << bit)))
+				break;
+		}
+
+		/* must have found the free chunk */
+		Assert(block->firstFreeChunk != set->chunksPerBlock);
+	}
+
+	/* move the whole block to the right place in the freelist */
+	dlist_delete(&block->node);
+	dlist_push_head(&set->freelist[block->nfree], &block->node);
+
+	/*
+	 * And finally update minFreeChunks, i.e. the index to the block with the
+	 * lowest number of free chunks. We only need to do that when the block
+	 * got full (otherwise we know the current block is the right one). We'll
+	 * simply walk the freelist until we find a non-empty entry.
+	 */
+	if (set->minFreeChunks == 0)
+	{
+		for (idx = 1; idx <= set->chunksPerBlock; idx++)
+		{
+			if (dlist_is_empty(&set->freelist[idx]))
+				continue;
+
+			/* found a non-empty freelist */
+			set->minFreeChunks = idx;
+			break;
+		}
+	}
+
+	if (set->minFreeChunks == set->chunksPerBlock)
+		set->minFreeChunks = 0;
+
+	/* Prepare to initialize the chunk header. */
+	VALGRIND_MAKE_MEM_UNDEFINED(chunk, SLAB_CHUNK_USED);
+
+	chunk->block = (void *) block;
+
+	chunk->header.context = (MemoryContext) set;
+	chunk->header.size = MAXALIGN(size);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	chunk->header.requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+							   sizeof(chunk->header.requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->header.size)
+		set_sentinel(SlabChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) SlabChunkGetPointer(chunk), size);
+#endif
+
+	SlabAllocInfo(set, chunk);
+	return SlabChunkGetPointer(chunk);
+}
+
+/*
+ * SlabFree
+ *		Frees allocated memory; memory is removed from the set.
+ */
+static void
+SlabFree(MemoryContext context, void *pointer)
+{
+	int			idx;
+	Slab		set = (Slab) context;
+	SlabChunk	chunk = SlabPointerGetChunk(pointer);
+	SlabBlock	block = chunk->block;
+
+	SlabFreeInfo(set, chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < chunk->header.size)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+	/* compute index of the chunk with respect to block start */
+	idx = ((char *) chunk - ((char *) block + sizeof(SlabBlockData))) / set->fullChunkSize;
+
+	Assert((block->bitmapptr[idx / 8] & (0x01 << (idx % 8))));
+
+	/* mark the chunk as unused (zero the bit), and update block nfree count */
+	block->bitmapptr[idx / 8] ^= (0x01 << (idx % 8));
+	block->nfree++;
+	block->firstFreeChunk = Min(block->firstFreeChunk, idx);
+
+	Assert(block->nfree > 0);
+	Assert(block->nfree <= set->chunksPerBlock);
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->header.size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->header.requested_size = 0;
+#endif
+
+	/* remove the block from a freelist */
+	dlist_delete(&block->node);
+
+	/*
+	 * See if we need to update the minFreeChunks field for the set - we only
+	 * need to do that if there the block had that number of free chunks
+	 * before we freed one. In that case, we check if there still are blocks
+	 * in the original freelist and we either keep the current value (if there
+	 * still are blocks) or increment it by one (the new block is still the
+	 * one with minimum free chunks).
+	 *
+	 * The one exception is when the block will get completely free - in that
+	 * case we will free it, se we can't use it for minFreeChunks. It however
+	 * means there are no more blocks with free chunks.
+	 */
+	if (set->minFreeChunks == (block->nfree - 1))
+	{
+		/* Have we removed the last chunk from the freelist? */
+		if (dlist_is_empty(&set->freelist[set->minFreeChunks]))
+		{
+			/* but if we made the block entirely free, we'll free it */
+			if (block->nfree == set->chunksPerBlock)
+				set->minFreeChunks = 0;
+			else
+				set->minFreeChunks++;
+		}
+	}
+
+	/* If the block is now completely empty, free it. */
+	if (block->nfree == set->chunksPerBlock)
+	{
+		free(block);
+		set->nblocks--;
+	}
+	else
+		dlist_push_head(&set->freelist[block->nfree], &block->node);
+
+	Assert(set->nblocks >= 0);
+}
+
+/*
+ * SlabRealloc
+ *		As Slab is designed for allocating equally-sized chunks of memory, it
+ *		can't really do an actual realloc.
+ *
+ * We try to be gentle and allow calls with exactly the same size as in that
+ * case we can simply return the same chunk. When the size differs, we fail
+ * with assert failure or return NULL.
+ *
+ * We might be even support cases with (size < chunkSize). That however seems
+ * rather pointless - Slab is meant for chunks of constant size, and moreover
+ * realloc is usually used to enlarge the chunk.
+ *
+ * XXX Perhaps we should not be gentle at all and simply fails in all cases,
+ * to eliminate the (mostly pointless) uncertainty.
+ */
+static void *
+SlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Slab		set = (Slab) context;
+
+	/* can't do actual realloc with slab, but let's try to be gentle */
+	if (size == set->chunkSize)
+		return pointer;
+
+	elog(ERROR, "slab allocator does not support realloc()");
+}
+
+/*
+ * SlabGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+SlabGetChunkSpace(MemoryContext context, void *pointer)
+{
+	SlabChunk	chunk = SlabPointerGetChunk(pointer);
+
+	return chunk->header.size + SLAB_CHUNKHDRSZ;
+}
+
+/*
+ * SlabIsEmpty
+ *		Is an Slab empty of any allocated space?
+ */
+static bool
+SlabIsEmpty(MemoryContext context)
+{
+	Slab		set = (Slab) context;
+
+	return (set->nblocks == 0);
+}
+
+/*
+ * SlabStats
+ *		Compute stats about memory consumption of an Slab.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Slab into *totals.
+ */
+static void
+SlabStats(MemoryContext context, int level, bool print,
+		  MemoryContextCounters *totals)
+{
+	Slab		set = (Slab) context;
+	Size		nblocks = 0;
+	Size		freechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+	int			i;
+
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		dlist_iter	iter;
+
+		dlist_foreach(iter, &set->freelist[i])
+		{
+			SlabBlock	block = dlist_container(SlabBlockData, node, iter.cur);
+
+			nblocks++;
+			totalspace += set->blockSize;
+			freespace += set->fullChunkSize * block->nfree;
+			freechunks += block->nfree;
+		}
+	}
+
+	if (print)
+	{
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"Slab: %s: %zu total in %zd blocks; %zu free (%zd chunks); %zu used\n",
+				set->header.name, totalspace, nblocks, freespace, freechunks,
+				totalspace - freespace);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += freechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * SlabCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+SlabCheck(MemoryContext context)
+{
+	int			i;
+	Slab		slab = (Slab) context;
+	char	   *name = slab->header.name;
+
+	/* walk all the freelists */
+	for (i = 0; i <= slab->chunksPerBlock; i++)
+	{
+		int			j,
+					nfree;
+		dlist_iter	iter;
+
+		/* walk all blocks on this freelist */
+		dlist_foreach(iter, &slab->freelist[i])
+		{
+			SlabBlock	block = dlist_container(SlabBlockData, node, iter.cur);
+
+			/*
+			 * Make sure the number of free chunks (in the block header) matches
+			 * position in the freelist.
+			 */
+			if (block->nfree != i)
+				elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match freelist %d",
+					 name, block->nfree, block, i);
+
+			/*
+			 * Now walk through the chunks, count the free ones and also perform
+			 * some additional checks for the used ones.
+			 */
+
+			nfree = 0;
+			for (j = 0; j < slab->chunksPerBlock; j++)
+			{
+				/* non-zero bit in the bitmap means chunk the chunk is used */
+				if ((block->bitmapptr[j / 8] & (0x01 << (j % 8))) != 0)
+				{
+					SlabChunk	chunk = (SlabChunk) ((char *) block + sizeof(SlabBlockData)
+													 + (j * slab->fullChunkSize));
+
+					VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+										   sizeof(chunk->header.requested_size));
+
+					/* we're in a no-freelist branch */
+					VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+										   sizeof(chunk->header.requested_size));
+
+					/* chunks have both block and slab pointers, so check both */
+					if (chunk->block != block)
+						elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p",
+							 name, block, chunk);
+
+					if (chunk->header.context != (MemoryContext) slab)
+						elog(WARNING, "problem in slab %s: bogus slab link in block %p, chunk %p",
+							 name, block, chunk);
+
+					/* now make sure the chunk size is correct */
+					if (chunk->header.size != MAXALIGN(slab->chunkSize))
+						elog(WARNING, "problem in slab %s: bogus chunk size in block %p, chunk %p",
+							 name, block, chunk);
+
+					/* now make sure the chunk size is correct */
+					if (chunk->header.requested_size != slab->chunkSize)
+						elog(WARNING, "problem in slab %s: bogus chunk requested size in block %p, chunk %p",
+							 name, block, chunk);
+
+					/* there might be sentinel (thanks to alignment) */
+					if (chunk->header.requested_size < chunk->header.size &&
+						!sentinel_ok(chunk, SLAB_CHUNKHDRSZ + chunk->header.requested_size))
+						elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
+							 name, block, chunk);
+				}
+				else
+					/* free chunk */
+					nfree += 1;
+			}
+
+			/*
+			 * Make sure we got the expected number of free chunks (as tracked in
+			 * the block header).
+			 */
+			if (nfree != block->nfree)
+				elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match bitmap %d",
+					 name, block->nfree, block, nfree);
+		}
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index ba069cc..92a7478 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,6 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext)))
+	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index c514d3f..d910236 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -275,6 +275,7 @@ typedef enum NodeTag
 	 */
 	T_MemoryContext = 600,
 	T_AllocSetContext,
+	T_SlabContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 9e209ae..b8f2f0e 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -331,6 +331,13 @@ struct ReorderBuffer
 	MemoryContext context;
 
 	/*
+	 * Memory contexts for each type of object (TXNs, changes and tuple buffers)
+	 */
+	MemoryContext change_context;
+	MemoryContext txn_context;
+	MemoryContext tup_context;
+
+	/*
 	 * Data structure slab cache.
 	 *
 	 * We allocate/deallocate some structures very frequently, to avoid bigger
@@ -340,14 +347,6 @@ struct ReorderBuffer
 	 * on top of reorderbuffer.c
 	 */
 
-	/* cached ReorderBufferTXNs */
-	dlist_head	cached_transactions;
-	Size		nr_cached_transactions;
-
-	/* cached ReorderBufferChanges */
-	dlist_head	cached_changes;
-	Size		nr_cached_changes;
-
 	/* cached ReorderBufferTupleBufs */
 	slist_head	cached_tuplebufs;
 	Size		nr_cached_tuplebufs;
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index e6334a2..7308845 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -135,6 +135,12 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
 					  Size initBlockSize,
 					  Size maxBlockSize);
 
+/* slab.c */
+extern MemoryContext SlabContextCreate(MemoryContext parent,
+				  const char *name,
+				  Size blockSize,
+				  Size chunkSize);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
@@ -171,4 +177,7 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
  */
 #define ALLOCSET_SEPARATE_THRESHOLD  8192
 
+#define SLAB_DEFAULT_BLOCK_SIZE		(8 * 1024)
+#define SLAB_LARGE_BLOCK_SIZE		(8 * 1024 * 1024)
+
 #endif   /* MEMUTILS_H */
-- 
2.5.5

#41Petr Jelinek
petr.jelinek@2ndquadrant.com
In reply to: Tomas Vondra (#40)
Re: PATCH: two slab-like memory allocators

On 13/12/16 01:45, Tomas Vondra wrote:

On 12/12/2016 11:39 PM, Tomas Vondra wrote:

On 12/12/2016 05:05 AM, Petr Jelinek wrote:

I'd be happy with this patch now (as in committer ready) except that it
does have some merge conflicts after the recent commits, so rebase is
needed.

Attached is a rebased version of the patch, resolving the Makefile merge
conflicts.

Meh, managed to rebase a wrong branch, missing fix to the off-by-one
error (fixed v6). Attached is v8, hopefully the correct one.

Okay, this version looks good to me, marked as RfC.

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#42Michael Paquier
michael.paquier@gmail.com
In reply to: Petr Jelinek (#41)
Re: PATCH: two slab-like memory allocators

On Tue, Dec 13, 2016 at 10:32 AM, Petr Jelinek
<petr.jelinek@2ndquadrant.com> wrote:

Okay, this version looks good to me, marked as RfC.

The patches still apply, moved to CF 2017-03 with same status: RfC.
--
Michael

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#43Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#40)
Re: PATCH: two slab-like memory allocators

Hi,

On 2016-12-13 01:45:13 +0100, Tomas Vondra wrote:

src/backend/utils/mmgr/Makefile | 2 +-
src/backend/utils/mmgr/aset.c | 115 +--------------------------------
src/backend/utils/mmgr/memdebug.c | 131 ++++++++++++++++++++++++++++++++++++++
src/include/utils/memdebug.h | 22 +++++++
4 files changed, 156 insertions(+), 114 deletions(-)
create mode 100644 src/backend/utils/mmgr/memdebug.c

I'm a bit loathe to move these to a .c file - won't this likely make
these debugging tools even slower? Seems better to put some of them
them in a header as static inlines (not randomize, but the rest).

From 43aaabf70b979b172fd659ef4d0ef129fd78d72d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Wed, 30 Nov 2016 15:36:23 +0100
Subject: [PATCH 2/3] slab allocator

---
src/backend/replication/logical/reorderbuffer.c | 82 +--
src/backend/utils/mmgr/Makefile | 2 +-
src/backend/utils/mmgr/slab.c | 803 ++++++++++++++++++++++++
src/include/nodes/memnodes.h | 2 +-
src/include/nodes/nodes.h | 1 +
src/include/replication/reorderbuffer.h | 15 +-
src/include/utils/memutils.h | 9 +

I'd like to see the reorderbuffer changes split into a separate commit
from the slab allocator introduction.

+/*-------------------------------------------------------------------------
+ *
+ * slab.c
+ *	  SLAB allocator definitions.
+ *
+ * SLAB is a custom MemoryContext implementation designed for cases of
+ * equally-sized objects.
+ *
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group

Bump, before a committer forgets it.

+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/slab.c
+ *
+ *
+ *	The constant allocation size allows significant simplification and various
+ *	optimizations. Firstly, we can get rid of the doubling and carve the blocks
+ *	into chunks of exactly the right size (plus alignment), not wasting memory.

Getting rid of it relative to what? I'd try to phrase it so these
comments stand on their own.

+ *	Each block includes a simple bitmap tracking which chunks are used/free.
+ *	This makes it trivial to check if all chunks on the block are free, and
+ *	eventually free the whole block (which is almost impossible with a global
+ *	freelist of chunks, storing chunks from all blocks).

Why is checking a potentially somewhat long-ish bitmap better than a
simple counter, or a "linked list" of "next free chunk-number" or such
(where free chunks simply contain the id of the subsequent chunk)?
Using a list instead of a bitmap would also make it possible to get
'lifo' behaviour, which is good for cache efficiency. A simple
chunk-number based singly linked list would only imply a minimum
allocation size of 4 - that seems perfectly reasonable?

+ *	At the context level, we use 'freelist' to track blocks ordered by number
+ *	of free chunks, starting with blocks having a single allocated chunk, and
+ *	with completely full blocks on the tail.

Why that way round? Filling chunks up as much as possible is good for
cache and TLB efficiency, and allows for earlier de-allocation of
partially used blocks? Oh, I see you do that in the next comment,
but it still leaves me wondering.

Also, is this actually a list? It's more an array of lists, right?
I.e. it should be named freelists?

Thirdly, isn't that approach going to result in a quite long freelists
array, when you have small items and a decent blocksize? That seems like
a fairly reasonable thing to do?

+ *	This also allows various optimizations - for example when searching for
+ *	free chunk, we the allocator reuses space from the most full blocks first,
+ *	in the hope that some of the less full blocks will get completely empty
+ *	(and returned back to the OS).

Might be worth mentioning tlb/cache efficiency too.

+ *	For each block, we maintain pointer to the first free chunk - this is quite
+ *	cheap and allows us to skip all the preceding used chunks, eliminating
+ *	a significant number of lookups in many common usage patters. In the worst
+ *	case this performs as if the pointer was not maintained.

Hm, so that'd be eliminated if we maintained a linked list of chunks (by
chunk number) and a free_chunk_cnt or such.

+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"

Move ilist up, above memdebug, so the list is alphabetically ordered.

+/*
+ * SlabPointer
+ *		Aligned pointer which may be a member of an allocation set.
+ */
+typedef void *SlabPointer;
+typedef SlabContext *Slab;

I personally wont commit this whith pointer hiding typedefs. If
somebody else does, I can live with it, but for me it's bad enough taste
that I wont.

+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+	/* Allocation parameters for this context: */
+	Size		chunkSize;		/* chunk size */
+	Size		fullChunkSize;	/* chunk size including header and alignment */
+	Size		blockSize;		/* block size */
+	int			chunksPerBlock; /* number of chunks per block */
+	int			minFreeChunks;	/* min number of free chunks in any block */
+	int			nblocks;		/* number of blocks allocated */
+	/* blocks with free space, grouped by number of free chunks: */
+	dlist_head	freelist[FLEXIBLE_ARRAY_MEMBER];
+}	SlabContext;
+

Why aren't these ints something unsigned?

+/*
+ * SlabIsValid
+ *		True iff set is valid allocation set.
+ */
+#define SlabIsValid(set) PointerIsValid(set)

It's not your fault, but this "iff" is obviously a lot stronger than the
actual test ;). I seriously doubt this macro is worth anything...

+/*
+ * SlabReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */

Why don't we? Seems quite worthwhile? Thinking about this, won't this
result in a drastic increase of system malloc/mmap/brk traffic when
there's lot of short transactions in reorderbuffer?

+static void
+SlabReset(MemoryContext context)
+{
+	/* walk over freelists and free the blocks */
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		dlist_mutable_iter miter;
+
+		dlist_foreach_modify(miter, &set->freelist[i])
+		{
+			SlabBlock	block = dlist_container(SlabBlockData, node, miter.cur);
+
+			dlist_delete(miter.cur);
+
+			/* Normal case, release the block */

What does "normal case" refer to here? Given that there's no alternative
case...

+	/*
+	 * We need to update index of the next free chunk on the block. If we used
+	 * the last free chunk on this block, set it to chunksPerBlock (which is
+	 * not a valid chunk index). Otherwise look for the next chunk - we know
+	 * that it has to be above the current firstFreeChunk value, thanks to how
+	 * we maintain firstFreeChunk here and in SlabFree().
+	 */
+	if (block->nfree == 0)
+		block->firstFreeChunk = set->chunksPerBlock;
+	else
+	{
+		/* look for the next free chunk in the block, after the first one */
+		while ((++block->firstFreeChunk) < set->chunksPerBlock)
+		{
+			int			byte = block->firstFreeChunk / 8;
+			int			bit = block->firstFreeChunk % 8;
+
+			/* stop when you find 0 (unused chunk) */
+			if (!(block->bitmapptr[byte] & (0x01 << bit)))
+				break;
+		}
+
+		/* must have found the free chunk */
+		Assert(block->firstFreeChunk != set->chunksPerBlock);
+	}

This and previous code just re-affirms my opinion that a bitmap is not
the best structure here.

+	/* move the whole block to the right place in the freelist */
+	dlist_delete(&block->node);
+	dlist_push_head(&set->freelist[block->nfree], &block->node);

Hm. What if we, instead of the array of doubly linked lists, just kept
a single linked list of blocks, and keep that list sorted by number of
free chunks? Given that freeing / allocation never changes the number
of allocated chunks by more than 1, we'll never have to move an entry
far in that list to keep it sorted.

+/*
+ * SlabRealloc
+ *		As Slab is designed for allocating equally-sized chunks of memory, it
+ *		can't really do an actual realloc.
+ *
+ * We try to be gentle and allow calls with exactly the same size as in that
+ * case we can simply return the same chunk. When the size differs, we fail
+ * with assert failure or return NULL.
+ *
+ * We might be even support cases with (size < chunkSize). That however seems
+ * rather pointless - Slab is meant for chunks of constant size, and moreover
+ * realloc is usually used to enlarge the chunk.
+ *
+ * XXX Perhaps we should not be gentle at all and simply fails in all cases,
+ * to eliminate the (mostly pointless) uncertainty.

I think I'm in favor of that. This seems more likely to hide a bug than
actually helpful.

Regards,

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#44Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#43)
Re: PATCH: two slab-like memory allocators

On 02/09/2017 10:37 PM, Andres Freund wrote:

Hi,

On 2016-12-13 01:45:13 +0100, Tomas Vondra wrote:

src/backend/utils/mmgr/Makefile | 2 +-
src/backend/utils/mmgr/aset.c | 115 +--------------------------------
src/backend/utils/mmgr/memdebug.c | 131 ++++++++++++++++++++++++++++++++++++++
src/include/utils/memdebug.h | 22 +++++++
4 files changed, 156 insertions(+), 114 deletions(-)
create mode 100644 src/backend/utils/mmgr/memdebug.c

I'm a bit loathe to move these to a .c file - won't this likely make
these debugging tools even slower? Seems better to put some of them
them in a header as static inlines (not randomize, but the rest).

Do you have any numbers to support that? AFAICS compilers got really
good in inlining stuff on their own.

From 43aaabf70b979b172fd659ef4d0ef129fd78d72d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Wed, 30 Nov 2016 15:36:23 +0100
Subject: [PATCH 2/3] slab allocator

---
src/backend/replication/logical/reorderbuffer.c | 82 +--
src/backend/utils/mmgr/Makefile | 2 +-
src/backend/utils/mmgr/slab.c | 803 ++++++++++++++++++++++++
src/include/nodes/memnodes.h | 2 +-
src/include/nodes/nodes.h | 1 +
src/include/replication/reorderbuffer.h | 15 +-
src/include/utils/memutils.h | 9 +

I'd like to see the reorderbuffer changes split into a separate commit
from the slab allocator introduction.

I rather dislike patches that only add a bunch of code, without actually
using it anywhere. But if needed, this is trivial to do at commit time -
just don't commit the reorderbuffer bits.

+/*-------------------------------------------------------------------------
+ *
+ * slab.c
+ *	  SLAB allocator definitions.
+ *
+ * SLAB is a custom MemoryContext implementation designed for cases of
+ * equally-sized objects.
+ *
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group

Bump, before a committer forgets it.

OK.

+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/slab.c
+ *
+ *
+ *	The constant allocation size allows significant simplification and various
+ *	optimizations. Firstly, we can get rid of the doubling and carve the blocks
+ *	into chunks of exactly the right size (plus alignment), not wasting memory.

Getting rid of it relative to what? I'd try to phrase it so these
comments stand on their own.

OK, rill reword.

+ *	Each block includes a simple bitmap tracking which chunks are used/free.
+ *	This makes it trivial to check if all chunks on the block are free, and
+ *	eventually free the whole block (which is almost impossible with a global
+ *	freelist of chunks, storing chunks from all blocks).

Why is checking a potentially somewhat long-ish bitmap better than a
simple counter, or a "linked list" of "next free chunk-number" or such
(where free chunks simply contain the id of the subsequent chunk)?
Using a list instead of a bitmap would also make it possible to get
'lifo' behaviour, which is good for cache efficiency. A simple
chunk-number based singly linked list would only imply a minimum
allocation size of 4 - that seems perfectly reasonable?

A block-level counter would be enough to decide if all chunks on the
block are free, but it's not sufficient to identify which chunks are
free / available for reuse.

The bitmap only has a single bit per chunk, so I find "potentially
long-ish" is a bit misleading. Any linked list implementation will
require much more per-chunk overhead - as the chunks are fixed-legth,
it's possible to use chunk index (instead of 64-bit pointers), to save
some space. But with large blocks / small chunks that's still at least 2
or 4 bytes per index, and you'll need two (to implement doubly-linked
list, to make add/remove efficient).

For example assume 8kB block and 64B chunks, i.e. 128 chunks. With
bitmap that's 16B to track all free space on the block. Doubly linked
list would require 1B per chunk index, 2 indexes per chunk. That's 128*2
= 256B.

I have a hard time believing this the cache efficiency of linked lists
(which may or may not be real in this case) out-weights this, but if you
want to try, be my guest.

+ *	At the context level, we use 'freelist' to track blocks ordered by number
+ *	of free chunks, starting with blocks having a single allocated chunk, and
+ *	with completely full blocks on the tail.

Why that way round? Filling chunks up as much as possible is good for
cache and TLB efficiency, and allows for earlier de-allocation of
partially used blocks? Oh, I see you do that in the next comment,
but it still leaves me wondering.

Also, is this actually a list? It's more an array of lists, right?
I.e. it should be named freelists?

Possibly. Naming things is hard.

Thirdly, isn't that approach going to result in a quite long freelists
array, when you have small items and a decent blocksize? That seems like
a fairly reasonable thing to do?

I'm confused. Why wouldn't that be reasonable. Or rather, what would be
a more reasonable way?

+ *	This also allows various optimizations - for example when searching for
+ *	free chunk, we the allocator reuses space from the most full blocks first,
+ *	in the hope that some of the less full blocks will get completely empty
+ *	(and returned back to the OS).

Might be worth mentioning tlb/cache efficiency too.

I haven't really considered tlb/cache very much. The main goal of this
design was to free blocks (instead of keeping many partially-used blocks
around). If you have comments on this, feel free to add them.

+ *	For each block, we maintain pointer to the first free chunk - this is quite
+ *	cheap and allows us to skip all the preceding used chunks, eliminating
+ *	a significant number of lookups in many common usage patters. In the worst
+ *	case this performs as if the pointer was not maintained.

Hm, so that'd be eliminated if we maintained a linked list of chunks (by
chunk number) and a free_chunk_cnt or such.

As I explained above, I don't think linked list is a good solution. IIRC
correctly I've initially done that, and ended using the bitmap. If you
have idea how to do that, feel free to implement that and then we can do
some measurements and compare the patches.

+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"

Move ilist up, above memdebug, so the list is alphabetically ordered.

OK

+/*
+ * SlabPointer
+ *		Aligned pointer which may be a member of an allocation set.
+ */
+typedef void *SlabPointer;
+typedef SlabContext *Slab;

I personally wont commit this whith pointer hiding typedefs. If
somebody else does, I can live with it, but for me it's bad enough taste
that I wont.

Meh.

+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+	/* Allocation parameters for this context: */
+	Size		chunkSize;		/* chunk size */
+	Size		fullChunkSize;	/* chunk size including header and alignment */
+	Size		blockSize;		/* block size */
+	int			chunksPerBlock; /* number of chunks per block */
+	int			minFreeChunks;	/* min number of free chunks in any block */
+	int			nblocks;		/* number of blocks allocated */
+	/* blocks with free space, grouped by number of free chunks: */
+	dlist_head	freelist[FLEXIBLE_ARRAY_MEMBER];
+}	SlabContext;
+

Why aren't these ints something unsigned?

Yeah, some of those could be unsigned. Will check.

+/*
+ * SlabIsValid
+ *		True iff set is valid allocation set.
+ */
+#define SlabIsValid(set) PointerIsValid(set)

It's not your fault, but this "iff" is obviously a lot stronger than the
actual test ;). I seriously doubt this macro is worth anything...

Yeah.

+/*
+ * SlabReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */

Why don't we? Seems quite worthwhile? Thinking about this, won't this
result in a drastic increase of system malloc/mmap/brk traffic when
there's lot of short transactions in reorderbuffer?

I haven't seen any significant impact of that during the tests I've done
(even with many tiny transactions), but it adding keeper blocks should
be trivial.

+static void
+SlabReset(MemoryContext context)
+{
+	/* walk over freelists and free the blocks */
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		dlist_mutable_iter miter;
+
+		dlist_foreach_modify(miter, &set->freelist[i])
+		{
+			SlabBlock	block = dlist_container(SlabBlockData, node, miter.cur);
+
+			dlist_delete(miter.cur);
+
+			/* Normal case, release the block */

What does "normal case" refer to here? Given that there's no alternative
case...

Mem, bogus comment.

+	/*
+	 * We need to update index of the next free chunk on the block. If we used
+	 * the last free chunk on this block, set it to chunksPerBlock (which is
+	 * not a valid chunk index). Otherwise look for the next chunk - we know
+	 * that it has to be above the current firstFreeChunk value, thanks to how
+	 * we maintain firstFreeChunk here and in SlabFree().
+	 */
+	if (block->nfree == 0)
+		block->firstFreeChunk = set->chunksPerBlock;
+	else
+	{
+		/* look for the next free chunk in the block, after the first one */
+		while ((++block->firstFreeChunk) < set->chunksPerBlock)
+		{
+			int			byte = block->firstFreeChunk / 8;
+			int			bit = block->firstFreeChunk % 8;
+
+			/* stop when you find 0 (unused chunk) */
+			if (!(block->bitmapptr[byte] & (0x01 << bit)))
+				break;
+		}
+
+		/* must have found the free chunk */
+		Assert(block->firstFreeChunk != set->chunksPerBlock);
+	}

This and previous code just re-affirms my opinion that a bitmap is not
the best structure here.

It'd be great if you could explain why, instead of just making such
claims ...

+	/* move the whole block to the right place in the freelist */
+	dlist_delete(&block->node);
+	dlist_push_head(&set->freelist[block->nfree], &block->node);

Hm. What if we, instead of the array of doubly linked lists, just kept
a single linked list of blocks, and keep that list sorted by number of
free chunks? Given that freeing / allocation never changes the number
of allocated chunks by more than 1, we'll never have to move an entry
far in that list to keep it sorted.

Only assuming that there'll be only few blocks with the same number of
free chunks. If that's not the case, you'll have to walk many blocks to
move the block to the right place in the list. The array of lists
handles such cases way more efficiently, and I think we should keep it.

+/*
+ * SlabRealloc
+ *		As Slab is designed for allocating equally-sized chunks of memory, it
+ *		can't really do an actual realloc.
+ *
+ * We try to be gentle and allow calls with exactly the same size as in that
+ * case we can simply return the same chunk. When the size differs, we fail
+ * with assert failure or return NULL.
+ *
+ * We might be even support cases with (size < chunkSize). That however seems
+ * rather pointless - Slab is meant for chunks of constant size, and moreover
+ * realloc is usually used to enlarge the chunk.
+ *
+ * XXX Perhaps we should not be gentle at all and simply fails in all cases,
+ * to eliminate the (mostly pointless) uncertainty.

I think I'm in favor of that. This seems more likely to hide a bug than
actually helpful.

OK.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#45Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#44)
Re: PATCH: two slab-like memory allocators

On 2017-02-11 02:13:59 +0100, Tomas Vondra wrote:

On 02/09/2017 10:37 PM, Andres Freund wrote:

Hi,

On 2016-12-13 01:45:13 +0100, Tomas Vondra wrote:

src/backend/utils/mmgr/Makefile | 2 +-
src/backend/utils/mmgr/aset.c | 115 +--------------------------------
src/backend/utils/mmgr/memdebug.c | 131 ++++++++++++++++++++++++++++++++++++++
src/include/utils/memdebug.h | 22 +++++++
4 files changed, 156 insertions(+), 114 deletions(-)
create mode 100644 src/backend/utils/mmgr/memdebug.c

I'm a bit loathe to move these to a .c file - won't this likely make
these debugging tools even slower? Seems better to put some of them
them in a header as static inlines (not randomize, but the rest).

Do you have any numbers to support that? AFAICS compilers got really good in
inlining stuff on their own.

Unless you use LTO, they can't inline across translation units. And
using LTO is slow enough for linking that it's not that much fun to use,
as it makes compile-edit-compile cycles essentially take as long as a
full rebuild.

From 43aaabf70b979b172fd659ef4d0ef129fd78d72d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Wed, 30 Nov 2016 15:36:23 +0100
Subject: [PATCH 2/3] slab allocator

---
src/backend/replication/logical/reorderbuffer.c | 82 +--
src/backend/utils/mmgr/Makefile | 2 +-
src/backend/utils/mmgr/slab.c | 803 ++++++++++++++++++++++++
src/include/nodes/memnodes.h | 2 +-
src/include/nodes/nodes.h | 1 +
src/include/replication/reorderbuffer.h | 15 +-
src/include/utils/memutils.h | 9 +

I'd like to see the reorderbuffer changes split into a separate commit
from the slab allocator introduction.

I rather dislike patches that only add a bunch of code, without actually
using it anywhere.

But if needed, this is trivial to do at commit time - just don't
commit the reorderbuffer bits.

Meh.

+ *	Each block includes a simple bitmap tracking which chunks are used/free.
+ *	This makes it trivial to check if all chunks on the block are free, and
+ *	eventually free the whole block (which is almost impossible with a global
+ *	freelist of chunks, storing chunks from all blocks).

Why is checking a potentially somewhat long-ish bitmap better than a
simple counter, or a "linked list" of "next free chunk-number" or such
(where free chunks simply contain the id of the subsequent chunk)?
Using a list instead of a bitmap would also make it possible to get
'lifo' behaviour, which is good for cache efficiency. A simple
chunk-number based singly linked list would only imply a minimum
allocation size of 4 - that seems perfectly reasonable?

A block-level counter would be enough to decide if all chunks on the block
are free, but it's not sufficient to identify which chunks are free /
available for reuse.

The bitmap only has a single bit per chunk, so I find "potentially long-ish"
is a bit misleading. Any linked list implementation will require much more
per-chunk overhead - as the chunks are fixed-legth, it's possible to use
chunk index (instead of 64-bit pointers), to save some space. But with large
blocks / small chunks that's still at least 2 or 4 bytes per index, and
you'll need two (to implement doubly-linked list, to make add/remove
efficient).

For example assume 8kB block and 64B chunks, i.e. 128 chunks. With bitmap
that's 16B to track all free space on the block. Doubly linked list would
require 1B per chunk index, 2 indexes per chunk. That's 128*2 = 256B.

I have a hard time believing this the cache efficiency of linked lists
(which may or may not be real in this case) out-weights this, but if you
want to try, be my guest.

I'm not following - why would there be overhead in anything for
allocations bigger than 4 (or maybe 8) bytes? You can store the list
(via chunk ids, not pointers) inside the chunks itself, where data
otherwise would be. And I don't see why you'd need a doubly linked
list, as the only operations that are needed are to push to the front of
the list, and to pop from the front of the list - and both operations
are simple to do with a singly linked list?

Thirdly, isn't that approach going to result in a quite long freelists
array, when you have small items and a decent blocksize? That seems like
a fairly reasonable thing to do?

I'm confused. Why wouldn't that be reasonable. Or rather, what would be a
more reasonable way?

If I understood correctly, you have one an array of doubly linked lists.
A block is stored in the list at the index #block's-free-elements. Is that
right?

If so, if you have e.g. 8 byte allocations and 64kb sized blocks, you
end up with an array of 1024 doubly linked lists, which'll take up 64kb
on its own. And there a certainly scenarios where even bigger block
sizes could make sense. That's both memory overhead, and runtime
overhead, because at reset-time we'll have to check the whole array
(which'll presumably largely be empty lists). Resetting is a pretty
common path...

+	/*
+	 * We need to update index of the next free chunk on the block. If we used
+	 * the last free chunk on this block, set it to chunksPerBlock (which is
+	 * not a valid chunk index). Otherwise look for the next chunk - we know
+	 * that it has to be above the current firstFreeChunk value, thanks to how
+	 * we maintain firstFreeChunk here and in SlabFree().
+	 */
+	if (block->nfree == 0)
+		block->firstFreeChunk = set->chunksPerBlock;
+	else
+	{
+		/* look for the next free chunk in the block, after the first one */
+		while ((++block->firstFreeChunk) < set->chunksPerBlock)
+		{
+			int			byte = block->firstFreeChunk / 8;
+			int			bit = block->firstFreeChunk % 8;
+
+			/* stop when you find 0 (unused chunk) */
+			if (!(block->bitmapptr[byte] & (0x01 << bit)))
+				break;
+		}
+
+		/* must have found the free chunk */
+		Assert(block->firstFreeChunk != set->chunksPerBlock);
+	}

This and previous code just re-affirms my opinion that a bitmap is not
the best structure here.

It'd be great if you could explain why, instead of just making such claims
...

Because it's complicated. This is a fair bit of code and branches to
run in a pretty hot path.

- Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#46Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#44)
Re: PATCH: two slab-like memory allocators

On 2017-02-11 02:13:59 +0100, Tomas Vondra wrote:

+	/* move the whole block to the right place in the freelist */
+	dlist_delete(&block->node);
+	dlist_push_head(&set->freelist[block->nfree], &block->node);

Hm. What if we, instead of the array of doubly linked lists, just kept
a single linked list of blocks, and keep that list sorted by number of
free chunks? Given that freeing / allocation never changes the number
of allocated chunks by more than 1, we'll never have to move an entry
far in that list to keep it sorted.

Only assuming that there'll be only few blocks with the same number of free
chunks. If that's not the case, you'll have to walk many blocks to move the
block to the right place in the list. The array of lists handles such cases
way more efficiently, and I think we should keep it.

The proper datastructure would probably be a heap. Right now
binaryheap.h is fixed-size - probably not too hard to change.

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#47Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#45)
Re: PATCH: two slab-like memory allocators

On 02/11/2017 02:33 AM, Andres Freund wrote:

On 2017-02-11 02:13:59 +0100, Tomas Vondra wrote:

On 02/09/2017 10:37 PM, Andres Freund wrote:

Hi,

On 2016-12-13 01:45:13 +0100, Tomas Vondra wrote:

src/backend/utils/mmgr/Makefile | 2 +-
src/backend/utils/mmgr/aset.c | 115 +--------------------------------
src/backend/utils/mmgr/memdebug.c | 131 ++++++++++++++++++++++++++++++++++++++
src/include/utils/memdebug.h | 22 +++++++
4 files changed, 156 insertions(+), 114 deletions(-)
create mode 100644 src/backend/utils/mmgr/memdebug.c

I'm a bit loathe to move these to a .c file - won't this likely make
these debugging tools even slower? Seems better to put some of them
them in a header as static inlines (not randomize, but the rest).

Do you have any numbers to support that? AFAICS compilers got really good in
inlining stuff on their own.

Unless you use LTO, they can't inline across translation units. And
using LTO is slow enough for linking that it's not that much fun to use,
as it makes compile-edit-compile cycles essentially take as long as a
full rebuild.

From 43aaabf70b979b172fd659ef4d0ef129fd78d72d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Wed, 30 Nov 2016 15:36:23 +0100
Subject: [PATCH 2/3] slab allocator

---
src/backend/replication/logical/reorderbuffer.c | 82 +--
src/backend/utils/mmgr/Makefile | 2 +-
src/backend/utils/mmgr/slab.c | 803 ++++++++++++++++++++++++
src/include/nodes/memnodes.h | 2 +-
src/include/nodes/nodes.h | 1 +
src/include/replication/reorderbuffer.h | 15 +-
src/include/utils/memutils.h | 9 +

I'd like to see the reorderbuffer changes split into a separate commit
from the slab allocator introduction.

I rather dislike patches that only add a bunch of code, without actually
using it anywhere.

But if needed, this is trivial to do at commit time - just don't
commit the reorderbuffer bits.

Meh.

+ *	Each block includes a simple bitmap tracking which chunks are used/free.
+ *	This makes it trivial to check if all chunks on the block are free, and
+ *	eventually free the whole block (which is almost impossible with a global
+ *	freelist of chunks, storing chunks from all blocks).

Why is checking a potentially somewhat long-ish bitmap better than a
simple counter, or a "linked list" of "next free chunk-number" or such
(where free chunks simply contain the id of the subsequent chunk)?
Using a list instead of a bitmap would also make it possible to get
'lifo' behaviour, which is good for cache efficiency. A simple
chunk-number based singly linked list would only imply a minimum
allocation size of 4 - that seems perfectly reasonable?

A block-level counter would be enough to decide if all chunks on the block
are free, but it's not sufficient to identify which chunks are free /
available for reuse.

The bitmap only has a single bit per chunk, so I find "potentially long-ish"
is a bit misleading. Any linked list implementation will require much more
per-chunk overhead - as the chunks are fixed-legth, it's possible to use
chunk index (instead of 64-bit pointers), to save some space. But with large
blocks / small chunks that's still at least 2 or 4 bytes per index, and
you'll need two (to implement doubly-linked list, to make add/remove
efficient).

For example assume 8kB block and 64B chunks, i.e. 128 chunks. With bitmap
that's 16B to track all free space on the block. Doubly linked list would
require 1B per chunk index, 2 indexes per chunk. That's 128*2 = 256B.

I have a hard time believing this the cache efficiency of linked lists
(which may or may not be real in this case) out-weights this, but if you
want to try, be my guest.

I'm not following - why would there be overhead in anything for
allocations bigger than 4 (or maybe 8) bytes? You can store the list
(via chunk ids, not pointers) inside the chunks itself, where data
otherwise would be. And I don't see why you'd need a doubly linked
list, as the only operations that are needed are to push to the front of
the list, and to pop from the front of the list - and both operations
are simple to do with a singly linked list?

Oh! I have not considered storing the chunk indexes (for linked lists)
in the chunk itself, which obviously eliminates the overhead concerns,
and you're right a singly-linked list should be enough.

There will be some minimum-chunk-size requirement, depending on the
block size/chunk size. I wonder whether it makes sense to try to be
smart and make it dynamic, so that we only require 1B or 2B for cases
when only that many chunks fit into a block, or just say that it's 4B
and be done with it.

I mean 2^32 chunks ought to be enough for anyone, right?

I'm still not buying the cache efficiency argument, though. One of the
reasons is that the implementation prefers blocks with fewer free chunks
when handling palloc(), so pfree() is making the block less likely to be
chosen by the next palloc().

Thirdly, isn't that approach going to result in a quite long freelists
array, when you have small items and a decent blocksize? That seems like
a fairly reasonable thing to do?

I'm confused. Why wouldn't that be reasonable. Or rather, what would be a
more reasonable way?

If I understood correctly, you have one an array of doubly linked lists.
A block is stored in the list at the index #block's-free-elements. Is that
right?

If so, if you have e.g. 8 byte allocations and 64kb sized blocks, you
end up with an array of 1024 doubly linked lists, which'll take up 64kb
on its own. And there a certainly scenarios where even bigger block
sizes could make sense. That's both memory overhead, and runtime
overhead, because at reset-time we'll have to check the whole array
(which'll presumably largely be empty lists). Resetting is a pretty
common path...

True, but it's not entirely clear if resetting is common for the paths
where we use those new allocators.

Also, if we accept that it might be a problem, what other solution you
propose? I don't think just merging everything into a single list is a
good idea, for the reasons I explained before (it might make the resets
somewhat less expensive, but it'll make pfree() more expensive).

What might work is replacing the array with a list, though. So we'd have
a list of lists, which would eliminate the array overhead.

+	/*
+	 * We need to update index of the next free chunk on the block. If we used
+	 * the last free chunk on this block, set it to chunksPerBlock (which is
+	 * not a valid chunk index). Otherwise look for the next chunk - we know
+	 * that it has to be above the current firstFreeChunk value, thanks to how
+	 * we maintain firstFreeChunk here and in SlabFree().
+	 */
+	if (block->nfree == 0)
+		block->firstFreeChunk = set->chunksPerBlock;
+	else
+	{
+		/* look for the next free chunk in the block, after the first one */
+		while ((++block->firstFreeChunk) < set->chunksPerBlock)
+		{
+			int			byte = block->firstFreeChunk / 8;
+			int			bit = block->firstFreeChunk % 8;
+
+			/* stop when you find 0 (unused chunk) */
+			if (!(block->bitmapptr[byte] & (0x01 << bit)))
+				break;
+		}
+
+		/* must have found the free chunk */
+		Assert(block->firstFreeChunk != set->chunksPerBlock);
+	}

This and previous code just re-affirms my opinion that a bitmap is not
the best structure here.

It'd be great if you could explain why, instead of just making such claims
...

Because it's complicated. This is a fair bit of code and branches to
run in a pretty hot path.

Hmm. I admit updating the index of the first free chunk is a bit
cumbersome, and the linked list would make it unnecessary.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#48Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#47)
Re: PATCH: two slab-like memory allocators

Hi,

On 2017-02-11 14:40:18 +0100, Tomas Vondra wrote:

On 02/11/2017 02:33 AM, Andres Freund wrote:

I have a hard time believing this the cache efficiency of linked lists
(which may or may not be real in this case) out-weights this, but if you
want to try, be my guest.

I'm not following - why would there be overhead in anything for
allocations bigger than 4 (or maybe 8) bytes? You can store the list
(via chunk ids, not pointers) inside the chunks itself, where data
otherwise would be. And I don't see why you'd need a doubly linked
list, as the only operations that are needed are to push to the front of
the list, and to pop from the front of the list - and both operations
are simple to do with a singly linked list?

Oh! I have not considered storing the chunk indexes (for linked lists) in
the chunk itself, which obviously eliminates the overhead concerns, and
you're right a singly-linked list should be enough.

There will be some minimum-chunk-size requirement, depending on the block
size/chunk size. I wonder whether it makes sense to try to be smart and make
it dynamic, so that we only require 1B or 2B for cases when only that many
chunks fit into a block, or just say that it's 4B and be done with it.

I doubt it's worth it - it seems likely that the added branch is more
noticeable overall than the possible savings of 3 bytes. Also, won't the
space be lost due to alignment *anyway*?
+	/* chunk, including SLAB header (both addresses nicely aligned) */
+	fullChunkSize = MAXALIGN(sizeof(SlabChunkData) + MAXALIGN(chunkSize));

In that case I'd just Assert(MAXIMUM_ALIGNOF >= sizeof(slist_head)) and
use a plain slist - no point in being more careful than that.

I mean 2^32 chunks ought to be enough for anyone, right?

Yea, that seems enough; but given the alignment thing pointed out above,
I think we can just use plain pointers - and that definitely should be
enough :P

I'm still not buying the cache efficiency argument, though. One of the
reasons is that the implementation prefers blocks with fewer free chunks
when handling palloc(), so pfree() is making the block less likely to be
chosen by the next palloc().

That'll possibly de-optimize L1, but for L2 usage the higher density
seems like it'll be a win. All this memory is only accessed by a single
backend, so packing as densely as possible is good.

If so, if you have e.g. 8 byte allocations and 64kb sized blocks, you
end up with an array of 1024 doubly linked lists, which'll take up 64kb
on its own. And there a certainly scenarios where even bigger block
sizes could make sense. That's both memory overhead, and runtime
overhead, because at reset-time we'll have to check the whole array
(which'll presumably largely be empty lists). Resetting is a pretty
common path...

True, but it's not entirely clear if resetting is common for the paths where
we use those new allocators.

That's fair enough. There's still the memory overhead, but I guess we
can also live with that.

Also, if we accept that it might be a problem, what other solution you
propose? I don't think just merging everything into a single list is a good
idea, for the reasons I explained before (it might make the resets somewhat
less expensive, but it'll make pfree() more expensive).

Now that I think about it, a binary heap, as suggested elsewhere, isn't
entirely trivial to use for this - it's more or less trivial to "fix"
the heap after changing an element's value, but it's harder to find that
element first.

But a two-level list approach seems like it could work quite well -
basically a simplified skip-list. A top-level list contains that has
the property that all the elements have a distinct #free, and then
hanging of those sub-lists for all the other blocks with the same number
of chunks.

I think that'd be a better implementation, but I can understand if you
don't immediately want to go there.

What might work is replacing the array with a list, though. So we'd have a
list of lists, which would eliminate the array overhead.

That seems likely to be significantly worse, because a) iteration is
more expensive b) accessing the relevant list to move between two
different "freecount" lists would be O(n).

- Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#49Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#48)
Re: PATCH: two slab-like memory allocators

On 02/14/2017 03:22 AM, Andres Freund wrote:

Hi,

On 2017-02-11 14:40:18 +0100, Tomas Vondra wrote:

On 02/11/2017 02:33 AM, Andres Freund wrote:

I have a hard time believing this the cache efficiency of linked lists
(which may or may not be real in this case) out-weights this, but if you
want to try, be my guest.

I'm not following - why would there be overhead in anything for
allocations bigger than 4 (or maybe 8) bytes? You can store the list
(via chunk ids, not pointers) inside the chunks itself, where data
otherwise would be. And I don't see why you'd need a doubly linked
list, as the only operations that are needed are to push to the front of
the list, and to pop from the front of the list - and both operations
are simple to do with a singly linked list?

Oh! I have not considered storing the chunk indexes (for linked lists) in
the chunk itself, which obviously eliminates the overhead concerns, and
you're right a singly-linked list should be enough.

There will be some minimum-chunk-size requirement, depending on the block
size/chunk size. I wonder whether it makes sense to try to be smart and make
it dynamic, so that we only require 1B or 2B for cases when only that many
chunks fit into a block, or just say that it's 4B and be done with it.

I doubt it's worth it - it seems likely that the added branch is more
noticeable overall than the possible savings of 3 bytes. Also, won't the
space be lost due to alignment *anyway*?
+	/* chunk, including SLAB header (both addresses nicely aligned) */
+	fullChunkSize = MAXALIGN(sizeof(SlabChunkData) + MAXALIGN(chunkSize));

In that case I'd just Assert(MAXIMUM_ALIGNOF >= sizeof(slist_head)) and
use a plain slist - no point in being more careful than that.

Hmm, I think you're right.

I mean 2^32 chunks ought to be enough for anyone, right?

Yea, that seems enough; but given the alignment thing pointed out above,
I think we can just use plain pointers - and that definitely should be
enough :P

People in year 2078: Why the hell did they only use 32 bits? Wasn't it
obvious we'll have tiny computers with 32EB of RAM? ;-)

I'm still not buying the cache efficiency argument, though. One of
the reasons is that the implementation prefers blocks with fewer
free chunks when handling palloc(), so pfree() is making the block
less likely to be chosen by the next palloc().

That'll possibly de-optimize L1, but for L2 usage the higher density
seems like it'll be a win. All this memory is only accessed by a
single backend, so packing as densely as possible is good.

If so, if you have e.g. 8 byte allocations and 64kb sized blocks,
you end up with an array of 1024 doubly linked lists, which'll
take up 64kb on its own. And there a certainly scenarios where
even bigger block sizes could make sense. That's both memory
overhead, and runtime overhead, because at reset-time we'll have
to check the whole array (which'll presumably largely be empty
lists). Resetting is a pretty common path...

True, but it's not entirely clear if resetting is common for the

paths where we use those new allocators.

That's fair enough. There's still the memory overhead, but I guess
we can also live with that.

Right. My ambition was not to develop another general-purpose memory
context that would work perfectly for everything, but something that
works (better than the current code) for places like reorderbuffer.

Also, if we accept that it might be a problem, what other solution you
propose? I don't think just merging everything into a single list is a good
idea, for the reasons I explained before (it might make the resets somewhat
less expensive, but it'll make pfree() more expensive).

Now that I think about it, a binary heap, as suggested elsewhere, isn't
entirely trivial to use for this - it's more or less trivial to "fix"
the heap after changing an element's value, but it's harder to find that
element first.

But a two-level list approach seems like it could work quite well -
basically a simplified skip-list. A top-level list contains that has
the property that all the elements have a distinct #free, and then
hanging of those sub-lists for all the other blocks with the same number
of chunks.

I think that'd be a better implementation, but I can understand if you
don't immediately want to go there.

I don't want to go there. I'm not all that interested in reorderbuffer,
to be honest, and this started more as "Hold my beer!" hack, after a
midnight discussion with Petr, than a seriously meant patch. I've
already spent like 100x time on it than I expected.

What might work is replacing the array with a list, though. So we'd have a
list of lists, which would eliminate the array overhead.

That seems likely to be significantly worse, because a) iteration is
more expensive b) accessing the relevant list to move between two
different "freecount" lists would be O(n).

Oh, right, I haven't realized we won't know the current head of the
list, so we'd have to search for it. OTOH, we could replace it with a
small hash table, which would reduce the lookup time because we'd have
to search only in a single bin.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#50Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Tomas Vondra (#49)
3 attachment(s)
Re: PATCH: two slab-like memory allocators

Hi,

Attached is v9 of this patch series. This addresses most of the points
raised in the review, namely:

1) change most 'debug' stuff to 'static inline' in memdebug.h

2) fixed and reworded a bunch of comments

3) get rid of the block-level bitmap tracking free chunks

Instead of the bitmap, I've used a simple singly-linked list, using
int32 chunk indexes. Perhaps it could use the slist instead, but I'm not
quite sure MAXALIGN is guaranteed to be greater than pointer.

In any case, this seems to be working reasonably well - it saves a bit
of code (but also made some code slightly more complex). Also, it seems
to be a tad faster than v8 - after repeating the same benchmark as
before, I get these numbers:

master slab-v8 slab-v9
-----------------------------------------
10000 50 28 25
50000 17500 180 160
100000 150000 380 330
200000 ? 750 670

Although the results are quite noisy.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

0001-move-common-bits-to-memdebug-v9.patchtext/x-diff; name=0001-move-common-bits-to-memdebug-v9.patchDownload
From 7c70a7bef4029dd7f10c7dc9ff0dd92a7bd2f966 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Mon, 20 Feb 2017 20:16:16 +0100
Subject: [PATCH 1/3] move common bits to memdebug

---
 src/backend/utils/mmgr/Makefile   |   2 +-
 src/backend/utils/mmgr/aset.c     | 115 +-------------------------------------
 src/backend/utils/mmgr/memdebug.c |  93 ++++++++++++++++++++++++++++++
 src/include/utils/memdebug.h      |  48 ++++++++++++++++
 4 files changed, 144 insertions(+), 114 deletions(-)
 create mode 100644 src/backend/utils/mmgr/memdebug.c

diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index 1842bae..fc5f793 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o portalmem.o
+OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index 4dfc3ec..33b4d01 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -41,46 +41,6 @@
  *	chunks as chunks.  Anything "large" is passed off to malloc().  Change
  *	the number of freelists to change the small/large boundary.
  *
- *
- *	About CLOBBER_FREED_MEMORY:
- *
- *	If this symbol is defined, all freed memory is overwritten with 0x7F's.
- *	This is useful for catching places that reference already-freed memory.
- *
- *	About MEMORY_CONTEXT_CHECKING:
- *
- *	Since we usually round request sizes up to the next power of 2, there
- *	is often some unused space immediately after a requested data area.
- *	Thus, if someone makes the common error of writing past what they've
- *	requested, the problem is likely to go unnoticed ... until the day when
- *	there *isn't* any wasted space, perhaps because of different memory
- *	alignment on a new platform, or some other effect.  To catch this sort
- *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
- *	the requested space whenever the request is less than the actual chunk
- *	size, and verifies that the byte is undamaged when the chunk is freed.
- *
- *
- *	About USE_VALGRIND and Valgrind client requests:
- *
- *	Valgrind provides "client request" macros that exchange information with
- *	the host Valgrind (if any).  Under !USE_VALGRIND, memdebug.h stubs out
- *	currently-used macros.
- *
- *	When running under Valgrind, we want a NOACCESS memory region both before
- *	and after the allocation.  The chunk header is tempting as the preceding
- *	region, but mcxt.c expects to able to examine the standard chunk header
- *	fields.  Therefore, we use, when available, the requested_size field and
- *	any subsequent padding.  requested_size is made NOACCESS before returning
- *	a chunk pointer to a caller.  However, to reduce client request traffic,
- *	it is kept DEFINED in chunks on the free list.
- *
- *	The rounded-up capacity of the chunk usually acts as a post-allocation
- *	NOACCESS region.  If the request consumes precisely the entire chunk,
- *	there is no such region; another chunk header may immediately follow.  In
- *	that case, Valgrind will not detect access beyond the end of the chunk.
- *
- *	See also the cooperating Valgrind client requests in mcxt.c.
- *
  *-------------------------------------------------------------------------
  */
 
@@ -296,10 +256,10 @@ static const unsigned char LogTable256[256] =
  */
 #ifdef HAVE_ALLOCINFO
 #define AllocFreeInfo(_cxt, _chunk) \
-			fprintf(stderr, "AllocFree: %s: %p, %d\n", \
+			fprintf(stderr, "AllocFree: %s: %p, %lu\n", \
 				(_cxt)->header.name, (_chunk), (_chunk)->size)
 #define AllocAllocInfo(_cxt, _chunk) \
-			fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \
+			fprintf(stderr, "AllocAlloc: %s: %p, %lu\n", \
 				(_cxt)->header.name, (_chunk), (_chunk)->size)
 #else
 #define AllocFreeInfo(_cxt, _chunk)
@@ -345,77 +305,6 @@ AllocSetFreeIndex(Size size)
 	return idx;
 }
 
-#ifdef CLOBBER_FREED_MEMORY
-
-/* Wipe freed memory for debugging purposes */
-static void
-wipe_mem(void *ptr, size_t size)
-{
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
-	memset(ptr, 0x7F, size);
-	VALGRIND_MAKE_MEM_NOACCESS(ptr, size);
-}
-#endif
-
-#ifdef MEMORY_CONTEXT_CHECKING
-static void
-set_sentinel(void *base, Size offset)
-{
-	char	   *ptr = (char *) base + offset;
-
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr, 1);
-	*ptr = 0x7E;
-	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
-}
-
-static bool
-sentinel_ok(const void *base, Size offset)
-{
-	const char *ptr = (const char *) base + offset;
-	bool		ret;
-
-	VALGRIND_MAKE_MEM_DEFINED(ptr, 1);
-	ret = *ptr == 0x7E;
-	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
-
-	return ret;
-}
-#endif
-
-#ifdef RANDOMIZE_ALLOCATED_MEMORY
-
-/*
- * Fill a just-allocated piece of memory with "random" data.  It's not really
- * very random, just a repeating sequence with a length that's prime.  What
- * we mainly want out of it is to have a good probability that two palloc's
- * of the same number of bytes start out containing different data.
- *
- * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
- * we fill it.  Filling the region makes it DEFINED, so make it UNDEFINED
- * again afterward.  Whether to finally make it UNDEFINED or NOACCESS is
- * fairly arbitrary.  UNDEFINED is more convenient for AllocSetRealloc(), and
- * other callers have no preference.
- */
-static void
-randomize_mem(char *ptr, size_t size)
-{
-	static int	save_ctr = 1;
-	size_t		remaining = size;
-	int			ctr;
-
-	ctr = save_ctr;
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
-	while (remaining-- > 0)
-	{
-		*ptr++ = ctr;
-		if (++ctr > 251)
-			ctr = 1;
-	}
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
-	save_ctr = ctr;
-}
-#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
-
 
 /*
  * Public routines
diff --git a/src/backend/utils/mmgr/memdebug.c b/src/backend/utils/mmgr/memdebug.c
new file mode 100644
index 0000000..5f603d2
--- /dev/null
+++ b/src/backend/utils/mmgr/memdebug.c
@@ -0,0 +1,93 @@
+/*-------------------------------------------------------------------------
+ *
+ * memdebug.c
+ *	  Declarations used in memory context implementations, not part of the
+ *	  public API of the memory management subsystem.
+ *
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/utils/memdebug.c
+ *
+ *
+ *	About CLOBBER_FREED_MEMORY:
+ *
+ *	If this symbol is defined, all freed memory is overwritten with 0x7F's.
+ *	This is useful for catching places that reference already-freed memory.
+ *
+ *	About MEMORY_CONTEXT_CHECKING:
+ *
+ *	Since we usually round request sizes up to the next power of 2, there
+ *	is often some unused space immediately after a requested data area.
+ *	Thus, if someone makes the common error of writing past what they've
+ *	requested, the problem is likely to go unnoticed ... until the day when
+ *	there *isn't* any wasted space, perhaps because of different memory
+ *	alignment on a new platform, or some other effect.  To catch this sort
+ *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ *	the requested space whenever the request is less than the actual chunk
+ *	size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *
+ *	About USE_VALGRIND and Valgrind client requests:
+ *
+ *	Valgrind provides "client request" macros that exchange information with
+ *	the host Valgrind (if any).  Under !USE_VALGRIND, memdebug.h stubs out
+ *	currently-used macros.
+ *
+ *	When running under Valgrind, we want a NOACCESS memory region both before
+ *	and after the allocation.  The chunk header is tempting as the preceding
+ *	region, but mcxt.c expects to able to examine the standard chunk header
+ *	fields.  Therefore, we use, when available, the requested_size field and
+ *	any subsequent padding.  requested_size is made NOACCESS before returning
+ *	a chunk pointer to a caller.  However, to reduce client request traffic,
+ *	it is kept DEFINED in chunks on the free list.
+ *
+ *	The rounded-up capacity of the chunk usually acts as a post-allocation
+ *	NOACCESS region.  If the request consumes precisely the entire chunk,
+ *	there is no such region; another chunk header may immediately follow.  In
+ *	that case, Valgrind will not detect access beyond the end of the chunk.
+ *
+ *	See also the cooperating Valgrind client requests in mcxt.c.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+/*
+ * Fill a just-allocated piece of memory with "random" data.  It's not really
+ * very random, just a repeating sequence with a length that's prime.  What
+ * we mainly want out of it is to have a good probability that two palloc's
+ * of the same number of bytes start out containing different data.
+ *
+ * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
+ * we fill it.  Filling the region makes it DEFINED, so make it UNDEFINED
+ * again afterward.  Whether to finally make it UNDEFINED or NOACCESS is
+ * fairly arbitrary.  UNDEFINED is more convenient for SlabRealloc(), and
+ * other callers have no preference.
+ */
+void
+randomize_mem(char *ptr, size_t size)
+{
+	static int	save_ctr = 1;
+	size_t		remaining = size;
+	int			ctr;
+
+	ctr = save_ctr;
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	while (remaining-- > 0)
+	{
+		*ptr++ = ctr;
+		if (++ctr > 251)
+			ctr = 1;
+	}
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
+	save_ctr = ctr;
+}
+
+#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
diff --git a/src/include/utils/memdebug.h b/src/include/utils/memdebug.h
index 90eb926..fc382c3 100644
--- a/src/include/utils/memdebug.h
+++ b/src/include/utils/memdebug.h
@@ -31,4 +31,52 @@
 #define VALGRIND_MEMPOOL_CHANGE(context, optr, nptr, size)	do {} while (0)
 #endif
 
+
+#ifdef CLOBBER_FREED_MEMORY
+
+/* Wipe freed memory for debugging purposes */
+static inline void
+wipe_mem(void *ptr, size_t size)
+{
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	memset(ptr, 0x7F, size);
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, size);
+}
+
+#endif	/* CLOBBER_FREED_MEMORY */
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+static inline void
+set_sentinel(void *base, Size offset)
+{
+	char	   *ptr = (char *) base + offset;
+
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, 1);
+	*ptr = 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+}
+
+static inline bool
+sentinel_ok(const void *base, Size offset)
+{
+	const char *ptr = (const char *) base + offset;
+	bool		ret;
+
+	VALGRIND_MAKE_MEM_DEFINED(ptr, 1);
+	ret = *ptr == 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+
+	return ret;
+}
+
+#endif	/* MEMORY_CONTEXT_CHECKING */
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+void		randomize_mem(char *ptr, size_t size);
+
+#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
+
+
 #endif   /* MEMDEBUG_H */
-- 
2.5.5

0002-slab-allocator-v9.patchtext/x-diff; name=0002-slab-allocator-v9.patchDownload
From ec3ffcd37b88a3b86d0a56eebc21c24a17e1b7c6 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Mon, 20 Feb 2017 20:16:57 +0100
Subject: [PATCH 2/3] slab allocator

---
 src/backend/replication/logical/reorderbuffer.c |  82 +--
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/slab.c                   | 800 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   2 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |  15 +-
 src/include/utils/memutils.h                    |   9 +
 7 files changed, 842 insertions(+), 69 deletions(-)
 create mode 100644 src/backend/utils/mmgr/slab.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 7dc97fa..85e52ea 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -156,10 +156,7 @@ static const Size max_changes_in_memory = 4096;
  * major bottleneck, especially when spilling to disk while decoding batch
  * workloads.
  */
-static const Size max_cached_changes = 4096 * 2;
 static const Size max_cached_tuplebufs = 4096 * 2;		/* ~8MB */
-static const Size max_cached_transactions = 512;
-
 
 /* ---------------------------------------
  * primary reorderbuffer support routines
@@ -241,6 +238,22 @@ ReorderBufferAllocate(void)
 
 	buffer->context = new_ctx;
 
+	buffer->change_context = SlabContextCreate(new_ctx,
+											   "Change",
+											   SLAB_DEFAULT_BLOCK_SIZE,
+											   sizeof(ReorderBufferChange));
+
+	buffer->txn_context = SlabContextCreate(new_ctx,
+											"TXN",
+											SLAB_DEFAULT_BLOCK_SIZE,
+											sizeof(ReorderBufferTXN));
+
+	buffer->tup_context = AllocSetContextCreate(new_ctx,
+									"TupleBuf",
+									ALLOCSET_DEFAULT_MINSIZE,
+									ALLOCSET_DEFAULT_INITSIZE,
+									ALLOCSET_DEFAULT_MAXSIZE);
+
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
 	hash_ctl.hcxt = buffer->context;
@@ -251,8 +264,6 @@ ReorderBufferAllocate(void)
 	buffer->by_txn_last_xid = InvalidTransactionId;
 	buffer->by_txn_last_txn = NULL;
 
-	buffer->nr_cached_transactions = 0;
-	buffer->nr_cached_changes = 0;
 	buffer->nr_cached_tuplebufs = 0;
 
 	buffer->outbuf = NULL;
@@ -261,8 +272,6 @@ ReorderBufferAllocate(void)
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
 	dlist_init(&buffer->toplevel_by_lsn);
-	dlist_init(&buffer->cached_transactions);
-	dlist_init(&buffer->cached_changes);
 	slist_init(&buffer->cached_tuplebufs);
 
 	return buffer;
@@ -291,19 +300,8 @@ ReorderBufferGetTXN(ReorderBuffer *rb)
 {
 	ReorderBufferTXN *txn;
 
-	/* check the slab cache */
-	if (rb->nr_cached_transactions > 0)
-	{
-		rb->nr_cached_transactions--;
-		txn = (ReorderBufferTXN *)
-			dlist_container(ReorderBufferTXN, node,
-							dlist_pop_head_node(&rb->cached_transactions));
-	}
-	else
-	{
-		txn = (ReorderBufferTXN *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN));
-	}
+	txn = (ReorderBufferTXN *)
+		MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
 
 	memset(txn, 0, sizeof(ReorderBufferTXN));
 
@@ -344,18 +342,7 @@ ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
 		txn->invalidations = NULL;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_transactions < max_cached_transactions)
-	{
-		rb->nr_cached_transactions++;
-		dlist_push_head(&rb->cached_transactions, &txn->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN));
-		VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node));
-	}
-	else
-	{
-		pfree(txn);
-	}
+	pfree(txn);
 }
 
 /*
@@ -366,19 +353,8 @@ ReorderBufferGetChange(ReorderBuffer *rb)
 {
 	ReorderBufferChange *change;
 
-	/* check the slab cache */
-	if (rb->nr_cached_changes)
-	{
-		rb->nr_cached_changes--;
-		change = (ReorderBufferChange *)
-			dlist_container(ReorderBufferChange, node,
-							dlist_pop_head_node(&rb->cached_changes));
-	}
-	else
-	{
-		change = (ReorderBufferChange *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange));
-	}
+	change = (ReorderBufferChange *)
+		MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
 
 	memset(change, 0, sizeof(ReorderBufferChange));
 	return change;
@@ -434,21 +410,9 @@ ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change)
 			break;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_changes < max_cached_changes)
-	{
-		rb->nr_cached_changes++;
-		dlist_push_head(&rb->cached_changes, &change->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange));
-		VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node));
-	}
-	else
-	{
-		pfree(change);
-	}
+	pfree(change);
 }
 
-
 /*
  * Get an unused, possibly preallocated, ReorderBufferTupleBuf fitting at
  * least a tuple of size tuple_len (excluding header overhead).
@@ -491,7 +455,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 	else
 	{
 		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->context,
+			MemoryContextAlloc(rb->tup_context,
 							   sizeof(ReorderBufferTupleBuf) +
 							   MAXIMUM_ALIGNOF + alloc_len);
 		tuple->alloc_tuple_size = alloc_len;
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index fc5f793..cd0e803 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o
+OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
new file mode 100644
index 0000000..f30c239
--- /dev/null
+++ b/src/backend/utils/mmgr/slab.c
@@ -0,0 +1,800 @@
+/*-------------------------------------------------------------------------
+ *
+ * slab.c
+ *	  SLAB allocator definitions.
+ *
+ * SLAB is a custom MemoryContext implementation designed for cases of
+ * equally-sized objects.
+ *
+ *
+ * Portions Copyright (c) 2017, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/slab.c
+ *
+ *
+ *	The constant allocation size allows significant simplification and various
+ *	optimizations. The blocks are carved into chunks of exactly the right size
+ *	(plus alignment), not wasting any memory.
+ *
+ *	The information about free chunks is maintained both at the block level and
+ *	global (context) level. This is possible as the chunk size (and thus also
+ *	the number of chunks per block) is fixed.
+ *
+ *	On each block, free chunks are tracked in a simple linked list. Contents
+ *	of free chunks is replaced with an index of the next free chunk, forming
+ *	a very simple linked list. Each block also contains a counter of free
+ *	chunks. Combined with the local block-level freelist, it makes it trivial
+ *	to eventually free the whole block.
+ *
+ *	At the context level, we use 'freelist' to track blocks ordered by number
+ *	of free chunks, starting with blocks having a single allocated chunk, and
+ *	with completely full blocks on the tail.
+ *
+ *	This also allows various optimizations - for example when searching for
+ *	free chunk, we the allocator reuses space from the most full blocks first,
+ *	in the hope that some of the less full blocks will get completely empty
+ *	(and returned back to the OS).
+ *
+ *	For each block, we maintain pointer to the first free chunk - this is quite
+ *	cheap and allows us to skip all the preceding used chunks, eliminating
+ *	a significant number of lookups in many common usage patters. In the worst
+ *	case this performs as if the pointer was not maintained.
+ *
+ *	We cache indexes of the first empty chunk on each block (firstFreeChunk),
+ *	and freelist index for blocks with least free chunks (minFreeChunks), so
+ *	that we don't have to search the freelist and block on every SlabAlloc()
+ *	call, which is quite expensive.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"
+
+
+#define SLAB_BLOCKHDRSZ MAXALIGN(sizeof(SlabBlockData))
+#define SLAB_CHUNKHDRSZ MAXALIGN(sizeof(SlabChunkData))
+
+/* Portion of SLAB_CHUNKHDRSZ examined outside slab.c. */
+#define SLAB_CHUNK_PUBLIC	\
+	(offsetof(SlabChunkData, size) + sizeof(Size))
+
+/* Portion of SLAB_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define SLAB_CHUNK_USED \
+	(offsetof(SlabChunkData, requested_size) + sizeof(Size))
+#else
+#define SLAB_CHUNK_USED \
+	(offsetof(SlabChunkData, size) + sizeof(Size))
+#endif
+
+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;
+
+/*
+ * SlabPointer
+ *		Aligned pointer which may be a member of an allocation set.
+ */
+typedef void *SlabPointer;
+
+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+	/* Allocation parameters for this context: */
+	Size		chunkSize;		/* chunk size */
+	Size		fullChunkSize;	/* chunk size including header and alignment */
+	Size		blockSize;		/* block size */
+	int			chunksPerBlock; /* number of chunks per block */
+	int			minFreeChunks;	/* min number of free chunks in any block */
+	int			nblocks;		/* number of blocks allocated */
+	/* blocks with free space, grouped by number of free chunks: */
+	dlist_head	freelist[FLEXIBLE_ARRAY_MEMBER];
+}	SlabContext;
+
+typedef SlabContext *Slab;
+
+/*
+ * SlabBlockData
+ *		Structure of a single block in SLAB allocator.
+ *
+ * node: doubly-linked list of blocks in global freelist
+ * nfree: number of free chunks in this block
+ * firstFreeChunk: index of the first free chunk
+ */
+typedef struct SlabBlockData
+{
+	dlist_node	node;			/* doubly-linked list */
+	int			nfree;			/* number of free chunks */
+	int			firstFreeChunk; /* index of the first free chunk in the block */
+}	SlabBlockData;
+
+/*
+ * SlabChunk
+ *		The prefix of each piece of memory in an SlabBlock
+ */
+typedef struct SlabChunkData
+{
+	/* block owning this chunk */
+	void	   *block;
+
+	/* include StandardChunkHeader because mcxt.c expects that */
+	StandardChunkHeader header;
+
+}	SlabChunkData;
+
+
+/*
+ * SlabIsValid
+ *		True iff set is valid allocation set.
+ */
+#define SlabIsValid(set) PointerIsValid(set)
+
+#define SlabPointerGetChunk(ptr)	\
+					((SlabChunk)(((char *)(ptr)) - SLAB_CHUNKHDRSZ))
+#define SlabChunkGetPointer(chk)	\
+					((SlabPointer)(((char *)(chk)) + SLAB_CHUNKHDRSZ))
+#define SlabBlockGetChunk(set, block, idx)	\
+					((SlabChunk) ((char *) (block) + sizeof(SlabBlockData)	\
+					+ (idx * set->fullChunkSize)))
+#define SlabBlockStart(block)	\
+				((char *) block + sizeof(SlabBlockData))
+#define SlabChunkIndex(set, block, chunk)	\
+				(((char *) chunk - SlabBlockStart(block)) / set->fullChunkSize)
+/*
+ * These functions implement the MemoryContext API for Slab contexts.
+ */
+static void *SlabAlloc(MemoryContext context, Size size);
+static void SlabFree(MemoryContext context, void *pointer);
+static void *SlabRealloc(MemoryContext context, void *pointer, Size size);
+static void SlabInit(MemoryContext context);
+static void SlabReset(MemoryContext context);
+static void SlabDelete(MemoryContext context);
+static Size SlabGetChunkSpace(MemoryContext context, void *pointer);
+static bool SlabIsEmpty(MemoryContext context);
+static void SlabStats(MemoryContext context, int level, bool print,
+		  MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void SlabCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Slab contexts.
+ */
+static MemoryContextMethods SlabMethods = {
+	SlabAlloc,
+	SlabFree,
+	SlabRealloc,
+	SlabInit,
+	SlabReset,
+	SlabDelete,
+	SlabGetChunkSpace,
+	SlabIsEmpty,
+	SlabStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,SlabCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabFree: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define SlabAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabAlloc: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define SlabFreeInfo(_cxt, _chunk)
+#define SlabAllocInfo(_cxt, _chunk)
+#endif
+
+
+/*
+ * SlabContextCreate
+ *		Create a new Slab context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (for debugging --- string will be copied)
+ * blockSize: allocation block size
+ * chunkSize: allocation chunk size
+ *
+ * The chunkSize may not exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - SLAB_BLOCKHDRSZ - SLAB_CHUNKHDRSZ
+ *
+ */
+MemoryContext
+SlabContextCreate(MemoryContext parent,
+				  const char *name,
+				  Size blockSize,
+				  Size chunkSize)
+{
+	int			chunksPerBlock;
+	Size		fullChunkSize;
+	Size		freelistSize;
+	Slab		set;
+
+	/* chunk, including SLAB header (both addresses nicely aligned) */
+	fullChunkSize = MAXALIGN(sizeof(SlabChunkData) + MAXALIGN(chunkSize));
+
+	/* Make sure the block can store at least one chunk. */
+	if (blockSize - sizeof(SlabBlockData) < fullChunkSize)
+		elog(ERROR, "block size %ld for slab is too small for %ld chunks",
+			 blockSize, chunkSize);
+
+	/* Compute maximum number of chunks per block */
+	chunksPerBlock = (blockSize - sizeof(SlabBlockData)) / fullChunkSize;
+
+	/* The freelist starts with 0, ends with chunksPerBlock. */
+	freelistSize = sizeof(dlist_head) * (chunksPerBlock + 1);
+
+	/* if we can't fit at least one chunk into the block, we're hosed */
+	Assert(chunksPerBlock > 0);
+
+	/* make sure the chunks actually fit on the block	*/
+	Assert((fullChunkSize * chunksPerBlock) + sizeof(SlabBlockData) <= blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (Slab) MemoryContextCreate(T_SlabContext,
+									 (offsetof(SlabContext, freelist) + freelistSize),
+									 &SlabMethods,
+									 parent,
+									 name);
+
+	set->blockSize = blockSize;
+	set->chunkSize = chunkSize;
+	set->fullChunkSize = fullChunkSize;
+	set->chunksPerBlock = chunksPerBlock;
+	set->nblocks = 0;
+	set->minFreeChunks = 0;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * SlabInit
+ *		Context-type-specific initialization routine.
+ */
+static void
+SlabInit(MemoryContext context)
+{
+	int			i;
+	Slab		set = (Slab) context;
+
+	/* initialize the freelist slots */
+	for (i = 0; i < (set->chunksPerBlock + 1); i++)
+		dlist_init(&set->freelist[i]);
+}
+
+/*
+ * SlabReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+SlabReset(MemoryContext context)
+{
+	int			i;
+	Slab		set = (Slab) context;
+
+	AssertArg(SlabIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	SlabCheck(context);
+#endif
+
+	/* walk over freelists and free the blocks */
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		dlist_mutable_iter miter;
+
+		dlist_foreach_modify(miter, &set->freelist[i])
+		{
+			SlabBlock	block = dlist_container(SlabBlockData, node, miter.cur);
+
+			dlist_delete(miter.cur);
+
+#ifdef CLOBBER_FREED_MEMORY
+			wipe_mem(block, set->blockSize);
+#endif
+			free(block);
+			set->nblocks--;
+		}
+	}
+
+	set->minFreeChunks = 0;
+
+	Assert(set->nblocks == 0);
+}
+
+/*
+ * SlabDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call SlabReset().
+ */
+static void
+SlabDelete(MemoryContext context)
+{
+	/* just reset the context */
+	SlabReset(context);
+}
+
+/*
+ * SlabAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ */
+static void *
+SlabAlloc(MemoryContext context, Size size)
+{
+	Slab		set = (Slab) context;
+	SlabBlock	block;
+	SlabChunk	chunk;
+	int			idx;
+
+	AssertArg(SlabIsValid(set));
+
+	Assert((set->minFreeChunks >= 0) && (set->minFreeChunks < set->chunksPerBlock));
+
+	/* make sure we only allow correct request size */
+	if (size != set->chunkSize)
+		elog(ERROR, "unexpected alloc chunk size %ld (expected %ld)",
+			 size, set->chunkSize);
+
+	/*
+	 * If there are no free chunks in any existing block, create a new block
+	 * and put it to the last freelist bucket.
+	 *
+	 * (set->minFreeChunks == 0) means there are no blocks with free chunks,
+	 * thanks to how minFreeChunks is updated at the end of SlabAlloc().
+	 */
+	if (set->minFreeChunks == 0)
+	{
+		block = (SlabBlock) malloc(set->blockSize);
+
+		if (block == NULL)
+			return NULL;
+
+		block->nfree = set->chunksPerBlock;
+		block->firstFreeChunk = 0;
+
+		/*
+		 * Put all the chunks on a freelist. Walk the chunks and point each
+		 * one to the next one.
+		 */
+		for (idx = 0; idx < set->chunksPerBlock; idx++)
+		{
+			chunk = SlabBlockGetChunk(set, block, idx);
+			*(int32 *)SlabChunkGetPointer(chunk) = (idx + 1);
+		}
+
+		/*
+		 * And add it to the last freelist with all chunks empty.
+		 *
+		 * XXX We know there are no blocks in the freelist, otherwise we
+		 * wouldn't need a new block.
+		 */
+		Assert(dlist_is_empty(&set->freelist[set->chunksPerBlock]));
+
+		dlist_push_head(&set->freelist[set->chunksPerBlock], &block->node);
+
+		set->minFreeChunks = set->chunksPerBlock;
+		set->nblocks += 1;
+	}
+
+	/* grab the block from the freelist (even the new block is there) */
+	block = dlist_head_element(SlabBlockData, node,
+							   &set->freelist[set->minFreeChunks]);
+
+	/* make sure we actually got a valid block, with matching nfree */
+	Assert(block != NULL);
+	Assert(set->minFreeChunks == block->nfree);
+	Assert(block->nfree > 0);
+
+	/* we know index of the first free chunk in the block */
+	idx = block->firstFreeChunk;
+
+	/* make sure the chunk index is valid, and that it's marked as empty */
+	Assert((idx >= 0) && (idx < set->chunksPerBlock));
+
+	/* compute the chunk location block start (after the block header) */
+	chunk = SlabBlockGetChunk(set, block, idx);
+
+	/*
+	 * Update the block nfree count, and also the minFreeChunks as we've
+	 * decreased nfree for a block with the minimum number of free chunks
+	 * (because that's how we chose the block).
+	 */
+	block->nfree--;
+	set->minFreeChunks = block->nfree;
+
+	/*
+	 * Remove the chunk from the freelist head. The index of the next free
+	 * chunk is stored in the chunk itself.
+	 */
+	block->firstFreeChunk = *(int32 *)SlabChunkGetPointer(chunk);
+
+	Assert(block->firstFreeChunk >= 0);
+	Assert(block->firstFreeChunk <= set->chunksPerBlock);
+
+	Assert(((block->nfree != 0) && (block->firstFreeChunk < set->chunksPerBlock)) ||
+		   ((block->nfree == 0) && (block->firstFreeChunk == set->chunksPerBlock)));
+
+	/* move the whole block to the right place in the freelist */
+	dlist_delete(&block->node);
+	dlist_push_head(&set->freelist[block->nfree], &block->node);
+
+	/*
+	 * And finally update minFreeChunks, i.e. the index to the block with the
+	 * lowest number of free chunks. We only need to do that when the block
+	 * got full (otherwise we know the current block is the right one). We'll
+	 * simply walk the freelist until we find a non-empty entry.
+	 */
+	if (set->minFreeChunks == 0)
+	{
+		for (idx = 1; idx <= set->chunksPerBlock; idx++)
+		{
+			if (dlist_is_empty(&set->freelist[idx]))
+				continue;
+
+			/* found a non-empty freelist */
+			set->minFreeChunks = idx;
+			break;
+		}
+	}
+
+	if (set->minFreeChunks == set->chunksPerBlock)
+		set->minFreeChunks = 0;
+
+	/* Prepare to initialize the chunk header. */
+	VALGRIND_MAKE_MEM_UNDEFINED(chunk, SLAB_CHUNK_USED);
+
+	chunk->block = (void *) block;
+
+	chunk->header.context = (MemoryContext) set;
+	chunk->header.size = MAXALIGN(size);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	chunk->header.requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+							   sizeof(chunk->header.requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->header.size)
+		set_sentinel(SlabChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) SlabChunkGetPointer(chunk), size);
+#endif
+
+	SlabAllocInfo(set, chunk);
+	return SlabChunkGetPointer(chunk);
+}
+
+/*
+ * SlabFree
+ *		Frees allocated memory; memory is removed from the set.
+ */
+static void
+SlabFree(MemoryContext context, void *pointer)
+{
+	int			idx;
+	Slab		set = (Slab) context;
+	SlabChunk	chunk = SlabPointerGetChunk(pointer);
+	SlabBlock	block = chunk->block;
+
+	SlabFreeInfo(set, chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < chunk->header.size)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+	/* compute index of the chunk with respect to block start */
+	idx = SlabChunkIndex(set, block, chunk);
+
+	/* mark the chunk as unused (zero the bit), and update block nfree count */
+	*(int32 *)pointer = block->firstFreeChunk;
+	block->firstFreeChunk = idx;
+	block->nfree++;
+
+	Assert(block->nfree > 0);
+	Assert(block->nfree <= set->chunksPerBlock);
+
+#ifdef CLOBBER_FREED_MEMORY
+	/* XXX don't wipe the int32 index, used for block-level freelist */
+	wipe_mem((char *)pointer + sizeof(int32), chunk->header.size - sizeof(int32));
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->header.requested_size = 0;
+#endif
+
+	/* remove the block from a freelist */
+	dlist_delete(&block->node);
+
+	/*
+	 * See if we need to update the minFreeChunks field for the set - we only
+	 * need to do that if there the block had that number of free chunks
+	 * before we freed one. In that case, we check if there still are blocks
+	 * in the original freelist and we either keep the current value (if there
+	 * still are blocks) or increment it by one (the new block is still the
+	 * one with minimum free chunks).
+	 *
+	 * The one exception is when the block will get completely free - in that
+	 * case we will free it, se we can't use it for minFreeChunks. It however
+	 * means there are no more blocks with free chunks.
+	 */
+	if (set->minFreeChunks == (block->nfree - 1))
+	{
+		/* Have we removed the last chunk from the freelist? */
+		if (dlist_is_empty(&set->freelist[set->minFreeChunks]))
+		{
+			/* but if we made the block entirely free, we'll free it */
+			if (block->nfree == set->chunksPerBlock)
+				set->minFreeChunks = 0;
+			else
+				set->minFreeChunks++;
+		}
+	}
+
+	/* If the block is now completely empty, free it. */
+	if (block->nfree == set->chunksPerBlock)
+	{
+		free(block);
+		set->nblocks--;
+	}
+	else
+		dlist_push_head(&set->freelist[block->nfree], &block->node);
+
+	Assert(set->nblocks >= 0);
+}
+
+/*
+ * SlabRealloc
+ *		As Slab is designed for allocating equally-sized chunks of memory, it
+ *		can't really do an actual realloc.
+ *
+ * We try to be gentle and allow calls with exactly the same size as in that
+ * case we can simply return the same chunk. When the size differs, we fail
+ * with assert failure or return NULL.
+ *
+ * We might be even support cases with (size < chunkSize). That however seems
+ * rather pointless - Slab is meant for chunks of constant size, and moreover
+ * realloc is usually used to enlarge the chunk.
+ *
+ * XXX Perhaps we should not be gentle at all and simply fails in all cases,
+ * to eliminate the (mostly pointless) uncertainty.
+ */
+static void *
+SlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Slab		set = (Slab) context;
+
+	/* can't do actual realloc with slab, but let's try to be gentle */
+	if (size == set->chunkSize)
+		return pointer;
+
+	elog(ERROR, "slab allocator does not support realloc()");
+}
+
+/*
+ * SlabGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+SlabGetChunkSpace(MemoryContext context, void *pointer)
+{
+	SlabChunk	chunk = SlabPointerGetChunk(pointer);
+
+	return chunk->header.size + SLAB_CHUNKHDRSZ;
+}
+
+/*
+ * SlabIsEmpty
+ *		Is an Slab empty of any allocated space?
+ */
+static bool
+SlabIsEmpty(MemoryContext context)
+{
+	Slab		set = (Slab) context;
+
+	return (set->nblocks == 0);
+}
+
+/*
+ * SlabStats
+ *		Compute stats about memory consumption of an Slab.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Slab into *totals.
+ */
+static void
+SlabStats(MemoryContext context, int level, bool print,
+		  MemoryContextCounters *totals)
+{
+	Slab		set = (Slab) context;
+	Size		nblocks = 0;
+	Size		freechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+	int			i;
+
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		dlist_iter	iter;
+
+		dlist_foreach(iter, &set->freelist[i])
+		{
+			SlabBlock	block = dlist_container(SlabBlockData, node, iter.cur);
+
+			nblocks++;
+			totalspace += set->blockSize;
+			freespace += set->fullChunkSize * block->nfree;
+			freechunks += block->nfree;
+		}
+	}
+
+	if (print)
+	{
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"Slab: %s: %zu total in %zd blocks; %zu free (%zd chunks); %zu used\n",
+				set->header.name, totalspace, nblocks, freespace, freechunks,
+				totalspace - freespace);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += freechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * SlabCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+SlabCheck(MemoryContext context)
+{
+	int			i;
+	Slab		slab = (Slab) context;
+	char	   *name = slab->header.name;
+	char	   *freechunks;
+
+	Assert(slab->chunksPerBlock > 0);
+
+	/* bitmap of free chunks on a block */
+	freechunks = palloc(slab->chunksPerBlock * sizeof(bool));
+
+	/* walk all the freelists */
+	for (i = 0; i <= slab->chunksPerBlock; i++)
+	{
+		int			j,
+					nfree;
+		dlist_iter	iter;
+
+		/* walk all blocks on this freelist */
+		dlist_foreach(iter, &slab->freelist[i])
+		{
+			int			idx;
+			SlabBlock	block = dlist_container(SlabBlockData, node, iter.cur);
+
+			/*
+			 * Make sure the number of free chunks (in the block header) matches
+			 * position in the freelist.
+			 */
+			if (block->nfree != i)
+				elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match freelist %d",
+					 name, block->nfree, block, i);
+
+			/* reset the bitmap of free chunks for this block */
+			memset(freechunks, 0, (slab->chunksPerBlock * sizeof(bool)));
+			idx = block->firstFreeChunk;
+
+			/*
+			 * Now walk through the chunks, count the free ones and also perform
+			 * some additional checks for the used ones. As the chunk freelist
+			 * is stored within the chunks themselves, we have to walk through
+			 * the chunks and construct our own bitmap.
+			 */
+
+			nfree = 0;
+			while (idx < slab->chunksPerBlock)
+			{
+				SlabChunk	chunk;
+
+				/* count the chunk as free, add it to the bitmap */
+				nfree++;
+				freechunks[idx] = true;
+
+				/* read index of the next free chunk */
+				chunk = SlabBlockGetChunk(slab, block, idx);
+				idx = *(int32 *)SlabChunkGetPointer(chunk);
+			}
+
+			for (j = 0; j < slab->chunksPerBlock; j++)
+			{
+				/* non-zero bit in the bitmap means chunk the chunk is used */
+				if (! freechunks[j])
+				{
+					SlabChunk	chunk = SlabBlockGetChunk(slab, block, j);
+
+					VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+										   sizeof(chunk->header.requested_size));
+
+					/* we're in a no-freelist branch */
+					VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+										   sizeof(chunk->header.requested_size));
+
+					/* chunks have both block and slab pointers, so check both */
+					if (chunk->block != block)
+						elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p",
+							 name, block, chunk);
+
+					if (chunk->header.context != (MemoryContext) slab)
+						elog(WARNING, "problem in slab %s: bogus slab link in block %p, chunk %p",
+							 name, block, chunk);
+
+					/* now make sure the chunk size is correct */
+					if (chunk->header.size != MAXALIGN(slab->chunkSize))
+						elog(WARNING, "problem in slab %s: bogus chunk size in block %p, chunk %p",
+							 name, block, chunk);
+
+					/* now make sure the chunk size is correct */
+					if (chunk->header.requested_size != slab->chunkSize)
+						elog(WARNING, "problem in slab %s: bogus chunk requested size in block %p, chunk %p",
+							 name, block, chunk);
+
+					/* there might be sentinel (thanks to alignment) */
+					if (chunk->header.requested_size < chunk->header.size &&
+						!sentinel_ok(chunk, SLAB_CHUNKHDRSZ + chunk->header.requested_size))
+						elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
+							 name, block, chunk);
+				}
+			}
+
+			/*
+			 * Make sure we got the expected number of free chunks (as tracked in
+			 * the block header).
+			 */
+			if (nfree != block->nfree)
+				elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match bitmap %d",
+					 name, block->nfree, block, nfree);
+		}
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index e487d17..fe6bc90 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,6 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext)))
+	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 95dd8ba..28aca92 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -278,6 +278,7 @@ typedef enum NodeTag
 	 */
 	T_MemoryContext,
 	T_AllocSetContext,
+	T_SlabContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 25b0fc8..c931e83 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -331,6 +331,13 @@ struct ReorderBuffer
 	MemoryContext context;
 
 	/*
+	 * Memory contexts for each type of object (TXNs, changes and tuple buffers)
+	 */
+	MemoryContext change_context;
+	MemoryContext txn_context;
+	MemoryContext tup_context;
+
+	/*
 	 * Data structure slab cache.
 	 *
 	 * We allocate/deallocate some structures very frequently, to avoid bigger
@@ -340,14 +347,6 @@ struct ReorderBuffer
 	 * on top of reorderbuffer.c
 	 */
 
-	/* cached ReorderBufferTXNs */
-	dlist_head	cached_transactions;
-	Size		nr_cached_transactions;
-
-	/* cached ReorderBufferChanges */
-	dlist_head	cached_changes;
-	Size		nr_cached_changes;
-
 	/* cached ReorderBufferTupleBufs */
 	slist_head	cached_tuplebufs;
 	Size		nr_cached_tuplebufs;
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 1d1035e..5223a4d 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -135,6 +135,12 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
 					  Size initBlockSize,
 					  Size maxBlockSize);
 
+/* slab.c */
+extern MemoryContext SlabContextCreate(MemoryContext parent,
+				  const char *name,
+				  Size blockSize,
+				  Size chunkSize);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
@@ -171,4 +177,7 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
  */
 #define ALLOCSET_SEPARATE_THRESHOLD  8192
 
+#define SLAB_DEFAULT_BLOCK_SIZE		(8 * 1024)
+#define SLAB_LARGE_BLOCK_SIZE		(8 * 1024 * 1024)
+
 #endif   /* MEMUTILS_H */
-- 
2.5.5

0003-generational-context-v9.patchtext/x-diff; name=0003-generational-context-v9.patchDownload
From f52f71265d91a96154b74ae00e6f99df2f6f3e48 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Mon, 20 Feb 2017 20:17:37 +0100
Subject: [PATCH 3/3] generational context

---
 src/backend/replication/logical/reorderbuffer.c |  78 +--
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/gen.c                    | 758 ++++++++++++++++++++++++
 src/backend/utils/mmgr/generation.c             | 758 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   4 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |  14 -
 src/include/utils/memutils.h                    |   5 +
 8 files changed, 1536 insertions(+), 84 deletions(-)
 create mode 100644 src/backend/utils/mmgr/gen.c
 create mode 100644 src/backend/utils/mmgr/generation.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 85e52ea..0bdc214 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -149,15 +149,6 @@ typedef struct ReorderBufferDiskChange
  */
 static const Size max_changes_in_memory = 4096;
 
-/*
- * We use a very simple form of a slab allocator for frequently allocated
- * objects, simply keeping a fixed number in a linked list when unused,
- * instead pfree()ing them. Without that in many workloads aset.c becomes a
- * major bottleneck, especially when spilling to disk while decoding batch
- * workloads.
- */
-static const Size max_cached_tuplebufs = 4096 * 2;		/* ~8MB */
-
 /* ---------------------------------------
  * primary reorderbuffer support routines
  * ---------------------------------------
@@ -248,11 +239,9 @@ ReorderBufferAllocate(void)
 											SLAB_DEFAULT_BLOCK_SIZE,
 											sizeof(ReorderBufferTXN));
 
-	buffer->tup_context = AllocSetContextCreate(new_ctx,
-									"TupleBuf",
-									ALLOCSET_DEFAULT_MINSIZE,
-									ALLOCSET_DEFAULT_INITSIZE,
-									ALLOCSET_DEFAULT_MAXSIZE);
+	buffer->tup_context = GenerationContextCreate(new_ctx,
+										   "Tuples",
+										   SLAB_LARGE_BLOCK_SIZE);
 
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
@@ -264,15 +253,12 @@ ReorderBufferAllocate(void)
 	buffer->by_txn_last_xid = InvalidTransactionId;
 	buffer->by_txn_last_txn = NULL;
 
-	buffer->nr_cached_tuplebufs = 0;
-
 	buffer->outbuf = NULL;
 	buffer->outbufsize = 0;
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
 	dlist_init(&buffer->toplevel_by_lsn);
-	slist_init(&buffer->cached_tuplebufs);
 
 	return buffer;
 }
@@ -425,42 +411,12 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/*
-	 * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-	 * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-	 * generated for oldtuples can be bigger, as they don't have out-of-line
-	 * toast columns.
-	 */
-	if (alloc_len < MaxHeapTupleSize)
-		alloc_len = MaxHeapTupleSize;
-
-
-	/* if small enough, check the slab cache */
-	if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs--;
-		tuple = slist_container(ReorderBufferTupleBuf, node,
-								slist_pop_head_node(&rb->cached_tuplebufs));
-		Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-		memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-		VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-		memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
-	}
-	else
-	{
-		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->tup_context,
-							   sizeof(ReorderBufferTupleBuf) +
-							   MAXIMUM_ALIGNOF + alloc_len);
-		tuple->alloc_tuple_size = alloc_len;
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-	}
+	tuple = (ReorderBufferTupleBuf *)
+		MemoryContextAlloc(rb->tup_context,
+						   sizeof(ReorderBufferTupleBuf) +
+						   MAXIMUM_ALIGNOF + alloc_len);
+	tuple->alloc_tuple_size = alloc_len;
+	tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
 
 	return tuple;
 }
@@ -474,21 +430,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
-	/* check whether to put into the slab cache, oversized tuples never are */
-	if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-		rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs++;
-		slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-	}
-	else
-	{
-		pfree(tuple);
-	}
+	pfree(tuple);
 }
 
 /*
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index cd0e803..7263399 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
+OBJS = aset.o generation.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/gen.c b/src/backend/utils/mmgr/gen.c
new file mode 100644
index 0000000..669a512
--- /dev/null
+++ b/src/backend/utils/mmgr/gen.c
@@ -0,0 +1,758 @@
+/*-------------------------------------------------------------------------
+ *
+ * generation.c
+ *	  Generational allocator definitions.
+ *
+ * Generation is a custom MemoryContext implementation designed for cases of
+ * chunks with similar lifespan.
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/Generation.c
+ *
+ *
+ *	This memory context is based on the assumption that the allocated chunks
+ *	have similar lifespan, i.e. that chunks allocated close from each other
+ *	(by time) will also be freed in close proximity, and mostly in the same
+ *	order. This is typical for various queue-like use cases, i.e. when tuples
+ *	are constructed, processed and then thrown away.
+ *
+ *	The memory context uses a very simple approach to free space management.
+ *	Instead of a complex global freelist, each block tracks a number
+ *	of allocated and freed chunks. The space release by freed chunks is not
+ *	reused, and once all chunks are freed (i.e. when nallocated == nfreed),
+ *	the whole block is thrown away. When the allocated chunks have similar
+ *	lifespan, this works very well and is extremely cheap.
+ *
+ *	The current implementation only uses a fixed block size - maybe it should
+ *	adapt a min/max block size range, and grow the blocks automatically.
+ *	It already uses dedicated blocks for oversized chunks.
+ *
+ *	XXX It might be possible to improve this by keeping a small freelist for
+ *	only a small number of recent blocks, but it's not clear it's worth the
+ *	additional complexity.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"
+
+
+#define Generation_BLOCKHDRSZ	MAXALIGN(sizeof(GenerationBlockData))
+#define Generation_CHUNKHDRSZ	MAXALIGN(sizeof(GenerationChunkData))
+
+/* Portion of Generation_CHUNKHDRSZ examined outside Generation.c. */
+#define Generation_CHUNK_PUBLIC	\
+	(offsetof(GenerationChunkData, size) + sizeof(Size))
+
+/* Portion of Generation_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunkData, requested_size) + sizeof(Size))
+#else
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunkData, size) + sizeof(Size))
+#endif
+
+typedef struct GenerationBlockData *GenerationBlock;	/* forward reference */
+typedef struct GenerationChunkData *GenerationChunk;
+
+typedef void *GenerationPointer;
+
+/*
+ * GenerationContext is a simple memory context not reusing allocated chunks, and
+ * freeing blocks once all chunks are freed.
+ */
+typedef struct GenerationContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+
+	/* Generationerational context parameters */
+	Size		blockSize;		/* block size */
+
+	GenerationBlock	block;			/* current (most recently allocated) block */
+	dlist_head	blocks;			/* list of blocks */
+
+}	GenerationContext;
+
+typedef GenerationContext *Generation;
+
+/*
+ * GenerationBlockData
+ *		A GenerationBlock is the unit of memory that is obtained by Generation.c
+ *		from malloc().  It contains one or more GenerationChunks, which are
+ *		the units requested by palloc() and freed by pfree().  GenerationChunks
+ *		cannot be returned to malloc() individually, instead pfree()
+ *		updates a free counter on a block and when all chunks on a block
+ *		are freed the whole block is returned to malloc().
+ *
+ *		GenerationBlockData is the header data for a block --- the usable space
+ *		within the block begins at the next alignment boundary.
+ */
+typedef struct GenerationBlockData
+{
+	dlist_node	node;			/* doubly-linked list */
+	int			nchunks;		/* number of chunks in the block */
+	int			nfree;			/* number of free chunks */
+	char	   *freeptr;		/* start of free space in this block */
+	char	   *endptr;			/* end of space in this block */
+}	GenerationBlockData;
+
+/*
+ * GenerationChunk
+ *		The prefix of each piece of memory in an GenerationBlock
+ */
+typedef struct GenerationChunkData
+{
+	/* block owning this chunk */
+	void	   *block;
+
+	/* include StandardChunkHeader because mcxt.c expects that */
+	StandardChunkHeader header;
+
+}	GenerationChunkData;
+
+
+/*
+ * GenerationIsValid
+ *		True iff set is valid allocation set.
+ */
+#define GenerationIsValid(set) PointerIsValid(set)
+
+#define GenerationPointerGetChunk(ptr) \
+					((GenerationChunk)(((char *)(ptr)) - Generation_CHUNKHDRSZ))
+#define GenerationChunkGetPointer(chk) \
+					((GenerationPointer)(((char *)(chk)) + Generation_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Generation contexts.
+ */
+static void *GenerationAlloc(MemoryContext context, Size size);
+static void GenerationFree(MemoryContext context, void *pointer);
+static void *GenerationRealloc(MemoryContext context, void *pointer, Size size);
+static void GenerationInit(MemoryContext context);
+static void GenerationReset(MemoryContext context);
+static void GenerationDelete(MemoryContext context);
+static Size GenerationGetChunkSpace(MemoryContext context, void *pointer);
+static bool GenerationIsEmpty(MemoryContext context);
+static void GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void GenerationCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Generation contexts.
+ */
+static MemoryContextMethods GenerationMethods = {
+	GenerationAlloc,
+	GenerationFree,
+	GenerationRealloc,
+	GenerationInit,
+	GenerationReset,
+	GenerationDelete,
+	GenerationGetChunkSpace,
+	GenerationIsEmpty,
+	GenerationStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,GenerationCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define GenerationFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationFree: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define GenerationAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationAlloc: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define GenerationFreeInfo(_cxt, _chunk)
+#define GenerationAllocInfo(_cxt, _chunk)
+#endif
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenerationContextCreate
+ *		Create a new Generation context.
+ */
+MemoryContext
+GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize)
+{
+	Generation			set;
+
+	/*
+	 * First, validate allocation parameters.  (If we're going to throw an
+	 * error, we should do so before the context is created, not after.)  We
+	 * somewhat arbitrarily enforce a minimum 1K block size, mostly because
+	 * that's what AllocSet does.
+	 */
+	if (blockSize != MAXALIGN(blockSize) ||
+		blockSize < 1024 ||
+		!AllocHugeSizeIsValid(blockSize))
+		elog(ERROR, "invalid blockSize for memory context: %zu",
+			 blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (Generation) MemoryContextCreate(T_GenerationContext,
+									sizeof(GenerationContext),
+									&GenerationMethods,
+									parent,
+									name);
+
+	set->blockSize = blockSize;
+	set->block = NULL;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * GenerationInit
+ *		Context-type-specific initialization routine.
+ */
+static void
+GenerationInit(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+
+	dlist_init(&set->blocks);
+}
+
+/*
+ * GenerationReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+GenerationReset(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+	dlist_mutable_iter miter;
+
+	AssertArg(GenerationIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	GenerationCheck(context);
+#endif
+
+	dlist_foreach_modify(miter, &set->blocks)
+	{
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, miter.cur);
+
+		dlist_delete(miter.cur);
+
+		/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+		wipe_mem(block, set->blockSize);
+#endif
+
+		free(block);
+	}
+
+	set->block = NULL;
+
+	Assert(dlist_is_empty(&set->blocks));
+}
+
+/*
+ * GenerationDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call GenerationReset() which does all the
+ *		dirty work.
+ */
+static void
+GenerationDelete(MemoryContext context)
+{
+	/* just reset (although not really necessary) */
+	GenerationReset(context);
+}
+
+/*
+ * GenerationAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - Generation_BLOCKHDRSZ - Generation_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+GenerationAlloc(MemoryContext context, Size size)
+{
+	Generation			set = (Generation) context;
+	GenerationBlock	block;
+	GenerationChunk	chunk;
+	Size		chunk_size = MAXALIGN(size);
+
+	/* is it an over-sized chunk? if yes, allocate special block */
+	if (chunk_size > set->blockSize / 8)
+	{
+		Size		blksize = chunk_size + Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ;
+
+		block = (GenerationBlock) malloc(blksize);
+		if (block == NULL)
+			return NULL;
+
+		/* block with a single (used) chunk */
+		block->nchunks = 1;
+		block->nfree = 0;
+
+		/* the block is completely full */
+		block->freeptr = block->endptr = ((char *) block) + blksize;
+
+		chunk = (GenerationChunk) (((char *) block) + Generation_BLOCKHDRSZ);
+		chunk->header.context = (MemoryContext) set;
+		chunk->header.size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+		/* Valgrind: Will be made NOACCESS below. */
+		chunk->header.requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < chunk_size)
+			set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* fill the allocated space with junk */
+		randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+		/* add the block to the list of allocated blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		GenerationAllocInfo(set, chunk);
+
+		/*
+		 * Chunk header public fields remain DEFINED.  The requested
+		 * allocation itself can be NOACCESS or UNDEFINED; our caller will
+		 * soon make it UNDEFINED.  Make extra space at the end of the chunk,
+		 * if any, NOACCESS.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS((char *) chunk + Generation_CHUNK_PUBLIC,
+							 chunk_size + Generation_CHUNKHDRSZ - Generation_CHUNK_PUBLIC);
+
+		return GenerationChunkGetPointer(chunk);
+	}
+
+	/*
+	 * Not an over-sized chunk. Is there enough space on the current block? If
+	 * not, allocate a new "regular" block.
+	 */
+	block = set->block;
+
+	if ((block == NULL) ||
+		(block->endptr - block->freeptr) < Generation_CHUNKHDRSZ + chunk_size)
+	{
+		Size		blksize = set->blockSize;
+
+		block = (GenerationBlock) malloc(blksize);
+
+		if (block == NULL)
+			return NULL;
+
+		block->nchunks = 0;
+		block->nfree = 0;
+
+		block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ;
+		block->endptr = ((char *) block) + blksize;
+
+		/* Mark unallocated space NOACCESS. */
+		VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+								   blksize - Generation_BLOCKHDRSZ);
+
+		/* add it to the doubly-linked list of blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		/* and also use it as the current allocation block */
+		set->block = block;
+	}
+
+	/* we're supposed to have a block with enough free space now */
+	Assert(block != NULL);
+	Assert((block->endptr - block->freeptr) >= Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk = (GenerationChunk) block->freeptr;
+
+	block->nchunks += 1;
+	block->freeptr += (Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk->block = block;
+
+	chunk->header.context = (MemoryContext) set;
+	chunk->header.size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Valgrind: Free list requested_size should be DEFINED. */
+	chunk->header.requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+							   sizeof(chunk->header.requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->header.size)
+		set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+	GenerationAllocInfo(set, chunk);
+	return GenerationChunkGetPointer(chunk);
+}
+
+/*
+ * GenerationFree
+ *		Update number of chunks on the block, and if all chunks on the block
+ *		are freeed then discard the block.
+ */
+static void
+GenerationFree(MemoryContext context, void *pointer)
+{
+	Generation			set = (Generation) context;
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+	GenerationBlock	block = chunk->block;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < chunk->header.size)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->header.size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->header.requested_size = 0;
+#endif
+
+	block->nfree += 1;
+
+	Assert(block->nchunks > 0);
+	Assert(block->nfree <= block->nchunks);
+
+	/* If there are still allocated chunks on the block, we're done. */
+	if (block->nfree < block->nchunks)
+		return;
+
+	/*
+	 * The block is empty, so let's get rid of it. First remove it from the
+	 * list of blocks, then return it to malloc().
+	 */
+	dlist_delete(&block->node);
+
+	/* Also make sure the block is not marked as the current block. */
+	if (set->block == block)
+		set->block = NULL;
+
+	free(block);
+}
+
+/*
+ * GenerationRealloc
+ *		When handling repalloc, we simply allocate a new chunk, copy the data
+ *		and discard the old one. The only exception is when the new size fits
+ *		into the old chunk - in that case we just update chunk header.
+ */
+static void *
+GenerationRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Generation			set = (Generation) context;
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+	Size		oldsize = chunk->header.size;
+	GenerationPointer	newPointer;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < oldsize)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+	/*
+	 * Maybe the allocated area already is >= the new size.  (In particular,
+	 * we always fall out here if the requested size is a decrease.)
+	 *
+	 * This memory context is not use the power-of-2 chunk sizing and instead
+	 * carves the chunks to be as small as possible, so most repalloc() calls
+	 * will end up in the palloc/memcpy/pfree branch.
+	 *
+	 * XXX Perhaps we should annotate this condition with unlikely()?
+	 */
+	if (oldsize >= size)
+	{
+#ifdef MEMORY_CONTEXT_CHECKING
+		Size		oldrequest = chunk->header.requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* We can only fill the extra space if we know the prior request */
+		if (size > oldrequest)
+			randomize_mem((char *) pointer + oldrequest,
+						  size - oldrequest);
+#endif
+
+		chunk->header.requested_size = size;
+		VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+								   sizeof(chunk->header.requested_size));
+
+		/*
+		 * If this is an increase, mark any newly-available part UNDEFINED.
+		 * Otherwise, mark the obsolete part NOACCESS.
+		 */
+		if (size > oldrequest)
+			VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+										size - oldrequest);
+		else
+			VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+									   oldsize - size);
+
+		/* set mark to catch clobber of "unused" space */
+		if (size < oldsize)
+			set_sentinel(pointer, size);
+#else							/* !MEMORY_CONTEXT_CHECKING */
+
+		/*
+		 * We don't have the information to determine whether we're growing
+		 * the old request or shrinking it, so we conservatively mark the
+		 * entire new allocation DEFINED.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize);
+		VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+		return pointer;
+	}
+
+	/* allocate new chunk */
+	newPointer = GenerationAlloc((MemoryContext) set, size);
+
+	/* leave immediately if request was not completed */
+	if (newPointer == NULL)
+		return NULL;
+
+	/*
+	 * GenerationSetAlloc() just made the region NOACCESS.  Change it to UNDEFINED
+	 * for the moment; memcpy() will then transfer definedness from the old
+	 * allocation to the new.  If we know the old allocation, copy just that
+	 * much.  Otherwise, make the entire old chunk defined to avoid errors as
+	 * we copy the currently-NOACCESS trailing bytes.
+	 */
+	VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+	oldsize = chunk->header.requested_size;
+#else
+	VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+	/* transfer existing data (certain to fit) */
+	memcpy(newPointer, pointer, oldsize);
+
+	/* free old chunk */
+	GenerationFree((MemoryContext) set, pointer);
+
+	return newPointer;
+}
+
+/*
+ * GenerationGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+GenerationGetChunkSpace(MemoryContext context, void *pointer)
+{
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+
+	return chunk->header.size + Generation_CHUNKHDRSZ;
+}
+
+/*
+ * GenerationIsEmpty
+ *		Is an Generation empty of any allocated space?
+ */
+static bool
+GenerationIsEmpty(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+
+	return dlist_is_empty(&set->blocks);
+}
+
+/*
+ * GenerationStats
+ *		Compute stats about memory consumption of an Generation.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Generation into *totals.
+ *
+ * XXX freespace only accounts for empty space at the end of the block, not
+ * space of freed chunks (which is unknown).
+ */
+static void
+GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals)
+{
+	Generation			set = (Generation) context;
+
+	Size		nblocks = 0;
+	Size		nchunks = 0;
+	Size		nfreechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+
+	dlist_iter	iter;
+
+	dlist_foreach(iter, &set->blocks)
+	{
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, iter.cur);
+
+		nblocks++;
+		nchunks += block->nchunks;
+		nfreechunks += block->nfree;
+		totalspace += set->blockSize;
+		freespace += (block->endptr - block->freeptr);
+	}
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"Generation: %s: %zu total in %zd blocks (%zd chunks); %zu free (%zd chunks); %zu used\n",
+				set->header.name, totalspace, nblocks, nchunks, freespace,
+				nfreechunks, totalspace - freespace);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += nfreechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenerationCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+GenerationCheck(MemoryContext context)
+{
+	Generation			Generation = (Generation) context;
+	char	   *name = Generation->header.name;
+	dlist_iter	iter;
+
+	/* walk all blocks in this context */
+	dlist_foreach(iter, &Generation->blocks)
+	{
+		int			nfree,
+					nchunks;
+		char	   *ptr;
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, iter.cur);
+
+		/* We can't free more chunks than allocated. */
+		if (block->nfree <= block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p exceeds %d allocated",
+				 name, block->nfree, block, block->nchunks);
+
+		/* Now walk through the chunks and count them. */
+		nfree = 0;
+		nchunks = 0;
+		ptr = ((char *) block) + Generation_BLOCKHDRSZ;
+
+		while (ptr < block->freeptr)
+		{
+			GenerationChunk	chunk = (GenerationChunk)ptr;
+
+			/* move to the next chunk */
+			ptr += (chunk->header.size + Generation_CHUNKHDRSZ);
+
+			/* chunks have both block and context pointers, so check both */
+			if (chunk->block != block)
+				elog(WARNING, "problem in Generation %s: bogus block link in block %p, chunk %p",
+					 name, block, chunk);
+
+			if (chunk->header.context != (MemoryContext) Generation)
+				elog(WARNING, "problem in Generation %s: bogus context link in block %p, chunk %p",
+					 name, block, chunk);
+
+			nchunks += 1;
+
+			/* if requested_size==0, the chunk was freed */
+			if (chunk->header.requested_size > 0)
+			{
+				/* if the chunk was not freed, we can trigger valgrind checks */
+				VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+									   sizeof(chunk->header.requested_size));
+
+				/* we're in a no-freelist branch */
+				VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+									   sizeof(chunk->header.requested_size));
+
+				/* now make sure the chunk size is correct */
+				if (chunk->header.size != MAXALIGN(chunk->header.requested_size))
+					elog(WARNING, "problem in Generation %s: bogus chunk size in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* there might be sentinel (thanks to alignment) */
+				if (chunk->header.requested_size < chunk->header.size &&
+					!sentinel_ok(chunk, Generation_CHUNKHDRSZ + chunk->header.requested_size))
+					elog(WARNING, "problem in Generation %s: detected write past chunk end in block %p, chunk %p",
+						 name, block, chunk);
+			}
+			else
+				nfree += 1;
+		}
+
+		/*
+		 * Make sure we got the expected number of allocated and free chunks
+		 * (as tracked in the block header).
+		 */
+		if (nchunks != block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of allocated chunks %d in block %p does not match header %d",
+				 name, nchunks, block, block->nchunks);
+
+		if (nfree != block->nfree)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p does not match header %d",
+				 name, nfree, block, block->nfree);
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c
new file mode 100644
index 0000000..7a25eb2
--- /dev/null
+++ b/src/backend/utils/mmgr/generation.c
@@ -0,0 +1,758 @@
+/*-------------------------------------------------------------------------
+ *
+ * generation.c
+ *	  Generational allocator definitions.
+ *
+ * Generation is a custom MemoryContext implementation designed for cases of
+ * chunks with similar lifespan.
+ *
+ * Portions Copyright (c) 2017, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/Generation.c
+ *
+ *
+ *	This memory context is based on the assumption that the allocated chunks
+ *	have similar lifespan, i.e. that chunks allocated close from each other
+ *	(by time) will also be freed in close proximity, and mostly in the same
+ *	order. This is typical for various queue-like use cases, i.e. when tuples
+ *	are constructed, processed and then thrown away.
+ *
+ *	The memory context uses a very simple approach to free space management.
+ *	Instead of a complex global freelist, each block tracks a number
+ *	of allocated and freed chunks. The space release by freed chunks is not
+ *	reused, and once all chunks are freed (i.e. when nallocated == nfreed),
+ *	the whole block is thrown away. When the allocated chunks have similar
+ *	lifespan, this works very well and is extremely cheap.
+ *
+ *	The current implementation only uses a fixed block size - maybe it should
+ *	adapt a min/max block size range, and grow the blocks automatically.
+ *	It already uses dedicated blocks for oversized chunks.
+ *
+ *	XXX It might be possible to improve this by keeping a small freelist for
+ *	only a small number of recent blocks, but it's not clear it's worth the
+ *	additional complexity.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"
+
+
+#define Generation_BLOCKHDRSZ	MAXALIGN(sizeof(GenerationBlockData))
+#define Generation_CHUNKHDRSZ	MAXALIGN(sizeof(GenerationChunkData))
+
+/* Portion of Generation_CHUNKHDRSZ examined outside Generation.c. */
+#define Generation_CHUNK_PUBLIC	\
+	(offsetof(GenerationChunkData, size) + sizeof(Size))
+
+/* Portion of Generation_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunkData, requested_size) + sizeof(Size))
+#else
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunkData, size) + sizeof(Size))
+#endif
+
+typedef struct GenerationBlockData *GenerationBlock;	/* forward reference */
+typedef struct GenerationChunkData *GenerationChunk;
+
+typedef void *GenerationPointer;
+
+/*
+ * GenerationContext is a simple memory context not reusing allocated chunks, and
+ * freeing blocks once all chunks are freed.
+ */
+typedef struct GenerationContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+
+	/* Generationerational context parameters */
+	Size		blockSize;		/* block size */
+
+	GenerationBlock	block;			/* current (most recently allocated) block */
+	dlist_head	blocks;			/* list of blocks */
+
+}	GenerationContext;
+
+typedef GenerationContext *Generation;
+
+/*
+ * GenerationBlockData
+ *		A GenerationBlock is the unit of memory that is obtained by Generation.c
+ *		from malloc().  It contains one or more GenerationChunks, which are
+ *		the units requested by palloc() and freed by pfree().  GenerationChunks
+ *		cannot be returned to malloc() individually, instead pfree()
+ *		updates a free counter on a block and when all chunks on a block
+ *		are freed the whole block is returned to malloc().
+ *
+ *		GenerationBlockData is the header data for a block --- the usable space
+ *		within the block begins at the next alignment boundary.
+ */
+typedef struct GenerationBlockData
+{
+	dlist_node	node;			/* doubly-linked list */
+	int			nchunks;		/* number of chunks in the block */
+	int			nfree;			/* number of free chunks */
+	char	   *freeptr;		/* start of free space in this block */
+	char	   *endptr;			/* end of space in this block */
+}	GenerationBlockData;
+
+/*
+ * GenerationChunk
+ *		The prefix of each piece of memory in an GenerationBlock
+ */
+typedef struct GenerationChunkData
+{
+	/* block owning this chunk */
+	void	   *block;
+
+	/* include StandardChunkHeader because mcxt.c expects that */
+	StandardChunkHeader header;
+
+}	GenerationChunkData;
+
+
+/*
+ * GenerationIsValid
+ *		True iff set is valid allocation set.
+ */
+#define GenerationIsValid(set) PointerIsValid(set)
+
+#define GenerationPointerGetChunk(ptr) \
+					((GenerationChunk)(((char *)(ptr)) - Generation_CHUNKHDRSZ))
+#define GenerationChunkGetPointer(chk) \
+					((GenerationPointer)(((char *)(chk)) + Generation_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Generation contexts.
+ */
+static void *GenerationAlloc(MemoryContext context, Size size);
+static void GenerationFree(MemoryContext context, void *pointer);
+static void *GenerationRealloc(MemoryContext context, void *pointer, Size size);
+static void GenerationInit(MemoryContext context);
+static void GenerationReset(MemoryContext context);
+static void GenerationDelete(MemoryContext context);
+static Size GenerationGetChunkSpace(MemoryContext context, void *pointer);
+static bool GenerationIsEmpty(MemoryContext context);
+static void GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void GenerationCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Generation contexts.
+ */
+static MemoryContextMethods GenerationMethods = {
+	GenerationAlloc,
+	GenerationFree,
+	GenerationRealloc,
+	GenerationInit,
+	GenerationReset,
+	GenerationDelete,
+	GenerationGetChunkSpace,
+	GenerationIsEmpty,
+	GenerationStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,GenerationCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define GenerationFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationFree: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define GenerationAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationAlloc: %s: %p, %lu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define GenerationFreeInfo(_cxt, _chunk)
+#define GenerationAllocInfo(_cxt, _chunk)
+#endif
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenerationContextCreate
+ *		Create a new Generation context.
+ */
+MemoryContext
+GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize)
+{
+	Generation			set;
+
+	/*
+	 * First, validate allocation parameters.  (If we're going to throw an
+	 * error, we should do so before the context is created, not after.)  We
+	 * somewhat arbitrarily enforce a minimum 1K block size, mostly because
+	 * that's what AllocSet does.
+	 */
+	if (blockSize != MAXALIGN(blockSize) ||
+		blockSize < 1024 ||
+		!AllocHugeSizeIsValid(blockSize))
+		elog(ERROR, "invalid blockSize for memory context: %zu",
+			 blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (Generation) MemoryContextCreate(T_GenerationContext,
+									sizeof(GenerationContext),
+									&GenerationMethods,
+									parent,
+									name);
+
+	set->blockSize = blockSize;
+	set->block = NULL;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * GenerationInit
+ *		Context-type-specific initialization routine.
+ */
+static void
+GenerationInit(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+
+	dlist_init(&set->blocks);
+}
+
+/*
+ * GenerationReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+GenerationReset(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+	dlist_mutable_iter miter;
+
+	AssertArg(GenerationIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	GenerationCheck(context);
+#endif
+
+	dlist_foreach_modify(miter, &set->blocks)
+	{
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, miter.cur);
+
+		dlist_delete(miter.cur);
+
+		/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+		wipe_mem(block, set->blockSize);
+#endif
+
+		free(block);
+	}
+
+	set->block = NULL;
+
+	Assert(dlist_is_empty(&set->blocks));
+}
+
+/*
+ * GenerationDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call GenerationReset() which does all the
+ *		dirty work.
+ */
+static void
+GenerationDelete(MemoryContext context)
+{
+	/* just reset (although not really necessary) */
+	GenerationReset(context);
+}
+
+/*
+ * GenerationAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - Generation_BLOCKHDRSZ - Generation_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+GenerationAlloc(MemoryContext context, Size size)
+{
+	Generation			set = (Generation) context;
+	GenerationBlock	block;
+	GenerationChunk	chunk;
+	Size		chunk_size = MAXALIGN(size);
+
+	/* is it an over-sized chunk? if yes, allocate special block */
+	if (chunk_size > set->blockSize / 8)
+	{
+		Size		blksize = chunk_size + Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ;
+
+		block = (GenerationBlock) malloc(blksize);
+		if (block == NULL)
+			return NULL;
+
+		/* block with a single (used) chunk */
+		block->nchunks = 1;
+		block->nfree = 0;
+
+		/* the block is completely full */
+		block->freeptr = block->endptr = ((char *) block) + blksize;
+
+		chunk = (GenerationChunk) (((char *) block) + Generation_BLOCKHDRSZ);
+		chunk->header.context = (MemoryContext) set;
+		chunk->header.size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+		/* Valgrind: Will be made NOACCESS below. */
+		chunk->header.requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < chunk_size)
+			set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* fill the allocated space with junk */
+		randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+		/* add the block to the list of allocated blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		GenerationAllocInfo(set, chunk);
+
+		/*
+		 * Chunk header public fields remain DEFINED.  The requested
+		 * allocation itself can be NOACCESS or UNDEFINED; our caller will
+		 * soon make it UNDEFINED.  Make extra space at the end of the chunk,
+		 * if any, NOACCESS.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS((char *) chunk + Generation_CHUNK_PUBLIC,
+							 chunk_size + Generation_CHUNKHDRSZ - Generation_CHUNK_PUBLIC);
+
+		return GenerationChunkGetPointer(chunk);
+	}
+
+	/*
+	 * Not an over-sized chunk. Is there enough space on the current block? If
+	 * not, allocate a new "regular" block.
+	 */
+	block = set->block;
+
+	if ((block == NULL) ||
+		(block->endptr - block->freeptr) < Generation_CHUNKHDRSZ + chunk_size)
+	{
+		Size		blksize = set->blockSize;
+
+		block = (GenerationBlock) malloc(blksize);
+
+		if (block == NULL)
+			return NULL;
+
+		block->nchunks = 0;
+		block->nfree = 0;
+
+		block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ;
+		block->endptr = ((char *) block) + blksize;
+
+		/* Mark unallocated space NOACCESS. */
+		VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+								   blksize - Generation_BLOCKHDRSZ);
+
+		/* add it to the doubly-linked list of blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		/* and also use it as the current allocation block */
+		set->block = block;
+	}
+
+	/* we're supposed to have a block with enough free space now */
+	Assert(block != NULL);
+	Assert((block->endptr - block->freeptr) >= Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk = (GenerationChunk) block->freeptr;
+
+	block->nchunks += 1;
+	block->freeptr += (Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk->block = block;
+
+	chunk->header.context = (MemoryContext) set;
+	chunk->header.size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Valgrind: Free list requested_size should be DEFINED. */
+	chunk->header.requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+							   sizeof(chunk->header.requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->header.size)
+		set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+	GenerationAllocInfo(set, chunk);
+	return GenerationChunkGetPointer(chunk);
+}
+
+/*
+ * GenerationFree
+ *		Update number of chunks on the block, and if all chunks on the block
+ *		are freeed then discard the block.
+ */
+static void
+GenerationFree(MemoryContext context, void *pointer)
+{
+	Generation			set = (Generation) context;
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+	GenerationBlock	block = chunk->block;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < chunk->header.size)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->header.size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->header.requested_size = 0;
+#endif
+
+	block->nfree += 1;
+
+	Assert(block->nchunks > 0);
+	Assert(block->nfree <= block->nchunks);
+
+	/* If there are still allocated chunks on the block, we're done. */
+	if (block->nfree < block->nchunks)
+		return;
+
+	/*
+	 * The block is empty, so let's get rid of it. First remove it from the
+	 * list of blocks, then return it to malloc().
+	 */
+	dlist_delete(&block->node);
+
+	/* Also make sure the block is not marked as the current block. */
+	if (set->block == block)
+		set->block = NULL;
+
+	free(block);
+}
+
+/*
+ * GenerationRealloc
+ *		When handling repalloc, we simply allocate a new chunk, copy the data
+ *		and discard the old one. The only exception is when the new size fits
+ *		into the old chunk - in that case we just update chunk header.
+ */
+static void *
+GenerationRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Generation			set = (Generation) context;
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+	Size		oldsize = chunk->header.size;
+	GenerationPointer	newPointer;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < oldsize)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+	/*
+	 * Maybe the allocated area already is >= the new size.  (In particular,
+	 * we always fall out here if the requested size is a decrease.)
+	 *
+	 * This memory context is not use the power-of-2 chunk sizing and instead
+	 * carves the chunks to be as small as possible, so most repalloc() calls
+	 * will end up in the palloc/memcpy/pfree branch.
+	 *
+	 * XXX Perhaps we should annotate this condition with unlikely()?
+	 */
+	if (oldsize >= size)
+	{
+#ifdef MEMORY_CONTEXT_CHECKING
+		Size		oldrequest = chunk->header.requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* We can only fill the extra space if we know the prior request */
+		if (size > oldrequest)
+			randomize_mem((char *) pointer + oldrequest,
+						  size - oldrequest);
+#endif
+
+		chunk->header.requested_size = size;
+		VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+								   sizeof(chunk->header.requested_size));
+
+		/*
+		 * If this is an increase, mark any newly-available part UNDEFINED.
+		 * Otherwise, mark the obsolete part NOACCESS.
+		 */
+		if (size > oldrequest)
+			VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+										size - oldrequest);
+		else
+			VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+									   oldsize - size);
+
+		/* set mark to catch clobber of "unused" space */
+		if (size < oldsize)
+			set_sentinel(pointer, size);
+#else							/* !MEMORY_CONTEXT_CHECKING */
+
+		/*
+		 * We don't have the information to determine whether we're growing
+		 * the old request or shrinking it, so we conservatively mark the
+		 * entire new allocation DEFINED.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize);
+		VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+		return pointer;
+	}
+
+	/* allocate new chunk */
+	newPointer = GenerationAlloc((MemoryContext) set, size);
+
+	/* leave immediately if request was not completed */
+	if (newPointer == NULL)
+		return NULL;
+
+	/*
+	 * GenerationSetAlloc() just made the region NOACCESS.  Change it to UNDEFINED
+	 * for the moment; memcpy() will then transfer definedness from the old
+	 * allocation to the new.  If we know the old allocation, copy just that
+	 * much.  Otherwise, make the entire old chunk defined to avoid errors as
+	 * we copy the currently-NOACCESS trailing bytes.
+	 */
+	VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+	oldsize = chunk->header.requested_size;
+#else
+	VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+	/* transfer existing data (certain to fit) */
+	memcpy(newPointer, pointer, oldsize);
+
+	/* free old chunk */
+	GenerationFree((MemoryContext) set, pointer);
+
+	return newPointer;
+}
+
+/*
+ * GenerationGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+GenerationGetChunkSpace(MemoryContext context, void *pointer)
+{
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+
+	return chunk->header.size + Generation_CHUNKHDRSZ;
+}
+
+/*
+ * GenerationIsEmpty
+ *		Is an Generation empty of any allocated space?
+ */
+static bool
+GenerationIsEmpty(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+
+	return dlist_is_empty(&set->blocks);
+}
+
+/*
+ * GenerationStats
+ *		Compute stats about memory consumption of an Generation.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Generation into *totals.
+ *
+ * XXX freespace only accounts for empty space at the end of the block, not
+ * space of freed chunks (which is unknown).
+ */
+static void
+GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals)
+{
+	Generation			set = (Generation) context;
+
+	Size		nblocks = 0;
+	Size		nchunks = 0;
+	Size		nfreechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+
+	dlist_iter	iter;
+
+	dlist_foreach(iter, &set->blocks)
+	{
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, iter.cur);
+
+		nblocks++;
+		nchunks += block->nchunks;
+		nfreechunks += block->nfree;
+		totalspace += set->blockSize;
+		freespace += (block->endptr - block->freeptr);
+	}
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"Generation: %s: %zu total in %zd blocks (%zd chunks); %zu free (%zd chunks); %zu used\n",
+				set->header.name, totalspace, nblocks, nchunks, freespace,
+				nfreechunks, totalspace - freespace);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += nfreechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenerationCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+GenerationCheck(MemoryContext context)
+{
+	Generation	gen = (Generation) context;
+	char	   *name = gen->header.name;
+	dlist_iter	iter;
+
+	/* walk all blocks in this context */
+	dlist_foreach(iter, &gen->blocks)
+	{
+		int			nfree,
+					nchunks;
+		char	   *ptr;
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, iter.cur);
+
+		/* We can't free more chunks than allocated. */
+		if (block->nfree <= block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p exceeds %d allocated",
+				 name, block->nfree, block, block->nchunks);
+
+		/* Now walk through the chunks and count them. */
+		nfree = 0;
+		nchunks = 0;
+		ptr = ((char *) block) + Generation_BLOCKHDRSZ;
+
+		while (ptr < block->freeptr)
+		{
+			GenerationChunk	chunk = (GenerationChunk)ptr;
+
+			/* move to the next chunk */
+			ptr += (chunk->header.size + Generation_CHUNKHDRSZ);
+
+			/* chunks have both block and context pointers, so check both */
+			if (chunk->block != block)
+				elog(WARNING, "problem in Generation %s: bogus block link in block %p, chunk %p",
+					 name, block, chunk);
+
+			if (chunk->header.context != (MemoryContext) gen)
+				elog(WARNING, "problem in Generation %s: bogus context link in block %p, chunk %p",
+					 name, block, chunk);
+
+			nchunks += 1;
+
+			/* if requested_size==0, the chunk was freed */
+			if (chunk->header.requested_size > 0)
+			{
+				/* if the chunk was not freed, we can trigger valgrind checks */
+				VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+									   sizeof(chunk->header.requested_size));
+
+				/* we're in a no-freelist branch */
+				VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+									   sizeof(chunk->header.requested_size));
+
+				/* now make sure the chunk size is correct */
+				if (chunk->header.size != MAXALIGN(chunk->header.requested_size))
+					elog(WARNING, "problem in Generation %s: bogus chunk size in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* there might be sentinel (thanks to alignment) */
+				if (chunk->header.requested_size < chunk->header.size &&
+					!sentinel_ok(chunk, Generation_CHUNKHDRSZ + chunk->header.requested_size))
+					elog(WARNING, "problem in Generation %s: detected write past chunk end in block %p, chunk %p",
+						 name, block, chunk);
+			}
+			else
+				nfree += 1;
+		}
+
+		/*
+		 * Make sure we got the expected number of allocated and free chunks
+		 * (as tracked in the block header).
+		 */
+		if (nchunks != block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of allocated chunks %d in block %p does not match header %d",
+				 name, nchunks, block, block->nchunks);
+
+		if (nfree != block->nfree)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p does not match header %d",
+				 name, nfree, block, block->nfree);
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index fe6bc90..815a52a 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,8 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
+	 (IsA((context), AllocSetContext) || \
+	  IsA((context), SlabContext) || \
+	  IsA((context), GenerationContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 28aca92..2ef935a 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -279,6 +279,7 @@ typedef enum NodeTag
 	T_MemoryContext,
 	T_AllocSetContext,
 	T_SlabContext,
+	T_GenerationContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index c931e83..83fd885 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -337,20 +337,6 @@ struct ReorderBuffer
 	MemoryContext txn_context;
 	MemoryContext tup_context;
 
-	/*
-	 * Data structure slab cache.
-	 *
-	 * We allocate/deallocate some structures very frequently, to avoid bigger
-	 * overhead we cache some unused ones here.
-	 *
-	 * The maximum number of cached entries is controlled by const variables
-	 * on top of reorderbuffer.c
-	 */
-
-	/* cached ReorderBufferTupleBufs */
-	slist_head	cached_tuplebufs;
-	Size		nr_cached_tuplebufs;
-
 	XLogRecPtr	current_restart_decoding_lsn;
 
 	/* buffer for disk<->memory conversions */
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 5223a4d..57fb47e 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -141,6 +141,11 @@ extern MemoryContext SlabContextCreate(MemoryContext parent,
 				  Size blockSize,
 				  Size chunkSize);
 
+/* gen.c */
+extern MemoryContext GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
-- 
2.5.5

#51Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#50)
3 attachment(s)
Re: PATCH: two slab-like memory allocators

Hi,

On 2017-02-21 01:43:46 +0100, Tomas Vondra wrote:

Attached is v9 of this patch series. This addresses most of the points
raised in the review, namely:

Cool, thanks.

3) get rid of the block-level bitmap tracking free chunks

Instead of the bitmap, I've used a simple singly-linked list, using int32
chunk indexes. Perhaps it could use the slist instead, but I'm not quite
sure MAXALIGN is guaranteed to be greater than pointer.

I'm pretty sure it's guaranteed to be >= sizeof(void*). But this seems
ok, so ...

Attached are changes I made on the path to committing the patch. These
look large enough that I wanted to give you a chance to comment:

- The AllocFreeInfo changes aren't correct for 32bit systems with 64bit,
  longs (like linux, unlike windows) - using %zu is better (z is code
  for size_t sized)
- Split off the reorderbuffer changes
- Removed SlabBlock/SlabChunk / renamed SlabBlockData
-
  +#ifdef MEMORY_CONTEXT_CHECKING
  +#define SLAB_CHUNK_USED \
  +	(offsetof(SlabChunkData, requested_size) + sizeof(Size))
  +#else
  doesn't work anymore, because requested_size isn't a top-level
  field. I first redefined it as (without surrounding ifdef)
  #define SLAB_CHUNK_USED \
  	(offsetof(SlabChunkData, header) + sizeof(StandardChunkHeader))
  but I'm not really sure there's a whole lot of point in the define
  rather than just using sizeof() on the whole thing - there's no trailing
  padding?
- SLAB_CHUNK_PUBLIC and SLAB_BLOCKHDRSZ are unused?
- renamed 'set' variables (and comments) to slab.
- used castNode(SlabContext, context) instead of manual casts
- I rephrased
  + *
  + *	We cache indexes of the first empty chunk on each block (firstFreeChunk),
  + *	and freelist index for blocks with least free chunks (minFreeChunks), so
  + *	that we don't have to search the freelist and block on every SlabAlloc()
  + *	call, which is quite expensive.
    so it's not referencing firstFreeChunk anymore, since that seems to
    make less sense now that firstFreeChunk is essentially just the head
    of the list of free chunks.
- added typedefs.list entries and pgindented slab.c
- "mark the chunk as unused (zero the bit)" referenced non-existant
   bitmap afaics.
- valgrind was triggering on
	block->firstFreeChunk = *(int32 *) SlabChunkGetPointer(chunk);
  because that was previously marked as NOACCESS (via
  wipe_mem). Explicitly marking as DEFINED solves that.
- removed
  * XXX Perhaps we should not be gentle at all and simply fails in all cases,
  * to eliminate the (mostly pointless) uncertainty.
- you'd included MemoryContext tup_context; in 0002, but it's not really
  useful yet (and the comments above it in reorderbuffer.h don't apply)
- SlabFreeInfo/SlabAllocInfo didn't actually compile w/ HAVE_ALLOCINFO
  defined (pre StandardChunkHeader def)
- some minor stuff.

I'm kinda inclined to drop SlabFreeInfo/SlabAllocInfo.

I've not yet looked a lot at the next type of context - I want to get
this much committed first...

I plan to give this another pass sometime this weekend and then push
soon.

- Andres

Attachments:

0001-Make-useful-infrastructure-from-aset.c-generally-ava.patchtext/x-patch; charset=us-asciiDownload
From cec3f8372137d2392ff7ac0ab1b2db11fc96e8b3 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 23 Feb 2017 22:35:44 -0800
Subject: [PATCH 1/3] Make useful infrastructure from aset.c generally
 available.

An upcoming patch introduces a new type of memory context. To avoid
duplicating debugging infrastructure with aset.c move useful pieces to
memdebug.[ch].

While touching aset.c, fix printf format AllocFree* debug macros.

Author: Tomas Vondra
Reviewed-By: Andres Freund
Discussion: https://postgr.es/m/b3b2245c-b37a-e1e5-ebc4-857c914bc747@2ndquadrant.com
---
 src/backend/utils/mmgr/Makefile   |   2 +-
 src/backend/utils/mmgr/aset.c     | 115 +-------------------------------------
 src/backend/utils/mmgr/memdebug.c |  93 ++++++++++++++++++++++++++++++
 src/include/utils/memdebug.h      |  48 ++++++++++++++++
 4 files changed, 144 insertions(+), 114 deletions(-)
 create mode 100644 src/backend/utils/mmgr/memdebug.c

diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index 1842bae386..fc5f793b7f 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o portalmem.o
+OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index 4dfc3ec260..8056c00ae4 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -41,46 +41,6 @@
  *	chunks as chunks.  Anything "large" is passed off to malloc().  Change
  *	the number of freelists to change the small/large boundary.
  *
- *
- *	About CLOBBER_FREED_MEMORY:
- *
- *	If this symbol is defined, all freed memory is overwritten with 0x7F's.
- *	This is useful for catching places that reference already-freed memory.
- *
- *	About MEMORY_CONTEXT_CHECKING:
- *
- *	Since we usually round request sizes up to the next power of 2, there
- *	is often some unused space immediately after a requested data area.
- *	Thus, if someone makes the common error of writing past what they've
- *	requested, the problem is likely to go unnoticed ... until the day when
- *	there *isn't* any wasted space, perhaps because of different memory
- *	alignment on a new platform, or some other effect.  To catch this sort
- *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
- *	the requested space whenever the request is less than the actual chunk
- *	size, and verifies that the byte is undamaged when the chunk is freed.
- *
- *
- *	About USE_VALGRIND and Valgrind client requests:
- *
- *	Valgrind provides "client request" macros that exchange information with
- *	the host Valgrind (if any).  Under !USE_VALGRIND, memdebug.h stubs out
- *	currently-used macros.
- *
- *	When running under Valgrind, we want a NOACCESS memory region both before
- *	and after the allocation.  The chunk header is tempting as the preceding
- *	region, but mcxt.c expects to able to examine the standard chunk header
- *	fields.  Therefore, we use, when available, the requested_size field and
- *	any subsequent padding.  requested_size is made NOACCESS before returning
- *	a chunk pointer to a caller.  However, to reduce client request traffic,
- *	it is kept DEFINED in chunks on the free list.
- *
- *	The rounded-up capacity of the chunk usually acts as a post-allocation
- *	NOACCESS region.  If the request consumes precisely the entire chunk,
- *	there is no such region; another chunk header may immediately follow.  In
- *	that case, Valgrind will not detect access beyond the end of the chunk.
- *
- *	See also the cooperating Valgrind client requests in mcxt.c.
- *
  *-------------------------------------------------------------------------
  */
 
@@ -296,10 +256,10 @@ static const unsigned char LogTable256[256] =
  */
 #ifdef HAVE_ALLOCINFO
 #define AllocFreeInfo(_cxt, _chunk) \
-			fprintf(stderr, "AllocFree: %s: %p, %d\n", \
+			fprintf(stderr, "AllocFree: %s: %p, %zu\n", \
 				(_cxt)->header.name, (_chunk), (_chunk)->size)
 #define AllocAllocInfo(_cxt, _chunk) \
-			fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \
+			fprintf(stderr, "AllocAlloc: %s: %p, %zu\n", \
 				(_cxt)->header.name, (_chunk), (_chunk)->size)
 #else
 #define AllocFreeInfo(_cxt, _chunk)
@@ -345,77 +305,6 @@ AllocSetFreeIndex(Size size)
 	return idx;
 }
 
-#ifdef CLOBBER_FREED_MEMORY
-
-/* Wipe freed memory for debugging purposes */
-static void
-wipe_mem(void *ptr, size_t size)
-{
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
-	memset(ptr, 0x7F, size);
-	VALGRIND_MAKE_MEM_NOACCESS(ptr, size);
-}
-#endif
-
-#ifdef MEMORY_CONTEXT_CHECKING
-static void
-set_sentinel(void *base, Size offset)
-{
-	char	   *ptr = (char *) base + offset;
-
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr, 1);
-	*ptr = 0x7E;
-	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
-}
-
-static bool
-sentinel_ok(const void *base, Size offset)
-{
-	const char *ptr = (const char *) base + offset;
-	bool		ret;
-
-	VALGRIND_MAKE_MEM_DEFINED(ptr, 1);
-	ret = *ptr == 0x7E;
-	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
-
-	return ret;
-}
-#endif
-
-#ifdef RANDOMIZE_ALLOCATED_MEMORY
-
-/*
- * Fill a just-allocated piece of memory with "random" data.  It's not really
- * very random, just a repeating sequence with a length that's prime.  What
- * we mainly want out of it is to have a good probability that two palloc's
- * of the same number of bytes start out containing different data.
- *
- * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
- * we fill it.  Filling the region makes it DEFINED, so make it UNDEFINED
- * again afterward.  Whether to finally make it UNDEFINED or NOACCESS is
- * fairly arbitrary.  UNDEFINED is more convenient for AllocSetRealloc(), and
- * other callers have no preference.
- */
-static void
-randomize_mem(char *ptr, size_t size)
-{
-	static int	save_ctr = 1;
-	size_t		remaining = size;
-	int			ctr;
-
-	ctr = save_ctr;
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
-	while (remaining-- > 0)
-	{
-		*ptr++ = ctr;
-		if (++ctr > 251)
-			ctr = 1;
-	}
-	VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
-	save_ctr = ctr;
-}
-#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
-
 
 /*
  * Public routines
diff --git a/src/backend/utils/mmgr/memdebug.c b/src/backend/utils/mmgr/memdebug.c
new file mode 100644
index 0000000000..5f603d29a7
--- /dev/null
+++ b/src/backend/utils/mmgr/memdebug.c
@@ -0,0 +1,93 @@
+/*-------------------------------------------------------------------------
+ *
+ * memdebug.c
+ *	  Declarations used in memory context implementations, not part of the
+ *	  public API of the memory management subsystem.
+ *
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/utils/memdebug.c
+ *
+ *
+ *	About CLOBBER_FREED_MEMORY:
+ *
+ *	If this symbol is defined, all freed memory is overwritten with 0x7F's.
+ *	This is useful for catching places that reference already-freed memory.
+ *
+ *	About MEMORY_CONTEXT_CHECKING:
+ *
+ *	Since we usually round request sizes up to the next power of 2, there
+ *	is often some unused space immediately after a requested data area.
+ *	Thus, if someone makes the common error of writing past what they've
+ *	requested, the problem is likely to go unnoticed ... until the day when
+ *	there *isn't* any wasted space, perhaps because of different memory
+ *	alignment on a new platform, or some other effect.  To catch this sort
+ *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ *	the requested space whenever the request is less than the actual chunk
+ *	size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *
+ *	About USE_VALGRIND and Valgrind client requests:
+ *
+ *	Valgrind provides "client request" macros that exchange information with
+ *	the host Valgrind (if any).  Under !USE_VALGRIND, memdebug.h stubs out
+ *	currently-used macros.
+ *
+ *	When running under Valgrind, we want a NOACCESS memory region both before
+ *	and after the allocation.  The chunk header is tempting as the preceding
+ *	region, but mcxt.c expects to able to examine the standard chunk header
+ *	fields.  Therefore, we use, when available, the requested_size field and
+ *	any subsequent padding.  requested_size is made NOACCESS before returning
+ *	a chunk pointer to a caller.  However, to reduce client request traffic,
+ *	it is kept DEFINED in chunks on the free list.
+ *
+ *	The rounded-up capacity of the chunk usually acts as a post-allocation
+ *	NOACCESS region.  If the request consumes precisely the entire chunk,
+ *	there is no such region; another chunk header may immediately follow.  In
+ *	that case, Valgrind will not detect access beyond the end of the chunk.
+ *
+ *	See also the cooperating Valgrind client requests in mcxt.c.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+/*
+ * Fill a just-allocated piece of memory with "random" data.  It's not really
+ * very random, just a repeating sequence with a length that's prime.  What
+ * we mainly want out of it is to have a good probability that two palloc's
+ * of the same number of bytes start out containing different data.
+ *
+ * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
+ * we fill it.  Filling the region makes it DEFINED, so make it UNDEFINED
+ * again afterward.  Whether to finally make it UNDEFINED or NOACCESS is
+ * fairly arbitrary.  UNDEFINED is more convenient for SlabRealloc(), and
+ * other callers have no preference.
+ */
+void
+randomize_mem(char *ptr, size_t size)
+{
+	static int	save_ctr = 1;
+	size_t		remaining = size;
+	int			ctr;
+
+	ctr = save_ctr;
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	while (remaining-- > 0)
+	{
+		*ptr++ = ctr;
+		if (++ctr > 251)
+			ctr = 1;
+	}
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
+	save_ctr = ctr;
+}
+
+#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
diff --git a/src/include/utils/memdebug.h b/src/include/utils/memdebug.h
index 90eb926ddf..206e6ce002 100644
--- a/src/include/utils/memdebug.h
+++ b/src/include/utils/memdebug.h
@@ -31,4 +31,52 @@
 #define VALGRIND_MEMPOOL_CHANGE(context, optr, nptr, size)	do {} while (0)
 #endif
 
+
+#ifdef CLOBBER_FREED_MEMORY
+
+/* Wipe freed memory for debugging purposes */
+static inline void
+wipe_mem(void *ptr, size_t size)
+{
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	memset(ptr, 0x7F, size);
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, size);
+}
+
+#endif   /* CLOBBER_FREED_MEMORY */
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+static inline void
+set_sentinel(void *base, Size offset)
+{
+	char	   *ptr = (char *) base + offset;
+
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, 1);
+	*ptr = 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+}
+
+static inline bool
+sentinel_ok(const void *base, Size offset)
+{
+	const char *ptr = (const char *) base + offset;
+	bool		ret;
+
+	VALGRIND_MAKE_MEM_DEFINED(ptr, 1);
+	ret = *ptr == 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+
+	return ret;
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+void		randomize_mem(char *ptr, size_t size);
+
+#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
+
+
 #endif   /* MEMDEBUG_H */
-- 
2.11.0.22.g8d7a455.dirty

0002-Add-Slab-MemoryContext-implementation-for-efficient-.patchtext/x-patch; charset=us-asciiDownload
From e7b4ed53e67974663f5ea6863bb83bfcb76ada80 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 24 Feb 2017 13:51:44 -0800
Subject: [PATCH 2/3] Add "Slab" MemoryContext implementation for efficient
 equal-sized allocations.

Author: Tomas Vondra, editorialized by Andres Freund
Reviewed-By: Andres Freund, Petr Jelinek, Robert Haas, Jim Nasby,
Discussion: https://postgr.es/m/d15dff83-0b37-28ed-0809-95a5cc7292ad@2ndquadrant.com
---
 src/backend/utils/mmgr/Makefile  |   2 +-
 src/backend/utils/mmgr/slab.c    | 790 +++++++++++++++++++++++++++++++++++++++
 src/include/nodes/memnodes.h     |   2 +-
 src/include/nodes/nodes.h        |   1 +
 src/include/utils/memutils.h     |   9 +
 src/tools/pgindent/typedefs.list |   3 +
 6 files changed, 805 insertions(+), 2 deletions(-)
 create mode 100644 src/backend/utils/mmgr/slab.c

diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index fc5f793b7f..cd0e803253 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o
+OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
new file mode 100644
index 0000000000..c9e99c12b2
--- /dev/null
+++ b/src/backend/utils/mmgr/slab.c
@@ -0,0 +1,790 @@
+/*-------------------------------------------------------------------------
+ *
+ * slab.c
+ *	  SLAB allocator definitions.
+ *
+ * SLAB is a MemoryContext implementation designed for cases where large
+ * numbers of equally-sized objects are allocated (and freed).
+ *
+ *
+ * Portions Copyright (c) 2017, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/slab.c
+ *
+ *
+ * NOTE:
+ *	The constant allocation size allows significant simplification and various
+ *	optimizations over more general purpose allocators. The blocks are carved
+ *	into chunks of exactly the right size (plus alignment), not wasting any
+ *	memory.
+ *
+ *	The information about free chunks is maintained both at the block level and
+ *	global (context) level. This is possible as the chunk size (and thus also
+ *	the number of chunks per block) is fixed.
+ *
+ *	On each block, free chunks are tracked in a simple linked list. Contents
+ *	of free chunks is replaced with an index of the next free chunk, forming
+ *	a very simple linked list. Each block also contains a counter of free
+ *	chunks. Combined with the local block-level freelist, it makes it trivial
+ *	to eventually free the whole block.
+ *
+ *	At the context level, we use 'freelist' to track blocks ordered by number
+ *	of free chunks, starting with blocks having a single allocated chunk, and
+ *	with completely full blocks on the tail.
+ *
+ *	This also allows various optimizations - for example when searching for
+ *	free chunk, the allocator reuses space from the fullest blocks first, in
+ *	the hope that some of the less full blocks will get completely empty (and
+ *	returned back to the OS).
+ *
+ *	For each block, we maintain pointer to the first free chunk - this is quite
+ *	cheap and allows us to skip all the preceding used chunks, eliminating
+ *	a significant number of lookups in many common usage patters. In the worst
+ *	case this performs as if the pointer was not maintained.
+ *
+ *	We cache the freelist index for the blocks with the fewest free chunks
+ *	(minFreeChunks), so that we don't have to search the freelist on every
+ *	SlabAlloc() call, which is quite expensive.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"
+
+
+#define SLAB_CHUNKHDRSZ MAXALIGN(sizeof(SlabChunk))
+
+/* Portion of SLAB_CHUNKHDRSZ excluding trailing padding. */
+#define SLAB_CHUNK_USED \
+	(offsetof(SlabChunk, header) + sizeof(StandardChunkHeader))
+
+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+	/* Allocation parameters for this context: */
+	Size		chunkSize;		/* chunk size */
+	Size		fullChunkSize;	/* chunk size including header and alignment */
+	Size		blockSize;		/* block size */
+	int			chunksPerBlock; /* number of chunks per block */
+	int			minFreeChunks;	/* min number of free chunks in any block */
+	int			nblocks;		/* number of blocks allocated */
+	/* blocks with free space, grouped by number of free chunks: */
+	dlist_head	freelist[FLEXIBLE_ARRAY_MEMBER];
+} SlabContext;
+
+/*
+ * SlabBlock
+ *		Structure of a single block in SLAB allocator.
+ *
+ * node: doubly-linked list of blocks in global freelist
+ * nfree: number of free chunks in this block
+ * firstFreeChunk: index of the first free chunk
+ */
+typedef struct SlabBlock
+{
+	dlist_node	node;			/* doubly-linked list */
+	int			nfree;			/* number of free chunks */
+	int			firstFreeChunk; /* index of the first free chunk in the block */
+} SlabBlock;
+
+/*
+ * SlabChunk
+ *		The prefix of each piece of memory in an SlabBlock
+ */
+typedef struct SlabChunk
+{
+	/* block owning this chunk */
+	void	   *block;
+
+	/* include StandardChunkHeader because mcxt.c expects that */
+	StandardChunkHeader header;
+
+} SlabChunk;
+
+
+#define SlabPointerGetChunk(ptr)	\
+	((SlabChunk *)(((char *)(ptr)) - SLAB_CHUNKHDRSZ))
+#define SlabChunkGetPointer(chk)	\
+	((void *)(((char *)(chk)) + SLAB_CHUNKHDRSZ))
+#define SlabBlockGetChunk(slab, block, idx) \
+	((SlabChunk *) ((char *) (block) + sizeof(SlabBlock)	\
+					+ (idx * slab->fullChunkSize)))
+#define SlabBlockStart(block)	\
+	((char *) block + sizeof(SlabBlock))
+#define SlabChunkIndex(slab, block, chunk)	\
+	(((char *) chunk - SlabBlockStart(block)) / slab->fullChunkSize)
+
+/*
+ * These functions implement the MemoryContext API for Slab contexts.
+ */
+static void *SlabAlloc(MemoryContext context, Size size);
+static void SlabFree(MemoryContext context, void *pointer);
+static void *SlabRealloc(MemoryContext context, void *pointer, Size size);
+static void SlabInit(MemoryContext context);
+static void SlabReset(MemoryContext context);
+static void SlabDelete(MemoryContext context);
+static Size SlabGetChunkSpace(MemoryContext context, void *pointer);
+static bool SlabIsEmpty(MemoryContext context);
+static void SlabStats(MemoryContext context, int level, bool print,
+		  MemoryContextCounters *totals);
+#ifdef MEMORY_CONTEXT_CHECKING
+static void SlabCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Slab contexts.
+ */
+static MemoryContextMethods SlabMethods = {
+	SlabAlloc,
+	SlabFree,
+	SlabRealloc,
+	SlabInit,
+	SlabReset,
+	SlabDelete,
+	SlabGetChunkSpace,
+	SlabIsEmpty,
+	SlabStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,SlabCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabFree: %s: %p, %zu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->header.size)
+#define SlabAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabAlloc: %s: %p, %zu\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->header.size)
+#else
+#define SlabFreeInfo(_cxt, _chunk)
+#define SlabAllocInfo(_cxt, _chunk)
+#endif
+
+
+/*
+ * SlabContextCreate
+ *		Create a new Slab context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (for debugging --- string will be copied)
+ * blockSize: allocation block size
+ * chunkSize: allocation chunk size
+ *
+ * The chunkSize may not exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - MAXALIGN(sizeof(SlabBlock)) - SLAB_CHUNKHDRSZ
+ *
+ */
+MemoryContext
+SlabContextCreate(MemoryContext parent,
+				  const char *name,
+				  Size blockSize,
+				  Size chunkSize)
+{
+	int			chunksPerBlock;
+	Size		fullChunkSize;
+	Size		freelistSize;
+	SlabContext *slab;
+
+	/* otherwise the linked list inside freed chunk isn't guaranteed to fit */
+	StaticAssertStmt(MAXIMUM_ALIGNOF >= sizeof(int),
+					 "MAXALIGN too small to fit int32");
+
+	/* chunk, including SLAB header (both addresses nicely aligned) */
+	fullChunkSize = MAXALIGN(sizeof(SlabChunk) + MAXALIGN(chunkSize));
+
+	/* Make sure the block can store at least one chunk. */
+	if (blockSize - sizeof(SlabBlock) < fullChunkSize)
+		elog(ERROR, "block size %ld for slab is too small for %ld chunks",
+			 blockSize, chunkSize);
+
+	/* Compute maximum number of chunks per block */
+	chunksPerBlock = (blockSize - sizeof(SlabBlock)) / fullChunkSize;
+
+	/* The freelist starts with 0, ends with chunksPerBlock. */
+	freelistSize = sizeof(dlist_head) * (chunksPerBlock + 1);
+
+	/* if we can't fit at least one chunk into the block, we're hosed */
+	Assert(chunksPerBlock > 0);
+
+	/* make sure the chunks actually fit on the block	*/
+	Assert((fullChunkSize * chunksPerBlock) + sizeof(SlabBlock) <= blockSize);
+
+	/* Do the type-independent part of context creation */
+	slab = (SlabContext *)
+		MemoryContextCreate(T_SlabContext,
+							(offsetof(SlabContext, freelist) +freelistSize),
+							&SlabMethods,
+							parent,
+							name);
+
+	slab->blockSize = blockSize;
+	slab->chunkSize = chunkSize;
+	slab->fullChunkSize = fullChunkSize;
+	slab->chunksPerBlock = chunksPerBlock;
+	slab->nblocks = 0;
+	slab->minFreeChunks = 0;
+
+	return (MemoryContext) slab;
+}
+
+/*
+ * SlabInit
+ *		Context-type-specific initialization routine.
+ */
+static void
+SlabInit(MemoryContext context)
+{
+	int			i;
+	SlabContext *slab = castNode(SlabContext, context);
+
+	Assert(slab);
+
+	/* initialize the freelist slots */
+	for (i = 0; i < (slab->chunksPerBlock + 1); i++)
+		dlist_init(&slab->freelist[i]);
+}
+
+/*
+ * SlabReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+SlabReset(MemoryContext context)
+{
+	int			i;
+	SlabContext *slab = castNode(SlabContext, context);
+
+	Assert(slab);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	SlabCheck(context);
+#endif
+
+	/* walk over freelists and free the blocks */
+	for (i = 0; i <= slab->chunksPerBlock; i++)
+	{
+		dlist_mutable_iter miter;
+
+		dlist_foreach_modify(miter, &slab->freelist[i])
+		{
+			SlabBlock  *block = dlist_container(SlabBlock, node, miter.cur);
+
+			dlist_delete(miter.cur);
+
+#ifdef CLOBBER_FREED_MEMORY
+			wipe_mem(block, slab->blockSize);
+#endif
+			free(block);
+			slab->nblocks--;
+		}
+	}
+
+	slab->minFreeChunks = 0;
+
+	Assert(slab->nblocks == 0);
+}
+
+/*
+ * SlabDelete
+ *		Frees all memory which is allocated in the given slab, in preparation
+ *		for deletion of the slab. We simply call SlabReset().
+ */
+static void
+SlabDelete(MemoryContext context)
+{
+	/* just reset the context */
+	SlabReset(context);
+}
+
+/*
+ * SlabAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the slab.
+ */
+static void *
+SlabAlloc(MemoryContext context, Size size)
+{
+	SlabContext *slab = castNode(SlabContext, context);
+	SlabBlock  *block;
+	SlabChunk  *chunk;
+	int			idx;
+
+	Assert(slab);
+
+	Assert((slab->minFreeChunks >= 0) &&
+		   (slab->minFreeChunks < slab->chunksPerBlock));
+
+	/* make sure we only allow correct request size */
+	if (size != slab->chunkSize)
+		elog(ERROR, "unexpected alloc chunk size %ld (expected %ld)",
+			 size, slab->chunkSize);
+
+	/*
+	 * If there are no free chunks in any existing block, create a new block
+	 * and put it to the last freelist bucket.
+	 *
+	 * slab->minFreeChunks == 0 means there are no blocks with free chunks,
+	 * thanks to how minFreeChunks is updated at the end of SlabAlloc().
+	 */
+	if (slab->minFreeChunks == 0)
+	{
+		block = (SlabBlock *) malloc(slab->blockSize);
+
+		if (block == NULL)
+			return NULL;
+
+		block->nfree = slab->chunksPerBlock;
+		block->firstFreeChunk = 0;
+
+		/*
+		 * Put all the chunks on a freelist. Walk the chunks and point each
+		 * one to the next one.
+		 */
+		for (idx = 0; idx < slab->chunksPerBlock; idx++)
+		{
+			chunk = SlabBlockGetChunk(slab, block, idx);
+			*(int32 *) SlabChunkGetPointer(chunk) = (idx + 1);
+		}
+
+		/*
+		 * And add it to the last freelist with all chunks empty.
+		 *
+		 * XXX We know there are no blocks in the freelist, otherwise we
+		 * wouldn't need a new block.
+		 */
+		Assert(dlist_is_empty(&slab->freelist[slab->chunksPerBlock]));
+
+		dlist_push_head(&slab->freelist[slab->chunksPerBlock], &block->node);
+
+		slab->minFreeChunks = slab->chunksPerBlock;
+		slab->nblocks += 1;
+	}
+
+	/* grab the block from the freelist (even the new block is there) */
+	block = dlist_head_element(SlabBlock, node,
+							   &slab->freelist[slab->minFreeChunks]);
+
+	/* make sure we actually got a valid block, with matching nfree */
+	Assert(block != NULL);
+	Assert(slab->minFreeChunks == block->nfree);
+	Assert(block->nfree > 0);
+
+	/* we know index of the first free chunk in the block */
+	idx = block->firstFreeChunk;
+
+	/* make sure the chunk index is valid, and that it's marked as empty */
+	Assert((idx >= 0) && (idx < slab->chunksPerBlock));
+
+	/* compute the chunk location block start (after the block header) */
+	chunk = SlabBlockGetChunk(slab, block, idx);
+
+	/*
+	 * Update the block nfree count, and also the minFreeChunks as we've
+	 * decreased nfree for a block with the minimum number of free chunks
+	 * (because that's how we chose the block).
+	 */
+	block->nfree--;
+	slab->minFreeChunks = block->nfree;
+
+	/*
+	 * Remove the chunk from the freelist head. The index of the next free
+	 * chunk is stored in the chunk itself.
+	 */
+	VALGRIND_MAKE_MEM_DEFINED(chunk, SlabChunkGetPointer(chunk));
+	block->firstFreeChunk = *(int32 *) SlabChunkGetPointer(chunk);
+
+	Assert(block->firstFreeChunk >= 0);
+	Assert(block->firstFreeChunk <= slab->chunksPerBlock);
+
+	Assert((block->nfree != 0 &&
+			block->firstFreeChunk < slab->chunksPerBlock) ||
+		   (block->nfree == 0 &&
+			block->firstFreeChunk == slab->chunksPerBlock));
+
+	/* move the whole block to the right place in the freelist */
+	dlist_delete(&block->node);
+	dlist_push_head(&slab->freelist[block->nfree], &block->node);
+
+	/*
+	 * And finally update minFreeChunks, i.e. the index to the block with the
+	 * lowest number of free chunks. We only need to do that when the block
+	 * got full (otherwise we know the current block is the right one). We'll
+	 * simply walk the freelist until we find a non-empty entry.
+	 */
+	if (slab->minFreeChunks == 0)
+	{
+		for (idx = 1; idx <= slab->chunksPerBlock; idx++)
+		{
+			if (dlist_is_empty(&slab->freelist[idx]))
+				continue;
+
+			/* found a non-empty freelist */
+			slab->minFreeChunks = idx;
+			break;
+		}
+	}
+
+	if (slab->minFreeChunks == slab->chunksPerBlock)
+		slab->minFreeChunks = 0;
+
+	/* Prepare to initialize the chunk header. */
+	VALGRIND_MAKE_MEM_UNDEFINED(chunk, SLAB_CHUNK_USED);
+
+	chunk->block = (void *) block;
+
+	chunk->header.context = (MemoryContext) slab;
+	chunk->header.size = MAXALIGN(size);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	chunk->header.requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+							   sizeof(chunk->header.requested_size));
+	/* slab mark to catch clobber of "unused" space */
+	if (size < chunk->header.size)
+		set_sentinel(SlabChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) SlabChunkGetPointer(chunk), size);
+#endif
+
+	SlabAllocInfo(slab, chunk);
+	return SlabChunkGetPointer(chunk);
+}
+
+/*
+ * SlabFree
+ *		Frees allocated memory; memory is removed from the slab.
+ */
+static void
+SlabFree(MemoryContext context, void *pointer)
+{
+	int			idx;
+	SlabContext *slab = castNode(SlabContext, context);
+	SlabChunk  *chunk = SlabPointerGetChunk(pointer);
+	SlabBlock  *block = chunk->block;
+
+	SlabFreeInfo(slab, chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+							  sizeof(chunk->header.requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->header.requested_size < chunk->header.size)
+		if (!sentinel_ok(pointer, chunk->header.requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 slab->header.name, chunk);
+#endif
+
+	/* compute index of the chunk with respect to block start */
+	idx = SlabChunkIndex(slab, block, chunk);
+
+	/* add chunk to freelist, and update block nfree count */
+	*(int32 *) pointer = block->firstFreeChunk;
+	block->firstFreeChunk = idx;
+	block->nfree++;
+
+	Assert(block->nfree > 0);
+	Assert(block->nfree <= slab->chunksPerBlock);
+
+#ifdef CLOBBER_FREED_MEMORY
+	/* XXX don't wipe the int32 index, used for block-level freelist */
+	wipe_mem((char *) pointer + sizeof(int32),
+			 chunk->header.size - sizeof(int32));
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->header.requested_size = 0;
+#endif
+
+	/* remove the block from a freelist */
+	dlist_delete(&block->node);
+
+	/*
+	 * See if we need to update the minFreeChunks field for the slab - we only
+	 * need to do that if there the block had that number of free chunks
+	 * before we freed one. In that case, we check if there still are blocks
+	 * in the original freelist and we either keep the current value (if there
+	 * still are blocks) or increment it by one (the new block is still the
+	 * one with minimum free chunks).
+	 *
+	 * The one exception is when the block will get completely free - in that
+	 * case we will free it, se we can't use it for minFreeChunks. It however
+	 * means there are no more blocks with free chunks.
+	 */
+	if (slab->minFreeChunks == (block->nfree - 1))
+	{
+		/* Have we removed the last chunk from the freelist? */
+		if (dlist_is_empty(&slab->freelist[slab->minFreeChunks]))
+		{
+			/* but if we made the block entirely free, we'll free it */
+			if (block->nfree == slab->chunksPerBlock)
+				slab->minFreeChunks = 0;
+			else
+				slab->minFreeChunks++;
+		}
+	}
+
+	/* If the block is now completely empty, free it. */
+	if (block->nfree == slab->chunksPerBlock)
+	{
+		free(block);
+		slab->nblocks--;
+	}
+	else
+		dlist_push_head(&slab->freelist[block->nfree], &block->node);
+
+	Assert(slab->nblocks >= 0);
+}
+
+/*
+ * SlabRealloc
+ *		As Slab is designed for allocating equally-sized chunks of memory, it
+ *		can't really do an actual realloc.
+ *
+ * We try to be gentle and allow calls with exactly the same size as in that
+ * case we can simply return the same chunk. When the size differs, we fail
+ * with assert failure or return NULL.
+ *
+ * We might be even support cases with (size < chunkSize). That however seems
+ * rather pointless - Slab is meant for chunks of constant size, and moreover
+ * realloc is usually used to enlarge the chunk.
+ */
+static void *
+SlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+	SlabContext *slab = castNode(SlabContext, context);
+
+	Assert(slab);
+
+	/* can't do actual realloc with slab, but let's try to be gentle */
+	if (size == slab->chunkSize)
+		return pointer;
+
+	elog(ERROR, "slab allocator does not support realloc()");
+}
+
+/*
+ * SlabGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+SlabGetChunkSpace(MemoryContext context, void *pointer)
+{
+	SlabChunk  *chunk = SlabPointerGetChunk(pointer);
+
+	return chunk->header.size + SLAB_CHUNKHDRSZ;
+}
+
+/*
+ * SlabIsEmpty
+ *		Is an Slab empty of any allocated space?
+ */
+static bool
+SlabIsEmpty(MemoryContext context)
+{
+	SlabContext *slab = castNode(SlabContext, context);
+
+	Assert(slab);
+
+	return (slab->nblocks == 0);
+}
+
+/*
+ * SlabStats
+ *		Compute stats about memory consumption of an Slab.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Slab into *totals.
+ */
+static void
+SlabStats(MemoryContext context, int level, bool print,
+		  MemoryContextCounters *totals)
+{
+	SlabContext *slab = castNode(SlabContext, context);
+	Size		nblocks = 0;
+	Size		freechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+	int			i;
+
+	Assert(slab);
+
+	for (i = 0; i <= slab->chunksPerBlock; i++)
+	{
+		dlist_iter	iter;
+
+		dlist_foreach(iter, &slab->freelist[i])
+		{
+			SlabBlock  *block = dlist_container(SlabBlock, node, iter.cur);
+
+			nblocks++;
+			totalspace += slab->blockSize;
+			freespace += slab->fullChunkSize * block->nfree;
+			freechunks += block->nfree;
+		}
+	}
+
+	if (print)
+	{
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+				"Slab: %s: %zu total in %zd blocks; %zu free (%zd chunks); %zu used\n",
+				slab->header.name, totalspace, nblocks, freespace, freechunks,
+				totalspace - freespace);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += freechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * SlabCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+SlabCheck(MemoryContext context)
+{
+	int			i;
+	SlabContext *slab = castNode(SlabContext, context);
+	char	   *name = slab->header.name;
+	char	   *freechunks;
+
+	Assert(slab);
+	Assert(slab->chunksPerBlock > 0);
+
+	/* bitmap of free chunks on a block */
+	freechunks = palloc(slab->chunksPerBlock * sizeof(bool));
+
+	/* walk all the freelists */
+	for (i = 0; i <= slab->chunksPerBlock; i++)
+	{
+		int			j,
+					nfree;
+		dlist_iter	iter;
+
+		/* walk all blocks on this freelist */
+		dlist_foreach(iter, &slab->freelist[i])
+		{
+			int			idx;
+			SlabBlock  *block = dlist_container(SlabBlock, node, iter.cur);
+
+			/*
+			 * Make sure the number of free chunks (in the block header)
+			 * matches position in the freelist.
+			 */
+			if (block->nfree != i)
+				elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match freelist %d",
+					 name, block->nfree, block, i);
+
+			/* reset the bitmap of free chunks for this block */
+			memset(freechunks, 0, (slab->chunksPerBlock * sizeof(bool)));
+			idx = block->firstFreeChunk;
+
+			/*
+			 * Now walk through the chunks, count the free ones and also
+			 * perform some additional checks for the used ones. As the chunk
+			 * freelist is stored within the chunks themselves, we have to
+			 * walk through the chunks and construct our own bitmap.
+			 */
+
+			nfree = 0;
+			while (idx < slab->chunksPerBlock)
+			{
+				SlabChunk  *chunk;
+
+				/* count the chunk as free, add it to the bitmap */
+				nfree++;
+				freechunks[idx] = true;
+
+				/* read index of the next free chunk */
+				chunk = SlabBlockGetChunk(slab, block, idx);
+				idx = *(int32 *) SlabChunkGetPointer(chunk);
+			}
+
+			for (j = 0; j < slab->chunksPerBlock; j++)
+			{
+				/* non-zero bit in the bitmap means chunk the chunk is used */
+				if (!freechunks[j])
+				{
+					SlabChunk  *chunk = SlabBlockGetChunk(slab, block, j);
+
+					VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
+									   sizeof(chunk->header.requested_size));
+
+					/* we're in a no-freelist branch */
+					VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
+									   sizeof(chunk->header.requested_size));
+
+					/* chunks have both block and slab pointers, so check both */
+					if (chunk->block != block)
+						elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p",
+							 name, block, chunk);
+
+					if (chunk->header.context != (MemoryContext) slab)
+						elog(WARNING, "problem in slab %s: bogus slab link in block %p, chunk %p",
+							 name, block, chunk);
+
+					/* now make sure the chunk size is correct */
+					if (chunk->header.size != MAXALIGN(slab->chunkSize))
+						elog(WARNING, "problem in slab %s: bogus chunk size in block %p, chunk %p",
+							 name, block, chunk);
+
+					/* now make sure the chunk size is correct */
+					if (chunk->header.requested_size != slab->chunkSize)
+						elog(WARNING, "problem in slab %s: bogus chunk requested size in block %p, chunk %p",
+							 name, block, chunk);
+
+					/* there might be sentinel (thanks to alignment) */
+					if (chunk->header.requested_size < chunk->header.size &&
+						!sentinel_ok(chunk, SLAB_CHUNKHDRSZ + chunk->header.requested_size))
+						elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
+							 name, block, chunk);
+				}
+			}
+
+			/*
+			 * Make sure we got the expected number of free chunks (as tracked
+			 * in the block header).
+			 */
+			if (nfree != block->nfree)
+				elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match bitmap %d",
+					 name, block->nfree, block, nfree);
+		}
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index e487d172fc..fe6bc903b3 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,6 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext)))
+	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 95dd8baadd..28aca928a8 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -278,6 +278,7 @@ typedef enum NodeTag
 	 */
 	T_MemoryContext,
 	T_AllocSetContext,
+	T_SlabContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 1d1035e374..5223a4da39 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -135,6 +135,12 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
 					  Size initBlockSize,
 					  Size maxBlockSize);
 
+/* slab.c */
+extern MemoryContext SlabContextCreate(MemoryContext parent,
+				  const char *name,
+				  Size blockSize,
+				  Size chunkSize);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
@@ -171,4 +177,7 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
  */
 #define ALLOCSET_SEPARATE_THRESHOLD  8192
 
+#define SLAB_DEFAULT_BLOCK_SIZE		(8 * 1024)
+#define SLAB_LARGE_BLOCK_SIZE		(8 * 1024 * 1024)
+
 #endif   /* MEMUTILS_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 9f876ae264..1fd7ec4256 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1941,6 +1941,9 @@ SimpleStringList
 SimpleStringListCell
 SingleBoundSortItem
 Size
+SlabBlock
+SlabContext
+SlabChunk
 SlabSlot
 SlotNumber
 SlruCtl
-- 
2.11.0.22.g8d7a455.dirty

0003-Use-the-new-Slab-context-for-some-allocations-in-reo.patchtext/x-patch; charset=us-asciiDownload
From 2e10d6e62ba4756042fef96e411efb33128e67e1 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 24 Feb 2017 14:02:59 -0800
Subject: [PATCH 3/3] Use the new Slab context for some allocations in
 reorderbuffer.h.

Author: Tomas Vondra
Discussion: https://postgr.es/m/d15dff83-0b37-28ed-0809-95a5cc7292ad@2ndquadrant.com
---
 src/backend/replication/logical/reorderbuffer.c | 74 ++++++-------------------
 src/include/replication/reorderbuffer.h         | 14 ++---
 2 files changed, 22 insertions(+), 66 deletions(-)

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 7dc97fa796..8aac670bd4 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -156,10 +156,7 @@ static const Size max_changes_in_memory = 4096;
  * major bottleneck, especially when spilling to disk while decoding batch
  * workloads.
  */
-static const Size max_cached_changes = 4096 * 2;
 static const Size max_cached_tuplebufs = 4096 * 2;		/* ~8MB */
-static const Size max_cached_transactions = 512;
-
 
 /* ---------------------------------------
  * primary reorderbuffer support routines
@@ -241,6 +238,16 @@ ReorderBufferAllocate(void)
 
 	buffer->context = new_ctx;
 
+	buffer->change_context = SlabContextCreate(new_ctx,
+											   "Change",
+											   SLAB_DEFAULT_BLOCK_SIZE,
+											   sizeof(ReorderBufferChange));
+
+	buffer->txn_context = SlabContextCreate(new_ctx,
+											"TXN",
+											SLAB_DEFAULT_BLOCK_SIZE,
+											sizeof(ReorderBufferTXN));
+
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
 	hash_ctl.hcxt = buffer->context;
@@ -251,8 +258,6 @@ ReorderBufferAllocate(void)
 	buffer->by_txn_last_xid = InvalidTransactionId;
 	buffer->by_txn_last_txn = NULL;
 
-	buffer->nr_cached_transactions = 0;
-	buffer->nr_cached_changes = 0;
 	buffer->nr_cached_tuplebufs = 0;
 
 	buffer->outbuf = NULL;
@@ -261,8 +266,6 @@ ReorderBufferAllocate(void)
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
 	dlist_init(&buffer->toplevel_by_lsn);
-	dlist_init(&buffer->cached_transactions);
-	dlist_init(&buffer->cached_changes);
 	slist_init(&buffer->cached_tuplebufs);
 
 	return buffer;
@@ -291,19 +294,8 @@ ReorderBufferGetTXN(ReorderBuffer *rb)
 {
 	ReorderBufferTXN *txn;
 
-	/* check the slab cache */
-	if (rb->nr_cached_transactions > 0)
-	{
-		rb->nr_cached_transactions--;
-		txn = (ReorderBufferTXN *)
-			dlist_container(ReorderBufferTXN, node,
-							dlist_pop_head_node(&rb->cached_transactions));
-	}
-	else
-	{
-		txn = (ReorderBufferTXN *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN));
-	}
+	txn = (ReorderBufferTXN *)
+		MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
 
 	memset(txn, 0, sizeof(ReorderBufferTXN));
 
@@ -344,18 +336,7 @@ ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
 		txn->invalidations = NULL;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_transactions < max_cached_transactions)
-	{
-		rb->nr_cached_transactions++;
-		dlist_push_head(&rb->cached_transactions, &txn->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN));
-		VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node));
-	}
-	else
-	{
-		pfree(txn);
-	}
+	pfree(txn);
 }
 
 /*
@@ -366,19 +347,8 @@ ReorderBufferGetChange(ReorderBuffer *rb)
 {
 	ReorderBufferChange *change;
 
-	/* check the slab cache */
-	if (rb->nr_cached_changes)
-	{
-		rb->nr_cached_changes--;
-		change = (ReorderBufferChange *)
-			dlist_container(ReorderBufferChange, node,
-							dlist_pop_head_node(&rb->cached_changes));
-	}
-	else
-	{
-		change = (ReorderBufferChange *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange));
-	}
+	change = (ReorderBufferChange *)
+		MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
 
 	memset(change, 0, sizeof(ReorderBufferChange));
 	return change;
@@ -434,21 +404,9 @@ ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change)
 			break;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_changes < max_cached_changes)
-	{
-		rb->nr_cached_changes++;
-		dlist_push_head(&rb->cached_changes, &change->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange));
-		VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node));
-	}
-	else
-	{
-		pfree(change);
-	}
+	pfree(change);
 }
 
-
 /*
  * Get an unused, possibly preallocated, ReorderBufferTupleBuf fitting at
  * least a tuple of size tuple_len (excluding header overhead).
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 25b0fc8c0a..17e47b385b 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -331,6 +331,12 @@ struct ReorderBuffer
 	MemoryContext context;
 
 	/*
+	 * Memory contexts for specific types objects
+	 */
+	MemoryContext change_context;
+	MemoryContext txn_context;
+
+	/*
 	 * Data structure slab cache.
 	 *
 	 * We allocate/deallocate some structures very frequently, to avoid bigger
@@ -340,14 +346,6 @@ struct ReorderBuffer
 	 * on top of reorderbuffer.c
 	 */
 
-	/* cached ReorderBufferTXNs */
-	dlist_head	cached_transactions;
-	Size		nr_cached_transactions;
-
-	/* cached ReorderBufferChanges */
-	dlist_head	cached_changes;
-	Size		nr_cached_changes;
-
 	/* cached ReorderBufferTupleBufs */
 	slist_head	cached_tuplebufs;
 	Size		nr_cached_tuplebufs;
-- 
2.11.0.22.g8d7a455.dirty

#52Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#51)
Re: PATCH: two slab-like memory allocators

Hi,

On 2017-02-24 14:10:38 -0800, Andres Freund wrote:

I've not yet looked a lot at the next type of context - I want to get
this much committed first...

I plan to give this another pass sometime this weekend and then push
soon.

Before committing I wanted to make sure that
http://archives.postgresql.org/message-id/32354.1487977458%40sss.pgh.pa.us
isn't a sufficient fix.

With the test of N=1000000 from this thread I measured both runtime and
memory usage (note that's peak virtual memory which includes 2GB of
shared_buffers and such), in assert enabled builds.

master: doesn't finish reasonably
master+doubly linked list fix: 9390.805 ms VmPeak: 10,969,424 kb
master+this thread: 6500.293 ms VmPeak: 2,969,528 kB

So the doubly-linked-list fix is great (and much more backpatchable),
but the patches in this thread are both better runtime *and* peak memory
usage wise. So that seems like a clear call.

I've not yet reviewed the generational allocator yet, but during these
measurements I get:
postgres[3970][1]=# select count(*) FROM pg_logical_slot_get_changes('ttt', NULL, NULL);
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d011ef10f0 exceeds 7234 allocated
LOCATION: GenerationCheck, generation.c:693
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d01023eba0 exceeds 65532 allocated
LOCATION: GenerationCheck, generation.c:693
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d00d7fb870 exceeds 65532 allocated
LOCATION: GenerationCheck, generation.c:693
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d00cde17b0 exceeds 65531 allocated
LOCATION: GenerationCheck, generation.c:693

that seems to occur when there's currently in-progress transactions when
finishing decoding:

#0 GenerationCheck (context=0x5629129407c8)
at /home/andres/src/postgresql/src/backend/utils/mmgr/generation.c:692
#1 0x00005629105d92db in GenerationReset (context=0x5629129407c8)
at /home/andres/src/postgresql/src/backend/utils/mmgr/generation.c:255
#2 0x00005629105d93c6 in GenerationDelete (context=0x5629129407c8)
at /home/andres/src/postgresql/src/backend/utils/mmgr/generation.c:287
#3 0x00005629105e1e12 in MemoryContextDelete (context=0x5629129407c8)
at /home/andres/src/postgresql/src/backend/utils/mmgr/mcxt.c:225
#4 0x00005629105e1ee3 in MemoryContextDeleteChildren (context=0x562912940008)
at /home/andres/src/postgresql/src/backend/utils/mmgr/mcxt.c:245
#5 0x00005629105e1de0 in MemoryContextDelete (context=0x562912940008)
at /home/andres/src/postgresql/src/backend/utils/mmgr/mcxt.c:208
#6 0x00005629103d5451 in ReorderBufferFree (rb=0x562912906320)
at /home/andres/src/postgresql/src/backend/replication/logical/reorderbuffer.c:278
#7 0x00005629103cea4f in FreeDecodingContext (ctx=0x562912904310)
at /home/andres/src/postgresql/src/backend/replication/logical/logical.c:462
#8 0x00005629103d03f0 in pg_logical_slot_get_changes_guts (fcinfo=0x7fffc2042e50, confirm=0 '\000',

could it be that the test's condition is inverted?

I'll work on getting slab committed first, and then review / edit /
commit generation.c later. One first note there is that I'm wondering
if generation.c is a too generic filename.

- Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#53Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#52)
Re: PATCH: two slab-like memory allocators

Hi,

On 2017-02-27 03:17:32 -0800, Andres Freund wrote:

I'll work on getting slab committed first, and then review / edit /
commit generation.c later. One first note there is that I'm wondering
if generation.c is a too generic filename.

And pushed slab and its usage. Will have a look at generation.c
tomorrow.

- Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#54Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#52)
Re: PATCH: two slab-like memory allocators

On 02/27/2017 12:17 PM, Andres Freund wrote:

Hi,

On 2017-02-24 14:10:38 -0800, Andres Freund wrote:

I've not yet looked a lot at the next type of context - I want to get
this much committed first...

I plan to give this another pass sometime this weekend and then push
soon.

Before committing I wanted to make sure that
http://archives.postgresql.org/message-id/32354.1487977458%40sss.pgh.pa.us
isn't a sufficient fix.

With the test of N=1000000 from this thread I measured both runtime and
memory usage (note that's peak virtual memory which includes 2GB of
shared_buffers and such), in assert enabled builds.

master: doesn't finish reasonably
master+doubly linked list fix: 9390.805 ms VmPeak: 10,969,424 kb
master+this thread: 6500.293 ms VmPeak: 2,969,528 kB

So the doubly-linked-list fix is great (and much more backpatchable),
but the patches in this thread are both better runtime *and* peak memory
usage wise. So that seems like a clear call.

Nice, thanks for doing the test.

I've not yet reviewed the generational allocator yet, but during these
measurements I get:
postgres[3970][1]=# select count(*) FROM pg_logical_slot_get_changes('ttt', NULL, NULL);
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d011ef10f0 exceeds 7234 allocated
LOCATION: GenerationCheck, generation.c:693
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d01023eba0 exceeds 65532 allocated
LOCATION: GenerationCheck, generation.c:693
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d00d7fb870 exceeds 65532 allocated
LOCATION: GenerationCheck, generation.c:693
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d00cde17b0 exceeds 65531 allocated
LOCATION: GenerationCheck, generation.c:693

that seems to occur when there's currently in-progress transactions when
finishing decoding:

...

could it be that the test's condition is inverted?

Yeah, that seems like the culprit - the condition seems wrong. I wonder
why I haven't seen it during my tests, though ...

I'll work on getting slab committed first, and then review / edit /
commit generation.c later. One first note there is that I'm wondering
if generation.c is a too generic filename.

Naming things is hard.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#55Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#53)
Re: PATCH: two slab-like memory allocators

On 02/27/2017 01:02 PM, Andres Freund wrote:

Hi,

On 2017-02-27 03:17:32 -0800, Andres Freund wrote:

I'll work on getting slab committed first, and then review / edit /
commit generation.c later. One first note there is that I'm wondering
if generation.c is a too generic filename.

And pushed slab and its usage. Will have a look at generation.c
tomorrow.

- Andres

Gah. I don't want to annoy person who just committed my patch, but can
you give more time when asking for feedback? I mean, sending a modified
patch on Friday midnight, and committing on Monday noon does not really
give much time to look at it.

The changes seem fine to me, thanks for spending time on this.

Thanks

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#56Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#55)
Re: PATCH: two slab-like memory allocators

On February 27, 2017 6:14:20 AM PST, Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:

On 02/27/2017 01:02 PM, Andres Freund wrote:

Hi,

On 2017-02-27 03:17:32 -0800, Andres Freund wrote:

I'll work on getting slab committed first, and then review / edit /
commit generation.c later. One first note there is that I'm

wondering

if generation.c is a too generic filename.

And pushed slab and its usage. Will have a look at generation.c
tomorrow.

- Andres

Gah. I don't want to annoy person who just committed my patch, but can
you give more time when asking for feedback? I mean, sending a modified

patch on Friday midnight, and committing on Monday noon does not really

give much time to look at it.

Hm. The changes IMO weren't controversial (or surprising -most of them I had announced previously); I announced that I would when posting the review that I'd push the patch later that weekend. If I hadn't been tired after doing the review/editing I'd have just pushed right then and there. It's hard to find time and attention, so not introducing a week of feedback time is quite worthwhile. I listed the changes I made primarily for posterities sake. Most if not all committers make editorializing changed around commit, so that's not just me.

If you specifically want I can try to give you more time to look at an edited patch, but that'll mean things move slower. I won't promise not to make minor changed just before commit either way, I always do another round of review just before push.

Andres
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#57Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#53)
Re: PATCH: two slab-like memory allocators

Andres Freund <andres@anarazel.de> writes:

And pushed slab and its usage. Will have a look at generation.c
tomorrow.

Perhaps first you need to find out why so much of the buildfarm
is unhappy.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#58Andres Freund
andres@anarazel.de
In reply to: Tom Lane (#57)
Re: PATCH: two slab-like memory allocators

On 2017-02-27 10:32:25 -0500, Tom Lane wrote:

Andres Freund <andres@anarazel.de> writes:

And pushed slab and its usage. Will have a look at generation.c
tomorrow.

Perhaps first you need to find out why so much of the buildfarm
is unhappy.

Will do, after a morning coffee.

- Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#59Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#58)
Re: PATCH: two slab-like memory allocators

On 2017-02-27 07:55:32 -0800, Andres Freund wrote:

On 2017-02-27 10:32:25 -0500, Tom Lane wrote:

Andres Freund <andres@anarazel.de> writes:

And pushed slab and its usage. Will have a look at generation.c
tomorrow.

Perhaps first you need to find out why so much of the buildfarm
is unhappy.

Will do, after a morning coffee.

Hm. Not entirely clear on what's going on yet. I've run the tests on
hydra (community ppc 64 machine), which is pretty similar to termite
which failed [1]https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=termite&amp;dt=2017-02-27%2014%3A00%3A06 with:
TRAP: BadArgument("!(((context) != ((void *)0) && (((((const Node*)((context)))->type) == T_AllocSetContext) || ((((const Node*)((context)))->type) == T_SlabContext))))", File: "/home/pgbuildfarm/buildroot-termite/HEAD/pgsql.build/../pgsql/src/backend/utils/mmgr/mcxt.c", Line: 1010)

The best theory I have so far that I have is that slab.c's idea of
StandardChunkHeader's size doesn't match what mcxt.c think it is
(because slab.c simply embeds StandardChunkHeader, but mcxt uses
MAXALIGN(sizeof(StandardChunkHeader))). That's not good, but I don't
quite see how that'd cause the issue, since StandardChunkHeader's size
should always be properly sized.

Tomas, do you have access to termite (which appears to be run by Craig,
under company mail).

If not, I can push a "blind" fix, but I'd rather have more information.

Greetings,

Andres Freund

[1]: https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=termite&amp;dt=2017-02-27%2014%3A00%3A06

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#60Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#54)
Re: PATCH: two slab-like memory allocators

Hi,

On 2017-02-27 15:11:40 +0100, Tomas Vondra wrote:

I've not yet reviewed the generational allocator yet, but during these
measurements I get:
postgres[3970][1]=# select count(*) FROM pg_logical_slot_get_changes('ttt', NULL, NULL);
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d011ef10f0 exceeds 7234 allocated
LOCATION: GenerationCheck, generation.c:693
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d01023eba0 exceeds 65532 allocated
LOCATION: GenerationCheck, generation.c:693
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d00d7fb870 exceeds 65532 allocated
LOCATION: GenerationCheck, generation.c:693
WARNING: 01000: problem in Generation Tuples: number of free chunks 0 in block 0x55d00cde17b0 exceeds 65531 allocated
LOCATION: GenerationCheck, generation.c:693

that seems to occur when there's currently in-progress transactions when
finishing decoding:

...

could it be that the test's condition is inverted?

Yeah, that seems like the culprit - the condition seems wrong. I wonder why
I haven't seen it during my tests, though ...

I suspect it's because your tests only triggered a memory context reset
when it was empty... But I ran decoding while a concurrent write
transaction was ongoing...

I'll work on getting slab committed first, and then review / edit /
commit generation.c later. One first note there is that I'm wondering
if generation.c is a too generic filename.

Naming things is hard.

Indeed. I was thinking of genalloc, but that might be understood as
general, rather generational...

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#61Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#59)
Re: PATCH: two slab-like memory allocators

On 02/27/2017 05:48 PM, Andres Freund wrote:

On 2017-02-27 07:55:32 -0800, Andres Freund wrote:

On 2017-02-27 10:32:25 -0500, Tom Lane wrote:

Andres Freund <andres@anarazel.de> writes:

And pushed slab and its usage. Will have a look at generation.c
tomorrow.

Perhaps first you need to find out why so much of the buildfarm
is unhappy.

Will do, after a morning coffee.

Hm. Not entirely clear on what's going on yet. I've run the tests on
hydra (community ppc 64 machine), which is pretty similar to termite
which failed [1] with:
TRAP: BadArgument("!(((context) != ((void *)0) && (((((const Node*)((context)))->type) == T_AllocSetContext) || ((((const Node*)((context)))->type) == T_SlabContext))))", File: "/home/pgbuildfarm/buildroot-termite/HEAD/pgsql.build/../pgsql/src/backend/utils/mmgr/mcxt.c", Line: 1010)

The best theory I have so far that I have is that slab.c's idea of
StandardChunkHeader's size doesn't match what mcxt.c think it is
(because slab.c simply embeds StandardChunkHeader, but mcxt uses
MAXALIGN(sizeof(StandardChunkHeader))). That's not good, but I don't
quite see how that'd cause the issue, since StandardChunkHeader's size
should always be properly sized.

Tomas, do you have access to termite (which appears to be run by Craig,
under company mail).

No, I don't, but I'll ping Craig. I might ping him, but it's ~4AM in
Australia, though, so it'll take time.

FWIW I think the ppc64 machines are failing because of unrelated issue
(changes to integer timestamps). We should probably look at 32bit
machines first.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#62Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#61)
Re: PATCH: two slab-like memory allocators

Hi,

On 2017-02-27 17:56:08 +0100, Tomas Vondra wrote:

No, I don't, but I'll ping Craig. I might ping him, but it's ~4AM in
Australia, though, so it'll take time.

Did the same... ;)

FWIW I think the ppc64 machines are failing because of unrelated issue
(changes to integer timestamps). We should probably look at 32bit machines
first.

Don't think so - termite is ppc64 afaics, and the failure doesn't look
integer timestamp related (assert failure is clearly about this, and set
of changed commits *only* include slab related commits).

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#63Petr Jelinek
petr.jelinek@2ndquadrant.com
In reply to: Andres Freund (#62)
Re: PATCH: two slab-like memory allocators

On 27/02/17 18:00, Andres Freund wrote:

FWIW I think the ppc64 machines are failing because of unrelated issue
(changes to integer timestamps). We should probably look at 32bit machines
first.

Don't think so - termite is ppc64 afaics, and the failure doesn't look
integer timestamp related (assert failure is clearly about this, and set
of changed commits *only* include slab related commits).

termite is ppc64 but with 4 byte pointer size according to configure so
it might be related to that perhaps?

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#64Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#62)
Re: PATCH: two slab-like memory allocators

Andres Freund <andres@anarazel.de> writes:

FWIW I think the ppc64 machines are failing because of unrelated issue
(changes to integer timestamps). We should probably look at 32bit machines
first.

Don't think so - termite is ppc64 afaics, and the failure doesn't look
integer timestamp related (assert failure is clearly about this, and set
of changed commits *only* include slab related commits).

There are a couple of animals that have --disable-integer-datetimes,
but those are failing at the configure stage, and were doing so
before today.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#65Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#59)
Re: PATCH: two slab-like memory allocators

Andres Freund <andres@anarazel.de> writes:

The best theory I have so far that I have is that slab.c's idea of
StandardChunkHeader's size doesn't match what mcxt.c think it is
(because slab.c simply embeds StandardChunkHeader, but mcxt uses
MAXALIGN(sizeof(StandardChunkHeader))). That's not good, but I don't
quite see how that'd cause the issue, since StandardChunkHeader's size
should always be properly sized.

Uh, wrong. On a 32-bit machine with debug enabled, StandardChunkHeader
will contain 3 4-byte fields. However, there are some such machines on
which MAXALIGN is 8. For example, looking at termite's configure
output:

checking size of void *... 4
checking size of size_t... 4
checking size of long... 4
checking alignment of short... 2
checking alignment of int... 4
checking alignment of long... 4
checking alignment of long long int... 8
checking alignment of double... 8

axolotl's output looks similar. I expect my old HPPA dinosaur
will show the failure as well.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#66Andres Freund
andres@anarazel.de
In reply to: Petr Jelinek (#63)
Re: PATCH: two slab-like memory allocators

On 2017-02-27 18:04:41 +0100, Petr Jelinek wrote:

On 27/02/17 18:00, Andres Freund wrote:

FWIW I think the ppc64 machines are failing because of unrelated issue
(changes to integer timestamps). We should probably look at 32bit machines
first.

Don't think so - termite is ppc64 afaics, and the failure doesn't look
integer timestamp related (assert failure is clearly about this, and set
of changed commits *only* include slab related commits).

termite is ppc64 but with 4 byte pointer size according to configure so
it might be related to that perhaps?

Uh, ok. I checked the --configure options, but not the actual configure
output (blame -ENOCOFEE and jetlag). The output makes it fairly likely
that my StandardChunkHeader theory is valid, so I'll work on a patch to
clean that up.

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#67Andres Freund
andres@anarazel.de
In reply to: Tom Lane (#65)
Re: PATCH: two slab-like memory allocators

On 2017-02-27 12:27:48 -0500, Tom Lane wrote:

Andres Freund <andres@anarazel.de> writes:

The best theory I have so far that I have is that slab.c's idea of
StandardChunkHeader's size doesn't match what mcxt.c think it is
(because slab.c simply embeds StandardChunkHeader, but mcxt uses
MAXALIGN(sizeof(StandardChunkHeader))). That's not good, but I don't
quite see how that'd cause the issue, since StandardChunkHeader's size
should always be properly sized.

Uh, wrong. On a 32-bit machine with debug enabled, StandardChunkHeader
will contain 3 4-byte fields. However, there are some such machines on
which MAXALIGN is 8. For example, looking at termite's configure
output:

checking size of void *... 4
checking size of size_t... 4
checking size of long... 4
checking alignment of short... 2
checking alignment of int... 4
checking alignment of long... 4
checking alignment of long long int... 8
checking alignment of double... 8

axolotl's output looks similar. I expect my old HPPA dinosaur
will show the failure as well.

Yea, I hadn't yet realized when writing that that termite actually,
despite running on ppc64, compiles a 32bit postgres. Will thus
duplicate StandardChunkHeader's contents in to slab.c :( - I don't see
an easy way around that...

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#68Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#66)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

On 02/27/2017 06:40 PM, Andres Freund wrote:

On 2017-02-27 18:04:41 +0100, Petr Jelinek wrote:

On 27/02/17 18:00, Andres Freund wrote:

FWIW I think the ppc64 machines are failing because of unrelated issue
(changes to integer timestamps). We should probably look at 32bit machines
first.

Don't think so - termite is ppc64 afaics, and the failure doesn't look
integer timestamp related (assert failure is clearly about this, and set
of changed commits *only* include slab related commits).

termite is ppc64 but with 4 byte pointer size according to configure so
it might be related to that perhaps?

Uh, ok. I checked the --configure options, but not the actual configure
output (blame -ENOCOFEE and jetlag). The output makes it fairly likely
that my StandardChunkHeader theory is valid, so I'll work on a patch to
clean that up.

Thanks. I set up a rpi3 machine (amrv7l) that fails with the same issue,
so if you need to test the patch, let me know.

While building, I've also noticed a bunch of warnings about string
formatting, attached is a patch that that fixes those.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

slab-warnings.patchtext/x-diff; name=slab-warnings.patchDownload
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
index a5e140e..c673bc3 100644
--- a/src/backend/utils/mmgr/slab.c
+++ b/src/backend/utils/mmgr/slab.c
@@ -207,7 +207,7 @@ SlabContextCreate(MemoryContext parent,
 
 	/* Make sure the block can store at least one chunk. */
 	if (blockSize - sizeof(SlabBlock) < fullChunkSize)
-		elog(ERROR, "block size %ld for slab is too small for %ld chunks",
+		elog(ERROR, "block size %zu for slab is too small for %zu chunks",
 			 blockSize, chunkSize);
 
 	/* Compute maximum number of chunks per block */
@@ -333,7 +333,7 @@ SlabAlloc(MemoryContext context, Size size)
 
 	/* make sure we only allow correct request size */
 	if (size != slab->chunkSize)
-		elog(ERROR, "unexpected alloc chunk size %ld (expected %ld)",
+		elog(ERROR, "unexpected alloc chunk size %zu (expected %zu)",
 			 size, slab->chunkSize);
 
 	/*
#69Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#56)
Re: PATCH: two slab-like memory allocators

On 02/27/2017 04:07 PM, Andres Freund wrote:

On February 27, 2017 6:14:20 AM PST, Tomas Vondra
<tomas.vondra@2ndquadrant.com> wrote:

On 02/27/2017 01:02 PM, Andres Freund wrote:

Hi,

On 2017-02-27 03:17:32 -0800, Andres Freund wrote:

I'll work on getting slab committed first, and then review /
edit / commit generation.c later. One first note there is that
I'm

wondering

if generation.c is a too generic filename.

And pushed slab and its usage. Will have a look at generation.c
tomorrow.

- Andres

Gah. I don't want to annoy person who just committed my patch, but
can you give more time when asking for feedback? I mean, sending a
modified

patch on Friday midnight, and committing on Monday noon does not
really

give much time to look at it.

Hm. The changes IMO weren't controversial (or surprising -most of
them I had announced previously); I announced that I would when
posting the review that I'd push the patch later that weekend. If I
hadn't been tired after doing the review/editing I'd have just pushed
right then and there. It's hard to find time and attention, so not
introducing a week of feedback time is quite worthwhile. I listed
the changes I made primarily for posterities sake. Most if not all
committers make editorializing changed around commit, so that's not
just me.

If you specifically want I can try to give you more time to look at
an edited patch, but that'll mean things move slower. I won't
promise not to make minor changed just before commit either way, I
always do another round of review just before push.

I also agree the changes are not particularly controversial, but then
why to ask for comments at all? I'm OK with a committer making final
tweaks and pushing it without asking, but if you ask for comments, let's
give people time to actually respond.

I agree introducing weeks of delays would be silly, but I'm not asking
for that. I'm perfectly fine with two days for feedback, as long as it's
not a weekend + half of Monday.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#70Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#67)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

On 02/27/2017 06:42 PM, Andres Freund wrote:

On 2017-02-27 12:27:48 -0500, Tom Lane wrote:

Andres Freund <andres@anarazel.de> writes:

The best theory I have so far that I have is that slab.c's idea of
StandardChunkHeader's size doesn't match what mcxt.c think it is
(because slab.c simply embeds StandardChunkHeader, but mcxt uses
MAXALIGN(sizeof(StandardChunkHeader))). That's not good, but I don't
quite see how that'd cause the issue, since StandardChunkHeader's size
should always be properly sized.

Uh, wrong. On a 32-bit machine with debug enabled, StandardChunkHeader
will contain 3 4-byte fields. However, there are some such machines on
which MAXALIGN is 8. For example, looking at termite's configure
output:

checking size of void *... 4
checking size of size_t... 4
checking size of long... 4
checking alignment of short... 2
checking alignment of int... 4
checking alignment of long... 4
checking alignment of long long int... 8
checking alignment of double... 8

axolotl's output looks similar. I expect my old HPPA dinosaur
will show the failure as well.

Yea, I hadn't yet realized when writing that that termite actually,
despite running on ppc64, compiles a 32bit postgres. Will thus
duplicate StandardChunkHeader's contents in to slab.c :( - I don't
see an easy way around that...

I've tried this - essentially copying the StandardChunkHeader's contents
into SlabChunk, but that does not seem to do the trick, sadly. Per
pahole, the structures then (at least on armv7l) look like this:

struct SlabChunk {
void * block; /* 0 4 */
MemoryContext context; /* 4 4 */
Size size; /* 8 4 */
Size requested_size; /* 12 4 */

/* size: 16, cachelines: 1, members: 4 */
/* last cacheline: 16 bytes */
};

struct StandardChunkHeader {
MemoryContext context; /* 0 4 */
Size size; /* 4 4 */
Size requested_size; /* 8 4 */

/* size: 12, cachelines: 1, members: 3 */
/* last cacheline: 12 bytes */
};

So SlabChunk happens to be perfectly aligned (MAXIMUM_ALIGNOF=8), and so
pfree() grabs the block pointer but thinks it's the context :-(

Not sure what to do about this - the only thing I can think about is
splitting SlabChunk into two separate structures, and align them
independently.

The attached patch does that - it probably needs a bit more work on the
comments to make it commit-ready, but it fixes the test_deconding tests
on the rpi3 board I'm using for testing.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

slab-fix.patchbinary/octet-stream; name=slab-fix.patchDownload
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
index c673bc3..6fbdb54 100644
--- a/src/backend/utils/mmgr/slab.c
+++ b/src/backend/utils/mmgr/slab.c
@@ -57,11 +57,11 @@
 #include "lib/ilist.h"
 
 
-#define SLAB_CHUNKHDRSZ MAXALIGN(sizeof(SlabChunk))
+#define SLAB_CHUNKHDRSZ (MAXALIGN(sizeof(SlabChunk)) + STANDARDCHUNKHEADERSIZE)
 
 /* Portion of SLAB_CHUNKHDRSZ excluding trailing padding. */
 #define SLAB_CHUNK_USED \
-	(offsetof(SlabChunk, header) + sizeof(StandardChunkHeader))
+	(offsetof(SlabChunk, header) + STANDARDCHUNKHEADERSIZE)
 
 /*
  * SlabContext is a specialized implementation of MemoryContext.
@@ -103,10 +103,6 @@ typedef struct SlabChunk
 {
 	/* block owning this chunk */
 	void	   *block;
-
-	/* include StandardChunkHeader because mcxt.c expects that */
-	StandardChunkHeader header;
-
 } SlabChunk;
 
 
@@ -121,6 +117,8 @@ typedef struct SlabChunk
 	((char *) block + sizeof(SlabBlock))
 #define SlabChunkIndex(slab, block, chunk)	\
 	(((char *) chunk - SlabBlockStart(block)) / slab->fullChunkSize)
+#define SlabChunkStandardHeader(chunk) \
+	( (StandardChunkHeader *) ((char *)chunk + MAXALIGN(sizeof(SlabChunk))))
 
 /*
  * These functions implement the MemoryContext API for Slab contexts.
@@ -164,10 +162,10 @@ static MemoryContextMethods SlabMethods = {
 #ifdef HAVE_ALLOCINFO
 #define SlabFreeInfo(_cxt, _chunk) \
 			fprintf(stderr, "SlabFree: %s: %p, %zu\n", \
-				(_cxt)->header.name, (_chunk), (_chunk)->header.size)
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
 #define SlabAllocInfo(_cxt, _chunk) \
 			fprintf(stderr, "SlabAlloc: %s: %p, %zu\n", \
-				(_cxt)->header.name, (_chunk), (_chunk)->header.size)
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
 #else
 #define SlabFreeInfo(_cxt, _chunk)
 #define SlabAllocInfo(_cxt, _chunk)
@@ -203,7 +201,8 @@ SlabContextCreate(MemoryContext parent,
 					 "MAXALIGN too small to fit int32");
 
 	/* chunk, including SLAB header (both addresses nicely aligned) */
-	fullChunkSize = MAXALIGN(sizeof(SlabChunk) + MAXALIGN(chunkSize));
+	fullChunkSize = MAXALIGN(sizeof(SlabChunk))
+		+ MAXALIGN(sizeof(StandardChunkHeader)) + MAXALIGN(chunkSize);
 
 	/* Make sure the block can store at least one chunk. */
 	if (blockSize - sizeof(SlabBlock) < fullChunkSize)
@@ -324,6 +323,7 @@ SlabAlloc(MemoryContext context, Size size)
 	SlabContext *slab = castNode(SlabContext, context);
 	SlabBlock  *block;
 	SlabChunk  *chunk;
+	StandardChunkHeader *header;
 	int			idx;
 
 	Assert(slab);
@@ -394,6 +394,7 @@ SlabAlloc(MemoryContext context, Size size)
 
 	/* compute the chunk location block start (after the block header) */
 	chunk = SlabBlockGetChunk(slab, block, idx);
+	header = SlabChunkStandardHeader(chunk);
 
 	/*
 	 * Update the block nfree count, and also the minFreeChunks as we've
@@ -449,15 +450,15 @@ SlabAlloc(MemoryContext context, Size size)
 
 	chunk->block = (void *) block;
 
-	chunk->header.context = (MemoryContext) slab;
-	chunk->header.size = MAXALIGN(size);
+	header->context = (MemoryContext) slab;
+	header->size = MAXALIGN(size);
 
 #ifdef MEMORY_CONTEXT_CHECKING
-	chunk->header.requested_size = size;
-	VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
-							   sizeof(chunk->header.requested_size));
+	header->requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&header->requested_size,
+							   sizeof(header->requested_size));
 	/* slab mark to catch clobber of "unused" space */
-	if (size < chunk->header.size)
+	if (size < header->size)
 		set_sentinel(SlabChunkGetPointer(chunk), size);
 #endif
 #ifdef RANDOMIZE_ALLOCATED_MEMORY
@@ -480,15 +481,16 @@ SlabFree(MemoryContext context, void *pointer)
 	SlabContext *slab = castNode(SlabContext, context);
 	SlabChunk  *chunk = SlabPointerGetChunk(pointer);
 	SlabBlock  *block = chunk->block;
+	StandardChunkHeader *header = SlabChunkStandardHeader(chunk);
 
 	SlabFreeInfo(slab, chunk);
 
 #ifdef MEMORY_CONTEXT_CHECKING
-	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
-							  sizeof(chunk->header.requested_size));
+	VALGRIND_MAKE_MEM_DEFINED(&header->requested_size,
+							  sizeof(header->requested_size));
 	/* Test for someone scribbling on unused space in chunk */
-	if (chunk->header.requested_size < chunk->header.size)
-		if (!sentinel_ok(pointer, chunk->header.requested_size))
+	if (header->requested_size < header->size)
+		if (!sentinel_ok(pointer, header->requested_size))
 			elog(WARNING, "detected write past chunk end in %s %p",
 				 slab->header.name, chunk);
 #endif
@@ -507,12 +509,12 @@ SlabFree(MemoryContext context, void *pointer)
 #ifdef CLOBBER_FREED_MEMORY
 	/* XXX don't wipe the int32 index, used for block-level freelist */
 	wipe_mem((char *) pointer + sizeof(int32),
-			 chunk->header.size - sizeof(int32));
+			 header->size - sizeof(int32));
 #endif
 
 #ifdef MEMORY_CONTEXT_CHECKING
 	/* Reset requested_size to 0 in chunks that are on freelist */
-	chunk->header.requested_size = 0;
+	header->requested_size = 0;
 #endif
 
 	/* remove the block from a freelist */
@@ -591,8 +593,9 @@ static Size
 SlabGetChunkSpace(MemoryContext context, void *pointer)
 {
 	SlabChunk  *chunk = SlabPointerGetChunk(pointer);
+	StandardChunkHeader *header = SlabChunkStandardHeader(chunk);
 
-	return chunk->header.size + SLAB_CHUNKHDRSZ;
+	return header->size + SLAB_CHUNKHDRSZ;
 }
 
 /*
@@ -741,36 +744,37 @@ SlabCheck(MemoryContext context)
 				if (!freechunks[j])
 				{
 					SlabChunk  *chunk = SlabBlockGetChunk(slab, block, j);
+					StandardChunkHeader *header = SlabChunkStandardHeader(chunk);
 
-					VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
-									   sizeof(chunk->header.requested_size));
+					VALGRIND_MAKE_MEM_DEFINED(&header->requested_size,
+									   sizeof(header->requested_size));
 
 					/* we're in a no-freelist branch */
-					VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
-									   sizeof(chunk->header.requested_size));
+					VALGRIND_MAKE_MEM_NOACCESS(&header->requested_size,
+									   sizeof(header->requested_size));
 
 					/* chunks have both block and slab pointers, so check both */
 					if (chunk->block != block)
 						elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p",
 							 name, block, chunk);
 
-					if (chunk->header.context != (MemoryContext) slab)
+					if (header->context != (MemoryContext) slab)
 						elog(WARNING, "problem in slab %s: bogus slab link in block %p, chunk %p",
 							 name, block, chunk);
 
 					/* now make sure the chunk size is correct */
-					if (chunk->header.size != MAXALIGN(slab->chunkSize))
+					if (header->size != MAXALIGN(slab->chunkSize))
 						elog(WARNING, "problem in slab %s: bogus chunk size in block %p, chunk %p",
 							 name, block, chunk);
 
 					/* now make sure the chunk size is correct */
-					if (chunk->header.requested_size != slab->chunkSize)
+					if (header->requested_size != slab->chunkSize)
 						elog(WARNING, "problem in slab %s: bogus chunk requested size in block %p, chunk %p",
 							 name, block, chunk);
 
 					/* there might be sentinel (thanks to alignment) */
-					if (chunk->header.requested_size < chunk->header.size &&
-						!sentinel_ok(chunk, SLAB_CHUNKHDRSZ + chunk->header.requested_size))
+					if (header->requested_size < header->size &&
+						!sentinel_ok(chunk, SLAB_CHUNKHDRSZ + header->requested_size))
 						elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
 							 name, block, chunk);
 				}
#71Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#70)
Re: PATCH: two slab-like memory allocators

On 2017-02-28 01:44:42 +0100, Tomas Vondra wrote:

On 02/27/2017 06:42 PM, Andres Freund wrote:

Yea, I hadn't yet realized when writing that that termite actually,
despite running on ppc64, compiles a 32bit postgres. Will thus
duplicate StandardChunkHeader's contents in to slab.c :( - I don't
see an easy way around that...

I've tried this - essentially copying the StandardChunkHeader's contents
into SlabChunk, but that does not seem to do the trick, sadly. Per pahole,
the structures then (at least on armv7l) look like this:

struct SlabChunk {
void * block; /* 0 4 */
MemoryContext context; /* 4 4 */
Size size; /* 8 4 */
Size requested_size; /* 12 4 */

/* size: 16, cachelines: 1, members: 4 */
/* last cacheline: 16 bytes */
};

struct StandardChunkHeader {
MemoryContext context; /* 0 4 */
Size size; /* 4 4 */
Size requested_size; /* 8 4 */

/* size: 12, cachelines: 1, members: 3 */
/* last cacheline: 12 bytes */
};

So SlabChunk happens to be perfectly aligned (MAXIMUM_ALIGNOF=8), and so
pfree() grabs the block pointer but thinks it's the context :-(

Hm. The only way I can think of to do achieve the right thing here would
be something like:

typedef struct StandardChunkHeader
{
MemoryContext context; /* owning context */
Size size; /* size of data space allocated in chunk */
#ifdef MEMORY_CONTEXT_CHECKING
/* when debugging memory usage, also store actual requested size */
Size requested_size;
#endif
union
{
char *data;
/* ensure MAXALIGNed */
int64 alignof_int64;
double alignof_double;
} d;
} StandardChunkHeader;

typedef struct SlabChunk
{
void *block;
StandardChunkHeader header;
} SlabChunk;

That's not overly pretty, but also not absolutely disgusting. Unifying
the padding calculations between allocators would be a nice side-effect.
Note we at least previously had such union/double tricks in the tree, via
http://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=e1a11d93111ff3fba7a91f3f2ac0b0aca16909a8

It might be a good idea to have configure define maxaligned_type instead
of including both int64/double (although it'll IIRC practically always
be double that's maxaligned).

Independently of this, we really should redefine StandardChunkHeader to
be only the MemoryContext. There's no need to have size/requested_size
part of StandardChunkHeader, given there's
MemoryContextMethods->get_chunk_space().

Not sure what to do about this - the only thing I can think about is
splitting SlabChunk into two separate structures, and align them
independently.

The attached patch does that - it probably needs a bit more work on the
comments to make it commit-ready, but it fixes the test_deconding tests on
the rpi3 board I'm using for testing.

That'd work as well, although at the very least I'd want to add a
comment explaining the actual memory layout somewhere - this is a bit
too finnicky to expect to get right the next time round.

Any preferences?

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#72Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#71)
Re: PATCH: two slab-like memory allocators

Andres Freund <andres@anarazel.de> writes:

Independently of this, we really should redefine StandardChunkHeader to
be only the MemoryContext. There's no need to have size/requested_size
part of StandardChunkHeader, given there's
MemoryContextMethods->get_chunk_space().

Yeah, perhaps. The trick would be to get things laid out so that the
MemoryContext pointer is always immediately adjacent to the chunk data
(no padding between).

One could imagine redefining aset.c's chunk header along the lines of

typedef struct AllocSetChunkHeader
{
Size size; /* size of data space allocated in chunk */
#ifdef MEMORY_CONTEXT_CHECKING
Size requested_size; /* original request size */
#if 32-bit-but-maxalign-is-8
Size padding; /* needed to avoid padding below */
#endif
#endif
MemoryContext context; /* owning context */
/* there must not be any padding to reach a MAXALIGN boundary here! */
} AllocSetChunkHeader;

where we'd possibly need some help from configure to implement that inner
#if condition, but it seems doable enough.

If the slab allocator would be happier with just a MemoryContext pointer
as chunk header, I think we should push in this direction rather than
invent some short-term hack.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#73Andres Freund
andres@anarazel.de
In reply to: Tom Lane (#72)
Re: PATCH: two slab-like memory allocators

Hi,

On 2017-02-27 22:57:24 -0500, Tom Lane wrote:

If the slab allocator would be happier with just a MemoryContext pointer
as chunk header, I think we should push in this direction rather than
invent some short-term hack.

It would - it really doesn't need the size, because it's the same for
the whole context, and thereby is a waste of space. Still wondering if
we should band-aid this till that's done.

One could imagine redefining aset.c's chunk header along the lines of

typedef struct AllocSetChunkHeader
{
Size size; /* size of data space allocated in chunk */
#ifdef MEMORY_CONTEXT_CHECKING
Size requested_size; /* original request size */
#if 32-bit-but-maxalign-is-8
Size padding; /* needed to avoid padding below */
#endif
#endif
MemoryContext context; /* owning context */
/* there must not be any padding to reach a MAXALIGN boundary here! */
} AllocSetChunkHeader;

where we'd possibly need some help from configure to implement that inner
#if condition, but it seems doable enough.

Hm, that should be doable with something like
#if MAXIMUM_ALIGNOF > 4 && SIZEOF_VOID_P == 4

which'd probably be better documentation than a macro that hides this
(arguing internally whether SIZEOF_VOID_P or SIZEOF_SIZE_T) is better.

Working on a patch now, will post but not push tonight.

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#74Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#73)
Re: PATCH: two slab-like memory allocators

Andres Freund <andres@anarazel.de> writes:

Hm, that should be doable with something like
#if MAXIMUM_ALIGNOF > 4 && SIZEOF_VOID_P == 4
which'd probably be better documentation than a macro that hides this
(arguing internally whether SIZEOF_VOID_P or SIZEOF_SIZE_T) is better.

Not sure either, but suggest we add a StaticAssert asserting there's no
padding; something along the lines of
offsetof(AllocSetChunkHeader, context) + sizeof(MemoryContext) == MAXALIGN(sizeof(AllocSetChunkHeader))

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#75Andres Freund
andres@anarazel.de
In reply to: Tom Lane (#74)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

Hi,

*preliminary* patch attached. This needs a good bit of polishing
(primarily comment work, verifying that valgrind works), but I'm too
tired now.

I'm not quite sure how to deal with mmgr/README - it's written as kind
of a historical document, and the "Mechanisms to Allow Multiple Types of
Contexts" is already quite out of date. I think it'd be good to rip out
all the historical references and just describe the current state, but
I'm not really enthusiastic about tackling that :/

On 2017-02-28 00:29:44 -0500, Tom Lane wrote:

Not sure either, but suggest we add a StaticAssert asserting there's no
padding; something along the lines of
offsetof(AllocSetChunkHeader, context) + sizeof(MemoryContext) == MAXALIGN(sizeof(AllocSetChunkHeader))

Included.

With Craig's help I've verified that the patch as attached works on a
platform that's currently failing. Thanks!

- Andres

Attachments:

0001-Remove-StandardChunkHeader-for-Slab-s-benefit.patchtext/x-patch; charset=us-asciiDownload
From a59c3200dd127feb0cb09a055250ff6401aee1aa Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 27 Feb 2017 23:32:22 -0800
Subject: [PATCH] Remove StandardChunkHeader for Slab's benefit.

---
 src/backend/utils/mmgr/aset.c    |  51 ++++++++---------
 src/backend/utils/mmgr/mcxt.c    | 121 +++------------------------------------
 src/backend/utils/mmgr/slab.c    |  81 ++++++++------------------
 src/include/utils/memutils.h     |  56 +++++++++++-------
 src/tools/pgindent/typedefs.list |   1 -
 5 files changed, 92 insertions(+), 218 deletions(-)

diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index 8056c00ae4..31228ade19 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -97,20 +97,7 @@
  */
 
 #define ALLOC_BLOCKHDRSZ	MAXALIGN(sizeof(AllocBlockData))
-#define ALLOC_CHUNKHDRSZ	MAXALIGN(sizeof(AllocChunkData))
-
-/* Portion of ALLOC_CHUNKHDRSZ examined outside aset.c. */
-#define ALLOC_CHUNK_PUBLIC	\
-	(offsetof(AllocChunkData, size) + sizeof(Size))
-
-/* Portion of ALLOC_CHUNKHDRSZ excluding trailing padding. */
-#ifdef MEMORY_CONTEXT_CHECKING
-#define ALLOC_CHUNK_USED	\
-	(offsetof(AllocChunkData, requested_size) + sizeof(Size))
-#else
-#define ALLOC_CHUNK_USED	\
-	(offsetof(AllocChunkData, size) + sizeof(Size))
-#endif
+#define ALLOC_CHUNKHDRSZ	sizeof(struct AllocChunkData)
 
 typedef struct AllocBlockData *AllocBlock;		/* forward reference */
 typedef struct AllocChunkData *AllocChunk;
@@ -169,20 +156,27 @@ typedef struct AllocBlockData
 /*
  * AllocChunk
  *		The prefix of each piece of memory in an AllocBlock
- *
- * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h.
  */
 typedef struct AllocChunkData
 {
-	/* aset is the owning aset if allocated, or the freelist link if free */
-	void	   *aset;
 	/* size is always the size of the usable space in the chunk */
 	Size		size;
+
 #ifdef MEMORY_CONTEXT_CHECKING
 	/* when debugging memory usage, also store actual requested size */
 	/* this is zero in a free chunk */
 	Size		requested_size;
+
+#if MAXIMUM_ALIGNOF > 4 && SIZEOF_VOID_P == 4
+	Size		padding;
 #endif
+
+#endif /* MEMORY_CONTEXT_CHECKING */
+
+	/* aset is the owning aset if allocated, or the freelist link if free */
+	void	   *aset;
+
+	/* there must not be any padding to reach a MAXALIGN boundary here! */
 }	AllocChunkData;
 
 /*
@@ -334,6 +328,10 @@ AllocSetContextCreate(MemoryContext parent,
 {
 	AllocSet	set;
 
+	StaticAssertStmt(offsetof(AllocChunkData, aset) + sizeof(MemoryContext) ==
+					 MAXALIGN(sizeof(AllocChunkData)),
+					 "padding calculation in AllocChunkData is wrong");
+
 	/*
 	 * First, validate allocation parameters.  (If we're going to throw an
 	 * error, we should do so before the context is created, not after.)  We
@@ -616,13 +614,14 @@ AllocSetAlloc(MemoryContext context, Size size)
 		AllocAllocInfo(set, chunk);
 
 		/*
-		 * Chunk header public fields remain DEFINED.  The requested
-		 * allocation itself can be NOACCESS or UNDEFINED; our caller will
-		 * soon make it UNDEFINED.  Make extra space at the end of the chunk,
-		 * if any, NOACCESS.
+		 * Chunk's context fields remain DEFINED.  The requested allocation
+		 * itself can be NOACCESS or UNDEFINED; our caller will soon make it
+		 * UNDEFINED.  Make extra space at the end of the chunk, if any,
+		 * NOACCESS.
 		 */
-		VALGRIND_MAKE_MEM_NOACCESS((char *) chunk + ALLOC_CHUNK_PUBLIC,
-						 chunk_size + ALLOC_CHUNKHDRSZ - ALLOC_CHUNK_PUBLIC);
+		VALGRIND_MAKE_MEM_NOACCESS((char *) chunk,
+								   chunk_size + ALLOC_CHUNKHDRSZ);
+		VALGRIND_MAKE_MEM_DEFINED(chunk->aset, sizeof(MemoryContext));
 
 		return AllocChunkGetPointer(chunk);
 	}
@@ -709,7 +708,7 @@ AllocSetAlloc(MemoryContext context, Size size)
 				chunk = (AllocChunk) (block->freeptr);
 
 				/* Prepare to initialize the chunk header. */
-				VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNK_USED);
+				VALGRIND_MAKE_MEM_UNDEFINED(chunk, sizeof(AllocChunkData));
 
 				block->freeptr += (availchunk + ALLOC_CHUNKHDRSZ);
 				availspace -= (availchunk + ALLOC_CHUNKHDRSZ);
@@ -799,7 +798,7 @@ AllocSetAlloc(MemoryContext context, Size size)
 	chunk = (AllocChunk) (block->freeptr);
 
 	/* Prepare to initialize the chunk header. */
-	VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNK_USED);
+	VALGRIND_MAKE_MEM_UNDEFINED(chunk, sizeof(AllocChunkData));
 
 	block->freeptr += (chunk_size + ALLOC_CHUNKHDRSZ);
 	Assert(block->freeptr <= block->endptr);
diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
index 6ad0bb47b6..80b41b0e65 100644
--- a/src/backend/utils/mmgr/mcxt.c
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -389,55 +389,10 @@ MemoryContextAllowInCriticalSection(MemoryContext context, bool allow)
 Size
 GetMemoryChunkSpace(void *pointer)
 {
-	StandardChunkHeader *header;
+	MemoryContext	context = GetMemoryChunkContext(pointer);
 
-	/*
-	 * Try to detect bogus pointers handed to us, poorly though we can.
-	 * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an
-	 * allocated chunk.
-	 */
-	Assert(pointer != NULL);
-	Assert(pointer == (void *) MAXALIGN(pointer));
-
-	/*
-	 * OK, it's probably safe to look at the chunk header.
-	 */
-	header = (StandardChunkHeader *)
-		((char *) pointer - STANDARDCHUNKHEADERSIZE);
-
-	AssertArg(MemoryContextIsValid(header->context));
-
-	return (*header->context->methods->get_chunk_space) (header->context,
-														 pointer);
-}
-
-/*
- * GetMemoryChunkContext
- *		Given a currently-allocated chunk, determine the context
- *		it belongs to.
- */
-MemoryContext
-GetMemoryChunkContext(void *pointer)
-{
-	StandardChunkHeader *header;
-
-	/*
-	 * Try to detect bogus pointers handed to us, poorly though we can.
-	 * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an
-	 * allocated chunk.
-	 */
-	Assert(pointer != NULL);
-	Assert(pointer == (void *) MAXALIGN(pointer));
-
-	/*
-	 * OK, it's probably safe to look at the chunk header.
-	 */
-	header = (StandardChunkHeader *)
-		((char *) pointer - STANDARDCHUNKHEADERSIZE);
-
-	AssertArg(MemoryContextIsValid(header->context));
-
-	return header->context;
+	return (context->methods->get_chunk_space) (context,
+												pointer);
 }
 
 /*
@@ -611,23 +566,9 @@ MemoryContextCheck(MemoryContext context)
 bool
 MemoryContextContains(MemoryContext context, void *pointer)
 {
-	StandardChunkHeader *header;
+	MemoryContext	ptr_context = GetMemoryChunkContext(pointer);
 
-	/*
-	 * Try to detect bogus pointers handed to us, poorly though we can.
-	 * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an
-	 * allocated chunk.
-	 */
-	if (pointer == NULL || pointer != (void *) MAXALIGN(pointer))
-		return false;
-
-	/*
-	 * OK, it's probably safe to look at the chunk header.
-	 */
-	header = (StandardChunkHeader *)
-		((char *) pointer - STANDARDCHUNKHEADERSIZE);
-
-	return header->context == context;
+	return ptr_context == context;
 }
 
 /*--------------------
@@ -991,23 +932,7 @@ palloc_extended(Size size, int flags)
 void
 pfree(void *pointer)
 {
-	MemoryContext context;
-
-	/*
-	 * Try to detect bogus pointers handed to us, poorly though we can.
-	 * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an
-	 * allocated chunk.
-	 */
-	Assert(pointer != NULL);
-	Assert(pointer == (void *) MAXALIGN(pointer));
-
-	/*
-	 * OK, it's probably safe to look at the chunk header.
-	 */
-	context = ((StandardChunkHeader *)
-			   ((char *) pointer - STANDARDCHUNKHEADERSIZE))->context;
-
-	AssertArg(MemoryContextIsValid(context));
+	MemoryContext	context = GetMemoryChunkContext(pointer);
 
 	(*context->methods->free_p) (context, pointer);
 	VALGRIND_MEMPOOL_FREE(context, pointer);
@@ -1020,27 +945,12 @@ pfree(void *pointer)
 void *
 repalloc(void *pointer, Size size)
 {
-	MemoryContext context;
+	MemoryContext	context = GetMemoryChunkContext(pointer);
 	void	   *ret;
 
 	if (!AllocSizeIsValid(size))
 		elog(ERROR, "invalid memory alloc request size %zu", size);
 
-	/*
-	 * Try to detect bogus pointers handed to us, poorly though we can.
-	 * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an
-	 * allocated chunk.
-	 */
-	Assert(pointer != NULL);
-	Assert(pointer == (void *) MAXALIGN(pointer));
-
-	/*
-	 * OK, it's probably safe to look at the chunk header.
-	 */
-	context = ((StandardChunkHeader *)
-			   ((char *) pointer - STANDARDCHUNKHEADERSIZE))->context;
-
-	AssertArg(MemoryContextIsValid(context));
 	AssertNotInCriticalSection(context);
 
 	/* isReset must be false already */
@@ -1103,27 +1013,12 @@ MemoryContextAllocHuge(MemoryContext context, Size size)
 void *
 repalloc_huge(void *pointer, Size size)
 {
-	MemoryContext context;
+	MemoryContext	context = GetMemoryChunkContext(pointer);
 	void	   *ret;
 
 	if (!AllocHugeSizeIsValid(size))
 		elog(ERROR, "invalid memory alloc request size %zu", size);
 
-	/*
-	 * Try to detect bogus pointers handed to us, poorly though we can.
-	 * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an
-	 * allocated chunk.
-	 */
-	Assert(pointer != NULL);
-	Assert(pointer == (void *) MAXALIGN(pointer));
-
-	/*
-	 * OK, it's probably safe to look at the chunk header.
-	 */
-	context = ((StandardChunkHeader *)
-			   ((char *) pointer - STANDARDCHUNKHEADERSIZE))->context;
-
-	AssertArg(MemoryContextIsValid(context));
 	AssertNotInCriticalSection(context);
 
 	/* isReset must be false already */
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
index a5e140eef7..99956fc279 100644
--- a/src/backend/utils/mmgr/slab.c
+++ b/src/backend/utils/mmgr/slab.c
@@ -57,12 +57,6 @@
 #include "lib/ilist.h"
 
 
-#define SLAB_CHUNKHDRSZ MAXALIGN(sizeof(SlabChunk))
-
-/* Portion of SLAB_CHUNKHDRSZ excluding trailing padding. */
-#define SLAB_CHUNK_USED \
-	(offsetof(SlabChunk, header) + sizeof(StandardChunkHeader))
-
 /*
  * SlabContext is a specialized implementation of MemoryContext.
  */
@@ -103,17 +97,15 @@ typedef struct SlabChunk
 {
 	/* block owning this chunk */
 	void	   *block;
-
-	/* include StandardChunkHeader because mcxt.c expects that */
-	StandardChunkHeader header;
-
+	SlabContext	*slab;        /* owning context */
+	/* there must not be any padding to reach a MAXALIGN boundary here! */
 } SlabChunk;
 
 
 #define SlabPointerGetChunk(ptr)	\
-	((SlabChunk *)(((char *)(ptr)) - SLAB_CHUNKHDRSZ))
+	((SlabChunk *)(((char *)(ptr)) - sizeof(SlabChunk)))
 #define SlabChunkGetPointer(chk)	\
-	((void *)(((char *)(chk)) + SLAB_CHUNKHDRSZ))
+	((void *)(((char *)(chk)) + sizeof(SlabChunk)))
 #define SlabBlockGetChunk(slab, block, idx) \
 	((SlabChunk *) ((char *) (block) + sizeof(SlabBlock)	\
 					+ (idx * slab->fullChunkSize)))
@@ -198,6 +190,10 @@ SlabContextCreate(MemoryContext parent,
 	Size		freelistSize;
 	SlabContext *slab;
 
+	StaticAssertStmt(offsetof(SlabChunk, slab) + sizeof(MemoryContext) ==
+					 MAXALIGN(sizeof(SlabChunk)),
+					 "padding calculation in SlabChunk is wrong");
+
 	/* otherwise the linked list inside freed chunk isn't guaranteed to fit */
 	StaticAssertStmt(MAXIMUM_ALIGNOF >= sizeof(int),
 					 "MAXALIGN too small to fit int32");
@@ -207,7 +203,7 @@ SlabContextCreate(MemoryContext parent,
 
 	/* Make sure the block can store at least one chunk. */
 	if (blockSize - sizeof(SlabBlock) < fullChunkSize)
-		elog(ERROR, "block size %ld for slab is too small for %ld chunks",
+		elog(ERROR, "block size %zu for slab is too small for %zu chunks",
 			 blockSize, chunkSize);
 
 	/* Compute maximum number of chunks per block */
@@ -333,7 +329,7 @@ SlabAlloc(MemoryContext context, Size size)
 
 	/* make sure we only allow correct request size */
 	if (size != slab->chunkSize)
-		elog(ERROR, "unexpected alloc chunk size %ld (expected %ld)",
+		elog(ERROR, "unexpected alloc chunk size %zu (expected %zu)",
 			 size, slab->chunkSize);
 
 	/*
@@ -445,19 +441,14 @@ SlabAlloc(MemoryContext context, Size size)
 		slab->minFreeChunks = 0;
 
 	/* Prepare to initialize the chunk header. */
-	VALGRIND_MAKE_MEM_UNDEFINED(chunk, SLAB_CHUNK_USED);
+	VALGRIND_MAKE_MEM_UNDEFINED(chunk, sizeof(SlabChunk));
 
 	chunk->block = (void *) block;
-
-	chunk->header.context = (MemoryContext) slab;
-	chunk->header.size = MAXALIGN(size);
+	chunk->slab = slab;
 
 #ifdef MEMORY_CONTEXT_CHECKING
-	chunk->header.requested_size = size;
-	VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
-							   sizeof(chunk->header.requested_size));
 	/* slab mark to catch clobber of "unused" space */
-	if (size < chunk->header.size)
+	if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk)))
 		set_sentinel(SlabChunkGetPointer(chunk), size);
 #endif
 #ifdef RANDOMIZE_ALLOCATED_MEMORY
@@ -484,11 +475,9 @@ SlabFree(MemoryContext context, void *pointer)
 	SlabFreeInfo(slab, chunk);
 
 #ifdef MEMORY_CONTEXT_CHECKING
-	VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
-							  sizeof(chunk->header.requested_size));
 	/* Test for someone scribbling on unused space in chunk */
-	if (chunk->header.requested_size < chunk->header.size)
-		if (!sentinel_ok(pointer, chunk->header.requested_size))
+	if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk)))
+		if (!sentinel_ok(pointer, slab->chunkSize))
 			elog(WARNING, "detected write past chunk end in %s %p",
 				 slab->header.name, chunk);
 #endif
@@ -507,12 +496,7 @@ SlabFree(MemoryContext context, void *pointer)
 #ifdef CLOBBER_FREED_MEMORY
 	/* XXX don't wipe the int32 index, used for block-level freelist */
 	wipe_mem((char *) pointer + sizeof(int32),
-			 chunk->header.size - sizeof(int32));
-#endif
-
-#ifdef MEMORY_CONTEXT_CHECKING
-	/* Reset requested_size to 0 in chunks that are on freelist */
-	chunk->header.requested_size = 0;
+			 slab->chunkSize - sizeof(int32));
 #endif
 
 	/* remove the block from a freelist */
@@ -590,9 +574,11 @@ SlabRealloc(MemoryContext context, void *pointer, Size size)
 static Size
 SlabGetChunkSpace(MemoryContext context, void *pointer)
 {
-	SlabChunk  *chunk = SlabPointerGetChunk(pointer);
+	SlabContext *slab = castNode(SlabContext, context);
 
-	return chunk->header.size + SLAB_CHUNKHDRSZ;
+	Assert(slab);
+
+	return slab->fullChunkSize;
 }
 
 /*
@@ -742,37 +728,20 @@ SlabCheck(MemoryContext context)
 				{
 					SlabChunk  *chunk = SlabBlockGetChunk(slab, block, j);
 
-					VALGRIND_MAKE_MEM_DEFINED(&chunk->header.requested_size,
-									   sizeof(chunk->header.requested_size));
-
-					/* we're in a no-freelist branch */
-					VALGRIND_MAKE_MEM_NOACCESS(&chunk->header.requested_size,
-									   sizeof(chunk->header.requested_size));
-
 					/* chunks have both block and slab pointers, so check both */
 					if (chunk->block != block)
 						elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p",
 							 name, block, chunk);
 
-					if (chunk->header.context != (MemoryContext) slab)
+					if (chunk->slab != slab)
 						elog(WARNING, "problem in slab %s: bogus slab link in block %p, chunk %p",
 							 name, block, chunk);
 
-					/* now make sure the chunk size is correct */
-					if (chunk->header.size != MAXALIGN(slab->chunkSize))
-						elog(WARNING, "problem in slab %s: bogus chunk size in block %p, chunk %p",
-							 name, block, chunk);
-
-					/* now make sure the chunk size is correct */
-					if (chunk->header.requested_size != slab->chunkSize)
-						elog(WARNING, "problem in slab %s: bogus chunk requested size in block %p, chunk %p",
-							 name, block, chunk);
-
 					/* there might be sentinel (thanks to alignment) */
-					if (chunk->header.requested_size < chunk->header.size &&
-						!sentinel_ok(chunk, SLAB_CHUNKHDRSZ + chunk->header.requested_size))
-						elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
-							 name, block, chunk);
+					if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk)))
+						if (!sentinel_ok(chunk, slab->chunkSize))
+							elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
+								 name, block, chunk);
 				}
 			}
 
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 5223a4da39..930e469858 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -45,27 +45,6 @@
 
 #define AllocHugeSizeIsValid(size)	((Size) (size) <= MaxAllocHugeSize)
 
-/*
- * All chunks allocated by any memory context manager are required to be
- * preceded by a StandardChunkHeader at a spacing of STANDARDCHUNKHEADERSIZE.
- * A currently-allocated chunk must contain a backpointer to its owning
- * context as well as the allocated size of the chunk.  The backpointer is
- * used by pfree() and repalloc() to find the context to call.  The allocated
- * size is not absolutely essential, but it's expected to be needed by any
- * reasonable implementation.
- */
-typedef struct StandardChunkHeader
-{
-	MemoryContext context;		/* owning context */
-	Size		size;			/* size of data space allocated in chunk */
-#ifdef MEMORY_CONTEXT_CHECKING
-	/* when debugging memory usage, also store actual requested size */
-	Size		requested_size;
-#endif
-} StandardChunkHeader;
-
-#define STANDARDCHUNKHEADERSIZE  MAXALIGN(sizeof(StandardChunkHeader))
-
 
 /*
  * Standard top-level memory contexts.
@@ -100,7 +79,6 @@ extern void MemoryContextDeleteChildren(MemoryContext context);
 extern void MemoryContextSetParent(MemoryContext context,
 					   MemoryContext new_parent);
 extern Size GetMemoryChunkSpace(void *pointer);
-extern MemoryContext GetMemoryChunkContext(void *pointer);
 extern MemoryContext MemoryContextGetParent(MemoryContext context);
 extern bool MemoryContextIsEmpty(MemoryContext context);
 extern void MemoryContextStats(MemoryContext context);
@@ -114,6 +92,40 @@ extern void MemoryContextCheck(MemoryContext context);
 extern bool MemoryContextContains(MemoryContext context, void *pointer);
 
 /*
+ * GetMemoryChunkContext
+ *		Given a currently-allocated chunk, determine the context
+ *		it belongs to.
+ *
+ * All chunks allocated by any memory context manager are required to be
+ * preceded by the corresponding MemoryContext stored, without padding, in the
+ * preceding sizeof(void*) bytes.  A currently-allocated chunk must contain a
+ * backpointer to its owning context.  The backpointer is used by pfree() and
+ * repalloc() to find the context to call.
+ */
+static inline MemoryContext
+GetMemoryChunkContext(void *pointer)
+{
+	MemoryContext	context;
+
+	/*
+	 * Try to detect bogus pointers handed to us, poorly though we can.
+	 * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an
+	 * allocated chunk.
+	 */
+	Assert(pointer != NULL);
+	Assert(pointer == (void *) MAXALIGN(pointer));
+
+	/*
+	 * OK, it's probably safe to look at the context.
+	 */
+	context = *(MemoryContext*) (((char *) pointer) - sizeof(void *));
+
+	AssertArg(MemoryContextIsValid(context));
+
+	return context;
+}
+
+/*
  * This routine handles the context-type-independent part of memory
  * context creation.  It's intended to be called from context-type-
  * specific creation routines, and noplace else.
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 1fd7ec4256..6717eccbb0 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2001,7 +2001,6 @@ SplitLR
 SplitVar
 SplitedPageLayout
 StackElem
-StandardChunkHeader
 StartBlobPtr
 StartBlobsPtr
 StartDataPtr
-- 
2.11.0.22.g8d7a455.dirty

#76Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#75)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

Hi,

On 2017-02-27 23:44:20 -0800, Andres Freund wrote:

*preliminary* patch attached. This needs a good bit of polishing
(primarily comment work, verifying that valgrind works), but I'm too
tired now.

I'm not quite sure how to deal with mmgr/README - it's written as kind
of a historical document, and the "Mechanisms to Allow Multiple Types of
Contexts" is already quite out of date. I think it'd be good to rip out
all the historical references and just describe the current state, but
I'm not really enthusiastic about tackling that :/

While still not enthusiastic, I took a stab at doing so. While still
not perfect, I do think this is an improvement.

Is anybody uncomfortable going away from the current historical account
style?

- Andres

Attachments:

0002-Overhaul-mmgr-s-README.patchtext/x-patch; charset=us-asciiDownload
From 5eef2abe5e593b6a9b072b58bbecadbe15689a01 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 28 Feb 2017 10:36:29 -0800
Subject: [PATCH 2/2] Overhaul mmgr/'s README.

The README was written as a "historical account", and that style
hasn't aged particularly well.  Rephrase it to describe the current
situation, instead of having various version specific comments.

This also updates the description of how allocated chunks are
associated with their corresponding context, the method of which has
changed with the previous commit.

Author: Andres Freund
Discussion: https://postgr.es/m/d15dff83-0b37-28ed-0809-95a5cc7292ad@2ndquadrant.com
---
 src/backend/utils/mmgr/README | 292 +++++++++++++++++++-----------------------
 1 file changed, 132 insertions(+), 160 deletions(-)

diff --git a/src/backend/utils/mmgr/README b/src/backend/utils/mmgr/README
index f97d7653de..b83b29c268 100644
--- a/src/backend/utils/mmgr/README
+++ b/src/backend/utils/mmgr/README
@@ -1,15 +1,7 @@
 src/backend/utils/mmgr/README
 
-Notes About Memory Allocation Redesign
-======================================
-
-Up through version 7.0, Postgres had serious problems with memory leakage
-during large queries that process a lot of pass-by-reference data.  There
-was no provision for recycling memory until end of query.  This needed to be
-fixed, even more so with the advent of TOAST which allows very large chunks
-of data to be passed around in the system.  This document describes the new
-memory management system implemented in 7.1.
-
+Memory Context System Design Overview
+=====================================
 
 Background
 ----------
@@ -38,10 +30,10 @@ to or get more memory from the same context the chunk was originally
 allocated in.
 
 At all times there is a "current" context denoted by the
-CurrentMemoryContext global variable.  The backend macro palloc()
-implicitly allocates space in that context.  The MemoryContextSwitchTo()
-operation selects a new current context (and returns the previous context,
-so that the caller can restore the previous context before exiting).
+CurrentMemoryContext global variable.  palloc() implicitly allocates space
+in that context.  The MemoryContextSwitchTo() operation selects a new current
+context (and returns the previous context, so that the caller can restore the
+previous context before exiting).
 
 The main advantage of memory contexts over plain use of malloc/free is
 that the entire contents of a memory context can be freed easily, without
@@ -60,8 +52,10 @@ The behavior of palloc and friends is similar to the standard C library's
 malloc and friends, but there are some deliberate differences too.  Here
 are some notes to clarify the behavior.
 
-* If out of memory, palloc and repalloc exit via elog(ERROR).  They never
-return NULL, and it is not necessary or useful to test for such a result.
+* If out of memory, palloc and repalloc exit via elog(ERROR).  They
+never return NULL, and it is not necessary or useful to test for such
+a result.  With palloc_extended() that behavior can be overridden
+using the MCXT_ALLOC_NO_OOM flag.
 
 * palloc(0) is explicitly a valid operation.  It does not return a NULL
 pointer, but a valid chunk of which no bytes may be used.  However, the
@@ -71,28 +65,18 @@ error.  Similarly, repalloc allows realloc'ing to zero size.
 * pfree and repalloc do not accept a NULL pointer.  This is intentional.
 
 
-pfree/repalloc No Longer Depend On CurrentMemoryContext
--------------------------------------------------------
+The Current Memory Context
+--------------------------
 
-Since Postgres 7.1, pfree() and repalloc() can be applied to any chunk
-whether it belongs to CurrentMemoryContext or not --- the chunk's owning
-context will be invoked to handle the operation, regardless.  This is a
-change from the old requirement that CurrentMemoryContext must be set
-to the same context the memory was allocated from before one can use
-pfree() or repalloc().
-
-There was some consideration of getting rid of CurrentMemoryContext entirely,
-instead requiring the target memory context for allocation to be specified
-explicitly.  But we decided that would be too much notational overhead ---
-we'd have to pass an appropriate memory context to called routines in
-many places.  For example, the copyObject routines would need to be passed
-a context, as would function execution routines that return a
-pass-by-reference datatype.  And what of routines that temporarily
-allocate space internally, but don't return it to their caller?  We
-certainly don't want to clutter every call in the system with "here is
-a context to use for any temporary memory allocation you might want to
-do".  So there'd still need to be a global variable specifying a suitable
-temporary-allocation context.  That might as well be CurrentMemoryContext.
+Because it would be too much notational overhead to always pass an
+appropriate memory context to called routines, there always exists the
+notion of the current memory context CurrentMemoryContext.  Without it,
+for example, the copyObject routines would need to be passed a context, as
+would function execution routines that return a pass-by-reference
+datatype.  Similarly for routines that temporarily allocate space
+internally, but don't return it to their caller?  We certainly don't
+want to clutter every call in the system with "here is a context to
+use for any temporary memory allocation you might want to do".
 
 The upshot of that reasoning, though, is that CurrentMemoryContext should
 generally point at a short-lifespan context if at all possible.  During
@@ -102,42 +86,83 @@ context having greater than transaction lifespan, since doing so risks
 permanent memory leaks.
 
 
-Additions to the Memory-Context Mechanism
------------------------------------------
+pfree/repalloc Do Not Depend On CurrentMemoryContext
+----------------------------------------------------
 
-Before 7.1 memory contexts were all independent, but it was too hard to
-keep track of them; with lots of contexts there needs to be explicit
-mechanism for that.
+pfree() and repalloc() can be applied to any chunk whether it belongs
+to CurrentMemoryContext or not --- the chunk's owning context will be
+invoked to handle the operation, regardless.
 
-We solved this by creating a tree of "parent" and "child" contexts.  When
-creating a memory context, the new context can be specified to be a child
-of some existing context.  A context can have many children, but only one
-parent.  In this way the contexts form a forest (not necessarily a single
-tree, since there could be more than one top-level context; although in
-current practice there is only one top context, TopMemoryContext).
 
-We then say that resetting or deleting any particular context resets or
-deletes all its direct and indirect children as well.  This feature allows
-us to manage a lot of contexts without fear that some will be leaked; we
-only need to keep track of one top-level context that we are going to
-delete at transaction end, and make sure that any shorter-lived contexts
-we create are descendants of that context.  Since the tree can have
-multiple levels, we can deal easily with nested lifetimes of storage,
-such as per-transaction, per-statement, per-scan, per-tuple.  Storage
-lifetimes that only partially overlap can be handled by allocating
-from different trees of the context forest (there are some examples
-in the next section).
+"Parent" and "Child" Contexts
+-----------------------------
 
-Actually, it turns out that resetting a given context should almost
-always imply deleting, not just resetting, any child contexts it has.
-So MemoryContextReset() means that, and if you really do want a tree of
-empty contexts you need to call MemoryContextResetOnly() plus
-MemoryContextResetChildren().
+If all contexts were independent, it'd be hard to keep track of them,
+especially in error cases.  That is solved this by creating a tree of
+"parent" and "child" contexts.  When creating a memory context, the
+new context can be specified to be a child of some existing context.
+A context can have many children, but only one parent.  In this way
+the contexts form a forest (not necessarily a single tree, since there
+could be more than one top-level context; although in current practice
+there is only one top context, TopMemoryContext).
+
+Deleting a context deletes all its direct and indirect children as
+well.  When resetting a context it's almost always more useful to
+delete child contexts, thus MemoryContextReset() means that, and if
+you really do want a tree of empty contexts you need to call
+MemoryContextResetOnly() plus MemoryContextResetChildren().
+
+These features allow us to manage a lot of contexts without fear that
+some will be leaked; we only need to keep track of one top-level
+context that we are going to delete at transaction end, and make sure
+that any shorter-lived contexts we create are descendants of that
+context.  Since the tree can have multiple levels, we can deal easily
+with nested lifetimes of storage, such as per-transaction,
+per-statement, per-scan, per-tuple.  Storage lifetimes that only
+partially overlap can be handled by allocating from different trees of
+the context forest (there are some examples in the next section).
 
 For convenience we also provide operations like "reset/delete all children
 of a given context, but don't reset or delete that context itself".
 
 
+Memory Context Reset/Delete Callbacks
+-------------------------------------
+
+A feature introduced in Postgres 9.5 allows memory contexts to be used
+for managing more resources than just plain palloc'd memory.  This is
+done by registering a "reset callback function" for a memory context.
+Such a function will be called, once, just before the context is next
+reset or deleted.  It can be used to give up resources that are in some
+sense associated with an object allocated within the context.  Possible
+use-cases include
+* closing open files associated with a tuplesort object;
+* releasing reference counts on long-lived cache objects that are held
+  by some object within the context being reset;
+* freeing malloc-managed memory associated with some palloc'd object.
+That last case would just represent bad programming practice for pure
+Postgres code; better to have made all the allocations using palloc,
+in the target context or some child context.  However, it could well
+come in handy for code that interfaces to non-Postgres libraries.
+
+Any number of reset callbacks can be established for a memory context;
+they are called in reverse order of registration.  Also, callbacks
+attached to child contexts are called before callbacks attached to
+parent contexts, if a tree of contexts is being reset or deleted.
+
+The API for this requires the caller to provide a MemoryContextCallback
+memory chunk to hold the state for a callback.  Typically this should be
+allocated in the same context it is logically attached to, so that it
+will be released automatically after use.  The reason for asking the
+caller to provide this memory is that in most usage scenarios, the caller
+will be creating some larger struct within the target context, and the
+MemoryContextCallback struct can be made "for free" without a separate
+palloc() call by including it in this larger struct.
+
+
+Memory Contexts in Practice
+===========================
+
 Globally Known Contexts
 -----------------------
 
@@ -325,83 +350,64 @@ copy step.
 Mechanisms to Allow Multiple Types of Contexts
 ----------------------------------------------
 
-We may want several different types of memory contexts with different
-allocation policies but similar external behavior.  To handle this,
-memory allocation functions will be accessed via function pointers,
-and we will require all context types to obey the conventions given here.
-(As of 2015, there's actually still just one context type; but interest in
-creating other types has never gone away entirely, so we retain this API.)
+To efficiently allow for different allocation patterns, and for
+experimentation, we allow for different types of memory contexts with
+different allocation policies but similar external behavior.  To
+handle this, memory allocation functions are accessed via function
+pointers, and we require all context types to obey the conventions
+given here.
 
-A memory context is represented by an object like
+A memory context is represented by struct MemoryContextData (see
+memnodes.h). This struct identifies the exact type of the context, and
+contains information common between the different types of
+MemoryContext like the parent and child contexts, and the name of the
+context.
 
-typedef struct MemoryContextData
-{
-    NodeTag        type;           /* identifies exact kind of context */
-    MemoryContextMethods methods;
-    MemoryContextData *parent;     /* NULL if no parent (toplevel context) */
-    MemoryContextData *firstchild; /* head of linked list of children */
-    MemoryContextData *nextchild;  /* next child of same parent */
-    char          *name;           /* context name (just for debugging) */
-} MemoryContextData, *MemoryContext;
-
-This is essentially an abstract superclass, and the "methods" pointer is
-its virtual function table.  Specific memory context types will use
+This is essentially an abstract superclass, and the behavior is
+determined by the "methods" pointer is its virtual function table
+(struct MemoryContextMethods).  Specific memory context types will use
 derived structs having these fields as their first fields.  All the
-contexts of a specific type will have methods pointers that point to the
-same static table of function pointers, which look like
+contexts of a specific type will have methods pointers that point to
+the same static table of function pointers.
 
-typedef struct MemoryContextMethodsData
-{
-    Pointer     (*alloc) (MemoryContext c, Size size);
-    void        (*free_p) (Pointer chunk);
-    Pointer     (*realloc) (Pointer chunk, Size newsize);
-    void        (*reset) (MemoryContext c);
-    void        (*delete) (MemoryContext c);
-} MemoryContextMethodsData, *MemoryContextMethods;
+While operations like allocating from and resetting a context take the
+relevant MemoryContext as a parameter, operations like free and
+realloc are trickier.  To make those work, we require all memory
+context types to produce allocated chunks that are immediately,
+without any padding, preceded by a pointer to the corresponding
+MemoryContext.
 
-Alloc, reset, and delete requests will take a MemoryContext pointer
-as parameter, so they'll have no trouble finding the method pointer
-to call.  Free and realloc are trickier.  To make those work, we
-require all memory context types to produce allocated chunks that
-are immediately preceded by a standard chunk header, which has the
-layout
+If a type of allocator needs additional information about its chunks,
+like e.g. the size of the allocation, that information can in turn
+precede the MemoryContext.  This means the only overhead implied by
+the memory context mechanism is a pointer to its context, so we're not
+constraining context-type designers very much.
 
-typedef struct StandardChunkHeader
-{
-    MemoryContext mycontext;         /* Link to owning context object */
-    Size          size;              /* Allocated size of chunk */
-};
+Given this, routines like pfree their corresponding context with an
+operation like (although that is usually encapsulated in
+GetMemoryChunkContext())
 
-It turns out that the pre-existing aset.c memory context type did this
-already, and probably any other kind of context would need to have the
-same data available to support realloc, so this is not really creating
-any additional overhead.  (Note that if a context type needs more per-
-allocated-chunk information than this, it can make an additional
-nonstandard header that precedes the standard header.  So we're not
-constraining context-type designers very much.)
+    MemoryContext context = *(MemoryContext*) (((char *) pointer) - sizeof(void *));
 
-Given this, the pfree routine looks something like
+and then invoke the corresponding method for the context
 
-    StandardChunkHeader * header =
-        (StandardChunkHeader *) ((char *) p - sizeof(StandardChunkHeader));
-
-    (*header->mycontext->methods->free_p) (p);
+    (*context->methods->free_p) (p);
 
 
 More Control Over aset.c Behavior
 ---------------------------------
 
-Previously, aset.c always allocated an 8K block upon the first allocation
-in a context, and doubled that size for each successive block request.
-That's good behavior for a context that might hold *lots* of data, and
-the overhead wasn't bad when we had only a few contexts in existence.
-With dozens if not hundreds of smaller contexts in the system, we need
-to be able to fine-tune things a little better.
+By default aset.c always allocates an 8K block upon the first
+allocation in a context, and doubles that size for each successive
+block request.  That's good behavior for a context that might hold
+*lots* of data.  But if there are dozens if not hundreds of smaller
+contexts in the system, we need to be able to fine-tune things a
+little better.
 
-The creator of a context is now able to specify an initial block size
-and a maximum block size.  Selecting smaller values can prevent wastage
-of space in contexts that aren't expected to hold very much (an example is
-the relcache's per-relation contexts).
+The creator of a context is able to specify an initial block size and
+a maximum block size.  Selecting smaller values can prevent wastage of
+space in contexts that aren't expected to hold very much (an example
+is the relcache's per-relation contexts).
 
 Also, it is possible to specify a minimum context size.  If this
 value is greater than zero then a block of that size will be grabbed
@@ -414,37 +420,3 @@ will not allocate very much space per tuple cycle.  To make this usage
 pattern cheap, the first block allocated in a context is not given
 back to malloc() during reset, but just cleared.  This avoids malloc
 thrashing.
-
-
-Memory Context Reset/Delete Callbacks
--------------------------------------
-
-A feature introduced in Postgres 9.5 allows memory contexts to be used
-for managing more resources than just plain palloc'd memory.  This is
-done by registering a "reset callback function" for a memory context.
-Such a function will be called, once, just before the context is next
-reset or deleted.  It can be used to give up resources that are in some
-sense associated with an object allocated within the context.  Possible
-use-cases include
-* closing open files associated with a tuplesort object;
-* releasing reference counts on long-lived cache objects that are held
-  by some object within the context being reset;
-* freeing malloc-managed memory associated with some palloc'd object.
-That last case would just represent bad programming practice for pure
-Postgres code; better to have made all the allocations using palloc,
-in the target context or some child context.  However, it could well
-come in handy for code that interfaces to non-Postgres libraries.
-
-Any number of reset callbacks can be established for a memory context;
-they are called in reverse order of registration.  Also, callbacks
-attached to child contexts are called before callbacks attached to
-parent contexts, if a tree of contexts is being reset or deleted.
-
-The API for this requires the caller to provide a MemoryContextCallback
-memory chunk to hold the state for a callback.  Typically this should be
-allocated in the same context it is logically attached to, so that it
-will be released automatically after use.  The reason for asking the
-caller to provide this memory is that in most usage scenarios, the caller
-will be creating some larger struct within the target context, and the
-MemoryContextCallback struct can be made "for free" without a separate
-palloc() call by including it in this larger struct.
-- 
2.11.0.22.g8d7a455.dirty

#77Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#76)
Re: PATCH: two slab-like memory allocators

On 2017-02-28 10:41:22 -0800, Andres Freund wrote:

Hi,

On 2017-02-27 23:44:20 -0800, Andres Freund wrote:

*preliminary* patch attached. This needs a good bit of polishing
(primarily comment work, verifying that valgrind works), but I'm too
tired now.

I'm not quite sure how to deal with mmgr/README - it's written as kind
of a historical document, and the "Mechanisms to Allow Multiple Types of
Contexts" is already quite out of date. I think it'd be good to rip out
all the historical references and just describe the current state, but
I'm not really enthusiastic about tackling that :/

While still not enthusiastic, I took a stab at doing so. While still
not perfect, I do think this is an improvement.

Is anybody uncomfortable going away from the current historical account
style?

I've pushed these now. I'm not claiming that the README revision is
perfect, but we can incremently improve it further...

- Andres, hoping the buildfarm turns greener

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#78Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#77)
Re: PATCH: two slab-like memory allocators

On 2017-02-28 20:18:35 -0800, Andres Freund wrote:

- Andres, hoping the buildfarm turns greener

Oh well, that didn't work. Investigating.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#79Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#78)
Re: PATCH: two slab-like memory allocators

On 2017-02-28 20:29:36 -0800, Andres Freund wrote:

On 2017-02-28 20:18:35 -0800, Andres Freund wrote:

- Andres, hoping the buildfarm turns greener

Oh well, that didn't work. Investigating.

The fix for that was fairly trivial, and the buildfarm has cooled down.

The issue was that on 32bit platforms the Datum returned by some
functions (int2int4_sum in this case) isn't actually a separately
allocated Datum, but rather just something embedded in a larger
struct. That, combined with the following code:
if (!peraggstate->resulttypeByVal && !*isnull &&
!MemoryContextContains(CurrentMemoryContext,
DatumGetPointer(*result)))
seems somewhat problematic to me. MemoryContextContains() can give
false positives when used on memory that's not a distinctly allocated
chunk, and if so, we violate memory lifetime rules. It's quite
unlikely, given the required bit patterns, but nonetheless it's making
me somewhat uncomfortable.

Do others think this isn't an issue and we can just live with it?

Regards,

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#80Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#77)
Re: PATCH: two slab-like memory allocators

On 03/01/2017 05:18 AM, Andres Freund wrote:

On 2017-02-28 10:41:22 -0800, Andres Freund wrote:

Hi,

On 2017-02-27 23:44:20 -0800, Andres Freund wrote:

*preliminary* patch attached. This needs a good bit of polishing
(primarily comment work, verifying that valgrind works), but I'm too
tired now.

I'm not quite sure how to deal with mmgr/README - it's written as kind
of a historical document, and the "Mechanisms to Allow Multiple Types of
Contexts" is already quite out of date. I think it'd be good to rip out
all the historical references and just describe the current state, but
I'm not really enthusiastic about tackling that :/

While still not enthusiastic, I took a stab at doing so. While still
not perfect, I do think this is an improvement.

Is anybody uncomfortable going away from the current historical account
style?

I've pushed these now. I'm not claiming that the README revision is
perfect, but we can incremently improve it further...

Thanks. I went through the README and it definitely looks better now.

I've noticed two minor typos:

1) That is solved this by creating ...
- extra "this"

2) Given this, routines like pfree their corresponding context ...
- missing "find" or "determine"

I also see you've explicitly mentioned the callbacks were added in 9.5.
Doesn't that somewhat reintroduce the historical account?

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#81Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#79)
Re: PATCH: two slab-like memory allocators

On 03/01/2017 11:55 PM, Andres Freund wrote:

On 2017-02-28 20:29:36 -0800, Andres Freund wrote:

On 2017-02-28 20:18:35 -0800, Andres Freund wrote:

- Andres, hoping the buildfarm turns greener

Oh well, that didn't work. Investigating.

The fix for that was fairly trivial, and the buildfarm has cooled down.

The issue was that on 32bit platforms the Datum returned by some
functions (int2int4_sum in this case) isn't actually a separately
allocated Datum, but rather just something embedded in a larger
struct. That, combined with the following code:
if (!peraggstate->resulttypeByVal && !*isnull &&
!MemoryContextContains(CurrentMemoryContext,
DatumGetPointer(*result)))
seems somewhat problematic to me. MemoryContextContains() can give
false positives when used on memory that's not a distinctly allocated
chunk, and if so, we violate memory lifetime rules. It's quite
unlikely, given the required bit patterns, but nonetheless it's making
me somewhat uncomfortable.

I assume you're only using that code snippet as an example of code that
might be broken by MemoryContextContains() false positives, right?

(I don't see how the slab allocator could interfere with aggregates, as
it's only used for reorderbuffer.c).

Do others think this isn't an issue and we can just live with it?

My understanding is all the places calling MemoryContextContains()
assume they can't receive memory not allocated as a simple chunk by
palloc(). If that's not the case, it's likely broken.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#82Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#80)
Re: PATCH: two slab-like memory allocators

On 2017-03-02 04:36:23 +0100, Tomas Vondra wrote:

I've noticed two minor typos:

1) That is solved this by creating ...
- extra "this"

2) Given this, routines like pfree their corresponding context ...
- missing "find" or "determine"

Will fix.

I also see you've explicitly mentioned the callbacks were added in 9.5.
Doesn't that somewhat reintroduce the historical account?

That section I just moved up, the version reference was there before. I
left it in, because it seemed new enough to still be somewhat
relevant; removed and added it, not sure what's better.

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#83Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#81)
Re: PATCH: two slab-like memory allocators

On 2017-03-02 04:47:13 +0100, Tomas Vondra wrote:

On 03/01/2017 11:55 PM, Andres Freund wrote:

On 2017-02-28 20:29:36 -0800, Andres Freund wrote:

On 2017-02-28 20:18:35 -0800, Andres Freund wrote:

- Andres, hoping the buildfarm turns greener

Oh well, that didn't work. Investigating.

The fix for that was fairly trivial, and the buildfarm has cooled down.

The issue was that on 32bit platforms the Datum returned by some
functions (int2int4_sum in this case) isn't actually a separately
allocated Datum, but rather just something embedded in a larger
struct. That, combined with the following code:
if (!peraggstate->resulttypeByVal && !*isnull &&
!MemoryContextContains(CurrentMemoryContext,
DatumGetPointer(*result)))
seems somewhat problematic to me. MemoryContextContains() can give
false positives when used on memory that's not a distinctly allocated
chunk, and if so, we violate memory lifetime rules. It's quite
unlikely, given the required bit patterns, but nonetheless it's making
me somewhat uncomfortable.

I assume you're only using that code snippet as an example of code that
might be broken by MemoryContextContains() false positives, right?

I'm mentioning that piece of code because it's what temporarily caused
all 32bit animals to fail, when I had made MemoryContextContains() less
forgiving.

(I don't see how the slab allocator could interfere with aggregates, as it's
only used for reorderbuffer.c).

Indeed, this is independent of slab.c. I just came across it because I
triggered crashes when shrinking the StandardChunkHeader to be just the
chunk's MemoryContext.

Do others think this isn't an issue and we can just live with it?

My understanding is all the places calling MemoryContextContains() assume
they can't receive memory not allocated as a simple chunk by palloc(). If
that's not the case, it's likely broken.

Yea, that's my conclusion too. Which means nodeAgg.c and nodeWindowAgg.c
are broken afaics, because of e.g. int2int4_sum's() use of
Int64GetDatumFast() on sub-parts of larger allocations.

- Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#84Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#78)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

On 03/01/2017 05:29 AM, Andres Freund wrote:

On 2017-02-28 20:18:35 -0800, Andres Freund wrote:

- Andres, hoping the buildfarm turns greener

Oh well, that didn't work. Investigating.

Attaches is the last part of the patch series, rebased to current master
and adopting the new chunk header approach. Unlike Slab, this context
needs the whole AllocSet header (size, requested_size), and also the
block pointer, so no padding seems to be needed.

I've tested this on x86-64 and amrv7l, and the test_decoding test suite
passes on both.

FWIW, I'm still not entirely happy with the name "Generation". I agree
with Andres that it's perhaps a bit too generic, but more importantly
the name might actually be a bit obsolete. There used to be generations
of chunks, but that's gone. Now it simply does not reuse the chunks at
all, and frees the blocks when they get empty.

It's not entirely FIFO though, because the transactions interleave, so
later blocks may be released first. But the "allocated close, freed
close" is still there. So perhaps something like "TemporalSet" or
something like that would be a better name?

Man, naming things is hard ...

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

gen-context.patchbinary/octet-stream; name=gen-context.patchDownload
#85Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#82)
Re: PATCH: two slab-like memory allocators

On 2017-03-01 22:19:30 -0800, Andres Freund wrote:

On 2017-03-02 04:36:23 +0100, Tomas Vondra wrote:

I've noticed two minor typos:

1) That is solved this by creating ...
- extra "this"

2) Given this, routines like pfree their corresponding context ...
- missing "find" or "determine"

Will fix.

And done.

I also see you've explicitly mentioned the callbacks were added in 9.5.
Doesn't that somewhat reintroduce the historical account?

That section I just moved up, the version reference was there before. I
left it in, because it seemed new enough to still be somewhat
relevant; removed and added it, not sure what's better.

Left that in place for now.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#86Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#85)
Re: PATCH: two slab-like memory allocators

On 03/04/2017 02:58 AM, Andres Freund wrote:

On 2017-03-01 22:19:30 -0800, Andres Freund wrote:

On 2017-03-02 04:36:23 +0100, Tomas Vondra wrote:

I've noticed two minor typos:

1) That is solved this by creating ...
- extra "this"

2) Given this, routines like pfree their corresponding context ...
- missing "find" or "determine"

Will fix.

And done.

I also see you've explicitly mentioned the callbacks were added
in 9.5. Doesn't that somewhat reintroduce the historical
account?

That section I just moved up, the version reference was there
before. I left it in, because it seemed new enough to still be
somewhat relevant; removed and added it, not sure what's better.

Left that in place for now.

Yeah. I haven't realized it was just moved a bit, and moreover it
probably makes sense to have some comments regarding differences between
current versions. So +1 to that.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#87Robert Haas
robertmhaas@gmail.com
In reply to: Andres Freund (#79)
Re: PATCH: two slab-like memory allocators

On Wed, Mar 1, 2017 at 5:55 PM, Andres Freund <andres@anarazel.de> wrote:

The issue was that on 32bit platforms the Datum returned by some
functions (int2int4_sum in this case) isn't actually a separately
allocated Datum, but rather just something embedded in a larger
struct. That, combined with the following code:
if (!peraggstate->resulttypeByVal && !*isnull &&
!MemoryContextContains(CurrentMemoryContext,
DatumGetPointer(*result)))
seems somewhat problematic to me. MemoryContextContains() can give
false positives when used on memory that's not a distinctly allocated
chunk, and if so, we violate memory lifetime rules. It's quite
unlikely, given the required bit patterns, but nonetheless it's making
me somewhat uncomfortable.

Do others think this isn't an issue and we can just live with it?

I think it's 100% broken to call MemoryContextContains() on something
that's not guaranteed to be a palloc'd chunk.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#88Andres Freund
andres@anarazel.de
In reply to: Robert Haas (#87)
Re: PATCH: two slab-like memory allocators

On 2017-03-06 12:40:18 -0500, Robert Haas wrote:

On Wed, Mar 1, 2017 at 5:55 PM, Andres Freund <andres@anarazel.de> wrote:

The issue was that on 32bit platforms the Datum returned by some
functions (int2int4_sum in this case) isn't actually a separately
allocated Datum, but rather just something embedded in a larger
struct. That, combined with the following code:
if (!peraggstate->resulttypeByVal && !*isnull &&
!MemoryContextContains(CurrentMemoryContext,
DatumGetPointer(*result)))
seems somewhat problematic to me. MemoryContextContains() can give
false positives when used on memory that's not a distinctly allocated
chunk, and if so, we violate memory lifetime rules. It's quite
unlikely, given the required bit patterns, but nonetheless it's making
me somewhat uncomfortable.

Do others think this isn't an issue and we can just live with it?

I think it's 100% broken to call MemoryContextContains() on something
that's not guaranteed to be a palloc'd chunk.

I agree, but to me it seems the only fix would be to just yank out the
whole optimization?

- Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#89Robert Haas
robertmhaas@gmail.com
In reply to: Andres Freund (#88)
Re: PATCH: two slab-like memory allocators

On Mon, Mar 6, 2017 at 12:44 PM, Andres Freund <andres@anarazel.de> wrote:

On 2017-03-06 12:40:18 -0500, Robert Haas wrote:

On Wed, Mar 1, 2017 at 5:55 PM, Andres Freund <andres@anarazel.de> wrote:

The issue was that on 32bit platforms the Datum returned by some
functions (int2int4_sum in this case) isn't actually a separately
allocated Datum, but rather just something embedded in a larger
struct. That, combined with the following code:
if (!peraggstate->resulttypeByVal && !*isnull &&
!MemoryContextContains(CurrentMemoryContext,
DatumGetPointer(*result)))
seems somewhat problematic to me. MemoryContextContains() can give
false positives when used on memory that's not a distinctly allocated
chunk, and if so, we violate memory lifetime rules. It's quite
unlikely, given the required bit patterns, but nonetheless it's making
me somewhat uncomfortable.

Do others think this isn't an issue and we can just live with it?

I think it's 100% broken to call MemoryContextContains() on something
that's not guaranteed to be a palloc'd chunk.

I agree, but to me it seems the only fix would be to just yank out the
whole optimization?

Dunno, haven't looked into it.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#90Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Robert Haas (#89)
Re: PATCH: two slab-like memory allocators

On 03/06/2017 07:05 PM, Robert Haas wrote:

On Mon, Mar 6, 2017 at 12:44 PM, Andres Freund <andres@anarazel.de> wrote:

On 2017-03-06 12:40:18 -0500, Robert Haas wrote:

On Wed, Mar 1, 2017 at 5:55 PM, Andres Freund <andres@anarazel.de> wrote:

The issue was that on 32bit platforms the Datum returned by some
functions (int2int4_sum in this case) isn't actually a separately
allocated Datum, but rather just something embedded in a larger
struct. That, combined with the following code:
if (!peraggstate->resulttypeByVal && !*isnull &&
!MemoryContextContains(CurrentMemoryContext,
DatumGetPointer(*result)))
seems somewhat problematic to me. MemoryContextContains() can give
false positives when used on memory that's not a distinctly allocated
chunk, and if so, we violate memory lifetime rules. It's quite
unlikely, given the required bit patterns, but nonetheless it's making
me somewhat uncomfortable.

Do others think this isn't an issue and we can just live with it?

I think it's 100% broken to call MemoryContextContains() on something
that's not guaranteed to be a palloc'd chunk.

I agree, but to me it seems the only fix would be to just yank out the
whole optimization?

Dunno, haven't looked into it.

I think it might be fixable by adding a flag into the chunk, with 'true'
for regular allocations, and 'false' for the optimized ones. And then
only use MemoryContextContains() for 'flag=true' chunks.

The question however is whether this won't make the optimization
pointless. I also, wonder how much we save by this optimization and how
widely it's used? Can someone point me to some numbers?

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#91Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#90)
Re: PATCH: two slab-like memory allocators

Hi,

On 2017-03-06 19:49:56 +0100, Tomas Vondra wrote:

On 03/06/2017 07:05 PM, Robert Haas wrote:

On Mon, Mar 6, 2017 at 12:44 PM, Andres Freund <andres@anarazel.de> wrote:

On 2017-03-06 12:40:18 -0500, Robert Haas wrote:

On Wed, Mar 1, 2017 at 5:55 PM, Andres Freund <andres@anarazel.de> wrote:

The issue was that on 32bit platforms the Datum returned by some
functions (int2int4_sum in this case) isn't actually a separately
allocated Datum, but rather just something embedded in a larger
struct. That, combined with the following code:
if (!peraggstate->resulttypeByVal && !*isnull &&
!MemoryContextContains(CurrentMemoryContext,
DatumGetPointer(*result)))
seems somewhat problematic to me. MemoryContextContains() can give
false positives when used on memory that's not a distinctly allocated
chunk, and if so, we violate memory lifetime rules. It's quite
unlikely, given the required bit patterns, but nonetheless it's making
me somewhat uncomfortable.

Do others think this isn't an issue and we can just live with it?

I think it's 100% broken to call MemoryContextContains() on something
that's not guaranteed to be a palloc'd chunk.

I agree, but to me it seems the only fix would be to just yank out the
whole optimization?

Dunno, haven't looked into it.

I think it might be fixable by adding a flag into the chunk, with 'true' for
regular allocations, and 'false' for the optimized ones. And then only use
MemoryContextContains() for 'flag=true' chunks.

I'm not quite following here. We only get a Datum and the knowledge
that it's a pass-by-ref argument, so we really don't know that much. We
could create an "EmbeddedDatum" type that has a preceding chunk header
(appropriately for the version), that just gets zeroed out at start. Is
that what you mean?

The question however is whether this won't make the optimization pointless.
I also, wonder how much we save by this optimization and how widely it's
used? Can someone point me to some numbers?

I don't recall any recent numbers. I'm more than a bit doubful that it
really matters - it's only used for the results of aggregate/window
functions, and surely they've a good chunk of their own overhead...

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#92Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#91)
Re: PATCH: two slab-like memory allocators

On 03/06/2017 08:08 PM, Andres Freund wrote:

Hi,

On 2017-03-06 19:49:56 +0100, Tomas Vondra wrote:

On 03/06/2017 07:05 PM, Robert Haas wrote:

On Mon, Mar 6, 2017 at 12:44 PM, Andres Freund <andres@anarazel.de> wrote:

On 2017-03-06 12:40:18 -0500, Robert Haas wrote:

On Wed, Mar 1, 2017 at 5:55 PM, Andres Freund <andres@anarazel.de> wrote:

The issue was that on 32bit platforms the Datum returned by some
functions (int2int4_sum in this case) isn't actually a separately
allocated Datum, but rather just something embedded in a larger
struct. That, combined with the following code:
if (!peraggstate->resulttypeByVal && !*isnull &&
!MemoryContextContains(CurrentMemoryContext,
DatumGetPointer(*result)))
seems somewhat problematic to me. MemoryContextContains() can give
false positives when used on memory that's not a distinctly allocated
chunk, and if so, we violate memory lifetime rules. It's quite
unlikely, given the required bit patterns, but nonetheless it's making
me somewhat uncomfortable.

Do others think this isn't an issue and we can just live with it?

I think it's 100% broken to call MemoryContextContains() on something
that's not guaranteed to be a palloc'd chunk.

I agree, but to me it seems the only fix would be to just yank out the
whole optimization?

Dunno, haven't looked into it.

I think it might be fixable by adding a flag into the chunk, with 'true' for
regular allocations, and 'false' for the optimized ones. And then only use
MemoryContextContains() for 'flag=true' chunks.

I'm not quite following here. We only get a Datum and the knowledge
that it's a pass-by-ref argument, so we really don't know that much. We
could create an "EmbeddedDatum" type that has a preceding chunk header
(appropriately for the version), that just gets zeroed out at start. Is
that what you mean?

Yes, that's roughly what I meant.

The question however is whether this won't make the optimization pointless.
I also, wonder how much we save by this optimization and how widely it's
used? Can someone point me to some numbers?

I don't recall any recent numbers. I'm more than a bit doubful that it
really matters - it's only used for the results of aggregate/window
functions, and surely they've a good chunk of their own overhead...

And if the benefit is negligible, trying to keep the optimization might
easily result in slowdown (compared to non-optimized code).

But I'm puzzled why we haven't seen any reports of failures? I mean,
doing sum(int4) is not particularly extravagant thing, if there really
is an issue, shouldn't we see a lot of reports? What are we missing?

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#93Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#92)
Re: PATCH: two slab-like memory allocators

Hi,

On 2017-03-06 23:32:30 +0100, Tomas Vondra wrote:

The question however is whether this won't make the optimization pointless.
I also, wonder how much we save by this optimization and how widely it's
used? Can someone point me to some numbers?

I don't recall any recent numbers. I'm more than a bit doubful that it
really matters - it's only used for the results of aggregate/window
functions, and surely they've a good chunk of their own overhead...

And if the benefit is negligible, trying to keep the optimization might
easily result in slowdown (compared to non-optimized code).

I doubt the branch is noticeable here, given that we're doing a memory
allocation otherwise. Should also be decently predictable.

But I'm puzzled why we haven't seen any reports of failures? I mean, doing
sum(int4) is not particularly extravagant thing, if there really is an
issue, shouldn't we see a lot of reports? What are we missing?

Reports about what? False positives causing crashes / wrong results? I
think it's quite unlikely to actually trigger this in practice, because
you need a properly aligned pointer, and then the preceding header has
to to point to a bit pattern that's equal to the context - that's
presumably quite unlikely in practice.

Regards,

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#94Andres Freund
andres@anarazel.de
In reply to: Tomas Vondra (#84)
Re: PATCH: two slab-like memory allocators

On 2017-03-02 22:51:09 +0100, Tomas Vondra wrote:

Attaches is the last part of the patch series, rebased to current master and
adopting the new chunk header approach.

Something seems to have gone awry while sending that - the attachement
is a whopping 0 bytes...

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#95Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Andres Freund (#94)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

On 03/07/2017 12:19 AM, Andres Freund wrote:

On 2017-03-02 22:51:09 +0100, Tomas Vondra wrote:

Attaches is the last part of the patch series, rebased to current master and
adopting the new chunk header approach.

Something seems to have gone awry while sending that - the attachement
is a whopping 0 bytes...

Why are you complaining? That makes it much easier to review and commit!

But if you insist, here is the actual patch ;-)

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

gen-context.patchtext/x-diff; name=gen-context.patchDownload
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 8aac670..0bdc214 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -149,15 +149,6 @@ typedef struct ReorderBufferDiskChange
  */
 static const Size max_changes_in_memory = 4096;
 
-/*
- * We use a very simple form of a slab allocator for frequently allocated
- * objects, simply keeping a fixed number in a linked list when unused,
- * instead pfree()ing them. Without that in many workloads aset.c becomes a
- * major bottleneck, especially when spilling to disk while decoding batch
- * workloads.
- */
-static const Size max_cached_tuplebufs = 4096 * 2;		/* ~8MB */
-
 /* ---------------------------------------
  * primary reorderbuffer support routines
  * ---------------------------------------
@@ -248,6 +239,10 @@ ReorderBufferAllocate(void)
 											SLAB_DEFAULT_BLOCK_SIZE,
 											sizeof(ReorderBufferTXN));
 
+	buffer->tup_context = GenerationContextCreate(new_ctx,
+										   "Tuples",
+										   SLAB_LARGE_BLOCK_SIZE);
+
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
 	hash_ctl.hcxt = buffer->context;
@@ -258,15 +253,12 @@ ReorderBufferAllocate(void)
 	buffer->by_txn_last_xid = InvalidTransactionId;
 	buffer->by_txn_last_txn = NULL;
 
-	buffer->nr_cached_tuplebufs = 0;
-
 	buffer->outbuf = NULL;
 	buffer->outbufsize = 0;
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
 	dlist_init(&buffer->toplevel_by_lsn);
-	slist_init(&buffer->cached_tuplebufs);
 
 	return buffer;
 }
@@ -419,42 +411,12 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/*
-	 * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-	 * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-	 * generated for oldtuples can be bigger, as they don't have out-of-line
-	 * toast columns.
-	 */
-	if (alloc_len < MaxHeapTupleSize)
-		alloc_len = MaxHeapTupleSize;
-
-
-	/* if small enough, check the slab cache */
-	if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs--;
-		tuple = slist_container(ReorderBufferTupleBuf, node,
-								slist_pop_head_node(&rb->cached_tuplebufs));
-		Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-		memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-		VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-		memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
-	}
-	else
-	{
-		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->context,
-							   sizeof(ReorderBufferTupleBuf) +
-							   MAXIMUM_ALIGNOF + alloc_len);
-		tuple->alloc_tuple_size = alloc_len;
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-	}
+	tuple = (ReorderBufferTupleBuf *)
+		MemoryContextAlloc(rb->tup_context,
+						   sizeof(ReorderBufferTupleBuf) +
+						   MAXIMUM_ALIGNOF + alloc_len);
+	tuple->alloc_tuple_size = alloc_len;
+	tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
 
 	return tuple;
 }
@@ -468,21 +430,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
-	/* check whether to put into the slab cache, oversized tuples never are */
-	if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-		rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs++;
-		slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-	}
-	else
-	{
-		pfree(tuple);
-	}
+	pfree(tuple);
 }
 
 /*
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index cd0e803..7263399 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
+OBJS = aset.o generation.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c
new file mode 100644
index 0000000..16f9b61
--- /dev/null
+++ b/src/backend/utils/mmgr/generation.c
@@ -0,0 +1,765 @@
+/*-------------------------------------------------------------------------
+ *
+ * generation.c
+ *	  Generational allocator definitions.
+ *
+ * Generation is a custom MemoryContext implementation designed for cases of
+ * chunks with similar lifespan.
+ *
+ * Portions Copyright (c) 2017, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/Generation.c
+ *
+ *
+ *	This memory context is based on the assumption that the allocated chunks
+ *	have similar lifespan, i.e. that chunks allocated close from each other
+ *	(by time) will also be freed in close proximity, and mostly in the same
+ *	order. This is typical for various queue-like use cases, i.e. when tuples
+ *	are constructed, processed and then thrown away.
+ *
+ *	The memory context uses a very simple approach to free space management.
+ *	Instead of a complex global freelist, each block tracks a number
+ *	of allocated and freed chunks. The space released by freed chunks is not
+ *	reused, and once all chunks are freed (i.e. when nallocated == nfreed),
+ *	the whole block is thrown away. When the allocated chunks have similar
+ *	lifespan, this works very well and is extremely cheap.
+ *
+ *	The current implementation only uses a fixed block size - maybe it should
+ *	adapt a min/max block size range, and grow the blocks automatically.
+ *	It already uses dedicated blocks for oversized chunks.
+ *
+ *	XXX It might be possible to improve this by keeping a small freelist for
+ *	only a small number of recent blocks, but it's not clear it's worth the
+ *	additional complexity.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"
+
+
+#define Generation_BLOCKHDRSZ	MAXALIGN(sizeof(GenerationBlockData))
+#define Generation_CHUNKHDRSZ	sizeof(GenerationChunkData)
+
+/* Portion of Generation_CHUNKHDRSZ examined outside Generation.c. */
+#define Generation_CHUNK_PUBLIC	\
+	(offsetof(GenerationChunkData, size) + sizeof(Size))
+
+/* Portion of Generation_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunkData, requested_size) + sizeof(Size))
+#else
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunkData, size) + sizeof(Size))
+#endif
+
+typedef struct GenerationBlockData *GenerationBlock;	/* forward reference */
+typedef struct GenerationChunkData *GenerationChunk;
+
+typedef void *GenerationPointer;
+
+/*
+ * GenerationContext is a simple memory context not reusing allocated chunks, and
+ * freeing blocks once all chunks are freed.
+ */
+typedef struct GenerationContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+
+	/* Generationerational context parameters */
+	Size		blockSize;		/* block size */
+
+	GenerationBlock	block;			/* current (most recently allocated) block */
+	dlist_head	blocks;			/* list of blocks */
+
+}	GenerationContext;
+
+typedef GenerationContext *Generation;
+
+/*
+ * GenerationBlockData
+ *		A GenerationBlock is the unit of memory that is obtained by Generation.c
+ *		from malloc().  It contains one or more GenerationChunks, which are
+ *		the units requested by palloc() and freed by pfree().  GenerationChunks
+ *		cannot be returned to malloc() individually, instead pfree()
+ *		updates a free counter on a block and when all chunks on a block
+ *		are freed the whole block is returned to malloc().
+ *
+ *		GenerationBlockData is the header data for a block --- the usable space
+ *		within the block begins at the next alignment boundary.
+ */
+typedef struct GenerationBlockData
+{
+	dlist_node	node;			/* doubly-linked list */
+	int			nchunks;		/* number of chunks in the block */
+	int			nfree;			/* number of free chunks */
+	char	   *freeptr;		/* start of free space in this block */
+	char	   *endptr;			/* end of space in this block */
+}	GenerationBlockData;
+
+/*
+ * GenerationChunk
+ *		The prefix of each piece of memory in an GenerationBlock
+ */
+typedef struct GenerationChunkData
+{
+	/* block owning this chunk */
+	void	   *block;
+
+	/* size is always the size of the usable space in the chunk */
+	Size		size;
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* when debugging memory usage, also store actual requested size */
+	/* this is zero in a free chunk */
+	Size		requested_size;
+#endif   /* MEMORY_CONTEXT_CHECKING */
+
+	GenerationContext *context;		/* owning context */
+	/* there must not be any padding to reach a MAXALIGN boundary here! */
+}	GenerationChunkData;
+
+
+/*
+ * GenerationIsValid
+ *		True iff set is valid allocation set.
+ */
+#define GenerationIsValid(set) PointerIsValid(set)
+
+#define GenerationPointerGetChunk(ptr) \
+					((GenerationChunk)(((char *)(ptr)) - Generation_CHUNKHDRSZ))
+#define GenerationChunkGetPointer(chk) \
+					((GenerationPointer)(((char *)(chk)) + Generation_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Generation contexts.
+ */
+static void *GenerationAlloc(MemoryContext context, Size size);
+static void GenerationFree(MemoryContext context, void *pointer);
+static void *GenerationRealloc(MemoryContext context, void *pointer, Size size);
+static void GenerationInit(MemoryContext context);
+static void GenerationReset(MemoryContext context);
+static void GenerationDelete(MemoryContext context);
+static Size GenerationGetChunkSpace(MemoryContext context, void *pointer);
+static bool GenerationIsEmpty(MemoryContext context);
+static void GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void GenerationCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Generation contexts.
+ */
+static MemoryContextMethods GenerationMethods = {
+	GenerationAlloc,
+	GenerationFree,
+	GenerationRealloc,
+	GenerationInit,
+	GenerationReset,
+	GenerationDelete,
+	GenerationGetChunkSpace,
+	GenerationIsEmpty,
+	GenerationStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,GenerationCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define GenerationFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationFree: %s: %p, %lu\n", \
+				(_cxt)->name, (_chunk), (_chunk)->size)
+#define GenerationAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationAlloc: %s: %p, %lu\n", \
+				(_cxt)->name, (_chunk), (_chunk)->size)
+#else
+#define GenerationFreeInfo(_cxt, _chunk)
+#define GenerationAllocInfo(_cxt, _chunk)
+#endif
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenerationContextCreate
+ *		Create a new Generation context.
+ */
+MemoryContext
+GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize)
+{
+	Generation			set;
+
+	/*
+	 * First, validate allocation parameters.  (If we're going to throw an
+	 * error, we should do so before the context is created, not after.)  We
+	 * somewhat arbitrarily enforce a minimum 1K block size, mostly because
+	 * that's what AllocSet does.
+	 */
+	if (blockSize != MAXALIGN(blockSize) ||
+		blockSize < 1024 ||
+		!AllocHugeSizeIsValid(blockSize))
+		elog(ERROR, "invalid blockSize for memory context: %zu",
+			 blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (Generation) MemoryContextCreate(T_GenerationContext,
+									sizeof(GenerationContext),
+									&GenerationMethods,
+									parent,
+									name);
+
+	set->blockSize = blockSize;
+	set->block = NULL;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * GenerationInit
+ *		Context-type-specific initialization routine.
+ */
+static void
+GenerationInit(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+
+	dlist_init(&set->blocks);
+}
+
+/*
+ * GenerationReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+GenerationReset(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+	dlist_mutable_iter miter;
+
+	AssertArg(GenerationIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	GenerationCheck(context);
+#endif
+
+	dlist_foreach_modify(miter, &set->blocks)
+	{
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, miter.cur);
+
+		dlist_delete(miter.cur);
+
+		/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+		wipe_mem(block, set->blockSize);
+#endif
+
+		free(block);
+	}
+
+	set->block = NULL;
+
+	Assert(dlist_is_empty(&set->blocks));
+}
+
+/*
+ * GenerationDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call GenerationReset() which does all the
+ *		dirty work.
+ */
+static void
+GenerationDelete(MemoryContext context)
+{
+	/* just reset (although not really necessary) */
+	GenerationReset(context);
+}
+
+/*
+ * GenerationAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - Generation_BLOCKHDRSZ - Generation_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+GenerationAlloc(MemoryContext context, Size size)
+{
+	Generation			set = (Generation) context;
+	GenerationBlock	block;
+	GenerationChunk	chunk;
+	Size		chunk_size = MAXALIGN(size);
+
+	/* is it an over-sized chunk? if yes, allocate special block */
+	if (chunk_size > set->blockSize / 8)
+	{
+		Size		blksize = chunk_size + Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ;
+
+		block = (GenerationBlock) malloc(blksize);
+		if (block == NULL)
+			return NULL;
+
+		/* block with a single (used) chunk */
+		block->nchunks = 1;
+		block->nfree = 0;
+
+		/* the block is completely full */
+		block->freeptr = block->endptr = ((char *) block) + blksize;
+
+		chunk = (GenerationChunk) (((char *) block) + Generation_BLOCKHDRSZ);
+		chunk->context = set;
+		chunk->size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+		/* Valgrind: Will be made NOACCESS below. */
+		chunk->requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < chunk_size)
+			set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* fill the allocated space with junk */
+		randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+		/* add the block to the list of allocated blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		GenerationAllocInfo(set, chunk);
+
+		/*
+		 * Chunk header public fields remain DEFINED.  The requested
+		 * allocation itself can be NOACCESS or UNDEFINED; our caller will
+		 * soon make it UNDEFINED.  Make extra space at the end of the chunk,
+		 * if any, NOACCESS.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS((char *) chunk + Generation_CHUNK_PUBLIC,
+							 chunk_size + Generation_CHUNKHDRSZ - Generation_CHUNK_PUBLIC);
+
+		return GenerationChunkGetPointer(chunk);
+	}
+
+	/*
+	 * Not an over-sized chunk. Is there enough space on the current block? If
+	 * not, allocate a new "regular" block.
+	 */
+	block = set->block;
+
+	if ((block == NULL) ||
+		(block->endptr - block->freeptr) < Generation_CHUNKHDRSZ + chunk_size)
+	{
+		Size		blksize = set->blockSize;
+
+		block = (GenerationBlock) malloc(blksize);
+
+		if (block == NULL)
+			return NULL;
+
+		block->nchunks = 0;
+		block->nfree = 0;
+
+		block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ;
+		block->endptr = ((char *) block) + blksize;
+
+		/* Mark unallocated space NOACCESS. */
+		VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+								   blksize - Generation_BLOCKHDRSZ);
+
+		/* add it to the doubly-linked list of blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		/* and also use it as the current allocation block */
+		set->block = block;
+	}
+
+	/* we're supposed to have a block with enough free space now */
+	Assert(block != NULL);
+	Assert((block->endptr - block->freeptr) >= Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk = (GenerationChunk) block->freeptr;
+
+	block->nchunks += 1;
+	block->freeptr += (Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk->block = block;
+
+	chunk->context = set;
+	chunk->size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Valgrind: Free list requested_size should be DEFINED. */
+	chunk->requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+							   sizeof(chunk->requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->size)
+		set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+	GenerationAllocInfo(set, chunk);
+	return GenerationChunkGetPointer(chunk);
+}
+
+/*
+ * GenerationFree
+ *		Update number of chunks on the block, and if all chunks on the block
+ *		are freeed then discard the block.
+ */
+static void
+GenerationFree(MemoryContext context, void *pointer)
+{
+	Generation			set = (Generation) context;
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+	GenerationBlock	block = chunk->block;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+							  sizeof(chunk->requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->requested_size < chunk->size)
+		if (!sentinel_ok(pointer, chunk->requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 ((MemoryContext)set)->name, chunk);
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->requested_size = 0;
+#endif
+
+	block->nfree += 1;
+
+	Assert(block->nchunks > 0);
+	Assert(block->nfree <= block->nchunks);
+
+	/* If there are still allocated chunks on the block, we're done. */
+	if (block->nfree < block->nchunks)
+		return;
+
+	/*
+	 * The block is empty, so let's get rid of it. First remove it from the
+	 * list of blocks, then return it to malloc().
+	 */
+	dlist_delete(&block->node);
+
+	/* Also make sure the block is not marked as the current block. */
+	if (set->block == block)
+		set->block = NULL;
+
+	free(block);
+}
+
+/*
+ * GenerationRealloc
+ *		When handling repalloc, we simply allocate a new chunk, copy the data
+ *		and discard the old one. The only exception is when the new size fits
+ *		into the old chunk - in that case we just update chunk header.
+ */
+static void *
+GenerationRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Generation			set = (Generation) context;
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+	Size		oldsize = chunk->size;
+	GenerationPointer	newPointer;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+							  sizeof(chunk->requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->requested_size < oldsize)
+		if (!sentinel_ok(pointer, chunk->requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 ((MemoryContext)set)->name, chunk);
+#endif
+
+	/*
+	 * Maybe the allocated area already is >= the new size.  (In particular,
+	 * we always fall out here if the requested size is a decrease.)
+	 *
+	 * This memory context is not use the power-of-2 chunk sizing and instead
+	 * carves the chunks to be as small as possible, so most repalloc() calls
+	 * will end up in the palloc/memcpy/pfree branch.
+	 *
+	 * XXX Perhaps we should annotate this condition with unlikely()?
+	 */
+	if (oldsize >= size)
+	{
+#ifdef MEMORY_CONTEXT_CHECKING
+		Size		oldrequest = chunk->requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* We can only fill the extra space if we know the prior request */
+		if (size > oldrequest)
+			randomize_mem((char *) pointer + oldrequest,
+						  size - oldrequest);
+#endif
+
+		chunk->requested_size = size;
+		VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+								   sizeof(chunk->requested_size));
+
+		/*
+		 * If this is an increase, mark any newly-available part UNDEFINED.
+		 * Otherwise, mark the obsolete part NOACCESS.
+		 */
+		if (size > oldrequest)
+			VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+										size - oldrequest);
+		else
+			VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+									   oldsize - size);
+
+		/* set mark to catch clobber of "unused" space */
+		if (size < oldsize)
+			set_sentinel(pointer, size);
+#else							/* !MEMORY_CONTEXT_CHECKING */
+
+		/*
+		 * We don't have the information to determine whether we're growing
+		 * the old request or shrinking it, so we conservatively mark the
+		 * entire new allocation DEFINED.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize);
+		VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+		return pointer;
+	}
+
+	/* allocate new chunk */
+	newPointer = GenerationAlloc((MemoryContext) set, size);
+
+	/* leave immediately if request was not completed */
+	if (newPointer == NULL)
+		return NULL;
+
+	/*
+	 * GenerationSetAlloc() just made the region NOACCESS.  Change it to UNDEFINED
+	 * for the moment; memcpy() will then transfer definedness from the old
+	 * allocation to the new.  If we know the old allocation, copy just that
+	 * much.  Otherwise, make the entire old chunk defined to avoid errors as
+	 * we copy the currently-NOACCESS trailing bytes.
+	 */
+	VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+	oldsize = chunk->requested_size;
+#else
+	VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+	/* transfer existing data (certain to fit) */
+	memcpy(newPointer, pointer, oldsize);
+
+	/* free old chunk */
+	GenerationFree((MemoryContext) set, pointer);
+
+	return newPointer;
+}
+
+/*
+ * GenerationGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+GenerationGetChunkSpace(MemoryContext context, void *pointer)
+{
+	GenerationChunk	chunk = GenerationPointerGetChunk(pointer);
+
+	return chunk->size + Generation_CHUNKHDRSZ;
+}
+
+/*
+ * GenerationIsEmpty
+ *		Is an Generation empty of any allocated space?
+ */
+static bool
+GenerationIsEmpty(MemoryContext context)
+{
+	Generation			set = (Generation) context;
+
+	return dlist_is_empty(&set->blocks);
+}
+
+/*
+ * GenerationStats
+ *		Compute stats about memory consumption of an Generation.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Generation into *totals.
+ *
+ * XXX freespace only accounts for empty space at the end of the block, not
+ * space of freed chunks (which is unknown).
+ */
+static void
+GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals)
+{
+	Generation			set = (Generation) context;
+
+	Size		nblocks = 0;
+	Size		nchunks = 0;
+	Size		nfreechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+
+	dlist_iter	iter;
+
+	dlist_foreach(iter, &set->blocks)
+	{
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, iter.cur);
+
+		nblocks++;
+		nchunks += block->nchunks;
+		nfreechunks += block->nfree;
+		totalspace += set->blockSize;
+		freespace += (block->endptr - block->freeptr);
+	}
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"Generation: %s: %zu total in %zd blocks (%zd chunks); %zu free (%zd chunks); %zu used\n",
+				((MemoryContext)set)->name, totalspace, nblocks, nchunks, freespace,
+				nfreechunks, totalspace - freespace);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += nfreechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenerationCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+GenerationCheck(MemoryContext context)
+{
+	Generation	gen = (Generation) context;
+	char	   *name = context->name;
+	dlist_iter	iter;
+
+	/* walk all blocks in this context */
+	dlist_foreach(iter, &gen->blocks)
+	{
+		int			nfree,
+					nchunks;
+		char	   *ptr;
+		GenerationBlock	block = dlist_container(GenerationBlockData, node, iter.cur);
+
+		/* We can't free more chunks than allocated. */
+		if (block->nfree <= block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p exceeds %d allocated",
+				 name, block->nfree, block, block->nchunks);
+
+		/* Now walk through the chunks and count them. */
+		nfree = 0;
+		nchunks = 0;
+		ptr = ((char *) block) + Generation_BLOCKHDRSZ;
+
+		while (ptr < block->freeptr)
+		{
+			GenerationChunk	chunk = (GenerationChunk)ptr;
+
+			/* move to the next chunk */
+			ptr += (chunk->size + Generation_CHUNKHDRSZ);
+
+			/* chunks have both block and context pointers, so check both */
+			if (chunk->block != block)
+				elog(WARNING, "problem in Generation %s: bogus block link in block %p, chunk %p",
+					 name, block, chunk);
+
+			if (chunk->context != gen)
+				elog(WARNING, "problem in Generation %s: bogus context link in block %p, chunk %p",
+					 name, block, chunk);
+
+			nchunks += 1;
+
+			/* if requested_size==0, the chunk was freed */
+			if (chunk->requested_size > 0)
+			{
+				/* if the chunk was not freed, we can trigger valgrind checks */
+				VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+									   sizeof(chunk->requested_size));
+
+				/* we're in a no-freelist branch */
+				VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+									   sizeof(chunk->requested_size));
+
+				/* now make sure the chunk size is correct */
+				if (chunk->size != MAXALIGN(chunk->requested_size))
+					elog(WARNING, "problem in Generation %s: bogus chunk size in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* there might be sentinel (thanks to alignment) */
+				if (chunk->requested_size < chunk->size &&
+					!sentinel_ok(chunk, Generation_CHUNKHDRSZ + chunk->requested_size))
+					elog(WARNING, "problem in Generation %s: detected write past chunk end in block %p, chunk %p",
+						 name, block, chunk);
+			}
+			else
+				nfree += 1;
+		}
+
+		/*
+		 * Make sure we got the expected number of allocated and free chunks
+		 * (as tracked in the block header).
+		 */
+		if (nchunks != block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of allocated chunks %d in block %p does not match header %d",
+				 name, nchunks, block, block->nchunks);
+
+		if (nfree != block->nfree)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p does not match header %d",
+				 name, nfree, block, block->nfree);
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index fe6bc90..815a52a 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,8 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
+	 (IsA((context), AllocSetContext) || \
+	  IsA((context), SlabContext) || \
+	  IsA((context), GenerationContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 28aca92..2ef935a 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -279,6 +279,7 @@ typedef enum NodeTag
 	T_MemoryContext,
 	T_AllocSetContext,
 	T_SlabContext,
+	T_GenerationContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 17e47b3..5e0b4c8 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -335,20 +335,7 @@ struct ReorderBuffer
 	 */
 	MemoryContext change_context;
 	MemoryContext txn_context;
-
-	/*
-	 * Data structure slab cache.
-	 *
-	 * We allocate/deallocate some structures very frequently, to avoid bigger
-	 * overhead we cache some unused ones here.
-	 *
-	 * The maximum number of cached entries is controlled by const variables
-	 * on top of reorderbuffer.c
-	 */
-
-	/* cached ReorderBufferTupleBufs */
-	slist_head	cached_tuplebufs;
-	Size		nr_cached_tuplebufs;
+	MemoryContext tup_context;
 
 	XLogRecPtr	current_restart_decoding_lsn;
 
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 58e816d..b3e1eb5 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -155,6 +155,11 @@ extern MemoryContext SlabContextCreate(MemoryContext parent,
 				  Size blockSize,
 				  Size chunkSize);
 
+/* gen.c */
+extern MemoryContext GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
#96David Rowley
david.rowley@2ndquadrant.com
In reply to: Andres Freund (#53)
1 attachment(s)
Re: PATCH: two slab-like memory allocators

On 28 February 2017 at 01:02, Andres Freund <andres@anarazel.de> wrote:

Hi,

On 2017-02-27 03:17:32 -0800, Andres Freund wrote:

I'll work on getting slab committed first, and then review / edit /
commit generation.c later. One first note there is that I'm wondering
if generation.c is a too generic filename.

And pushed slab and its usage. Will have a look at generation.c
tomorrow.

Attached is a patch to fix the compiler warning for compilers that
don't understand elog(ERROR) does not return.

--
David Rowley http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

slaballoc_warning_fix.patchapplication/octet-stream; name=slaballoc_warning_fix.patchDownload
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
index 11a126a..463177f 100644
--- a/src/backend/utils/mmgr/slab.c
+++ b/src/backend/utils/mmgr/slab.c
@@ -570,6 +570,7 @@ SlabRealloc(MemoryContext context, void *pointer, Size size)
 		return pointer;
 
 	elog(ERROR, "slab allocator does not support realloc()");
+	return NULL;		/* keep compiler quiet */
 }
 
 /*
#97Petr Jelinek
petr.jelinek@2ndquadrant.com
In reply to: Tomas Vondra (#95)
Re: PATCH: two slab-like memory allocators

Hi,

this patch is marked as committed in CF application but the second part
(generational allocator) was AFAICS never committed.

Does anybody plan to push this forward?

--
Petr Jelinek http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#98Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Tomas Vondra (#1)
1 attachment(s)
PATCH : Generational memory allocator (was PATCH: two slab-like memory allocators)

Hi,

Attached is a rebased version of the Generational context, originally
submitted with SlabContext (which was already committed into Pg 10).

The main change is that I've abandoned the pattern of defining a Data
structure and then a pointer typedef, i.e.

typedef struct GenerationContextData { ... } GenerationContextData;
typedef struct GenerationContextData *GenerationContext;

Now it's just

typedef struct GenerationContext { ... } GenerationContext;

mostly because SlabContext was committed like that, and because Andres
was complaining about this code pattern ;-)

Otherwise the design is the same as repeatedly discussed before.

To show that this is still valuable change (even after SlabContext and
adding doubly-linked list to AllocSet), I've repeated the test done by
Andres in [1]/messages/by-id/20170227111732.vrx5v72ighehwpkf@alap3.anarazel.de using the test case described in [2]/messages/by-id/20160706185502.1426.28143@wrigleys.postgresql.org, that is

-- generate data
SELECT COUNT(*) FROM (SELECT test1()
FROM generate_series(1, 50000)) foo;

-- benchmark (measure time and VmPeak)
SELECT COUNT(*) FROM (SELECT *
FROM pg_logical_slot_get_changes('test', NULL,
NULL, 'include-xids', '0')) foo;

with different values passed to the first step (instead of the 50000).
The VmPeak numbers look like this:

N master patched
--------------------------------------
100000 1155220 kB 361604 kB
200000 2020668 kB 434060 kB
300000 2890236 kB 502452 kB
400000 3751592 kB 570816 kB
500000 4621124 kB 639168 kB

and the timing (on assert-enabled build):

N master patched
--------------------------------------
100000 1103.182 ms 412.734 ms
200000 2216.711 ms 820.438 ms
300000 3320.095 ms 1223.576 ms
400000 4584.919 ms 1621.261 ms
500000 5590.444 ms 2113.820 ms

So it seems it's still a significant improvement, both in terms of
memory usage and timing. Admittedly, this is a single test, so ideas of
other useful test cases are welcome.

regards

[1]: /messages/by-id/20170227111732.vrx5v72ighehwpkf@alap3.anarazel.de
/messages/by-id/20170227111732.vrx5v72ighehwpkf@alap3.anarazel.de

[2]: /messages/by-id/20160706185502.1426.28143@wrigleys.postgresql.org
/messages/by-id/20160706185502.1426.28143@wrigleys.postgresql.org

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

0001-Generational-memory-allocator.patchtext/x-patch; name=0001-Generational-memory-allocator.patchDownload
From 1c46d25ffa9bb104c415cba7c7b3a013958b6ab5 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Mon, 14 Aug 2017 01:52:50 +0200
Subject: [PATCH] Generational memory allocator

This memory context is based on the assumption that the allocated chunks
have similar lifespan, i.e. that chunks allocated close from each other
(by time) will also be freed in close proximity, and mostly in the same
order. This is typical for various queue-like use cases, i.e. when
tuples are constructed, processed and then thrown away.

The memory context uses a very simple approach to free space management.
Instead of a complex global freelist, each block tracks a number
of allocated and freed chunks. The space released by freed chunks is not
reused, and once all chunks are freed (i.e. when nallocated == nfreed),
the whole block is thrown away. When the allocated chunks have similar
lifespan, this works very well and is extremely cheap.
---
 src/backend/replication/logical/reorderbuffer.c |  74 +--
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/generation.c             | 768 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   4 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |  15 +-
 src/include/utils/memutils.h                    |   5 +
 7 files changed, 790 insertions(+), 79 deletions(-)
 create mode 100644 src/backend/utils/mmgr/generation.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 5567bee..5309170 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -150,15 +150,6 @@ typedef struct ReorderBufferDiskChange
  */
 static const Size max_changes_in_memory = 4096;
 
-/*
- * We use a very simple form of a slab allocator for frequently allocated
- * objects, simply keeping a fixed number in a linked list when unused,
- * instead pfree()ing them. Without that in many workloads aset.c becomes a
- * major bottleneck, especially when spilling to disk while decoding batch
- * workloads.
- */
-static const Size max_cached_tuplebufs = 4096 * 2;	/* ~8MB */
-
 /* ---------------------------------------
  * primary reorderbuffer support routines
  * ---------------------------------------
@@ -248,6 +239,10 @@ ReorderBufferAllocate(void)
 											SLAB_DEFAULT_BLOCK_SIZE,
 											sizeof(ReorderBufferTXN));
 
+	buffer->tup_context = GenerationContextCreate(new_ctx,
+										   "Tuples",
+										   SLAB_LARGE_BLOCK_SIZE);
+
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
 	hash_ctl.hcxt = buffer->context;
@@ -258,15 +253,12 @@ ReorderBufferAllocate(void)
 	buffer->by_txn_last_xid = InvalidTransactionId;
 	buffer->by_txn_last_txn = NULL;
 
-	buffer->nr_cached_tuplebufs = 0;
-
 	buffer->outbuf = NULL;
 	buffer->outbufsize = 0;
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
 	dlist_init(&buffer->toplevel_by_lsn);
-	slist_init(&buffer->cached_tuplebufs);
 
 	return buffer;
 }
@@ -419,42 +411,12 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/*
-	 * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-	 * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-	 * generated for oldtuples can be bigger, as they don't have out-of-line
-	 * toast columns.
-	 */
-	if (alloc_len < MaxHeapTupleSize)
-		alloc_len = MaxHeapTupleSize;
-
-
-	/* if small enough, check the slab cache */
-	if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs--;
-		tuple = slist_container(ReorderBufferTupleBuf, node,
-								slist_pop_head_node(&rb->cached_tuplebufs));
-		Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-		memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-		VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-		memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
-	}
-	else
-	{
-		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->context,
-							   sizeof(ReorderBufferTupleBuf) +
-							   MAXIMUM_ALIGNOF + alloc_len);
-		tuple->alloc_tuple_size = alloc_len;
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-	}
+	tuple = (ReorderBufferTupleBuf *)
+		MemoryContextAlloc(rb->tup_context,
+						   sizeof(ReorderBufferTupleBuf) +
+						   MAXIMUM_ALIGNOF + alloc_len);
+	tuple->alloc_tuple_size = alloc_len;
+	tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
 
 	return tuple;
 }
@@ -468,21 +430,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
-	/* check whether to put into the slab cache, oversized tuples never are */
-	if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-		rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs++;
-		slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-	}
-	else
-	{
-		pfree(tuple);
-	}
+	pfree(tuple);
 }
 
 /*
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index cd0e803..f644c40 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
+OBJS = aset.o dsa.o freepage.o generation.o mcxt.o memdebug.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c
new file mode 100644
index 0000000..a35155c
--- /dev/null
+++ b/src/backend/utils/mmgr/generation.c
@@ -0,0 +1,768 @@
+/*-------------------------------------------------------------------------
+ *
+ * generation.c
+ *	  Generational allocator definitions.
+ *
+ * Generation is a custom MemoryContext implementation designed for cases of
+ * chunks with similar lifespan.
+ *
+ * Portions Copyright (c) 2017, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/Generation.c
+ *
+ *
+ *	This memory context is based on the assumption that the allocated chunks
+ *	have similar lifespan, i.e. that chunks allocated close from each other
+ *	(by time) will also be freed in close proximity, and mostly in the same
+ *	order. This is typical for various queue-like use cases, i.e. when tuples
+ *	are constructed, processed and then thrown away.
+ *
+ *	The memory context uses a very simple approach to free space management.
+ *	Instead of a complex global freelist, each block tracks a number
+ *	of allocated and freed chunks. The space released by freed chunks is not
+ *	reused, and once all chunks are freed (i.e. when nallocated == nfreed),
+ *	the whole block is thrown away. When the allocated chunks have similar
+ *	lifespan, this works very well and is extremely cheap.
+ *
+ *	The current implementation only uses a fixed block size - maybe it should
+ *	adapt a min/max block size range, and grow the blocks automatically.
+ *	It already uses dedicated blocks for oversized chunks.
+ *
+ *	XXX It might be possible to improve this by keeping a small freelist for
+ *	only a small number of recent blocks, but it's not clear it's worth the
+ *	additional complexity.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"
+
+
+#define Generation_BLOCKHDRSZ	MAXALIGN(sizeof(GenerationBlock))
+#define Generation_CHUNKHDRSZ	sizeof(GenerationChunk)
+
+/* Portion of Generation_CHUNKHDRSZ examined outside Generation.c. */
+#define Generation_CHUNK_PUBLIC	\
+	(offsetof(GenerationChunk, size) + sizeof(Size))
+
+/* Portion of Generation_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunk, requested_size) + sizeof(Size))
+#else
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunk, size) + sizeof(Size))
+#endif
+
+typedef struct GenerationBlock GenerationBlock;	/* forward reference */
+typedef struct GenerationChunk GenerationChunk;
+
+typedef void *GenerationPointer;
+
+/*
+ * GenerationContext is a simple memory context not reusing allocated chunks, and
+ * freeing blocks once all chunks are freed.
+ */
+typedef struct GenerationContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+
+	/* Generationerational context parameters */
+	Size		blockSize;		/* block size */
+
+	GenerationBlock	*block;		/* current (most recently allocated) block */
+	dlist_head	blocks;			/* list of blocks */
+
+}	GenerationContext;
+
+/*
+ * GenerationBlock
+ *		A GenerationBlock is the unit of memory that is obtained by Generation.c
+ *		from malloc().  It contains one or more GenerationChunks, which are
+ *		the units requested by palloc() and freed by pfree().  GenerationChunks
+ *		cannot be returned to malloc() individually, instead pfree()
+ *		updates a free counter on a block and when all chunks on a block
+ *		are freed the whole block is returned to malloc().
+ *
+ *		GenerationBloc is the header data for a block --- the usable space
+ *		within the block begins at the next alignment boundary.
+ */
+typedef struct GenerationBlock
+{
+	dlist_node	node;			/* doubly-linked list */
+	int			nchunks;		/* number of chunks in the block */
+	int			nfree;			/* number of free chunks */
+	char	   *freeptr;		/* start of free space in this block */
+	char	   *endptr;			/* end of space in this block */
+}	GenerationBlock;
+
+/*
+ * GenerationChunk
+ *		The prefix of each piece of memory in an GenerationBlock
+ */
+typedef struct GenerationChunk
+{
+	/* block owning this chunk */
+	void	   *block;
+
+	/* size is always the size of the usable space in the chunk */
+	Size		size;
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* when debugging memory usage, also store actual requested size */
+	/* this is zero in a free chunk */
+	Size		requested_size;
+#endif   /* MEMORY_CONTEXT_CHECKING */
+
+	GenerationContext *context;		/* owning context */
+	/* there must not be any padding to reach a MAXALIGN boundary here! */
+}	GenerationChunk;
+
+
+/*
+ * GenerationIsValid
+ *		True iff set is valid allocation set.
+ */
+#define GenerationIsValid(set) PointerIsValid(set)
+
+#define GenerationPointerGetChunk(ptr) \
+	((GenerationChunk *)(((char *)(ptr)) - Generation_CHUNKHDRSZ))
+#define GenerationChunkGetPointer(chk) \
+	((GenerationPointer *)(((char *)(chk)) + Generation_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Generation contexts.
+ */
+static void *GenerationAlloc(MemoryContext context, Size size);
+static void GenerationFree(MemoryContext context, void *pointer);
+static void *GenerationRealloc(MemoryContext context, void *pointer, Size size);
+static void GenerationInit(MemoryContext context);
+static void GenerationReset(MemoryContext context);
+static void GenerationDelete(MemoryContext context);
+static Size GenerationGetChunkSpace(MemoryContext context, void *pointer);
+static bool GenerationIsEmpty(MemoryContext context);
+static void GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void GenerationCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Generation contexts.
+ */
+static MemoryContextMethods GenerationMethods = {
+	GenerationAlloc,
+	GenerationFree,
+	GenerationRealloc,
+	GenerationInit,
+	GenerationReset,
+	GenerationDelete,
+	GenerationGetChunkSpace,
+	GenerationIsEmpty,
+	GenerationStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,GenerationCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define GenerationFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationFree: %s: %p, %lu\n", \
+				(_cxt)->name, (_chunk), (_chunk)->size)
+#define GenerationAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationAlloc: %s: %p, %lu\n", \
+				(_cxt)->name, (_chunk), (_chunk)->size)
+#else
+#define GenerationFreeInfo(_cxt, _chunk)
+#define GenerationAllocInfo(_cxt, _chunk)
+#endif
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenerationContextCreate
+ *		Create a new Generation context.
+ */
+MemoryContext
+GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize)
+{
+	GenerationContext  *set;
+
+	StaticAssertStmt(offsetof(GenerationChunk, context) + sizeof(MemoryContext) ==
+					 MAXALIGN(sizeof(GenerationChunk)),
+					 "padding calculation in GenerationChunk is wrong");
+
+	/*
+	 * First, validate allocation parameters.  (If we're going to throw an
+	 * error, we should do so before the context is created, not after.)  We
+	 * somewhat arbitrarily enforce a minimum 1K block size, mostly because
+	 * that's what AllocSet does.
+	 */
+	if (blockSize != MAXALIGN(blockSize) ||
+		blockSize < 1024 ||
+		!AllocHugeSizeIsValid(blockSize))
+		elog(ERROR, "invalid blockSize for memory context: %zu",
+			 blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (GenerationContext *) MemoryContextCreate(T_GenerationContext,
+									sizeof(GenerationContext),
+									&GenerationMethods,
+									parent,
+									name);
+
+	set->blockSize = blockSize;
+	set->block = NULL;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * GenerationInit
+ *		Context-type-specific initialization routine.
+ */
+static void
+GenerationInit(MemoryContext context)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+
+	dlist_init(&set->blocks);
+}
+
+/*
+ * GenerationReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+GenerationReset(MemoryContext context)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+	dlist_mutable_iter miter;
+
+	AssertArg(GenerationIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	GenerationCheck(context);
+#endif
+
+	dlist_foreach_modify(miter, &set->blocks)
+	{
+		GenerationBlock *block = dlist_container(GenerationBlock, node, miter.cur);
+
+		dlist_delete(miter.cur);
+
+		/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+		wipe_mem(block, set->blockSize);
+#endif
+
+		free(block);
+	}
+
+	set->block = NULL;
+
+	Assert(dlist_is_empty(&set->blocks));
+}
+
+/*
+ * GenerationDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call GenerationReset() which does all the
+ *		dirty work.
+ */
+static void
+GenerationDelete(MemoryContext context)
+{
+	/* just reset (although not really necessary) */
+	GenerationReset(context);
+}
+
+/*
+ * GenerationAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - Generation_BLOCKHDRSZ - Generation_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+GenerationAlloc(MemoryContext context, Size size)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+	GenerationBlock	   *block;
+	GenerationChunk	   *chunk;
+
+	Size		chunk_size = MAXALIGN(size);
+
+	/* is it an over-sized chunk? if yes, allocate special block */
+	if (chunk_size > set->blockSize / 8)
+	{
+		Size		blksize = chunk_size + Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ;
+
+		block = (GenerationBlock *) malloc(blksize);
+		if (block == NULL)
+			return NULL;
+
+		/* block with a single (used) chunk */
+		block->nchunks = 1;
+		block->nfree = 0;
+
+		/* the block is completely full */
+		block->freeptr = block->endptr = ((char *) block) + blksize;
+
+		chunk = (GenerationChunk *) (((char *) block) + Generation_BLOCKHDRSZ);
+		chunk->context = set;
+		chunk->size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+		/* Valgrind: Will be made NOACCESS below. */
+		chunk->requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < chunk_size)
+			set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* fill the allocated space with junk */
+		randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+		/* add the block to the list of allocated blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		GenerationAllocInfo(set, chunk);
+
+		/*
+		 * Chunk header public fields remain DEFINED.  The requested
+		 * allocation itself can be NOACCESS or UNDEFINED; our caller will
+		 * soon make it UNDEFINED.  Make extra space at the end of the chunk,
+		 * if any, NOACCESS.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS((char *) chunk + Generation_CHUNK_PUBLIC,
+							 chunk_size + Generation_CHUNKHDRSZ - Generation_CHUNK_PUBLIC);
+
+		return GenerationChunkGetPointer(chunk);
+	}
+
+	/*
+	 * Not an over-sized chunk. Is there enough space on the current block? If
+	 * not, allocate a new "regular" block.
+	 */
+	block = set->block;
+
+	if ((block == NULL) ||
+		(block->endptr - block->freeptr) < Generation_CHUNKHDRSZ + chunk_size)
+	{
+		Size		blksize = set->blockSize;
+
+		block = (GenerationBlock *) malloc(blksize);
+
+		if (block == NULL)
+			return NULL;
+
+		block->nchunks = 0;
+		block->nfree = 0;
+
+		block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ;
+		block->endptr = ((char *) block) + blksize;
+
+		/* Mark unallocated space NOACCESS. */
+		VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+								   blksize - Generation_BLOCKHDRSZ);
+
+		/* add it to the doubly-linked list of blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		/* and also use it as the current allocation block */
+		set->block = block;
+	}
+
+	/* we're supposed to have a block with enough free space now */
+	Assert(block != NULL);
+	Assert((block->endptr - block->freeptr) >= Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk = (GenerationChunk *) block->freeptr;
+
+	block->nchunks += 1;
+	block->freeptr += (Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk->block = block;
+
+	chunk->context = set;
+	chunk->size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Valgrind: Free list requested_size should be DEFINED. */
+	chunk->requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+							   sizeof(chunk->requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->size)
+		set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+	GenerationAllocInfo(set, chunk);
+	return GenerationChunkGetPointer(chunk);
+}
+
+/*
+ * GenerationFree
+ *		Update number of chunks on the block, and if all chunks on the block
+ *		are freeed then discard the block.
+ */
+static void
+GenerationFree(MemoryContext context, void *pointer)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+	GenerationChunk	   *chunk = GenerationPointerGetChunk(pointer);
+	GenerationBlock	   *block = chunk->block;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+							  sizeof(chunk->requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->requested_size < chunk->size)
+		if (!sentinel_ok(pointer, chunk->requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 ((MemoryContext)set)->name, chunk);
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->requested_size = 0;
+#endif
+
+	block->nfree += 1;
+
+	Assert(block->nchunks > 0);
+	Assert(block->nfree <= block->nchunks);
+
+	/* If there are still allocated chunks on the block, we're done. */
+	if (block->nfree < block->nchunks)
+		return;
+
+	/*
+	 * The block is empty, so let's get rid of it. First remove it from the
+	 * list of blocks, then return it to malloc().
+	 */
+	dlist_delete(&block->node);
+
+	/* Also make sure the block is not marked as the current block. */
+	if (set->block == block)
+		set->block = NULL;
+
+	free(block);
+}
+
+/*
+ * GenerationRealloc
+ *		When handling repalloc, we simply allocate a new chunk, copy the data
+ *		and discard the old one. The only exception is when the new size fits
+ *		into the old chunk - in that case we just update chunk header.
+ */
+static void *
+GenerationRealloc(MemoryContext context, void *pointer, Size size)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+	GenerationChunk	   *chunk = GenerationPointerGetChunk(pointer);
+	GenerationPointer	newPointer;
+	Size		oldsize = chunk->size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+							  sizeof(chunk->requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->requested_size < oldsize)
+		if (!sentinel_ok(pointer, chunk->requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 ((MemoryContext)set)->name, chunk);
+#endif
+
+	/*
+	 * Maybe the allocated area already is >= the new size.  (In particular,
+	 * we always fall out here if the requested size is a decrease.)
+	 *
+	 * This memory context is not use the power-of-2 chunk sizing and instead
+	 * carves the chunks to be as small as possible, so most repalloc() calls
+	 * will end up in the palloc/memcpy/pfree branch.
+	 *
+	 * XXX Perhaps we should annotate this condition with unlikely()?
+	 */
+	if (oldsize >= size)
+	{
+#ifdef MEMORY_CONTEXT_CHECKING
+		Size		oldrequest = chunk->requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* We can only fill the extra space if we know the prior request */
+		if (size > oldrequest)
+			randomize_mem((char *) pointer + oldrequest,
+						  size - oldrequest);
+#endif
+
+		chunk->requested_size = size;
+		VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+								   sizeof(chunk->requested_size));
+
+		/*
+		 * If this is an increase, mark any newly-available part UNDEFINED.
+		 * Otherwise, mark the obsolete part NOACCESS.
+		 */
+		if (size > oldrequest)
+			VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+										size - oldrequest);
+		else
+			VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+									   oldsize - size);
+
+		/* set mark to catch clobber of "unused" space */
+		if (size < oldsize)
+			set_sentinel(pointer, size);
+#else							/* !MEMORY_CONTEXT_CHECKING */
+
+		/*
+		 * We don't have the information to determine whether we're growing
+		 * the old request or shrinking it, so we conservatively mark the
+		 * entire new allocation DEFINED.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize);
+		VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+		return pointer;
+	}
+
+	/* allocate new chunk */
+	newPointer = GenerationAlloc((MemoryContext) set, size);
+
+	/* leave immediately if request was not completed */
+	if (newPointer == NULL)
+		return NULL;
+
+	/*
+	 * GenerationSetAlloc() just made the region NOACCESS.  Change it to UNDEFINED
+	 * for the moment; memcpy() will then transfer definedness from the old
+	 * allocation to the new.  If we know the old allocation, copy just that
+	 * much.  Otherwise, make the entire old chunk defined to avoid errors as
+	 * we copy the currently-NOACCESS trailing bytes.
+	 */
+	VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+	oldsize = chunk->requested_size;
+#else
+	VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+	/* transfer existing data (certain to fit) */
+	memcpy(newPointer, pointer, oldsize);
+
+	/* free old chunk */
+	GenerationFree((MemoryContext) set, pointer);
+
+	return newPointer;
+}
+
+/*
+ * GenerationGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+GenerationGetChunkSpace(MemoryContext context, void *pointer)
+{
+	GenerationChunk *chunk = GenerationPointerGetChunk(pointer);
+
+	return chunk->size + Generation_CHUNKHDRSZ;
+}
+
+/*
+ * GenerationIsEmpty
+ *		Is an Generation empty of any allocated space?
+ */
+static bool
+GenerationIsEmpty(MemoryContext context)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+
+	return dlist_is_empty(&set->blocks);
+}
+
+/*
+ * GenerationStats
+ *		Compute stats about memory consumption of an Generation.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Generation into *totals.
+ *
+ * XXX freespace only accounts for empty space at the end of the block, not
+ * space of freed chunks (which is unknown).
+ */
+static void
+GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+
+	Size		nblocks = 0;
+	Size		nchunks = 0;
+	Size		nfreechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+
+	dlist_iter	iter;
+
+	dlist_foreach(iter, &set->blocks)
+	{
+		GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur);
+
+		nblocks++;
+		nchunks += block->nchunks;
+		nfreechunks += block->nfree;
+		totalspace += set->blockSize;
+		freespace += (block->endptr - block->freeptr);
+	}
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"Generation: %s: %zu total in %zd blocks (%zd chunks); %zu free (%zd chunks); %zu used\n",
+				((MemoryContext)set)->name, totalspace, nblocks, nchunks, freespace,
+				nfreechunks, totalspace - freespace);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += nfreechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenerationCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+GenerationCheck(MemoryContext context)
+{
+	GenerationContext  *gen = (GenerationContext *) context;
+	char	   *name = context->name;
+	dlist_iter	iter;
+
+	/* walk all blocks in this context */
+	dlist_foreach(iter, &gen->blocks)
+	{
+		int			nfree,
+					nchunks;
+		char	   *ptr;
+		GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur);
+
+		/* We can't free more chunks than allocated. */
+		if (block->nfree <= block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p exceeds %d allocated",
+				 name, block->nfree, block, block->nchunks);
+
+		/* Now walk through the chunks and count them. */
+		nfree = 0;
+		nchunks = 0;
+		ptr = ((char *) block) + Generation_BLOCKHDRSZ;
+
+		while (ptr < block->freeptr)
+		{
+			GenerationChunk *chunk = (GenerationChunk *) ptr;
+
+			/* move to the next chunk */
+			ptr += (chunk->size + Generation_CHUNKHDRSZ);
+
+			/* chunks have both block and context pointers, so check both */
+			if (chunk->block != block)
+				elog(WARNING, "problem in Generation %s: bogus block link in block %p, chunk %p",
+					 name, block, chunk);
+
+			if (chunk->context != gen)
+				elog(WARNING, "problem in Generation %s: bogus context link in block %p, chunk %p",
+					 name, block, chunk);
+
+			nchunks += 1;
+
+			/* if requested_size==0, the chunk was freed */
+			if (chunk->requested_size > 0)
+			{
+				/* if the chunk was not freed, we can trigger valgrind checks */
+				VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+									   sizeof(chunk->requested_size));
+
+				/* we're in a no-freelist branch */
+				VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+									   sizeof(chunk->requested_size));
+
+				/* now make sure the chunk size is correct */
+				if (chunk->size != MAXALIGN(chunk->requested_size))
+					elog(WARNING, "problem in Generation %s: bogus chunk size in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* there might be sentinel (thanks to alignment) */
+				if (chunk->requested_size < chunk->size &&
+					!sentinel_ok(chunk, Generation_CHUNKHDRSZ + chunk->requested_size))
+					elog(WARNING, "problem in Generation %s: detected write past chunk end in block %p, chunk %p",
+						 name, block, chunk);
+			}
+			else
+				nfree += 1;
+		}
+
+		/*
+		 * Make sure we got the expected number of allocated and free chunks
+		 * (as tracked in the block header).
+		 */
+		if (nchunks != block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of allocated chunks %d in block %p does not match header %d",
+				 name, nchunks, block, block->nchunks);
+
+		if (nfree != block->nfree)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p does not match header %d",
+				 name, nfree, block, block->nfree);
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index 7a0c676..e22d9fb 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,8 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
+	 (IsA((context), AllocSetContext) || \
+	  IsA((context), SlabContext) || \
+	  IsA((context), GenerationContext)))
 
 #endif							/* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 27bd4f3..202ecb3 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -274,6 +274,7 @@ typedef enum NodeTag
 	T_MemoryContext,
 	T_AllocSetContext,
 	T_SlabContext,
+	T_GenerationContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 86effe1..b18ce5a 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -344,20 +344,7 @@ struct ReorderBuffer
 	 */
 	MemoryContext change_context;
 	MemoryContext txn_context;
-
-	/*
-	 * Data structure slab cache.
-	 *
-	 * We allocate/deallocate some structures very frequently, to avoid bigger
-	 * overhead we cache some unused ones here.
-	 *
-	 * The maximum number of cached entries is controlled by const variables
-	 * on top of reorderbuffer.c
-	 */
-
-	/* cached ReorderBufferTupleBufs */
-	slist_head	cached_tuplebufs;
-	Size		nr_cached_tuplebufs;
+	MemoryContext tup_context;
 
 	XLogRecPtr	current_restart_decoding_lsn;
 
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index c553349..42b5246 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -155,6 +155,11 @@ extern MemoryContext SlabContextCreate(MemoryContext parent,
 				  Size blockSize,
 				  Size chunkSize);
 
+/* generation.c */
+extern MemoryContext GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
-- 
2.9.4

#99Simon Riggs
simon@2ndquadrant.com
In reply to: Tomas Vondra (#98)
Re: PATCH : Generational memory allocator (was PATCH: two slab-like memory allocators)

On 14 August 2017 at 01:35, Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:

Hi,

Attached is a rebased version of the Generational context, originally
submitted with SlabContext (which was already committed into Pg 10).

The main change is that I've abandoned the pattern of defining a Data
structure and then a pointer typedef, i.e.

typedef struct GenerationContextData { ... } GenerationContextData;
typedef struct GenerationContextData *GenerationContext;

Now it's just

typedef struct GenerationContext { ... } GenerationContext;

mostly because SlabContext was committed like that, and because Andres was
complaining about this code pattern ;-)

Otherwise the design is the same as repeatedly discussed before.

To show that this is still valuable change (even after SlabContext and
adding doubly-linked list to AllocSet), I've repeated the test done by
Andres in [1] using the test case described in [2], that is

-- generate data
SELECT COUNT(*) FROM (SELECT test1()
FROM generate_series(1, 50000)) foo;

-- benchmark (measure time and VmPeak)
SELECT COUNT(*) FROM (SELECT *
FROM pg_logical_slot_get_changes('test', NULL,
NULL, 'include-xids', '0')) foo;

with different values passed to the first step (instead of the 50000). The
VmPeak numbers look like this:

N master patched
--------------------------------------
100000 1155220 kB 361604 kB
200000 2020668 kB 434060 kB
300000 2890236 kB 502452 kB
400000 3751592 kB 570816 kB
500000 4621124 kB 639168 kB

and the timing (on assert-enabled build):

N master patched
--------------------------------------
100000 1103.182 ms 412.734 ms
200000 2216.711 ms 820.438 ms
300000 3320.095 ms 1223.576 ms
400000 4584.919 ms 1621.261 ms
500000 5590.444 ms 2113.820 ms

So it seems it's still a significant improvement, both in terms of memory
usage and timing. Admittedly, this is a single test, so ideas of other
useful test cases are welcome.

This all looks good.

What I think this needs is changes to
src/backend/utils/mmgr/README
which decribe the various options that now exist (normal?, slab) and
will exist (generational)

Don't really care about the name, as long as its clear when to use it
and when not to use it.

This description of generational seems wrong: "When the allocated
chunks have similar lifespan, this works very well and is extremely
cheap."
They don't need the same lifespan they just need to be freed in groups
and in the order they were allocated.

For this patch specifically, we need additional comments in
reorderbuffer.c to describe the memory allocation pattern in that
module so that it is clear that the choice of allocator is useful and
appropriate, possibly with details of how that testing was performed,
so it can be re-tested later or tested on a variety of platforms.

Particularly in reorderbuffer, surely we will almost immediately reuse
chunks again, so is it worth issuing free() and then malloc() again
soon after? Does that cause additional overhead we could also avoid?
Could we possibly keep the last/few free'd chunks around rather than
re-malloc?

Seems like we should commit this soon.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#100Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Simon Riggs (#99)
Re: PATCH : Generational memory allocator (was PATCH: two slab-like memory allocators)

On 09/14/2017 04:21 PM, Simon Riggs wrote:

On 14 August 2017 at 01:35, Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:

Hi,

Attached is a rebased version of the Generational context, originally
submitted with SlabContext (which was already committed into Pg 10).

The main change is that I've abandoned the pattern of defining a Data
structure and then a pointer typedef, i.e.

typedef struct GenerationContextData { ... } GenerationContextData;
typedef struct GenerationContextData *GenerationContext;

Now it's just

typedef struct GenerationContext { ... } GenerationContext;

mostly because SlabContext was committed like that, and because Andres was
complaining about this code pattern ;-)

Otherwise the design is the same as repeatedly discussed before.

To show that this is still valuable change (even after SlabContext and
adding doubly-linked list to AllocSet), I've repeated the test done by
Andres in [1] using the test case described in [2], that is

-- generate data
SELECT COUNT(*) FROM (SELECT test1()
FROM generate_series(1, 50000)) foo;

-- benchmark (measure time and VmPeak)
SELECT COUNT(*) FROM (SELECT *
FROM pg_logical_slot_get_changes('test', NULL,
NULL, 'include-xids', '0')) foo;

with different values passed to the first step (instead of the 50000). The
VmPeak numbers look like this:

N master patched
--------------------------------------
100000 1155220 kB 361604 kB
200000 2020668 kB 434060 kB
300000 2890236 kB 502452 kB
400000 3751592 kB 570816 kB
500000 4621124 kB 639168 kB

and the timing (on assert-enabled build):

N master patched
--------------------------------------
100000 1103.182 ms 412.734 ms
200000 2216.711 ms 820.438 ms
300000 3320.095 ms 1223.576 ms
400000 4584.919 ms 1621.261 ms
500000 5590.444 ms 2113.820 ms

So it seems it's still a significant improvement, both in terms of memory
usage and timing. Admittedly, this is a single test, so ideas of other
useful test cases are welcome.

This all looks good.

What I think this needs is changes to
src/backend/utils/mmgr/README
which decribe the various options that now exist (normal?, slab) and
will exist (generational)

Don't really care about the name, as long as its clear when to use it
and when not to use it.

This description of generational seems wrong: "When the allocated
chunks have similar lifespan, this works very well and is extremely
cheap."
They don't need the same lifespan they just need to be freed in groups
and in the order they were allocated.

Agreed, should be described differently. What matters is (mostly) FIFO
pattern of the palloc/pfree requests, which allows us to release the memory.

For this patch specifically, we need additional comments in
reorderbuffer.c to describe the memory allocation pattern in that
module so that it is clear that the choice of allocator is useful
and appropriate, possibly with details of how that testing was
performed, so it can be re-tested later or tested on a variety of
platforms.

Including details about the testing into reorderbuffer.c does not seem
very attractive to me. I don't recall any other place describing the
tests in detail. Why not to discuss the details here, and then include a
link to this thread in the commit message?

In any case, I doubt anyone will repeat the testing on a variety of
platforms (and I don't have any such comprehensive test suite anyway).
What will likely happen is someone bumping into a poorly performing
corner case, we will analyze it and fix it as usual.

Particularly in reorderbuffer, surely we will almost immediately
reuse chunks again, so is it worth issuing free() and then malloc()
again soon after? Does that cause additional overhead we could also
avoid? Could we possibly keep the last/few free'd chunks around
rather than re-malloc?

I haven't seen anything like that in tests I've done. The malloc/free
overhead is negligible thanks as our allocators significantly reduce the
number of calls to those functions. If we happen to run into such case,
it shouldn't be difficult to keep a few empty blocks. But perhaps we can
leave that as a future optimization.

Seems like we should commit this soon.

Thanks.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#101Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Tomas Vondra (#100)
1 attachment(s)
Re: PATCH : Generational memory allocator (was PATCH: two slab-like memory allocators)

Hi,

Attached is an updated version of the patch, tweaking the comments.

1) I've added a section at the end of src/backend/utils/mmgr/README,
briefly explaining the alternative memory allocators we have. I don't
think we should get into too much low-level detail here, that's more
appropriate for the .c file for each context.

2) I've slightly reworded a paragraph in generation.c describing what
use cases are suitable for the memory context. It used to say:

This memory context is based on the assumption that the allocated
chunks have similar lifespan, i.e. that chunks allocated close from
each other (by time) will also be freed in close proximity, and
mostly in the same order. This is typical for various queue-like use
cases, i.e. when tuples are constructed, processed and then thrown
away.

and now it says:

This memory context is based on the assumption that the chunks are
freed roughly in the same order as they were allocated (FIFO), or in
groups with similar lifespan (generations - hence the name of the
context). This is typical for various queue-like use cases, i.e. when
tuples are constructed, processed and then thrown away.

3) I've also added a brief note into reorderbuffer.c mentioning that it
uses SlabContext and GenerationContext. As I explained, I don't think we
should include details about how we tested the patch or whatever here.

regard

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

0001-Generational-memory-allocator-v2.patchtext/x-patch; name=0001-Generational-memory-allocator-v2.patchDownload
From 25806c68a05287f3294f2d8508bd45599232f67b Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Sun, 24 Sep 2017 22:19:17 +0200
Subject: [PATCH] Generational memory allocator

This memory context is based on the assumption that the allocated chunks
have similar lifespan, i.e. that chunks allocated close from each other
(by time) will also be freed in close proximity, and mostly in the same
order. This is typical for various queue-like use cases, i.e. when
tuples are constructed, processed and then thrown away.

The memory context uses a very simple approach to free space management.
Instead of a complex global freelist, each block tracks a number
of allocated and freed chunks. The space released by freed chunks is not
reused, and once all chunks are freed (i.e. when nallocated == nfreed),
the whole block is thrown away. When the allocated chunks have similar
lifespan, this works very well and is extremely cheap.
---
 src/backend/replication/logical/reorderbuffer.c |  80 +--
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/README                   |  23 +
 src/backend/utils/mmgr/generation.c             | 768 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   4 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |  15 +-
 src/include/utils/memutils.h                    |   5 +
 8 files changed, 819 insertions(+), 79 deletions(-)
 create mode 100644 src/backend/utils/mmgr/generation.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 0f607ba..dc0ad5b 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -43,6 +43,12 @@
  *	  transaction there will be no other data carrying records between a row's
  *	  toast chunks and the row data itself. See ReorderBufferToast* for
  *	  details.
+ *
+ *	  ReorderBuffer uses two special memory context types - SlabContext for
+ *	  allocations of fixed-length structures (changes and transactions), and
+ *	  GenerationContext for the variable-length transaction data (allocated
+ *	  and freed in groups with similar lifespan).
+ *
  * -------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -150,15 +156,6 @@ typedef struct ReorderBufferDiskChange
  */
 static const Size max_changes_in_memory = 4096;
 
-/*
- * We use a very simple form of a slab allocator for frequently allocated
- * objects, simply keeping a fixed number in a linked list when unused,
- * instead pfree()ing them. Without that in many workloads aset.c becomes a
- * major bottleneck, especially when spilling to disk while decoding batch
- * workloads.
- */
-static const Size max_cached_tuplebufs = 4096 * 2;	/* ~8MB */
-
 /* ---------------------------------------
  * primary reorderbuffer support routines
  * ---------------------------------------
@@ -248,6 +245,10 @@ ReorderBufferAllocate(void)
 											SLAB_DEFAULT_BLOCK_SIZE,
 											sizeof(ReorderBufferTXN));
 
+	buffer->tup_context = GenerationContextCreate(new_ctx,
+										   "Tuples",
+										   SLAB_LARGE_BLOCK_SIZE);
+
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
 	hash_ctl.hcxt = buffer->context;
@@ -258,15 +259,12 @@ ReorderBufferAllocate(void)
 	buffer->by_txn_last_xid = InvalidTransactionId;
 	buffer->by_txn_last_txn = NULL;
 
-	buffer->nr_cached_tuplebufs = 0;
-
 	buffer->outbuf = NULL;
 	buffer->outbufsize = 0;
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
 	dlist_init(&buffer->toplevel_by_lsn);
-	slist_init(&buffer->cached_tuplebufs);
 
 	return buffer;
 }
@@ -419,42 +417,12 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/*
-	 * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-	 * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-	 * generated for oldtuples can be bigger, as they don't have out-of-line
-	 * toast columns.
-	 */
-	if (alloc_len < MaxHeapTupleSize)
-		alloc_len = MaxHeapTupleSize;
-
-
-	/* if small enough, check the slab cache */
-	if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs--;
-		tuple = slist_container(ReorderBufferTupleBuf, node,
-								slist_pop_head_node(&rb->cached_tuplebufs));
-		Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-		memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-		VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-		memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
-	}
-	else
-	{
-		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->context,
-							   sizeof(ReorderBufferTupleBuf) +
-							   MAXIMUM_ALIGNOF + alloc_len);
-		tuple->alloc_tuple_size = alloc_len;
-		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-	}
+	tuple = (ReorderBufferTupleBuf *)
+		MemoryContextAlloc(rb->tup_context,
+						   sizeof(ReorderBufferTupleBuf) +
+						   MAXIMUM_ALIGNOF + alloc_len);
+	tuple->alloc_tuple_size = alloc_len;
+	tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
 
 	return tuple;
 }
@@ -468,21 +436,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
-	/* check whether to put into the slab cache, oversized tuples never are */
-	if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-		rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs++;
-		slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-	}
-	else
-	{
-		pfree(tuple);
-	}
+	pfree(tuple);
 }
 
 /*
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index cd0e803..f644c40 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
+OBJS = aset.o dsa.o freepage.o generation.o mcxt.o memdebug.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/README b/src/backend/utils/mmgr/README
index 0ab81bd..296fa19 100644
--- a/src/backend/utils/mmgr/README
+++ b/src/backend/utils/mmgr/README
@@ -431,3 +431,26 @@ will not allocate very much space per tuple cycle.  To make this usage
 pattern cheap, the first block allocated in a context is not given
 back to malloc() during reset, but just cleared.  This avoids malloc
 thrashing.
+
+
+Alternative Memory Context Implementations
+------------------------------------------
+
+aset.c is our default general-purpose implementation, working fine
+in most situations. We also have two implementations optimized for
+special use cases, providing either better performance or lower memory
+usage compared to aset.c (or both).
+
+* slab.c (SlabContext) is designed for allocations of fixed-length
+  chunks, and does not allow allocations of chunks with different size.
+
+* generation.c (GenerationContext) is designed for cases when chunks
+  are allocated in groups with similar lifespan (generations), or
+  roughly in FIFO order.
+
+Both memory contexts aim to free memory back to the operating system
+(unlike aset.c, which keeps the freed chunks in a freelist, and only
+returns the memory when reset/deleted).
+
+These memory contexts were initially developed for ReorderBuffer, but
+may be useful elsewhere as long as the allocation patterns match.
diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c
new file mode 100644
index 0000000..11a6a37
--- /dev/null
+++ b/src/backend/utils/mmgr/generation.c
@@ -0,0 +1,768 @@
+/*-------------------------------------------------------------------------
+ *
+ * generation.c
+ *	  Generational allocator definitions.
+ *
+ * Generation is a custom MemoryContext implementation designed for cases of
+ * chunks with similar lifespan.
+ *
+ * Portions Copyright (c) 2017, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/Generation.c
+ *
+ *
+ *	This memory context is based on the assumption that the chunks are freed
+ *	roughly in the same order as they were allocated (FIFO), or in groups with
+ *	similar lifespan (generations - hence the name of the context). This is
+ *	typical for various queue-like use cases, i.e. when tuples are constructed,
+ *	processed and then thrown away.
+ *
+ *	The memory context uses a very simple approach to free space management.
+ *	Instead of a complex global freelist, each block tracks a number
+ *	of allocated and freed chunks. Freed chunks are not reused, and once all
+ *	chunks on a block are freed, the whole block is thrown away. When the
+ *	chunks allocated on the same block have similar lifespan, this works
+ *	very well and is very cheap.
+ *
+ *	The current implementation only uses a fixed block size - maybe it should
+ *	adapt a min/max block size range, and grow the blocks automatically.
+ *	It already uses dedicated blocks for oversized chunks.
+ *
+ *	XXX It might be possible to improve this by keeping a small freelist for
+ *	only a small number of recent blocks, but it's not clear it's worth the
+ *	additional complexity.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "lib/ilist.h"
+
+
+#define Generation_BLOCKHDRSZ	MAXALIGN(sizeof(GenerationBlock))
+#define Generation_CHUNKHDRSZ	sizeof(GenerationChunk)
+
+/* Portion of Generation_CHUNKHDRSZ examined outside Generation.c. */
+#define Generation_CHUNK_PUBLIC	\
+	(offsetof(GenerationChunk, size) + sizeof(Size))
+
+/* Portion of Generation_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunk, requested_size) + sizeof(Size))
+#else
+#define Generation_CHUNK_USED	\
+	(offsetof(GenerationChunk, size) + sizeof(Size))
+#endif
+
+typedef struct GenerationBlock GenerationBlock;	/* forward reference */
+typedef struct GenerationChunk GenerationChunk;
+
+typedef void *GenerationPointer;
+
+/*
+ * GenerationContext is a simple memory context not reusing allocated chunks, and
+ * freeing blocks once all chunks are freed.
+ */
+typedef struct GenerationContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+
+	/* Generationerational context parameters */
+	Size		blockSize;		/* block size */
+
+	GenerationBlock	*block;		/* current (most recently allocated) block */
+	dlist_head	blocks;			/* list of blocks */
+
+}	GenerationContext;
+
+/*
+ * GenerationBlock
+ *		A GenerationBlock is the unit of memory that is obtained by Generation.c
+ *		from malloc().  It contains one or more GenerationChunks, which are
+ *		the units requested by palloc() and freed by pfree().  GenerationChunks
+ *		cannot be returned to malloc() individually, instead pfree()
+ *		updates a free counter on a block and when all chunks on a block
+ *		are freed the whole block is returned to malloc().
+ *
+ *		GenerationBloc is the header data for a block --- the usable space
+ *		within the block begins at the next alignment boundary.
+ */
+typedef struct GenerationBlock
+{
+	dlist_node	node;			/* doubly-linked list */
+	int			nchunks;		/* number of chunks in the block */
+	int			nfree;			/* number of free chunks */
+	char	   *freeptr;		/* start of free space in this block */
+	char	   *endptr;			/* end of space in this block */
+}	GenerationBlock;
+
+/*
+ * GenerationChunk
+ *		The prefix of each piece of memory in an GenerationBlock
+ */
+typedef struct GenerationChunk
+{
+	/* block owning this chunk */
+	void	   *block;
+
+	/* size is always the size of the usable space in the chunk */
+	Size		size;
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* when debugging memory usage, also store actual requested size */
+	/* this is zero in a free chunk */
+	Size		requested_size;
+#endif   /* MEMORY_CONTEXT_CHECKING */
+
+	GenerationContext *context;		/* owning context */
+	/* there must not be any padding to reach a MAXALIGN boundary here! */
+}	GenerationChunk;
+
+
+/*
+ * GenerationIsValid
+ *		True iff set is valid allocation set.
+ */
+#define GenerationIsValid(set) PointerIsValid(set)
+
+#define GenerationPointerGetChunk(ptr) \
+	((GenerationChunk *)(((char *)(ptr)) - Generation_CHUNKHDRSZ))
+#define GenerationChunkGetPointer(chk) \
+	((GenerationPointer *)(((char *)(chk)) + Generation_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Generation contexts.
+ */
+static void *GenerationAlloc(MemoryContext context, Size size);
+static void GenerationFree(MemoryContext context, void *pointer);
+static void *GenerationRealloc(MemoryContext context, void *pointer, Size size);
+static void GenerationInit(MemoryContext context);
+static void GenerationReset(MemoryContext context);
+static void GenerationDelete(MemoryContext context);
+static Size GenerationGetChunkSpace(MemoryContext context, void *pointer);
+static bool GenerationIsEmpty(MemoryContext context);
+static void GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void GenerationCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Generation contexts.
+ */
+static MemoryContextMethods GenerationMethods = {
+	GenerationAlloc,
+	GenerationFree,
+	GenerationRealloc,
+	GenerationInit,
+	GenerationReset,
+	GenerationDelete,
+	GenerationGetChunkSpace,
+	GenerationIsEmpty,
+	GenerationStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,GenerationCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define GenerationFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationFree: %s: %p, %lu\n", \
+				(_cxt)->name, (_chunk), (_chunk)->size)
+#define GenerationAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "GenerationAlloc: %s: %p, %lu\n", \
+				(_cxt)->name, (_chunk), (_chunk)->size)
+#else
+#define GenerationFreeInfo(_cxt, _chunk)
+#define GenerationAllocInfo(_cxt, _chunk)
+#endif
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenerationContextCreate
+ *		Create a new Generation context.
+ */
+MemoryContext
+GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize)
+{
+	GenerationContext  *set;
+
+	StaticAssertStmt(offsetof(GenerationChunk, context) + sizeof(MemoryContext) ==
+					 MAXALIGN(sizeof(GenerationChunk)),
+					 "padding calculation in GenerationChunk is wrong");
+
+	/*
+	 * First, validate allocation parameters.  (If we're going to throw an
+	 * error, we should do so before the context is created, not after.)  We
+	 * somewhat arbitrarily enforce a minimum 1K block size, mostly because
+	 * that's what AllocSet does.
+	 */
+	if (blockSize != MAXALIGN(blockSize) ||
+		blockSize < 1024 ||
+		!AllocHugeSizeIsValid(blockSize))
+		elog(ERROR, "invalid blockSize for memory context: %zu",
+			 blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (GenerationContext *) MemoryContextCreate(T_GenerationContext,
+									sizeof(GenerationContext),
+									&GenerationMethods,
+									parent,
+									name);
+
+	set->blockSize = blockSize;
+	set->block = NULL;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * GenerationInit
+ *		Context-type-specific initialization routine.
+ */
+static void
+GenerationInit(MemoryContext context)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+
+	dlist_init(&set->blocks);
+}
+
+/*
+ * GenerationReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+GenerationReset(MemoryContext context)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+	dlist_mutable_iter miter;
+
+	AssertArg(GenerationIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	GenerationCheck(context);
+#endif
+
+	dlist_foreach_modify(miter, &set->blocks)
+	{
+		GenerationBlock *block = dlist_container(GenerationBlock, node, miter.cur);
+
+		dlist_delete(miter.cur);
+
+		/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+		wipe_mem(block, set->blockSize);
+#endif
+
+		free(block);
+	}
+
+	set->block = NULL;
+
+	Assert(dlist_is_empty(&set->blocks));
+}
+
+/*
+ * GenerationDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call GenerationReset() which does all the
+ *		dirty work.
+ */
+static void
+GenerationDelete(MemoryContext context)
+{
+	/* just reset (although not really necessary) */
+	GenerationReset(context);
+}
+
+/*
+ * GenerationAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - Generation_BLOCKHDRSZ - Generation_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+GenerationAlloc(MemoryContext context, Size size)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+	GenerationBlock	   *block;
+	GenerationChunk	   *chunk;
+
+	Size		chunk_size = MAXALIGN(size);
+
+	/* is it an over-sized chunk? if yes, allocate special block */
+	if (chunk_size > set->blockSize / 8)
+	{
+		Size		blksize = chunk_size + Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ;
+
+		block = (GenerationBlock *) malloc(blksize);
+		if (block == NULL)
+			return NULL;
+
+		/* block with a single (used) chunk */
+		block->nchunks = 1;
+		block->nfree = 0;
+
+		/* the block is completely full */
+		block->freeptr = block->endptr = ((char *) block) + blksize;
+
+		chunk = (GenerationChunk *) (((char *) block) + Generation_BLOCKHDRSZ);
+		chunk->context = set;
+		chunk->size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+		/* Valgrind: Will be made NOACCESS below. */
+		chunk->requested_size = size;
+		/* set mark to catch clobber of "unused" space */
+		if (size < chunk_size)
+			set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* fill the allocated space with junk */
+		randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+		/* add the block to the list of allocated blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		GenerationAllocInfo(set, chunk);
+
+		/*
+		 * Chunk header public fields remain DEFINED.  The requested
+		 * allocation itself can be NOACCESS or UNDEFINED; our caller will
+		 * soon make it UNDEFINED.  Make extra space at the end of the chunk,
+		 * if any, NOACCESS.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS((char *) chunk + Generation_CHUNK_PUBLIC,
+							 chunk_size + Generation_CHUNKHDRSZ - Generation_CHUNK_PUBLIC);
+
+		return GenerationChunkGetPointer(chunk);
+	}
+
+	/*
+	 * Not an over-sized chunk. Is there enough space on the current block? If
+	 * not, allocate a new "regular" block.
+	 */
+	block = set->block;
+
+	if ((block == NULL) ||
+		(block->endptr - block->freeptr) < Generation_CHUNKHDRSZ + chunk_size)
+	{
+		Size		blksize = set->blockSize;
+
+		block = (GenerationBlock *) malloc(blksize);
+
+		if (block == NULL)
+			return NULL;
+
+		block->nchunks = 0;
+		block->nfree = 0;
+
+		block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ;
+		block->endptr = ((char *) block) + blksize;
+
+		/* Mark unallocated space NOACCESS. */
+		VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+								   blksize - Generation_BLOCKHDRSZ);
+
+		/* add it to the doubly-linked list of blocks */
+		dlist_push_head(&set->blocks, &block->node);
+
+		/* and also use it as the current allocation block */
+		set->block = block;
+	}
+
+	/* we're supposed to have a block with enough free space now */
+	Assert(block != NULL);
+	Assert((block->endptr - block->freeptr) >= Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk = (GenerationChunk *) block->freeptr;
+
+	block->nchunks += 1;
+	block->freeptr += (Generation_CHUNKHDRSZ + chunk_size);
+
+	chunk->block = block;
+
+	chunk->context = set;
+	chunk->size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Valgrind: Free list requested_size should be DEFINED. */
+	chunk->requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+							   sizeof(chunk->requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->size)
+		set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+	GenerationAllocInfo(set, chunk);
+	return GenerationChunkGetPointer(chunk);
+}
+
+/*
+ * GenerationFree
+ *		Update number of chunks on the block, and if all chunks on the block
+ *		are freeed then discard the block.
+ */
+static void
+GenerationFree(MemoryContext context, void *pointer)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+	GenerationChunk	   *chunk = GenerationPointerGetChunk(pointer);
+	GenerationBlock	   *block = chunk->block;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+							  sizeof(chunk->requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->requested_size < chunk->size)
+		if (!sentinel_ok(pointer, chunk->requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 ((MemoryContext)set)->name, chunk);
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->requested_size = 0;
+#endif
+
+	block->nfree += 1;
+
+	Assert(block->nchunks > 0);
+	Assert(block->nfree <= block->nchunks);
+
+	/* If there are still allocated chunks on the block, we're done. */
+	if (block->nfree < block->nchunks)
+		return;
+
+	/*
+	 * The block is empty, so let's get rid of it. First remove it from the
+	 * list of blocks, then return it to malloc().
+	 */
+	dlist_delete(&block->node);
+
+	/* Also make sure the block is not marked as the current block. */
+	if (set->block == block)
+		set->block = NULL;
+
+	free(block);
+}
+
+/*
+ * GenerationRealloc
+ *		When handling repalloc, we simply allocate a new chunk, copy the data
+ *		and discard the old one. The only exception is when the new size fits
+ *		into the old chunk - in that case we just update chunk header.
+ */
+static void *
+GenerationRealloc(MemoryContext context, void *pointer, Size size)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+	GenerationChunk	   *chunk = GenerationPointerGetChunk(pointer);
+	GenerationPointer	newPointer;
+	Size		oldsize = chunk->size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+							  sizeof(chunk->requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->requested_size < oldsize)
+		if (!sentinel_ok(pointer, chunk->requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 ((MemoryContext)set)->name, chunk);
+#endif
+
+	/*
+	 * Maybe the allocated area already is >= the new size.  (In particular,
+	 * we always fall out here if the requested size is a decrease.)
+	 *
+	 * This memory context is not use the power-of-2 chunk sizing and instead
+	 * carves the chunks to be as small as possible, so most repalloc() calls
+	 * will end up in the palloc/memcpy/pfree branch.
+	 *
+	 * XXX Perhaps we should annotate this condition with unlikely()?
+	 */
+	if (oldsize >= size)
+	{
+#ifdef MEMORY_CONTEXT_CHECKING
+		Size		oldrequest = chunk->requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+		/* We can only fill the extra space if we know the prior request */
+		if (size > oldrequest)
+			randomize_mem((char *) pointer + oldrequest,
+						  size - oldrequest);
+#endif
+
+		chunk->requested_size = size;
+		VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+								   sizeof(chunk->requested_size));
+
+		/*
+		 * If this is an increase, mark any newly-available part UNDEFINED.
+		 * Otherwise, mark the obsolete part NOACCESS.
+		 */
+		if (size > oldrequest)
+			VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+										size - oldrequest);
+		else
+			VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+									   oldsize - size);
+
+		/* set mark to catch clobber of "unused" space */
+		if (size < oldsize)
+			set_sentinel(pointer, size);
+#else							/* !MEMORY_CONTEXT_CHECKING */
+
+		/*
+		 * We don't have the information to determine whether we're growing
+		 * the old request or shrinking it, so we conservatively mark the
+		 * entire new allocation DEFINED.
+		 */
+		VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize);
+		VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+		return pointer;
+	}
+
+	/* allocate new chunk */
+	newPointer = GenerationAlloc((MemoryContext) set, size);
+
+	/* leave immediately if request was not completed */
+	if (newPointer == NULL)
+		return NULL;
+
+	/*
+	 * GenerationSetAlloc() just made the region NOACCESS.  Change it to UNDEFINED
+	 * for the moment; memcpy() will then transfer definedness from the old
+	 * allocation to the new.  If we know the old allocation, copy just that
+	 * much.  Otherwise, make the entire old chunk defined to avoid errors as
+	 * we copy the currently-NOACCESS trailing bytes.
+	 */
+	VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+	oldsize = chunk->requested_size;
+#else
+	VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+	/* transfer existing data (certain to fit) */
+	memcpy(newPointer, pointer, oldsize);
+
+	/* free old chunk */
+	GenerationFree((MemoryContext) set, pointer);
+
+	return newPointer;
+}
+
+/*
+ * GenerationGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+GenerationGetChunkSpace(MemoryContext context, void *pointer)
+{
+	GenerationChunk *chunk = GenerationPointerGetChunk(pointer);
+
+	return chunk->size + Generation_CHUNKHDRSZ;
+}
+
+/*
+ * GenerationIsEmpty
+ *		Is an Generation empty of any allocated space?
+ */
+static bool
+GenerationIsEmpty(MemoryContext context)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+
+	return dlist_is_empty(&set->blocks);
+}
+
+/*
+ * GenerationStats
+ *		Compute stats about memory consumption of an Generation.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Generation into *totals.
+ *
+ * XXX freespace only accounts for empty space at the end of the block, not
+ * space of freed chunks (which is unknown).
+ */
+static void
+GenerationStats(MemoryContext context, int level, bool print,
+		 MemoryContextCounters *totals)
+{
+	GenerationContext  *set = (GenerationContext *) context;
+
+	Size		nblocks = 0;
+	Size		nchunks = 0;
+	Size		nfreechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+
+	dlist_iter	iter;
+
+	dlist_foreach(iter, &set->blocks)
+	{
+		GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur);
+
+		nblocks++;
+		nchunks += block->nchunks;
+		nfreechunks += block->nfree;
+		totalspace += set->blockSize;
+		freespace += (block->endptr - block->freeptr);
+	}
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"Generation: %s: %zu total in %zd blocks (%zd chunks); %zu free (%zd chunks); %zu used\n",
+				((MemoryContext)set)->name, totalspace, nblocks, nchunks, freespace,
+				nfreechunks, totalspace - freespace);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += nfreechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenerationCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+GenerationCheck(MemoryContext context)
+{
+	GenerationContext  *gen = (GenerationContext *) context;
+	char	   *name = context->name;
+	dlist_iter	iter;
+
+	/* walk all blocks in this context */
+	dlist_foreach(iter, &gen->blocks)
+	{
+		int			nfree,
+					nchunks;
+		char	   *ptr;
+		GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur);
+
+		/* We can't free more chunks than allocated. */
+		if (block->nfree <= block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p exceeds %d allocated",
+				 name, block->nfree, block, block->nchunks);
+
+		/* Now walk through the chunks and count them. */
+		nfree = 0;
+		nchunks = 0;
+		ptr = ((char *) block) + Generation_BLOCKHDRSZ;
+
+		while (ptr < block->freeptr)
+		{
+			GenerationChunk *chunk = (GenerationChunk *) ptr;
+
+			/* move to the next chunk */
+			ptr += (chunk->size + Generation_CHUNKHDRSZ);
+
+			/* chunks have both block and context pointers, so check both */
+			if (chunk->block != block)
+				elog(WARNING, "problem in Generation %s: bogus block link in block %p, chunk %p",
+					 name, block, chunk);
+
+			if (chunk->context != gen)
+				elog(WARNING, "problem in Generation %s: bogus context link in block %p, chunk %p",
+					 name, block, chunk);
+
+			nchunks += 1;
+
+			/* if requested_size==0, the chunk was freed */
+			if (chunk->requested_size > 0)
+			{
+				/* if the chunk was not freed, we can trigger valgrind checks */
+				VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+									   sizeof(chunk->requested_size));
+
+				/* we're in a no-freelist branch */
+				VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+									   sizeof(chunk->requested_size));
+
+				/* now make sure the chunk size is correct */
+				if (chunk->size != MAXALIGN(chunk->requested_size))
+					elog(WARNING, "problem in Generation %s: bogus chunk size in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* there might be sentinel (thanks to alignment) */
+				if (chunk->requested_size < chunk->size &&
+					!sentinel_ok(chunk, Generation_CHUNKHDRSZ + chunk->requested_size))
+					elog(WARNING, "problem in Generation %s: detected write past chunk end in block %p, chunk %p",
+						 name, block, chunk);
+			}
+			else
+				nfree += 1;
+		}
+
+		/*
+		 * Make sure we got the expected number of allocated and free chunks
+		 * (as tracked in the block header).
+		 */
+		if (nchunks != block->nchunks)
+			elog(WARNING, "problem in Generation %s: number of allocated chunks %d in block %p does not match header %d",
+				 name, nchunks, block, block->nchunks);
+
+		if (nfree != block->nfree)
+			elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p does not match header %d",
+				 name, nfree, block, block->nfree);
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index 7a0c676..e22d9fb 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,8 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
+	 (IsA((context), AllocSetContext) || \
+	  IsA((context), SlabContext) || \
+	  IsA((context), GenerationContext)))
 
 #endif							/* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 27bd4f3..202ecb3 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -274,6 +274,7 @@ typedef enum NodeTag
 	T_MemoryContext,
 	T_AllocSetContext,
 	T_SlabContext,
+	T_GenerationContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 86effe1..b18ce5a 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -344,20 +344,7 @@ struct ReorderBuffer
 	 */
 	MemoryContext change_context;
 	MemoryContext txn_context;
-
-	/*
-	 * Data structure slab cache.
-	 *
-	 * We allocate/deallocate some structures very frequently, to avoid bigger
-	 * overhead we cache some unused ones here.
-	 *
-	 * The maximum number of cached entries is controlled by const variables
-	 * on top of reorderbuffer.c
-	 */
-
-	/* cached ReorderBufferTupleBufs */
-	slist_head	cached_tuplebufs;
-	Size		nr_cached_tuplebufs;
+	MemoryContext tup_context;
 
 	XLogRecPtr	current_restart_decoding_lsn;
 
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 869c59d..ff8e5d7 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -155,6 +155,11 @@ extern MemoryContext SlabContextCreate(MemoryContext parent,
 				  Size blockSize,
 				  Size chunkSize);
 
+/* generation.c */
+extern MemoryContext GenerationContextCreate(MemoryContext parent,
+				 const char *name,
+				 Size blockSize);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
-- 
2.9.5

#102Simon Riggs
simon@2ndquadrant.com
In reply to: Tomas Vondra (#101)
Re: PATCH : Generational memory allocator (was PATCH: two slab-like memory allocators)

On 24 September 2017 at 21:32, Tomas Vondra
<tomas.vondra@2ndquadrant.com> wrote:

Attached is an updated version of the patch, tweaking the comments.

That looks good, thanks. Marking Ready for Committer to give notice
before commit.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers