diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 77375d9..3bbab7b 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -135,6 +135,9 @@ typedef struct ReorderBufferDiskChange /* data follows */ } ReorderBufferDiskChange; +/* 10k tuples seems like a reasonable value (~80MB with MaxHeapTupleSize) */ +#define TUPLES_PER_GENERATION 10000 + /* * Maximum number of changes kept in memory, per transaction. After that, * changes are spooled to disk. @@ -156,9 +159,6 @@ static const Size max_changes_in_memory = 4096; * major bottleneck, especially when spilling to disk while decoding batch * workloads. */ -static const Size max_cached_changes = 4096 * 2; -static const Size max_cached_tuplebufs = 4096 * 2; /* ~8MB */ -static const Size max_cached_transactions = 512; /* --------------------------------------- @@ -243,6 +243,28 @@ ReorderBufferAllocate(void) buffer->context = new_ctx; + buffer->change_context = SlabContextCreate(new_ctx, + "ReorderBufferChange", + SLAB_DEFAULT_BLOCK_SIZE, + sizeof(ReorderBufferChange)); + + buffer->txn_context = SlabContextCreate(new_ctx, + "ReorderBufferChangeTXN", + SLAB_DEFAULT_BLOCK_SIZE, + sizeof(ReorderBufferTXN)); + + buffer->tup_context = SlabContextCreate(new_ctx, + "ReorderBufferChangeTup", + SLAB_LARGE_BLOCK_SIZE, + sizeof(ReorderBufferTupleBuf) + + MAXIMUM_ALIGNOF + MaxHeapTupleSize); + + buffer->tup_context2 = AllocSetContextCreate(new_ctx, + "ReorderBufferChangeTup2", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + hash_ctl.keysize = sizeof(TransactionId); hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt); hash_ctl.hcxt = buffer->context; @@ -262,11 +284,17 @@ ReorderBufferAllocate(void) buffer->current_restart_decoding_lsn = InvalidXLogRecPtr; + buffer->tuples_count = 0; + buffer->tuples_size = 0; + dlist_init(&buffer->toplevel_by_lsn); dlist_init(&buffer->cached_transactions); dlist_init(&buffer->cached_changes); slist_init(&buffer->cached_tuplebufs); + buffer->tup_context_history = NIL; + buffer->current_size = MaxHeapTupleSize; + return buffer; } @@ -293,19 +321,8 @@ ReorderBufferGetTXN(ReorderBuffer *rb) { ReorderBufferTXN *txn; - /* check the slab cache */ - if (rb->nr_cached_transactions > 0) - { - rb->nr_cached_transactions--; - txn = (ReorderBufferTXN *) - dlist_container(ReorderBufferTXN, node, - dlist_pop_head_node(&rb->cached_transactions)); - } - else - { - txn = (ReorderBufferTXN *) - MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN)); - } + txn = (ReorderBufferTXN *) + MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN)); memset(txn, 0, sizeof(ReorderBufferTXN)); @@ -346,18 +363,7 @@ ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) txn->invalidations = NULL; } - /* check whether to put into the slab cache */ - if (rb->nr_cached_transactions < max_cached_transactions) - { - rb->nr_cached_transactions++; - dlist_push_head(&rb->cached_transactions, &txn->node); - VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN)); - VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node)); - } - else - { - pfree(txn); - } + pfree(txn); } /* @@ -368,19 +374,8 @@ ReorderBufferGetChange(ReorderBuffer *rb) { ReorderBufferChange *change; - /* check the slab cache */ - if (rb->nr_cached_changes) - { - rb->nr_cached_changes--; - change = (ReorderBufferChange *) - dlist_container(ReorderBufferChange, node, - dlist_pop_head_node(&rb->cached_changes)); - } - else - { - change = (ReorderBufferChange *) - MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange)); - } + change = (ReorderBufferChange *) + MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange)); memset(change, 0, sizeof(ReorderBufferChange)); return change; @@ -436,20 +431,27 @@ ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change) break; } - /* check whether to put into the slab cache */ - if (rb->nr_cached_changes < max_cached_changes) - { - rb->nr_cached_changes++; - dlist_push_head(&rb->cached_changes, &change->node); - VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange)); - VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node)); - } - else + pfree(change); +} + +static void +cleanup_empty_contexts(ReorderBuffer *rb) +{ + ListCell *lc; + List *newlist = NIL; + + /* first walk through old contexts and throw away the empty ones */ + foreach(lc, rb->tup_context_history) { - pfree(change); + MemoryContext ctx = (MemoryContext)lfirst(lc); + if (! MemoryContextIsEmpty(ctx)) + newlist = lappend(newlist, ctx); + else + MemoryContextDelete(ctx); } -} + rb->tup_context_history = newlist; +} /* * Get an unused, possibly preallocated, ReorderBufferTupleBuf fitting at @@ -463,37 +465,52 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len) alloc_len = tuple_len + SizeofHeapTupleHeader; - /* - * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for - * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples - * generated for oldtuples can be bigger, as they don't have out-of-line - * toast columns. - */ - if (alloc_len < MaxHeapTupleSize) - alloc_len = MaxHeapTupleSize; + /* see if we need to allocate a new context generation */ + if (rb->tuples_count == TUPLES_PER_GENERATION) + { + Size new_size; + Size avg_length = (rb->tuples_size / rb->tuples_count); + + /* get rid of slab contexts that got entirely empty */ + cleanup_empty_contexts(rb); + + rb->tup_context_history + = lappend(rb->tup_context_history, rb->tup_context); + + /* assume +50% is enough slack to fit most tuples into the slab context */ + new_size = MAXALIGN(avg_length * 1.5); + + rb->current_size = new_size; + rb->tup_context = SlabContextCreate(rb->context, + "ReorderBufferChangeTup", + SLAB_LARGE_BLOCK_SIZE, + sizeof(ReorderBufferTupleBuf) + + MAXIMUM_ALIGNOF + rb->current_size); + /* we could also recreate the aset context, with block sizes set so + * that the palloc always does malloc(), but not sure about that */ + + rb->tuples_count = 0; + rb->tuples_size = 0; + } + + rb->tuples_count += 1; + rb->tuples_size += alloc_len; /* if small enough, check the slab cache */ - if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs) + if (alloc_len <= rb->current_size) { - rb->nr_cached_tuplebufs--; - tuple = slist_container(ReorderBufferTupleBuf, node, - slist_pop_head_node(&rb->cached_tuplebufs)); - Assert(tuple->alloc_tuple_size == MaxHeapTupleSize); -#ifdef USE_ASSERT_CHECKING - memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData)); - VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData)); -#endif + tuple = (ReorderBufferTupleBuf *) + MemoryContextAlloc(rb->tup_context, + sizeof(ReorderBufferTupleBuf) + + MAXIMUM_ALIGNOF + rb->current_size); + tuple->alloc_tuple_size = rb->current_size; tuple->tuple.t_data = ReorderBufferTupleBufData(tuple); -#ifdef USE_ASSERT_CHECKING - memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size); - VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size); -#endif } else { tuple = (ReorderBufferTupleBuf *) - MemoryContextAlloc(rb->context, + MemoryContextAlloc(rb->tup_context2, sizeof(ReorderBufferTupleBuf) + MAXIMUM_ALIGNOF + alloc_len); tuple->alloc_tuple_size = alloc_len; @@ -512,21 +529,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len) void ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple) { - /* check whether to put into the slab cache, oversized tuples never are */ - if (tuple->alloc_tuple_size == MaxHeapTupleSize && - rb->nr_cached_tuplebufs < max_cached_tuplebufs) - { - rb->nr_cached_tuplebufs++; - slist_push_head(&rb->cached_tuplebufs, &tuple->node); - VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size); - VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf)); - VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node)); - VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size)); - } - else - { - pfree(tuple); - } + pfree(tuple); } /* diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile index b2403e1..321289f 100644 --- a/src/backend/utils/mmgr/Makefile +++ b/src/backend/utils/mmgr/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = aset.o mcxt.o portalmem.o +OBJS = aset.o mcxt.o portalmem.o slab.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c new file mode 100644 index 0000000..b0aeb1a --- /dev/null +++ b/src/backend/utils/mmgr/slab.c @@ -0,0 +1,821 @@ +/*------------------------------------------------------------------------- + * + * slab.c + * SLAB allocator definitions. + * + * SLAB is a custom memory context MemoryContext implementation designed for + * cases of equally-sized objects. + * + * + * Portions Copyright (c) 2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/mmgr/slab.c + * + * + * The constant allocation size allows significant simplification and various + * optimizations that are not possible in AllocSet. Firstly, we can get rid + * of the doubling and carve the blocks into chunks of exactly the right size + * (plus alignment), now wasting memory. + * + * The allocator also does not need the complex array of freelists (one for + * each possible size), and as the number of chunks per block is constant, + * which allows us to store the per-block freelist directly in the block as + * a bitmap, and organize the blocks into lists by number of free chunks. + * + * The in-block free bitmap allows us to quickly determine whether the whole + * block is empty, and free it in that case. This is another major difference + * compared to AllocSet, which never frees the allocated memory (unless the + * whole context is reset, which is not very practical in most cases). + * + * To make the freeing more likely, we use the global freelist starting from + * the most full blocks, and reusing those. This works particularly well for + * use cases that allocate a lot of objects, use them for a while and then + * free most of them at once. + * + * + * About CLOBBER_FREED_MEMORY: + * + * If this symbol is defined, all freed memory is overwritten with 0x7F's. + * This is useful for catching places that reference already-freed memory. + * + * About MEMORY_CONTEXT_CHECKING: + * + * Since we usually round request sizes up to the next power of 2, there + * is often some unused space immediately after a requested data area. + * Thus, if someone makes the common error of writing past what they've + * requested, the problem is likely to go unnoticed ... until the day when + * there *isn't* any wasted space, perhaps because of different memory + * alignment on a new platform, or some other effect. To catch this sort + * of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond + * the requested space whenever the request is less than the actual chunk + * size, and verifies that the byte is undamaged when the chunk is freed. + * + * + * About USE_VALGRIND and Valgrind client requests: + * + * Valgrind provides "client request" macros that exchange information with + * the host Valgrind (if any). Under !USE_VALGRIND, memdebug.h stubs out + * currently-used macros. + * + * When running under Valgrind, we want a NOACCESS memory region both before + * and after the allocation. The chunk header is tempting as the preceding + * region, but mcxt.c expects to able to examine the standard chunk header + * fields. Therefore, we use, when available, the requested_size field and + * any subsequent padding. requested_size is made NOACCESS before returning + * a chunk pointer to a caller. However, to reduce client request traffic, + * it is kept DEFINED in chunks on the free list. + * + * The rounded-up capacity of the chunk usually acts as a post-allocation + * NOACCESS region. If the request consumes precisely the entire chunk, + * there is no such region; another chunk header may immediately follow. In + * that case, Valgrind will not detect access beyond the end of the chunk. + * + * See also the cooperating Valgrind client requests in mcxt.c. + * + *------------------------------------------------------------------------- + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "utils/memdebug.h" +#include "utils/memutils.h" + + +#define SLAB_BLOCKHDRSZ MAXALIGN(sizeof(SlabBlockData)) +#define SLAB_CHUNKHDRSZ MAXALIGN(sizeof(SlabChunkData)) + +/* Portion of SLAB_CHUNKHDRSZ examined outside slab.c. */ +#define SLAB_CHUNK_PUBLIC \ + (offsetof(SlabChunkData, size) + sizeof(Size)) + +/* Portion of SLAB_CHUNKHDRSZ excluding trailing padding. */ +#ifdef MEMORY_CONTEXT_CHECKING +#define SLAB_CHUNK_USED \ + (offsetof(SlabChunkData, requested_size) + sizeof(Size)) +#else +#define SLAB_CHUNK_USED \ + (offsetof(SlabChunkData, size) + sizeof(Size)) +#endif + +typedef struct SlabBlockData *SlabBlock; /* forward reference */ +typedef struct SlabChunkData *SlabChunk; + +/* + * SlabPointer + * Aligned pointer which may be a member of an allocation set. + */ +typedef void *SlabPointer; + +/* + * SlabContext is our standard implementation of MemoryContext. + */ +typedef struct SlabContext +{ + MemoryContextData header; /* Standard memory-context fields */ + /* Allocation parameters for this context: */ + Size chunkSize; /* chunk size */ + Size fullChunkSize; /* chunk size including header and alignment */ + Size blockSize; /* block size */ + int chunksPerBlock; /* number of chunks per block */ + int minFreeCount; /* min number of free chunks in any block */ + int nblocks; /* number of blocks allocated */ + /* Info about storage allocated in this context: */ + SlabBlock freelist[1]; /* free lists (block-level) */ +} SlabContext; + +typedef SlabContext *Slab; + +typedef struct SlabBlockData +{ + Slab slab; /* slab that owns this block */ + SlabBlock prev; /* previous block in slab's block list */ + SlabBlock next; /* next block in slab's blocks list */ + int nfree; /* number of free chunks */ + int firstFreeChunk; /* index of the first free chunk in the block */ + char *bitmapptr; /* pointer to free bitmap */ +} SlabBlockData; + +/* + * SlabChunk + * The prefix of each piece of memory in an SlabBlock + * + * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h. + */ +typedef struct SlabChunkData +{ + /* block owning this chunk */ + void *block; + /* aset is the owning aset if allocated, or the freelist link if free */ + void *aset; + /* size is always the size of the usable space in the chunk */ + Size size; +#ifdef MEMORY_CONTEXT_CHECKING + /* when debugging memory usage, also store actual requested size */ + /* this is zero in a free chunk */ + Size requested_size; +#endif +} SlabChunkData; + +/* + * SlabPointerIsValid + * True iff pointer is valid allocation pointer. + */ +#define SlabPointerIsValid(pointer) PointerIsValid(pointer) + +/* + * SlabIsValid + * True iff set is valid allocation set. + */ +#define SlabIsValid(set) PointerIsValid(set) + +#define SlabPointerGetChunk(ptr) \ + ((SlabChunk)(((char *)(ptr)) - SLAB_CHUNKHDRSZ)) +#define SlabChunkGetPointer(chk) \ + ((SlabPointer)(((char *)(chk)) + SLAB_CHUNKHDRSZ)) + +/* + * These functions implement the MemoryContext API for Slab contexts. + */ +static void *SlabAlloc(MemoryContext context, Size size); +static void SlabFree(MemoryContext context, void *pointer); +static void *SlabRealloc(MemoryContext context, void *pointer, Size size); +static void SlabInit(MemoryContext context); +static void SlabReset(MemoryContext context); +static void SlabDelete(MemoryContext context); +static Size SlabGetChunkSpace(MemoryContext context, void *pointer); +static bool SlabIsEmpty(MemoryContext context); +static void SlabStats(MemoryContext context, int level, bool print, + MemoryContextCounters *totals); + +#ifdef MEMORY_CONTEXT_CHECKING +static void SlabCheck(MemoryContext context); +#endif + +/* + * This is the virtual function table for Slab contexts. + */ +static MemoryContextMethods SlabMethods = { + SlabAlloc, + SlabFree, + SlabRealloc, + SlabInit, + SlabReset, + SlabDelete, + SlabGetChunkSpace, + SlabIsEmpty, + SlabStats +#ifdef MEMORY_CONTEXT_CHECKING + ,SlabCheck +#endif +}; + +/* ---------- + * Debug macros + * ---------- + */ +#ifdef HAVE_ALLOCINFO +#define SlabFreeInfo(_cxt, _chunk) \ + fprintf(stderr, "AllocFree: %s: %p, %d\n", \ + (_cxt)->header.name, (_chunk), (_chunk)->size) +#define SlabAllocInfo(_cxt, _chunk) \ + fprintf(stderr, "AllocAlloc: %s: %p, %d\n", \ + (_cxt)->header.name, (_chunk), (_chunk)->size) +#else +#define SlabFreeInfo(_cxt, _chunk) +#define SlabAllocInfo(_cxt, _chunk) +#endif + + +#ifdef CLOBBER_FREED_MEMORY + +/* Wipe freed memory for debugging purposes */ +static void +wipe_mem(void *ptr, size_t size) +{ + VALGRIND_MAKE_MEM_UNDEFINED(ptr, size); + memset(ptr, 0x7F, size); + VALGRIND_MAKE_MEM_NOACCESS(ptr, size); +} +#endif + +#ifdef MEMORY_CONTEXT_CHECKING +static void +set_sentinel(void *base, Size offset) +{ + char *ptr = (char *) base + offset; + + VALGRIND_MAKE_MEM_UNDEFINED(ptr, 1); + *ptr = 0x7E; + VALGRIND_MAKE_MEM_NOACCESS(ptr, 1); +} + +static bool +sentinel_ok(const void *base, Size offset) +{ + const char *ptr = (const char *) base + offset; + bool ret; + + VALGRIND_MAKE_MEM_DEFINED(ptr, 1); + ret = *ptr == 0x7E; + VALGRIND_MAKE_MEM_NOACCESS(ptr, 1); + + return ret; +} +#endif + +#ifdef RANDOMIZE_ALLOCATED_MEMORY + +/* + * Fill a just-allocated piece of memory with "random" data. It's not really + * very random, just a repeating sequence with a length that's prime. What + * we mainly want out of it is to have a good probability that two palloc's + * of the same number of bytes start out containing different data. + * + * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as + * we fill it. Filling the region makes it DEFINED, so make it UNDEFINED + * again afterward. Whether to finally make it UNDEFINED or NOACCESS is + * fairly arbitrary. UNDEFINED is more convenient for SlabRealloc(), and + * other callers have no preference. + */ +static void +randomize_mem(char *ptr, size_t size) +{ + static int save_ctr = 1; + size_t remaining = size; + int ctr; + + ctr = save_ctr; + VALGRIND_MAKE_MEM_UNDEFINED(ptr, size); + while (remaining-- > 0) + { + *ptr++ = ctr; + if (++ctr > 251) + ctr = 1; + } + VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size); + save_ctr = ctr; +} +#endif /* RANDOMIZE_ALLOCATED_MEMORY */ + + +/* + * Public routines + */ + + +/* + * SlabContextCreate + * Create a new Slab context. + * + * parent: parent context, or NULL if top-level context + * name: name of context (for debugging --- string will be copied) + * minContextSize: minimum context size + * initBlockSize: initial allocation block size + * maxBlockSize: maximum allocation block size + */ +MemoryContext +SlabContextCreate(MemoryContext parent, + const char *name, + Size blockSize, + Size chunkSize) +{ + int i, chunksPerBlock; + Size fullChunkSize; + Slab set; + + /* chunk, including SLAB header (both addresses nicely aligned) */ + fullChunkSize = MAXALIGN(sizeof(SlabChunkData) + MAXALIGN(chunkSize)); + + /* so how many chunks can we fit into a block, including header and bitmap */ + chunksPerBlock = 0; + for (i = 0; i <= blockSize / chunkSize; i++) + { + Size bitmapsize = ((i + 7) / 8); + + /* repeat until we hit the block size */ + if (((i * fullChunkSize) + sizeof(SlabBlockData) + bitmapsize) > blockSize) + break; + + chunksPerBlock = i; + } + + /* if we can't fit at least one chunk into the block, we're hosed */ + Assert(chunksPerBlock > 0); + + /* Do the type-independent part of context creation */ + set = (Slab) MemoryContextCreate(T_SlabContext, + /* allocate context and freelist at once */ + (offsetof(SlabContext, freelist) + sizeof(SlabChunk) * (chunksPerBlock + 1)), + &SlabMethods, + parent, + name); + + set->blockSize = blockSize; + set->chunkSize = chunkSize; + set->fullChunkSize = fullChunkSize; + set->chunksPerBlock = chunksPerBlock; + set->nblocks = 0; + set->minFreeCount = 0; + + return (MemoryContext) set; +} + +/* + * SlabInit + * Context-type-specific initialization routine. SlabContext does not + * need anything extra, at this moment. + */ +static void +SlabInit(MemoryContext context) +{ + /* + * Since MemoryContextCreate already zeroed the context node, we don't + * have to do anything here: it's already OK. + */ +} + +/* + * SlabReset + * Frees all memory which is allocated in the given set. + * + * The code simply frees all the blocks in the context - we don't keep any + * keeper blocks or anything like that. + */ +static void +SlabReset(MemoryContext context) +{ + int i; + Slab set = (Slab) context; + + AssertArg(SlabIsValid(set)); + +#ifdef MEMORY_CONTEXT_CHECKING + /* Check for corruption and leaks before freeing */ + SlabCheck(context); +#endif + + + /* walk over freelists and free the blocks */ + for (i = 0; i <= set->chunksPerBlock; i++) + { + SlabBlock block = set->freelist[i]; + set->freelist[i] = NULL; + + while (block != NULL) + { + SlabBlock next = block->next; + + /* Normal case, release the block */ +#ifdef CLOBBER_FREED_MEMORY + wipe_mem(block, set->blockSize); +#endif + free(block); + set->nblocks--; + + block = next; + } + } + + set->minFreeCount = 0; + + Assert(set->nblocks == 0); +} + +/* + * SlabDelete + * Frees all memory which is allocated in the given set, in preparation + * for deletion of the set. We simply call SlabReset(). + */ +static void +SlabDelete(MemoryContext context) +{ + /* just reset the context */ + SlabReset(context); +} + +/* operations on the freelist - adding/removing/moving blocks */ +static void +remove_from_freelist(Slab set, SlabBlock block, int nfree_old) +{ + /* either it has a previous block, or it's the first block in list */ + if (block->prev) + block->prev->next = block->next; + else + set->freelist[nfree_old] = block->next; + + /* if it has a next block, update it too */ + if (block->next) + block->next->prev = block->prev; + + block->prev = NULL; + block->next = NULL; +} + +static void +add_to_freelist(Slab set, SlabBlock block) +{ + /* otherwise add it to the proper freelist bin */ + if (set->freelist[block->nfree]) + set->freelist[block->nfree]->prev = block; + + block->next = set->freelist[block->nfree]; + set->freelist[block->nfree] = block; +} + + +static void +move_in_freelist(Slab set, SlabBlock block, int nfree_old) +{ + remove_from_freelist(set, block, nfree_old); + add_to_freelist(set, block); +} + +/* + * SlabAlloc + * Returns pointer to allocated memory of given size or NULL if + * request could not be completed; memory is added to the set. + * + * No request may exceed: + * MAXALIGN_DOWN(SIZE_MAX) - SLAB_BLOCKHDRSZ - SLAB_CHUNKHDRSZ + * All callers use a much-lower limit. + */ +static void * +SlabAlloc(MemoryContext context, Size size) +{ + Slab set = (Slab) context; + SlabBlock block; + SlabChunk chunk; + int idx; + + AssertArg(SlabIsValid(set)); + + Assert(size == set->chunkSize); + Assert((set->minFreeCount >= 0) && (set->minFreeCount < set->chunksPerBlock)); + + /* + * If there are no free chunks in any existing block, create a new block + * and put it to the last freelist bucket. + */ + if (set->minFreeCount == 0) + { + block = (SlabBlock)malloc(set->blockSize); + + if (block == NULL) + return NULL; + + memset(block, 0, set->blockSize); + + block->slab = set; + block->nfree = set->chunksPerBlock; + block->prev = NULL; + block->next = NULL; + block->firstFreeChunk = 0; + + /* the free bitmap is placed at the end */ + block->bitmapptr + = ((char *) block) + set->blockSize - ((set->chunksPerBlock + 7) / 8); + + /* + * And add it to the last freelist with all chunks empty (we know + * there are no blocks in the freelist, otherwise we wouldn't need + * a new block. + */ + set->freelist[set->chunksPerBlock] = block; + set->minFreeCount = set->chunksPerBlock; + set->nblocks += 1; + } + + /* grab the block from the freelist (even the new block is there) */ + block = set->freelist[set->minFreeCount]; + + /* make sure we actually got a valid block, with matching nfree */ + Assert(block != NULL); + Assert(set->minFreeCount == block->nfree); + Assert(block->nfree > 0); + + Assert((char*)block < block->bitmapptr); + Assert((char*)block + set->blockSize > block->bitmapptr); + + /* we know the first free chunk */ + idx = block->firstFreeChunk; + + /* make sure the chunk index is valid, and that it's marked as empty */ + Assert((idx >= 0) && (idx < set->chunksPerBlock)); + Assert(!((block->bitmapptr[idx/8] & (0x01 << (idx % 8))))); + + /* mark the chunk as used (set 1 to the bit) */ + block->bitmapptr[idx/8] |= (0x01 << (idx % 8)); + + /* compute the chunk location block start (after the block header) */ + chunk = (SlabChunk) ((char*)block + sizeof(SlabBlockData) + + (idx * set->fullChunkSize)); + + /* + * update the block nfree count, and also the minFreeCount as we've + * decreased nfree for a block with the minimum count + */ + block->nfree--; + set->minFreeCount = block->nfree; + + /* but we need to find the next one, for the next alloc call (unless the + * block just got full, in that case simply set it to -1 */ + if (block->nfree == 0) + block->firstFreeChunk = set->chunksPerBlock; + else + { + /* look for the next free chunk in the block, after the first one */ + while ((++block->firstFreeChunk) < set->chunksPerBlock) + { + int byte = block->firstFreeChunk / 8; + int bit = block->firstFreeChunk % 8; + + /* stop when you find 0 (unused chunk) */ + if (! (block->bitmapptr[byte] & (0x01 << bit))) + break; + } + + /* must have found the free chunk */ + Assert(block->firstFreeChunk != set->chunksPerBlock); + } + + /* move the block to the right place in the freelist */ + move_in_freelist(set, block, (block->nfree + 1)); + + /* but if the minimum is 0, we need to look for a new one */ + if (set->minFreeCount == 0) + for (idx = 1; idx <= set->chunksPerBlock; idx++) + if (set->freelist[idx]) + { + set->minFreeCount = idx; + break; + } + + if (set->minFreeCount == set->chunksPerBlock) + set->minFreeCount = 0; + + /* Prepare to initialize the chunk header. */ + VALGRIND_MAKE_MEM_UNDEFINED(chunk, SLAB_CHUNK_USED); + + chunk->aset = (void *) set; + chunk->block = (void *) block; + chunk->size = MAXALIGN(size); + +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = size; + VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size, + sizeof(chunk->requested_size)); + /* set mark to catch clobber of "unused" space */ + if (size < chunk->size) + set_sentinel(SlabChunkGetPointer(chunk), size); +#endif +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* fill the allocated space with junk */ + randomize_mem((char *) SlabChunkGetPointer(chunk), size); +#endif + + SlabAllocInfo(set, chunk); + return SlabChunkGetPointer(chunk); +} + +/* + * SlabFree + * Frees allocated memory; memory is removed from the set. + */ +static void +SlabFree(MemoryContext context, void *pointer) +{ + int idx; + Slab set = (Slab) context; + SlabChunk chunk = SlabPointerGetChunk(pointer); + SlabBlock block = chunk->block; + + SlabFreeInfo(set, chunk); + +#ifdef MEMORY_CONTEXT_CHECKING + VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size, + sizeof(chunk->requested_size)); + /* Test for someone scribbling on unused space in chunk */ + if (chunk->requested_size < chunk->size) + if (!sentinel_ok(pointer, chunk->requested_size)) + elog(WARNING, "detected write past chunk end in %s %p", + set->header.name, chunk); +#endif + + /* compute index wrt to block start */ + idx = ((char*)chunk - ((char*)block + sizeof(SlabBlockData))) / set->fullChunkSize; + + Assert((block->bitmapptr[idx/8] & (0x01 << (idx % 8)))); + + /* mark the chunk as unused (set 0 to the bit), and update block nfree count */ + block->bitmapptr[idx/8] ^= (0x01 << (idx % 8)); + block->nfree++; + block->firstFreeChunk = Min(block->firstFreeChunk, idx); + + Assert(block->nfree > 0); + Assert(block->nfree <= set->chunksPerBlock); + +#ifdef CLOBBER_FREED_MEMORY + wipe_mem(pointer, chunk->size); +#endif + +#ifdef MEMORY_CONTEXT_CHECKING + /* Reset requested_size to 0 in chunks that are on freelist */ + chunk->requested_size = 0; +#endif + + /* now decide what to do with the block */ + + /* + * See if we need to update the minFreeCount field for the set - we only + * need to do that if the block had that number of free chunks before we + * freed one. In that case, we check if there still are blocks with that + * number of free chunks - we can simply check if the chunk has siblings. + * Otherwise we simply increment the value by one, as the new block is + * still the one with minimum free chunks (even without the one chunk). + */ + if (set->minFreeCount == (block->nfree-1)) + if ((block->prev == NULL) && (block->next == NULL)) /* no other blocks */ + { + /* but if we made the block entirely free, we'll free it */ + if (block->nfree == set->chunksPerBlock) + set->minFreeCount = 0; + else + set->minFreeCount++; + } + + /* remove the block from a freelist */ + remove_from_freelist(set, block, block->nfree-1); + + /* If the block is now completely empty, free it. */ + if (block->nfree == set->chunksPerBlock) + { + free(block); + set->nblocks--; + } + else + add_to_freelist(set, block); + + Assert(set->nblocks >= 0); +} + +/* + * SlabRealloc + * As Slab is designed for allocating equally-sized chunks of memory, it + * can't really do an actual realloc. However we try to be gentle and + * allow calls with exactly the same size as in that case we can simply + * return the same chunk. When the size differs, we fail with assert + * failure or return NULL. + */ +static void * +SlabRealloc(MemoryContext context, void *pointer, Size size) +{ + Slab set = (Slab)context; + + /* can't do actual realloc with slab, but let's try to be gentle */ + if (size == set->chunkSize) + return pointer; + + /* we can't really do repalloc with this allocator */ + Assert(false); + + return NULL; +} + +/* + * SlabGetChunkSpace + * Given a currently-allocated chunk, determine the total space + * it occupies (including all memory-allocation overhead). + */ +static Size +SlabGetChunkSpace(MemoryContext context, void *pointer) +{ + SlabChunk chunk = SlabPointerGetChunk(pointer); + + return chunk->size + SLAB_CHUNKHDRSZ; +} + +/* + * SlabIsEmpty + * Is an Slab empty of any allocated space? + */ +static bool +SlabIsEmpty(MemoryContext context) +{ + Slab set = (Slab)context; + return (set->nblocks == 0); +} + +/* + * SlabStats + * Compute stats about memory consumption of an Slab. + * + * level: recursion level (0 at top level); used for print indentation. + * print: true to print stats to stderr. + * totals: if not NULL, add stats about this Slab into *totals. + */ +static void +SlabStats(MemoryContext context, int level, bool print, + MemoryContextCounters *totals) +{ + Slab set = (Slab) context; + Size nblocks = 0; + Size freechunks = 0; + Size totalspace = 0; + Size freespace = 0; + int i; + + for (i = 0; i <= set->chunksPerBlock; i++) + { + SlabBlock block = set->freelist[i]; + while (block != NULL) + { + nblocks++; + totalspace += set->blockSize; + freespace += set->fullChunkSize * block->nfree; + freechunks += block->nfree; + block = block->next; + } + } + + if (print) + { + int i; + + for (i = 0; i < level; i++) + fprintf(stderr, " "); + fprintf(stderr, + "%s: %zu total in %zd blocks; %zu free (%zd chunks); %zu used\n", + set->header.name, totalspace, nblocks, freespace, freechunks, + totalspace - freespace); + } + + if (totals) + { + totals->nblocks += nblocks; + totals->freechunks += freechunks; + totals->totalspace += totalspace; + totals->freespace += freespace; + } +} + + +#ifdef MEMORY_CONTEXT_CHECKING + +/* + * SlabCheck + * Walk through chunks and check consistency of memory. + * + * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll + * find yourself in an infinite loop when trouble occurs, because this + * routine will be entered again when elog cleanup tries to release memory! + */ +static void +SlabCheck(MemoryContext context) +{ + /* FIXME */ +} + +#endif /* MEMORY_CONTEXT_CHECKING */ diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h index ba069cc..92a7478 100644 --- a/src/include/nodes/memnodes.h +++ b/src/include/nodes/memnodes.h @@ -96,6 +96,6 @@ typedef struct MemoryContextData */ #define MemoryContextIsValid(context) \ ((context) != NULL && \ - (IsA((context), AllocSetContext))) + (IsA((context), AllocSetContext) || IsA((context), SlabContext))) #endif /* MEMNODES_H */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 6b850e4..62005bb 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -274,6 +274,7 @@ typedef enum NodeTag */ T_MemoryContext = 600, T_AllocSetContext, + T_SlabContext, /* * TAGS FOR VALUE NODES (value.h) diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h index 9e209ae..4959c74 100644 --- a/src/include/replication/reorderbuffer.h +++ b/src/include/replication/reorderbuffer.h @@ -331,6 +331,22 @@ struct ReorderBuffer MemoryContext context; /* + * slab contexts for change and TXN objects. + */ + MemoryContext change_context; + MemoryContext txn_context; + MemoryContext tup_context; + MemoryContext tup_context2; + + /* counters for current generation of tuples */ + int tuples_count; + Size tuples_size; + Size current_size; + + /* history of tuple slab contexts (generational design) */ + List *tup_context_history; + + /* * Data structure slab cache. * * We allocate/deallocate some structures very frequently, to avoid bigger diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h index ae07705..3209f71 100644 --- a/src/include/utils/memutils.h +++ b/src/include/utils/memutils.h @@ -135,6 +135,12 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent, Size initBlockSize, Size maxBlockSize); +/* slab.c */ +extern MemoryContext SlabContextCreate(MemoryContext parent, + const char *name, + Size blockSize, + Size chunkSize); + /* * Recommended default alloc parameters, suitable for "ordinary" contexts * that might hold quite a lot of data. @@ -159,4 +165,7 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent, */ #define ALLOCSET_SEPARATE_THRESHOLD 8192 +#define SLAB_DEFAULT_BLOCK_SIZE 8192 +#define SLAB_LARGE_BLOCK_SIZE (8 * 1024 * 1024) + #endif /* MEMUTILS_H */