From 7a1521dcafbc42b2482d16e8dd0781dfbd5ef2b4 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 18 Oct 2022 09:47:45 -0700
Subject: [PATCH 2/3] XXX palloc_io_aligned() -- not for review here

This patch will be posted for review by David Rowley in its own thread,
but a copy is included here as a dependency.
---
 contrib/bloom/blinsert.c                   |  2 +-
 src/backend/access/gist/gistbuild.c        |  8 +-
 src/backend/access/gist/gistbuildbuffers.c |  5 +-
 src/backend/access/heap/rewriteheap.c      |  2 +-
 src/backend/access/nbtree/nbtree.c         |  2 +-
 src/backend/access/nbtree/nbtsort.c        |  8 +-
 src/backend/access/spgist/spginsert.c      |  2 +-
 src/backend/nodes/gen_node_support.pl      |  2 +-
 src/backend/storage/buffer/buf_init.c      |  7 +-
 src/backend/storage/buffer/localbuf.c      |  4 +-
 src/backend/storage/page/bufpage.c         |  2 +-
 src/backend/storage/smgr/md.c              | 14 ++-
 src/backend/utils/mmgr/mcxt.c              | 99 ++++++++++++++++++++--
 src/include/nodes/memnodes.h               |  5 +-
 src/include/utils/memutils_internal.h      |  4 +-
 src/include/utils/palloc.h                 |  5 ++
 16 files changed, 141 insertions(+), 30 deletions(-)

diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c
index dd26d6ac29..b0da3ac529 100644
--- a/contrib/bloom/blinsert.c
+++ b/contrib/bloom/blinsert.c
@@ -166,7 +166,7 @@ blbuildempty(Relation index)
 	Page		metapage;
 
 	/* Construct metapage. */
-	metapage = (Page) palloc(BLCKSZ);
+	metapage = (Page) palloc_io_aligned(BLCKSZ, 0);
 	BloomFillMetapage(index, metapage);
 
 	/*
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index fb0f466708..2daa9b2e10 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -415,7 +415,7 @@ gist_indexsortbuild(GISTBuildState *state)
 	 * Write an empty page as a placeholder for the root page. It will be
 	 * replaced with the real root page at the end.
 	 */
-	page = palloc0(BLCKSZ);
+	page = palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO);
 	smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
 			   page, true);
 	state->pages_allocated++;
@@ -509,7 +509,7 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state,
 			levelstate->current_page++;
 
 		if (levelstate->pages[levelstate->current_page] == NULL)
-			levelstate->pages[levelstate->current_page] = palloc(BLCKSZ);
+			levelstate->pages[levelstate->current_page] = palloc_io_aligned(BLCKSZ, 0);
 
 		newPage = levelstate->pages[levelstate->current_page];
 		gistinitpage(newPage, old_page_flags);
@@ -579,7 +579,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
 
 		/* Create page and copy data */
 		data = (char *) (dist->list);
-		target = palloc0(BLCKSZ);
+		target = (Page) palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO);
 		gistinitpage(target, isleaf ? F_LEAF : 0);
 		for (int i = 0; i < dist->block.num; i++)
 		{
@@ -630,7 +630,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
 		if (parent == NULL)
 		{
 			parent = palloc0(sizeof(GistSortedBuildLevelState));
-			parent->pages[0] = (Page) palloc(BLCKSZ);
+			parent->pages[0] = (Page) palloc_io_aligned(BLCKSZ, 0);
 			parent->parent = NULL;
 			gistinitpage(parent->pages[0], 0);
 
diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c
index 538e3880c9..9e188633ae 100644
--- a/src/backend/access/gist/gistbuildbuffers.c
+++ b/src/backend/access/gist/gistbuildbuffers.c
@@ -186,8 +186,9 @@ gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb)
 {
 	GISTNodeBufferPage *pageBuffer;
 
-	pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context,
-															   BLCKSZ);
+	pageBuffer = (GISTNodeBufferPage *)
+		MemoryContextAllocIOAligned(gfbb->context,
+									BLCKSZ, MCXT_ALLOC_ZERO);
 	pageBuffer->prev = InvalidBlockNumber;
 
 	/* Set page free space */
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index b01b39b008..6fe7f1aed4 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -257,7 +257,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 
 	state->rs_old_rel = old_heap;
 	state->rs_new_rel = new_heap;
-	state->rs_buffer = (Page) palloc(BLCKSZ);
+	state->rs_buffer = (Page) palloc_io_aligned(BLCKSZ, 0);
 	/* new_heap needn't be empty, just locked */
 	state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
 	state->rs_buffer_valid = false;
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b52eca8f38..924da953aa 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -153,7 +153,7 @@ btbuildempty(Relation index)
 	Page		metapage;
 
 	/* Construct metapage. */
-	metapage = (Page) palloc(BLCKSZ);
+	metapage = (Page) palloc_io_aligned(BLCKSZ, 0);
 	_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
 
 	/*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 501e011ce1..563e6cce1f 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -619,7 +619,7 @@ _bt_blnewpage(uint32 level)
 	Page		page;
 	BTPageOpaque opaque;
 
-	page = (Page) palloc(BLCKSZ);
+	page = (Page) palloc_io_aligned(BLCKSZ, 0);
 
 	/* Zero the page and set up standard page header info */
 	_bt_pageinit(page, BLCKSZ);
@@ -660,7 +660,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 	while (blkno > wstate->btws_pages_written)
 	{
 		if (!wstate->btws_zeropage)
-			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
+			wstate->btws_zeropage =
+				(Page) palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO);
+
 		/* don't set checksum for all-zero page */
 		smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
 				   wstate->btws_pages_written++,
@@ -1170,7 +1172,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 	 * set to point to "P_NONE").  This changes the index to the "valid" state
 	 * by filling in a valid magic number in the metapage.
 	 */
-	metapage = (Page) palloc(BLCKSZ);
+	metapage = (Page) palloc_io_aligned(BLCKSZ, 0);
 	_bt_initmetapage(metapage, rootblkno, rootlevel,
 					 wstate->inskey->allequalimage);
 	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index c6821b5952..d5b83710e4 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -158,7 +158,7 @@ spgbuildempty(Relation index)
 	Page		page;
 
 	/* Construct metapage. */
-	page = (Page) palloc(BLCKSZ);
+	page = (Page) palloc_io_aligned(BLCKSZ, 0);
 	SpGistInitMetapage(page);
 
 	/*
diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl
index 81b8c184a9..9598056821 100644
--- a/src/backend/nodes/gen_node_support.pl
+++ b/src/backend/nodes/gen_node_support.pl
@@ -142,7 +142,7 @@ my @abstract_types = qw(Node);
 # they otherwise don't participate in node support.
 my @extra_tags = qw(
   IntList OidList XidList
-  AllocSetContext GenerationContext SlabContext
+  AllocSetContext GenerationContext SlabContext AlignedAllocRedirectContext
   TIDBitmap
   WindowObjectData
 );
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 6b6264854e..edd9bd48c3 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -79,8 +79,9 @@ InitBufferPool(void)
 						&foundDescs);
 
 	BufferBlocks = (char *)
-		ShmemInitStruct("Buffer Blocks",
-						NBuffers * (Size) BLCKSZ, &foundBufs);
+		TYPEALIGN(BLCKSZ,
+				  ShmemInitStruct("Buffer Blocks",
+								  (NBuffers + 1) * (Size) BLCKSZ, &foundBufs));
 
 	/* Align condition variables to cacheline boundary. */
 	BufferIOCVArray = (ConditionVariableMinimallyPadded *)
@@ -164,6 +165,8 @@ BufferShmemSize(void)
 	size = add_size(size, PG_CACHE_LINE_SIZE);
 
 	/* size of data pages */
+	/* plus one extra block so the buffer array can be aligned to BLCKSZ */
+	size = add_size(size, BLCKSZ);
 	size = add_size(size, mul_size(NBuffers, BLCKSZ));
 
 	/* size of stuff controlled by freelist.c */
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 30d67d1c40..f51d3527f6 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -546,8 +546,8 @@ GetLocalBufferStorage(void)
 		/* And don't overflow MaxAllocSize, either */
 		num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ);
 
-		cur_block = (char *) MemoryContextAlloc(LocalBufferContext,
-												num_bufs * BLCKSZ);
+		cur_block = (char *) MemoryContextAllocIOAligned(LocalBufferContext,
+														 num_bufs * BLCKSZ, 0);
 		next_buf_in_block = 0;
 		num_bufs_in_block = num_bufs;
 	}
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 8b617c7e79..42f6f1782a 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -1522,7 +1522,7 @@ PageSetChecksumCopy(Page page, BlockNumber blkno)
 	 * and second to avoid wasting space in processes that never call this.
 	 */
 	if (pageCopy == NULL)
-		pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ);
+		pageCopy = MemoryContextAllocIOAligned(TopMemoryContext, BLCKSZ, 0);
 
 	memcpy(pageCopy, (char *) page, BLCKSZ);
 	((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index a515bb36ac..719721a894 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -439,6 +439,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	int			nbytes;
 	MdfdVec    *v;
 
+#if PG_O_DIRECT != 0
+	AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE);
+#endif
+
 	/* This assert is too expensive to have on normally ... */
 #ifdef CHECK_WRITE_VS_EXTEND
 	Assert(blocknum >= mdnblocks(reln, forknum));
@@ -661,6 +665,10 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	int			nbytes;
 	MdfdVec    *v;
 
+#if PG_O_DIRECT != 0
+	AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE);
+#endif
+
 	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
 										reln->smgr_rlocator.locator.spcOid,
 										reln->smgr_rlocator.locator.dbOid,
@@ -726,6 +734,10 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	int			nbytes;
 	MdfdVec    *v;
 
+#if PG_O_DIRECT != 0
+	AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE);
+#endif
+
 	/* This assert is too expensive to have on normally ... */
 #ifdef CHECK_WRITE_VS_EXTEND
 	Assert(blocknum < mdnblocks(reln, forknum));
@@ -1280,7 +1292,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			 */
 			if (nblocks < ((BlockNumber) RELSEG_SIZE))
 			{
-				char	   *zerobuf = palloc0(BLCKSZ);
+				char	   *zerobuf = palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO);
 
 				mdextend(reln, forknum,
 						 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
index f526ca82c1..807c0f3af3 100644
--- a/src/backend/utils/mmgr/mcxt.c
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -36,6 +36,9 @@ static void BogusFree(void *pointer);
 static void *BogusRealloc(void *pointer, Size size);
 static MemoryContext BogusGetChunkContext(void *pointer);
 static Size BogusGetChunkSpace(void *pointer);
+static void AlignedAllocFree(void *pointer);
+static MemoryContext AlignedAllocGetChunkContext(void *pointer);
+
 
 /*****************************************************************************
  *	  GLOBAL MEMORY															 *
@@ -84,6 +87,10 @@ static const MemoryContextMethods mcxt_methods[] = {
 	[MCTX_SLAB_ID].check = SlabCheck,
 #endif
 
+	/* aligned allocation redirection, implemented in this file */
+	[MCTX_ALIGNED_REDIRECT_ID].get_chunk_context = AlignedAllocGetChunkContext,
+	[MCTX_ALIGNED_REDIRECT_ID].free_p = AlignedAllocFree,
+
 	/*
 	 * Unused (as yet) IDs should have dummy entries here.  This allows us to
 	 * fail cleanly if a bogus pointer is passed to pfree or the like.  It
@@ -110,11 +117,6 @@ static const MemoryContextMethods mcxt_methods[] = {
 	[MCTX_UNUSED4_ID].realloc = BogusRealloc,
 	[MCTX_UNUSED4_ID].get_chunk_context = BogusGetChunkContext,
 	[MCTX_UNUSED4_ID].get_chunk_space = BogusGetChunkSpace,
-
-	[MCTX_UNUSED5_ID].free_p = BogusFree,
-	[MCTX_UNUSED5_ID].realloc = BogusRealloc,
-	[MCTX_UNUSED5_ID].get_chunk_context = BogusGetChunkContext,
-	[MCTX_UNUSED5_ID].get_chunk_space = BogusGetChunkSpace,
 };
 
 /*
@@ -1306,11 +1308,16 @@ void
 pfree(void *pointer)
 {
 #ifdef USE_VALGRIND
+	MemoryContextMethodID method = GetMemoryChunkMethodID(pointer);
 	MemoryContext context = GetMemoryChunkContext(pointer);
 #endif
 
 	MCXT_METHOD(pointer, free_p) (pointer);
-	VALGRIND_MEMPOOL_FREE(context, pointer);
+
+#ifdef USE_VALGRIND
+	if (method != MCTX_ALIGNED_REDIRECT_ID)
+		VALGRIND_MEMPOOL_FREE(context, pointer);
+#endif
 }
 
 /*
@@ -1497,3 +1504,142 @@ pchomp(const char *in)
 		n--;
 	return pnstrdup(in, n);
 }
+
+/*
+ * pointer to fake memory context + pointer to actual allocation
+ *
+ * NOTE(review): nothing below references this macro; the redirection header
+ * actually used is a MemoryChunk.  Confirm whether it is still needed.
+ */
+#define ALIGNED_ALLOC_CHUNK_SIZE (sizeof(uintptr_t) + sizeof(uintptr_t))
+
+#include "utils/memutils_memorychunk.h"
+
+/*
+ * AlignedAllocFree
+ *		free_p method for MCTX_ALIGNED_REDIRECT_ID chunks.
+ *
+ * 'pointer' is an address handed out by MemoryContextAllocAligned().  The
+ * header in front of it stores the start of the underlying unaligned
+ * allocation, which is what actually gets pfree'd.
+ */
+static void
+AlignedAllocFree(void *pointer)
+{
+	MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+	void	   *unaligned;
+
+	Assert(!MemoryChunkIsExternal(chunk));
+
+	unaligned = MemoryChunkGetBlock(chunk);
+
+	pfree(unaligned);
+}
+
+/*
+ * AlignedAllocGetChunkContext
+ *		get_chunk_context method for MCTX_ALIGNED_REDIRECT_ID chunks.
+ *
+ * Returns the memory context owning the underlying unaligned allocation.
+ */
+static MemoryContext
+AlignedAllocGetChunkContext(void *pointer)
+{
+	MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+
+	Assert(!MemoryChunkIsExternal(chunk));
+
+	return GetMemoryChunkContext(MemoryChunkGetBlock(chunk));
+}
+
+/*
+ * MemoryContextAllocAligned
+ *		Allocate 'size' bytes in 'context', returning a pointer aligned to
+ *		'alignto' bytes.
+ *
+ * 'alignto' must be a power of two.  'flags' is handed through to
+ * MemoryContextAllocExtended().  Returns NULL if the underlying allocation
+ * returns NULL (presumably only possible with MCXT_ALLOC_NO_OOM).
+ *
+ * The request is over-allocated by 'alignto' plus one MemoryChunk header.
+ * That header is written directly in front of the aligned address and
+ * records the start of the real allocation, so that pfree() of the aligned
+ * pointer is redirected via MCTX_ALIGNED_REDIRECT_ID.  Only free_p and
+ * get_chunk_context are provided for that method ID in mcxt_methods.
+ */
+void *
+MemoryContextAllocAligned(MemoryContext context,
+						  Size size, Size alignto, int flags)
+{
+	Size		alloc_size;
+	void	   *unaligned;
+	void	   *aligned;
+	MemoryChunk *alignedchunk;
+
+	/* wouldn't make much sense to waste that much space */
+	Assert(alignto < (128 * 1024 * 1024));
+
+	/* TYPEALIGN() requires a power-of-two alignment */
+	Assert((alignto & (alignto - 1)) == 0);
+
+	/*
+	 * Plain allocations are presumably MAXALIGN'd already.  Allocate in the
+	 * caller-supplied context rather than CurrentMemoryContext.
+	 */
+	if (alignto <= MAXIMUM_ALIGNOF)
+		return MemoryContextAllocExtended(context, size, flags);
+
+	/* allocate enough space for alignment padding */
+	alloc_size = size + alignto + sizeof(MemoryChunk);
+
+	unaligned = MemoryContextAllocExtended(context, alloc_size, flags);
+
+	/* don't do pointer arithmetic on a failed allocation */
+	if (unaligned == NULL)
+		return NULL;
+
+	/* first aligned address that leaves room for a header in front of it */
+	aligned = (void *) TYPEALIGN(alignto,
+								 (char *) unaligned + sizeof(MemoryChunk));
+	alignedchunk = (MemoryChunk *) ((char *) aligned - sizeof(MemoryChunk));
+
+	MemoryChunkSetHdrMask(alignedchunk, unaligned, 0, MCTX_ALIGNED_REDIRECT_ID);
+
+	/* XXX: should we adjust valgrind state here? */
+
+	Assert((void *) TYPEALIGN(alignto, aligned) == aligned);
+	Assert(MemoryChunkGetPointer(alignedchunk) == aligned);
+
+	return aligned;
+}
+
+/*
+ * MemoryContextAllocIOAligned
+ *		MemoryContextAllocAligned() with the alignment used for I/O buffers.
+ */
+void *
+MemoryContextAllocIOAligned(MemoryContext context, Size size, int flags)
+{
+	/* FIXME: don't hardcode page size */
+	return MemoryContextAllocAligned(context, size, 4096, flags);
+}
+
+/*
+ * palloc_aligned
+ *		MemoryContextAllocAligned() in CurrentMemoryContext.
+ */
+void *
+palloc_aligned(Size size, Size alignto, int flags)
+{
+	return MemoryContextAllocAligned(CurrentMemoryContext, size, alignto, flags);
+}
+
+/*
+ * palloc_io_aligned
+ *		MemoryContextAllocIOAligned() in CurrentMemoryContext.
+ */
+void *
+palloc_io_aligned(Size size, int flags)
+{
+	return MemoryContextAllocIOAligned(CurrentMemoryContext, size, flags);
+}
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index 63d07358cd..dcfe41806a 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -104,10 +104,11 @@ typedef struct MemoryContextData
  *
  * Add new context types to the set accepted by this macro.
  */
-#define MemoryContextIsValid(context) \
+#define MemoryContextIsValid(context)                                         \
 	((context) != NULL && \
 	 (IsA((context), AllocSetContext) || \
 	  IsA((context), SlabContext) || \
-	  IsA((context), GenerationContext)))
+	  IsA((context), GenerationContext) || \
+	  IsA((context), AlignedAllocRedirectContext)))
 
 #endif							/* MEMNODES_H */
diff --git a/src/include/utils/memutils_internal.h b/src/include/utils/memutils_internal.h
index bc2cbdd506..9611a192a2 100644
--- a/src/include/utils/memutils_internal.h
+++ b/src/include/utils/memutils_internal.h
@@ -92,8 +92,8 @@ typedef enum MemoryContextMethodID
 	MCTX_ASET_ID,
 	MCTX_GENERATION_ID,
 	MCTX_SLAB_ID,
-	MCTX_UNUSED4_ID,			/* available */
-	MCTX_UNUSED5_ID				/* 111 occurs in wipe_mem'd memory */
+	MCTX_ALIGNED_REDIRECT_ID,
+	MCTX_UNUSED4_ID				/* 111 occurs in wipe_mem'd memory */
 } MemoryContextMethodID;
 
 /*
diff --git a/src/include/utils/palloc.h b/src/include/utils/palloc.h
index 8eee0e2938..0b0ba2a953 100644
--- a/src/include/utils/palloc.h
+++ b/src/include/utils/palloc.h
@@ -73,10 +73,15 @@ extern void *MemoryContextAllocZero(MemoryContext context, Size size);
 extern void *MemoryContextAllocZeroAligned(MemoryContext context, Size size);
 extern void *MemoryContextAllocExtended(MemoryContext context,
 										Size size, int flags);
+extern void *MemoryContextAllocAligned(MemoryContext context,
+									   Size size, Size alignto, int flags);
+extern void *MemoryContextAllocIOAligned(MemoryContext context, Size size, int flags);
 
 extern void *palloc(Size size);
 extern void *palloc0(Size size);
 extern void *palloc_extended(Size size, int flags);
+extern void *palloc_aligned(Size size, Size alignto, int flags);
+extern void *palloc_io_aligned(Size size, int flags);
 extern pg_nodiscard void *repalloc(void *pointer, Size size);
 extern pg_nodiscard void *repalloc_extended(void *pointer,
 											Size size, int flags);
-- 
2.35.1

