From f6adf05ffa5bdf43cd3ca2bcc4dba39d1474ce09 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 13 Dec 2022 16:25:59 +1300
Subject: [PATCH v3 1/3] Introduce PG_IO_ALIGN_SIZE and align all I/O buffers.

In order to be allowed to use O_DIRECT in a later commit, we need to
align buffers to the virtual memory page size.  O_DIRECT would either
fail to work or fail to work efficiently without that on various
platforms.  Even without O_DIRECT, aligning on memory pages improves
traditional buffered I/O performance.

The alignment size is set to 4096, which is enough for currently known
systems.  There is no standard governing the requirements for O_DIRECT so
it's possible we might have to reconsider this approach or fail to work
on some exotic system, but for now this simplistic approach works and
it can be changed at compile time.

Adjust all call sites that allocate heap memory for file I/O to use the
new palloc_aligned() or MemoryContextAllocAligned() functions.  For
stack-allocated buffers, introduce PGIOAlignedBlock to respect
PG_IO_ALIGN_SIZE, if possible with this compiler.  Also align the main
buffer pool in shared memory.

If arbitrary alignment of stack objects is not possible with this
compiler, then completely disable the use of O_DIRECT by setting
PG_O_DIRECT to 0.  (This avoids the need to consider systems that have
O_DIRECT but don't have a compiler with an extension that can align
stack objects the way we want; that could be done but we don't currently
know of any such system, so it's easier to pretend there is no O_DIRECT
support instead: that's an existing and tested class of system.)

Add assertions that all buffers passed into smgrread(), smgrwrite(),
smgrextend() are correctly aligned, if PG_O_DIRECT isn't 0.

Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
---
 contrib/bloom/blinsert.c                  |  2 +-
 contrib/pg_prewarm/pg_prewarm.c           |  2 +-
 src/backend/access/gist/gistbuild.c       |  9 +++---
 src/backend/access/hash/hashpage.c        |  2 +-
 src/backend/access/heap/rewriteheap.c     |  2 +-
 src/backend/access/heap/visibilitymap.c   |  2 +-
 src/backend/access/nbtree/nbtree.c        |  2 +-
 src/backend/access/nbtree/nbtsort.c       |  8 ++++--
 src/backend/access/spgist/spginsert.c     |  2 +-
 src/backend/access/transam/generic_xlog.c | 13 ++++++---
 src/backend/access/transam/xlog.c         |  9 +++---
 src/backend/catalog/storage.c             |  2 +-
 src/backend/storage/buffer/buf_init.c     | 10 +++++--
 src/backend/storage/buffer/bufmgr.c       |  2 +-
 src/backend/storage/buffer/localbuf.c     |  7 +++--
 src/backend/storage/file/buffile.c        |  6 ++++
 src/backend/storage/freespace/freespace.c |  2 +-
 src/backend/storage/page/bufpage.c        |  5 +++-
 src/backend/storage/smgr/md.c             | 15 +++++++++-
 src/backend/utils/sort/logtape.c          |  2 +-
 src/bin/pg_checksums/pg_checksums.c       |  2 +-
 src/bin/pg_rewind/local_source.c          |  4 +--
 src/bin/pg_upgrade/file.c                 |  4 +--
 src/common/file_utils.c                   |  2 +-
 src/include/c.h                           | 34 +++++++++++++++++------
 src/include/pg_config_manual.h            |  7 +++++
 src/include/storage/fd.h                  |  5 ++--
 src/tools/pgindent/typedefs.list          |  1 +
 28 files changed, 114 insertions(+), 49 deletions(-)

diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c
index dd26d6ac29..53cc617a66 100644
--- a/contrib/bloom/blinsert.c
+++ b/contrib/bloom/blinsert.c
@@ -166,7 +166,7 @@ blbuildempty(Relation index)
 	Page		metapage;
 
 	/* Construct metapage. */
-	metapage = (Page) palloc(BLCKSZ);
+	metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
 	BloomFillMetapage(index, metapage);
 
 	/*
diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c
index caff5c4a80..f50aa69eb2 100644
--- a/contrib/pg_prewarm/pg_prewarm.c
+++ b/contrib/pg_prewarm/pg_prewarm.c
@@ -36,7 +36,7 @@ typedef enum
 	PREWARM_BUFFER
 } PrewarmType;
 
-static PGAlignedBlock blockbuffer;
+static PGIOAlignedBlock blockbuffer;
 
 /*
  * pg_prewarm(regclass, mode text, fork text,
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index fb0f466708..d3d7d836e9 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -415,7 +415,7 @@ gist_indexsortbuild(GISTBuildState *state)
 	 * Write an empty page as a placeholder for the root page. It will be
 	 * replaced with the real root page at the end.
 	 */
-	page = palloc0(BLCKSZ);
+	page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
 	smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
 			   page, true);
 	state->pages_allocated++;
@@ -509,7 +509,8 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state,
 			levelstate->current_page++;
 
 		if (levelstate->pages[levelstate->current_page] == NULL)
-			levelstate->pages[levelstate->current_page] = palloc(BLCKSZ);
+			levelstate->pages[levelstate->current_page] =
+				palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
 
 		newPage = levelstate->pages[levelstate->current_page];
 		gistinitpage(newPage, old_page_flags);
@@ -579,7 +580,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
 
 		/* Create page and copy data */
 		data = (char *) (dist->list);
-		target = palloc0(BLCKSZ);
+		target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
 		gistinitpage(target, isleaf ? F_LEAF : 0);
 		for (int i = 0; i < dist->block.num; i++)
 		{
@@ -630,7 +631,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
 		if (parent == NULL)
 		{
 			parent = palloc0(sizeof(GistSortedBuildLevelState));
-			parent->pages[0] = (Page) palloc(BLCKSZ);
+			parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
 			parent->parent = NULL;
 			gistinitpage(parent->pages[0], 0);
 
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 55b2929ad5..147af95e92 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -992,7 +992,7 @@ static bool
 _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 {
 	BlockNumber lastblock;
-	PGAlignedBlock zerobuf;
+	PGIOAlignedBlock zerobuf;
 	Page		page;
 	HashPageOpaque ovflopaque;
 
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 2fe9e48e50..23d966940e 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -255,7 +255,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 
 	state->rs_old_rel = old_heap;
 	state->rs_new_rel = new_heap;
-	state->rs_buffer = (Page) palloc(BLCKSZ);
+	state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
 	/* new_heap needn't be empty, just locked */
 	state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
 	state->rs_buffer_valid = false;
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 4ed70275e2..3bd65b275b 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -620,7 +620,7 @@ static void
 vm_extend(Relation rel, BlockNumber vm_nblocks)
 {
 	BlockNumber vm_nblocks_now;
-	PGAlignedBlock pg;
+	PGIOAlignedBlock pg;
 	SMgrRelation reln;
 
 	PageInit((Page) pg.data, BLCKSZ, 0);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b52eca8f38..e8ac7390ae 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -153,7 +153,7 @@ btbuildempty(Relation index)
 	Page		metapage;
 
 	/* Construct metapage. */
-	metapage = (Page) palloc(BLCKSZ);
+	metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
 	_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
 
 	/*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 501e011ce1..5e3c461f6f 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -619,7 +619,7 @@ _bt_blnewpage(uint32 level)
 	Page		page;
 	BTPageOpaque opaque;
 
-	page = (Page) palloc(BLCKSZ);
+	page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
 
 	/* Zero the page and set up standard page header info */
 	_bt_pageinit(page, BLCKSZ);
@@ -660,7 +660,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 	while (blkno > wstate->btws_pages_written)
 	{
 		if (!wstate->btws_zeropage)
-			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
+			wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ,
+														  PG_IO_ALIGN_SIZE,
+														  MCXT_ALLOC_ZERO);
 		/* don't set checksum for all-zero page */
 		smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
 				   wstate->btws_pages_written++,
@@ -1170,7 +1172,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 	 * set to point to "P_NONE").  This changes the index to the "valid" state
 	 * by filling in a valid magic number in the metapage.
 	 */
-	metapage = (Page) palloc(BLCKSZ);
+	metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
 	_bt_initmetapage(metapage, rootblkno, rootlevel,
 					 wstate->inskey->allequalimage);
 	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index c6821b5952..6f14b41329 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -158,7 +158,7 @@ spgbuildempty(Relation index)
 	Page		page;
 
 	/* Construct metapage. */
-	page = (Page) palloc(BLCKSZ);
+	page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
 	SpGistInitMetapage(page);
 
 	/*
diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c
index 6db9a1fca1..458e270d55 100644
--- a/src/backend/access/transam/generic_xlog.c
+++ b/src/backend/access/transam/generic_xlog.c
@@ -58,14 +58,17 @@ typedef struct
 	char		delta[MAX_DELTA_SIZE];	/* delta between page images */
 } PageData;
 
-/* State of generic xlog record construction */
+/*
+ * State of generic xlog record construction.  Must be allocated at an I/O
+ * aligned address.
+ */
 struct GenericXLogState
 {
+	/* Page images (properly aligned, must be first) */
+	PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
 	/* Info about each page, see above */
 	PageData	pages[MAX_GENERIC_XLOG_PAGES];
 	bool		isLogged;
-	/* Page images (properly aligned) */
-	PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
 };
 
 static void writeFragment(PageData *pageData, OffsetNumber offset,
@@ -269,7 +272,9 @@ GenericXLogStart(Relation relation)
 	GenericXLogState *state;
 	int			i;
 
-	state = (GenericXLogState *) palloc(sizeof(GenericXLogState));
+	state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState),
+												PG_IO_ALIGN_SIZE,
+												0);
 	state->isLogged = RelationNeedsWAL(relation);
 
 	for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 91473b00d9..172b4a2fcf 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4502,7 +4502,7 @@ XLOGShmemSize(void)
 	/* xlblocks array */
 	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
 	/* extra alignment padding for XLOG I/O buffers */
-	size = add_size(size, XLOG_BLCKSZ);
+	size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
 	/* and the buffers themselves */
 	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
 
@@ -4599,10 +4599,11 @@ XLOGShmemInit(void)
 
 	/*
 	 * Align the start of the page buffers to a full xlog block size boundary.
-	 * This simplifies some calculations in XLOG insertion. It is also
-	 * required for O_DIRECT.
+	 * This simplifies some calculations in XLOG insertion.  We also need I/O
+	 * alignment for O_DIRECT, but that's also a power of two and usually
+	 * smaller.  Take the larger of the two alignment requirements.
 	 */
-	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
+	allocptr = (char *) TYPEALIGN(Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE), allocptr);
 	XLogCtl->pages = allocptr;
 	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
 
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index d708af19ed..0c5ac1f94b 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -451,7 +451,7 @@ void
 RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 					ForkNumber forkNum, char relpersistence)
 {
-	PGAlignedBlock buf;
+	PGIOAlignedBlock buf;
 	Page		page;
 	bool		use_wal;
 	bool		copying_initfork;
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 6b6264854e..76a30d44b7 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -78,9 +78,12 @@ InitBufferPool(void)
 						NBuffers * sizeof(BufferDescPadded),
 						&foundDescs);
 
+	/* Align buffer pool on IO page size boundary. */
 	BufferBlocks = (char *)
-		ShmemInitStruct("Buffer Blocks",
-						NBuffers * (Size) BLCKSZ, &foundBufs);
+		TYPEALIGN(PG_IO_ALIGN_SIZE,
+				  ShmemInitStruct("Buffer Blocks",
+								  NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
+								  &foundBufs));
 
 	/* Align condition variables to cacheline boundary. */
 	BufferIOCVArray = (ConditionVariableMinimallyPadded *)
@@ -163,7 +166,8 @@ BufferShmemSize(void)
 	/* to allow aligning buffer descriptors */
 	size = add_size(size, PG_CACHE_LINE_SIZE);
 
-	/* size of data pages */
+	/* size of data pages, plus alignment padding */
+	size = add_size(size, PG_IO_ALIGN_SIZE);
 	size = add_size(size, mul_size(NBuffers, BLCKSZ));
 
 	/* size of stuff controlled by freelist.c */
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 73d30bf619..aba07e94c9 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3717,7 +3717,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
 	bool		use_wal;
 	BlockNumber nblocks;
 	BlockNumber blkno;
-	PGAlignedBlock buf;
+	PGIOAlignedBlock buf;
 	BufferAccessStrategy bstrategy_src;
 	BufferAccessStrategy bstrategy_dst;
 
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 30d67d1c40..735f7c6018 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -546,8 +546,11 @@ GetLocalBufferStorage(void)
 		/* And don't overflow MaxAllocSize, either */
 		num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ);
 
-		cur_block = (char *) MemoryContextAlloc(LocalBufferContext,
-												num_bufs * BLCKSZ);
+		/* Buffers should be I/O aligned. */
+		cur_block = (char *)
+			TYPEALIGN(PG_IO_ALIGN_SIZE,
+					  MemoryContextAlloc(LocalBufferContext,
+										 num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE));
 		next_buf_in_block = 0;
 		num_bufs_in_block = num_bufs;
 	}
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index b0b4eeb3bd..2261c3ebe3 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -95,6 +95,12 @@ struct BufFile
 	off_t		curOffset;		/* offset part of current pos */
 	int			pos;			/* next read/write position in buffer */
 	int			nbytes;			/* total # of valid bytes in buffer */
+
+	/*
+	 * XXX Should ideally us PGIOAlignedBlock, but might need a way to avoid
+	 * wasting per-file alignment padding when some users create many
+	 * files.
+	 */
 	PGAlignedBlock buffer;
 };
 
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index a6b0533103..7230d538fd 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -608,7 +608,7 @@ static void
 fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 {
 	BlockNumber fsm_nblocks_now;
-	PGAlignedBlock pg;
+	PGIOAlignedBlock pg;
 	SMgrRelation reln;
 
 	PageInit((Page) pg.data, BLCKSZ, 0);
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 8b617c7e79..0728ce30c0 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -1522,7 +1522,10 @@ PageSetChecksumCopy(Page page, BlockNumber blkno)
 	 * and second to avoid wasting space in processes that never call this.
 	 */
 	if (pageCopy == NULL)
-		pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ);
+		pageCopy = MemoryContextAllocAligned(TopMemoryContext,
+											 BLCKSZ,
+											 PG_IO_ALIGN_SIZE,
+											 0);
 
 	memcpy(pageCopy, (char *) page, BLCKSZ);
 	((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 14b6fa0fd9..3e034afdf1 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -453,6 +453,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	int			nbytes;
 	MdfdVec    *v;
 
+	/* If this build supports direct I/O, the buffer must be I/O aligned. */
+	if (PG_O_DIRECT != 0)
+		Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
+
 	/* This assert is too expensive to have on normally ... */
 #ifdef CHECK_WRITE_VS_EXTEND
 	Assert(blocknum >= mdnblocks(reln, forknum));
@@ -675,6 +679,10 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	int			nbytes;
 	MdfdVec    *v;
 
+	/* If this build supports direct I/O, the buffer must be I/O aligned. */
+	if (PG_O_DIRECT != 0)
+		Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
+
 	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
 										reln->smgr_rlocator.locator.spcOid,
 										reln->smgr_rlocator.locator.dbOid,
@@ -740,6 +748,10 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	int			nbytes;
 	MdfdVec    *v;
 
+	/* If this build supports direct I/O, the buffer must be I/O aligned. */
+	if (PG_O_DIRECT != 0)
+		Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
+
 	/* This assert is too expensive to have on normally ... */
 #ifdef CHECK_WRITE_VS_EXTEND
 	Assert(blocknum < mdnblocks(reln, forknum));
@@ -1294,7 +1306,8 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			 */
 			if (nblocks < ((BlockNumber) RELSEG_SIZE))
 			{
-				char	   *zerobuf = palloc0(BLCKSZ);
+				char	   *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
+													 MCXT_ALLOC_ZERO);
 
 				mdextend(reln, forknum,
 						 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index c384f98e13..6ba5030a5f 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -252,7 +252,7 @@ ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
 	 */
 	while (blocknum > lts->nBlocksWritten)
 	{
-		PGAlignedBlock zerobuf;
+		PGIOAlignedBlock zerobuf;
 
 		MemSet(zerobuf.data, 0, sizeof(zerobuf));
 
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 7f3d5fc040..be6bae2b78 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -183,7 +183,7 @@ skipfile(const char *fn)
 static void
 scan_file(const char *fn, int segmentno)
 {
-	PGAlignedBlock buf;
+	PGIOAlignedBlock buf;
 	PageHeader	header = (PageHeader) buf.data;
 	int			f;
 	BlockNumber blockno;
diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c
index 2e50485c39..83b37a1e91 100644
--- a/src/bin/pg_rewind/local_source.c
+++ b/src/bin/pg_rewind/local_source.c
@@ -77,7 +77,7 @@ static void
 local_queue_fetch_file(rewind_source *source, const char *path, size_t len)
 {
 	const char *datadir = ((local_source *) source)->datadir;
-	PGAlignedBlock buf;
+	PGIOAlignedBlock buf;
 	char		srcpath[MAXPGPATH];
 	int			srcfd;
 	size_t		written_len;
@@ -129,7 +129,7 @@ local_queue_fetch_range(rewind_source *source, const char *path, off_t off,
 						size_t len)
 {
 	const char *datadir = ((local_source *) source)->datadir;
-	PGAlignedBlock buf;
+	PGIOAlignedBlock buf;
 	char		srcpath[MAXPGPATH];
 	int			srcfd;
 	off_t		begin = off;
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index 079fbda838..b5809236f6 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -178,8 +178,8 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile,
 {
 	int			src_fd;
 	int			dst_fd;
-	PGAlignedBlock buffer;
-	PGAlignedBlock new_vmbuf;
+	PGIOAlignedBlock buffer;
+	PGIOAlignedBlock new_vmbuf;
 	ssize_t		totalBytesRead = 0;
 	ssize_t		src_filesize;
 	int			rewriteVmBytesPerPage;
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index d8507d88a5..83ef0609a2 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -539,7 +539,7 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
 ssize_t
 pg_pwrite_zeros(int fd, size_t size)
 {
-	PGAlignedBlock zbuffer;		/* worth BLCKSZ */
+	PGIOAlignedBlock zbuffer;		/* worth BLCKSZ */
 	size_t		zbuffer_sz;
 	struct iovec iov[PG_IOV_MAX];
 	int			blocks;
diff --git a/src/include/c.h b/src/include/c.h
index bd6d8e5bf5..d811181fdf 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -1071,14 +1071,11 @@ extern void ExceptionalCondition(const char *conditionName,
 
 /*
  * Use this, not "char buf[BLCKSZ]", to declare a field or local variable
- * holding a page buffer, if that page might be accessed as a page and not
- * just a string of bytes.  Otherwise the variable might be under-aligned,
- * causing problems on alignment-picky hardware.  (In some places, we use
- * this to declare buffers even though we only pass them to read() and
- * write(), because copying to/from aligned buffers is usually faster than
- * using unaligned buffers.)  We include both "double" and "int64" in the
- * union to ensure that the compiler knows the value must be MAXALIGN'ed
- * (cf. configure's computation of MAXIMUM_ALIGNOF).
+ * holding a page buffer, if that page might be accessed as a page.  Otherwise
+ * the variable might be under-aligned, causing problems on alignment-picky
+ * hardware.  We include both "double" and "int64" in the union to ensure that
+ * the compiler knows the value must be MAXALIGN'ed (cf.  configure's
+ * computation of MAXIMUM_ALIGNOF).
  */
 typedef union PGAlignedBlock
 {
@@ -1087,9 +1084,30 @@ typedef union PGAlignedBlock
 	int64		force_align_i64;
 } PGAlignedBlock;
 
+/*
+ * Use this to declare a field or local variable holding a page buffer, if that
+ * page might be accessed as a page or passed to an SMgr I/O function.  If
+ * allocating using the MemoryContext API, the aligned allocation functions
+ * should be used with PG_IO_ALIGN_SIZE.  This alignment may be more efficient
+ * for I/O in general, but may be strictly required on some platforms when
+ * using direct I/O.
+ */
+typedef union PGIOAlignedBlock
+{
+#ifdef pg_attribute_aligned
+	pg_attribute_aligned(PG_IO_ALIGN_SIZE)
+#endif
+	char		data[BLCKSZ];
+	double		force_align_d;
+	int64		force_align_i64;
+} PGIOAlignedBlock;
+
 /* Same, but for an XLOG_BLCKSZ-sized buffer */
 typedef union PGAlignedXLogBlock
 {
+#ifdef pg_attribute_aligned
+	pg_attribute_aligned(PG_IO_ALIGN_SIZE)
+#endif
 	char		data[XLOG_BLCKSZ];
 	double		force_align_d;
 	int64		force_align_i64;
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index f2a106f983..323a4cfb4f 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -227,6 +227,13 @@
  */
 #define PG_CACHE_LINE_SIZE		128
 
+/*
+ * Assumed memory alignment requirement for direct I/O.  On currently known
+ * systems this size applies, even for memory that is backed by larger virtual
+ * memory pages.
+ */
+#define PG_IO_ALIGN_SIZE		4096
+
 /*
  *------------------------------------------------------------------------
  * The following symbols are for enabling debugging code, not for
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 7144fc9f60..85ef12c440 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -82,9 +82,10 @@ extern PGDLLIMPORT int max_safe_fds;
  * to the appropriate Windows flag in src/port/open.c.  We simulate it with
  * fcntl(F_NOCACHE) on macOS inside fd.c's open() wrapper.  We use the name
  * PG_O_DIRECT rather than defining O_DIRECT in that case (probably not a good
- * idea on a Unix).
+ * idea on a Unix).  We can only use it if the compiler will correctly align
+ * PGIOAlignedBlock for us, though.
  */
-#if defined(O_DIRECT)
+#if defined(O_DIRECT) && defined(pg_attribute_aligned)
 #define		PG_O_DIRECT O_DIRECT
 #elif defined(F_NOCACHE)
 #define		PG_O_DIRECT 0x80000000
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 60c71d05fe..9a77664154 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1687,6 +1687,7 @@ PGEventResultDestroy
 PGFInfoFunction
 PGFileType
 PGFunction
+PGIOAlignedBlock
 PGLZ_HistEntry
 PGLZ_Strategy
 PGMessageField
-- 
2.35.1

