From 31759bafc9a44442f14b6363689bc343f0f9fb86 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 4 Apr 2025 15:03:34 -0400
Subject: [PATCH v1 2/3] aio: Make AIO compatible with valgrind

In some edge cases valgrind flags issues with AIO related code. All of the
cases addressed in this change are false positives.

Most are caused by UnpinBuffer[NoOwner] marking buffer data as
inaccessible. This happens even though the AIO subsystem still holds a
pin. That's good, there shouldn't be accesses to the buffer outside of AIO
related code until it is pinned bu "user" code again. But it requires some
explicit work.

There is a bit of additional work due to temp tables, as valgrind does not
understand io_uring sufficiently to mark buffer contents as defined after a
read.

Per buildfarm animal skink.

Reviewed-by:
Discussion: https://postgr.es/m/3pd4322mogfmdd5nln3zphdwhtmq3rzdldqjwb2sfqzcgs22lf@ok2gletdaoe6
---
 src/include/storage/aio_internal.h        |  1 +
 src/backend/storage/aio/aio_io.c          | 23 +++++++++++
 src/backend/storage/aio/method_io_uring.c | 49 ++++++++++++++++++++++-
 src/backend/storage/aio/method_worker.c   | 18 +++++++++
 src/backend/storage/buffer/bufmgr.c       | 19 +++++++++
 5 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/src/include/storage/aio_internal.h b/src/include/storage/aio_internal.h
index 7f18da2c856..33f27b9fe50 100644
--- a/src/include/storage/aio_internal.h
+++ b/src/include/storage/aio_internal.h
@@ -344,6 +344,7 @@ extern PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh);
 extern void pgaio_io_perform_synchronously(PgAioHandle *ioh);
 extern const char *pgaio_io_get_op_name(PgAioHandle *ioh);
 extern bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd);
+extern int	pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov);
 
 /* aio_target.c */
 extern bool pgaio_io_can_reopen(PgAioHandle *ioh);
diff --git a/src/backend/storage/aio/aio_io.c b/src/backend/storage/aio/aio_io.c
index 4d31392ddc7..bd8be987526 100644
--- a/src/backend/storage/aio/aio_io.c
+++ b/src/backend/storage/aio/aio_io.c
@@ -210,3 +210,26 @@ pgaio_io_uses_fd(PgAioHandle *ioh, int fd)
 
 	return false;				/* silence compiler */
 }
+
+/*
+ * Return the iovecand its length. Currently only expected to be used by
+ * debugging infrastructure
+ */
+int
+pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov)
+{
+	Assert(ioh->state >= PGAIO_HS_DEFINED);
+
+	*iov = &pgaio_ctl->iovecs[ioh->iovec_off];
+
+	switch (ioh->op)
+	{
+		case PGAIO_OP_READV:
+			return ioh->op_data.read.iov_length;
+		case PGAIO_OP_WRITEV:
+			return ioh->op_data.write.iov_length;
+		default:
+			pg_unreachable();
+			return 0;
+	}
+}
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c
index c719ba2727a..1a90cadfd49 100644
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -38,6 +38,7 @@
 #include "storage/shmem.h"
 #include "storage/lwlock.h"
 #include "storage/procnumber.h"
+#include "utils/memdebug.h"
 #include "utils/wait_event.h"
 
 
@@ -324,6 +325,49 @@ pgaio_uring_completion_error_callback(void *arg)
 	errcontext("completing I/O on behalf of process %d", owner_pid);
 }
 
+static void
+pgaio_uring_io_process_completion(PgAioHandle *ioh, int32 res)
+{
+	/*
+	 * Valgrind does not understand io_uring sufficiently to mark the
+	 * referenced region as defined on IO completion, and it'd probably be
+	 * nontrivial to teach it to do so. So we just have to do the legwork
+	 * ourselves.
+	 */
+#ifdef USE_VALGRIND
+	if (ioh->op == PGAIO_OP_READV)
+	{
+		struct iovec *iov;
+		uint16		iov_length = pgaio_io_get_iovec_length(ioh, &iov);
+		int32		processed = 0;
+
+		for (int i = 0; i < iov_length; i++)
+		{
+			const char *base = iov[i].iov_base;
+			int32		len = iov[i].iov_len;
+
+			Assert(iov[i].iov_len <= PG_INT32_MAX);
+
+			if (processed + len <= res)
+				VALGRIND_MAKE_MEM_DEFINED(base, len);
+			else if (processed <= res)
+			{
+				size_t		middle = res - processed;
+
+				VALGRIND_MAKE_MEM_DEFINED(base, middle);
+				VALGRIND_MAKE_MEM_UNDEFINED(base + middle, len - middle);
+			}
+			else
+				VALGRIND_MAKE_MEM_UNDEFINED(base, len);
+
+			processed += len;
+		}
+	}
+#endif
+
+	pgaio_io_process_completion(ioh, res);
+}
+
 static void
 pgaio_uring_drain_locked(PgAioUringContext *context)
 {
@@ -361,13 +405,16 @@ pgaio_uring_drain_locked(PgAioUringContext *context)
 		for (int i = 0; i < ncqes; i++)
 		{
 			struct io_uring_cqe *cqe = cqes[i];
+			int32		res;
 			PgAioHandle *ioh;
 
 			ioh = io_uring_cqe_get_data(cqe);
 			errcallback.arg = ioh;
+			res = cqe->res;
+
 			io_uring_cqe_seen(&context->io_uring_ring, cqe);
 
-			pgaio_io_process_completion(ioh, cqe->res);
+			pgaio_uring_io_process_completion(ioh, res);
 			errcallback.arg = NULL;
 		}
 
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index 31d94ac82c5..232b79be4c4 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -42,6 +42,7 @@
 #include "storage/latch.h"
 #include "storage/proc.h"
 #include "tcop/tcopprot.h"
+#include "utils/memdebug.h"
 #include "utils/ps_status.h"
 #include "utils/wait_event.h"
 
@@ -529,6 +530,23 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 			error_errno = 0;
 			error_ioh = NULL;
 
+			/*
+			 * As part of IO completion the buffer will be marked as
+			 * non-accessible, until the buffer is pinned again. The next time
+			 * there is IO for the same buffer, the memory will be considered
+			 * inaccessible. Therefore we need to explicitly allow access to
+			 * the memory before reading data into it.
+			 */
+#ifdef USE_VALGRIND
+			{
+				struct iovec *iov;
+				uint16		iov_length = pgaio_io_get_iovec_length(ioh, &iov);
+
+				for (int i = 0; i < iov_length; i++)
+					VALGRIND_MAKE_MEM_UNDEFINED(iov[i].iov_base, iov[i].iov_len);
+			}
+#endif
+
 			/*
 			 * We don't expect this to ever fail with ERROR or FATAL, no need
 			 * to keep error_ioh set to the IO.
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 13cebad9b12..d34a1e335e5 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -6883,6 +6883,19 @@ buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
 	{
 		PgAioResult result_one;
 
+		/*
+		 * If the buffer is not currently pinned by this backend, e.g. because
+		 * we're completing this IO after an error, the buffer data will have
+		 * been marked as inaccessible when the buffer was unpinned. The AIO
+		 * subystem holds a pin, but that doesn't prevent the buffer from
+		 * having been marked as inaccessible. The completion might also be
+		 * executed in a different process.
+		 */
+#ifdef USE_VALGRIND
+		if (!BufferIsPinned(buffer))
+			VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
+#endif
+
 		if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
 							failed_checksum))
 		{
@@ -6901,6 +6914,12 @@ buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
 		else if (*failed_checksum)
 			*ignored_checksum = true;
 
+		/* undo what we did above */
+#ifdef USE_VALGRIND
+		if (!BufferIsPinned(buffer))
+			VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
+#endif
+
 		/*
 		 * Immediately log a message about the invalid page, but only to the
 		 * server log. The reason to do so immediately is that this may be
-- 
2.48.1.76.g4e746b1a31.dirty

