>From f1f38829160d0f2998cd187f2f920cdc7b1fa709 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Sun, 29 Mar 2015 20:55:32 +0200
Subject: [PATCH] WIP: Saner heap extension.

---
 src/backend/access/heap/hio.c | 110 ++++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 47 deletions(-)

diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 6d091f6..178c417 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -15,6 +15,8 @@
 
 #include "postgres.h"
 
+#include "miscadmin.h"
+
 #include "access/heapam.h"
 #include "access/hio.h"
 #include "access/htup_details.h"
@@ -420,63 +422,77 @@ RelationGetBufferForTuple(Relation relation, Size len,
 	/*
 	 * Have to extend the relation.
 	 *
-	 * We have to use a lock to ensure no one else is extending the rel at the
-	 * same time, else we will both try to initialize the same new page.  We
-	 * can skip locking for new or temp relations, however, since no one else
-	 * could be accessing them.
+	 * To avoid, as it used to be the case, holding the extension lock during
+	 * victim buffer search for the new buffer, we extend the relation here
+	 * instead of relying on bufmgr.c. We still have to hold the extension
+	 * lock to prevent a race between two backends initializing the same page.
 	 */
-	needLock = !RELATION_IS_LOCAL(relation);
+	while(true)
+	{
+		char		emptybuf[BLCKSZ];
 
-	if (needLock)
-		LockRelationForExtension(relation, ExclusiveLock);
+		/*
+		 * We have to use a lock to ensure no one else is extending the rel at
+		 * the same time, else we will both try to initialize the same new
+		 * page.  We can skip locking for new or temp relations, however,
+		 * since no one else could be accessing them.
+		 */
+		needLock = !RELATION_IS_LOCAL(relation);
+		RelationOpenSmgr(relation);
 
-	/*
-	 * XXX This does an lseek - rather expensive - but at the moment it is the
-	 * only way to accurately determine how many blocks are in a relation.  Is
-	 * it worth keeping an accurate file length in shared memory someplace,
-	 * rather than relying on the kernel to do it for us?
-	 */
-	buffer = ReadBufferBI(relation, P_NEW, bistate);
+		MemSet((char *) emptybuf, 0, BLCKSZ);
+		PageInit((Page) emptybuf, BLCKSZ, 0);
 
-	/*
-	 * We can be certain that locking the otherBuffer first is OK, since it
-	 * must have a lower page number.
-	 */
-	if (otherBuffer != InvalidBuffer)
-		LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+		/*
+		 * Acquire extension lock to avoid two backends extending the
+		 * relation at the same time. This could be avoided by using
+		 * lseek(SEEK_END, +BLKSZ) *without* immediately writing to the
+		 * block. Then read in the page and only initialize after locking
+		 * it. Unclear whether it's a benefit or whether it might be too
+		 * likely to result in sparse files.
+		 */
+		if (needLock)
+			LockRelationForExtension(relation, ExclusiveLock);
 
-	/*
-	 * Now acquire lock on the new page.
-	 */
-	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+		targetBlock = smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
+		smgrextend(relation->rd_smgr, MAIN_FORKNUM, targetBlock,
+					   emptybuf, false);
 
-	/*
-	 * Release the file-extension lock; it's now OK for someone else to extend
-	 * the relation some more.  Note that we cannot release this lock before
-	 * we have buffer lock on the new page, or we risk a race condition
-	 * against vacuumlazy.c --- see comments therein.
-	 */
-	if (needLock)
-		UnlockRelationForExtension(relation, ExclusiveLock);
+		if (needLock)
+			UnlockRelationForExtension(relation, ExclusiveLock);
 
-	/*
-	 * We need to initialize the empty new page.  Double-check that it really
-	 * is empty (this should never happen, but if it does we don't want to
-	 * risk wiping out valid data).
-	 */
-	page = BufferGetPage(buffer);
+		buffer = ReadBufferBI(relation, targetBlock, bistate);
+
+		/*
+		 * We can be certain that locking the otherBuffer first is OK,
+		 * since it must have a lower page number.
+		 */
+		if (otherBuffer != InvalidBuffer)
+			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+
+		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
-	if (!PageIsNew(page))
-		elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
-			 BufferGetBlockNumber(buffer),
-			 RelationGetRelationName(relation));
+		page = BufferGetPage(buffer);
 
-	PageInit(page, BufferGetPageSize(buffer), 0);
+		Assert(!PageIsNew(page));
 
-	if (len > PageGetHeapFreeSpace(page))
-	{
-		/* We should not get here given the test at the top */
-		elog(PANIC, "tuple is too big: size %zu", len);
+		/*
+		 * While unlikely, it's possible that another backend managed to use
+		 * up the free space till we got the exclusive lock. That'd require
+		 * the page to be vacuumed (to be put on the free space list) and then
+		 * be used; possible but fairly unlikely in practice. If it happens,
+		 * just retry.
+		 */
+		if (len <= PageGetHeapFreeSpace(page))
+			break;
+
+		if (otherBuffer != InvalidBuffer)
+			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
+
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		ReleaseBuffer(buffer);
+
+		CHECK_FOR_INTERRUPTS();
 	}
 
 	/*
-- 
2.3.0.149.gf3f4077

