>From 5bc40c11081a0d034f2857b45cb3d45c5b63fa58 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 30 Oct 2014 13:24:52 +0100
Subject: [PATCH 2/2] Much faster/O(1) mfdvec.

---
 src/backend/storage/smgr/md.c   | 307 +++++++++++++++++++++-------------------
 src/backend/storage/smgr/smgr.c |   4 +-
 src/include/storage/smgr.h      |   4 +-
 3 files changed, 166 insertions(+), 149 deletions(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 167d61c..6a7afa8 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -92,27 +92,22 @@
  *	out to an unlinked old copy of a segment file that will eventually
  *	disappear.
  *
- *	The file descriptor pointer (md_fd field) stored in the SMgrRelation
- *	cache is, therefore, just the head of a list of MdfdVec objects, one
- *	per segment.  But note the md_fd pointer can be NULL, indicating
- *	relation not open.
+ *	File descriptors are stored in the md_seg_fds arrays inside
+ *	SMgrRelation. The length of these arrays is stored in md_seg_no.  Note
+ *	that md_nfd having a specific value does not necessarily mean the relation
+ *	doesn't have additional segments; we may just not have opened the next
+ *	segment yet.  (We could not have "all segments are in the array" as an
+ *	invariant anyway, since another backend could extend the relation when we
+ *	weren't looking.)  We do not have entries for inactive segments, however;
+ *	as soon as we find a partial segment, we assume that any subsequent
+ *	segments are inactive.
  *
- *	Also note that mdfd_chain == NULL does not necessarily mean the relation
- *	doesn't have another segment after this one; we may just not have
- *	opened the next segment yet.  (We could not have "all segments are
- *	in the chain" as an invariant anyway, since another backend could
- *	extend the relation when we weren't looking.)  We do not make chain
- *	entries for inactive segments, however; as soon as we find a partial
- *	segment, we assume that any subsequent segments are inactive.
- *
- *	All MdfdVec objects are palloc'd in the MdCxt memory context.
+ *	The entire MdfdVec array is palloc'd in the MdCxt memory context.
  */
-
 typedef struct _MdfdVec
 {
 	File		mdfd_vfd;		/* fd number in fd.c's pool */
 	BlockNumber mdfd_segno;		/* segment number, from 0 */
-	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
 } MdfdVec;
 
 static MemoryContext MdCxt;		/* context for all MdfdVec objects */
@@ -178,7 +173,9 @@ static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
 					   MdfdVec *seg);
 static void register_unlink(RelFileNodeBackend rnode);
-static MdfdVec *_fdvec_alloc(void);
+static void _fdvec_alloc(SMgrRelation reln,
+							  ForkNumber forknum,
+							  size_t nseg);
 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
 			  BlockNumber segno);
 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
@@ -288,13 +285,14 @@ mdexists(SMgrRelation reln, ForkNumber forkNum)
 void
 mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 {
+	MdfdVec    *mdfd;
 	char	   *path;
 	File		fd;
 
-	if (isRedo && reln->md_fd[forkNum] != NULL)
+	if (isRedo && reln->md_seg_no[forkNum] > 0)
 		return;					/* created and opened already... */
 
-	Assert(reln->md_fd[forkNum] == NULL);
+	Assert(reln->md_seg_no[forkNum] == 0);
 
 	path = relpath(reln->smgr_rnode, forkNum);
 
@@ -324,11 +322,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 
 	pfree(path);
 
-	reln->md_fd[forkNum] = _fdvec_alloc();
-
-	reln->md_fd[forkNum]->mdfd_vfd = fd;
-	reln->md_fd[forkNum]->mdfd_segno = 0;
-	reln->md_fd[forkNum]->mdfd_chain = NULL;
+	_fdvec_alloc(reln, forkNum, 1);
+	mdfd = &reln->md_seg_fds[forkNum][0];
+	mdfd->mdfd_vfd = fd;
+	mdfd->mdfd_segno = 0;
 }
 
 /*
@@ -573,8 +570,8 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
 	File		fd;
 
 	/* No work if already open */
-	if (reln->md_fd[forknum])
-		return reln->md_fd[forknum];
+	if (reln->md_seg_no[forknum] > 0)
+		return reln->md_seg_fds[forknum];
 
 	path = relpath(reln->smgr_rnode, forknum);
 
@@ -606,11 +603,11 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
 
 	pfree(path);
 
-	reln->md_fd[forknum] = mdfd = _fdvec_alloc();
-
+	_fdvec_alloc(reln, forknum, 1);
+	mdfd = &reln->md_seg_fds[forknum][0];
 	mdfd->mdfd_vfd = fd;
 	mdfd->mdfd_segno = 0;
-	mdfd->mdfd_chain = NULL;
+
 	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 
 	return mdfd;
@@ -622,24 +619,21 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
 void
 mdclose(SMgrRelation reln, ForkNumber forknum)
 {
-	MdfdVec    *v = reln->md_fd[forknum];
-
-	/* No work if already closed */
-	if (v == NULL)
-		return;
+	size_t		nopensegs;
 
-	reln->md_fd[forknum] = NULL;	/* prevent dangling pointer after error */
+	nopensegs = reln->md_seg_no[forknum];
 
-	while (v != NULL)
+	/* close segments starting from the end */
+	while (nopensegs > 0)
 	{
-		MdfdVec    *ov = v;
+		MdfdVec    *v = &reln->md_seg_fds[forknum][nopensegs - 1];
 
 		/* if not closed already */
 		if (v->mdfd_vfd >= 0)
 			FileClose(v->mdfd_vfd);
-		/* Now free vector */
-		v = v->mdfd_chain;
-		pfree(ov);
+
+		nopensegs--;
+		_fdvec_alloc(reln, forknum, nopensegs);
 	}
 }
 
@@ -814,6 +808,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 	BlockNumber nblocks;
 	BlockNumber segno = 0;
 
+	/* mdopen has opened the first segment */
+	Assert(reln->md_seg_no[forknum] > 0);
+
 	/*
 	 * Skip through any segments that aren't the last one, to avoid redundant
 	 * seeks on them.  We have previously verified that these segments are
@@ -827,11 +824,8 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 	 * segments; that's OK because the checkpointer never needs to compute
 	 * relation size.)
 	 */
-	while (v->mdfd_chain != NULL)
-	{
-		segno++;
-		v = v->mdfd_chain;
-	}
+	segno = reln->md_seg_no[forknum] - 1;
+	v = &reln->md_seg_fds[forknum][segno];
 
 	for (;;)
 	{
@@ -846,23 +840,17 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 		 */
 		segno++;
 
-		if (v->mdfd_chain == NULL)
-		{
-			/*
-			 * Because we pass O_CREAT, we will create the next segment (with
-			 * zero length) immediately, if the last segment is of length
-			 * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
-			 * the logic simple.
-			 */
-			v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
-			if (v->mdfd_chain == NULL)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not open file \"%s\": %m",
-								_mdfd_segpath(reln, forknum, segno))));
-		}
-
-		v = v->mdfd_chain;
+		/*
+		 * Because we pass O_CREAT, we will create the next segment (with zero
+		 * length) immediately, if the last segment is of length RELSEG_SIZE.
+		 * While perhaps not strictly necessary, this keeps the logic simple.
+		 */
+		v = _mdfd_openseg(reln, forknum, segno, O_CREAT);
+		if (v == NULL)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not open file \"%s\": %m",
+							_mdfd_segpath(reln, forknum, segno))));
 	}
 }
 
@@ -872,9 +860,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 void
 mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 {
-	MdfdVec    *v;
 	BlockNumber curnblk;
 	BlockNumber priorblocks;
+	size_t		curopensegs;
 
 	/*
 	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -894,19 +882,24 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 	if (nblocks == curnblk)
 		return;					/* no work */
 
-	v = mdopen(reln, forknum, EXTENSION_FAIL);
-
-	priorblocks = 0;
-	while (v != NULL)
+	/*
+	 * Truncate segments, starting at the last one. Starting at the end makes
+	 * managing the memory for the fd array easier should there be errors.
+	 */
+	curopensegs = reln->md_seg_no[forknum];
+	while (curopensegs > 0 )
 	{
-		MdfdVec    *ov = v;
+		MdfdVec    *v;
+
+		priorblocks = (curopensegs - 1) * RELSEG_SIZE;
+
+		v = &reln->md_seg_fds[forknum][curopensegs - 1];
 
 		if (priorblocks > nblocks)
 		{
 			/*
-			 * This segment is no longer active (and has already been unlinked
-			 * from the mdfd_chain). We truncate the file, but do not delete
-			 * it, for reasons explained in the header comments.
+			 * This segment is no longer active. We truncate the file, but do
+			 * not delete it, for reasons explained in the header comments.
 			 */
 			if (FileTruncate(v->mdfd_vfd, 0) < 0)
 				ereport(ERROR,
@@ -916,20 +909,20 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 
 			if (!SmgrIsTemp(reln))
 				register_dirty_segment(reln, forknum, v);
-			v = v->mdfd_chain;
-			Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
-												 * segment */
-			pfree(ov);
+
+			/* we never drop the 1st segment */
+			Assert(v != &reln->md_seg_fds[forknum][0]);
+
+			_fdvec_alloc(reln, forknum, curopensegs - 1);
 		}
 		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
 		{
 			/*
 			 * This is the last segment we want to keep. Truncate the file to
-			 * the right length, and clear chain link that points to any
-			 * remaining segments (which we shall zap). NOTE: if nblocks is
-			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
-			 * segment to 0 length but keep it. This adheres to the invariant
-			 * given in the header comments.
+			 * the right length. NOTE: if nblocks is exactly a multiple K of
+			 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
+			 * keep it. This adheres to the invariant given in the header
+			 * comments.
 			 */
 			BlockNumber lastsegblocks = nblocks - priorblocks;
 
@@ -941,18 +934,16 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 						   nblocks)));
 			if (!SmgrIsTemp(reln))
 				register_dirty_segment(reln, forknum, v);
-			v = v->mdfd_chain;
-			ov->mdfd_chain = NULL;
 		}
 		else
 		{
 			/*
-			 * We still need this segment and 0 or more blocks beyond it, so
-			 * nothing to do here.
+			 * We still need this segment, so nothing to do for this and any
+			 * earlier segment.
 			 */
-			v = v->mdfd_chain;
+			break;
 		}
-		priorblocks += RELSEG_SIZE;
+		curopensegs--;
 	}
 }
 
@@ -965,7 +956,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 void
 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 {
-	MdfdVec    *v;
+	size_t		i;
 
 	/*
 	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -973,16 +964,18 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 	 */
 	mdnblocks(reln, forknum);
 
-	v = mdopen(reln, forknum, EXTENSION_FAIL);
+	i = reln->md_seg_no[forknum];
 
-	while (v != NULL)
+	while (i > 0)
 	{
+		MdfdVec    *v = &reln->md_seg_fds[forknum][i];
+
 		if (FileSync(v->mdfd_vfd) < 0)
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m",
 							FilePathName(v->mdfd_vfd))));
-		v = v->mdfd_chain;
+		i--;
 	}
 }
 
@@ -1645,10 +1638,29 @@ ForgetDatabaseFsyncRequests(Oid dbid)
 /*
  *	_fdvec_alloc() -- Make a MdfdVec object.
  */
-static MdfdVec *
-_fdvec_alloc(void)
+static void
+_fdvec_alloc(SMgrRelation reln,
+			 ForkNumber forknum,
+			 size_t nseg)
 {
-	return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
+	if (nseg == 0)
+	{
+		if (reln->md_seg_no[forknum] > 0)
+			pfree(reln->md_seg_fds[forknum]);
+	}
+	else if (reln->md_seg_no[forknum] == 0)
+	{
+		reln->md_seg_fds[forknum] =
+			MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
+	}
+	else
+	{
+		reln->md_seg_fds[forknum] =
+			repalloc(reln->md_seg_fds[forknum],
+					 sizeof(MdfdVec) * nseg);
+	}
+
+	reln->md_seg_no[forknum] = nseg;
 }
 
 /*
@@ -1696,13 +1708,14 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
 	if (fd < 0)
 		return NULL;
 
-	/* allocate an mdfdvec entry for it */
-	v = _fdvec_alloc();
+	if (reln->md_seg_no[forknum] < segno + 1)
+		_fdvec_alloc(reln, forknum, segno + 1);
 
 	/* fill the entry */
+	v = &reln->md_seg_fds[forknum][segno];
 	v->mdfd_vfd = fd;
 	v->mdfd_segno = segno;
-	v->mdfd_chain = NULL;
+
 	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
 
 	/* all done */
@@ -1721,65 +1734,69 @@ static MdfdVec *
 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			 bool skipFsync, ExtensionBehavior behavior)
 {
-	MdfdVec    *v = mdopen(reln, forknum, behavior);
+	MdfdVec    *v = NULL;
 	BlockNumber targetseg;
 	BlockNumber nextsegno;
 
-	if (!v)
-		return NULL;			/* only possible if EXTENSION_RETURN_NULL */
-
 	targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
-	for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
+	nextsegno = targetseg;
+
+	/* perhaps it's already open */
+	if (reln->md_seg_no[forknum] >= targetseg)
 	{
-		Assert(nextsegno == v->mdfd_segno + 1);
+		v =_mdfd_openseg(reln, forknum, targetseg, 0);
+		goto done;
+	}
+
+	/* check the size of the relation, this opens all segments */
+	mdnblocks(reln, forknum);
 
-		if (v->mdfd_chain == NULL)
+	/* check whether it's an existing segment */
+	if (reln->md_seg_no[forknum] >= targetseg)
+	{
+		v = _mdfd_openseg(reln, forknum, targetseg, 0);
+		goto done;
+	}
+
+	if (behavior != EXTENSION_CREATE && !InRecovery)
+		goto done;
+
+
+	if (reln->md_seg_no[forknum] > 0)
+		v = &reln->md_seg_fds[forknum][reln->md_seg_no[forknum] - 1];
+
+	/*
+	 * We only get here if we want to extend the filenode to have one more
+	 * relation. For that we first need to extend the last existing segment to
+	 * be of the full size if necessary, and then add the new segment(s).
+	 */
+	nextsegno = reln->md_seg_no[forknum] + 1;
+	for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
+	{
+		if (v != NULL && _mdnblocks(reln, forknum, v) < RELSEG_SIZE)
 		{
-			/*
-			 * Normally we will create new segments only if authorized by the
-			 * caller (i.e., we are doing mdextend()).  But when doing WAL
-			 * recovery, create segments anyway; this allows cases such as
-			 * replaying WAL data that has a write into a high-numbered
-			 * segment of a relation that was later deleted.  We want to go
-			 * ahead and create the segments so we can finish out the replay.
-			 *
-			 * We have to maintain the invariant that segments before the last
-			 * active segment are of size RELSEG_SIZE; therefore, pad them out
-			 * with zeroes if needed.  (This only matters if caller is
-			 * extending the relation discontiguously, but that can happen in
-			 * hash indexes.)
-			 */
-			if (behavior == EXTENSION_CREATE || InRecovery)
-			{
-				if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
-				{
-					char	   *zerobuf = palloc0(BLCKSZ);
+			char	   *zerobuf = palloc0(BLCKSZ);
 
-					mdextend(reln, forknum,
-							 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
-							 zerobuf, skipFsync);
-					pfree(zerobuf);
-				}
-				v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
-			}
-			else
-			{
-				/* We won't create segment if not existent */
-				v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
-			}
-			if (v->mdfd_chain == NULL)
-			{
-				if (behavior == EXTENSION_RETURN_NULL &&
-					FILE_POSSIBLY_DELETED(errno))
-					return NULL;
-				ereport(ERROR,
-						(errcode_for_file_access(),
-				   errmsg("could not open file \"%s\" (target block %u): %m",
-						  _mdfd_segpath(reln, forknum, nextsegno),
-						  blkno)));
-			}
+			mdextend(reln, forknum,
+					 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+					 zerobuf, skipFsync);
+			pfree(zerobuf);
 		}
-		v = v->mdfd_chain;
+
+		v = _mdfd_openseg(reln, forknum, nextsegno, O_CREAT);
+	}
+
+done:
+	if (v == NULL && behavior == EXTENSION_RETURN_NULL &&
+		FILE_POSSIBLY_DELETED(errno))
+		return NULL;
+	else if (v == NULL)
+	{
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\" (target block %u): %m",
+						_mdfd_segpath(reln, forknum, nextsegno),
+						blkno)));
 	}
 	return v;
 }
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index d16f559..43b6bae 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -173,7 +173,7 @@ smgropen(RelFileNode rnode, BackendId backend)
 
 		/* mark it not open */
 		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-			reln->md_fd[forknum] = NULL;
+			reln->md_seg_no[forknum] = 0;
 
 		/* it has no owner yet */
 		add_to_unowned_list(reln);
@@ -378,7 +378,7 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 	 * Exit quickly in WAL replay mode if we've already opened the file. If
 	 * it's open, it surely must exist.
 	 */
-	if (isRedo && reln->md_fd[forknum] != NULL)
+	if (isRedo && reln->md_seg_no[forknum] > 0)
 		return;
 
 	/*
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index ba7c909..e6758c6 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -64,8 +64,8 @@ typedef struct SMgrRelationData
 	 */
 	int			smgr_which;		/* storage manager selector */
 
-	/* for md.c; NULL for forks that are not open */
-	struct _MdfdVec *md_fd[MAX_FORKNUM + 1];
+	size_t		md_seg_no[MAX_FORKNUM + 1];
+	struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
 
 	/* if unowned, list link in list of all unowned SMgrRelations */
 	struct SMgrRelationData *next_unowned_reln;
-- 
2.0.0.rc2.4.g1dc51c6.dirty

