From 54585cc93cc5207b80c7dbd8f34772c49290e2f2 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <peter.geoghegan86@gmail.com>
Date: Mon, 24 Aug 2015 13:22:11 -0700
Subject: [PATCH 5/5] Use "tuple proper" memory pool in tuplesort

Allocating one pool of memory for use by READTUP() routines during an
on-the-fly merge phase of an external sort (where tuples are preloaded
from disk in batch) accelerates the merge phase significantly.  Merging
was a notable remaining bottleneck following previous commits
overhauling external sorting.

Sequentially consuming memory from the pool accelerates merging because
the natural order of all subsequent access (access during merging
proper) is sequential per tape; cache characteristics are improved.  In
addition, modern microarchitectures and compilers may even automatically
perform prefetching with a sequential access pattern.
---
 src/backend/utils/sort/tuplesort.c | 346 +++++++++++++++++++++++++++++++++----
 1 file changed, 315 insertions(+), 31 deletions(-)

diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index d828723..7941fcc 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -362,6 +362,23 @@ struct Tuplesortstate
 	int64	   *mergeavailmem;	/* availMem for prereading each tape */
 	int			mergefreelist;	/* head of freelist of recycled slots */
 	int			mergefirstfree; /* first slot never used in this merge */
+	int64		spacePerTape;	/* space (memory) segmented to each tape */
+
+	/*
+	 * During final on-the-fly merge steps, an ad-hoc "tuple proper" memory
+	 * pool is used, reducing palloc() overhead.  Importantly, this also
+	 * results in far more cache efficient merging, since each tape's tuples
+	 * must naturally be accessed sequentially (in sorted order).
+	 *
+	 * These variables manage each active tape's ownership of memory pool
+	 * space.  The underlying buffer is allocated once, just before
+	 * TSS_FINALMERGE prereading.
+	 */
+	bool		usePool;		/* using "tuple proper" memory pool? */
+	char	  **mergetupstart;	/* start of each run's partition */
+	char	  **mergetupcur;	/* current offset into each run's partition */
+	char	  **mergetuptail;	/* last appended item's start point */
+	char	  **mergeoverflow;	/* single retail palloc() "overflow" */
 
 	/*
 	 * Variables for Algorithm D.  Note that destTape is a "logical" tape
@@ -527,9 +544,13 @@ static void selectnewtape(Tuplesortstate *state);
 static void mergeruns(Tuplesortstate *state);
 static void mergeonerun(Tuplesortstate *state);
 static void mergememruns(Tuplesortstate *state);
-static void beginmerge(Tuplesortstate *state);
+static void beginmerge(Tuplesortstate *state, bool finalMerge);
+static void mergepool(Tuplesortstate *state);
+static void mergepoolone(Tuplesortstate *state, int srcTape,
+					  SortTuple *stup, bool *should_free);
 static void mergepreread(Tuplesortstate *state);
-static void mergeprereadone(Tuplesortstate *state, int srcTape);
+static void mergeprereadone(Tuplesortstate *state, int srcTape,
+					  SortTuple *stup, bool *should_free);
 static void dumptuples(Tuplesortstate *state, bool alltuples);
 static void dumpbatch(Tuplesortstate *state, bool alltuples);
 static void make_bounded_heap(Tuplesortstate *state);
@@ -541,6 +562,7 @@ static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
 static void reversedirection(Tuplesortstate *state);
 static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
 static void markrunend(Tuplesortstate *state, int tapenum);
+static void *tupproperalloc(Tuplesortstate *state, int tapenum, Size size);
 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
 				Tuplesortstate *state);
 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
@@ -669,6 +691,7 @@ tuplesort_begin_common(int workMem, double rowNumHint, bool randomAccess)
 	 * inittapes(), if needed
 	 */
 
+	state->usePool = false;		/* memory pool not used until final merge */
 	state->result_tape = -1;	/* flag that result tape has not been formed */
 
 	MemoryContextSwitchTo(oldcontext);
@@ -1780,6 +1803,7 @@ tuplesort_performsort(Tuplesortstate *state)
  * Internal routine to fetch the next tuple in either forward or back
  * direction into *stup.  Returns FALSE if no more tuples.
  * If *should_free is set, the caller must pfree stup.tuple when done with it.
+ * Otherwise, caller should not use tuple following next call here.
  */
 static bool
 tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
@@ -1791,6 +1815,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 	{
 		case TSS_SORTEDINMEM:
 			Assert(forward || state->randomAccess);
+			Assert(!state->usePool);
 			*should_free = false;
 			if (forward)
 			{
@@ -1856,6 +1881,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 
 		case TSS_SORTEDONTAPE:
 			Assert(forward || state->randomAccess);
+			Assert(!state->usePool);
 			*should_free = true;
 			if (forward)
 			{
@@ -1941,6 +1967,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 
 		case TSS_MEMTAPEMERGE:
 			Assert(forward);
+			Assert(!state->usePool);
 			/* For now, assume tuple returned from memory */
 			*should_free = false;
 
@@ -2065,7 +2092,9 @@ just_memtuples:
 
 		case TSS_FINALMERGE:
 			Assert(forward);
-			*should_free = true;
+			Assert(state->usePool);
+			/* For now, assume "tuple proper" is from memory pool */
+			*should_free = false;
 
 			/*
 			 * This code should match the inner loop of mergeonerun().
@@ -2073,18 +2102,17 @@ just_memtuples:
 			if (state->memtupcount > 0)
 			{
 				int			srcTape = state->memtuples[0].tupindex;
-				Size		tuplen;
 				int			tupIndex;
 				SortTuple  *newtup;
 
+				/*
+				 * Returned tuple is still counted in our memory space most
+				 * of the time.  See mergepoolone() for discussion of why
+				 * caller may occasionally be required to free returned
+				 * tuple, and how the preread memory pool is managed with
+				 * regard to edge cases more generally.
+				 */
 				*stup = state->memtuples[0];
-				/* returned tuple is no longer counted in our memory space */
-				if (stup->tuple)
-				{
-					tuplen = GetMemoryChunkSpace(stup->tuple);
-					state->availMem += tuplen;
-					state->mergeavailmem[srcTape] += tuplen;
-				}
 				tuplesort_heap_siftup(state, false);
 				if ((tupIndex = state->mergenext[srcTape]) == 0)
 				{
@@ -2092,9 +2120,11 @@ just_memtuples:
 					 * out of preloaded data on this tape, try to read more
 					 *
 					 * Unlike mergeonerun(), we only preload from the single
-					 * tape that's run dry.  See mergepreread() comments.
+					 * tape that's run dry, though not before preparing its
+					 * partition within the memory pool for a new round of
+					 * sequential consumption.  See mergepreread() comments.
 					 */
-					mergeprereadone(state, srcTape);
+					mergeprereadone(state, srcTape, stup, should_free);
 
 					/*
 					 * if still no data, we've reached end of run on this tape
@@ -2156,6 +2186,8 @@ tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
  * Fetch the next tuple in either forward or back direction.
  * Returns NULL if no more tuples.  If *should_free is set, the
  * caller must pfree the returned tuple when done with it.
+ * If it is not set, caller should not use tuple following next
+ * call here.
  */
 HeapTuple
 tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *should_free)
@@ -2175,6 +2207,8 @@ tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *should_free)
  * Fetch the next index tuple in either forward or back direction.
  * Returns NULL if no more tuples.  If *should_free is set, the
  * caller must pfree the returned tuple when done with it.
+ * If it is not set, caller should not use tuple following next
+ * call here.
  */
 IndexTuple
 tuplesort_getindextuple(Tuplesortstate *state, bool forward,
@@ -2472,6 +2506,10 @@ inittapes(Tuplesortstate *state)
 	state->mergelast = (int *) palloc0(maxTapes * sizeof(int));
 	state->mergeavailslots = (int *) palloc0(maxTapes * sizeof(int));
 	state->mergeavailmem = (int64 *) palloc0(maxTapes * sizeof(int64));
+	state->mergetupstart = (char **) palloc0(maxTapes * sizeof(char *));
+	state->mergetupcur = (char **) palloc0(maxTapes * sizeof(char *));
+	state->mergetuptail = (char **) palloc0(maxTapes * sizeof(char *));
+	state->mergeoverflow = (char **) palloc0(maxTapes * sizeof(char *));
 	state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
 	state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
 	state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
@@ -2659,7 +2697,7 @@ mergeruns(Tuplesortstate *state)
 				/* Tell logtape.c we won't be writing anymore */
 				LogicalTapeSetForgetFreeSpace(state->tapeset);
 				/* Initialize for the final merge pass */
-				beginmerge(state);
+				beginmerge(state, true);
 				state->status = TSS_FINALMERGE;
 				return;
 			}
@@ -2765,7 +2803,7 @@ mergeonerun(Tuplesortstate *state)
 	 * Start the merge by loading one tuple from each active source tape into
 	 * the heap.  We can also decrease the input run/dummy run counts.
 	 */
-	beginmerge(state);
+	beginmerge(state, false);
 
 	/*
 	 * Execute merge by repeatedly extracting lowest tuple in heap, writing it
@@ -2888,13 +2926,12 @@ mergememruns(Tuplesortstate *state)
  * fill the merge heap with the first tuple from each active tape.
  */
 static void
-beginmerge(Tuplesortstate *state)
+beginmerge(Tuplesortstate *state, bool finalMerge)
 {
 	int			activeTapes;
 	int			tapenum;
 	int			srcTape;
 	int			slotsPerTape;
-	int64		spacePerTape;
 
 	/* Heap should be empty here */
 	Assert(state->memtupcount == 0);
@@ -2933,19 +2970,29 @@ beginmerge(Tuplesortstate *state)
 	Assert(activeTapes > 0);
 	slotsPerTape = (state->memtupsize - state->mergefirstfree) / activeTapes;
 	Assert(slotsPerTape > 0);
-	spacePerTape = state->availMem / activeTapes;
+	state->spacePerTape = state->availMem / activeTapes;
 	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
 	{
 		if (state->mergeactive[srcTape])
 		{
 			state->mergeavailslots[srcTape] = slotsPerTape;
-			state->mergeavailmem[srcTape] = spacePerTape;
+			state->mergeavailmem[srcTape] = state->spacePerTape;
 		}
 	}
 
 	/*
+	 * Preallocate "tuple proper" pool memory, and partition pool among
+	 * tapes.  Actual memory allocation is performed here at most once per
+	 * sort, just in advance of the final on-the-fly merge step.  That
+	 * implies that the optimization is never used by randomAccess callers,
+	 * since no on-the-fly final merge step will occur.
+	 */
+	if (finalMerge)
+		mergepool(state);
+
+	/*
 	 * Preread as many tuples as possible (and at least one) from each active
-	 * tape
+	 * tape.  This may use "tuple proper" pool.
 	 */
 	mergepreread(state);
 
@@ -2971,6 +3018,152 @@ beginmerge(Tuplesortstate *state)
 }
 
+/*
+ * mergepool - initialize "tuple proper" memory pool
+ *
+ * This allows sequential access to sorted tuples buffered in memory from
+ * tapes/runs on disk during a final on-the-fly merge step.
+ */
+static void
+mergepool(Tuplesortstate *state)
+{
+	char	   *tupProperPool;
+	Size		poolSize = state->spacePerTape * state->activeTapes;
+	int			srcTape;
+	int			i;
+
+	/* Heap should be empty here */
+	Assert(state->memtupcount == 0);
+	Assert(state->activeTapes > 0);
+	Assert(!state->randomAccess);
+
+	/*
+	 * For the purposes of tuplesort's memory accounting, the memory pool is
+	 * not special, and so per-active-tape mergeavailmem is decreased only
+	 * as memory is consumed from the pool through regular USEMEM() calls
+	 * (see mergeprereadone()).  All memory is actually allocated here all
+	 * at once, with only rare exceptions, so freeing memory consumed from
+	 * the pool must for the most part also occur at a higher, "logical"
+	 * level.
+	 *
+	 * mergeavailmem is reset specially (logically freed) when the partition
+	 * space is exhausted for its tape when a new round of prereading is
+	 * required.  This is okay because there is no "global" availMem
+	 * consumer from here on.  The inability for such a consumer to consume
+	 * memory from the pool is therefore of no concern.
+	 *
+	 * mergeavailmem is only relied on by the on-the-fly merge step to
+	 * determine that space has been exhausted for a tape.  There still
+	 * needs to be some leeway to perform retail palloc() calls because
+	 * LACKMEM() is only a soft limit.  In general, mergeavailmem may be
+	 * insufficient for storing even one tuple.
+	 */
+	tupProperPool = MemoryContextAllocHuge(state->sortcontext, poolSize);
+	state->usePool = true;
+
+	i = 0;
+	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
+	{
+		if (!state->mergeactive[srcTape])
+			continue;
+
+		state->mergetupstart[srcTape] = tupProperPool + (i++ * state->spacePerTape);
+		/* Initialize current point into buffer */
+		state->mergetupcur[srcTape] = state->mergetupstart[srcTape];
+		state->mergetuptail[srcTape] = state->mergetupstart[srcTape];
+		state->mergeoverflow[srcTape] = NULL;
+	}
+	Assert(i == state->activeTapes);
+}
+
+/*
+ * mergepoolone - prepare preallocated pool for one merge input tape
+ *
+ * This is called following the exhaustion of preread tuples for one input
+ * tape.  While no memory is actually deallocated (or allocated) here, this
+ * routine could be said to logically free the source tape's segment of the
+ * memory pool.  Of course, all that actually occurs is that the memory pool
+ * state for the source tape is reset to indicate that all memory may be
+ * reused.  Because the tape's mergeavailmem is also reset to the general
+ * per-active-tape share, calling here will generally result in LACKMEM()
+ * ceasing to apply.
+ *
+ * This routine must deal with fixing up the tuple that is about to be
+ * returned to the client, due to fine aspects of memory management.
+ */
+static void
+mergepoolone(Tuplesortstate *state, int srcTape, SortTuple *stup,
+			 bool *should_free)
+{
+	Size		lastTupLen;
+
+	/* By here, final on-the-fly merge step actually underway */
+	Assert(state->status == TSS_FINALMERGE);
+
+	/*
+	 * Tuple about to be returned to caller ("stup") is final preread tuple
+	 * from tape, just removed from the top of the heap.  Special steps
+	 * around memory management must be performed for that tuple.
+	 */
+	if (!state->mergeoverflow[srcTape])
+	{
+		/*
+		 * Mark tuple proper buffer range for reuse, but be careful to move
+		 * final, tail tuple to start of space for next run so that it's
+		 * available to caller when stup is returned, and remains available
+		 * at least until the next tuple is requested.
+		 */
+		lastTupLen =
+			state->mergetupcur[srcTape] - state->mergetuptail[srcTape];
+		state->mergetupcur[srcTape] = state->mergetupstart[srcTape];
+
+		memmove(state->mergetupcur[srcTape],
+				state->mergetuptail[srcTape],
+				lastTupLen);
+
+		/* Make SortTuple at top of the heap point to new "tuple proper" */
+		stup->tuple = (void *) state->mergetupcur[srcTape];
+		state->mergetupcur[srcTape] += lastTupLen;
+	}
+	else
+	{
+		/*
+		 * Handle an "overflow" retail palloc.
+		 *
+		 * This is needed on the rare occasions when very little work_mem is
+		 * available, particularly relative to the size of individual tuples
+		 * passed by caller.  Sometimes, tuplesort will only store one tuple
+		 * per tape in memtuples, so some amount of dynamic allocation is
+		 * inevitable.
+		 */
+		Size		tuplen;
+
+		/* No moving of last "tuple proper" required */
+		lastTupLen = 0;
+		state->mergetupcur[srcTape] = state->mergetupstart[srcTape];
+
+		/* Returned tuple is no longer counted in our memory space */
+		if (stup->tuple)
+		{
+			Assert(stup->tuple == (void *) state->mergeoverflow[srcTape]);
+			tuplen = GetMemoryChunkSpace(stup->tuple);
+			state->availMem += tuplen;
+			state->mergeavailmem[srcTape] += tuplen;
+			/* Caller should free palloc'd tuple proper */
+			*should_free = true;
+		}
+		state->mergeoverflow[srcTape] = NULL;
+	}
+
+	/*
+	 * Give back tape's range in memory pool (accounting for tail contents
+	 * having been moved to front in the common case where there was no
+	 * handling of an "overflow" retail palloc).
+	 */
+	state->mergetuptail[srcTape] = state->mergetupstart[srcTape];
+	state->mergeavailmem[srcTape] = state->spacePerTape - lastTupLen;
+}
+
+/*
  * mergepreread - load tuples from merge input tapes
  *
  * This routine exists to improve sequentiality of reads during a merge pass,
@@ -2991,7 +3184,8 @@ beginmerge(Tuplesortstate *state)
  * that state and so no point in scanning through all the tapes to fix one.
  * (Moreover, there may be quite a lot of inactive tapes in that state, since
  * we might have had many fewer runs than tapes.  In a regular tape-to-tape
- * merge we can expect most of the tapes to be active.)
+ * merge we can expect most of the tapes to be active.  Plus, only FINALMERGE
+ * state has to consider memory management for the "tuple proper" memory pool.)
  */
 static void
 mergepreread(Tuplesortstate *state)
@@ -2999,7 +3193,7 @@ mergepreread(Tuplesortstate *state)
 	int			srcTape;
 
 	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
-		mergeprereadone(state, srcTape);
+		mergeprereadone(state, srcTape, NULL, NULL);
 }
 
 /*
@@ -3008,9 +3202,15 @@ mergepreread(Tuplesortstate *state)
  * Read tuples from the specified tape until it has used up its free memory
  * or array slots; but ensure that we have at least one tuple, if any are
  * to be had.
+ *
+ * FINALMERGE state passes *rtup and *should_free variables, to have
+ * pool-related memory management responsibilities handled by
+ * mergepoolone().  Otherwise a memory pool isn't used, and this is not
+ * required.
  */
 static void
-mergeprereadone(Tuplesortstate *state, int srcTape)
+mergeprereadone(Tuplesortstate *state, int srcTape, SortTuple *rtup,
+				bool *should_free)
 {
 	unsigned int tuplen;
 	SortTuple	stup;
@@ -3020,6 +3220,26 @@ mergeprereadone(Tuplesortstate *state, int srcTape)
 
 	if (!state->mergeactive[srcTape])
 		return;					/* tape's run is already exhausted */
+
+	/*
+	 * Manage memory pool segment for tape (if pool is in use).
+	 *
+	 * This is also a natural point to redraw partitioning boundaries inside
+	 * the preread memory pool, by donating the unneeded memory of our tape
+	 * (once it becomes exhausted) to some adjacent active tape.
+	 *
+	 * For now we don't bother, though.  It does not seem to help much even
+	 * in the event of a strong logical/physical correlation, where earlier
+	 * runs will be drained before later runs return their earliest/lowest
+	 * tuples.  A contributing factor must be that the size of memtuples
+	 * during the merge phase is based on memory accounting with significant
+	 * palloc() overhead.  There is almost always initially somewhat more
+	 * memory for each tape in the pool than is needed, because the number
+	 * of slots available tends to be the limiting factor then.
+	 */
+	if (rtup)
+		mergepoolone(state, srcTape, rtup, should_free);
+
 	priorAvail = state->availMem;
 	state->availMem = state->mergeavailmem[srcTape];
 	while ((state->mergeavailslots[srcTape] > 0 && !LACKMEM(state)) ||
@@ -3666,6 +3886,68 @@ markrunend(Tuplesortstate *state, int tapenum)
 	LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
 }
 
+/*
+ * Allocate memory either using palloc(), or using a dedicated memory pool
+ * "logical allocation" during tuple preloading.  READTUP() routines call
+ * here in place of a palloc() and USEMEM() call.
+ *
+ * READTUP() routines may receive memory from the memory pool when calling
+ * here, but at that point the tuples cannot subsequently be used with
+ * WRITETUP() routines, since they are unprepared for any "tuple proper" not
+ * allocated with a retail palloc().
+ *
+ * In the main, it doesn't seem worth optimizing the case where WRITETUP()
+ * must be called following consuming memory from the "tuple proper" pool
+ * (and allocating the pool earlier).  During initial sorts of runs, the
+ * order in which tuples will finally need to be written out is
+ * unpredictable, so no big improvement in cache efficiency should be
+ * expected from using a memory pool.  Multi-pass sorts are usually
+ * unnecessary and generally best avoided, so again, the non-use of a pool
+ * is not considered a problem.  However, randomAccess callers might
+ * appreciably benefit from using a memory pool; that case may be
+ * revisited as a target for memory pooling in the future.
+ */
+static void *
+tupproperalloc(Tuplesortstate *state, int tapenum, Size size)
+{
+	Size		reserve_size = MAXALIGN(size);
+	char	   *ret;
+
+	if (!state->usePool)
+	{
+		/* Memory pool not in use */
+		ret = palloc(size);
+		USEMEM(state, GetMemoryChunkSpace(ret));
+	}
+	else if (state->mergetupcur[tapenum] + reserve_size <
+			 state->mergetupstart[tapenum] + state->spacePerTape)
+	{
+		/*
+		 * Usual case -- caller is returned memory from its tape's partition
+		 * in buffer, since there is an adequate supply of memory.
+		 */
+		ret = state->mergetuptail[tapenum] = state->mergetupcur[tapenum];
+
+		/*
+		 * Logically, for accounting purposes, memory from our pool has only
+		 * been allocated now.  We expect reserve_size to actually diminish
+		 * our tape's mergeavailmem, not "global" availMem.
+		 */
+		state->mergetupcur[tapenum] += reserve_size;
+		USEMEM(state, reserve_size);
+	}
+	else
+	{
+		/* Should only use overflow allocation once per tape per level */
+		Assert(state->mergeoverflow[tapenum] == NULL);
+		/* palloc() in ordinary way */
+		ret = state->mergeoverflow[tapenum] = palloc(size);
+		USEMEM(state, GetMemoryChunkSpace(ret));
+	}
+
+	return ret;
+}
+
 
 /*
  * Routines specialized for HeapTuple (actually MinimalTuple) case
@@ -3826,6 +4108,7 @@ writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
 		LogicalTapeWrite(state->tapeset, tapenum,
 						 (void *) &tuplen, sizeof(tuplen));
 
+	Assert(!state->usePool);
 	FREEMEM(state, GetMemoryChunkSpace(tuple));
 	heap_free_minimal_tuple(tuple);
 }
@@ -3836,11 +4119,10 @@ readtup_heap(Tuplesortstate *state, SortTuple *stup,
 {
 	unsigned int tupbodylen = len - sizeof(int);
 	unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
-	MinimalTuple tuple = (MinimalTuple) palloc(tuplen);
+	MinimalTuple tuple = (MinimalTuple) tupproperalloc(state, tapenum, tuplen);
 	char	   *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
 	HeapTupleData htup;
 
-	USEMEM(state, GetMemoryChunkSpace(tuple));
 	/* read in the tuple proper */
 	tuple->t_len = tuplen;
 	LogicalTapeReadExact(state->tapeset, tapenum,
@@ -4059,6 +4341,7 @@ writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
 		LogicalTapeWrite(state->tapeset, tapenum,
 						 &tuplen, sizeof(tuplen));
 
+	Assert(!state->usePool);
 	FREEMEM(state, GetMemoryChunkSpace(tuple));
 	heap_freetuple(tuple);
 }
@@ -4068,9 +4351,9 @@ readtup_cluster(Tuplesortstate *state, SortTuple *stup,
 				int tapenum, unsigned int tuplen)
 {
 	unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
-	HeapTuple	tuple = (HeapTuple) palloc(t_len + HEAPTUPLESIZE);
+	HeapTuple	tuple = (HeapTuple) tupproperalloc(state, tapenum,
+												   t_len + HEAPTUPLESIZE);
 
-	USEMEM(state, GetMemoryChunkSpace(tuple));
 	/* Reconstruct the HeapTupleData header */
 	tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
 	tuple->t_len = t_len;
@@ -4359,6 +4642,7 @@ writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
 		LogicalTapeWrite(state->tapeset, tapenum,
 						 (void *) &tuplen, sizeof(tuplen));
 
+	Assert(!state->usePool);
 	FREEMEM(state, GetMemoryChunkSpace(tuple));
 	pfree(tuple);
 }
@@ -4368,9 +4652,8 @@ readtup_index(Tuplesortstate *state, SortTuple *stup,
 			  int tapenum, unsigned int len)
 {
 	unsigned int tuplen = len - sizeof(unsigned int);
-	IndexTuple	tuple = (IndexTuple) palloc(tuplen);
+	IndexTuple	tuple = (IndexTuple) tupproperalloc(state, tapenum, tuplen);
 
-	USEMEM(state, GetMemoryChunkSpace(tuple));
 	LogicalTapeReadExact(state->tapeset, tapenum,
 						 tuple, tuplen);
 	if (state->randomAccess)	/* need trailing length word? */
@@ -4450,6 +4733,7 @@ writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
 		LogicalTapeWrite(state->tapeset, tapenum,
 						 (void *) &writtenlen, sizeof(writtenlen));
 
+	Assert(!state->usePool);
 	if (stup->tuple)
 	{
 		FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
@@ -4480,14 +4764,13 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup,
 	}
 	else
 	{
-		void	   *raddr = palloc(tuplen);
+		void	   *raddr = tupproperalloc(state, tapenum, tuplen);
 
 		LogicalTapeReadExact(state->tapeset, tapenum,
 							 raddr, tuplen);
 		stup->datum1 = PointerGetDatum(raddr);
 		stup->isnull1 = false;
 		stup->tuple = raddr;
-		USEMEM(state, GetMemoryChunkSpace(raddr));
 	}
 
 	if (state->randomAccess)	/* need trailing length word? */
@@ -4501,6 +4784,7 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup,
 static void
 free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
 {
+	Assert(!state->usePool);
 	FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
 	pfree(stup->tuple);
 }
-- 
1.9.1

