Replace polyphase merge algorithm with a simple balanced k-way merge.

The advantage of polyphase merge is that it can reuse the input tapes as output tapes efficiently, but that is irrelevant on modern hardware, when we can easily emulate any number of tape drives. The number of input tapes we can/should use during merging is limited by work_mem, but output tapes that we are not currently writing to only cost a little bit of memory, so there is no need to skimp on them. Refactor LogicalTapeSet/LogicalTape interface. All the tape functions, like LogicalTapeRead and LogicalTapeWrite, take a LogicalTape as argument, instead of LogicalTapeSet+tape number. You can create any number of LogicalTapes in a single LogicalTapeSet, and you don't need to decide the number upfront, when you create the tape set.
author: Heikki Linnakangas 2016-10-12 09:26:54 +0000
committer: Heikki Linnakangas 2018-05-02 11:22:47 +0000
commit: 31772f7af8cd890c9ee57a68e3f68378e7c9af68 (patch)
tree: 2d6eb510ef5e2affe318769682587cad5b10dee1
parent: 4d427a4ca7572d025b8f94710ccee6476289b9ca (diff)
7 files changed, 534 insertions, 803 deletions
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index 9cdddba510..951fdd88fc 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -802,60 +802,18 @@ BufFileTellBlock(BufFile *file)
 #endif
 
 /*
- * Return the current file size.  Counts any holes left behind by
- * BufFileViewAppend as part of the size.
+ * Return the current file size.
  */
 off_t
 BufFileSize(BufFile *file)
 {
-	return ((file->numFiles - 1) * (off_t) MAX_PHYSICAL_FILESIZE) +
-		FileGetSize(file->files[file->numFiles - 1]);
-}
-
-/*
- * Append the contents of source file (managed within shared fileset) to
- * end of target file (managed within same shared fileset).
- *
- * Note that operation subsumes ownership of underlying resources from
- * "source".  Caller should never call BufFileClose against source having
- * called here first.  Resource owners for source and target must match,
- * too.
- *
- * This operation works by manipulating lists of segment files, so the
- * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
- * boundary, typically creating empty holes before the boundary.  These
- * areas do not contain any interesting data, and cannot be read from by
- * caller.
- *
- * Returns the block number within target where the contents of source
- * begins.  Caller should apply this as an offset when working off block
- * positions that are in terms of the original BufFile space.
- */
-long
-BufFileAppend(BufFile *target, BufFile *source)
-{
-	long		startBlock = target->numFiles * BUFFILE_SEG_SIZE;
-	int			newNumFiles = target->numFiles + source->numFiles;
-	int			i;
-
-	Assert(target->fileset != NULL);
-	Assert(source->readOnly);
-	Assert(!source->dirty);
-	Assert(source->fileset != NULL);
+	off_t		lastFileSize;
 
-	if (target->resowner != source->resowner)
-		elog(ERROR, "could not append BufFile with non-matching resource owner");
+	lastFileSize = FileSeek(file->files[file->numFiles - 1], 0, SEEK_END);
+	if (lastFileSize < 0)
+		return -1;
+	file->offsets[file->numFiles - 1] = lastFileSize;
 
-	target->files = (File *)
-		repalloc(target->files, sizeof(File) * newNumFiles);
-	target->offsets = (off_t *)
-		repalloc(target->offsets, sizeof(off_t) * newNumFiles);
-	for (i = target->numFiles; i < newNumFiles; i++)
-	{
-		target->files[i] = source->files[i - target->numFiles];
-		target->offsets[i] = 0L;
-	}
-	target->numFiles = newNumFiles;
-
-	return startBlock;
+	return ((file->numFiles - 1) * (off_t) MAX_PHYSICAL_FILESIZE) +
+		lastFileSize;
 }
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index afce5dadc0..441f18dcf5 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2256,16 +2256,6 @@ FileGetRawMode(File file)
 }
 
 /*
- * FileGetSize - returns the size of file
- */
-off_t
-FileGetSize(File file)
-{
-	Assert(FileIsValid(file));
-	return VfdCache[file].fileSize;
-}
-
-/*
  * Make room for another allocatedDescs[] array entry if needed and possible.
  * Returns true if an array element is available.
  */
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 19eb2fddca..891a663d31 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -64,12 +64,11 @@
  * palloc context.
  *
  * To support parallel sort operations involving coordinated callers to
- * tuplesort.c routines across multiple workers, it is necessary to
- * concatenate each worker BufFile/tapeset into one single logical tapeset
- * managed by the leader.  Workers should have produced one final
- * materialized tape (their entire output) when this happens in leader.
- * There will always be the same number of runs as input tapes, and the same
- * number of input tapes as participants (worker Tuplesortstates).
+ * tuplesort.c routines across multiple workers, tapes and tape sets can
+ * be passed between processes.  To do that, a SharedFileSet must be
+ * specified when the tape set is created, and the tapes must be frozen
+ * before exporting them.  In the receiving process, use
+ * LogicalTapeSetImport() and LogicalTapeImport().
  *
  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -127,9 +126,12 @@ typedef struct TapeBlockTrailer
  */
 typedef struct LogicalTape
 {
+	LogicalTapeSet *tapeSet;	/* tape set this tape belongs to */
+
 	bool		writing;		/* T while in write phase */
 	bool		frozen;			/* T if blocks should not be freed when read */
-	bool		dirty;			/* does buffer need to be written? */
+
+	long		numBlocks;		/* Length of the tape, in blocks */
 
 	/*
 	 * Block numbers of the first, current, and next block of the tape.
@@ -138,21 +140,16 @@ typedef struct LogicalTape
 	 * a frozen tape.  (When reading from an unfrozen tape, we use a larger
 	 * read buffer that holds multiple blocks, so the "current" block is
 	 * ambiguous.)
-	 *
-	 * When concatenation of worker tape BufFiles is performed, an offset to
-	 * the first block in the unified BufFile space is applied during reads.
 	 */
 	long		firstBlockNumber;
 	long		curBlockNumber;
 	long		nextBlockNumber;
-	long		offsetBlockNumber;
 
 	/*
 	 * Buffer for current data block(s).
 	 */
 	char	   *buffer;			/* physical buffer (separately palloc'd) */
 	int			buffer_size;	/* allocated size of the buffer */
-	int			max_size;		/* highest useful, safe buffer_size */
 	int			pos;			/* next read/write position in buffer */
 	int			nbytes;			/* total # of valid bytes in buffer */
 } LogicalTape;
@@ -173,13 +170,10 @@ struct LogicalTapeSet
 	 * by ltsGetFreeBlock(), and it is always greater than or equal to
 	 * nBlocksWritten.  Blocks between nBlocksAllocated and nBlocksWritten are
 	 * blocks that have been allocated for a tape, but have not been written
-	 * to the underlying file yet.  nHoleBlocks tracks the total number of
-	 * blocks that are in unused holes between worker spaces following BufFile
-	 * concatenation.
+	 * to the underlying file yet.
 	 */
 	long		nBlocksAllocated;	/* # of blocks allocated */
 	long		nBlocksWritten; /* # of blocks used in underlying file */
-	long		nHoleBlocks;	/* # of "hole" blocks left */
 
 	/*
 	 * We store the numbers of recycled-and-available blocks in freeBlocks[].
@@ -199,18 +193,12 @@ struct LogicalTapeSet
 	long	   *freeBlocks;		/* resizable array */
 	int			nFreeBlocks;	/* # of currently free blocks */
 	int			freeBlocksLen;	/* current allocated length of freeBlocks[] */
-
-	/* The array of logical tapes. */
-	int			nTapes;			/* # of logical tapes in set */
-	LogicalTape tapes[FLEXIBLE_ARRAY_MEMBER];	/* has nTapes nentries */
 };
 
 static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
 static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
 static long ltsGetFreeBlock(LogicalTapeSet *lts);
 static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
-static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
-					 SharedFileSet *fileset);
 
 
 /*
@@ -232,11 +220,6 @@ ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
 	 * previous tape isn't flushed to disk until the end of the sort, so you
 	 * get one-block hole, where the last block of the previous tape will
 	 * later go.
-	 *
-	 * Note that BufFile concatenation can leave "holes" in BufFile between
-	 * worker-owned block ranges.  These are tracked for reporting purposes
-	 * only.  We never read from nor write to these hole blocks, and so they
-	 * are not considered here.
 	 */
 	while (blocknum > lts->nBlocksWritten)
 	{
@@ -283,7 +266,7 @@ ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
  * Returns true if anything was read, 'false' on EOF.
  */
 static bool
-ltsReadFillBuffer(LogicalTapeSet *lts, LogicalTape *lt)
+ltsReadFillBuffer(LogicalTape *lt)
 {
 	lt->pos = 0;
 	lt->nbytes = 0;
@@ -296,13 +279,11 @@ ltsReadFillBuffer(LogicalTapeSet *lts, LogicalTape *lt)
 		/* Fetch next block number */
 		if (datablocknum == -1L)
 			break;				/* EOF */
-		/* Apply worker offset, needed for leader tapesets */
-		datablocknum += lt->offsetBlockNumber;
 
 		/* Read the block */
-		ltsReadBlock(lts, datablocknum, (void *) thisbuf);
+		ltsReadBlock(lt->tapeSet, datablocknum, (void *) thisbuf);
 		if (!lt->frozen)
-			ltsReleaseBlock(lts, datablocknum);
+			ltsReleaseBlock(lt->tapeSet, datablocknum);
 		lt->curBlockNumber = lt->nextBlockNumber;
 
 		lt->nbytes += TapeBlockGetNBytes(thisbuf);
@@ -398,204 +379,197 @@ ltsReleaseBlock(LogicalTapeSet *lts, long blocknum)
 }
 
 /*
- * Claim ownership of a set of logical tapes from existing shared BufFiles.
+ * Create an empty tape set in a new temporary file.
  *
- * Caller should be leader process.  Though tapes are marked as frozen in
- * workers, they are not frozen when opened within leader, since unfrozen tapes
- * use a larger read buffer. (Frozen tapes have smaller read buffer, optimized
- * for random access.)
+ * If 'fileset' is given, a result tape can be passed to a different
+ * process after calling LogicalTapeFreeze().  This is used in parallel
+ * sorts.  Each worker processes creates a tapeset in the same 'fileset',
+ * with a different 'worker' identifier. The leader process collects
+ * the result tapes from each worker process, imports them with
+ * LogicalTapeImport(), and performs the final merge.
+ *
+ * 'worker' is ignored if 'fileset' is not given.  Pass -1 to be tidy.
  */
-static void
-ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
-					 SharedFileSet *fileset)
+LogicalTapeSet *
+LogicalTapeSetCreate(SharedFileSet *fileset, int worker)
 {
-	LogicalTape *lt = NULL;
-	long		tapeblocks = 0L;
-	long		nphysicalblocks = 0L;
-	int			i;
+	LogicalTapeSet *lts;
 
-	/* Should have at least one worker tape, plus leader's tape */
-	Assert(lts->nTapes >= 2);
+	/*
+	 * Create top-level struct.
+	 */
+	lts = (LogicalTapeSet *) palloc(sizeof(LogicalTapeSet));
+	lts->nBlocksAllocated = 0L;
+	lts->nBlocksWritten = 0L;
+	lts->forgetFreeSpace = false;
+	lts->blocksSorted = true;	/* a zero-length array is sorted ... */
+	lts->freeBlocksLen = 32;	/* reasonable initial guess */
+	lts->freeBlocks = (long *) palloc(lts->freeBlocksLen * sizeof(long));
+	lts->nFreeBlocks = 0;
 
 	/*
-	 * Build concatenated view of all BufFiles, remembering the block number
-	 * where each source file begins.  No changes are needed for leader/last
-	 * tape.
+	 * Create temp BufFile storage as required.
+	 *
+	 * Workers use a shared fileset, so that it can be passed to the leader
+	 * process, whereas serial sorts use a conventional serial BufFile.
 	 */
-	for (i = 0; i < lts->nTapes - 1; i++)
+	if (fileset)
 	{
 		char		filename[MAXPGPATH];
-		BufFile    *file;
 
-		lt = &lts->tapes[i];
-
-		pg_itoa(i, filename);
-		file = BufFileOpenShared(fileset, filename);
-
-		/*
-		 * Stash first BufFile, and concatenate subsequent BufFiles to that.
-		 * Store block offset into each tape as we go.
-		 */
-		lt->firstBlockNumber = shared[i].firstblocknumber;
-		if (i == 0)
-		{
-			lts->pfile = file;
-			lt->offsetBlockNumber = 0L;
-		}
-		else
-		{
-			lt->offsetBlockNumber = BufFileAppend(lts->pfile, file);
-		}
-		/* Don't allocate more for read buffer than could possibly help */
-		lt->max_size = Min(MaxAllocSize, shared[i].buffilesize);
-		tapeblocks = shared[i].buffilesize / BLCKSZ;
-		nphysicalblocks += tapeblocks;
+		pg_itoa(worker, filename);
+		lts->pfile = BufFileCreateShared(fileset, filename);
 	}
+	else
+		lts->pfile = BufFileCreateTemp(false);
 
-	/*
-	 * Set # of allocated blocks, as well as # blocks written.  Use extent of
-	 * new BufFile space (from 0 to end of last worker's tape space) for this.
-	 * Allocated/written blocks should include space used by holes left
-	 * between concatenated BufFiles.
-	 */
-	lts->nBlocksAllocated = lt->offsetBlockNumber + tapeblocks;
-	lts->nBlocksWritten = lts->nBlocksAllocated;
-
-	/*
-	 * Compute number of hole blocks so that we can later work backwards, and
-	 * instrument number of physical blocks.  We don't simply use physical
-	 * blocks directly for instrumentation because this would break if we ever
-	 * subsequently wrote to worker tape.
-	 *
-	 * Working backwards like this keeps our options open.  If shared BufFiles
-	 * ever support being written to post-export, logtape.c can automatically
-	 * take advantage of that.  We'd then support writing to the leader tape
-	 * while recycling space from worker tapes, because the leader tape has a
-	 * zero offset (write routines won't need to have extra logic to apply an
-	 * offset).
-	 *
-	 * The only thing that currently prevents writing to the leader tape from
-	 * working is the fact that BufFiles opened using BufFileOpenShared() are
-	 * read-only by definition, but that could be changed if it seemed
-	 * worthwhile.  For now, writing to the leader tape will raise a "Bad file
-	 * descriptor" error, so tuplesort must avoid writing to the leader tape
-	 * altogether.
-	 */
-	lts->nHoleBlocks = lts->nBlocksAllocated - nphysicalblocks;
+	return lts;
 }
 
 /*
- * Create a set of logical tapes in a temporary underlying file.
+ * Claim ownership of a tape set from an existing shared BufFile.
  *
- * Each tape is initialized in write state.  Serial callers pass ntapes,
- * NULL argument for shared, and -1 for worker.  Parallel worker callers
- * pass ntapes, a shared file handle, NULL shared argument,  and their own
- * worker number.  Leader callers, which claim shared worker tapes here,
- * must supply non-sentinel values for all arguments except worker number,
- * which should be -1.
+ * Caller should be leader process.  Though tapes are marked as frozen in
+ * workers, they are not frozen when opened within leader, since unfrozen tapes
+ * use a larger read buffer. (Frozen tapes have smaller read buffer, optimized
+ * for random access.)
+ *
+ * LogicalTapeImport() constructs a tapeset object, and attaches the existing
+ * tape to it.  It is assumed that workers export their entire output as only
+ * one final materialized tape.
  *
- * Leader caller is passing back an array of metadata each worker captured
- * when LogicalTapeFreeze() was called for their final result tapes.  Passed
- * tapes array is actually sized ntapes - 1, because it includes only
- * worker tapes, whereas leader requires its own leader tape.  Note that we
- * rely on the assumption that reclaimed worker tapes will only be read
- * from once by leader, and never written to again (tapes are initialized
- * for writing, but that's only to be consistent).  Leader may not write to
- * its own tape purely due to a restriction in the shared buffile
- * infrastructure that may be lifted in the future.
+ * NOTE: shared files are currently read-only. That could be changed if it
+ * seemed worthwhile, but for now, you cannot write to an imported tape, or
+ * you get a "Bad file descriptor" error.
  */
 LogicalTapeSet *
-LogicalTapeSetCreate(int ntapes, TapeShare *shared, SharedFileSet *fileset,
-					 int worker)
+LogicalTapeSetImport(SharedFileSet *fileset, int worker)
 {
 	LogicalTapeSet *lts;
-	LogicalTape *lt;
-	int			i;
+	char		filename[MAXPGPATH];
+	off_t		buffilesize;
 
 	/*
-	 * Create top-level struct including per-tape LogicalTape structs.
+	 * Create a tapeset that points to the existing file.
+	 *
+	 * Since we won't be writing, set 'forgetFreeSpace'.
 	 */
-	Assert(ntapes > 0);
-	lts = (LogicalTapeSet *) palloc(offsetof(LogicalTapeSet, tapes) +
-									ntapes * sizeof(LogicalTape));
+	lts = (LogicalTapeSet *) palloc(sizeof(LogicalTapeSet));
 	lts->nBlocksAllocated = 0L;
 	lts->nBlocksWritten = 0L;
-	lts->nHoleBlocks = 0L;
-	lts->forgetFreeSpace = false;
+	lts->forgetFreeSpace = true;
 	lts->blocksSorted = true;	/* a zero-length array is sorted ... */
-	lts->freeBlocksLen = 32;	/* reasonable initial guess */
-	lts->freeBlocks = (long *) palloc(lts->freeBlocksLen * sizeof(long));
+	lts->freeBlocksLen = 0;
+	lts->freeBlocks = NULL;
 	lts->nFreeBlocks = 0;
-	lts->nTapes = ntapes;
 
-	/*
-	 * Initialize per-tape structs.  Note we allocate the I/O buffer and the
-	 * first block for a tape only when it is first actually written to.  This
-	 * avoids wasting memory space when tuplesort.c overestimates the number
-	 * of tapes needed.
-	 */
-	for (i = 0; i < ntapes; i++)
-	{
-		lt = &lts->tapes[i];
-		lt->writing = true;
-		lt->frozen = false;
-		lt->dirty = false;
-		lt->firstBlockNumber = -1L;
-		lt->curBlockNumber = -1L;
-		lt->nextBlockNumber = -1L;
-		lt->offsetBlockNumber = 0L;
-		lt->buffer = NULL;
-		lt->buffer_size = 0;
-		/* palloc() larger than MaxAllocSize would fail */
-		lt->max_size = MaxAllocSize;
-		lt->pos = 0;
-		lt->nbytes = 0;
-	}
+	pg_itoa(worker, filename);
+	lts->pfile = BufFileOpenShared(fileset, filename);
 
 	/*
-	 * Create temp BufFile storage as required.
-	 *
-	 * Leader concatenates worker tapes, which requires special adjustment to
-	 * final tapeset data.  Things are simpler for the worker case and the
-	 * serial case, though.  They are generally very similar -- workers use a
-	 * shared fileset, whereas serial sorts use a conventional serial BufFile.
+	 * Set # of allocated blocks, as well as # blocks written, to reflect
+	 * the existing file's size.
 	 */
-	if (shared)
-		ltsConcatWorkerTapes(lts, shared, fileset);
-	else if (fileset)
-	{
-		char		filename[MAXPGPATH];
-
-		pg_itoa(worker, filename);
-		lts->pfile = BufFileCreateShared(fileset, filename);
-	}
-	else
-		lts->pfile = BufFileCreateTemp(false);
+	buffilesize = BufFileSize(lts->pfile);
+	lts->nBlocksAllocated = lts->nBlocksWritten = buffilesize / BLCKSZ;
 
 	return lts;
 }
 
 /*
  * Close a logical tape set and release all resources.
+ *
+ * NOTE: This doesn't close any of the tapes!  You must close them
+ * first, or you can let them be destroyed along with the memory context.
  */
 void
 LogicalTapeSetClose(LogicalTapeSet *lts)
 {
-	LogicalTape *lt;
-	int			i;
-
 	BufFileClose(lts->pfile);
-	for (i = 0; i < lts->nTapes; i++)
-	{
-		lt = &lts->tapes[i];
-		if (lt->buffer)
-			pfree(lt->buffer);
-	}
-	pfree(lts->freeBlocks);
+	if (lts->freeBlocks)
+		pfree(lts->freeBlocks);
 	pfree(lts);
 }
 
 /*
+ * Create a logical tape in the given tapeset.
+ *
+ * The tape is initialized in write state.
+ */
+LogicalTape *
+LogicalTapeCreate(LogicalTapeSet *lts)
+{
+	LogicalTape *lt;
+
+	lt = palloc(sizeof(LogicalTape));
+
+	/*
+	 * Initialize per-tape structs.  Note we allocate the I/O buffer and the
+	 * first block for a tape only when it is first actually written to.
+	 */
+	lt->tapeSet = lts;
+	lt->writing = true;
+	lt->frozen = false;
+	lt->firstBlockNumber = -1L;
+	lt->curBlockNumber = -1L;
+	lt->nextBlockNumber = -1L;
+	lt->buffer = NULL;
+	lt->buffer_size = 0;
+	lt->pos = 0;
+	lt->nbytes = 0;
+	lt->numBlocks = 0;
+
+	return lt;
+}
+
+/*
+ * Register a logical tape in a logical tape set that was imported from
+ * another process.
+ *
+ * Caller should be leader process.  Though tapes are marked as frozen in
+ * workers, they are not frozen when opened within leader, since unfrozen tapes
+ * use a larger read buffer. (Frozen tapes have smaller read buffer, optimized
+ * for random access.)
+ *
+ * NOTE: The imported tape is read-only!
+ */
+LogicalTape *
+LogicalTapeImport(LogicalTapeSet *lts, TapeShare *shared)
+{
+	LogicalTape *lt;
+
+	/*
+	 * Create a tape object, pointing to the block within the file where the
+	 * tape begins.
+	 */
+	lt = LogicalTapeCreate(lts);
+	lt->firstBlockNumber = shared->firstblocknumber;
+
+	/*
+	 * Assume that the tape fills up the whole file. That's not necessarily
+	 * quite right, but 'numBlocks' doesn't need to be accurate.
+	 */
+	lt->numBlocks = lts->nBlocksAllocated;
+
+	return lt;
+}
+
+/*
+ * Close a logical tape.
+ *
+ * Note: This doesn't return any blocks to the free list!  You must
+ * read the tape to the end first, to reuse the space.  In current use,
+ * though, we only close tapes after fully reading them.
+ */
+void
+LogicalTapeClose(LogicalTape *lt)
+{
+	if (lt->buffer)
+		pfree(lt->buffer);
+	pfree(lt);
+}
+
+/*
  * Mark a logical tape set as not needing management of free space anymore.
  *
  * This should be called if the caller does not intend to write any more data
@@ -611,34 +585,38 @@ LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts)
 }
 
 /*
+ * Obtain total disk space currently used by a LogicalTapeSet, in blocks.
+ */
+long
+LogicalTapeSetBlocks(LogicalTapeSet *lts)
+{
+	return lts->nBlocksAllocated;
+}
+
+/*
  * Write to a logical tape.
  *
  * There are no error returns; we ereport() on failure.
  */
 void
-LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
-				 void *ptr, size_t size)
+LogicalTapeWrite(LogicalTape *lt, void *ptr, size_t size)
 {
-	LogicalTape *lt;
 	size_t		nthistime;
 
-	Assert(tapenum >= 0 && tapenum < lts->nTapes);
-	lt = &lts->tapes[tapenum];
 	Assert(lt->writing);
-	Assert(lt->offsetBlockNumber == 0L);
 
 	/* Allocate data buffer and first block on first write */
 	if (lt->buffer == NULL)
 	{
 		lt->buffer = (char *) palloc(BLCKSZ);
 		lt->buffer_size = BLCKSZ;
-	}
-	if (lt->curBlockNumber == -1)
-	{
+
+		Assert(lt->curBlockNumber == -1);
 		Assert(lt->firstBlockNumber == -1);
 		Assert(lt->pos == 0);
+		Assert(lt->nbytes == 0);
 
-		lt->curBlockNumber = ltsGetFreeBlock(lts);
+		lt->curBlockNumber = ltsGetFreeBlock(lt->tapeSet);
 		lt->firstBlockNumber = lt->curBlockNumber;
 
 		TapeBlockGetTrailer(lt->buffer)->prev = -1L;
@@ -652,21 +630,16 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
 			/* Buffer full, dump it out */
 			long		nextBlockNumber;
 
-			if (!lt->dirty)
-			{
-				/* Hmm, went directly from reading to writing? */
-				elog(ERROR, "invalid logtape state: should be dirty");
-			}
-
 			/*
 			 * First allocate the next block, so that we can store it in the
 			 * 'next' pointer of this block.
 			 */
-			nextBlockNumber = ltsGetFreeBlock(lts);
+			nextBlockNumber = ltsGetFreeBlock(lt->tapeSet);
+			lt->numBlocks++;
 
 			/* set the next-pointer and dump the current block. */
 			TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
-			ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+			ltsWriteBlock(lt->tapeSet, lt->curBlockNumber, (void *) lt->buffer);
 
 			/* initialize the prev-pointer of the next block */
 			TapeBlockGetTrailer(lt->buffer)->prev = lt->curBlockNumber;
@@ -682,7 +655,6 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
 
 		memcpy(lt->buffer + lt->pos, ptr, nthistime);
 
-		lt->dirty = true;
 		lt->pos += nthistime;
 		if (lt->nbytes < lt->pos)
 			lt->nbytes = lt->pos;
@@ -704,13 +676,8 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
  * byte buffer is used.
  */
 void
-LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
+LogicalTapeRewindForRead(LogicalTape *lt, size_t buffer_size)
 {
-	LogicalTape *lt;
-
-	Assert(tapenum >= 0 && tapenum < lts->nTapes);
-	lt = &lts->tapes[tapenum];
-
 	/*
 	 * Round and cap buffer_size if needed.
 	 */
@@ -718,13 +685,22 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
 		buffer_size = BLCKSZ;
 	else
 	{
+		/*
+		 * The buffer doesn't need to be larger than the tape.
+		 */
+		if (buffer_size / BLCKSZ > lt->numBlocks)
+			buffer_size = lt->numBlocks * BLCKSZ;
+
 		/* need at least one block */
 		if (buffer_size < BLCKSZ)
 			buffer_size = BLCKSZ;
 
-		/* palloc() larger than max_size is unlikely to be helpful */
-		if (buffer_size > lt->max_size)
-			buffer_size = lt->max_size;
+		/*
+		 * palloc() larger MaxAllocSize would fail (a multi-gigabyte
+		 * buffer is unlikely to be helpful, anyway)
+		 */
+		if (buffer_size > MaxAllocSize)
+			buffer_size = MaxAllocSize;
 
 		/* round down to BLCKSZ boundary */
 		buffer_size -= buffer_size % BLCKSZ;
@@ -736,7 +712,7 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
 		 * Completion of a write phase.  Flush last partial data block, and
 		 * rewind for normal (destructive) read.
 		 */
-		if (lt->dirty)
+		if (lt->curBlockNumber != -1)
 		{
 			/*
 			 * As long as we've filled the buffer at least once, its contents
@@ -751,7 +727,7 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
 									  lt->buffer_size - lt->nbytes);
 
 			TapeBlockSetNBytes(lt->buffer, lt->nbytes);
-			ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+			ltsWriteBlock(lt->tapeSet, lt->curBlockNumber, (void *) lt->buffer);
 		}
 		lt->writing = false;
 	}
@@ -779,36 +755,7 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
 	lt->nextBlockNumber = lt->firstBlockNumber;
 	lt->pos = 0;
 	lt->nbytes = 0;
-	ltsReadFillBuffer(lts, lt);
-}
-
-/*
- * Rewind logical tape and switch from reading to writing.
- *
- * NOTE: we assume the caller has read the tape to the end; otherwise
- * untouched data will not have been freed. We could add more code to free
- * any unread blocks, but in current usage of this module it'd be useless
- * code.
- */
-void
-LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum)
-{
-	LogicalTape *lt;
-
-	Assert(tapenum >= 0 && tapenum < lts->nTapes);
-	lt = &lts->tapes[tapenum];
-
-	Assert(!lt->writing && !lt->frozen);
-	lt->writing = true;
-	lt->dirty = false;
-	lt->firstBlockNumber = -1L;
-	lt->curBlockNumber = -1L;
-	lt->pos = 0;
-	lt->nbytes = 0;
-	if (lt->buffer)
-		pfree(lt->buffer);
-	lt->buffer = NULL;
-	lt->buffer_size = 0;
+	ltsReadFillBuffer(lt);
 }
 
 /*
@@ -817,15 +764,11 @@ LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum)
  * Early EOF is indicated by return value less than #bytes requested.
  */
 size_t
-LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
-				void *ptr, size_t size)
+LogicalTapeRead(LogicalTape *lt, void *ptr, size_t size)
 {
-	LogicalTape *lt;
 	size_t		nread = 0;
 	size_t		nthistime;
 
-	Assert(tapenum >= 0 && tapenum < lts->nTapes);
-	lt = &lts->tapes[tapenum];
 	Assert(!lt->writing);
 
 	while (size > 0)
@@ -833,7 +776,7 @@ LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
 		if (lt->pos >= lt->nbytes)
 		{
 			/* Try to load more data into buffer. */
-			if (!ltsReadFillBuffer(lts, lt))
+			if (!ltsReadFillBuffer(lt))
 				break;			/* EOF */
 		}
 
@@ -871,20 +814,15 @@ LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
  * Serial sorts should set share to NULL.
  */
 void
-LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, TapeShare *share)
+LogicalTapeFreeze(LogicalTape *lt, TapeShare *share)
 {
-	LogicalTape *lt;
-
-	Assert(tapenum >= 0 && tapenum < lts->nTapes);
-	lt = &lts->tapes[tapenum];
 	Assert(lt->writing);
-	Assert(lt->offsetBlockNumber == 0L);
 
 	/*
 	 * Completion of a write phase.  Flush last partial data block, and rewind
 	 * for nondestructive read.
 	 */
-	if (lt->dirty)
+	if (lt->curBlockNumber != -1)
 	{
 		/*
 		 * As long as we've filled the buffer at least once, its contents are
@@ -898,8 +836,7 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, TapeShare *share)
 								  lt->buffer_size - lt->nbytes);
 
 		TapeBlockSetNBytes(lt->buffer, lt->nbytes);
-		ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
-		lt->writing = false;
+		ltsWriteBlock(lt->tapeSet, lt->curBlockNumber, (void *) lt->buffer);
 	}
 	lt->writing = false;
 	lt->frozen = true;
@@ -926,7 +863,7 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, TapeShare *share)
 
 	if (lt->firstBlockNumber == -1L)
 		lt->nextBlockNumber = -1L;
-	ltsReadBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+	ltsReadBlock(lt->tapeSet, lt->curBlockNumber, (void *) lt->buffer);
 	if (TapeBlockIsLast(lt->buffer))
 		lt->nextBlockNumber = -1L;
 	else
@@ -936,9 +873,8 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, TapeShare *share)
 	/* Handle extra steps when caller is to share its tapeset */
 	if (share)
 	{
-		BufFileExportShared(lts->pfile);
+		BufFileExportShared(lt->tapeSet->pfile);
 		share->firstblocknumber = lt->firstBlockNumber;
-		share->buffilesize = BufFileSize(lts->pfile);
 	}
 }
 
@@ -956,13 +892,10 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, TapeShare *share)
  * that case.
  */
 size_t
-LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
+LogicalTapeBackspace(LogicalTape *lt, size_t size)
 {
-	LogicalTape *lt;
 	size_t		seekpos = 0;
 
-	Assert(tapenum >= 0 && tapenum < lts->nTapes);
-	lt = &lts->tapes[tapenum];
 	Assert(lt->frozen);
 	Assert(lt->buffer_size == BLCKSZ);
 
@@ -994,7 +927,7 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
 			return seekpos;
 		}
 
-		ltsReadBlock(lts, prev, (void *) lt->buffer);
+		ltsReadBlock(lt->tapeSet, prev, (void *) lt->buffer);
 
 		if (TapeBlockGetTrailer(lt->buffer)->next != lt->curBlockNumber)
 			elog(ERROR, "broken tape, next of block %ld is %ld, expected %ld",
@@ -1027,20 +960,15 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
  * LogicalTapeTell().
  */
 void
-LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
-				long blocknum, int offset)
+LogicalTapeSeek(LogicalTape *lt, long blocknum, int offset)
 {
-	LogicalTape *lt;
-
-	Assert(tapenum >= 0 && tapenum < lts->nTapes);
-	lt = &lts->tapes[tapenum];
 	Assert(lt->frozen);
 	Assert(offset >= 0 && offset <= TapeBlockPayloadSize);
 	Assert(lt->buffer_size == BLCKSZ);
 
 	if (blocknum != lt->curBlockNumber)
 	{
-		ltsReadBlock(lts, blocknum, (void *) lt->buffer);
+		ltsReadBlock(lt->tapeSet, blocknum, (void *) lt->buffer);
 		lt->curBlockNumber = blocknum;
 		lt->nbytes = TapeBlockPayloadSize;
 		lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
@@ -1058,27 +986,11 @@ LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
  * the position for a seek after freezing.  Not clear if anyone needs that.
  */
 void
-LogicalTapeTell(LogicalTapeSet *lts, int tapenum,
-				long *blocknum, int *offset)
+LogicalTapeTell(LogicalTape *lt, long *blocknum, int *offset)
 {
-	LogicalTape *lt;
-
-	Assert(tapenum >= 0 && tapenum < lts->nTapes);
-	lt = &lts->tapes[tapenum];
-	Assert(lt->offsetBlockNumber == 0L);
-
 	/* With a larger buffer, 'pos' wouldn't be the same as offset within page */
 	Assert(lt->buffer_size == BLCKSZ);
 
 	*blocknum = lt->curBlockNumber;
 	*offset = lt->pos;
 }
-
-/*
- * Obtain total disk space currently used by a LogicalTapeSet, in blocks.
- */
-long
-LogicalTapeSetBlocks(LogicalTapeSet *lts)
-{
-	return lts->nBlocksAllocated - lts->nHoleBlocks;
-}
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index e6a8d22feb..01e012412a 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -11,13 +11,18 @@
  * algorithm.
  *
  * See Knuth, volume 3, for more than you want to know about the external
- * sorting algorithm.  Historically, we divided the input into sorted runs
- * using replacement selection, in the form of a priority tree implemented
- * as a heap (essentially his Algorithm 5.2.3H), but now we always use
- * quicksort for run generation.  We merge the runs using polyphase merge,
- * Knuth's Algorithm 5.4.2D.  The logical "tapes" used by Algorithm D are
- * implemented by logtape.c, which avoids space wastage by recycling disk
- * space as soon as each block is read from its "tape".
+ * sorting algorithms. The algorithm we use is a balanced k-way merge.
+ * Before PostgreSQL 10, we used the polyphase merge algorithm (Knuth's
+ * Algorithm 5.4.2D), but with modern hardware, a straightforward
+ * balanced merge is better.  Knuth is assuming that tape drives are
+ * expensive beasts, and in particular that there will always be many more
+ * runs than tape drives.  The polyphase merge algorithm was good at keeping
+ * all the tape drives busy, but in our implementation a "tape drive"
+ * doesn't cost much more than a few Kb of memory buffers, so we can afford
+ * to have lots of them.  In particular, if we can have as many tape drives
+ * as sorted runs, we can eliminate any repeated I/O at all.  The logical
+ * "tapes" are implemented by logtape.c, which avoids space wastage by
+ * recycling disk space as soon as each block is read from its "tape".
  *
  * The approximate amount of memory allowed for any one sort operation
  * is specified in kilobytes by the caller (most pass work_mem).  Initially,
@@ -27,9 +32,8 @@
  * tuples just by scanning the tuple array sequentially.  If we do exceed
  * workMem, we begin to emit tuples into sorted runs in temporary tapes.
  * When tuples are dumped in batch after quicksorting, we begin a new run
- * with a new output tape (selected per Algorithm D).  After the end of the
- * input is reached, we dump out remaining tuples in memory into a final run,
- * then merge the runs using Algorithm D.
+ * with a new output tape.  After the end of the input is reached, we dump
+ * out remaining tuples in memory into a final run, then merge the runs.
  *
  * When merging runs, we use a heap containing just the frontmost tuple from
  * each source run; we repeatedly output the smallest tuple and replace it
@@ -52,6 +56,14 @@
  * accesses.  The pre-reading is handled by logtape.c, we just tell it how
  * much memory to use for the buffers.
  *
+ * In the current code we determine the number of input tapes M on the basis
+ * of workMem: we want workMem/M to be large enough that we read a fair
+ * amount of data each time we read from a tape, so as to maintain the
+ * locality of access described above.  Nonetheless, with large workMem we
+ * can have many tapes.  The logical "tapes" are implemented by logtape.c,
+ * which avoids space wastage by recycling disk space as soon as each block
+ * is read from its "tape".
+ *
  * When the caller requests random access to the sort result, we form
  * the final sorted run on a logical tape which is then "frozen", so
  * that we can access it randomly.  When the caller does not need random
@@ -60,20 +72,6 @@
  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
  * saves one cycle of writing all the data out to disk and reading it in.
  *
- * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the
- * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according
- * to Knuth's figure 70 (section 5.4.2).  However, Knuth is assuming that
- * tape drives are expensive beasts, and in particular that there will always
- * be many more runs than tape drives.  In our implementation a "tape drive"
- * doesn't cost much more than a few Kb of memory buffers, so we can afford
- * to have lots of them.  In particular, if we can have as many tape drives
- * as sorted runs, we can eliminate any repeated I/O at all.  In the current
- * code we determine the number of tapes M on the basis of workMem: we want
- * workMem/M to be large enough that we read a fair amount of data each time
- * we preread from a tape, so as to maintain the locality of access described
- * above.  Nonetheless, with large workMem we can have many tapes (but not
- * too many -- see the comments in tuplesort_merge_order).
- *
  * This module supports parallel sorting.  Parallel sorts involve coordination
  * among one or more worker processes, and a leader process, each with its own
  * tuplesort state.  The leader process (or, more accurately, the
@@ -241,10 +239,11 @@ struct Tuplesortstate
 	bool		tuples;			/* Can SortTuple.tuple ever be set? */
 	int64		availMem;		/* remaining memory available, in bytes */
 	int64		allowedMem;		/* total memory allowed, in bytes */
-	int			maxTapes;		/* number of tapes (Knuth's T) */
-	int			tapeRange;		/* maxTapes-1 (Knuth's P) */
+	int			maxInputTapes;	/* max number of input tapes */
 	MemoryContext sortcontext;	/* memory context holding most sort data */
 	MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
+
+	List	   *tapesets;
 	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */
 
 	/*
@@ -274,7 +273,7 @@ struct Tuplesortstate
 	 * SortTuple struct!), and increase state->availMem by the amount of
 	 * memory space thereby released.
 	 */
-	void		(*writetup) (Tuplesortstate *state, int tapenum,
+	void		(*writetup) (Tuplesortstate *state, LogicalTape *tape,
 							 SortTuple *stup);
 
 	/*
@@ -283,7 +282,7 @@ struct Tuplesortstate
 	 * from the slab memory arena, or is palloc'd, see readtup_alloc().
 	 */
 	void		(*readtup) (Tuplesortstate *state, SortTuple *stup,
-							int tapenum, unsigned int len);
+							LogicalTape *tape, unsigned int len);
 
 	/*
 	 * This array holds the tuples now in sort memory.  If we are in state
@@ -330,8 +329,8 @@ struct Tuplesortstate
 	char	   *slabMemoryEnd;	/* end of slab memory arena */
 	SlabSlot   *slabFreeHead;	/* head of free list */
 
-	/* Buffer size to use for reading input tapes, during merge. */
-	size_t		read_buffer_size;
+	/* Memory to use for input tape buffers, during merge. */
+	size_t		read_buffer_mem;
 
 	/*
 	 * When we return a tuple to the caller in tuplesort_gettuple_XXX, that
@@ -348,36 +347,29 @@ struct Tuplesortstate
 	int			currentRun;
 
 	/*
-	 * Unless otherwise noted, all pointer variables below are pointers to
-	 * arrays of length maxTapes, holding per-tape data.
+	 * Logical tapes, for merging.
+	 *
+	 * The initial runs are written in the output tapes.  In each merge pass,
+	 * the output tapes of the previous pass become the input tapes, and
+	 * new output tapes are allocated as needed.  When nInputTapes == nInputRuns,
+	 * there is only one merge pass left.
 	 */
+	LogicalTape **inputTapes;
+	int			nInputTapes;
+	int			nInputRuns;
 
-	/*
-	 * This variable is only used during merge passes.  mergeactive[i] is true
-	 * if we are reading an input run from (actual) tape number i and have not
-	 * yet exhausted that run.
-	 */
-	bool	   *mergeactive;	/* active input run source? */
+	LogicalTape **outputTapes;
+	int			nOutputTapes;
+	int			nOutputRuns;
 
-	/*
-	 * Variables for Algorithm D.  Note that destTape is a "logical" tape
-	 * number, ie, an index into the tp_xxx[] arrays.  Be careful to keep
-	 * "logical" and "actual" tape numbers straight!
-	 */
-	int			Level;			/* Knuth's l */
-	int			destTape;		/* current output tape (Knuth's j, less 1) */
-	int		   *tp_fib;			/* Target Fibonacci run counts (A[]) */
-	int		   *tp_runs;		/* # of real runs on each tape */
-	int		   *tp_dummy;		/* # of dummy runs for each tape (D[]) */
-	int		   *tp_tapenum;		/* Actual tape numbers (TAPE[]) */
-	int			activeTapes;	/* # of active input tapes in merge pass */
+	LogicalTape	*destTape;		/* current output tape */
 
 	/*
 	 * These variables are used after completion of sorting to keep track of
 	 * the next tuple to return.  (In the tape case, the tape's current read
 	 * position is also critical state.)
 	 */
-	int			result_tape;	/* actual tape number of finished output */
+	LogicalTape	*result_tape;	/* actual tape of finished output */
 	int			current;		/* array index (only used if SORTEDINMEM) */
 	bool		eof_reached;	/* reached EOF (needed for cursors) */
 
@@ -581,9 +573,9 @@ struct Sharedsort
  */
 
 /* When using this macro, beware of double evaluation of len */
-#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \
+#define LogicalTapeReadExact(tape, ptr, len) \
 	do { \
-		if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \
+		if (LogicalTapeRead(tape, ptr, len) != (size_t) (len)) \
 			elog(ERROR, "unexpected end of data"); \
 	} while(0)
 
@@ -600,7 +592,7 @@ static void init_slab_allocator(Tuplesortstate *state, int numSlots);
 static void mergeruns(Tuplesortstate *state);
 static void mergeonerun(Tuplesortstate *state);
 static void beginmerge(Tuplesortstate *state);
-static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup);
+static bool mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup);
 static void dumptuples(Tuplesortstate *state, bool alltuples);
 static void make_bounded_heap(Tuplesortstate *state);
 static void sort_bounded_heap(Tuplesortstate *state);
@@ -609,39 +601,39 @@ static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple);
 static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
 static void tuplesort_heap_delete_top(Tuplesortstate *state);
 static void reversedirection(Tuplesortstate *state);
-static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
-static void markrunend(Tuplesortstate *state, int tapenum);
+static unsigned int getlen(Tuplesortstate *state, LogicalTape *tape, bool eofOK);
+static void markrunend(Tuplesortstate *state, LogicalTape *tape);
 static void *readtup_alloc(Tuplesortstate *state, Size tuplen);
 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
 				Tuplesortstate *state);
 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
-static void writetup_heap(Tuplesortstate *state, int tapenum,
+static void writetup_heap(Tuplesortstate *state, LogicalTape *tape,
 			  SortTuple *stup);
 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
-			 int tapenum, unsigned int len);
+			 LogicalTape *tape, unsigned int len);
 static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
 				   Tuplesortstate *state);
 static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
-static void writetup_cluster(Tuplesortstate *state, int tapenum,
+static void writetup_cluster(Tuplesortstate *state, LogicalTape *tape,
 				 SortTuple *stup);
 static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
-				int tapenum, unsigned int len);
+				LogicalTape *tape, unsigned int len);
 static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 					   Tuplesortstate *state);
 static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
 					  Tuplesortstate *state);
 static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
-static void writetup_index(Tuplesortstate *state, int tapenum,
+static void writetup_index(Tuplesortstate *state, LogicalTape *tape,
 			   SortTuple *stup);
 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
-			  int tapenum, unsigned int len);
+			  LogicalTape *tape, unsigned int len);
 static int comparetup_datum(const SortTuple *a, const SortTuple *b,
 				 Tuplesortstate *state);
 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
-static void writetup_datum(Tuplesortstate *state, int tapenum,
+static void writetup_datum(Tuplesortstate *state, LogicalTape *tape,
 			   SortTuple *stup);
 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
-			  int tapenum, unsigned int len);
+			  LogicalTape *tape, unsigned int len);
 static int	worker_get_identifier(Tuplesortstate *state);
 static void worker_freeze_result_tape(Tuplesortstate *state);
 static void worker_nomergeruns(Tuplesortstate *state);
@@ -741,6 +733,7 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 	state->sortcontext = sortcontext;
 	state->tuplecontext = tuplecontext;
 	state->tapeset = NULL;
+	state->tapesets = NIL;
 
 	state->memtupcount = 0;
 
@@ -764,11 +757,11 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 	state->currentRun = 0;
 
 	/*
-	 * maxTapes, tapeRange, and Algorithm D variables will be initialized by
-	 * inittapes(), if needed
+	 * Tape variables (inputTapes, outputTapes, etc.) will be initialized by
+	 * inittapes(), if needed.
 	 */
 
-	state->result_tape = -1;	/* flag that result tape has not been formed */
+	state->result_tape = NULL;		/* flag that result tape has not been formed */
 
 	/*
 	 * Initialize parallel-related state based on coordination information
@@ -1236,12 +1229,20 @@ tuplesort_end(Tuplesortstate *state)
 {
 	/* context swap probably not needed, but let's be safe */
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
+	ListCell *lc;
+	bool		was_external_sort = (state->tapesets != NIL);
 
 #ifdef TRACE_SORT
 	long		spaceUsed;
 
-	if (state->tapeset)
-		spaceUsed = LogicalTapeSetBlocks(state->tapeset);
+	if (was_external_sort)
+	{
+		spaceUsed = 0;
+		foreach(lc, state->tapesets)
+		{
+			spaceUsed += LogicalTapeSetBlocks((LogicalTapeSet *) lfirst(lc));
+		}
+	}
 	else
 		spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
 #endif
@@ -1251,14 +1252,19 @@ tuplesort_end(Tuplesortstate *state)
 	 *
 	 * Note: want to include this in reported total cost of sort, hence need
 	 * for two #ifdef TRACE_SORT sections.
+	 *
+	 * We don't bother to destroy the individual tapes here, they will go away
+	 * with the sortcontext.
 	 */
-	if (state->tapeset)
-		LogicalTapeSetClose(state->tapeset);
+	foreach(lc, state->tapesets)
+	{
+		LogicalTapeSetClose((LogicalTapeSet *) lfirst(lc));
+	}
 
 #ifdef TRACE_SORT
 	if (trace_sort)
 	{
-		if (state->tapeset)
+		if (was_external_sort)
 			elog(LOG, "%s of %d ended, %ld disk blocks used: %s",
 				 SERIAL(state) ? "external sort" : "parallel external sort",
 				 state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
@@ -1268,14 +1274,14 @@ tuplesort_end(Tuplesortstate *state)
 				 state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
 	}
 
-	TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
+	TRACE_POSTGRESQL_SORT_DONE(was_external_sort, spaceUsed);
 #else
 
 	/*
 	 * If you disabled TRACE_SORT, you can still probe sort__done, but you
 	 * ain't getting space-used stats.
 	 */
-	TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
+	TRACE_POSTGRESQL_SORT_DONE(was_external_sort != NULL, 0L);
 #endif
 
 	/* Free any execution state created for CLUSTER case */
@@ -1879,7 +1885,7 @@ tuplesort_performsort(Tuplesortstate *state)
 	{
 		if (state->status == TSS_FINALMERGE)
 			elog(LOG, "performsort of %d done (except %d-way final merge): %s",
-				 state->worker, state->activeTapes,
+				 state->worker, state->nInputTapes,
 				 pg_rusage_show(&state->ru_start));
 		else
 			elog(LOG, "performsort of %d done: %s",
@@ -2004,8 +2010,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 				 * end of file; back up to fetch last tuple's ending length
 				 * word.  If seek fails we must have a completely empty file.
 				 */
-				nmoved = LogicalTapeBackspace(state->tapeset,
-											  state->result_tape,
+				nmoved = LogicalTapeBackspace(state->result_tape,
 											  2 * sizeof(unsigned int));
 				if (nmoved == 0)
 					return false;
@@ -2019,8 +2024,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 				 * Back up and fetch previously-returned tuple's ending length
 				 * word.  If seek fails, assume we are at start of file.
 				 */
-				nmoved = LogicalTapeBackspace(state->tapeset,
-											  state->result_tape,
+				nmoved = LogicalTapeBackspace(state->result_tape,
 											  sizeof(unsigned int));
 				if (nmoved == 0)
 					return false;
@@ -2031,8 +2035,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 				/*
 				 * Back up to get ending length word of tuple before it.
 				 */
-				nmoved = LogicalTapeBackspace(state->tapeset,
-											  state->result_tape,
+				nmoved = LogicalTapeBackspace(state->result_tape,
 											  tuplen + 2 * sizeof(unsigned int));
 				if (nmoved == tuplen + sizeof(unsigned int))
 				{
@@ -2056,8 +2059,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 			 * Note: READTUP expects we are positioned after the initial
 			 * length word of the tuple, so back up to that point.
 			 */
-			nmoved = LogicalTapeBackspace(state->tapeset,
-										  state->result_tape,
+			nmoved = LogicalTapeBackspace(state->result_tape,
 										  tuplen);
 			if (nmoved != tuplen)
 				elog(ERROR, "bogus tuple length in backward scan");
@@ -2091,7 +2093,8 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 			 */
 			if (state->memtupcount > 0)
 			{
-				int			srcTape = state->memtuples[0].tupindex;
+				int			srcTapeIndex = state->memtuples[0].tupindex;
+				LogicalTape *srcTape = state->inputTapes[srcTapeIndex];
 				SortTuple	newtup;
 
 				*stup = state->memtuples[0];
@@ -2113,16 +2116,16 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 					 * Remove the top node from the heap.
 					 */
 					tuplesort_heap_delete_top(state);
+					state->nInputRuns--;
 
 					/*
-					 * Rewind to free the read buffer.  It'd go away at the
-					 * end of the sort anyway, but better to release the
-					 * memory early.
+					 * Close the tape.  It'd go away at the end of the sort
+					 * anyway, but better to release the memory early.
 					 */
-					LogicalTapeRewindForWrite(state->tapeset, srcTape);
+					LogicalTapeClose(srcTape);
 					return true;
 				}
-				newtup.tupindex = srcTape;
+				newtup.tupindex = srcTapeIndex;
 				tuplesort_heap_replace_top(state, &newtup);
 				return true;
 			}
@@ -2363,8 +2366,8 @@ tuplesort_merge_order(int64 allowedMem)
 	 * array in this calculation, but we effectively treat that as part of the
 	 * MERGE_BUFFER_SIZE workspace.
 	 */
-	mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
-		(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
+	mOrder = (allowedMem) /
+		(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD + TAPE_BUFFER_OVERHEAD);
 
 	/*
 	 * Even in minimum memory, use at least a MINORDER merge.  On the other
@@ -2391,67 +2394,57 @@ tuplesort_merge_order(int64 allowedMem)
 static void
 inittapes(Tuplesortstate *state, bool mergeruns)
 {
-	int			maxTapes,
-				j;
+	int			maxInputTapes;
 
 	Assert(!LEADER(state));
 
 	if (mergeruns)
 	{
-		/* Compute number of tapes to use: merge order plus 1 */
-		maxTapes = tuplesort_merge_order(state->allowedMem) + 1;
+		/* Compute number of input tapes to use (aka merge order) */
+		maxInputTapes = tuplesort_merge_order(state->allowedMem) + 1;
 	}
 	else
 	{
 		/* Workers can sometimes produce single run, output without merge */
 		Assert(WORKER(state));
-		maxTapes = MINORDER + 1;
+		maxInputTapes = MINORDER;
 	}
 
 #ifdef TRACE_SORT
 	if (trace_sort)
 		elog(LOG, "%d switching to external sort with %d tapes: %s",
-			 state->worker, maxTapes, pg_rusage_show(&state->ru_start));
+			 state->worker, maxInputTapes, pg_rusage_show(&state->ru_start));
 #endif
 
-	/* Create the tape set and allocate the per-tape data arrays */
-	inittapestate(state, maxTapes);
+	/*
+	 * Create the tape set.  It is initially empty, the tapes are created as
+	 * needed.
+	 */
+	inittapestate(state, maxInputTapes);
 	state->tapeset =
-		LogicalTapeSetCreate(maxTapes, NULL,
-							 state->shared ? &state->shared->fileset : NULL,
+		LogicalTapeSetCreate(state->shared ? &state->shared->fileset : NULL,
 							 state->worker);
+	state->tapesets = lappend(state->tapesets, state->tapeset);
 
 	state->currentRun = 0;
-
-	/*
-	 * Initialize variables of Algorithm D (step D1).
-	 */
-	for (j = 0; j < maxTapes; j++)
-	{
-		state->tp_fib[j] = 1;
-		state->tp_runs[j] = 0;
-		state->tp_dummy[j] = 1;
-		state->tp_tapenum[j] = j;
-	}
-	state->tp_fib[state->tapeRange] = 0;
-	state->tp_dummy[state->tapeRange] = 0;
-
-	state->Level = 1;
-	state->destTape = 0;
+	state->destTape = NULL;
 
 	state->status = TSS_BUILDRUNS;
+
+	selectnewtape(state);
 }
 
 /*
  * inittapestate - initialize generic tape management state
  */
 static void
-inittapestate(Tuplesortstate *state, int maxTapes)
+inittapestate(Tuplesortstate *state, int maxInputTapes)
 {
 	int64		tapeSpace;
 
 	/*
-	 * Decrease availMem to reflect the space needed for tape buffers; but
+	 * Decrease availMem to reflect the space needed for tape buffer of the
+	 * output tape; but
 	 * don't decrease it to the point that we have no room for tuples. (That
 	 * case is only likely to occur if sorting pass-by-value Datums; in all
 	 * other scenarios the memtuples[] array is unlikely to occupy more than
@@ -2459,7 +2452,7 @@ inittapestate(Tuplesortstate *state, int maxTapes)
 	 * account for tuple space, so we don't care if LACKMEM becomes
 	 * inaccurate.)
 	 */
-	tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
+	tapeSpace = (int64) maxInputTapes * TAPE_BUFFER_OVERHEAD;
 
 	if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
 		USEMEM(state, tapeSpace);
@@ -2471,51 +2464,47 @@ inittapestate(Tuplesortstate *state, int maxTapes)
 	 */
 	PrepareTempTablespaces();
 
-	state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
-	state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
-	state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
-	state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
-	state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
+	state->inputTapes = NULL;
+	state->nInputTapes = 0;
+	state->nInputRuns = 0;
+
+	state->outputTapes = palloc0(maxInputTapes * sizeof(LogicalTape *));
+	state->nOutputTapes = 0;
+	state->nOutputRuns = 0;
 
 	/* Record # of tapes allocated (for duration of sort) */
-	state->maxTapes = maxTapes;
-	/* Record maximum # of tapes usable as inputs when merging */
-	state->tapeRange = maxTapes - 1;
+	state->maxInputTapes = maxInputTapes;
 }
 
 /*
- * selectnewtape -- select new tape for new initial run.
+ * selectnewtape -- select next tape to output to.
  *
  * This is called after finishing a run when we know another run
- * must be started.  This implements steps D3, D4 of Algorithm D.
+ * must be started.  This is used both when building the initial
+ * runs, and during merge passes.
  */
 static void
 selectnewtape(Tuplesortstate *state)
 {
-	int			j;
-	int			a;
-
-	/* Step D3: advance j (destTape) */
-	if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1])
+	if (state->nOutputRuns < state->maxInputTapes)
 	{
-		state->destTape++;
-		return;
+		/* Create a new tape to hold the next run */
+		Assert(state->outputTapes[state->nOutputRuns] == NULL);
+		Assert(state->nOutputRuns == state->nOutputTapes);
+		state->destTape = LogicalTapeCreate(state->tapeset);
+		state->outputTapes[state->nOutputRuns] = state->destTape;
+		state->nOutputTapes++;
+		state->nOutputRuns++;
 	}
-	if (state->tp_dummy[state->destTape] != 0)
-	{
-		state->destTape = 0;
-		return;
-	}
-
-	/* Step D4: increase level */
-	state->Level++;
-	a = state->tp_fib[0];
-	for (j = 0; j < state->tapeRange; j++)
+	else
 	{
-		state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j];
-		state->tp_fib[j] = a + state->tp_fib[j + 1];
+		/*
+		 * We have reached the max number of tapes.  Append to an existing
+		 * tape.
+		 */
+		state->destTape = state->outputTapes[state->nOutputRuns % state->nOutputTapes];
+		state->nOutputRuns++;
 	}
-	state->destTape = 0;
 }
 
 /*
@@ -2554,18 +2543,14 @@ init_slab_allocator(Tuplesortstate *state, int numSlots)
 /*
  * mergeruns -- merge all the completed initial runs.
  *
- * This implements steps D5, D6 of Algorithm D.  All input data has
+ * This implements the Balanced k-Way Merge Algorithm.  All input data has
  * already been written to initial runs on tape (see dumptuples).
  */
 static void
 mergeruns(Tuplesortstate *state)
 {
-	int			tapenum,
-				svTape,
-				svRuns,
-				svDummy;
-	int			numTapes;
-	int			numInputTapes;
+	int			tapenum;
+	int64		tape_buffer_mem;
 
 	Assert(state->status == TSS_BUILDRUNS);
 	Assert(state->memtupcount == 0);
@@ -2602,46 +2587,33 @@ mergeruns(Tuplesortstate *state)
 	state->memtuples = NULL;
 
 	/*
-	 * If we had fewer runs than tapes, refund the memory that we imagined we
-	 * would need for the tape buffers of the unused tapes.
-	 *
-	 * numTapes and numInputTapes reflect the actual number of tapes we will
-	 * use.  Note that the output tape's tape number is maxTapes - 1, so the
-	 * tape numbers of the used tapes are not consecutive, and you cannot just
-	 * loop from 0 to numTapes to visit all used tapes!
-	 */
-	if (state->Level == 1)
-	{
-		numInputTapes = state->currentRun;
-		numTapes = numInputTapes + 1;
-		FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD);
-	}
-	else
-	{
-		numInputTapes = state->tapeRange;
-		numTapes = state->maxTapes;
-	}
-
-	/*
 	 * Initialize the slab allocator.  We need one slab slot per input tape,
 	 * for the tuples in the heap, plus one to hold the tuple last returned
 	 * from tuplesort_gettuple.  (If we're sorting pass-by-val Datums,
 	 * however, we don't need to do allocate anything.)
 	 *
+	 * In a multi-pass merge, we could shrink this allocation for the last
+	 * merge pass, if it has fewer tapes than previous passes, but we don't
+	 * bother.
+	 *
 	 * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
 	 * to track memory usage of individual tuples.
 	 */
 	if (state->tuples)
-		init_slab_allocator(state, numInputTapes + 1);
+		init_slab_allocator(state, state->nOutputTapes + 1);
 	else
 		init_slab_allocator(state, 0);
 
 	/*
 	 * Allocate a new 'memtuples' array, for the heap.  It will hold one tuple
 	 * from each input tape.
+	 *
+	 * We could shrink this, too, between passes in a multi-pass merge, but
+	 * we don't bother.  (The initial input tapes are still in outputTapes.
+	 * The number of input tapes will not increase between passes.)
 	 */
-	state->memtupsize = numInputTapes;
-	state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple));
+	state->memtupsize = state->nOutputTapes;
+	state->memtuples = (SortTuple *) palloc(state->nOutputTapes * sizeof(SortTuple));
 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
 
 	/*
@@ -2661,41 +2633,61 @@ mergeruns(Tuplesortstate *state)
 #ifdef TRACE_SORT
 	if (trace_sort)
 		elog(LOG, "%d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes",
-			 state->worker, state->availMem / 1024, numInputTapes);
+			 state->worker, state->availMem / 1024, state->nOutputTapes);
 #endif
 
-	state->read_buffer_size = Max(state->availMem / numInputTapes, 0);
-	USEMEM(state, state->read_buffer_size * numInputTapes);
+	state->read_buffer_mem = state->availMem;
+	USEMEM(state, state->availMem);
 
-	/* End of step D2: rewind all output tapes to prepare for merging */
-	for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
-		LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size);
+	/* We will use all remaining memory for read buffers */
+	tape_buffer_mem = state->availMem;
+	USEMEM(state, tape_buffer_mem);
 
 	for (;;)
 	{
 		/*
-		 * At this point we know that tape[T] is empty.  If there's just one
-		 * (real or dummy) run left on each input tape, then only one merge
-		 * pass remains.  If we don't have to produce a materialized sorted
-		 * tape, we can stop at this point and do the final merge on-the-fly.
+		 * On the first iteration, or if we have read all the runs from the input tapes in
+		 * a multi-pass merge, it's time to start a new pass.  Rewind all the output tapes,
+		 * and make them inputs for the next pass.
 		 */
-		if (!state->randomAccess && !WORKER(state))
+		if (state->nInputRuns == 0)
 		{
-			bool		allOneRun = true;
-
-			Assert(state->tp_runs[state->tapeRange] == 0);
-			for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
+			/* Close the old, emptied, input tapes */
+			if (state->nInputTapes > 0)
 			{
-				if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
-				{
-					allOneRun = false;
-					break;
-				}
+				for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
+					LogicalTapeClose(state->inputTapes[tapenum]);
+				pfree(state->inputTapes);
 			}
-			if (allOneRun)
+
+			/* Previous pass's outputs become next pass's inputs. */
+			state->inputTapes = state->outputTapes;
+			state->nInputTapes = state->nOutputTapes;
+			state->nInputRuns = state->nOutputRuns;
+
+			/*
+			 * Reset output tape variables.  (The actual LogicalTapes will be created
+			 * as needed, we just allocate a large-enough array for them here.)
+			 */
+			state->outputTapes = palloc0(state->nInputTapes * sizeof(LogicalTape *));
+			state->nOutputTapes = 0;
+			state->nOutputRuns = 0;
+
+			/* Prepare the new input tapes for merge pass. */
+			for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
+				LogicalTapeRewindForRead(state->inputTapes[tapenum],
+										 state->read_buffer_mem / state->nInputTapes);
+
+			/*
+			 * If there's just one run left on each input tape, then only one merge pass
+			 * remains.  If we don't have to produce a materialized sorted tape, we can
+			 * stop at this point and do the final merge on-the-fly.
+			 */
+			if (!state->randomAccess && state->nInputRuns <= state->nInputTapes && !WORKER(state))
 			{
 				/* Tell logtape.c we won't be writing anymore */
-				LogicalTapeSetForgetFreeSpace(state->tapeset);
+				if (state->tapeset)
+					LogicalTapeSetForgetFreeSpace(state->tapeset);
 				/* Initialize for the final merge pass */
 				beginmerge(state);
 				state->status = TSS_FINALMERGE;
@@ -2703,96 +2695,47 @@ mergeruns(Tuplesortstate *state)
 			}
 		}
 
-		/* Step D5: merge runs onto tape[T] until tape[P] is empty */
-		while (state->tp_runs[state->tapeRange - 1] ||
-			   state->tp_dummy[state->tapeRange - 1])
-		{
-			bool		allDummy = true;
-
-			for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
-			{
-				if (state->tp_dummy[tapenum] == 0)
-				{
-					allDummy = false;
-					break;
-				}
-			}
-
-			if (allDummy)
-			{
-				state->tp_dummy[state->tapeRange]++;
-				for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
-					state->tp_dummy[tapenum]--;
-			}
-			else
-				mergeonerun(state);
-		}
+		/* Select an output tape */
+		selectnewtape(state);
 
-		/* Step D6: decrease level */
-		if (--state->Level == 0)
-			break;
-		/* rewind output tape T to use as new input */
-		LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange],
-								 state->read_buffer_size);
-		/* rewind used-up input tape P, and prepare it for write pass */
-		LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]);
-		state->tp_runs[state->tapeRange - 1] = 0;
+		/* Merge one run from each input tape. */
+		mergeonerun(state);
 
 		/*
-		 * reassign tape units per step D6; note we no longer care about A[]
+		 * If the input tapes are empty, and we output only one output run,
+		 * we're done.  The current output tape contains the final result.
 		 */
-		svTape = state->tp_tapenum[state->tapeRange];
-		svDummy = state->tp_dummy[state->tapeRange];
-		svRuns = state->tp_runs[state->tapeRange];
-		for (tapenum = state->tapeRange; tapenum > 0; tapenum--)
-		{
-			state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1];
-			state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1];
-			state->tp_runs[tapenum] = state->tp_runs[tapenum - 1];
-		}
-		state->tp_tapenum[0] = svTape;
-		state->tp_dummy[0] = svDummy;
-		state->tp_runs[0] = svRuns;
+		if (state->nInputRuns == 0 && state->nOutputRuns <= 1)
+			break;
 	}
 
 	/*
-	 * Done.  Knuth says that the result is on TAPE[1], but since we exited
-	 * the loop without performing the last iteration of step D6, we have not
-	 * rearranged the tape unit assignment, and therefore the result is on
-	 * TAPE[T].  We need to do it this way so that we can freeze the final
-	 * output tape while rewinding it.  The last iteration of step D6 would be
-	 * a waste of cycles anyway...
+	 * Done.
 	 */
-	state->result_tape = state->tp_tapenum[state->tapeRange];
+	state->result_tape = state->outputTapes[0];
 	if (!WORKER(state))
-		LogicalTapeFreeze(state->tapeset, state->result_tape, NULL);
+		LogicalTapeFreeze(state->result_tape, NULL);
 	else
 		worker_freeze_result_tape(state);
 	state->status = TSS_SORTEDONTAPE;
 
-	/* Release the read buffers of all the other tapes, by rewinding them. */
-	for (tapenum = 0; tapenum < state->maxTapes; tapenum++)
-	{
-		if (tapenum != state->result_tape)
-			LogicalTapeRewindForWrite(state->tapeset, tapenum);
-	}
+	/* Release the read buffers of all the now-empty input tapes. */
+	for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
+		LogicalTapeClose(state->inputTapes[tapenum]);
 }
 
 /*
- * Merge one run from each input tape, except ones with dummy runs.
- *
- * This is the inner loop of Algorithm D step D5.  We know that the
- * output tape is TAPE[T].
+ * Merge one run from each input tape.
  */
 static void
 mergeonerun(Tuplesortstate *state)
 {
-	int			destTape = state->tp_tapenum[state->tapeRange];
-	int			srcTape;
+	int			srcTapeIndex;
+	LogicalTape *srcTape;
 
 	/*
 	 * Start the merge by loading one tuple from each active source tape into
-	 * the heap.  We can also decrease the input run/dummy run counts.
+	 * the heap.
 	 */
 	beginmerge(state);
 
@@ -2806,8 +2749,9 @@ mergeonerun(Tuplesortstate *state)
 		SortTuple	stup;
 
 		/* write the tuple to destTape */
-		srcTape = state->memtuples[0].tupindex;
-		WRITETUP(state, destTape, &state->memtuples[0]);
+		srcTapeIndex = state->memtuples[0].tupindex;
+		srcTape = state->inputTapes[srcTapeIndex];
+		WRITETUP(state, state->destTape, &state->memtuples[0]);
 
 		/* recycle the slot of the tuple we just wrote out, for the next read */
 		if (state->memtuples[0].tuple)
@@ -2819,73 +2763,53 @@ mergeonerun(Tuplesortstate *state)
 		 */
 		if (mergereadnext(state, srcTape, &stup))
 		{
-			stup.tupindex = srcTape;
+			stup.tupindex = srcTapeIndex;
 			tuplesort_heap_replace_top(state, &stup);
 
 		}
 		else
+		{
 			tuplesort_heap_delete_top(state);
+			state->nInputRuns--;
+		}
 	}
 
 	/*
 	 * When the heap empties, we're done.  Write an end-of-run marker on the
-	 * output tape, and increment its count of real runs.
+	 * output tape.
 	 */
-	markrunend(state, destTape);
-	state->tp_runs[state->tapeRange]++;
+	markrunend(state, state->destTape);
 
 #ifdef TRACE_SORT
 	if (trace_sort)
 		elog(LOG, "%d finished %d-way merge step: %s", state->worker,
-			 state->activeTapes, pg_rusage_show(&state->ru_start));
+			 state->nInputTapes, pg_rusage_show(&state->ru_start));
 #endif
 }
 
 /*
  * beginmerge - initialize for a merge pass
  *
- * We decrease the counts of real and dummy runs for each tape, and mark
- * which tapes contain active input runs in mergeactive[].  Then, fill the
- * merge heap with the first tuple from each active tape.
+ * Fill the merge heap with the first tuple from each input tape.
  */
 static void
 beginmerge(Tuplesortstate *state)
 {
 	int			activeTapes;
-	int			tapenum;
-	int			srcTape;
+	int			srcTapeIndex;
 
 	/* Heap should be empty here */
 	Assert(state->memtupcount == 0);
 
-	/* Adjust run counts and mark the active tapes */
-	memset(state->mergeactive, 0,
-		   state->maxTapes * sizeof(*state->mergeactive));
-	activeTapes = 0;
-	for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
-	{
-		if (state->tp_dummy[tapenum] > 0)
-			state->tp_dummy[tapenum]--;
-		else
-		{
-			Assert(state->tp_runs[tapenum] > 0);
-			state->tp_runs[tapenum]--;
-			srcTape = state->tp_tapenum[tapenum];
-			state->mergeactive[srcTape] = true;
-			activeTapes++;
-		}
-	}
-	Assert(activeTapes > 0);
-	state->activeTapes = activeTapes;
+	activeTapes = Min(state->nInputTapes, state->nInputRuns);
 
-	/* Load the merge heap with the first tuple from each input tape */
-	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
+	for (srcTapeIndex = 0; srcTapeIndex < activeTapes; srcTapeIndex++)
 	{
 		SortTuple	tup;
 
-		if (mergereadnext(state, srcTape, &tup))
+		if (mergereadnext(state, state->inputTapes[srcTapeIndex], &tup))
 		{
-			tup.tupindex = srcTape;
+			tup.tupindex = srcTapeIndex;
 			tuplesort_heap_insert(state, &tup);
 		}
 	}
@@ -2897,19 +2821,13 @@ beginmerge(Tuplesortstate *state)
  * Returns false on EOF.
  */
 static bool
-mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
+mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup)
 {
 	unsigned int tuplen;
 
-	if (!state->mergeactive[srcTape])
-		return false;			/* tape's run is already exhausted */
-
 	/* read next tuple, if any */
 	if ((tuplen = getlen(state, srcTape, true)) == 0)
-	{
-		state->mergeactive[srcTape] = false;
 		return false;
-	}
 	READTUP(state, stup, srcTape, tuplen);
 
 	return true;
@@ -2944,16 +2862,10 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 	 * In general, short final runs are quite possible.  Rather than allowing
 	 * a special case where there was a superfluous selectnewtape() call (i.e.
 	 * a call with no subsequent run actually written to destTape), we prefer
-	 * to write out a 0 tuple run.
-	 *
-	 * mergereadnext() is prepared for 0 tuple runs, and will reliably mark
-	 * the tape inactive for the merge when called from beginmerge().  This
-	 * case is therefore similar to the case where mergeonerun() finds a dummy
-	 * run for the tape, and so doesn't need to merge a run from the tape (or
-	 * conceptually "merges" the dummy run, if you prefer).  According to
-	 * Knuth, Algorithm D "isn't strictly optimal" in its method of
-	 * distribution and dummy run assignment; this edge case seems very
-	 * unlikely to make that appreciably worse.
+	 * to write out a 0 tuple run.  In the worst case, that could add another
+	 * merge pass, if that pushes us over the threshold, but it's unlikely
+	 * enough to not warrant a special case. (XXX: Actually, I think some
+	 * refactoring to avoid that would be in order...)
 	 */
 	Assert(state->status == TSS_BUILDRUNS);
 
@@ -2992,8 +2904,7 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 	memtupwrite = state->memtupcount;
 	for (i = 0; i < memtupwrite; i++)
 	{
-		WRITETUP(state, state->tp_tapenum[state->destTape],
-				 &state->memtuples[i]);
+		WRITETUP(state, state->destTape, &state->memtuples[i]);
 		state->memtupcount--;
 	}
 
@@ -3006,14 +2917,13 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 	 */
 	MemoryContextReset(state->tuplecontext);
 
-	markrunend(state, state->tp_tapenum[state->destTape]);
-	state->tp_runs[state->destTape]++;
-	state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
+	markrunend(state, state->destTape);
 
 #ifdef TRACE_SORT
 	if (trace_sort)
 		elog(LOG, "%d finished writing run %d to tape %d: %s",
-			 state->worker, state->currentRun, state->destTape,
+			 state->worker,
+			 state->currentRun, (state->currentRun - 1) % state->nOutputTapes + 1,
 			 pg_rusage_show(&state->ru_start));
 #endif
 
@@ -3040,9 +2950,7 @@ tuplesort_rescan(Tuplesortstate *state)
 			state->markpos_eof = false;
 			break;
 		case TSS_SORTEDONTAPE:
-			LogicalTapeRewindForRead(state->tapeset,
-									 state->result_tape,
-									 0);
+			LogicalTapeRewindForRead(state->result_tape, 0);
 			state->eof_reached = false;
 			state->markpos_block = 0L;
 			state->markpos_offset = 0;
@@ -3073,8 +2981,7 @@ tuplesort_markpos(Tuplesortstate *state)
 			state->markpos_eof = state->eof_reached;
 			break;
 		case TSS_SORTEDONTAPE:
-			LogicalTapeTell(state->tapeset,
-							state->result_tape,
+			LogicalTapeTell(state->result_tape,
 							&state->markpos_block,
 							&state->markpos_offset);
 			state->markpos_eof = state->eof_reached;
@@ -3105,8 +3012,7 @@ tuplesort_restorepos(Tuplesortstate *state)
 			state->eof_reached = state->markpos_eof;
 			break;
 		case TSS_SORTEDONTAPE:
-			LogicalTapeSeek(state->tapeset,
-							state->result_tape,
+			LogicalTapeSeek(state->result_tape,
 							state->markpos_block,
 							state->markpos_offset);
 			state->eof_reached = state->markpos_eof;
@@ -3138,10 +3044,16 @@ tuplesort_get_stats(Tuplesortstate *state,
 	 * to fix.  Is it worth creating an API for the memory context code to
 	 * tell us how much is actually used in sortcontext?
 	 */
-	if (state->tapeset)
+	if (state->tapesets)
 	{
+		ListCell *lc;
+
 		stats->spaceType = SORT_SPACE_TYPE_DISK;
-		stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024);
+		stats->spaceUsed = 0;
+		foreach(lc, state->tapesets)
+		{
+			stats->spaceUsed += LogicalTapeSetBlocks((LogicalTapeSet *) lfirst(lc)) * (BLCKSZ / 1024);
+		}
 	}
 	else
 	{
@@ -3451,11 +3363,11 @@ reversedirection(Tuplesortstate *state)
  */
 
 static unsigned int
-getlen(Tuplesortstate *state, int tapenum, bool eofOK)
+getlen(Tuplesortstate *state, LogicalTape *tape, bool eofOK)
 {
 	unsigned int len;
 
-	if (LogicalTapeRead(state->tapeset, tapenum,
+	if (LogicalTapeRead(tape,
 						&len, sizeof(len)) != sizeof(len))
 		elog(ERROR, "unexpected end of tape");
 	if (len == 0 && !eofOK)
@@ -3464,11 +3376,11 @@ getlen(Tuplesortstate *state, int tapenum, bool eofOK)
 }
 
 static void
-markrunend(Tuplesortstate *state, int tapenum)
+markrunend(Tuplesortstate *state, LogicalTape *tape)
 {
 	unsigned int len = 0;
 
-	LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
+	LogicalTapeWrite(tape, (void *) &len, sizeof(len));
 }
 
 /*
@@ -3646,7 +3558,7 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
 }
 
 static void
-writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
+writetup_heap(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
 {
 	MinimalTuple tuple = (MinimalTuple) stup->tuple;
 
@@ -3657,13 +3569,10 @@ writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
 	/* total on-disk footprint: */
 	unsigned int tuplen = tupbodylen + sizeof(int);
 
-	LogicalTapeWrite(state->tapeset, tapenum,
-					 (void *) &tuplen, sizeof(tuplen));
-	LogicalTapeWrite(state->tapeset, tapenum,
-					 (void *) tupbody, tupbodylen);
+	LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
+	LogicalTapeWrite(tape, (void *) tupbody, tupbodylen);
 	if (state->randomAccess)	/* need trailing length word? */
-		LogicalTapeWrite(state->tapeset, tapenum,
-						 (void *) &tuplen, sizeof(tuplen));
+		LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
 
 	if (!state->slabAllocatorUsed)
 	{
@@ -3674,7 +3583,7 @@ writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
 
 static void
 readtup_heap(Tuplesortstate *state, SortTuple *stup,
-			 int tapenum, unsigned int len)
+			 LogicalTape *tape, unsigned int len)
 {
 	unsigned int tupbodylen = len - sizeof(int);
 	unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
@@ -3684,11 +3593,9 @@ readtup_heap(Tuplesortstate *state, SortTuple *stup,
 
 	/* read in the tuple proper */
 	tuple->t_len = tuplen;
-	LogicalTapeReadExact(state->tapeset, tapenum,
-						 tupbody, tupbodylen);
+	LogicalTapeReadExact(tape, tupbody, tupbodylen);
 	if (state->randomAccess)	/* need trailing length word? */
-		LogicalTapeReadExact(state->tapeset, tapenum,
-							 &tuplen, sizeof(tuplen));
+		LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
 	stup->tuple = (void *) tuple;
 	/* set up first-column key value */
 	htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
@@ -3889,21 +3796,17 @@ copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
 }
 
 static void
-writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
+writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
 {
 	HeapTuple	tuple = (HeapTuple) stup->tuple;
 	unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
 
 	/* We need to store t_self, but not other fields of HeapTupleData */
-	LogicalTapeWrite(state->tapeset, tapenum,
-					 &tuplen, sizeof(tuplen));
-	LogicalTapeWrite(state->tapeset, tapenum,
-					 &tuple->t_self, sizeof(ItemPointerData));
-	LogicalTapeWrite(state->tapeset, tapenum,
-					 tuple->t_data, tuple->t_len);
+	LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
+	LogicalTapeWrite(tape, &tuple->t_self, sizeof(ItemPointerData));
+	LogicalTapeWrite(tape, tuple->t_data, tuple->t_len);
 	if (state->randomAccess)	/* need trailing length word? */
-		LogicalTapeWrite(state->tapeset, tapenum,
-						 &tuplen, sizeof(tuplen));
+		LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
 
 	if (!state->slabAllocatorUsed)
 	{
@@ -3914,7 +3817,7 @@ writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
 
 static void
 readtup_cluster(Tuplesortstate *state, SortTuple *stup,
-				int tapenum, unsigned int tuplen)
+				LogicalTape *tape, unsigned int tuplen)
 {
 	unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
 	HeapTuple	tuple = (HeapTuple) readtup_alloc(state,
@@ -3923,16 +3826,13 @@ readtup_cluster(Tuplesortstate *state, SortTuple *stup,
 	/* Reconstruct the HeapTupleData header */
 	tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
 	tuple->t_len = t_len;
-	LogicalTapeReadExact(state->tapeset, tapenum,
-						 &tuple->t_self, sizeof(ItemPointerData));
+	LogicalTapeReadExact(tape, &tuple->t_self, sizeof(ItemPointerData));
 	/* We don't currently bother to reconstruct t_tableOid */
 	tuple->t_tableOid = InvalidOid;
 	/* Read in the tuple body */
-	LogicalTapeReadExact(state->tapeset, tapenum,
-						 tuple->t_data, tuple->t_len);
+	LogicalTapeReadExact(tape, tuple->t_data, tuple->t_len);
 	if (state->randomAccess)	/* need trailing length word? */
-		LogicalTapeReadExact(state->tapeset, tapenum,
-							 &tuplen, sizeof(tuplen));
+		LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
 	stup->tuple = (void *) tuple;
 	/* set up first-column key value, if it's a simple column */
 	if (state->indexInfo->ii_IndexAttrNumbers[0] != 0)
@@ -4198,19 +4098,16 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
 }
 
 static void
-writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
+writetup_index(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
 {
 	IndexTuple	tuple = (IndexTuple) stup->tuple;
 	unsigned int tuplen;
 
 	tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
-	LogicalTapeWrite(state->tapeset, tapenum,
-					 (void *) &tuplen, sizeof(tuplen));
-	LogicalTapeWrite(state->tapeset, tapenum,
-					 (void *) tuple, IndexTupleSize(tuple));
+	LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
+	LogicalTapeWrite(tape, (void *) tuple, IndexTupleSize(tuple));
 	if (state->randomAccess)	/* need trailing length word? */
-		LogicalTapeWrite(state->tapeset, tapenum,
-						 (void *) &tuplen, sizeof(tuplen));
+		LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
 
 	if (!state->slabAllocatorUsed)
 	{
@@ -4221,16 +4118,14 @@ writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
 
 static void
 readtup_index(Tuplesortstate *state, SortTuple *stup,
-			  int tapenum, unsigned int len)
+			  LogicalTape *tape, unsigned int len)
 {
 	unsigned int tuplen = len - sizeof(unsigned int);
 	IndexTuple	tuple = (IndexTuple) readtup_alloc(state, tuplen);
 
-	LogicalTapeReadExact(state->tapeset, tapenum,
-						 tuple, tuplen);
+	LogicalTapeReadExact(tape, tuple, tuplen);
 	if (state->randomAccess)	/* need trailing length word? */
-		LogicalTapeReadExact(state->tapeset, tapenum,
-							 &tuplen, sizeof(tuplen));
+		LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
 	stup->tuple = (void *) tuple;
 	/* set up first-column key value */
 	stup->datum1 = index_getattr(tuple,
@@ -4272,7 +4167,7 @@ copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
 }
 
 static void
-writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
+writetup_datum(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
 {
 	void	   *waddr;
 	unsigned int tuplen;
@@ -4297,13 +4192,10 @@ writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
 
 	writtenlen = tuplen + sizeof(unsigned int);
 
-	LogicalTapeWrite(state->tapeset, tapenum,
-					 (void *) &writtenlen, sizeof(writtenlen));
-	LogicalTapeWrite(state->tapeset, tapenum,
-					 waddr, tuplen);
+	LogicalTapeWrite(tape, (void *) &writtenlen, sizeof(writtenlen));
+	LogicalTapeWrite(tape, waddr, tuplen);
 	if (state->randomAccess)	/* need trailing length word? */
-		LogicalTapeWrite(state->tapeset, tapenum,
-						 (void *) &writtenlen, sizeof(writtenlen));
+		LogicalTapeWrite(tape, (void *) &writtenlen, sizeof(writtenlen));
 
 	if (!state->slabAllocatorUsed && stup->tuple)
 	{
@@ -4314,7 +4206,7 @@ writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
 
 static void
 readtup_datum(Tuplesortstate *state, SortTuple *stup,
-			  int tapenum, unsigned int len)
+			  LogicalTape *tape, unsigned int len)
 {
 	unsigned int tuplen = len - sizeof(unsigned int);
 
@@ -4328,8 +4220,7 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup,
 	else if (!state->tuples)
 	{
 		Assert(tuplen == sizeof(Datum));
-		LogicalTapeReadExact(state->tapeset, tapenum,
-							 &stup->datum1, tuplen);
+		LogicalTapeReadExact(tape, &stup->datum1, tuplen);
 		stup->isnull1 = false;
 		stup->tuple = NULL;
 	}
@@ -4337,16 +4228,14 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup,
 	{
 		void	   *raddr = readtup_alloc(state, tuplen);
 
-		LogicalTapeReadExact(state->tapeset, tapenum,
-							 raddr, tuplen);
+		LogicalTapeReadExact(tape, raddr, tuplen);
 		stup->datum1 = PointerGetDatum(raddr);
 		stup->isnull1 = false;
 		stup->tuple = raddr;
 	}
 
 	if (state->randomAccess)	/* need trailing length word? */
-		LogicalTapeReadExact(state->tapeset, tapenum,
-							 &tuplen, sizeof(tuplen));
+		LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
 }
 
 /*
@@ -4395,7 +4284,6 @@ tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
 	for (i = 0; i < nWorkers; i++)
 	{
 		shared->tapes[i].firstblocknumber = 0L;
-		shared->tapes[i].buffilesize = 0;
 	}
 }
 
@@ -4459,7 +4347,7 @@ worker_freeze_result_tape(Tuplesortstate *state)
 	TapeShare	output;
 
 	Assert(WORKER(state));
-	Assert(state->result_tape != -1);
+	Assert(state->result_tape != NULL);
 	Assert(state->memtupcount == 0);
 
 	/*
@@ -4475,7 +4363,7 @@ worker_freeze_result_tape(Tuplesortstate *state)
 	 * Parallel worker requires result tape metadata, which is to be stored in
 	 * shared memory for leader
 	 */
-	LogicalTapeFreeze(state->tapeset, state->result_tape, &output);
+	LogicalTapeFreeze(state->result_tape, &output);
 
 	/* Store properties of output tape, and update finished worker count */
 	SpinLockAcquire(&shared->mutex);
@@ -4494,14 +4382,14 @@ static void
 worker_nomergeruns(Tuplesortstate *state)
 {
 	Assert(WORKER(state));
-	Assert(state->result_tape == -1);
+	Assert(state->result_tape == NULL);
 
-	state->result_tape = state->tp_tapenum[state->destTape];
+	state->result_tape = state->destTape;
 	worker_freeze_result_tape(state);
 }
 
 /*
- * leader_takeover_tapes - create tapeset for leader from worker tapes
+ * leader_takeover_tapes - import worker tapes
  *
  * So far, leader Tuplesortstate has performed no actual sorting.  By now, all
  * sorting has occurred in workers, all of which must have already returned
@@ -4530,23 +4418,6 @@ leader_takeover_tapes(Tuplesortstate *state)
 		elog(ERROR, "cannot take over tapes before all workers finish");
 
 	/*
-	 * Create the tapeset from worker tapes, including a leader-owned tape at
-	 * the end.  Parallel workers are far more expensive than logical tapes,
-	 * so the number of tapes allocated here should never be excessive.
-	 *
-	 * We still have a leader tape, though it's not possible to write to it
-	 * due to restrictions in the shared fileset infrastructure used by
-	 * logtape.c.  It will never be written to in practice because
-	 * randomAccess is disallowed for parallel sorts.
-	 */
-	inittapestate(state, nParticipants + 1);
-	state->tapeset = LogicalTapeSetCreate(nParticipants + 1, shared->tapes,
-										  &shared->fileset, state->worker);
-
-	/* mergeruns() relies on currentRun for # of runs (in one-pass cases) */
-	state->currentRun = nParticipants;
-
-	/*
 	 * Initialize variables of Algorithm D to be consistent with runs from
 	 * workers having been generated in the leader.
 	 *
@@ -4554,21 +4425,24 @@ leader_takeover_tapes(Tuplesortstate *state)
 	 * tape per run, because workers always output exactly 1 run, even when
 	 * there were no input tuples for workers to sort.
 	 */
-	for (j = 0; j < state->maxTapes; j++)
+	inittapestate(state, nParticipants);
+
+	for (j = 0; j < nParticipants; j++)
 	{
 		/* One real run; no dummy runs for worker tapes */
-		state->tp_fib[j] = 1;
-		state->tp_runs[j] = 1;
-		state->tp_dummy[j] = 0;
-		state->tp_tapenum[j] = j;
+		LogicalTapeSet *importedTapeSet;
+
+		importedTapeSet = LogicalTapeSetImport(&shared->fileset, j);
+		state->tapesets = lappend(state->tapesets, importedTapeSet);
+		state->outputTapes[j] = LogicalTapeImport(importedTapeSet, &shared->tapes[j]);
+		state->nOutputTapes++;
+		state->nOutputRuns++;
 	}
-	/* Leader tape gets one dummy run, and no real runs */
-	state->tp_fib[state->tapeRange] = 0;
-	state->tp_runs[state->tapeRange] = 0;
-	state->tp_dummy[state->tapeRange] = 1;
 
-	state->Level = 1;
-	state->destTape = 0;
+	/*
+	 * There is no output.  'randomAccess' is disallowed for parallel sorts, so
+	 * we will perform the merge on-the-fly.
+	 */
 
 	state->status = TSS_BUILDRUNS;
 }
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h
index a6cdeb451c..8c8d44bab4 100644
--- a/src/include/storage/buffile.h
+++ b/src/include/storage/buffile.h
@@ -44,7 +44,6 @@ extern int	BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
 extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
 extern int	BufFileSeekBlock(BufFile *file, long blknum);
 extern off_t BufFileSize(BufFile *file);
-extern long BufFileAppend(BufFile *target, BufFile *source);
 
 extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name);
 extern void BufFileExportShared(BufFile *file);
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 548a832be9..8e7c9728f4 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -78,7 +78,6 @@ extern char *FilePathName(File file);
 extern int	FileGetRawDesc(File file);
 extern int	FileGetRawFlags(File file);
 extern mode_t FileGetRawMode(File file);
-extern off_t FileGetSize(File file);
 
 /* Operations used for sharing named temporary files */
 extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure);
diff --git a/src/include/utils/logtape.h b/src/include/utils/logtape.h
index 9bf1d80142..a8091888e1 100644
--- a/src/include/utils/logtape.h
+++ b/src/include/utils/logtape.h
@@ -18,9 +18,13 @@
 
 #include "storage/sharedfileset.h"
 
-/* LogicalTapeSet is an opaque type whose details are not known outside logtape.c. */
-
+/*
+ * LogicalTapeSet and LogicalTape are opaque types whose details are not
+ * known outside logtape.c.
+ */
 typedef struct LogicalTapeSet LogicalTapeSet;
+typedef struct LogicalTape LogicalTape;
+
 
 /*
  * The approach tuplesort.c takes to parallel external sorts is that workers,
@@ -46,36 +50,31 @@ typedef struct TapeShare
 	/*
 	 * firstblocknumber is first block that should be read from materialized
 	 * tape.
-	 *
-	 * buffilesize is the size of associated BufFile following freezing.
 	 */
 	long		firstblocknumber;
-	off_t		buffilesize;
 } TapeShare;
 
 /*
  * prototypes for functions in logtape.c
  */
 
-extern LogicalTapeSet *LogicalTapeSetCreate(int ntapes, TapeShare *shared,
-					 SharedFileSet *fileset, int worker);
+extern LogicalTapeSet *LogicalTapeSetCreate(SharedFileSet *fileset, int worker);
 extern void LogicalTapeSetClose(LogicalTapeSet *lts);
 extern void LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts);
-extern size_t LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
-				void *ptr, size_t size);
-extern void LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
-				 void *ptr, size_t size);
-extern void LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum,
-						 size_t buffer_size);
-extern void LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum);
-extern void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum,
-				  TapeShare *share);
-extern size_t LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum,
-					 size_t size);
-extern void LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
-				long blocknum, int offset);
-extern void LogicalTapeTell(LogicalTapeSet *lts, int tapenum,
-				long *blocknum, int *offset);
 extern long LogicalTapeSetBlocks(LogicalTapeSet *lts);
 
+extern LogicalTape *LogicalTapeCreate(LogicalTapeSet *lts);
+extern void LogicalTapeClose(LogicalTape *lt);
+extern size_t LogicalTapeRead(LogicalTape *lt, void *ptr, size_t size);
+extern void LogicalTapeWrite(LogicalTape *lt, void *ptr, size_t size);
+extern void LogicalTapeRewindForRead(LogicalTape *lt, size_t buffer_size);
+extern void LogicalTapeRewindForWrite(LogicalTape *lt);
+extern void LogicalTapeFreeze(LogicalTape *lt, TapeShare *share);
+extern size_t LogicalTapeBackspace(LogicalTape *lt, size_t size);
+extern void LogicalTapeSeek(LogicalTape *lt, long blocknum, int offset);
+extern void LogicalTapeTell(LogicalTape *lt, long *blocknum, int *offset);
+
+extern LogicalTapeSet *LogicalTapeSetImport(SharedFileSet *fileset, int worker);
+extern LogicalTape *LogicalTapeImport(LogicalTapeSet *lts, TapeShare *shared);
+
 #endif							/* LOGTAPE_H */
author	Heikki Linnakangas	2016-10-12 09:26:54 +0000
committer	Heikki Linnakangas	2018-05-02 11:22:47 +0000
commit	31772f7af8cd890c9ee57a68e3f68378e7c9af68 (patch)
tree	2d6eb510ef5e2affe318769682587cad5b10dee1
parent	4d427a4ca7572d025b8f94710ccee6476289b9ca (diff)