Commit 56788d2
Allocate consecutive blocks during parallel seqscans
Previously we would allocate blocks to parallel workers during a parallel
sequential scan one block at a time.  Since other workers were likely to
request a block before a worker returns for another block number to work on,
this could lead to non-sequential I/O patterns in each worker which could
cause the operating system's readahead to perform poorly or not at all.

Here we change things so that we allocate consecutive "chunks" of blocks to
workers and have them work on those until they're done, at which time we
allocate another chunk for the worker.  The size of these chunks is based on
the size of the relation.

Initial patch here was by Thomas Munro which showed some good improvements
just having a fixed chunk size of 64 blocks with a simple ramp-down near the
end of the scan.  The revisions of the patch to make the chunk size based on
the relation size and the adjusted ramp-down in powers of two were done by
me, along with quite extensive benchmarking to determine the optimal chunk
sizes.

For the most part, benchmarks have shown significant performance improvements
for large parallel sequential scans on Linux, FreeBSD and Windows using SSDs.
It's less clear how this affects the performance of cloud providers.  Tests
done so far are unable to obtain stable enough performance to provide
meaningful benchmark results.  It is possible that this could cause some
performance regressions on more obscure filesystems, so we may need to later
provide users with some ability to get something closer to the old behavior.
For now, let's leave that until we see that it's really required.

Author: Thomas Munro, David Rowley
Reviewed-by: Ranier Vilela, Soumyadeep Chakraborty, Robert Haas
Reviewed-by: Amit Kapila, Kirk Jamison
Discussion: https://postgr.es/m/CA+hUKGJ_EErDv41YycXcbMbCBkztA34+z1ts9VQH+ACRuvpxig@mail.gmail.com
Parent: 11a68e4
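
To make the chunk-size policy concrete, here is a minimal standalone sketch
(not code from the patch) of the calculation described above.  It uses the
PARALLEL_SEQSCAN_NCHUNKS and PARALLEL_SEQSCAN_MAX_CHUNK_SIZE constants this
commit adds to tableam.c; next_pow2 is a simplified stand-in for PostgreSQL's
pg_nextpower2_32, and the sample relation sizes are illustrative only.

#include <stdio.h>
#include <stdint.h>

/* Constants as added to src/backend/access/table/tableam.c by this commit */
#define PARALLEL_SEQSCAN_NCHUNKS		2048
#define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE	8192

/* Simplified stand-in for PostgreSQL's pg_nextpower2_32() */
static uint32_t
next_pow2(uint32_t v)
{
	uint32_t	p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

int
main(void)
{
	/* Illustrative relation sizes only, in blocks */
	uint32_t	nblocks[] = {1000, 1000000, 100000000};

	for (int i = 0; i < 3; i++)
	{
		/* Aim for PARALLEL_SEQSCAN_NCHUNKS chunks, rounded up to a power of 2 */
		uint32_t	raw = nblocks[i] / PARALLEL_SEQSCAN_NCHUNKS;
		uint32_t	chunk = next_pow2(raw < 1 ? 1 : raw);

		/* Cap the chunk size for very large tables */
		if (chunk > PARALLEL_SEQSCAN_MAX_CHUNK_SIZE)
			chunk = PARALLEL_SEQSCAN_MAX_CHUNK_SIZE;
		printf("%9u blocks -> chunk size %u\n", nblocks[i], chunk);
	}
	return 0;
}

For example, a 1,000,000-block relation gives 1000000 / 2048 = 488, rounded up
to a 512-block chunk; a 100,000,000-block relation would round to 65536 but is
capped at 8192.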

File tree: 4 files changed (+144 -12 lines)

src/backend/access/heap/heapam.c (+16 -6)
@@ -520,12 +520,14 @@ heapgettup(HeapScanDesc scan,
 		{
 			ParallelBlockTableScanDesc pbscan =
 			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+			ParallelBlockTableScanWorker pbscanwork =
+			(ParallelBlockTableScanWorker) scan->rs_base.rs_private;
 
 			table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
-													 pbscan);
+													 pbscanwork, pbscan);
 
 			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
-													 pbscan);
+													 pbscanwork, pbscan);
 
 			/* Other processes might have already finished the scan. */
 			if (page == InvalidBlockNumber)
@@ -720,9 +722,11 @@ heapgettup(HeapScanDesc scan,
 		{
 			ParallelBlockTableScanDesc pbscan =
 			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+			ParallelBlockTableScanWorker pbscanwork =
+			(ParallelBlockTableScanWorker) scan->rs_base.rs_private;
 
 			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
-													 pbscan);
+													 pbscanwork, pbscan);
 			finished = (page == InvalidBlockNumber);
 		}
 		else
@@ -834,12 +838,14 @@ heapgettup_pagemode(HeapScanDesc scan,
 		{
 			ParallelBlockTableScanDesc pbscan =
 			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+			ParallelBlockTableScanWorker pbscanwork =
+			(ParallelBlockTableScanWorker) scan->rs_base.rs_private;
 
 			table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
-													 pbscan);
+													 pbscanwork, pbscan);
 
 			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
-													 pbscan);
+													 pbscanwork, pbscan);
 
 			/* Other processes might have already finished the scan. */
 			if (page == InvalidBlockNumber)
@@ -1019,9 +1025,11 @@ heapgettup_pagemode(HeapScanDesc scan,
 		{
 			ParallelBlockTableScanDesc pbscan =
 			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+			ParallelBlockTableScanWorker pbscanwork =
+			(ParallelBlockTableScanWorker) scan->rs_base.rs_private;
 
 			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
-													 pbscan);
+													 pbscanwork, pbscan);
 			finished = (page == InvalidBlockNumber);
 		}
 		else
@@ -1155,6 +1163,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 	scan->rs_base.rs_nkeys = nkeys;
 	scan->rs_base.rs_flags = flags;
 	scan->rs_base.rs_parallel = parallel_scan;
+	scan->rs_base.rs_private =
+		palloc(sizeof(ParallelBlockTableScanWorkerData));
 	scan->rs_strategy = NULL;	/* set in initscan */
 
 	/*
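
For orientation, the per-worker calling pattern these heapam.c changes
implement boils down to the sketch below.  This is not code from the patch:
process_one_block() is a hypothetical placeholder for the per-page work done
in heapgettup/heapgettup_pagemode, the worker state lives on the stack rather
than in rs_private, and error handling is omitted.

#include "postgres.h"
#include "access/relscan.h"
#include "access/tableam.h"

static void process_one_block(Relation rel, BlockNumber blkno);	/* hypothetical */

/* Hedged sketch of the new calling pattern; not from the patch itself. */
static void
scan_my_chunks(Relation rel, ParallelBlockTableScanDesc pbscan)
{
	ParallelBlockTableScanWorkerData pbscanwork;	/* backend-local state */
	BlockNumber blkno;

	/* Sizes the chunks from the relation size and picks a starting block */
	table_block_parallelscan_startblock_init(rel, &pbscanwork, pbscan);

	/* Each worker now receives runs of consecutive blocks, one chunk at a time */
	while ((blkno = table_block_parallelscan_nextpage(rel, &pbscanwork,
													  pbscan)) != InvalidBlockNumber)
		process_one_block(rel, blkno);	/* hypothetical per-block work */
}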

src/backend/access/table/tableam.c (+113 -5)
@@ -25,10 +25,24 @@
 #include "access/tableam.h"
 #include "access/xact.h"
 #include "optimizer/plancat.h"
+#include "port/pg_bitutils.h"
 #include "storage/bufmgr.h"
 #include "storage/shmem.h"
 #include "storage/smgr.h"
 
+/*
+ * Constants to control the behavior of block allocation to parallel workers
+ * during a parallel seqscan.  Technically these values do not need to be
+ * powers of 2, but having them as powers of 2 makes the math more optimal
+ * and makes the ramp-down stepping more even.
+ */
+
+/* The number of I/O chunks we try to break a parallel seqscan down into */
+#define PARALLEL_SEQSCAN_NCHUNKS			2048
+/* Ramp down size of allocations when we've only this number of chunks left */
+#define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS	64
+/* Cap the size of parallel I/O chunks to this number of blocks */
+#define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE		8192
 
 /* GUC variables */
 char	   *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
@@ -408,10 +422,37 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
  * to set the startblock once.
  */
 void
-table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_startblock_init(Relation rel,
+										 ParallelBlockTableScanWorker pbscanwork,
+										 ParallelBlockTableScanDesc pbscan)
 {
 	BlockNumber sync_startpage = InvalidBlockNumber;
 
+	/* Reset the state we use for controlling allocation size. */
+	memset(pbscanwork, 0, sizeof(*pbscanwork));
+
+	StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
+					 "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
+
+	/*
+	 * We determine the chunk size based on the size of the relation. First we
+	 * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks but we then
+	 * take the next highest power of 2 number of the chunk size.  This means
+	 * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
+	 * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
+	 */
+	pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
+													   PARALLEL_SEQSCAN_NCHUNKS, 1));
+
+	/*
+	 * Ensure we don't go over the maximum chunk size with larger tables. This
+	 * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
+	 * tables.  Too large a chunk size has been shown to be detrimental to
+	 * synchronous scan performance.
+	 */
+	pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
+									  PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
+
 retry:
 	/* Grab the spinlock. */
 	SpinLockAcquire(&pbscan->phs_mutex);
@@ -451,13 +492,40 @@ table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDes
  * backend gets an InvalidBlockNumber return.
  */
 BlockNumber
-table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_nextpage(Relation rel,
+								  ParallelBlockTableScanWorker pbscanwork,
+								  ParallelBlockTableScanDesc pbscan)
 {
 	BlockNumber page;
 	uint64		nallocated;
 
 	/*
-	 * phs_nallocated tracks how many pages have been allocated to workers
+	 * The logic below allocates block numbers out to parallel workers in a
+	 * way that each worker will receive a set of consecutive block numbers to
+	 * scan.  Earlier versions of this would allocate the next highest block
+	 * number to the next worker to call this function.  This would generally
+	 * result in workers never receiving consecutive block numbers.  Some
+	 * operating systems would not detect the sequential I/O pattern due to
+	 * each backend being a different process which could result in poor
+	 * performance due to inefficient or no readahead.  To work around this
+	 * issue, we now allocate a range of block numbers for each worker and
+	 * when they come back for another block, we give them the next one in
+	 * that range until the range is complete.  When the worker completes the
+	 * range of blocks we then allocate another range for it and return the
+	 * first block number from that range.
+	 *
+	 * Here we name these ranges of blocks "chunks".  The initial size of
+	 * these chunks is determined in table_block_parallelscan_startblock_init
+	 * based on the size of the relation.  Towards the end of the scan, we
+	 * start making reductions in the size of the chunks in order to attempt
+	 * to divide the remaining work over all the workers as evenly as
+	 * possible.
+	 *
+	 * Here pbscanwork is local worker memory.  phsw_chunk_remaining tracks
+	 * the number of blocks remaining in the chunk.  When that reaches 0 then
+	 * we must allocate a new chunk for the worker.
+	 *
+	 * phs_nallocated tracks how many blocks have been allocated to workers
 	 * already.  When phs_nallocated >= rs_nblocks, all blocks have been
 	 * allocated.
 	 *
@@ -468,10 +536,50 @@ table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbsca
 	 * wide because of that, to avoid wrapping around when rs_nblocks is close
 	 * to 2^32.
 	 *
-	 * The actual page to return is calculated by adding the counter to the
+	 * The actual block to return is calculated by adding the counter to the
 	 * starting block number, modulo nblocks.
 	 */
-	nallocated = pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, 1);
+
+	/*
+	 * First check if we have any remaining blocks in a previous chunk for
+	 * this worker.  We must consume all of the blocks from that before we
+	 * allocate a new chunk to the worker.
+	 */
+	if (pbscanwork->phsw_chunk_remaining > 0)
+	{
+		/*
+		 * Give them the next block in the range and update the remaining
+		 * number of blocks.
+		 */
+		nallocated = ++pbscanwork->phsw_nallocated;
+		pbscanwork->phsw_chunk_remaining--;
+	}
+	else
+	{
+		/*
+		 * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
+		 * remaining in the scan, we halve the chunk size.  Since we reduce
+		 * the chunk size here, we'll hit this again after doing
+		 * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size.  After a few
+		 * iterations of this, we'll end up doing the last few blocks with the
+		 * chunk size set to 1.
+		 */
+		if (pbscanwork->phsw_chunk_size > 1 &&
+			pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
+			(pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
+			pbscanwork->phsw_chunk_size >>= 1;
+
+		nallocated = pbscanwork->phsw_nallocated =
+			pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
+									pbscanwork->phsw_chunk_size);
+
+		/*
+		 * Set the remaining number of blocks in this chunk so that subsequent
+		 * calls from this worker continue on with this chunk until it's done.
+		 */
+		pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
+	}
+
 	if (nallocated >= pbscan->phs_nblocks)
 		page = InvalidBlockNumber;	/* all blocks have been allocated */
 	else
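
The effect of the ramp-down near the end of the scan can be seen with a small
single-worker simulation of the halving rule above.  This is a standalone
illustration, not patch code; the 100,000-block relation and 64-block starting
chunk are arbitrary example values (64 also happens to be the fixed chunk size
in Thomas Munro's initial patch).

#include <stdio.h>
#include <stdint.h>

#define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS	64

/*
 * Single-worker simulation of the ramp-down rule in
 * table_block_parallelscan_nextpage(): once the scan position passes
 * nblocks - chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS, the chunk
 * size is halved, repeatedly, until it reaches 1.
 */
int
main(void)
{
	uint64_t	nblocks = 100000;	/* illustrative relation size */
	uint64_t	nallocated = 0;
	uint32_t	chunk_size = 64;	/* illustrative starting chunk size */
	uint32_t	reported = 0;

	while (nallocated < nblocks)
	{
		if (chunk_size > 1 &&
			nallocated > nblocks -
			(uint64_t) chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS)
			chunk_size >>= 1;

		if (chunk_size != reported)
		{
			printf("from block %llu: chunk size %u\n",
				   (unsigned long long) nallocated, chunk_size);
			reported = chunk_size;
		}

		/*
		 * In the real code, block numbers past nblocks come back as
		 * InvalidBlockNumber; here we simply stop at the end.
		 */
		nallocated += chunk_size;
	}
	return 0;
}

In the committed code each worker keeps this state privately in
ParallelBlockTableScanWorkerData, so the ramp-down happens independently per
worker; only phs_nallocated is shared and updated atomically.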

src/include/access/relscan.h (+13 -1)
@@ -42,9 +42,9 @@ typedef struct TableScanDescData
 	 */
 	uint32		rs_flags;
 
+	void	   *rs_private;		/* per-worker private memory for AM to use */
 	struct ParallelTableScanDescData *rs_parallel;	/* parallel scan
 													 * information */
-
 } TableScanDescData;
 typedef struct TableScanDescData *TableScanDesc;
 
@@ -81,6 +81,18 @@ typedef struct ParallelBlockTableScanDescData
 } ParallelBlockTableScanDescData;
 typedef struct ParallelBlockTableScanDescData *ParallelBlockTableScanDesc;
 
+/*
+ * Per backend state for parallel table scan, for block-oriented storage.
+ */
+typedef struct ParallelBlockTableScanWorkerData
+{
+	uint64		phsw_nallocated;	/* Current # of blocks into the scan */
+	uint32		phsw_chunk_remaining;	/* # blocks left in this chunk */
+	uint32		phsw_chunk_size;	/* The number of blocks to allocate in
+									 * each I/O chunk for the scan */
+} ParallelBlockTableScanWorkerData;
+typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker;
+
 /*
  * Base class for fetches from a table via an index. This is the base-class
  * for such scans, which needs to be embedded in the respective struct for

src/include/access/tableam.h (+2)
@@ -1793,8 +1793,10 @@ extern Size table_block_parallelscan_initialize(Relation rel,
 extern void table_block_parallelscan_reinitialize(Relation rel,
 												  ParallelTableScanDesc pscan);
 extern BlockNumber table_block_parallelscan_nextpage(Relation rel,
+													 ParallelBlockTableScanWorker pbscanwork,
 													 ParallelBlockTableScanDesc pbscan);
 extern void table_block_parallelscan_startblock_init(Relation rel,
+													 ParallelBlockTableScanWorker pbscanwork,
 													 ParallelBlockTableScanDesc pbscan);
 
