From a7c7b95209c3538c7ab0d8a44a90170789d070bd Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Thu, 16 Jan 2025 15:35:03 -0800 Subject: [PATCH 1/4] Introduces table AM APIs for parallel table vacuuming. This commit introduces the following new table AM APIs for parallel table vacuuming: - parallel_vacuum_compute_workers - parallel_vacuum_estimate - parallel_vacuum_initialize - parallel_vacuum_initialize_worker - parallel_vacuum_collect_dead_items While parallel_vacuum_compute_workers is required, other new callbacks are optional. There is no code using these new APIs for now. Upcoming parallel vacuum patches utilize these APIs. Reviewed-by: Amit Kapila Reviewed-by: Hayato Kuroda Reviewed-by: Peter Smith Reviewed-by: Tomas Vondra Reviewed-by: Dilip Kumar Reviewed-by: Melanie Plageman Discussion: https://postgr.es/m/CAD21AoAEfCNv-GgaDheDJ+s-p_Lv1H24AiJeNoPGCmZNSwL1YA@mail.gmail.com --- src/backend/access/heap/heapam_handler.c | 4 +- src/backend/access/heap/vacuumlazy.c | 12 ++ src/backend/access/table/tableamapi.c | 11 ++ src/include/access/heapam.h | 2 + src/include/access/tableam.h | 140 +++++++++++++++++++++++ 5 files changed, 168 insertions(+), 1 deletion(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ac082fefa77a..aad419e46e8e 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2668,7 +2668,9 @@ static const TableAmRoutine heapam_methods = { .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple, .scan_sample_next_block = heapam_scan_sample_next_block, - .scan_sample_next_tuple = heapam_scan_sample_next_tuple + .scan_sample_next_tuple = heapam_scan_sample_next_tuple, + + .parallel_vacuum_compute_workers = heap_parallel_vacuum_compute_workers, }; diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index f28326bad095..8fd44ccf5dc7 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -3756,6 +3756,18 @@ update_relstats_all_indexes(LVRelState *vacrel) } } +/* + * Compute the number of workers for parallel heap vacuum. + * + * Return 0 to disable parallel vacuum. + */ +int +heap_parallel_vacuum_compute_workers(Relation rel, int nworkers_requested, + void *state) +{ + return 0; +} + /* * Error context callback for errors occurring during vacuum. The error * context messages for index phases should match the messages set in parallel diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 476663b66aad..c3ee9869e12b 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -81,6 +81,7 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->relation_copy_data != NULL); Assert(routine->relation_copy_for_cluster != NULL); Assert(routine->relation_vacuum != NULL); + Assert(routine->parallel_vacuum_compute_workers != NULL); Assert(routine->scan_analyze_next_block != NULL); Assert(routine->scan_analyze_next_tuple != NULL); Assert(routine->index_build_range_scan != NULL); @@ -94,6 +95,16 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->scan_sample_next_block != NULL); Assert(routine->scan_sample_next_tuple != NULL); + /* + * Callbacks for parallel vacuum are also optional (except for + * parallel_vacuum_compute_workers). But one callback implies presence of + * the others. 
+ */ + Assert(((((routine->parallel_vacuum_estimate == NULL) == + (routine->parallel_vacuum_initialize == NULL)) == + (routine->parallel_vacuum_initialize_worker == NULL)) == + (routine->parallel_vacuum_collect_dead_items == NULL))); + return routine; } diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index e48fe434cd39..4e794ba6a50f 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -399,6 +399,8 @@ extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer, struct VacuumParams; extern void heap_vacuum_rel(Relation rel, struct VacuumParams *params, BufferAccessStrategy bstrategy); +extern int heap_parallel_vacuum_compute_workers(Relation rel, int nworkers_requested, + void *state); /* in heap/heapam_visibility.c */ extern bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 8713e12cbfb9..4cecb9c92907 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -35,6 +35,9 @@ extern PGDLLIMPORT bool synchronize_seqscans; struct BulkInsertStateData; struct IndexInfo; +struct ParallelContext; +struct ParallelVacuumState; +struct ParallelWorkerContext; struct SampleScanState; struct VacuumParams; struct ValidateIndexState; @@ -648,6 +651,81 @@ typedef struct TableAmRoutine struct VacuumParams *params, BufferAccessStrategy bstrategy); + /* ------------------------------------------------------------------------ + * Callbacks for parallel table vacuum. + * ------------------------------------------------------------------------ + */ + + /* + * Compute the number of parallel workers for parallel table vacuum. The + * parallel degree for parallel vacuum is further limited by + * max_parallel_maintenance_workers. The function must return 0 to disable + * parallel table vacuum. + * + * 'nworkers_requested' is a >=0 number and the requested number of + * workers. This comes from the PARALLEL option. 0 means to choose the + * parallel degree based on the table AM specific factors such as table + * size. + */ + int (*parallel_vacuum_compute_workers) (Relation rel, + int nworkers_requested, + void *state); + + /* + * Estimate the size of shared memory needed for a parallel table vacuum + * of this relation. + * + * Not called if parallel table vacuum is disabled. + * + * Optional callback, but either all other parallel vacuum callbacks need + * to exist, or neither. + */ + void (*parallel_vacuum_estimate) (Relation rel, + struct ParallelContext *pcxt, + int nworkers, + void *state); + + /* + * Initialize DSM space for parallel table vacuum. + * + * Not called if parallel table vacuum is disabled. + * + * Optional callback, but either all other parallel vacuum callbacks need + * to exist, or neither. + */ + void (*parallel_vacuum_initialize) (Relation rel, + struct ParallelContext *pctx, + int nworkers, + void *state); + + /* + * Initialize AM-specific vacuum state for worker processes. + * + * The state_out is the output parameter so that arbitrary data can be + * passed to the subsequent callback, parallel_vacuum_remove_dead_items. + * + * Not called if parallel table vacuum is disabled. + * + * Optional callback, but either all other parallel vacuum callbacks need + * to exist, or neither. + */ + void (*parallel_vacuum_initialize_worker) (Relation rel, + struct ParallelVacuumState *pvs, + struct ParallelWorkerContext *pwcxt, + void **state_out); + + /* + * Execute a parallel scan to collect dead items. 
+ * + * Not called if parallel table vacuum is disabled. + * + * Optional callback, but either all other parallel vacuum callbacks need + * to exist, or neither. + */ + void (*parallel_vacuum_collect_dead_items) (Relation rel, + struct ParallelVacuumState *pvs, + void *state); + /* * Prepare to analyze block `blockno` of `scan`. The scan has been started * with table_beginscan_analyze(). See also @@ -1670,6 +1748,68 @@ table_relation_vacuum(Relation rel, struct VacuumParams *params, rel->rd_tableam->relation_vacuum(rel, params, bstrategy); } +/* ---------------------------------------------------------------------------- + * Parallel vacuum related functions. + * ---------------------------------------------------------------------------- + */ + +/* + * Compute the number of parallel workers for a parallel vacuum scan of this + * relation. + */ +static inline int +table_parallel_vacuum_compute_workers(Relation rel, int nworkers_requested, + void *state) +{ + return rel->rd_tableam->parallel_vacuum_compute_workers(rel, + nworkers_requested, + state); +} + +/* + * Estimate the size of shared memory needed for a parallel vacuum scan of this + * of this relation. + */ +static inline void +table_parallel_vacuum_estimate(Relation rel, struct ParallelContext *pcxt, + int nworkers, void *state) +{ + Assert(nworkers > 0); + rel->rd_tableam->parallel_vacuum_estimate(rel, pcxt, nworkers, state); +} + +/* + * Initialize shared memory area for a parallel vacuum scan of this relation. + */ +static inline void +table_parallel_vacuum_initialize(Relation rel, struct ParallelContext *pcxt, + int nworkers, void *state) +{ + Assert(nworkers > 0); + rel->rd_tableam->parallel_vacuum_initialize(rel, pcxt, nworkers, state); +} + +/* + * Initialize AM-specific vacuum state for worker processes. + */ +static inline void +table_parallel_vacuum_initialize_worker(Relation rel, struct ParallelVacuumState *pvs, + struct ParallelWorkerContext *pwcxt, + void **state_out) +{ + rel->rd_tableam->parallel_vacuum_initialize_worker(rel, pvs, pwcxt, state_out); +} + +/* + * Execute a parallel vacuum scan to collect dead items. + */ +static inline void +table_parallel_vacuum_collect_dead_items(Relation rel, struct ParallelVacuumState *pvs, + void *state) +{ + rel->rd_tableam->parallel_vacuum_collect_dead_items(rel, pvs, state); +} + /* * Prepare to analyze the next block in the read stream. The scan needs to * have been started with table_beginscan_analyze(). Note that this routine From b27c1df79b7b0bd72d41d0a8e3d0e11929f0bb06 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Tue, 18 Feb 2025 17:45:36 -0800 Subject: [PATCH 2/4] vacuumparallel.c: Support parallel vacuuming for tables to collect dead items. Previously, parallel vacuum was available only for index vacuuming and index cleanup, ParallelVacuumState was initialized only when the table has at least two indexes that are eligible for parallel index vacuuming and cleanup. This commit extends vacuumparallel.c to support parallel table vacuuming. parallel_vacuum_init() now initializes ParallelVacuumState to perform parallel heap scan to collect dead items, or paralel index vacuuming/cleanup, or both. During the initialization, it asks the table AM for the number of parallel workers required for parallel table vacuuming. If >0, it enables parallel table vacuuming and calls further table AM APIs such as parallel_vacuum_estimate. 
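As an illustration of the callback contract described above (a minimal sketch under assumptions, not code from this patch set): a table AM other than heap could size the worker count from the relation's length, honoring an explicit PARALLEL request and returning 0 for small tables to keep parallel table vacuum disabled. The 8192-blocks-per-worker divisor and the cap of 4 below are invented for the example; the heap AM in patch 1/4 simply returns 0 for now, and vacuumparallel.c further caps whatever the callback returns by max_parallel_maintenance_workers.

#include "postgres.h"

#include "storage/bufmgr.h"		/* RelationGetNumberOfBlocks() */
#include "utils/rel.h"			/* Relation */

/*
 * Hypothetical implementation of the parallel_vacuum_compute_workers
 * callback from patch 1/4.  The sizing policy is illustrative only.
 */
static int
example_parallel_vacuum_compute_workers(Relation rel, int nworkers_requested,
										void *state)
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);

	/* An explicit PARALLEL n request from the VACUUM command wins */
	if (nworkers_requested > 0)
		return nworkers_requested;

	/* Returning 0 disables parallel table vacuum for small tables */
	if (nblocks < 8192)
		return 0;

	/* Otherwise scale with table size: one worker per 8192 blocks, max 4 */
	return Min((int) (nblocks / 8192), 4);
}
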
For parallel table vacuuming, this commit introduces parallel_vacuum_collect_dead_items_begin() function, which can be used to collect dead items in the table (for example, the first pass over heap table in lazy vacuum for heap tables). Heap table AM disables the parallel heap vacuuming for now, but an upcoming patch uses it. Reviewed-by: Amit Kapila Reviewed-by: Hayato Kuroda Reviewed-by: Peter Smith Reviewed-by: Tomas Vondra Reviewed-by: Dilip Kumar Reviewed-by: Melanie Plageman Discussion: https://postgr.es/m/CAD21AoAEfCNv-GgaDheDJ+s-p_Lv1H24AiJeNoPGCmZNSwL1YA@mail.gmail.com --- src/backend/access/heap/vacuumlazy.c | 2 +- src/backend/commands/vacuumparallel.c | 392 +++++++++++++++++++------- src/include/commands/vacuum.h | 5 +- src/tools/pgindent/typedefs.list | 1 + 4 files changed, 292 insertions(+), 108 deletions(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 8fd44ccf5dc7..3b948970437a 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -3514,7 +3514,7 @@ dead_items_alloc(LVRelState *vacrel, int nworkers) vacrel->nindexes, nworkers, vac_work_mem, vacrel->verbose ? INFO : DEBUG2, - vacrel->bstrategy); + vacrel->bstrategy, (void *) vacrel); /* * If parallel mode started, dead_items and dead_items_info spaces are diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 2b9d548cdeb1..28997918f1ca 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -4,17 +4,18 @@ * Support routines for parallel vacuum execution. * * This file contains routines that are intended to support setting up, using, - * and tearing down a ParallelVacuumState. + * and tearing down a ParallelVacuumState. ParallelVacuumState contains shared + * information as well as the memory space for storing dead items allocated in + * the DSA area. We launch * - * In a parallel vacuum, we perform both index bulk deletion and index cleanup - * with parallel worker processes. Individual indexes are processed by one - * vacuum process. ParallelVacuumState contains shared information as well as - * the memory space for storing dead items allocated in the DSA area. We - * launch parallel worker processes at the start of parallel index - * bulk-deletion and index cleanup and once all indexes are processed, the - * parallel worker processes exit. Each time we process indexes in parallel, - * the parallel context is re-initialized so that the same DSM can be used for - * multiple passes of index bulk-deletion and index cleanup. + * In a parallel vacuum, we perform table scan, index bulk-deletion, index + * cleanup, or all of them with parallel worker processes depending on the + * number of parallel workers required for each phase. So different numbers of + * workers might be required for the table scanning and index processing. + * We launch parallel worker processes at the start of a phase, and once we + * complete all work in the phase, parallel workers exit. Each time we process + * table or indexes in parallel, the parallel context is re-initialized so that + * the same DSM can be used for multiple passes of each phase. 
* * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -26,8 +27,10 @@ */ #include "postgres.h" +#include "access/parallel.h" #include "access/amapi.h" #include "access/table.h" +#include "access/tableam.h" #include "access/xact.h" #include "commands/progress.h" #include "commands/vacuum.h" @@ -50,6 +53,13 @@ #define PARALLEL_VACUUM_KEY_WAL_USAGE 4 #define PARALLEL_VACUUM_KEY_INDEX_STATS 5 +/* The kind of parallel vacuum phases */ +typedef enum +{ + PV_WORK_PHASE_PROCESS_INDEXES, /* index vacuuming or cleanup */ + PV_WORK_PHASE_COLLECT_DEAD_ITEMS, /* collect dead tuples */ +} PVWorkPhase; + /* * Shared information among parallel workers. So this is allocated in the DSM * segment. @@ -65,6 +75,12 @@ typedef struct PVShared int elevel; uint64 queryid; + /* + * Tell parallel workers what phase to perform: processing indexes or + * collecting dead tuples from the table. + */ + PVWorkPhase work_phase; + /* * Fields for both index vacuum and cleanup. * @@ -164,6 +180,9 @@ struct ParallelVacuumState /* NULL for worker processes */ ParallelContext *pcxt; + /* Do we need to reinitialize parallel DSM? */ + bool need_reinitialize_dsm; + /* Parent Heap Relation */ Relation heaprel; @@ -178,7 +197,7 @@ struct ParallelVacuumState * Shared index statistics among parallel vacuum workers. The array * element is allocated for every index, even those indexes where parallel * index vacuuming is unsafe or not worthwhile (e.g., - * will_parallel_vacuum[] is false). During parallel vacuum, + * idx_will_parallel_vacuum[] is false). During parallel vacuum, * IndexBulkDeleteResult of each index is kept in DSM and is copied into * local memory at the end of parallel vacuum. */ @@ -193,12 +212,18 @@ struct ParallelVacuumState /* Points to WAL usage area in DSM */ WalUsage *wal_usage; + /* + * The number of workers for parallel table vacuuming. If 0, the parallel + * table vacuum is disabled. + */ + int nworkers_for_table; + /* * False if the index is totally unsuitable target for all parallel * processing. For example, the index could be < * min_parallel_index_scan_size cutoff. */ - bool *will_parallel_vacuum; + bool *idx_will_parallel_vacuum; /* * The number of indexes that support parallel index bulk-deletion and @@ -221,8 +246,10 @@ struct ParallelVacuumState PVIndVacStatus status; }; -static int parallel_vacuum_compute_workers(Relation *indrels, int nindexes, int nrequested, - bool *will_parallel_vacuum); +static int parallel_vacuum_compute_workers(Relation rel, Relation *indrels, int nindexes, + int nrequested, int *nworkers_for_table, + bool *idx_will_parallel_vacuum, + void *state); static void parallel_vacuum_process_all_indexes(ParallelVacuumState *pvs, int num_index_scans, bool vacuum); static void parallel_vacuum_process_safe_indexes(ParallelVacuumState *pvs); @@ -231,18 +258,25 @@ static void parallel_vacuum_process_one_index(ParallelVacuumState *pvs, Relation PVIndStats *indstats); static bool parallel_vacuum_index_is_parallel_safe(Relation indrel, int num_index_scans, bool vacuum); +static void parallel_vacuum_begin_work_phase(ParallelVacuumState *pvs, int nworkers, + PVWorkPhase work_phase); +static void parallel_vacuum_end_worke_phase(ParallelVacuumState *pvs); static void parallel_vacuum_error_callback(void *arg); /* * Try to enter parallel mode and create a parallel context. Then initialize * shared memory state. * + * nrequested_workers is the requested parallel degree. 
0 means that the parallel + * degrees for table and indexes vacuum are decided differently. See the comments + * of parallel_vacuum_compute_workers() for details. + * * On success, return parallel vacuum state. Otherwise return NULL. */ ParallelVacuumState * parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, int nrequested_workers, int vac_work_mem, - int elevel, BufferAccessStrategy bstrategy) + int elevel, BufferAccessStrategy bstrategy, void *state) { ParallelVacuumState *pvs; ParallelContext *pcxt; @@ -251,38 +285,38 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, PVIndStats *indstats; BufferUsage *buffer_usage; WalUsage *wal_usage; - bool *will_parallel_vacuum; + bool *idx_will_parallel_vacuum; Size est_indstats_len; Size est_shared_len; int nindexes_mwm = 0; int parallel_workers = 0; + int nworkers_for_table; int querylen; - /* - * A parallel vacuum must be requested and there must be indexes on the - * relation - */ + /* A parallel vacuum must be requested */ Assert(nrequested_workers >= 0); - Assert(nindexes > 0); /* * Compute the number of parallel vacuum workers to launch */ - will_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes); - parallel_workers = parallel_vacuum_compute_workers(indrels, nindexes, + idx_will_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes); + parallel_workers = parallel_vacuum_compute_workers(rel, indrels, nindexes, nrequested_workers, - will_parallel_vacuum); + &nworkers_for_table, + idx_will_parallel_vacuum, + state); + if (parallel_workers <= 0) { /* Can't perform vacuum in parallel -- return NULL */ - pfree(will_parallel_vacuum); + pfree(idx_will_parallel_vacuum); return NULL; } pvs = (ParallelVacuumState *) palloc0(sizeof(ParallelVacuumState)); pvs->indrels = indrels; pvs->nindexes = nindexes; - pvs->will_parallel_vacuum = will_parallel_vacuum; + pvs->idx_will_parallel_vacuum = idx_will_parallel_vacuum; pvs->bstrategy = bstrategy; pvs->heaprel = rel; @@ -291,6 +325,8 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, parallel_workers); Assert(pcxt->nworkers > 0); pvs->pcxt = pcxt; + pvs->need_reinitialize_dsm = false; + pvs->nworkers_for_table = nworkers_for_table; /* Estimate size for index vacuum stats -- PARALLEL_VACUUM_KEY_INDEX_STATS */ est_indstats_len = mul_size(sizeof(PVIndStats), nindexes); @@ -327,6 +363,10 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, else querylen = 0; /* keep compiler quiet */ + /* Estimate AM-specific space for parallel table vacuum */ + if (pvs->nworkers_for_table > 0) + table_parallel_vacuum_estimate(rel, pcxt, pvs->nworkers_for_table, state); + InitializeParallelDSM(pcxt); /* Prepare index vacuum stats */ @@ -345,7 +385,7 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0)); Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE); - if (!will_parallel_vacuum[i]) + if (!idx_will_parallel_vacuum[i]) continue; if (indrel->rd_indam->amusemaintenanceworkmem) @@ -419,6 +459,10 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery); } + /* Initialize AM-specific DSM space for parallel table vacuum */ + if (pvs->nworkers_for_table > 0) + table_parallel_vacuum_initialize(rel, pcxt, pvs->nworkers_for_table, state); + /* Success -- return parallel vacuum state */ return pvs; } @@ -456,7 +500,7 @@ parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats) 
DestroyParallelContext(pvs->pcxt); ExitParallelMode(); - pfree(pvs->will_parallel_vacuum); + pfree(pvs->idx_will_parallel_vacuum); pfree(pvs); } @@ -533,26 +577,35 @@ parallel_vacuum_cleanup_all_indexes(ParallelVacuumState *pvs, long num_table_tup } /* - * Compute the number of parallel worker processes to request. Both index - * vacuum and index cleanup can be executed with parallel workers. - * The index is eligible for parallel vacuum iff its size is greater than - * min_parallel_index_scan_size as invoking workers for very small indexes - * can hurt performance. + * Compute the number of parallel worker processes to request for table + * vacuum and index vacuum/cleanup. Return the maximum number of parallel + * workers for table vacuuming and index vacuuming. + * + * nrequested is the number of parallel workers that user requested, which + * applies to both the number of workers for table vacuum and index vacuum. + * If nrequested is 0, we compute the parallel degree for them differently + * as described below. * - * nrequested is the number of parallel workers that user requested. If - * nrequested is 0, we compute the parallel degree based on nindexes, that is - * the number of indexes that support parallel vacuum. This function also - * sets will_parallel_vacuum to remember indexes that participate in parallel - * vacuum. + * For parallel table vacuum, we ask AM-specific routine to compute the + * number of parallel worker processes. The result is set to nworkers_table_p. + * + * For parallel index vacuum, the index is eligible for parallel vacuum iff + * its size is greater than min_parallel_index_scan_size as invoking workers + * for very small indexes can hurt performance. This function sets + * idx_will_parallel_vacuum to remember indexes that participate in parallel vacuum. */ static int -parallel_vacuum_compute_workers(Relation *indrels, int nindexes, int nrequested, - bool *will_parallel_vacuum) +parallel_vacuum_compute_workers(Relation rel, Relation *indrels, int nindexes, + int nrequested, int *nworkers_table_p, + bool *idx_will_parallel_vacuum, void *state) { int nindexes_parallel = 0; int nindexes_parallel_bulkdel = 0; int nindexes_parallel_cleanup = 0; - int parallel_workers; + int nworkers_table = 0; + int nworkers_index = 0; + + *nworkers_table_p = 0; /* * We don't allow performing parallel operation in standalone backend or @@ -561,6 +614,13 @@ parallel_vacuum_compute_workers(Relation *indrels, int nindexes, int nrequested, if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0) return 0; + /* Compute the number of workers for parallel table scan */ + nworkers_table = table_parallel_vacuum_compute_workers(rel, nrequested, + state); + + /* Cap by max_parallel_maintenance_workers */ + nworkers_table = Min(nworkers_table, max_parallel_maintenance_workers); + /* * Compute the number of indexes that can participate in parallel vacuum. 
*/ @@ -574,7 +634,7 @@ parallel_vacuum_compute_workers(Relation *indrels, int nindexes, int nrequested, RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size) continue; - will_parallel_vacuum[i] = true; + idx_will_parallel_vacuum[i] = true; if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0) nindexes_parallel_bulkdel++; @@ -589,18 +649,18 @@ parallel_vacuum_compute_workers(Relation *indrels, int nindexes, int nrequested, /* The leader process takes one index */ nindexes_parallel--; - /* No index supports parallel vacuum */ - if (nindexes_parallel <= 0) - return 0; - - /* Compute the parallel degree */ - parallel_workers = (nrequested > 0) ? - Min(nrequested, nindexes_parallel) : nindexes_parallel; + if (nindexes_parallel > 0) + { + /* Take into account the requested number of workers */ + nworkers_index = (nrequested > 0) ? + Min(nrequested, nindexes_parallel) : nindexes_parallel; - /* Cap by max_parallel_maintenance_workers */ - parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers); + /* Cap by max_parallel_maintenance_workers */ + nworkers_index = Min(nworkers_index, max_parallel_maintenance_workers); + } - return parallel_workers; + *nworkers_table_p = nworkers_table; + return Max(nworkers_table, nworkers_index); } /* @@ -657,7 +717,7 @@ parallel_vacuum_process_all_indexes(ParallelVacuumState *pvs, int num_index_scan Assert(indstats->status == PARALLEL_INDVAC_STATUS_INITIAL); indstats->status = new_status; indstats->parallel_workers_can_process = - (pvs->will_parallel_vacuum[i] && + (pvs->idx_will_parallel_vacuum[i] && parallel_vacuum_index_is_parallel_safe(pvs->indrels[i], num_index_scans, vacuum)); @@ -669,40 +729,9 @@ parallel_vacuum_process_all_indexes(ParallelVacuumState *pvs, int num_index_scan /* Setup the shared cost-based vacuum delay and launch workers */ if (nworkers > 0) { - /* Reinitialize parallel context to relaunch parallel workers */ - if (num_index_scans > 0) - ReinitializeParallelDSM(pvs->pcxt); - - /* - * Set up shared cost balance and the number of active workers for - * vacuum delay. We need to do this before launching workers as - * otherwise, they might not see the updated values for these - * parameters. - */ - pg_atomic_write_u32(&(pvs->shared->cost_balance), VacuumCostBalance); - pg_atomic_write_u32(&(pvs->shared->active_nworkers), 0); - - /* - * The number of workers can vary between bulkdelete and cleanup - * phase. - */ - ReinitializeParallelWorkers(pvs->pcxt, nworkers); - - LaunchParallelWorkers(pvs->pcxt); - - if (pvs->pcxt->nworkers_launched > 0) - { - /* - * Reset the local cost values for leader backend as we have - * already accumulated the remaining balance of heap. - */ - VacuumCostBalance = 0; - VacuumCostBalanceLocal = 0; - - /* Enable shared cost balance for leader backend */ - VacuumSharedCostBalance = &(pvs->shared->cost_balance); - VacuumActiveNWorkers = &(pvs->shared->active_nworkers); - } + /* Start parallel vacuum workers for processing indexes */ + parallel_vacuum_begin_work_phase(pvs, nworkers, + PV_WORK_PHASE_PROCESS_INDEXES); if (vacuum) ereport(pvs->shared->elevel, @@ -732,13 +761,7 @@ parallel_vacuum_process_all_indexes(ParallelVacuumState *pvs, int num_index_scan * to finish, or we might get incomplete data.) 
*/ if (nworkers > 0) - { - /* Wait for all vacuum workers to finish */ - WaitForParallelWorkersToFinish(pvs->pcxt); - - for (int i = 0; i < pvs->pcxt->nworkers_launched; i++) - InstrAccumParallelQuery(&pvs->buffer_usage[i], &pvs->wal_usage[i]); - } + parallel_vacuum_end_worke_phase(pvs); /* * Reset all index status back to initial (while checking that we have @@ -755,15 +778,8 @@ parallel_vacuum_process_all_indexes(ParallelVacuumState *pvs, int num_index_scan indstats->status = PARALLEL_INDVAC_STATUS_INITIAL; } - /* - * Carry the shared balance value to heap scan and disable shared costing - */ - if (VacuumSharedCostBalance) - { - VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance); - VacuumSharedCostBalance = NULL; - VacuumActiveNWorkers = NULL; - } + /* Parallel DSM will need to be reinitialized for the next execution */ + pvs->need_reinitialize_dsm = true; } /* @@ -979,6 +995,77 @@ parallel_vacuum_index_is_parallel_safe(Relation indrel, int num_index_scans, return true; } +/* + * Begin the parallel scan to collect dead items. Return the number of + * launched parallel workers. + * + * The caller must call parallel_vacuum_collect_dead_items_end() to finish + * the parallel scan. + */ +int +parallel_vacuum_collect_dead_items_begin(ParallelVacuumState *pvs) +{ + Assert(!IsParallelWorker()); + + if (pvs->nworkers_for_table == 0) + return 0; + + /* Start parallel vacuum workers for collecting dead items */ + Assert(pvs->nworkers_for_table <= pvs->pcxt->nworkers); + parallel_vacuum_begin_work_phase(pvs, pvs->nworkers_for_table, + PV_WORK_PHASE_COLLECT_DEAD_ITEMS); + + /* Include the worker count for the leader itself */ + if (pvs->pcxt->nworkers_launched > 0) + pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1); + + return pvs->pcxt->nworkers_launched; +} + +/* + * Wait for all workers for parallel vacuum workers launched by + * parallel_vacuum_collect_dead_items_begin(), and gather workers' statistics. + */ +void +parallel_vacuum_collect_dead_items_end(ParallelVacuumState *pvs) +{ + Assert(!IsParallelWorker()); + Assert(pvs->shared->work_phase == PV_WORK_PHASE_COLLECT_DEAD_ITEMS); + + if (pvs->nworkers_for_table == 0) + return; + + /* Wait for parallel workers to finish */ + parallel_vacuum_end_worke_phase(pvs); + + /* Decrement the worker count for the leader itself */ + if (VacuumActiveNWorkers) + pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1); +} + +/* + * The function is for parallel workers to execute the parallel scan to + * collect dead tuples. + */ +static void +parallel_vacuum_process_table(ParallelVacuumState *pvs, void *state) +{ + Assert(VacuumActiveNWorkers); + Assert(pvs->shared->work_phase == PV_WORK_PHASE_COLLECT_DEAD_ITEMS); + + /* Increment the active worker before starting the table vacuum */ + pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1); + + /* Do the parallel scan to collect dead tuples */ + table_parallel_vacuum_collect_dead_items(pvs->heaprel, pvs, state); + + /* + * We have completed the table vacuum so decrement the active worker + * count. + */ + pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1); +} + /* * Perform work within a launched parallel process. * @@ -998,6 +1085,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) WalUsage *wal_usage; int nindexes; char *sharedquery; + void *state; ErrorContextCallback errcallback; /* @@ -1030,7 +1118,6 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) * matched to the leader's one. 
*/ vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels); - Assert(nindexes > 0); /* * Apply the desired value of maintenance_work_mem within this process. @@ -1076,6 +1163,17 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) pvs.bstrategy = GetAccessStrategyWithSize(BAS_VACUUM, shared->ring_nbuffers * (BLCKSZ / 1024)); + /* Initialize AM-specific vacuum state for parallel table vacuuming */ + if (shared->work_phase == PV_WORK_PHASE_COLLECT_DEAD_ITEMS) + { + ParallelWorkerContext pwcxt; + + pwcxt.toc = toc; + pwcxt.seg = seg; + table_parallel_vacuum_initialize_worker(rel, &pvs, &pwcxt, + &state); + } + /* Setup error traceback support for ereport() */ errcallback.callback = parallel_vacuum_error_callback; errcallback.arg = &pvs; @@ -1085,8 +1183,19 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) /* Prepare to track buffer usage during parallel execution */ InstrStartParallelQuery(); - /* Process indexes to perform vacuum/cleanup */ - parallel_vacuum_process_safe_indexes(&pvs); + switch (pvs.shared->work_phase) + { + case PV_WORK_PHASE_COLLECT_DEAD_ITEMS: + /* Scan the table to collect dead items */ + parallel_vacuum_process_table(&pvs, state); + break; + case PV_WORK_PHASE_PROCESS_INDEXES: + /* Process indexes to perform vacuum/cleanup */ + parallel_vacuum_process_safe_indexes(&pvs); + break; + default: + elog(ERROR, "unrecognized parallel vacuum phase %d", pvs.shared->work_phase); + } /* Report buffer/WAL usage during parallel execution */ buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); @@ -1109,6 +1218,77 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) FreeAccessStrategy(pvs.bstrategy); } +/* + * Launch parallel vacuum workers for the given phase. If at least one + * worker launched, enable the shared vacuum delay costing. + */ +static void +parallel_vacuum_begin_work_phase(ParallelVacuumState *pvs, int nworkers, + PVWorkPhase work_phase) +{ + /* Set the work phase */ + pvs->shared->work_phase = work_phase; + + /* Reinitialize parallel context to relaunch parallel workers */ + if (pvs->need_reinitialize_dsm) + ReinitializeParallelDSM(pvs->pcxt); + + /* + * Set up shared cost balance and the number of active workers for vacuum + * delay. We need to do this before launching workers as otherwise, they + * might not see the updated values for these parameters. + */ + pg_atomic_write_u32(&(pvs->shared->cost_balance), VacuumCostBalance); + pg_atomic_write_u32(&(pvs->shared->active_nworkers), 0); + + /* + * The number of workers can vary between bulkdelete and cleanup phase. + */ + ReinitializeParallelWorkers(pvs->pcxt, nworkers); + + LaunchParallelWorkers(pvs->pcxt); + + /* Enable shared vacuum costing if we are able to launch any worker */ + if (pvs->pcxt->nworkers_launched > 0) + { + /* + * Reset the local cost values for leader backend as we have already + * accumulated the remaining balance of heap. + */ + VacuumCostBalance = 0; + VacuumCostBalanceLocal = 0; + + /* Enable shared cost balance for leader backend */ + VacuumSharedCostBalance = &(pvs->shared->cost_balance); + VacuumActiveNWorkers = &(pvs->shared->active_nworkers); + } +} + +/* + * Wait for parallel vacuum workers to finish, accumulate the statistics, + * and disable shared vacuum delay costing if enabled. 
+ */ +static void +parallel_vacuum_end_worke_phase(ParallelVacuumState *pvs) +{ + /* Wait for all vacuum workers to finish */ + WaitForParallelWorkersToFinish(pvs->pcxt); + + for (int i = 0; i < pvs->pcxt->nworkers_launched; i++) + InstrAccumParallelQuery(&pvs->buffer_usage[i], &pvs->wal_usage[i]); + + /* Carry the shared balance value and disable shared costing */ + if (VacuumSharedCostBalance) + { + VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance); + VacuumSharedCostBalance = NULL; + VacuumActiveNWorkers = NULL; + } + + /* Parallel DSM will need to be reinitialized for the next execution */ + pvs->need_reinitialize_dsm = true; +} + /* * Error context callback for errors occurring during parallel index vacuum. * The error context messages should match the messages set in the lazy vacuum diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index bc37a80dc74f..e785a4a583f2 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -382,7 +382,8 @@ extern void VacuumUpdateCosts(void); extern ParallelVacuumState *parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, int nrequested_workers, int vac_work_mem, int elevel, - BufferAccessStrategy bstrategy); + BufferAccessStrategy bstrategy, + void *state); extern void parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats); extern TidStore *parallel_vacuum_get_dead_items(ParallelVacuumState *pvs, VacDeadItemsInfo **dead_items_info_p); @@ -394,6 +395,8 @@ extern void parallel_vacuum_cleanup_all_indexes(ParallelVacuumState *pvs, long num_table_tuples, int num_index_scans, bool estimated_count); +extern int parallel_vacuum_collect_dead_items_begin(ParallelVacuumState *pvs); +extern void parallel_vacuum_collect_dead_items_end(ParallelVacuumState *pvs); extern void parallel_vacuum_main(dsm_segment *seg, shm_toc *toc); /* in commands/analyze.c */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index e5879e00dffe..faa256ec56c9 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2031,6 +2031,7 @@ PVIndStats PVIndVacStatus PVOID PVShared +PVWorkPhase PX_Alias PX_Cipher PX_Combo From fa57490913e292740d0f83b8144daa29410ef376 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Wed, 26 Feb 2025 11:31:55 -0800 Subject: [PATCH 3/4] Move lazy heap scan related variables to new struct LVScanData. This is a pure refactoring for upcoming parallel heap scan, which requires storing relation statistics and relation data such as extant oldest XID/MXID collected during lazy heap scan to a shared memory area. Reviewed-by: Amit Kapila Reviewed-by: Hayato Kuroda Reviewed-by: Peter Smith Reviewed-by: Tomas Vondra Reviewed-by: Dilip Kumar Reviewed-by: Melanie Plageman Discussion: https://postgr.es/m/CAD21AoAEfCNv-GgaDheDJ+s-p_Lv1H24AiJeNoPGCmZNSwL1YA@mail.gmail.com --- src/backend/access/heap/vacuumlazy.c | 343 ++++++++++++++------------- src/tools/pgindent/typedefs.list | 1 + 2 files changed, 181 insertions(+), 163 deletions(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 3b948970437a..aebc7c91379b 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -256,6 +256,56 @@ typedef enum #define VAC_BLK_WAS_EAGER_SCANNED (1 << 0) #define VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM (1 << 1) +/* + * Data and counters updated during lazy heap scan. 
+ */ +typedef struct LVScanData +{ + BlockNumber rel_pages; /* total number of pages */ + + BlockNumber scanned_pages; /* # pages examined (not skipped via VM) */ + + /* + * Count of all-visible blocks eagerly scanned (for logging only). This + * does not include skippable blocks scanned due to SKIP_PAGES_THRESHOLD. + */ + BlockNumber eager_scanned_pages; + + BlockNumber removed_pages; /* # pages removed by relation truncation */ + BlockNumber new_frozen_tuple_pages; /* # pages with newly frozen tuples */ + + /* # pages newly set all-visible in the VM */ + BlockNumber vm_new_visible_pages; + + /* + * # pages newly set all-visible and all-frozen in the VM. This is a + * subset of vm_new_visible_pages. That is, vm_new_visible_pages includes + * all pages set all-visible, but vm_new_visible_frozen_pages includes + * only those which were also set all-frozen. + */ + BlockNumber vm_new_visible_frozen_pages; + + /* # all-visible pages newly set all-frozen in the VM */ + BlockNumber vm_new_frozen_pages; + + BlockNumber lpdead_item_pages; /* # pages with LP_DEAD items */ + BlockNumber missed_dead_pages; /* # pages with missed dead tuples */ + BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ + + /* Counters that follow are only for scanned_pages */ + int64 tuples_deleted; /* # deleted from table */ + int64 tuples_frozen; /* # newly frozen */ + int64 lpdead_items; /* # deleted from indexes */ + int64 live_tuples; /* # live tuples remaining */ + int64 recently_dead_tuples; /* # dead, but not yet removable */ + int64 missed_dead_tuples; /* # removable, but not removed */ + + /* Tracks oldest extant XID/MXID for setting relfrozenxid/relminmxid. */ + TransactionId NewRelfrozenXid; + MultiXactId NewRelminMxid; + bool skippedallvis; +} LVScanData; + typedef struct LVRelState { /* Target heap relation and its indexes */ @@ -282,10 +332,6 @@ typedef struct LVRelState /* VACUUM operation's cutoffs for freezing and pruning */ struct VacuumCutoffs cutoffs; GlobalVisState *vistest; - /* Tracks oldest extant XID/MXID for setting relfrozenxid/relminmxid */ - TransactionId NewRelfrozenXid; - MultiXactId NewRelminMxid; - bool skippedallvis; /* Error reporting state */ char *dbname; @@ -310,35 +356,8 @@ typedef struct LVRelState TidStore *dead_items; /* TIDs whose index tuples we'll delete */ VacDeadItemsInfo *dead_items_info; - BlockNumber rel_pages; /* total number of pages */ - BlockNumber scanned_pages; /* # pages examined (not skipped via VM) */ - - /* - * Count of all-visible blocks eagerly scanned (for logging only). This - * does not include skippable blocks scanned due to SKIP_PAGES_THRESHOLD. - */ - BlockNumber eager_scanned_pages; - - BlockNumber removed_pages; /* # pages removed by relation truncation */ - BlockNumber new_frozen_tuple_pages; /* # pages with newly frozen tuples */ - - /* # pages newly set all-visible in the VM */ - BlockNumber vm_new_visible_pages; - - /* - * # pages newly set all-visible and all-frozen in the VM. This is a - * subset of vm_new_visible_pages. That is, vm_new_visible_pages includes - * all pages set all-visible, but vm_new_visible_frozen_pages includes - * only those which were also set all-frozen. 
- */ - BlockNumber vm_new_visible_frozen_pages; - - /* # all-visible pages newly set all-frozen in the VM */ - BlockNumber vm_new_frozen_pages; - - BlockNumber lpdead_item_pages; /* # pages with LP_DEAD items */ - BlockNumber missed_dead_pages; /* # pages with missed dead tuples */ - BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ + /* Data and counters updated during lazy heap scan */ + LVScanData *scan_data; /* Statistics output by us, for table */ double new_rel_tuples; /* new estimated total # of tuples */ @@ -348,13 +367,6 @@ typedef struct LVRelState /* Instrumentation counters */ int num_index_scans; - /* Counters that follow are only for scanned_pages */ - int64 tuples_deleted; /* # deleted from table */ - int64 tuples_frozen; /* # newly frozen */ - int64 lpdead_items; /* # deleted from indexes */ - int64 live_tuples; /* # live tuples remaining */ - int64 recently_dead_tuples; /* # dead, but not yet removable */ - int64 missed_dead_tuples; /* # removable, but not removed */ /* State maintained by heap_vac_scan_next_block() */ BlockNumber current_block; /* last block returned */ @@ -524,7 +536,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params) * the first region, making the second region the first to be eager * scanned normally. */ - if (vacrel->rel_pages < 2 * EAGER_SCAN_REGION_SIZE) + if (vacrel->scan_data->rel_pages < 2 * EAGER_SCAN_REGION_SIZE) return; /* @@ -616,6 +628,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy) { LVRelState *vacrel; + LVScanData *scan_data; bool verbose, instrument, skipwithvm, @@ -730,14 +743,25 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, } /* Initialize page counters explicitly (be tidy) */ - vacrel->scanned_pages = 0; - vacrel->eager_scanned_pages = 0; - vacrel->removed_pages = 0; - vacrel->new_frozen_tuple_pages = 0; - vacrel->lpdead_item_pages = 0; - vacrel->missed_dead_pages = 0; - vacrel->nonempty_pages = 0; - /* dead_items_alloc allocates vacrel->dead_items later on */ + scan_data = palloc(sizeof(LVScanData)); + scan_data->scanned_pages = 0; + scan_data->eager_scanned_pages = 0; + scan_data->removed_pages = 0; + scan_data->new_frozen_tuple_pages = 0; + scan_data->lpdead_item_pages = 0; + scan_data->missed_dead_pages = 0; + scan_data->nonempty_pages = 0; + scan_data->tuples_deleted = 0; + scan_data->tuples_frozen = 0; + scan_data->lpdead_items = 0; + scan_data->live_tuples = 0; + scan_data->recently_dead_tuples = 0; + scan_data->missed_dead_tuples = 0; + scan_data->vm_new_visible_pages = 0; + scan_data->vm_new_visible_frozen_pages = 0; + scan_data->vm_new_frozen_pages = 0; + scan_data->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel); + vacrel->scan_data = scan_data; /* Allocate/initialize output statistics state */ vacrel->new_rel_tuples = 0; @@ -747,17 +771,8 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, /* Initialize remaining counters (be tidy) */ vacrel->num_index_scans = 0; - vacrel->tuples_deleted = 0; - vacrel->tuples_frozen = 0; - vacrel->lpdead_items = 0; - vacrel->live_tuples = 0; - vacrel->recently_dead_tuples = 0; - vacrel->missed_dead_tuples = 0; - - vacrel->vm_new_visible_pages = 0; - vacrel->vm_new_visible_frozen_pages = 0; - vacrel->vm_new_frozen_pages = 0; - vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel); + + /* dead_items_alloc allocates vacrel->dead_items later on */ /* * Get cutoffs that determine which deleted tuples are considered DEAD, @@ -778,15 +793,15 @@ heap_vacuum_rel(Relation rel, 
VacuumParams *params, vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs); vacrel->vistest = GlobalVisTestFor(rel); /* Initialize state used to track oldest extant XID/MXID */ - vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin; - vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact; + vacrel->scan_data->NewRelfrozenXid = vacrel->cutoffs.OldestXmin; + vacrel->scan_data->NewRelminMxid = vacrel->cutoffs.OldestMxact; /* * Initialize state related to tracking all-visible page skipping. This is * very important to determine whether or not it is safe to advance the * relfrozenxid/relminmxid. */ - vacrel->skippedallvis = false; + vacrel->scan_data->skippedallvis = false; skipwithvm = true; if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) { @@ -874,15 +889,15 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * value >= FreezeLimit, and relminmxid to a value >= MultiXactCutoff. * Non-aggressive VACUUMs may advance them by any amount, or not at all. */ - Assert(vacrel->NewRelfrozenXid == vacrel->cutoffs.OldestXmin || + Assert(vacrel->scan_data->NewRelfrozenXid == vacrel->cutoffs.OldestXmin || TransactionIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.FreezeLimit : vacrel->cutoffs.relfrozenxid, - vacrel->NewRelfrozenXid)); - Assert(vacrel->NewRelminMxid == vacrel->cutoffs.OldestMxact || + vacrel->scan_data->NewRelfrozenXid)); + Assert(vacrel->scan_data->NewRelminMxid == vacrel->cutoffs.OldestMxact || MultiXactIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.MultiXactCutoff : vacrel->cutoffs.relminmxid, - vacrel->NewRelminMxid)); - if (vacrel->skippedallvis) + vacrel->scan_data->NewRelminMxid)); + if (vacrel->scan_data->skippedallvis) { /* * Must keep original relfrozenxid in a non-aggressive VACUUM that @@ -890,15 +905,16 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * values will have missed unfrozen XIDs from the pages we skipped. 
*/ Assert(!vacrel->aggressive); - vacrel->NewRelfrozenXid = InvalidTransactionId; - vacrel->NewRelminMxid = InvalidMultiXactId; + vacrel->scan_data->NewRelfrozenXid = InvalidTransactionId; + vacrel->scan_data->NewRelminMxid = InvalidMultiXactId; } /* * For safety, clamp relallvisible to be not more than what we're setting * pg_class.relpages to */ - new_rel_pages = vacrel->rel_pages; /* After possible rel truncation */ + new_rel_pages = vacrel->scan_data->rel_pages; /* After possible rel + * truncation */ visibilitymap_count(rel, &new_rel_allvisible, &new_rel_allfrozen); if (new_rel_allvisible > new_rel_pages) new_rel_allvisible = new_rel_pages; @@ -921,7 +937,8 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vac_update_relstats(rel, new_rel_pages, vacrel->new_live_tuples, new_rel_allvisible, new_rel_allfrozen, vacrel->nindexes > 0, - vacrel->NewRelfrozenXid, vacrel->NewRelminMxid, + vacrel->scan_data->NewRelfrozenXid, + vacrel->scan_data->NewRelminMxid, &frozenxid_updated, &minmulti_updated, false); /* @@ -937,8 +954,8 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, pgstat_report_vacuum(RelationGetRelid(rel), rel->rd_rel->relisshared, Max(vacrel->new_live_tuples, 0), - vacrel->recently_dead_tuples + - vacrel->missed_dead_tuples, + vacrel->scan_data->recently_dead_tuples + + vacrel->scan_data->missed_dead_tuples, starttime); pgstat_progress_end_command(); @@ -1012,23 +1029,23 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->relname, vacrel->num_index_scans); appendStringInfo(&buf, _("pages: %u removed, %u remain, %u scanned (%.2f%% of total), %u eagerly scanned\n"), - vacrel->removed_pages, + vacrel->scan_data->removed_pages, new_rel_pages, - vacrel->scanned_pages, + vacrel->scan_data->scanned_pages, orig_rel_pages == 0 ? 
100.0 : - 100.0 * vacrel->scanned_pages / + 100.0 * vacrel->scan_data->scanned_pages / orig_rel_pages, - vacrel->eager_scanned_pages); + vacrel->scan_data->eager_scanned_pages); appendStringInfo(&buf, _("tuples: %" PRId64 " removed, %" PRId64 " remain, %" PRId64 " are dead but not yet removable\n"), - vacrel->tuples_deleted, + vacrel->scan_data->tuples_deleted, (int64) vacrel->new_rel_tuples, - vacrel->recently_dead_tuples); - if (vacrel->missed_dead_tuples > 0) + vacrel->scan_data->recently_dead_tuples); + if (vacrel->scan_data->missed_dead_tuples > 0) appendStringInfo(&buf, _("tuples missed: %" PRId64 " dead from %u pages not removed due to cleanup lock contention\n"), - vacrel->missed_dead_tuples, - vacrel->missed_dead_pages); + vacrel->scan_data->missed_dead_tuples, + vacrel->scan_data->missed_dead_pages); diff = (int32) (ReadNextTransactionId() - vacrel->cutoffs.OldestXmin); appendStringInfo(&buf, @@ -1036,33 +1053,33 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->cutoffs.OldestXmin, diff); if (frozenxid_updated) { - diff = (int32) (vacrel->NewRelfrozenXid - + diff = (int32) (vacrel->scan_data->NewRelfrozenXid - vacrel->cutoffs.relfrozenxid); appendStringInfo(&buf, _("new relfrozenxid: %u, which is %d XIDs ahead of previous value\n"), - vacrel->NewRelfrozenXid, diff); + vacrel->scan_data->NewRelfrozenXid, diff); } if (minmulti_updated) { - diff = (int32) (vacrel->NewRelminMxid - + diff = (int32) (vacrel->scan_data->NewRelminMxid - vacrel->cutoffs.relminmxid); appendStringInfo(&buf, _("new relminmxid: %u, which is %d MXIDs ahead of previous value\n"), - vacrel->NewRelminMxid, diff); + vacrel->scan_data->NewRelminMxid, diff); } appendStringInfo(&buf, _("frozen: %u pages from table (%.2f%% of total) had %" PRId64 " tuples frozen\n"), - vacrel->new_frozen_tuple_pages, + vacrel->scan_data->new_frozen_tuple_pages, orig_rel_pages == 0 ? 100.0 : - 100.0 * vacrel->new_frozen_tuple_pages / + 100.0 * vacrel->scan_data->new_frozen_tuple_pages / orig_rel_pages, - vacrel->tuples_frozen); + vacrel->scan_data->tuples_frozen); appendStringInfo(&buf, _("visibility map: %u pages set all-visible, %u pages set all-frozen (%u were all-visible)\n"), - vacrel->vm_new_visible_pages, - vacrel->vm_new_visible_frozen_pages + - vacrel->vm_new_frozen_pages, - vacrel->vm_new_frozen_pages); + vacrel->scan_data->vm_new_visible_pages, + vacrel->scan_data->vm_new_visible_frozen_pages + + vacrel->scan_data->vm_new_frozen_pages, + vacrel->scan_data->vm_new_frozen_pages); if (vacrel->do_index_vacuuming) { if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0) @@ -1082,10 +1099,10 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, msgfmt = _("%u pages from table (%.2f%% of total) have %" PRId64 " dead item identifiers\n"); } appendStringInfo(&buf, msgfmt, - vacrel->lpdead_item_pages, + vacrel->scan_data->lpdead_item_pages, orig_rel_pages == 0 ? 
100.0 : - 100.0 * vacrel->lpdead_item_pages / orig_rel_pages, - vacrel->lpdead_items); + 100.0 * vacrel->scan_data->lpdead_item_pages / orig_rel_pages, + vacrel->scan_data->lpdead_items); for (int i = 0; i < vacrel->nindexes; i++) { IndexBulkDeleteResult *istat = vacrel->indstats[i]; @@ -1199,7 +1216,7 @@ static void lazy_scan_heap(LVRelState *vacrel) { ReadStream *stream; - BlockNumber rel_pages = vacrel->rel_pages, + BlockNumber rel_pages = vacrel->scan_data->rel_pages, blkno = 0, next_fsm_block_to_vacuum = 0; BlockNumber orig_eager_scan_success_limit = @@ -1260,8 +1277,8 @@ lazy_scan_heap(LVRelState *vacrel) * one-pass strategy, and the two-pass strategy with the index_cleanup * param set to 'off'. */ - if (vacrel->scanned_pages > 0 && - vacrel->scanned_pages % FAILSAFE_EVERY_PAGES == 0) + if (vacrel->scan_data->scanned_pages > 0 && + vacrel->scan_data->scanned_pages % FAILSAFE_EVERY_PAGES == 0) lazy_check_wraparound_failsafe(vacrel); /* @@ -1316,9 +1333,9 @@ lazy_scan_heap(LVRelState *vacrel) page = BufferGetPage(buf); blkno = BufferGetBlockNumber(buf); - vacrel->scanned_pages++; + vacrel->scan_data->scanned_pages++; if (blk_info & VAC_BLK_WAS_EAGER_SCANNED) - vacrel->eager_scanned_pages++; + vacrel->scan_data->eager_scanned_pages++; /* Report as block scanned, update error traceback information */ pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); @@ -1500,16 +1517,16 @@ lazy_scan_heap(LVRelState *vacrel) /* now we can compute the new value for pg_class.reltuples */ vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, rel_pages, - vacrel->scanned_pages, - vacrel->live_tuples); + vacrel->scan_data->scanned_pages, + vacrel->scan_data->live_tuples); /* * Also compute the total number of surviving heap entries. In the * (unlikely) scenario that new_live_tuples is -1, take it as zero. */ vacrel->new_rel_tuples = - Max(vacrel->new_live_tuples, 0) + vacrel->recently_dead_tuples + - vacrel->missed_dead_tuples; + Max(vacrel->new_live_tuples, 0) + vacrel->scan_data->recently_dead_tuples + + vacrel->scan_data->missed_dead_tuples; read_stream_end(stream); @@ -1556,7 +1573,7 @@ lazy_scan_heap(LVRelState *vacrel) * callback_private_data contains a reference to the LVRelState, passed to the * read stream API during stream setup. The LVRelState is an in/out parameter * here (locally named `vacrel`). Vacuum options and information about the - * relation are read from it. vacrel->skippedallvis is set if we skip a block + * relation are read from it. vacrel->scan_data->skippedallvis is set if we skip a block * that's all-visible but not all-frozen (to ensure that we don't update * relfrozenxid in that case). vacrel also holds information about the next * unskippable block -- as bookkeeping for this function. @@ -1574,7 +1591,7 @@ heap_vac_scan_next_block(ReadStream *stream, next_block = vacrel->current_block + 1; /* Have we reached the end of the relation? 
*/ - if (next_block >= vacrel->rel_pages) + if (next_block >= vacrel->scan_data->rel_pages) { if (BufferIsValid(vacrel->next_unskippable_vmbuffer)) { @@ -1618,7 +1635,7 @@ heap_vac_scan_next_block(ReadStream *stream, { next_block = vacrel->next_unskippable_block; if (skipsallvis) - vacrel->skippedallvis = true; + vacrel->scan_data->skippedallvis = true; } } @@ -1669,7 +1686,7 @@ heap_vac_scan_next_block(ReadStream *stream, static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis) { - BlockNumber rel_pages = vacrel->rel_pages; + BlockNumber rel_pages = vacrel->scan_data->rel_pages; BlockNumber next_unskippable_block = vacrel->next_unskippable_block + 1; Buffer next_unskippable_vmbuffer = vacrel->next_unskippable_vmbuffer; bool next_unskippable_eager_scanned = false; @@ -1900,11 +1917,11 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, */ if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0) { - vacrel->vm_new_visible_pages++; - vacrel->vm_new_visible_frozen_pages++; + vacrel->scan_data->vm_new_visible_pages++; + vacrel->scan_data->vm_new_visible_frozen_pages++; } else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0) - vacrel->vm_new_frozen_pages++; + vacrel->scan_data->vm_new_frozen_pages++; } freespace = PageGetHeapFreeSpace(page); @@ -1979,10 +1996,10 @@ lazy_scan_prune(LVRelState *vacrel, heap_page_prune_and_freeze(rel, buf, vacrel->vistest, prune_options, &vacrel->cutoffs, &presult, PRUNE_VACUUM_SCAN, &vacrel->offnum, - &vacrel->NewRelfrozenXid, &vacrel->NewRelminMxid); + &vacrel->scan_data->NewRelfrozenXid, &vacrel->scan_data->NewRelminMxid); - Assert(MultiXactIdIsValid(vacrel->NewRelminMxid)); - Assert(TransactionIdIsValid(vacrel->NewRelfrozenXid)); + Assert(MultiXactIdIsValid(vacrel->scan_data->NewRelminMxid)); + Assert(TransactionIdIsValid(vacrel->scan_data->NewRelfrozenXid)); if (presult.nfrozen > 0) { @@ -1992,7 +2009,7 @@ lazy_scan_prune(LVRelState *vacrel, * frozen tuples (don't confuse that with pages newly set all-frozen * in VM). */ - vacrel->new_frozen_tuple_pages++; + vacrel->scan_data->new_frozen_tuple_pages++; } /* @@ -2027,7 +2044,7 @@ lazy_scan_prune(LVRelState *vacrel, */ if (presult.lpdead_items > 0) { - vacrel->lpdead_item_pages++; + vacrel->scan_data->lpdead_item_pages++; /* * deadoffsets are collected incrementally in @@ -2042,15 +2059,15 @@ lazy_scan_prune(LVRelState *vacrel, } /* Finally, add page-local counts to whole-VACUUM counts */ - vacrel->tuples_deleted += presult.ndeleted; - vacrel->tuples_frozen += presult.nfrozen; - vacrel->lpdead_items += presult.lpdead_items; - vacrel->live_tuples += presult.live_tuples; - vacrel->recently_dead_tuples += presult.recently_dead_tuples; + vacrel->scan_data->tuples_deleted += presult.ndeleted; + vacrel->scan_data->tuples_frozen += presult.nfrozen; + vacrel->scan_data->lpdead_items += presult.lpdead_items; + vacrel->scan_data->live_tuples += presult.live_tuples; + vacrel->scan_data->recently_dead_tuples += presult.recently_dead_tuples; /* Can't truncate this page */ if (presult.hastup) - vacrel->nonempty_pages = blkno + 1; + vacrel->scan_data->nonempty_pages = blkno + 1; /* Did we find LP_DEAD items? 
*/ *has_lpdead_items = (presult.lpdead_items > 0); @@ -2099,17 +2116,17 @@ lazy_scan_prune(LVRelState *vacrel, */ if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0) { - vacrel->vm_new_visible_pages++; + vacrel->scan_data->vm_new_visible_pages++; if (presult.all_frozen) { - vacrel->vm_new_visible_frozen_pages++; + vacrel->scan_data->vm_new_visible_frozen_pages++; *vm_page_frozen = true; } } else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 && presult.all_frozen) { - vacrel->vm_new_frozen_pages++; + vacrel->scan_data->vm_new_frozen_pages++; *vm_page_frozen = true; } } @@ -2197,8 +2214,8 @@ lazy_scan_prune(LVRelState *vacrel, */ if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0) { - vacrel->vm_new_visible_pages++; - vacrel->vm_new_visible_frozen_pages++; + vacrel->scan_data->vm_new_visible_pages++; + vacrel->scan_data->vm_new_visible_frozen_pages++; *vm_page_frozen = true; } @@ -2208,7 +2225,7 @@ lazy_scan_prune(LVRelState *vacrel, */ else { - vacrel->vm_new_frozen_pages++; + vacrel->scan_data->vm_new_frozen_pages++; *vm_page_frozen = true; } } @@ -2249,8 +2266,8 @@ lazy_scan_noprune(LVRelState *vacrel, missed_dead_tuples; bool hastup; HeapTupleHeader tupleheader; - TransactionId NoFreezePageRelfrozenXid = vacrel->NewRelfrozenXid; - MultiXactId NoFreezePageRelminMxid = vacrel->NewRelminMxid; + TransactionId NoFreezePageRelfrozenXid = vacrel->scan_data->NewRelfrozenXid; + MultiXactId NoFreezePageRelminMxid = vacrel->scan_data->NewRelminMxid; OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; Assert(BufferGetBlockNumber(buf) == blkno); @@ -2377,8 +2394,8 @@ lazy_scan_noprune(LVRelState *vacrel, * this particular page until the next VACUUM. Remember its details now. * (lazy_scan_prune expects a clean slate, so we have to do this last.) */ - vacrel->NewRelfrozenXid = NoFreezePageRelfrozenXid; - vacrel->NewRelminMxid = NoFreezePageRelminMxid; + vacrel->scan_data->NewRelfrozenXid = NoFreezePageRelfrozenXid; + vacrel->scan_data->NewRelminMxid = NoFreezePageRelminMxid; /* Save any LP_DEAD items found on the page in dead_items */ if (vacrel->nindexes == 0) @@ -2405,25 +2422,25 @@ lazy_scan_noprune(LVRelState *vacrel, * indexes will be deleted during index vacuuming (and then marked * LP_UNUSED in the heap) */ - vacrel->lpdead_item_pages++; + vacrel->scan_data->lpdead_item_pages++; dead_items_add(vacrel, blkno, deadoffsets, lpdead_items); - vacrel->lpdead_items += lpdead_items; + vacrel->scan_data->lpdead_items += lpdead_items; } /* * Finally, add relevant page-local counts to whole-VACUUM counts */ - vacrel->live_tuples += live_tuples; - vacrel->recently_dead_tuples += recently_dead_tuples; - vacrel->missed_dead_tuples += missed_dead_tuples; + vacrel->scan_data->live_tuples += live_tuples; + vacrel->scan_data->recently_dead_tuples += recently_dead_tuples; + vacrel->scan_data->missed_dead_tuples += missed_dead_tuples; if (missed_dead_tuples > 0) - vacrel->missed_dead_pages++; + vacrel->scan_data->missed_dead_pages++; /* Can't truncate this page */ if (hastup) - vacrel->nonempty_pages = blkno + 1; + vacrel->scan_data->nonempty_pages = blkno + 1; /* Did we find LP_DEAD items? */ *has_lpdead_items = (lpdead_items > 0); @@ -2452,7 +2469,7 @@ lazy_vacuum(LVRelState *vacrel) /* Should not end up here with no indexes */ Assert(vacrel->nindexes > 0); - Assert(vacrel->lpdead_item_pages > 0); + Assert(vacrel->scan_data->lpdead_item_pages > 0); if (!vacrel->do_index_vacuuming) { @@ -2481,12 +2498,12 @@ lazy_vacuum(LVRelState *vacrel) * HOT through careful tuning. 
*/ bypass = false; - if (vacrel->consider_bypass_optimization && vacrel->rel_pages > 0) + if (vacrel->consider_bypass_optimization && vacrel->scan_data->rel_pages > 0) { BlockNumber threshold; Assert(vacrel->num_index_scans == 0); - Assert(vacrel->lpdead_items == vacrel->dead_items_info->num_items); + Assert(vacrel->scan_data->lpdead_items == vacrel->dead_items_info->num_items); Assert(vacrel->do_index_vacuuming); Assert(vacrel->do_index_cleanup); @@ -2512,8 +2529,8 @@ lazy_vacuum(LVRelState *vacrel) * be negligible. If this optimization is ever expanded to cover more * cases then this may need to be reconsidered. */ - threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES; - bypass = (vacrel->lpdead_item_pages < threshold && + threshold = (double) vacrel->scan_data->rel_pages * BYPASS_THRESHOLD_PAGES; + bypass = (vacrel->scan_data->lpdead_item_pages < threshold && TidStoreMemoryUsage(vacrel->dead_items) < 32 * 1024 * 1024); } @@ -2651,7 +2668,7 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) * place). */ Assert(vacrel->num_index_scans > 0 || - vacrel->dead_items_info->num_items == vacrel->lpdead_items); + vacrel->dead_items_info->num_items == vacrel->scan_data->lpdead_items); Assert(allindexes || VacuumFailsafeActive); /* @@ -2813,8 +2830,8 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) * the second heap pass. No more, no less. */ Assert(vacrel->num_index_scans > 1 || - (vacrel->dead_items_info->num_items == vacrel->lpdead_items && - vacuumed_pages == vacrel->lpdead_item_pages)); + (vacrel->dead_items_info->num_items == vacrel->scan_data->lpdead_items && + vacuumed_pages == vacrel->scan_data->lpdead_item_pages)); ereport(DEBUG2, (errmsg("table \"%s\": removed %" PRId64 " dead item identifiers in %u pages", @@ -2930,14 +2947,14 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, */ if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0) { - vacrel->vm_new_visible_pages++; + vacrel->scan_data->vm_new_visible_pages++; if (all_frozen) - vacrel->vm_new_visible_frozen_pages++; + vacrel->scan_data->vm_new_visible_frozen_pages++; } else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen) - vacrel->vm_new_frozen_pages++; + vacrel->scan_data->vm_new_frozen_pages++; } /* Revert to the previous phase information for error traceback */ @@ -3013,7 +3030,7 @@ static void lazy_cleanup_all_indexes(LVRelState *vacrel) { double reltuples = vacrel->new_rel_tuples; - bool estimated_count = vacrel->scanned_pages < vacrel->rel_pages; + bool estimated_count = vacrel->scan_data->scanned_pages < vacrel->scan_data->rel_pages; const int progress_start_index[] = { PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_INDEXES_TOTAL @@ -3194,10 +3211,10 @@ should_attempt_truncation(LVRelState *vacrel) if (!vacrel->do_rel_truncate || VacuumFailsafeActive) return false; - possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages; + possibly_freeable = vacrel->scan_data->rel_pages - vacrel->scan_data->nonempty_pages; if (possibly_freeable > 0 && (possibly_freeable >= REL_TRUNCATE_MINIMUM || - possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION)) + possibly_freeable >= vacrel->scan_data->rel_pages / REL_TRUNCATE_FRACTION)) return true; return false; @@ -3209,7 +3226,7 @@ should_attempt_truncation(LVRelState *vacrel) static void lazy_truncate_heap(LVRelState *vacrel) { - BlockNumber orig_rel_pages = vacrel->rel_pages; + BlockNumber orig_rel_pages = vacrel->scan_data->rel_pages; BlockNumber new_rel_pages; bool lock_waiter_detected; int lock_retry; @@ -3220,7 +3237,7 @@ 
lazy_truncate_heap(LVRelState *vacrel) /* Update error traceback information one last time */ update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE, - vacrel->nonempty_pages, InvalidOffsetNumber); + vacrel->scan_data->nonempty_pages, InvalidOffsetNumber); /* * Loop until no more truncating can be done. @@ -3321,15 +3338,15 @@ lazy_truncate_heap(LVRelState *vacrel) * without also touching reltuples, since the tuple count wasn't * changed by the truncation. */ - vacrel->removed_pages += orig_rel_pages - new_rel_pages; - vacrel->rel_pages = new_rel_pages; + vacrel->scan_data->removed_pages += orig_rel_pages - new_rel_pages; + vacrel->scan_data->rel_pages = new_rel_pages; ereport(vacrel->verbose ? INFO : DEBUG2, (errmsg("table \"%s\": truncated %u to %u pages", vacrel->relname, orig_rel_pages, new_rel_pages))); orig_rel_pages = new_rel_pages; - } while (new_rel_pages > vacrel->nonempty_pages && lock_waiter_detected); + } while (new_rel_pages > vacrel->scan_data->nonempty_pages && lock_waiter_detected); } /* @@ -3353,11 +3370,11 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) * unsigned.) To make the scan faster, we prefetch a few blocks at a time * in forward direction, so that OS-level readahead can kick in. */ - blkno = vacrel->rel_pages; + blkno = vacrel->scan_data->rel_pages; StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0, "prefetch size must be power of 2"); prefetchedUntil = InvalidBlockNumber; - while (blkno > vacrel->nonempty_pages) + while (blkno > vacrel->scan_data->nonempty_pages) { Buffer buf; Page page; @@ -3469,7 +3486,7 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) * pages still are; we need not bother to look at the last known-nonempty * page. */ - return vacrel->nonempty_pages; + return vacrel->scan_data->nonempty_pages; } /* diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index faa256ec56c9..6cab30079b9d 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1522,6 +1522,7 @@ LSEG LUID LVRelState LVSavedErrInfo +LVScanData LWLock LWLockHandle LWLockMode From 9a3f8c49532bd48b9880ffb073636b21a82bac53 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Thu, 27 Feb 2025 13:41:35 -0800 Subject: [PATCH 4/4] Support parallelism for collecting dead items during lazy vacuum. This feature allows the vacuum to leverage multiple CPUs in order to collect dead items (i.e. the first pass over heap table) with parallel workers. The parallel degree for parallel heap vacuuming is determined based on the number of blocks to vacuum unless PARALLEL option of VACUUM command is specified, and further limited by max_parallel_maintenance_workers. For the parallel heap scan to collect dead items, we utilize a parallel block table scan, controlled by ParallelBlockTableScanDesc, in conjunction with the read stream. The workers' parallel scan descriptions are stored in the DSM space, enabling different parallel workers to resume the heap scan (phase 1) after a cycle of heap vacuuming and index vacuuming (phase 2 and 3) from their previous state. However, due to the potential presence of pinned buffers loaded by the read stream's look-ahead mechanism, we cannot abruptly stop phase 1 even when the space of dead_items TIDs exceeds the limit. 
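As a rough sketch of how this is handled (a simplified illustration, not the patch itself: the helper name is hypothetical, the serial scan path and the block-skipping logic are omitted, and the real logic lives in heap_vac_scan_next_block()):

    /*
     * Hypothetical helper showing how the read stream's block-allocation
     * callback behaves during a parallel lazy heap scan: once dead_items
     * is over its memory limit we stop handing out blocks, so the stream
     * only drains the buffers it has already pinned via look-ahead.
     */
    static BlockNumber
    sketch_next_block_for_stream(LVRelState *vacrel)
    {
        if (dead_items_check_memory_limit(vacrel))
            return InvalidBlockNumber;

        /* Otherwise take the next block from the shared parallel scan. */
        return table_block_parallelscan_nextpage(vacrel->rel,
                                                 vacrel->plvstate->pbscanwork,
                                                 vacrel->plvstate->pbscan);
    }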
Therefore, once the space of dead_items TIDs exceeds the limit, we begin processing pages without attempting to retrieve additional blocks by the look-ahead mechanism until the read stream is exhausted, even if the memory limit is surpassed. While this approach may increase the memory usage, it typically doesn't pose a significant problem, as processing a few tens to hundreds of buffers doesn't substantially increase the size of dead_items TIDs. When the parallel heap scan to collect dead items is enabled, we disable eager scanning. This is because parallel vacuum is available only in the VACUUM command and would not occur frequently, which doesn't align with the purpose of eager scanning. Reviewed-by: Amit Kapila Reviewed-by: Hayato Kuroda Reviewed-by: Peter Smith Reviewed-by: Tomas Vondra Reviewed-by: Dilip Kumar Reviewed-by: Melanie Plageman Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CAD21AoAEfCNv-GgaDheDJ+s-p_Lv1H24AiJeNoPGCmZNSwL1YA@mail.gmail.com --- doc/src/sgml/ref/vacuum.sgml | 54 +- src/backend/access/heap/heapam_handler.c | 4 + src/backend/access/heap/vacuumlazy.c | 992 ++++++++++++++++++++--- src/backend/commands/vacuumparallel.c | 29 + src/include/access/heapam.h | 11 + src/include/commands/vacuum.h | 3 + src/test/regress/expected/vacuum.out | 6 + src/test/regress/sql/vacuum.sql | 7 + src/tools/pgindent/typedefs.list | 4 + 9 files changed, 989 insertions(+), 121 deletions(-) diff --git a/doc/src/sgml/ref/vacuum.sgml b/doc/src/sgml/ref/vacuum.sgml index bd5dcaf86a5c..294494877d9d 100644 --- a/doc/src/sgml/ref/vacuum.sgml +++ b/doc/src/sgml/ref/vacuum.sgml @@ -280,25 +280,41 @@ VACUUM [ ( option [, ...] ) ] [ PARALLEL - Perform index vacuum and index cleanup phases of VACUUM - in parallel using integer - background workers (for the details of each vacuum phase, please - refer to ). The number of workers used - to perform the operation is equal to the number of indexes on the - relation that support parallel vacuum which is limited by the number of - workers specified with PARALLEL option if any which is - further limited by . - An index can participate in parallel vacuum if and only if the size of the - index is more than . - Please note that it is not guaranteed that the number of parallel workers - specified in integer will be - used during execution. It is possible for a vacuum to run with fewer - workers than specified, or even with no workers at all. Only one worker - can be used per index. So parallel workers are launched only when there - are at least 2 indexes in the table. Workers for - vacuum are launched before the start of each phase and exit at the end of - the phase. These behaviors might change in a future release. This - option can't be used with the FULL option. + Perform the heap scanning, index vacuum, and index cleanup phases of + VACUUM in parallel using + integer background workers + (for the details of each vacuum phase, please refer to + ). + + + For heap tables, the number of workers used to perform the heap + scanning is determined based on the size of the table. A table can participate in + parallel heap scanning if and only if the size of the table is more than + . During heap scanning, + the table's blocks will be divided into ranges and shared among the + cooperating processes. Each worker process will complete the scanning of + its given range of blocks before requesting an additional range of blocks.
+ + + The number of workers used to perform parallel index vacuum and index + cleanup is equal to the number of indexes on the relation that support + parallel vacuum. An index can participate in parallel vacuum if and only + if the size of the index is more than . + Only one worker can be used per index. So parallel workers for index vacuum + and index cleanup are launched only when there are at least 2 + indexes in the table. + + + Workers for vacuum are launched before the start of each phase and exit + at the end of the phase. The number of workers for each phase is limited by + the number of workers specified with the PARALLEL option if + any, which is further limited by . + Please note that in any parallel vacuum phase, it is not guaranteed that the + number of parallel workers specified in integer + will be used during execution. It is possible for a vacuum to run with fewer + workers than specified, or even with no workers at all. These behaviors might + change in a future release. This option can't be used with the FULL + option. diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index aad419e46e8e..dc02be807fbc 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2671,6 +2671,10 @@ static const TableAmRoutine heapam_methods = { .scan_sample_next_tuple = heapam_scan_sample_next_tuple, .parallel_vacuum_compute_workers = heap_parallel_vacuum_compute_workers, + .parallel_vacuum_estimate = heap_parallel_vacuum_estimate, + .parallel_vacuum_initialize = heap_parallel_vacuum_initialize, + .parallel_vacuum_initialize_worker = heap_parallel_vacuum_initialize_worker, + .parallel_vacuum_collect_dead_items = heap_parallel_vacuum_collect_dead_items, }; diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index aebc7c91379b..88e13eea0fc4 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -99,6 +99,46 @@ * After pruning and freezing, pages that are newly all-visible and all-frozen * are marked as such in the visibility map. * + * Parallel Vacuum: + * + * Lazy vacuum on heap tables supports parallel processing for phase I and + * phase II. Before starting phase I, we initialize parallel vacuum state, + * ParallelVacuumState, and allocate the TID store in a DSA area if we can + * use parallel mode for either of these two phases. + * + * We may require a different number of parallel vacuum workers for each phase + * depending on various factors such as table size and number of indexes. Parallel + * workers are launched at the beginning of each phase and exit at the end of + * each phase. + * + * For the parallel lazy heap scan (i.e. parallel phase I), we employ a parallel + * block table scan, controlled by ParallelBlockTableScanDesc, in conjunction + * with the read stream. The table is split into multiple chunks, which are + * then distributed among parallel workers. + * + * While vacuum cutoffs are shared between leader and worker processes, each + * individual process uses its own GlobalVisState, potentially causing some + * workers to remove fewer tuples than optimal. During parallel lazy heap scans, + * each worker tracks the oldest existing XID and MXID. The leader computes the + * globally oldest existing XID and MXID after the parallel scan, while also + * gathering the table data.
+ * + * The workers' parallel scan descriptions, ParallelBlockTableScanWorkerData, + * are stored in the DSM space, enabling different parallel workers to resume + * phase I from their previous state. However, due to the potential presence + * of pinned buffers loaded by the read stream's look-ahead mechanism, we + * cannot abruptly stop phase I even when the space of dead_items TIDs exceeds + * the limit. Instead, once this threshold is surpassed, we begin processing + * pages without attempting to retrieve additional blocks until the read + * stream is exhausted. While this approach may increase the memory usage, it + * typically doesn't pose a significant problem, as processing a few tens to hundreds of + * buffers doesn't substantially increase the size of dead_items TIDs. + * + * If the leader launches fewer workers than it did the previous time when resuming the + * parallel lazy heap scan, some blocks within chunks may remain unscanned. + * To address this, the leader completes workers' unfinished scans at the end + * of the parallel lazy heap scan (see complete_unfinished_lazy_scan_heap()). + * * Dead TID Storage: * * The major space usage for vacuuming is storage for the dead tuple IDs that @@ -147,6 +187,7 @@ #include "common/pg_prng.h" #include "executor/instrument.h" #include "miscadmin.h" +#include "optimizer/paths.h" /* for min_parallel_table_scan_size */ #include "pgstat.h" #include "portability/instr_time.h" #include "postmaster/autovacuum.h" @@ -214,11 +255,21 @@ */ #define PREFETCH_SIZE ((BlockNumber) 32) +/* + * DSM keys for parallel lazy vacuum. Unlike other parallel execution code, we + * don't need to worry about DSM keys conflicting with plan_node_id, but we need to + * avoid conflicting with DSM keys used in vacuumparallel.c. + */ +#define PARALLEL_LV_KEY_SHARED 0xFFFF0001 +#define PARALLEL_LV_KEY_SCANDESC 0xFFFF0002 +#define PARALLEL_LV_KEY_SCANWORKER 0xFFFF0003 + /* * Macro to check if we are in a parallel vacuum. If true, we are in the * parallel mode and the DSM segment is initialized. */ #define ParallelVacuumIsActive(vacrel) ((vacrel)->pvs != NULL) +#define ParallelHeapVacuumIsActive(vacrel) ((vacrel)->plvstate != NULL) /* Phases of vacuum during which we report error context. */ typedef enum @@ -306,6 +357,80 @@ typedef struct LVScanData bool skippedallvis; } LVScanData; +/* + * Struct for information that needs to be shared among parallel workers + * for parallel lazy vacuum. All fields are static, set by the leader + * process. + */ +typedef struct ParallelLVShared +{ + bool aggressive; + bool skipwithvm; + + /* The current oldest extant XID/MXID shared by the leader process */ + TransactionId NewRelfrozenXid; + MultiXactId NewRelminMxid; + + /* VACUUM operation's cutoffs for freezing and pruning */ + struct VacuumCutoffs cutoffs; +} ParallelLVShared; + +/* + * Per-worker data for the scan description, statistics counters, and + * miscellaneous data that needs to be shared with the leader. + */ +typedef struct ParallelLVScanWorker +{ + /* Both last_blkno and pbscanworkdata are initialized? */ + bool scan_inited; + + /* The last processed block number */ + pg_atomic_uint32 last_blkno; + + /* per-worker parallel table scan state */ + ParallelBlockTableScanWorkerData pbscanworkdata; + + /* per-worker scan data and counters */ + LVScanData scandata; +} ParallelLVScanWorker; + +/* + * Struct to store parallel lazy vacuum working state.
+ */ +typedef struct ParallelLVState +{ + /* Parallel scan description shared among parallel workers */ + ParallelBlockTableScanDesc pbscan; + + /* Per-worker parallel table scan state */ + ParallelBlockTableScanWorker pbscanwork; + + /* Shared static information */ + ParallelLVShared *shared; + + /* Per-worker scan data. NULL for the leader process */ + ParallelLVScanWorker *scanworker; +} ParallelLVState; + +/* + * Struct for the leader process in parallel lazy vacuum. + */ +typedef struct ParallelLVLeader +{ + /* Shared memory size for each shared object */ + Size pbscan_len; + Size shared_len; + Size scanworker_len; + + /* The number of workers launched for parallel lazy heap scan */ + int nworkers_launched; + + /* + * Points to the array of all per-worker scan states stored on DSM area. + */ + ParallelLVScanWorker *scanworkers; +} ParallelLVLeader; + typedef struct LVRelState { /* Target heap relation and its indexes */ @@ -368,6 +493,12 @@ typedef struct LVRelState /* Instrumentation counters */ int num_index_scans; + /* Last processed block number */ + BlockNumber last_blkno; + + /* Next block to check for FSM vacuum */ + BlockNumber next_fsm_block_to_vacuum; + /* State maintained by heap_vac_scan_next_block() */ BlockNumber current_block; /* last block returned */ BlockNumber next_unskippable_block; /* next unskippable block */ @@ -375,6 +506,16 @@ typedef struct LVRelState bool next_unskippable_eager_scanned; /* if it was eagerly scanned */ Buffer next_unskippable_vmbuffer; /* buffer containing its VM bit */ + /* Fields used for parallel lazy vacuum */ + + /* Parallel lazy vacuum working state */ + ParallelLVState *plvstate; + + /* + * The leader state for parallel lazy vacuum. NULL for parallel workers. + */ + ParallelLVLeader *leader; + /* State related to managing eager scanning of all-visible pages */ /* @@ -434,12 +575,14 @@ typedef struct LVSavedErrInfo /* non-export function prototypes */ static void lazy_scan_heap(LVRelState *vacrel); +static void do_lazy_scan_heap(LVRelState *vacrel, bool do_vacuum); static void heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params); static BlockNumber heap_vac_scan_next_block(ReadStream *stream, void *callback_private_data, void *per_buffer_data); -static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis); +static bool find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis, + BlockNumber start_blk, BlockNumber end_blk); static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, bool sharelock, Buffer vmbuffer); @@ -450,6 +593,12 @@ static void lazy_scan_prune(LVRelState *vacrel, Buffer buf, static bool lazy_scan_noprune(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, bool *has_lpdead_items); +static void do_parallel_lazy_scan_heap(LVRelState *vacrel); +static BlockNumber parallel_lazy_scan_compute_min_scan_block(LVRelState *vacrel); +static void complete_unfinished_lazy_scan_heap(LVRelState *vacrel); +static void parallel_lazy_scan_heap_begin(LVRelState *vacrel); +static void parallel_lazy_scan_heap_end(LVRelState *vacrel); +static void parallel_lazy_scan_gather_scan_results(LVRelState *vacrel); static void lazy_vacuum(LVRelState *vacrel); static bool lazy_vacuum_all_indexes(LVRelState *vacrel); static void lazy_vacuum_heap_rel(LVRelState *vacrel); @@ -474,6 +623,7 @@ static BlockNumber count_nondeletable_pages(LVRelState *vacrel, static void dead_items_alloc(LVRelState *vacrel, int nworkers); static void dead_items_add(LVRelState *vacrel, 
BlockNumber blkno, OffsetNumber *offsets, int num_offsets); +static bool dead_items_check_memory_limit(LVRelState *vacrel); static void dead_items_reset(LVRelState *vacrel); static void dead_items_cleanup(LVRelState *vacrel); static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, @@ -529,6 +679,22 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params) if (vacrel->aggressive) return; + /* + * Disable eager scanning if parallel lazy vacuum is enabled. + * + * One might think that it would make sense to use the eager scanning even + * during parallel lazy vacuum, but parallel vacuum is available only in + * VACUUM command and would not be something that happens frequently, + * which seems not fit to the purpose of the eager scanning. Also, it + * would require making the code complex. So it would make sense to + * disable it for now. + * + * XXX: this limitation might need to be eliminated in the future for + * example when we use parallel vacuum also in autovacuum. + */ + if (ParallelHeapVacuumIsActive(vacrel)) + return; + /* * Aggressively vacuuming a small relation shouldn't take long, so it * isn't worth amortizing. We use two times the region size as the size @@ -771,6 +937,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, /* Initialize remaining counters (be tidy) */ vacrel->num_index_scans = 0; + vacrel->next_fsm_block_to_vacuum = 0; /* dead_items_alloc allocates vacrel->dead_items later on */ @@ -815,13 +982,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->skipwithvm = skipwithvm; - /* - * Set up eager scan tracking state. This must happen after determining - * whether or not the vacuum must be aggressive, because only normal - * vacuums use the eager scan algorithm. - */ - heap_vacuum_eager_scan_setup(vacrel, params); - if (verbose) { if (vacrel->aggressive) @@ -846,6 +1006,13 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, lazy_check_wraparound_failsafe(vacrel); dead_items_alloc(vacrel, params->nworkers); + /* + * Set up eager scan tracking state. This must happen after determining + * whether or not the vacuum must be aggressive, because only normal + * vacuums use the eager scan algorithm. + */ + heap_vacuum_eager_scan_setup(vacrel, params); + /* * Call lazy_scan_heap to perform all required heap pruning, index * vacuuming, and heap vacuuming (plus related processing) @@ -1215,13 +1382,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, static void lazy_scan_heap(LVRelState *vacrel) { - ReadStream *stream; - BlockNumber rel_pages = vacrel->scan_data->rel_pages, - blkno = 0, - next_fsm_block_to_vacuum = 0; - BlockNumber orig_eager_scan_success_limit = - vacrel->eager_scan_remaining_successes; /* for logging */ - Buffer vmbuffer = InvalidBuffer; + BlockNumber rel_pages = vacrel->scan_data->rel_pages; const int initprog_index[] = { PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_TOTAL_HEAP_BLKS, @@ -1242,6 +1403,73 @@ lazy_scan_heap(LVRelState *vacrel) vacrel->next_unskippable_eager_scanned = false; vacrel->next_unskippable_vmbuffer = InvalidBuffer; + /* Do the actual work */ + if (ParallelHeapVacuumIsActive(vacrel)) + do_parallel_lazy_scan_heap(vacrel); + else + do_lazy_scan_heap(vacrel, true); + + /* + * Report that everything is now scanned. We never skip scanning the last + * block in the relation, so we can pass rel_pages here. 
+ */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, + rel_pages); + + /* now we can compute the new value for pg_class.reltuples */ + vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, rel_pages, + vacrel->scan_data->scanned_pages, + vacrel->scan_data->live_tuples); + + /* + * Also compute the total number of surviving heap entries. In the + * (unlikely) scenario that new_live_tuples is -1, take it as zero. + */ + vacrel->new_rel_tuples = + Max(vacrel->new_live_tuples, 0) + vacrel->scan_data->recently_dead_tuples + + vacrel->scan_data->missed_dead_tuples; + + /* + * Do index vacuuming (call each index's ambulkdelete routine), then do + * related heap vacuuming + */ + if (vacrel->dead_items_info->num_items > 0) + lazy_vacuum(vacrel); + + /* + * Vacuum the remainder of the Free Space Map. We must do this whether or + * not there were indexes, and whether or not we bypassed index vacuuming. + * We can pass rel_pages here because we never skip scanning the last + * block of the relation. + */ + if (rel_pages > vacrel->next_fsm_block_to_vacuum) + FreeSpaceMapVacuumRange(vacrel->rel, vacrel->next_fsm_block_to_vacuum, rel_pages); + + /* report all blocks vacuumed */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, rel_pages); + + /* Do final index cleanup (call each index's amvacuumcleanup routine) */ + if (vacrel->nindexes > 0 && vacrel->do_index_cleanup) + lazy_cleanup_all_indexes(vacrel); +} + +/* + * Workhorse for lazy_scan_heap(). + * + * If do_vacuum is true, we stop the lazy heap scan and invoke a cycle of index + * vacuuming and table vacuuming if the space of dead_items TIDs exceeds the limit, and + * then resume it. On the other hand, if it's false, we continue scanning until the + * read stream is exhausted. + */ +static void +do_lazy_scan_heap(LVRelState *vacrel, bool do_vacuum) +{ + ReadStream *stream; + BlockNumber blkno = InvalidBlockNumber; + BlockNumber orig_eager_scan_success_limit = + vacrel->eager_scan_remaining_successes; /* for logging */ + Buffer vmbuffer = InvalidBuffer; + /* * Set up the read stream for vacuum's first pass through the heap. * @@ -1276,8 +1504,11 @@ lazy_scan_heap(LVRelState *vacrel) * that point. This check also provides failsafe coverage for the * one-pass strategy, and the two-pass strategy with the index_cleanup * param set to 'off'. + * + * The failsafe check is done only by the leader process. */ - if (vacrel->scan_data->scanned_pages > 0 && + if (!IsParallelWorker() && + vacrel->scan_data->scanned_pages > 0 && vacrel->scan_data->scanned_pages % FAILSAFE_EVERY_PAGES == 0) lazy_check_wraparound_failsafe(vacrel); @@ -1285,12 +1516,9 @@ lazy_scan_heap(LVRelState *vacrel) * Consider if we definitely have enough space to process TIDs on page * already. If we are close to overrunning the available space for * dead_items TIDs, pause and do a cycle of vacuuming before we tackle - * this page. However, let's force at least one page-worth of tuples - * to be stored as to ensure we do at least some work when the memory - * configured is so low that we run out before storing anything. + * this page. */ - if (vacrel->dead_items_info->num_items > 0 && - TidStoreMemoryUsage(vacrel->dead_items) > vacrel->dead_items_info->max_bytes) + if (do_vacuum && dead_items_check_memory_limit(vacrel)) { /* * Before beginning index vacuuming, we release any pin we may @@ -1313,15 +1541,16 @@ lazy_scan_heap(LVRelState *vacrel) * upper-level FSM pages. Note that blkno is the previously * processed block. 
*/ - FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, + FreeSpaceMapVacuumRange(vacrel->rel, vacrel->next_fsm_block_to_vacuum, blkno + 1); - next_fsm_block_to_vacuum = blkno; + vacrel->next_fsm_block_to_vacuum = blkno; /* Report that we are once again scanning the heap */ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_PHASE_SCAN_HEAP); } + /* Read the next block to process */ buf = read_stream_next_buffer(stream, &per_buffer_data); /* The relation is exhausted. */ @@ -1331,7 +1560,7 @@ lazy_scan_heap(LVRelState *vacrel) blk_info = *((uint8 *) per_buffer_data); CheckBufferIsPinnedOnce(buf); page = BufferGetPage(buf); - blkno = BufferGetBlockNumber(buf); + blkno = vacrel->last_blkno = BufferGetBlockNumber(buf); vacrel->scan_data->scanned_pages++; if (blk_info & VAC_BLK_WAS_EAGER_SCANNED) @@ -1491,13 +1720,36 @@ lazy_scan_heap(LVRelState *vacrel) * visible on upper FSM pages. This is done after vacuuming if the * table has indexes. There will only be newly-freed space if we * held the cleanup lock and lazy_scan_prune() was called. + * + * During parallel lazy heap scanning, only the leader process + * vacuums the FSM. However, we cannot vacuum the FSM for blocks + * up to 'blk' because there may be un-scanned blocks or blocks + * being processed by workers before this point. Instead, parallel + * workers advertise the block numbers they have just processed, + * and the leader vacuums the FSM up to the smallest block number + * among them. This approach ensures we vacuum the FSM for + * consecutive processed blocks. */ if (got_cleanup_lock && vacrel->nindexes == 0 && has_lpdead_items && - blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES) + blkno - vacrel->next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES) { - FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, + if (IsParallelWorker()) + { + pg_atomic_write_u32(&(vacrel->plvstate->scanworker->last_blkno), blkno); - next_fsm_block_to_vacuum = blkno; + } + else + { + BlockNumber fsmvac_upto = blkno; + + if (ParallelHeapVacuumIsActive(vacrel)) + fsmvac_upto = parallel_lazy_scan_compute_min_scan_block(vacrel); + + FreeSpaceMapVacuumRange(vacrel->rel, vacrel->next_fsm_block_to_vacuum, + fsmvac_upto); + } + + vacrel->next_fsm_block_to_vacuum = blkno; } } else @@ -1508,50 +1760,7 @@ lazy_scan_heap(LVRelState *vacrel) if (BufferIsValid(vmbuffer)) ReleaseBuffer(vmbuffer); - /* - * Report that everything is now scanned. We never skip scanning the last - * block in the relation, so we can pass rel_pages here. - */ - pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, - rel_pages); - - /* now we can compute the new value for pg_class.reltuples */ - vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, rel_pages, - vacrel->scan_data->scanned_pages, - vacrel->scan_data->live_tuples); - - /* - * Also compute the total number of surviving heap entries. In the - * (unlikely) scenario that new_live_tuples is -1, take it as zero. - */ - vacrel->new_rel_tuples = - Max(vacrel->new_live_tuples, 0) + vacrel->scan_data->recently_dead_tuples + - vacrel->scan_data->missed_dead_tuples; - read_stream_end(stream); - - /* - * Do index vacuuming (call each index's ambulkdelete routine), then do - * related heap vacuuming - */ - if (vacrel->dead_items_info->num_items > 0) - lazy_vacuum(vacrel); - - /* - * Vacuum the remainder of the Free Space Map. We must do this whether or - * not there were indexes, and whether or not we bypassed index vacuuming. 
- * We can pass rel_pages here because we never skip scanning the last - * block of the relation. - */ - if (rel_pages > next_fsm_block_to_vacuum) - FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, rel_pages); - - /* report all blocks vacuumed */ - pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, rel_pages); - - /* Do final index cleanup (call each index's amvacuumcleanup routine) */ - if (vacrel->nindexes > 0 && vacrel->do_index_cleanup) - lazy_cleanup_all_indexes(vacrel); } /* @@ -1565,7 +1774,8 @@ lazy_scan_heap(LVRelState *vacrel) * heap_vac_scan_next_block() uses the visibility map, vacuum options, and * various thresholds to skip blocks which do not need to be processed and * returns the next block to process or InvalidBlockNumber if there are no - * remaining blocks. + * remaining blocks or the space of dead_items TIDs reaches the limit (only + * in parallel lazy vacuum cases). * * The visibility status of the next block to process and whether or not it * was eager scanned is set in the per_buffer_data. @@ -1587,11 +1797,37 @@ heap_vac_scan_next_block(ReadStream *stream, LVRelState *vacrel = callback_private_data; uint8 blk_info = 0; - /* relies on InvalidBlockNumber + 1 overflowing to 0 on first call */ - next_block = vacrel->current_block + 1; +retry: + next_block = InvalidBlockNumber; + + /* Get the next block to process */ + if (ParallelHeapVacuumIsActive(vacrel)) + { + /* + * Stop returning the next block to the read stream if we are close to + * overrunning the available space for dead_items TIDs so that the + * read stream returns pinned buffers in its buffers queue until the + * stream is exhausted. See the comments atop this file for details. + */ + if (!dead_items_check_memory_limit(vacrel)) + { + /* + * table_block_parallelscan_nextpage() returns InvalidBlockNumber + * if there are no remaining blocks. + */ + next_block = table_block_parallelscan_nextpage(vacrel->rel, + vacrel->plvstate->pbscanwork, + vacrel->plvstate->pbscan); + } + } + else + { + /* relies on InvalidBlockNumber + 1 overflowing to 0 on first call */ + next_block = vacrel->current_block + 1; + } /* Have we reached the end of the relation? */ - if (next_block >= vacrel->scan_data->rel_pages) + if (!BlockNumberIsValid(next_block) || next_block >= vacrel->scan_data->rel_pages) { if (BufferIsValid(vacrel->next_unskippable_vmbuffer)) { @@ -1613,8 +1849,42 @@ heap_vac_scan_next_block(ReadStream *stream, * visibility map. */ bool skipsallvis; + bool found; + BlockNumber end_block; + BlockNumber nblocks_skip; + + if (ParallelHeapVacuumIsActive(vacrel)) + { + /* We look for the next unskippable block within the chunk */ + end_block = next_block + + vacrel->plvstate->pbscanwork->phsw_chunk_remaining + 1; + } + else + end_block = vacrel->scan_data->rel_pages; + + found = find_next_unskippable_block(vacrel, &skipsallvis, next_block, end_block); + + /* + * We must have found the next unskippable block within the specified + * range in non-parallel cases as the end_block is always the last + * block + 1 and we must scan the last block. + */ + Assert(found || ParallelHeapVacuumIsActive(vacrel)); - find_next_unskippable_block(vacrel, &skipsallvis); + if (!found) + { + if (skipsallvis) + vacrel->scan_data->skippedallvis = true; + + /* + * Skip all remaining blocks in the current chunk, and retry with + * the next chunk. 
+ */ + vacrel->plvstate->pbscanwork->phsw_chunk_remaining = 0; + goto retry; + } + + Assert(vacrel->next_unskippable_block < end_block); /* * We now know the next block that we must process. It can be the @@ -1631,11 +1901,20 @@ heap_vac_scan_next_block(ReadStream *stream, * pages then skipping makes updating relfrozenxid unsafe, which is a * real downside. */ - if (vacrel->next_unskippable_block - next_block >= SKIP_PAGES_THRESHOLD) + nblocks_skip = vacrel->next_unskippable_block - next_block; + if (nblocks_skip >= SKIP_PAGES_THRESHOLD) { - next_block = vacrel->next_unskippable_block; if (skipsallvis) vacrel->scan_data->skippedallvis = true; + + /* Tell the parallel scans to skip blocks */ + if (ParallelHeapVacuumIsActive(vacrel)) + { + vacrel->plvstate->pbscanwork->phsw_chunk_remaining -= nblocks_skip; + Assert(vacrel->plvstate->pbscanwork->phsw_chunk_remaining > 0); + } + + next_block = vacrel->next_unskippable_block; } } @@ -1671,9 +1950,11 @@ heap_vac_scan_next_block(ReadStream *stream, } /* - * Find the next unskippable block in a vacuum scan using the visibility map. - * The next unskippable block and its visibility information is updated in - * vacrel. + * Find the next unskippable block in a vacuum scan using the visibility map, + * in a range of 'start' (inclusive) and 'end' (exclusive). + * + * If found, the next unskippable block and its visibility information is updated + * in vacrel. Otherwise, return false and reset the information in vacrel. * * Note: our opinion of which blocks can be skipped can go stale immediately. * It's okay if caller "misses" a page whose all-visible or all-frozen marking @@ -1683,22 +1964,32 @@ heap_vac_scan_next_block(ReadStream *stream, * older XIDs/MXIDs. The *skippedallvis flag will be set here when the choice * to skip such a range is actually made, making everything safe.) */ -static void -find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis) +static bool +find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis, + BlockNumber start, BlockNumber end) { BlockNumber rel_pages = vacrel->scan_data->rel_pages; - BlockNumber next_unskippable_block = vacrel->next_unskippable_block + 1; + BlockNumber next_unskippable_block = start; Buffer next_unskippable_vmbuffer = vacrel->next_unskippable_vmbuffer; bool next_unskippable_eager_scanned = false; bool next_unskippable_allvis; + bool found = true; *skipsallvis = false; for (;; next_unskippable_block++) { - uint8 mapbits = visibilitymap_get_status(vacrel->rel, - next_unskippable_block, - &next_unskippable_vmbuffer); + uint8 mapbits; + + /* Reach the end of range? 
*/ + if (next_unskippable_block >= end) + { + found = false; + break; + } + + mapbits = visibilitymap_get_status(vacrel->rel, next_unskippable_block, + &next_unskippable_vmbuffer); next_unskippable_allvis = (mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0; @@ -1774,11 +2065,274 @@ find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis) *skipsallvis = true; } - /* write the local variables back to vacrel */ - vacrel->next_unskippable_block = next_unskippable_block; - vacrel->next_unskippable_allvis = next_unskippable_allvis; - vacrel->next_unskippable_eager_scanned = next_unskippable_eager_scanned; - vacrel->next_unskippable_vmbuffer = next_unskippable_vmbuffer; + if (found) + { + /* write the local variables back to vacrel */ + vacrel->next_unskippable_block = next_unskippable_block; + vacrel->next_unskippable_allvis = next_unskippable_allvis; + vacrel->next_unskippable_eager_scanned = next_unskippable_eager_scanned; + vacrel->next_unskippable_vmbuffer = next_unskippable_vmbuffer; + } + else + { + if (BufferIsValid(next_unskippable_vmbuffer)) + ReleaseBuffer(next_unskippable_vmbuffer); + + /* + * There is no unskippable block in the specified range. Reset the + * related fields in vacrel. + */ + vacrel->next_unskippable_block = InvalidBlockNumber; + vacrel->next_unskippable_allvis = false; + vacrel->next_unskippable_eager_scanned = false; + vacrel->next_unskippable_vmbuffer = InvalidBuffer; + } + + return found; +} + +/* + * A parallel variant of do_lazy_scan_heap(). The leader process launches + * parallel workers to scan the heap in parallel. + */ +static void +do_parallel_lazy_scan_heap(LVRelState *vacrel) +{ + ParallelBlockTableScanWorkerData pbscanworkdata; + + Assert(ParallelHeapVacuumIsActive(vacrel)); + Assert(!IsParallelWorker()); + + /* + * Set up the parallel scan description for the leader to join as a worker. + */ + table_block_parallelscan_startblock_init(vacrel->rel, + &pbscanworkdata, + vacrel->plvstate->pbscan); + vacrel->plvstate->pbscanwork = &pbscanworkdata; + + for (;;) + { + BlockNumber fsmvac_upto; + + /* Launch parallel workers */ + parallel_lazy_scan_heap_begin(vacrel); + + /* + * Do lazy heap scan until the read stream is exhausted. We will stop + * retrieving new blocks for the read stream once the space of + * dead_items TIDs exceeds the limit. + */ + do_lazy_scan_heap(vacrel, false); + + /* Wait for parallel workers to finish and gather scan results */ + parallel_lazy_scan_heap_end(vacrel); + + if (!dead_items_check_memory_limit(vacrel)) + break; + + /* Perform a round of index and heap vacuuming */ + vacrel->consider_bypass_optimization = false; + lazy_vacuum(vacrel); + + /* Compute the smallest processed block number */ + fsmvac_upto = parallel_lazy_scan_compute_min_scan_block(vacrel); + + /* + * Vacuum the Free Space Map to make newly-freed space visible on + * upper-level FSM pages. + */ + if (fsmvac_upto > vacrel->next_fsm_block_to_vacuum) + { + FreeSpaceMapVacuumRange(vacrel->rel, vacrel->next_fsm_block_to_vacuum, + fsmvac_upto); + vacrel->next_fsm_block_to_vacuum = fsmvac_upto; + } + + /* Report that we are once again scanning the heap */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_SCAN_HEAP); + } + + /* + * The parallel heap scan finished, but it's possible that some workers + * have allocated blocks but not processed them yet. This can happen, for + * example, when workers exit because they are full of dead_items TIDs and + * the leader process launched fewer workers in the next cycle.
+ */ + complete_unfinished_lazy_scan_heap(vacrel); +} + +/* + * Return the smallest block number that the leader and workers have scanned. + */ +static BlockNumber +parallel_lazy_scan_compute_min_scan_block(LVRelState *vacrel) +{ + BlockNumber min_blk; + + Assert(ParallelHeapVacuumIsActive(vacrel)); + + /* Initialize with the leader's value */ + min_blk = vacrel->last_blkno; + + for (int i = 0; i < vacrel->leader->nworkers_launched; i++) + { + ParallelLVScanWorker *scanworker = &(vacrel->leader->scanworkers[i]); + BlockNumber blkno; + + /* Skip workers that have not initialized their scan state */ + if (!scanworker->scan_inited) + continue; + + blkno = pg_atomic_read_u32(&(scanworker->last_blkno)); + + if (!BlockNumberIsValid(min_blk) || min_blk > blkno) + min_blk = blkno; + } + + Assert(BlockNumberIsValid(min_blk)); + + return min_blk; +} + +/* + * Complete parallel heap scans that have remaining blocks in their + * chunks. + */ +static void +complete_unfinished_lazy_scan_heap(LVRelState *vacrel) +{ + int nworkers; + + Assert(!IsParallelWorker()); + + nworkers = parallel_vacuum_get_nworkers_table(vacrel->pvs); + + for (int i = 0; i < nworkers; i++) + { + ParallelLVScanWorker *scanworker = &(vacrel->leader->scanworkers[i]); + + if (!scanworker->scan_inited) + continue; + + if (scanworker->pbscanworkdata.phsw_chunk_remaining == 0) + continue; + + /* Attach the worker's scan state */ + vacrel->plvstate->pbscanwork = &(scanworker->pbscanworkdata); + + /* + * Complete the unfinished scan. Note that we might perform multiple + * cycles of index and heap vacuuming while completing the scans. + */ + vacrel->next_fsm_block_to_vacuum = pg_atomic_read_u32(&(scanworker->last_blkno)); + do_lazy_scan_heap(vacrel, true); + } + + /* + * We don't need to gather the scan results here because the leader's scan + * state was updated directly. + */ +} + +/* + * Helper routine to launch parallel workers for parallel lazy heap scan. + */ +static void +parallel_lazy_scan_heap_begin(LVRelState *vacrel) +{ + Assert(ParallelHeapVacuumIsActive(vacrel)); + Assert(!IsParallelWorker()); + + /* Launch workers */ + vacrel->leader->nworkers_launched = parallel_vacuum_collect_dead_items_begin(vacrel->pvs); + + ereport(vacrel->verbose ? INFO : DEBUG2, + (errmsg(ngettext("launched %d parallel vacuum worker for collecting dead tuples (planned: %d)", + "launched %d parallel vacuum workers for collecting dead tuples (planned: %d)", + vacrel->leader->nworkers_launched), + vacrel->leader->nworkers_launched, + parallel_vacuum_get_nworkers_table(vacrel->pvs)))); +} + +/* + * Helper routine to finish the parallel lazy heap scan. + */ +static void +parallel_lazy_scan_heap_end(LVRelState *vacrel) +{ + /* Wait for all parallel workers to finish */ + parallel_vacuum_collect_dead_items_end(vacrel->pvs); + + /* Gather the workers' scan results */ + parallel_lazy_scan_gather_scan_results(vacrel); +} + +/* + * Accumulate each worker's scan results into the leader's.
+ */ +static void +parallel_lazy_scan_gather_scan_results(LVRelState *vacrel) +{ + Assert(ParallelHeapVacuumIsActive(vacrel)); + Assert(!IsParallelWorker()); + + /* Gather the workers' scan results */ + for (int i = 0; i < vacrel->leader->nworkers_launched; i++) + { + LVScanData *data = &(vacrel->leader->scanworkers[i].scandata); + + /* Accumulate the counters collected by workers */ +#define ACCUM_COUNT(item) vacrel->scan_data->item += data->item + ACCUM_COUNT(scanned_pages); + ACCUM_COUNT(removed_pages); + ACCUM_COUNT(new_frozen_tuple_pages); + ACCUM_COUNT(vm_new_visible_pages); + ACCUM_COUNT(vm_new_visible_frozen_pages); + ACCUM_COUNT(vm_new_frozen_pages); + ACCUM_COUNT(lpdead_item_pages); + ACCUM_COUNT(missed_dead_pages); + ACCUM_COUNT(tuples_deleted); + ACCUM_COUNT(tuples_frozen); + ACCUM_COUNT(lpdead_items); + ACCUM_COUNT(live_tuples); + ACCUM_COUNT(recently_dead_tuples); + ACCUM_COUNT(missed_dead_tuples); +#undef ACCUM_COUNT + + /* + * Track the greatest non-empty page among the values the workers + * collected, as it's used as the cut-off point for heap truncation. + */ + if (vacrel->scan_data->nonempty_pages < data->nonempty_pages) + vacrel->scan_data->nonempty_pages = data->nonempty_pages; + + /* + * All workers must have initialized both values with the values + * passed by the leader. + */ + Assert(TransactionIdIsValid(data->NewRelfrozenXid)); + Assert(MultiXactIdIsValid(data->NewRelminMxid)); + + /* + * During parallel lazy scanning, since different workers process + * separate blocks, they may observe different existing XIDs and + * MXIDs. Therefore, we compute the oldest XID and MXID from the + * values observed by each worker (including the leader). These + * computations are crucial for correctly advancing both relfrozenxid + * and relminmxid values. + */ + + if (TransactionIdPrecedes(data->NewRelfrozenXid, vacrel->scan_data->NewRelfrozenXid)) + vacrel->scan_data->NewRelfrozenXid = data->NewRelfrozenXid; + + if (MultiXactIdPrecedesOrEquals(data->NewRelminMxid, vacrel->scan_data->NewRelminMxid)) + vacrel->scan_data->NewRelminMxid = data->NewRelminMxid; + + /* Has any one of the workers skipped an all-visible page? */ + vacrel->scan_data->skippedallvis |= data->skippedallvis; + } } /* @@ -2067,7 +2621,8 @@ lazy_scan_prune(LVRelState *vacrel, /* Can't truncate this page */ if (presult.hastup) - vacrel->scan_data->nonempty_pages = blkno + 1; + vacrel->scan_data->nonempty_pages = + Max(blkno + 1, vacrel->scan_data->nonempty_pages); /* Did we find LP_DEAD items? */ *has_lpdead_items = (presult.lpdead_items > 0); @@ -2440,7 +2995,8 @@ lazy_scan_noprune(LVRelState *vacrel, /* Can't truncate this page */ if (hastup) - vacrel->scan_data->nonempty_pages = blkno + 1; + vacrel->scan_data->nonempty_pages = + Max(blkno + 1, vacrel->scan_data->nonempty_pages); /* Did we find LP_DEAD items? */ *has_lpdead_items = (lpdead_items > 0); @@ -3504,12 +4060,8 @@ dead_items_alloc(LVRelState *vacrel, int nworkers) autovacuum_work_mem != -1 ? autovacuum_work_mem : maintenance_work_mem; - /* - * Initialize state for a parallel vacuum. As of now, only one worker can - * be used for an index, so we invoke parallelism only if there are at - * least two indexes on a table.
- */ - if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming) + /* Initialize state for a parallel vacuum */ + if (nworkers >= 0) { /* * Since parallel workers cannot access data in temporary tables, we @@ -3527,11 +4079,17 @@ dead_items_alloc(LVRelState *vacrel, int nworkers) vacrel->relname))); } else + { + /* + * We initialize the parallel vacuum state for either lazy heap + * scan, index vacuuming, or both. + */ vacrel->pvs = parallel_vacuum_init(vacrel->rel, vacrel->indrels, vacrel->nindexes, nworkers, vac_work_mem, vacrel->verbose ? INFO : DEBUG2, vacrel->bstrategy, (void *) vacrel); + } /* * If parallel mode started, dead_items and dead_items_info spaces are @@ -3571,15 +4129,35 @@ dead_items_add(LVRelState *vacrel, BlockNumber blkno, OffsetNumber *offsets, }; int64 prog_val[2]; + if (ParallelHeapVacuumIsActive(vacrel)) + TidStoreLockExclusive(vacrel->dead_items); + TidStoreSetBlockOffsets(vacrel->dead_items, blkno, offsets, num_offsets); vacrel->dead_items_info->num_items += num_offsets; + if (ParallelHeapVacuumIsActive(vacrel)) + TidStoreUnlock(vacrel->dead_items); + /* update the progress information */ prog_val[0] = vacrel->dead_items_info->num_items; prog_val[1] = TidStoreMemoryUsage(vacrel->dead_items); pgstat_progress_update_multi_param(2, prog_index, prog_val); } +/* + * Check the memory usage of the collected dead items and return true + * if we are close to overrunning the available space for dead_items TIDs. + * However, let's force at least one page-worth of tuples to be stored as + * to ensure we do at least some work when the memory configured is so low + * that we run out before storing anything. + */ +static bool +dead_items_check_memory_limit(LVRelState *vacrel) +{ + return vacrel->dead_items_info->num_items > 0 && + TidStoreMemoryUsage(vacrel->dead_items) > vacrel->dead_items_info->max_bytes; +} + /* * Forget all collected dead items. */ @@ -3775,14 +4353,224 @@ update_relstats_all_indexes(LVRelState *vacrel) /* * Compute the number of workers for parallel heap vacuum. - * - * Return 0 to disable parallel vacuum. */ int heap_parallel_vacuum_compute_workers(Relation rel, int nworkers_requested, void *state) { - return 0; + int parallel_workers = 0; + + if (nworkers_requested == 0) + { + LVRelState *vacrel = (LVRelState *) state; + int heap_parallel_threshold; + int heap_pages; + BlockNumber allvisible; + BlockNumber allfrozen; + + /* + * Estimate the number of blocks that we're going to scan during + * lazy_scan_heap(). + */ + visibilitymap_count(rel, &allvisible, &allfrozen); + heap_pages = RelationGetNumberOfBlocks(rel) - + (vacrel->aggressive ? allfrozen : allvisible); + + Assert(heap_pages >= 0); + + /* + * Select the number of workers based on the log of the number of + * pages to scan. Note that the upper limit of the + * min_parallel_table_scan_size GUC is chosen to prevent overflow + * here. + */ + heap_parallel_threshold = Max(min_parallel_table_scan_size, 1); + while (heap_pages >= (BlockNumber) (heap_parallel_threshold * 3)) + { + parallel_workers++; + heap_parallel_threshold *= 3; + if (heap_parallel_threshold > INT_MAX / 3) + break; + } + } + else + parallel_workers = nworkers_requested; + + return parallel_workers; +} + +/* + * Estimate shared memory size required for parallel heap vacuum. 
+ */ +void +heap_parallel_vacuum_estimate(Relation rel, ParallelContext *pcxt, int nworkers, + void *state) +{ + LVRelState *vacrel = (LVRelState *) state; + Size size = 0; + + vacrel->leader = palloc(sizeof(ParallelLVLeader)); + + /* Estimate space for ParallelLVShared */ + size = add_size(size, sizeof(ParallelLVShared)); + vacrel->leader->shared_len = size; + shm_toc_estimate_chunk(&pcxt->estimator, vacrel->leader->shared_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for ParallelBlockTableScanDesc */ + vacrel->leader->pbscan_len = table_block_parallelscan_estimate(rel); + shm_toc_estimate_chunk(&pcxt->estimator, vacrel->leader->pbscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for an array of ParallelLVScanWorker */ + vacrel->leader->scanworker_len = mul_size(sizeof(ParallelLVScanWorker), nworkers); + shm_toc_estimate_chunk(&pcxt->estimator, vacrel->leader->scanworker_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* + * Set up shared memory for parallel heap vacuum. + */ +void +heap_parallel_vacuum_initialize(Relation rel, ParallelContext *pcxt, int nworkers, + void *state) +{ + LVRelState *vacrel = (LVRelState *) state; + ParallelLVShared *shared; + ParallelBlockTableScanDesc pbscan; + ParallelLVScanWorker *scanworkers; + + vacrel->plvstate = palloc0(sizeof(ParallelLVState)); + + /* Initialize ParallelLVShared */ + shared = shm_toc_allocate(pcxt->toc, vacrel->leader->shared_len); + MemSet(shared, 0, vacrel->leader->shared_len); + shared->aggressive = vacrel->aggressive; + shared->skipwithvm = vacrel->skipwithvm; + shared->cutoffs = vacrel->cutoffs; + shared->NewRelfrozenXid = vacrel->scan_data->NewRelfrozenXid; + shared->NewRelminMxid = vacrel->scan_data->NewRelminMxid; + shm_toc_insert(pcxt->toc, PARALLEL_LV_KEY_SHARED, shared); + vacrel->plvstate->shared = shared; + + /* Initialize ParallelBlockTableScanDesc */ + pbscan = shm_toc_allocate(pcxt->toc, vacrel->leader->pbscan_len); + table_block_parallelscan_initialize(rel, (ParallelTableScanDesc) pbscan); + pbscan->base.phs_syncscan = false; /* always start from the first block */ + shm_toc_insert(pcxt->toc, PARALLEL_LV_KEY_SCANDESC, pbscan); + vacrel->plvstate->pbscan = pbscan; + + /* Initialize the array of ParallelLVScanWorker */ + scanworkers = shm_toc_allocate(pcxt->toc, vacrel->leader->scanworker_len); + MemSet(scanworkers, 0, vacrel->leader->scanworker_len); + shm_toc_insert(pcxt->toc, PARALLEL_LV_KEY_SCANWORKER, scanworkers); + vacrel->leader->scanworkers = scanworkers; +} + +/* + * Initialize lazy vacuum state with the information retrieved from + * shared memory. 
+ */ +void +heap_parallel_vacuum_initialize_worker(Relation rel, ParallelVacuumState *pvs, + ParallelWorkerContext *pwcxt, + void **state_out) +{ + LVRelState *vacrel; + ParallelLVState *plvstate; + ParallelLVShared *shared; + ParallelLVScanWorker *scanworker; + ParallelBlockTableScanDesc pbscan; + + /* Initialize ParallelLVState and prepare the related objects */ + + plvstate = palloc0(sizeof(ParallelLVState)); + + /* Prepare ParallelLVShared */ + shared = (ParallelLVShared *) shm_toc_lookup(pwcxt->toc, PARALLEL_LV_KEY_SHARED, false); + plvstate->shared = shared; + + /* Prepare ParallelBlockTableScanDesc */ + pbscan = shm_toc_lookup(pwcxt->toc, PARALLEL_LV_KEY_SCANDESC, false); + plvstate->pbscan = pbscan; + + /* Prepare ParallelLVScanWorker */ + scanworker = shm_toc_lookup(pwcxt->toc, PARALLEL_LV_KEY_SCANWORKER, false); + plvstate->scanworker = &(scanworker[ParallelWorkerNumber]); + plvstate->pbscanwork = &(plvstate->scanworker->pbscanworkdata); + + /* Initialize LVRelState and prepare fields required by lazy scan heap */ + vacrel = palloc0(sizeof(LVRelState)); + vacrel->rel = rel; + vacrel->indrels = parallel_vacuum_get_table_indexes(pvs, + &vacrel->nindexes); + vacrel->bstrategy = parallel_vacuum_get_bstrategy(pvs); + vacrel->pvs = pvs; + vacrel->aggressive = shared->aggressive; + vacrel->skipwithvm = shared->skipwithvm; + vacrel->vistest = GlobalVisTestFor(rel); + vacrel->cutoffs = shared->cutoffs; + vacrel->dead_items = parallel_vacuum_get_dead_items(pvs, + &vacrel->dead_items_info); + vacrel->plvstate = plvstate; + vacrel->scan_data = &(plvstate->scanworker->scandata); + MemSet(vacrel->scan_data, 0, sizeof(LVScanData)); + vacrel->scan_data->NewRelfrozenXid = shared->NewRelfrozenXid; + vacrel->scan_data->NewRelminMxid = shared->NewRelminMxid; + vacrel->scan_data->skippedallvis = false; + vacrel->scan_data->rel_pages = RelationGetNumberOfBlocks(rel); + + /* + * Initialize the scan state if it isn't initialized yet. The chunk of blocks will be + * allocated when the first scan block is requested. + */ + if (!vacrel->plvstate->scanworker->scan_inited) + { + vacrel->plvstate->scanworker->scan_inited = true; + table_block_parallelscan_startblock_init(rel, + vacrel->plvstate->pbscanwork, + vacrel->plvstate->pbscan); + pg_atomic_init_u32(&(vacrel->plvstate->scanworker->last_blkno), + InvalidBlockNumber); + } + + *state_out = (void *) vacrel; +} + +/* + * Parallel heap vacuum callback for collecting dead items (i.e., lazy heap scan).
+ */ +void +heap_parallel_vacuum_collect_dead_items(Relation rel, ParallelVacuumState *pvs, + void *state) +{ + LVRelState *vacrel = (LVRelState *) state; + ErrorContextCallback errcallback; + + Assert(ParallelHeapVacuumIsActive(vacrel)); + + /* + * Set up error traceback support for ereport() for parallel table vacuum + * workers + */ + vacrel->dbname = get_database_name(MyDatabaseId); + vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel)); + vacrel->relname = pstrdup(RelationGetRelationName(rel)); + vacrel->indname = NULL; + vacrel->phase = VACUUM_ERRCB_PHASE_SCAN_HEAP; + errcallback.callback = vacuum_error_callback; + errcallback.arg = vacrel; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* Join the parallel heap vacuum */ + do_lazy_scan_heap(vacrel, false); + + /* Advertise the last processed block number */ + pg_atomic_write_u32(&(vacrel->plvstate->scanworker->last_blkno), vacrel->last_blkno); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; } /* diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 28997918f1ca..770e0395a964 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -504,6 +504,35 @@ parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats) pfree(pvs); } +/* + * Return the number of parallel workers initialized for parallel table vacuum. + */ +int +parallel_vacuum_get_nworkers_table(ParallelVacuumState *pvs) +{ + return pvs->nworkers_for_table; +} + +/* + * Return the array of indexes associated with the given table to be vacuumed. + */ +Relation * +parallel_vacuum_get_table_indexes(ParallelVacuumState *pvs, int *nindexes) +{ + *nindexes = pvs->nindexes; + + return pvs->indrels; +} + +/* + * Return the buffer strategy for parallel vacuum. + */ +BufferAccessStrategy +parallel_vacuum_get_bstrategy(ParallelVacuumState *pvs) +{ + return pvs->bstrategy; +} + /* * Returns the dead items space and dead items information.
*/ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 4e794ba6a50f..d09d353af57f 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -15,6 +15,7 @@ #define HEAPAM_H #include "access/heapam_xlog.h" +#include "access/parallel.h" #include "access/relation.h" /* for backward compatibility */ #include "access/relscan.h" #include "access/sdir.h" @@ -397,10 +398,20 @@ extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer, /* in heap/vacuumlazy.c */ struct VacuumParams; +struct ParallelVacuumState; extern void heap_vacuum_rel(Relation rel, struct VacuumParams *params, BufferAccessStrategy bstrategy); extern int heap_parallel_vacuum_compute_workers(Relation rel, int nworkers_requested, void *state); +extern void heap_parallel_vacuum_estimate(Relation rel, ParallelContext *pcxt, int nworkers, + void *state); +extern void heap_parallel_vacuum_initialize(Relation rel, ParallelContext *pcxt, + int nworkers, void *state); +extern void heap_parallel_vacuum_initialize_worker(Relation rel, struct ParallelVacuumState *pvs, + ParallelWorkerContext *pwcxt, + void **state_out); +extern void heap_parallel_vacuum_collect_dead_items(Relation rel, struct ParallelVacuumState *pvs, + void *state); /* in heap/heapam_visibility.c */ extern bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index e785a4a583f2..849cb4dcc74d 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -385,6 +385,9 @@ extern ParallelVacuumState *parallel_vacuum_init(Relation rel, Relation *indrels BufferAccessStrategy bstrategy, void *state); extern void parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats); +extern int parallel_vacuum_get_nworkers_table(ParallelVacuumState *pvs); +extern Relation *parallel_vacuum_get_table_indexes(ParallelVacuumState *pvs, int *nindexes); +extern BufferAccessStrategy parallel_vacuum_get_bstrategy(ParallelVacuumState *pvs); extern TidStore *parallel_vacuum_get_dead_items(ParallelVacuumState *pvs, VacDeadItemsInfo **dead_items_info_p); extern void parallel_vacuum_reset_dead_items(ParallelVacuumState *pvs); diff --git a/src/test/regress/expected/vacuum.out b/src/test/regress/expected/vacuum.out index 0abcc99989e0..f92c3f73c29a 100644 --- a/src/test/regress/expected/vacuum.out +++ b/src/test/regress/expected/vacuum.out @@ -160,6 +160,11 @@ UPDATE pvactst SET i = i WHERE i < 1000; VACUUM (PARALLEL 2) pvactst; UPDATE pvactst SET i = i WHERE i < 1000; VACUUM (PARALLEL 0) pvactst; -- disable parallel vacuum +-- VACUUM invokes parallel heap vacuum. 
+SET min_parallel_table_scan_size to 0; +VACUUM (PARALLEL 2, FREEZE) pvactst2; +UPDATE pvactst2 SET i = i WHERE i < 1000; +VACUUM (PARALLEL 1) pvactst2; VACUUM (PARALLEL -1) pvactst; -- error ERROR: parallel workers for vacuum must be between 0 and 1024 LINE 1: VACUUM (PARALLEL -1) pvactst; @@ -185,6 +190,7 @@ VACUUM (PARALLEL 1, FULL FALSE) tmp; -- parallel vacuum disabled for temp tables WARNING: disabling parallel option of vacuum on "tmp" --- cannot vacuum temporary tables in parallel VACUUM (PARALLEL 0, FULL TRUE) tmp; -- can specify parallel disabled (even though that's implied by FULL) RESET min_parallel_index_scan_size; +RESET min_parallel_table_scan_size; DROP TABLE pvactst; DROP TABLE pvactst2; -- INDEX_CLEANUP option diff --git a/src/test/regress/sql/vacuum.sql b/src/test/regress/sql/vacuum.sql index a72bdb5b619d..b8abab28ea92 100644 --- a/src/test/regress/sql/vacuum.sql +++ b/src/test/regress/sql/vacuum.sql @@ -129,6 +129,12 @@ VACUUM (PARALLEL 2) pvactst; UPDATE pvactst SET i = i WHERE i < 1000; VACUUM (PARALLEL 0) pvactst; -- disable parallel vacuum +-- VACUUM invokes parallel heap vacuum. +SET min_parallel_table_scan_size to 0; +VACUUM (PARALLEL 2, FREEZE) pvactst2; +UPDATE pvactst2 SET i = i WHERE i < 1000; +VACUUM (PARALLEL 1) pvactst2; + VACUUM (PARALLEL -1) pvactst; -- error VACUUM (PARALLEL 2, INDEX_CLEANUP FALSE) pvactst; VACUUM (PARALLEL 2, FULL TRUE) pvactst; -- error, cannot use both PARALLEL and FULL @@ -148,6 +154,7 @@ CREATE INDEX tmp_idx1 ON tmp (a); VACUUM (PARALLEL 1, FULL FALSE) tmp; -- parallel vacuum disabled for temp tables VACUUM (PARALLEL 0, FULL TRUE) tmp; -- can specify parallel disabled (even though that's implied by FULL) RESET min_parallel_index_scan_size; +RESET min_parallel_table_scan_size; DROP TABLE pvactst; DROP TABLE pvactst2; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 6cab30079b9d..bb8eefd34263 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1959,6 +1959,10 @@ PLpgSQL_type PLpgSQL_type_type PLpgSQL_var PLpgSQL_variable +ParallelLVLeader +ParallelLVScanWorker +ParallelLVShared +ParallelLVState PLwdatum PLword PLyArrayToOb
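Illustration (not part of the patch series): the vacuumparallel.c accessors added above are what a table AM's parallel_vacuum_initialize_worker callback is expected to build its per-worker state from, as heap_parallel_vacuum_initialize_worker does in vacuumlazy.c. The sketch below shows how a hypothetical non-heap table AM might do the same; MyAmWorkerState, MY_AM_SHARED_KEY, and my_am_parallel_vacuum_initialize_worker are invented names used only for this example, not APIs introduced by the patch.

/*
 * Hypothetical sketch only -- not part of this patch series.
 */
#include "postgres.h"

#include "access/parallel.h"
#include "access/tidstore.h"
#include "commands/vacuum.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"
#include "utils/rel.h"

/* Invented shm_toc key for the AM's own shared state */
#define MY_AM_SHARED_KEY	0xFF000001

/* Per-worker state handed back through *state_out */
typedef struct MyAmWorkerState
{
	Relation	rel;
	Relation   *indrels;
	int			nindexes;
	BufferAccessStrategy bstrategy;
	TidStore   *dead_items;
	VacDeadItemsInfo *dead_items_info;
	void	   *am_shared;		/* AM-specific shared area from the DSM */
} MyAmWorkerState;

static void
my_am_parallel_vacuum_initialize_worker(Relation rel, ParallelVacuumState *pvs,
										ParallelWorkerContext *pwcxt,
										void **state_out)
{
	MyAmWorkerState *wstate = palloc0(sizeof(MyAmWorkerState));

	wstate->rel = rel;

	/* Fetch what the leader prepared, via the new accessors */
	wstate->indrels = parallel_vacuum_get_table_indexes(pvs, &wstate->nindexes);
	wstate->bstrategy = parallel_vacuum_get_bstrategy(pvs);
	wstate->dead_items = parallel_vacuum_get_dead_items(pvs,
														&wstate->dead_items_info);

	/* Look up whatever the AM itself placed in the DSM during initialization */
	wstate->am_shared = shm_toc_lookup(pwcxt->toc, MY_AM_SHARED_KEY, false);

	/* Handed to the subsequent parallel_vacuum_collect_dead_items callback */
	*state_out = wstate;
}

A matching parallel_vacuum_collect_dead_items callback would then cast state back to MyAmWorkerState, record collected TIDs in dead_items, and return once its share of the scan is complete, mirroring heap_parallel_vacuum_collect_dead_items above.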