
Some more parallel copy fixes #5924

Merged 12 commits on Apr 3, 2025
2 changes: 1 addition & 1 deletion graph/src/env/store.rs
@@ -280,7 +280,7 @@ pub struct InnerStore {
last_rollup_from_poi: bool,
#[envconfig(from = "GRAPH_STORE_INSERT_EXTRA_COLS", default = "0")]
insert_extra_cols: usize,
#[envconfig(from = "GRAPH_STORE_FDW_FETCH_SIZE", default = "10000")]
#[envconfig(from = "GRAPH_STORE_FDW_FETCH_SIZE", default = "1000")]
fdw_fetch_size: usize,
}
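
As an aside, the default above is read through the `envconfig` derive. The following is a minimal, self-contained sketch of how such a default interacts with the environment; the struct and binary are hypothetical, only the variable name and the new default mirror the diff, and `init_from_env` assumes a recent envconfig release:

```rust
use envconfig::Envconfig;

// Hypothetical standalone config struct; only the env var name and the new
// default value mirror the change above.
#[derive(Envconfig)]
struct FdwSettings {
    #[envconfig(from = "GRAPH_STORE_FDW_FETCH_SIZE", default = "1000")]
    fdw_fetch_size: usize,
}

fn main() {
    // Prints 1000 unless GRAPH_STORE_FDW_FETCH_SIZE is set in the
    // environment, in which case that value overrides the default.
    let settings = FdwSettings::init_from_env().unwrap();
    println!("fdw_fetch_size = {}", settings.fdw_fetch_size);
}
```

Exporting GRAPH_STORE_FDW_FETCH_SIZE at process start overrides the compiled-in default.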

40 changes: 33 additions & 7 deletions store/postgres/src/connection_pool.rs
@@ -347,12 +347,14 @@ impl PoolName {
#[derive(Clone)]
struct PoolStateTracker {
available: Arc<AtomicBool>,
ignore_timeout: Arc<AtomicBool>,
}

impl PoolStateTracker {
fn new() -> Self {
Self {
available: Arc::new(AtomicBool::new(true)),
ignore_timeout: Arc::new(AtomicBool::new(false)),
}
}

@@ -367,6 +369,20 @@ impl PoolStateTracker {
fn is_available(&self) -> bool {
self.available.load(Ordering::Relaxed)
}

fn timeout_is_ignored(&self) -> bool {
self.ignore_timeout.load(Ordering::Relaxed)
}

fn ignore_timeout<F, R>(&self, f: F) -> R
where
F: FnOnce() -> R,
{
self.ignore_timeout.store(true, Ordering::Relaxed);
let res = f();
self.ignore_timeout.store(false, Ordering::Relaxed);
res
}
}

impl ConnectionPool {
@@ -530,8 +546,12 @@ impl ConnectionPool {
&self,
logger: &Logger,
timeout: Duration,
) -> Result<Option<PooledConnection<ConnectionManager<PgConnection>>>, StoreError> {
self.get_ready()?.try_get_fdw(logger, timeout)
) -> Option<PooledConnection<ConnectionManager<PgConnection>>> {
let Ok(inner) = self.get_ready() else {
return None;
};
self.state_tracker
.ignore_timeout(|| inner.try_get_fdw(logger, timeout))
}

pub fn connection_detail(&self) -> Result<ForeignServer, StoreError> {
@@ -740,6 +760,9 @@ impl HandleEvent for EventHandler {
}

fn handle_timeout(&self, event: e::TimeoutEvent) {
if self.state_tracker.timeout_is_ignored() {
return;
}
self.add_conn_wait_time(event.timeout());
if self.state_tracker.is_available() {
error!(self.logger, "Connection checkout timed out";
@@ -1042,15 +1065,18 @@ impl PoolInner {
&self,
logger: &Logger,
timeout: Duration,
) -> Result<Option<PooledConnection<ConnectionManager<PgConnection>>>, StoreError> {
) -> Option<PooledConnection<ConnectionManager<PgConnection>>> {
// Any error trying to get a connection is treated as "couldn't get
// a connection in time". If there is a serious error with the
// database, e.g., because it's not available, the next database
// operation will run into it and report it.
self.fdw_pool(logger)?
.get_timeout(timeout)
.map(|conn| Some(conn))
.or_else(|_| Ok(None))
let Ok(fdw_pool) = self.fdw_pool(logger) else {
return None;
};
let Ok(conn) = fdw_pool.get_timeout(timeout) else {
return None;
};
Some(conn)
}

pub fn connection_detail(&self) -> Result<ForeignServer, StoreError> {
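
Condensed, the timeout-suppression pattern introduced in this file looks roughly like the self-contained sketch below: a shared flag is raised around a checkout that is expected to time out, and the pool's timeout event handler checks it before logging. Names are simplified and the r2d2 handler itself is not shown.

```rust
use std::sync::{
    atomic::{AtomicBool, Ordering},
    Arc,
};

// Condensed sketch of the pattern above: a shared flag is set around a
// blocking checkout that is allowed to time out, and the pool's timeout
// event handler stays quiet while the flag is raised.
#[derive(Clone)]
struct Tracker {
    ignore_timeout: Arc<AtomicBool>,
}

impl Tracker {
    fn new() -> Self {
        Self {
            ignore_timeout: Arc::new(AtomicBool::new(false)),
        }
    }

    // Run `f` with timeout reporting suppressed, then restore it.
    fn ignore_timeout<F, R>(&self, f: F) -> R
    where
        F: FnOnce() -> R,
    {
        self.ignore_timeout.store(true, Ordering::Relaxed);
        let res = f();
        self.ignore_timeout.store(false, Ordering::Relaxed);
        res
    }

    // What the event handler consults before logging a checkout timeout.
    fn timeout_is_ignored(&self) -> bool {
        self.ignore_timeout.load(Ordering::Relaxed)
    }
}

fn main() {
    let tracker = Tracker::new();
    tracker.ignore_timeout(|| assert!(tracker.timeout_is_ignored()));
    assert!(!tracker.timeout_is_ignored());
}
```

As in the diff itself, the flag is not restored if `f` panics; a scope guard would be needed to make the suppression panic-safe.
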
190 changes: 147 additions & 43 deletions store/postgres/src/copy.rs
@@ -37,6 +37,7 @@ use graph::{
info, lazy_static, o, warn, BlockNumber, BlockPtr, CheapClone, Logger, StoreError, ENV_VARS,
},
schema::EntityType,
slog::error,
};
use itertools::Itertools;

@@ -125,7 +126,7 @@ pub fn is_source(conn: &mut PgConnection, site: &Site) -> Result<bool, StoreErro
.map_err(StoreError::from)
}

#[derive(Copy, Clone, PartialEq, Eq)]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Status {
Finished,
Cancelled,
@@ -687,9 +688,18 @@ impl CopyTableWorker {
}
}

async fn run(mut self, logger: Logger, progress: Arc<CopyProgress>) -> Self {
self.result = self.run_inner(logger, &progress);
self
async fn run(
mut self,
logger: Logger,
progress: Arc<CopyProgress>,
) -> Result<Self, StoreError> {
let object = self.table.dst.object.cheap_clone();
graph::spawn_blocking_allow_panic(move || {
self.result = self.run_inner(logger, &progress);
self
})
.await
.map_err(|e| constraint_violation!("copy worker for {} panicked: {}", object, e))
}

fn run_inner(&mut self, logger: Logger, progress: &CopyProgress) -> Result<Status, StoreError> {
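
The new `run` above moves the blocking copy onto a blocking thread and converts a worker panic into a `StoreError` instead of unwinding the caller. Below is a rough, generic equivalent of that pattern, under the assumption that `graph::spawn_blocking_allow_panic` is a thin wrapper around `tokio::task::spawn_blocking`:

```rust
use tokio::task::JoinError;

// Generic sketch only: run a blocking step off the async executor and turn a
// panic (surfaced as a JoinError) into the caller's own error type.
async fn run_blocking_step<T, E>(
    step: impl FnOnce() -> T + Send + 'static,
    on_panic: impl FnOnce(JoinError) -> E,
) -> Result<T, E>
where
    T: Send + 'static,
{
    tokio::task::spawn_blocking(step).await.map_err(on_panic)
}

#[tokio::main]
async fn main() {
    let res: Result<u32, String> =
        run_blocking_step(|| 40 + 2, |e| format!("copy worker panicked: {e}")).await;
    assert_eq!(res, Ok(42));
}
```
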
@@ -739,10 +749,19 @@ impl CopyTableWorker {
break status;
}
Err(StoreError::StatementTimeout) => {
let timeout = ENV_VARS
.store
.batch_timeout
.map(|t| t.as_secs().to_string())
.unwrap_or_else(|| "unlimted".to_string());
warn!(
logger,
"Current batch took longer than GRAPH_STORE_BATCH_TIMEOUT seconds. Retrying with a smaller batch size."
);
logger,
"Current batch timed out. Retrying with a smaller batch size.";
"timeout_s" => timeout,
"table" => self.table.dst.qualified_name.as_str(),
"current_vid" => self.table.batcher.next_vid(),
"current_batch_size" => self.table.batcher.batch_size(),
);
}
Err(e) => {
return Err(e);
@@ -857,7 +876,7 @@ impl Connection {
{
let Some(conn) = self.conn.as_mut() else {
return Err(constraint_violation!(
"copy connection has been handed to background task but not returned yet"
"copy connection has been handed to background task but not returned yet (transaction)"
));
};
conn.transaction(|conn| f(conn))
@@ -890,20 +909,18 @@ impl Connection {
&mut self,
state: &mut CopyState,
progress: &Arc<CopyProgress>,
) -> Result<Option<Pin<Box<dyn Future<Output = CopyTableWorker>>>>, StoreError> {
let conn = self.conn.take().ok_or_else(|| {
constraint_violation!(
"copy connection has been handed to background task but not returned yet"
)
})?;
) -> Option<Pin<Box<dyn Future<Output = Result<CopyTableWorker, StoreError>>>>> {
let Some(conn) = self.conn.take() else {
return None;
};
let Some(table) = state.unfinished.pop() else {
return Ok(None);
return None;
};

let worker = CopyTableWorker::new(conn, table);
Ok(Some(Box::pin(
Some(Box::pin(
worker.run(self.logger.cheap_clone(), progress.cheap_clone()),
)))
))
}

/// Opportunistically create an extra worker if we have more tables to
Expand All @@ -913,29 +930,74 @@ impl Connection {
&mut self,
state: &mut CopyState,
progress: &Arc<CopyProgress>,
) -> Result<Option<Pin<Box<dyn Future<Output = CopyTableWorker>>>>, StoreError> {
) -> Option<Pin<Box<dyn Future<Output = Result<CopyTableWorker, StoreError>>>>> {
// It's important that we get the connection before the table since
// we remove the table from the state and could drop it otherwise
let Some(conn) = self
.pool
.try_get_fdw(&self.logger, ENV_VARS.store.batch_worker_wait)?
.try_get_fdw(&self.logger, ENV_VARS.store.batch_worker_wait)
else {
return Ok(None);
return None;
};
let Some(table) = state.unfinished.pop() else {
return Ok(None);
return None;
};

let worker = CopyTableWorker::new(conn, table);
Ok(Some(Box::pin(
Some(Box::pin(
worker.run(self.logger.cheap_clone(), progress.cheap_clone()),
)))
))
}

/// Check that we can make progress, i.e., that we have at least one
/// worker that copies as long as there are unfinished tables. This is a
/// safety check to guard against `copy_data_internal` looping forever
/// because of some internal inconsistency
fn assert_progress(&self, num_workers: usize, state: &CopyState) -> Result<(), StoreError> {
if num_workers == 0 && !state.unfinished.is_empty() {
// Something bad happened. We should have at least one
// worker if there are still tables to copy
if self.conn.is_none() {
return Err(constraint_violation!(
"copy connection has been handed to background task but not returned yet (copy_data_internal)"
));
} else {
return Err(constraint_violation!(
"no workers left but still tables to copy"
));
}
}
Ok(())
}

pub async fn copy_data_internal(
/// Wait for all workers to finish. This is called when a worker has
/// failed with an error that forces us to abort copying
async fn cancel_workers(
&mut self,
index_list: IndexList,
) -> Result<Status, StoreError> {
progress: Arc<CopyProgress>,
mut workers: Vec<Pin<Box<dyn Future<Output = Result<CopyTableWorker, StoreError>>>>>,
) {
progress.cancel();
error!(
self.logger,
"copying encountered an error; waiting for all workers to finish"
);
while !workers.is_empty() {
let (result, _, remaining) = select_all(workers).await;
workers = remaining;
match result {
Ok(worker) => {
self.conn = Some(worker.conn);
}
Err(e) => {
/* Ignore; we had an error previously */
error!(self.logger, "copy worker panicked: {}", e);
}
}
}
}

async fn copy_data_internal(&mut self, index_list: IndexList) -> Result<Status, StoreError> {
let src = self.src.clone();
let dst = self.dst.clone();
let target_block = self.target_block.clone();
@@ -949,40 +1011,69 @@ impl Connection {
// connection in `self.conn`. If the fdw pool has idle connections
// and there are more tables to be copied, we can start more
// workers, up to `self.workers` many
//
// The loop has to be very careful about terminating early so that
// we do not ever leave the loop with `self.conn == None`
let mut workers = Vec::new();
while !state.unfinished.is_empty() || !workers.is_empty() {
// We usually add at least one job here, except if we are out of
// tables to copy. In that case, we go through the `while` loop
// every time one of the tables we are currently copying
// finishes
if let Some(worker) = self.default_worker(&mut state, &progress)? {
if let Some(worker) = self.default_worker(&mut state, &progress) {
workers.push(worker);
}
loop {
if workers.len() >= self.workers {
break;
}
let Some(worker) = self.extra_worker(&mut state, &progress)? else {
let Some(worker) = self.extra_worker(&mut state, &progress) else {
break;
};
workers.push(worker);
}
let (worker, _idx, remaining) = select_all(workers).await;
workers = remaining;

// Put the connection back into self.conn so that we can use it
// in the next iteration.
self.conn = Some(worker.conn);
state.finished.push(worker.table);

if worker.result.is_err() {
progress.cancel();
return worker.result;
}
self.assert_progress(workers.len(), &state)?;
let (result, _idx, remaining) = select_all(workers).await;
workers = remaining;

if progress.is_cancelled() {
return Ok(Status::Cancelled);
}
// Analyze `result` and take another trip through the loop if
// everything is ok; wait for pending workers and return if
// there was an error or if copying was cancelled.
match result {
Err(e) => {
// This is a panic in the background task. We need to
// cancel all other tasks and return the error
self.cancel_workers(progress, workers).await;
return Err(e);
}
Ok(worker) => {
// Put the connection back into self.conn so that we can use it
// in the next iteration.
self.conn = Some(worker.conn);

match (worker.result, progress.is_cancelled()) {
(Ok(Status::Finished), false) => {
// The worker finished successfully, and nothing was
// cancelled; take another trip through the loop
state.finished.push(worker.table);
}
(Ok(Status::Finished), true) => {
state.finished.push(worker.table);
self.cancel_workers(progress, workers).await;
return Ok(Status::Cancelled);
}
(Ok(Status::Cancelled), _) => {
self.cancel_workers(progress, workers).await;
return Ok(Status::Cancelled);
}
(Err(e), _) => {
self.cancel_workers(progress, workers).await;
return Err(e);
}
}
}
};
}
debug_assert!(self.conn.is_some());
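
Both this loop and `cancel_workers` drive the workers with `futures::future::select_all`, which resolves to the first completed future, its index, and the futures that are still pending. The following is a self-contained toy version of that drain pattern, illustrative only and not graph-node code:

```rust
use std::future::Future;
use std::pin::Pin;

use futures::future::select_all;

// Illustrative only: drain a set of boxed futures the way the copy loop and
// `cancel_workers` drain their workers, handling one completion at a time.
async fn drain(mut tasks: Vec<Pin<Box<dyn Future<Output = u32>>>>) -> Vec<u32> {
    let mut results = Vec::new();
    while !tasks.is_empty() {
        // `select_all` yields the first finished future's output, its index,
        // and the futures that are still pending.
        let (value, _index, remaining) = select_all(tasks).await;
        results.push(value);
        tasks = remaining;
    }
    results
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let mut tasks: Vec<Pin<Box<dyn Future<Output = u32>>>> = Vec::new();
    for n in 1..=3u32 {
        tasks.push(Box::pin(async move { n * 10 }));
    }
    let mut results = drain(tasks).await;
    results.sort();
    assert_eq!(results, vec![10, 20, 30]);
}
```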

@@ -1048,7 +1139,7 @@ impl Connection {
/// lower(v1.block_range) => v2.vid > v1.vid` and we can therefore stop
/// the copying of each table as soon as we hit `max_vid = max { v.vid |
/// lower(v.block_range) <= target_block.number }`.
pub async fn copy_data(&mut self, index_list: IndexList) -> Result<Status, StoreError> {
pub async fn copy_data(mut self, index_list: IndexList) -> Result<Status, StoreError> {
// We require sole access to the destination site, and that we get a
// consistent view of what has been copied so far. In general, that
// is always true. It can happen though that this function runs when
@@ -1061,10 +1152,23 @@ impl Connection {
&self.logger,
"Obtaining copy lock (this might take a long time if another process is still copying)"
);

let dst_site = self.dst.site.cheap_clone();
self.transaction(|conn| advisory_lock::lock_copying(conn, &dst_site))?;

let res = self.copy_data_internal(index_list).await;

if self.conn.is_none() {
// A background worker panicked and left us without our
// dedicated connection, but we still need to release the copy
// lock; for that, get a normal connection rather than one from the
// fdw pool, since the normal pool is much less contended. We won't be
// holding on to the connection for long, as `res` will be an error and
// we will abort starting this subgraph
self.conn = Some(self.pool.get()?);
}
self.transaction(|conn| advisory_lock::unlock_copying(conn, &dst_site))?;

if matches!(res, Ok(Status::Cancelled)) {
warn!(&self.logger, "Copying was cancelled and is incomplete");
}
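
The copy lock taken and released above sits on top of Postgres advisory locks. Below is a generic diesel sketch of that underlying mechanism; it is not graph-node's `advisory_lock` module, and the numeric key is hypothetical (in practice it would be derived from the destination site):

```rust
use diesel::prelude::*;
use diesel::sql_query;
use diesel::sql_types::BigInt;

// Generic sketch of a session-level advisory lock; `key` stands in for
// whatever identifier the application derives for the resource being locked.
fn lock_copy(conn: &mut PgConnection, key: i64) -> QueryResult<()> {
    sql_query("SELECT pg_advisory_lock($1)")
        .bind::<BigInt, _>(key)
        .execute(conn)
        .map(|_| ())
}

fn unlock_copy(conn: &mut PgConnection, key: i64) -> QueryResult<()> {
    sql_query("SELECT pg_advisory_unlock($1)")
        .bind::<BigInt, _>(key)
        .execute(conn)
        .map(|_| ())
}
```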