From bb87edcdb54291e13fa6932400d6b037be0f8250 Mon Sep 17 00:00:00 2001 From: David Steele Date: Fri, 24 Jan 2025 18:38:08 +0000 Subject: [PATCH 1/2] Add pg_control flag to prevent recovery without backup_label. Harden recovery by adding a flag to pg_control to indicate that backup_label is required. This prevents the user from deleting backup_label resulting in an inconsistent recovery. Another advantage is that the copy of pg_control used by pg_basebackup is guaranteed not to be torn. This functionality is limited to pg_basebackup (or any software comfortable with modifying pg_control). Control and catalog version bumps are required. --- doc/src/sgml/func.sgml | 5 +++++ src/backend/access/transam/xlog.c | 23 +++++++++++++++++++++++ src/backend/access/transam/xlogrecovery.c | 10 +++++++++- src/backend/backup/basebackup.c | 15 ++++++--------- src/backend/utils/misc/pg_controldata.c | 7 +++++-- src/bin/pg_controldata/pg_controldata.c | 2 ++ src/bin/pg_resetwal/pg_resetwal.c | 1 + src/bin/pg_rewind/pg_rewind.c | 1 + src/include/access/xlog.h | 1 + src/include/catalog/pg_control.h | 4 ++++ src/include/catalog/pg_proc.dat | 6 +++--- src/test/recovery/t/002_archiving.pl | 20 ++++++++++++++++++++ 12 files changed, 80 insertions(+), 15 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 1c5cfee25d12..02cda525a584 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -28256,6 +28256,11 @@ acl | {postgres=arwdDxtm/postgres,foo=r/postgres} boolean + + backup_label_required + boolean + + diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ec40c0b7c42b..ab4916b24684 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9590,6 +9590,29 @@ do_pg_abort_backup(int code, Datum arg) } } +/* + * Create a consistent copy of control data to be used for backup and update it + * to require a backup label for recovery. Also recalculate the CRC. + */ +void +backup_control_file(uint8_t *controlFile) +{ + ControlFileData *controlData = ((ControlFileData *)controlFile); + + memset(controlFile + sizeof(ControlFileData), 0, + PG_CONTROL_FILE_SIZE - sizeof(ControlFileData)); + + LWLockAcquire(ControlFileLock, LW_SHARED); + memcpy(controlFile, ControlFile, sizeof(ControlFileData)); + LWLockRelease(ControlFileLock); + + controlData->backupLabelRequired = true; + + INIT_CRC32C(controlData->crc); + COMP_CRC32C(controlData->crc, controlFile, offsetof(ControlFileData, crc)); + FIN_CRC32C(controlData->crc); +} + /* * Register a handler that will warn about unterminated backups at end of * session, unless this has already been done. diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 6ce979f2d8bc..e030511c409d 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -709,7 +709,14 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, } else { - /* No backup_label file has been found if we are here. */ + /* + * No backup_label file has been found if we are here. Error if the + * control file requires backup_label. + */ + if (ControlFile->backupLabelRequired) + ereport(FATAL, + (errmsg("could not find backup_label required for recovery"), + errhint("backup_label must be present for recovery to proceed"))); /* * If tablespace_map file is present without backup_label file, there @@ -984,6 +991,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, { ControlFile->backupStartPoint = checkPoint.redo; ControlFile->backupEndRequired = backupEndRequired; + ControlFile->backupLabelRequired = false; if (backupFromStandby) { diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index f0f88838dc21..b322b668b5cb 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -23,6 +23,7 @@ #include "backup/basebackup_incremental.h" #include "backup/basebackup_sink.h" #include "backup/basebackup_target.h" +#include "catalog/pg_control.h" #include "catalog/pg_tablespace_d.h" #include "commands/defrem.h" #include "common/compression.h" @@ -326,9 +327,9 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, if (ti->path == NULL) { - struct stat statbuf; bool sendtblspclinks = true; char *backup_label; + uint8_t controlFile[PG_CONTROL_FILE_SIZE]; bbsink_begin_archive(sink, "base.tar"); @@ -351,14 +352,10 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, sendtblspclinks, &manifest, InvalidOid, ib); /* ... and pg_control after everything else. */ - if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", - XLOG_CONTROL_FILE))); - sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, - false, InvalidOid, InvalidOid, - InvalidRelFileNumber, 0, &manifest, 0, NULL, 0); + backup_control_file(controlFile); + sendFileWithContent(sink, XLOG_CONTROL_FILE, + (char *)controlFile, PG_CONTROL_FILE_SIZE, + &manifest); } else { diff --git a/src/backend/utils/misc/pg_controldata.c b/src/backend/utils/misc/pg_controldata.c index 6d036e3bf328..d0ebd8e98a7f 100644 --- a/src/backend/utils/misc/pg_controldata.c +++ b/src/backend/utils/misc/pg_controldata.c @@ -162,8 +162,8 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) Datum pg_control_recovery(PG_FUNCTION_ARGS) { - Datum values[5]; - bool nulls[5]; + Datum values[6]; + bool nulls[6]; TupleDesc tupdesc; HeapTuple htup; ControlFileData *ControlFile; @@ -195,6 +195,9 @@ pg_control_recovery(PG_FUNCTION_ARGS) values[4] = BoolGetDatum(ControlFile->backupEndRequired); nulls[4] = false; + values[5] = BoolGetDatum(ControlFile->backupLabelRequired); + nulls[5] = false; + htup = heap_form_tuple(tupdesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(htup)); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 7bb801bb8861..2d6327089c56 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -294,6 +294,8 @@ main(int argc, char *argv[]) LSN_FORMAT_ARGS(ControlFile->backupEndPoint)); printf(_("End-of-backup record required: %s\n"), ControlFile->backupEndRequired ? _("yes") : _("no")); + printf(_("Backup label required: %s\n"), + ControlFile->backupLabelRequired ? _("yes") : _("no")); printf(_("wal_level setting: %s\n"), wal_level_str(ControlFile->wal_level)); printf(_("wal_log_hints setting: %s\n"), diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 2ae3702efba1..88acbb9efb85 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -899,6 +899,7 @@ RewriteControlFile(void) ControlFile.backupStartPoint = 0; ControlFile.backupEndPoint = 0; ControlFile.backupEndRequired = false; + ControlFile.backupLabelRequired = false; /* * Force the defaults for max_* settings. The values don't really matter diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 9d16c1e6b475..02c76335c196 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -727,6 +727,7 @@ perform_rewind(filemap_t *filemap, rewind_source *source, ControlFile_new.minRecoveryPoint = endrec; ControlFile_new.minRecoveryPointTLI = endtli; ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY; + ControlFile_new.backupLabelRequired = true; if (!dry_run) update_controlfile(datadir_target, &ControlFile_new, do_sync); } diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d313099c027f..78bc0943af19 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -296,6 +296,7 @@ extern void do_pg_backup_start(const char *backupidstr, bool fast, StringInfo tblspcmapfile); extern void do_pg_backup_stop(BackupState *state, bool waitforarchive); extern void do_pg_abort_backup(int code, Datum arg); +extern void backup_control_file(uint8_t *controlFile); extern void register_persistent_abort_backup_handler(void); extern SessionBackupState get_backup_status(void); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 63e834a6ce47..84df7ee6f507 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -164,12 +164,16 @@ typedef struct ControlFileData * If backupEndRequired is true, we know for sure that we're restoring * from a backup, and must see a backup-end record before we can safely * start up. + * + * If backupLabelRequired is true, then a backup_label file must be + * present in order for recovery to proceed. */ XLogRecPtr minRecoveryPoint; TimeLineID minRecoveryPointTLI; XLogRecPtr backupStartPoint; XLogRecPtr backupEndPoint; bool backupEndRequired; + bool backupLabelRequired; /* * Parameter settings that determine if the WAL can be used for archival diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 62beb71da288..7d8f55375bcb 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12304,9 +12304,9 @@ { oid => '3443', descr => 'pg_controldata recovery state information as a function', proname => 'pg_control_recovery', provolatile => 'v', prorettype => 'record', - proargtypes => '', proallargtypes => '{pg_lsn,int4,pg_lsn,pg_lsn,bool}', - proargmodes => '{o,o,o,o,o}', - proargnames => '{min_recovery_end_lsn,min_recovery_end_timeline,backup_start_lsn,backup_end_lsn,end_of_backup_record_required}', + proargtypes => '', proallargtypes => '{pg_lsn,int4,pg_lsn,pg_lsn,bool,bool}', + proargmodes => '{o,o,o,o,o,o}', + proargnames => '{min_recovery_end_lsn,min_recovery_end_timeline,backup_start_lsn,backup_end_lsn,end_of_backup_record_required,backup_label_required}', prosrc => 'pg_control_recovery' }, { oid => '3444', diff --git a/src/test/recovery/t/002_archiving.pl b/src/test/recovery/t/002_archiving.pl index 3acdb9ff1ebe..153c658c1235 100644 --- a/src/test/recovery/t/002_archiving.pl +++ b/src/test/recovery/t/002_archiving.pl @@ -41,6 +41,26 @@ archive_cleanup_command = 'echo archive_cleanup_done > $archive_cleanup_command_file' recovery_end_command = 'echo recovery_ended_done > $recovery_end_command_file' )); + +# Rename backup_label to verify that recovery will not start without it +rename("$data_dir/backup_label", "$data_dir/backup_label.tmp") + or BAIL_OUT "could not move $data_dir/backup_label"; + +my $res = run_log( + [ + 'pg_ctl', '-D', $node_standby->data_dir, '-l', + $node_standby->logfile, 'start' + ]); +ok(!$res, 'invalid recovery startup fails'); + +my $logfile = slurp_file($node_standby->logfile()); +ok($logfile =~ qr/could not find backup_label required for recovery/, + 'could not find backup_label required for recovery'); + +# Restore backup_label so recovery proceeds normally +rename("$data_dir/backup_label.tmp", "$data_dir/backup_label") + or BAIL_OUT "could not move $data_dir/backup_label"; + $node_standby->start; # Create some content on primary From 15c5409cde338e0f7a4231a3796ad6e8f3f23808 Mon Sep 17 00:00:00 2001 From: David Steele Date: Fri, 24 Jan 2025 18:38:09 +0000 Subject: [PATCH 2/2] Return pg_control from pg_backup_stop(). Harden recovery by returning a copy of pg_control from pg_backup_stop() that has a flag set to prevent recovery if the backup_label file is missing. Instead of backup software copying pg_control from PGDATA, it stores an updated version that is returned from pg_backup_stop(). This is better for the following reasons: * The user can no longer remove backup_label and get what looks like a successful recovery (while almost certainly causing corruption). If backup_label is removed the cluster will not start. The user may try pg_resetwal, but that tool makes it pretty clear that corruption will result from its use. * We don't need to worry about backup software seeing a torn copy of pg_control, since Postgres can safely read it out of memory and provide a valid copy via pg_backup_stop(). This solves torn reads without needing to write pg_control via a temp file, which may affect performance on a standby. * For backup from standby, we no longer need to instruct the backup software to copy pg_control last. In fact the backup software should not copy pg_control from PGDATA at all. These changes have no impact on current backup software and they are free to use the pg_control available from pg_stop_backup() or continue to use pg_control from PGDATA. Of course they will miss the benefits of getting a consistent copy of pg_control and the backup_label checking, but will be no worse off than before. Catalog version bump is required. --- doc/src/sgml/backup.sgml | 18 +++++- doc/src/sgml/func.sgml | 5 +- src/backend/access/transam/xlogfuncs.c | 20 ++++-- src/backend/catalog/system_functions.sql | 2 +- src/include/catalog/pg_proc.dat | 4 +- src/test/recovery/t/042_low_level_backup.pl | 67 ++++++++++++++++++++- 6 files changed, 101 insertions(+), 15 deletions(-) diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index 25b8904baf7c..7c45e2b151e6 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -1027,9 +1027,12 @@ SELECT * FROM pg_backup_stop(wait_for_archive => true); values. The second of these fields should be written to a file named backup_label in the root directory of the backup. The third field should be written to a file named - tablespace_map unless the field is empty. These files are + tablespace_map unless the field is empty. The fourth + field should be written into a file named + global/pg_control (overwriting the existing file when + present). These files are vital to the backup working and must be written byte for byte without - modification, which may require opening the file in binary mode. + modification, which will require opening the file in binary mode. @@ -1101,7 +1104,16 @@ SELECT * FROM pg_backup_stop(wait_for_archive => true); - You should, however, omit from the backup the files within the + You should exclude global/pg_control from your backup + and put the contents of the controlfile column + returned from pg_backup_stop in your backup at + global/pg_control. This version of pg_control contains + safeguards against recovery without backup_label present and is guaranteed + not to be torn. + + + + You should also omit from the backup the files within the cluster's pg_wal/ subdirectory. This slight adjustment is worthwhile because it reduces the risk of mistakes when restoring. This is easy to arrange if diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 02cda525a584..f51d8a41cfa5 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -29156,8 +29156,9 @@ stats_timestamp | 2025-03-24 13:55:47.796698+01 The result of the function is a single record. The lsn column holds the backup's ending write-ahead log location (which again can be ignored). The second - column returns the contents of the backup label file, and the third - column returns the contents of the tablespace map file. These must be + column returns the contents of the backup label file, the third column + returns the contents of the tablespace map file, and the fourth column + returns the contents of pg_control. These must be stored as part of the backup and are required as part of the restore process. diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 8c3090165f00..e805f231c693 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -113,9 +113,11 @@ pg_backup_start(PG_FUNCTION_ARGS) * * The backup_label contains the user-supplied label string (typically this * would be used to tell where the backup dump will be stored), the starting - * time, starting WAL location for the dump and so on. It is the caller's - * responsibility to write the backup_label and tablespace_map files in the - * data folder that will be restored from this backup. + * time, starting WAL location for the dump and so on. The pg_control file + * contains a consistent copy of pg_control that also has a safeguard against + * being used without backup_label. It is the caller's responsibility to write + * the backup_label, pg_control, and tablespace_map files in the data folder + * that will be restored from this backup. * * Permission checking for this function is managed through the normal * GRANT system. @@ -123,12 +125,14 @@ pg_backup_start(PG_FUNCTION_ARGS) Datum pg_backup_stop(PG_FUNCTION_ARGS) { -#define PG_BACKUP_STOP_V2_COLS 3 +#define PG_BACKUP_STOP_V2_COLS 4 TupleDesc tupdesc; Datum values[PG_BACKUP_STOP_V2_COLS] = {0}; bool nulls[PG_BACKUP_STOP_V2_COLS] = {0}; bool waitforarchive = PG_GETARG_BOOL(0); char *backup_label; + bytea *pg_control_bytea; + uint8_t pg_control[PG_CONTROL_FILE_SIZE]; SessionBackupState status = get_backup_status(); /* Initialize attributes information in the tuple descriptor */ @@ -150,9 +154,17 @@ pg_backup_stop(PG_FUNCTION_ARGS) /* Build the contents of backup_label */ backup_label = build_backup_content(backup_state, false); + /* Build the contents of pg_control */ + backup_control_file(pg_control); + + pg_control_bytea = (bytea *) palloc(PG_CONTROL_FILE_SIZE + VARHDRSZ); + SET_VARSIZE(pg_control_bytea, PG_CONTROL_FILE_SIZE + VARHDRSZ); + memcpy(VARDATA(pg_control_bytea), pg_control, PG_CONTROL_FILE_SIZE); + values[0] = LSNGetDatum(backup_state->stoppoint); values[1] = CStringGetTextDatum(backup_label); values[2] = CStringGetTextDatum(tablespace_map->data); + values[3] = PointerGetDatum(pg_control_bytea); /* Deallocate backup-related variables */ pfree(backup_label); diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql index 566f308e4439..f88333938331 100644 --- a/src/backend/catalog/system_functions.sql +++ b/src/backend/catalog/system_functions.sql @@ -390,7 +390,7 @@ CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION pg_backup_stop ( wait_for_archive boolean DEFAULT true, OUT lsn pg_lsn, - OUT labelfile text, OUT spcmapfile text) + OUT labelfile text, OUT spcmapfile text, OUT controlfile bytea) RETURNS record STRICT VOLATILE LANGUAGE internal as 'pg_backup_stop' PARALLEL RESTRICTED; diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 7d8f55375bcb..dcbdba5c1d87 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -6699,8 +6699,8 @@ { oid => '2739', descr => 'finish taking an online backup', proname => 'pg_backup_stop', provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => 'bool', - proallargtypes => '{bool,pg_lsn,text,text}', proargmodes => '{i,o,o,o}', - proargnames => '{wait_for_archive,lsn,labelfile,spcmapfile}', + proallargtypes => '{bool,pg_lsn,text,text,bytea}', proargmodes => '{i,o,o,o,o}', + proargnames => '{wait_for_archive,lsn,labelfile,spcmapfile,controlfile}', prosrc => 'pg_backup_stop' }, { oid => '3436', descr => 'promote standby server', proname => 'pg_promote', provolatile => 'v', prorettype => 'bool', diff --git a/src/test/recovery/t/042_low_level_backup.pl b/src/test/recovery/t/042_low_level_backup.pl index 5749a1df5330..a08a9a9d9dfe 100644 --- a/src/test/recovery/t/042_low_level_backup.pl +++ b/src/test/recovery/t/042_low_level_backup.pl @@ -13,6 +13,42 @@ use PostgreSQL::Test::Utils; use Test::More; +# Decode hex to binary +sub decode_hex +{ + my ($encoded) = @_; + my $decoded; + + $encoded =~ s/^\s+|\s+$//g; + + for (my $idx = 0; $idx < length($encoded); $idx += 2) + { + $decoded .= pack('C', hex(substr($encoded, $idx, 2))); + } + + return $decoded; +} + +# Get backup_label/pg_control from pg_stop_backup() +sub stop_backup_result +{ + my ($psql) = @_; + + my $encoded = $psql->query_safe( + "select encode(labelfile::bytea, 'hex') || ',' || " . + " encode(controlfile, 'hex')" . + " from pg_backup_stop()"); + + my @result; + + foreach my $column (split(',', $encoded)) + { + push(@result, decode_hex($column)); + } + + return @result; +} + # Start primary node with archiving. my $node_primary = PostgreSQL::Test::Cluster->new('primary'); $node_primary->init(has_archiving => 1, allows_streaming => 1); @@ -80,8 +116,7 @@ 'SELECT pg_walfile_name(pg_current_wal_lsn())'); # Stop backup and get backup_label, the last segment is archived. -my $backup_label = - $psql->query_safe("select labelfile from pg_backup_stop()"); +(my $backup_label, my $pg_control) = stop_backup_result($psql); $psql->quit; @@ -118,10 +153,36 @@ $node_replica->teardown_node; $node_replica->clean_node; +# Save only pg_control into the backup to demonstrate that pg_control returned +# from pg_stop_backup() will only perform recovery when backup_label is present. +open(my $fh, ">", "$backup_dir/global/pg_control") + or die "could not open pg_control"; +binmode($fh); +syswrite($fh, $pg_control); +close($fh); + +$node_replica = PostgreSQL::Test::Cluster->new('replica_fail2'); +$node_replica->init_from_backup($node_primary, $backup_name, + has_restoring => 1); + +my $res = run_log( + [ + 'pg_ctl', '-D', $node_replica->data_dir, '-l', + $node_replica->logfile, 'start' + ]); +ok(!$res, 'invalid recovery startup fails'); + +my $logfile = slurp_file($node_replica->logfile()); +ok($logfile =~ qr/could not find backup_label required for recovery/, + 'could not find backup_label required for recovery'); + +$node_replica->teardown_node; +$node_replica->clean_node; + # Save backup_label into the backup directory and recover using the primary's # archive. This time recovery will succeed and the canary table will be # present. -open my $fh, ">>", "$backup_dir/backup_label" +open $fh, ">>", "$backup_dir/backup_label" or die "could not open backup_label"; # Binary mode is required for Windows, as the backup_label parsing is not # able to cope with CRLFs.