summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobert Haas2023-12-05 16:45:58 +0000
committerRobert Haas2023-12-05 16:45:58 +0000
commit2c3d0ae11652905a9f2143df93c0812418dccbc9 (patch)
tree3e0e5e23f5338ebba8d7b0fb724af20cb7a18efa
parent077729031beeb40231efe2dfc9271113d34db9e4 (diff)
backport changes we want to keepwalsummarizermanifest
-rw-r--r--doc/src/sgml/protocol.sgml24
-rw-r--r--src/backend/backup/basebackup.c6
-rw-r--r--src/backend/backup/basebackup_incremental.c110
-rw-r--r--src/backend/postmaster/walsummarizer.c5
-rw-r--r--src/bin/pg_basebackup/pg_basebackup.c6
-rw-r--r--src/bin/pg_combinebackup/copy_file.c2
-rw-r--r--src/bin/pg_combinebackup/load_manifest.c2
-rw-r--r--src/bin/pg_combinebackup/pg_combinebackup.c25
-rw-r--r--src/include/postmaster/walsummarizer.h3
9 files changed, 159 insertions, 24 deletions
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index af3f016f74..9a66918171 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -2599,6 +2599,19 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
</listitem>
</varlistentry>
+ <varlistentry id="protocol-replication-upload-manifest">
+ <term>
+ <literal>UPLOAD_MANIFEST</literal>
+ <indexterm><primary>UPLOAD_MANIFEST</primary></indexterm>
+ </term>
+ <listitem>
+ <para>
+ Uploads a backup manifest in preparation for taking an incremental
+ backup.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="protocol-replication-base-backup" xreflabel="BASE_BACKUP">
<term><literal>BASE_BACKUP</literal> [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ]
<indexterm><primary>BASE_BACKUP</primary></indexterm>
@@ -2838,6 +2851,17 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
</para>
</listitem>
</varlistentry>
+
+ <varlistentry>
+ <term><literal>INCREMENTAL</literal></term>
+ <listitem>
+ <para>
+ Requests an incremental backup. The
+ <literal>UPLOAD_MANIFEST</literal> command must be executed
+ before running a base backup with this option.
+ </para>
+ </listitem>
+ </varlistentry>
</variablelist>
</para>
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index 9ecce5f222..5ee9628422 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -35,6 +35,7 @@
#include "pgtar.h"
#include "port.h"
#include "postmaster/syslogger.h"
+#include "postmaster/walsummarizer.h"
#include "replication/walsender.h"
#include "replication/walsender_private.h"
#include "storage/bufpage.h"
@@ -788,6 +789,11 @@ parse_basebackup_options(List *options, basebackup_options *opt)
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("duplicate option \"%s\"", defel->defname)));
opt->incremental = defGetBoolean(defel);
+ if (opt->incremental && !summarize_wal)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("incremental backups cannot be taken unless WAL summarization is enabled")));
+ opt->incremental = defGetBoolean(defel);
o_incremental = true;
}
else if (strcmp(defel->defname, "max_rate") == 0)
diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c
index 303117e19e..8f988360f9 100644
--- a/src/backend/backup/basebackup_incremental.c
+++ b/src/backend/backup/basebackup_incremental.c
@@ -241,6 +241,11 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr;
TimeLineID latest_wal_range_tli = 0;
XLogRecPtr summarized_lsn;
+ XLogRecPtr pending_lsn;
+ XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
+ int deadcycles = 0;
+ TimestampTz initial_time,
+ current_time;
Assert(ib->buf.data == NULL);
@@ -418,15 +423,82 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
* Wait for WAL summarization to catch up to the backup start LSN (but
* time out if it doesn't do so quickly enough).
*/
- /* XXX make timeout configurable */
- summarized_lsn = WaitForWalSummarization(backup_state->startpoint, 60000);
- if (summarized_lsn < backup_state->startpoint)
- ereport(ERROR,
+ initial_time = current_time = GetCurrentTimestamp();
+ while (1)
+ {
+ long timeout_in_ms = 10000;
+ unsigned elapsed_seconds;
+
+ /*
+ * Align the wait time to prevent drift. This doesn't really matter,
+ * but we'd like the warnings about how long we've been waiting to
+ * say 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without
+ * ever drifting to something that is not a multiple of ten.
+ */
+ timeout_in_ms -=
+ TimestampDifferenceMilliseconds(current_time, initial_time) %
+ timeout_in_ms;
+
+ /* Wait for up to 10 seconds. */
+ summarized_lsn = WaitForWalSummarization(backup_state->startpoint,
+ 10000, &pending_lsn);
+
+ /* If WAL summarization has progressed sufficiently, stop waiting. */
+ if (summarized_lsn >= backup_state->startpoint)
+ break;
+
+ /*
+ * Keep track of the number of cycles during which there has been no
+ * progression of pending_lsn. If pending_lsn is not advancing, that
+ * means that not only are no new files appearing on disk, but we're
+ * not even incorporating new records into the in-memory state.
+ */
+ if (pending_lsn > prior_pending_lsn)
+ {
+ prior_pending_lsn = pending_lsn;
+ deadcycles = 0;
+ }
+ else
+ ++deadcycles;
+
+ /*
+ * If we've managed to wait for an entire minute without the WAL
+ * summarizer absorbing a single WAL record, error out; probably
+ * something is wrong.
+ *
+ * We could consider also erroring out if the summarizer is taking too
+ * long to catch up, but it's not clear what rate of progress would be
+ * acceptable and what would be too slow. So instead, we just try to
+ * error out in the case where there's no progress at all. That seems
+ * likely to catch a reasonable number of the things that can go wrong
+ * in practice (e.g. the summarizer process is completely hung, say
+ * because somebody hooked up a debugger to it or something) without
+ * giving up too quickly when the system is just slow.
+ */
+ if (deadcycles >= 6)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("WAL summarization is not progressing"),
+ errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
+ LSN_FORMAT_ARGS(backup_state->startpoint),
+ LSN_FORMAT_ARGS(summarized_lsn),
+ LSN_FORMAT_ARGS(pending_lsn))));
+
+ /*
+ * Otherwise, just let the user know what's happening.
+ */
+ current_time = GetCurrentTimestamp();
+ elapsed_seconds =
+ TimestampDifferenceMilliseconds(initial_time, current_time) / 1000;
+ ereport(WARNING,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("timeout waiting for WAL summarization"),
- errdetail("This backup requires WAL to be summarized up to %X/%X, but summarizer has only reached %X/%X.",
- LSN_FORMAT_ARGS(backup_state->startpoint),
- LSN_FORMAT_ARGS(summarized_lsn))));
+ errmsg("still waiting for WAL summarization through %X/%X after %d seconds",
+ LSN_FORMAT_ARGS(backup_state->startpoint),
+ elapsed_seconds),
+ errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
+ LSN_FORMAT_ARGS(summarized_lsn),
+ LSN_FORMAT_ARGS(pending_lsn))));
+ }
/*
* Retrieve a list of all WAL summaries on any timeline that overlap with
@@ -673,8 +745,26 @@ GetFileBackupMethod(IncrementalBackupInfo *ib, const char *path,
return BACK_UP_FILE_FULLY;
/*
- * Check whether this file is part of the prior backup. If it isn't, back
- * up the whole file.
+ * If this file was not part of the prior backup, back it up fully.
+ *
+ * If this file was created after the prior backup and before the start
+ * of the current backup, then the WAL summary information will tell us
+ * to back up the whole file. However, if this file was created after the
+ * start of the current backup, then the WAL summary won't know anything
+ * about it. Without this logic, we would erroneously conclude that it was
+ * OK to send it incrementally.
+ *
+ * Note that the file could have existed at the time of the prior backup,
+ * gotten deleted, and then a new file with the same name could have been
+ * created. In that case, this logic won't prevent the file from being
+ * backed up incrementally. But, if the deletion happened before the start
+ * of the current backup, the limit block will be 0, inducing a full
+ * backup. If the deletion happened after the start of the current backup,
+ * reconstruction will erroneously combine blocks from the current lifespan
+ * of the file with blocks from the previous lifespan -- but in this type
+ * of case, WAL replay to reach backup consistency should remove and
+ * recreate the file anyway, so the initial bogus contents should not
+ * matter.
*/
if (backup_file_lookup(ib->manifest_files, path) == NULL)
{
diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c
index 74a0116a13..8d903283d8 100644
--- a/src/backend/postmaster/walsummarizer.c
+++ b/src/backend/postmaster/walsummarizer.c
@@ -567,9 +567,11 @@ SetWalSummarizerLatch(void)
*
* The return value is the first still-unsummarized LSN. If it's greater than
* or equal to the passed LSN, then that LSN was reached. If not, we timed out.
+ *
+ * Either way, *pending_lsn is set to the value taken from WalSummarizerCtl.
*/
XLogRecPtr
-WaitForWalSummarization(XLogRecPtr lsn, long timeout)
+WaitForWalSummarization(XLogRecPtr lsn, long timeout, XLogRecPtr *pending_lsn)
{
TimestampTz start_time = GetCurrentTimestamp();
TimestampTz deadline = TimestampTzPlusMilliseconds(start_time, timeout);
@@ -588,6 +590,7 @@ WaitForWalSummarization(XLogRecPtr lsn, long timeout)
*/
LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
summarized_lsn = WalSummarizerCtl->summarized_lsn;
+ *pending_lsn = WalSummarizerCtl->pending_lsn;
LWLockRelease(WALSummarizerLock);
if (summarized_lsn >= lsn)
break;
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 26fd9ad0bc..5795b91261 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -397,7 +397,7 @@ usage(void)
printf(_(" -D, --pgdata=DIRECTORY receive base backup into directory\n"));
printf(_(" -F, --format=p|t output format (plain (default), tar)\n"));
printf(_(" -i, --incremental=OLDMANIFEST\n"));
- printf(_(" take incremental or differential backup\n"));
+ printf(_(" take incremental backup\n"));
printf(_(" -r, --max-rate=RATE maximum transfer rate to transfer data directory\n"
" (in kB/s, or use suffix \"k\" or \"M\")\n"));
printf(_(" -R, --write-recovery-conf\n"
@@ -1830,7 +1830,9 @@ BaseBackup(char *compression_algorithm, char *compression_detail,
char mbuf[65536];
int nbytes;
- /* XXX add a server version check here */
+ /* Reject if server is too old. */
+ if (serverVersion < MINIMUM_VERSION_FOR_WAL_SUMMARIES)
+ pg_fatal("server does not support incremental backup");
/* Open the file. */
fd = open(incremental_manifest, O_RDONLY | PG_BINARY, 0);
diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index f2b45787e9..40a55e3087 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -56,7 +56,7 @@ copy_file(const char *src, const char *dst,
* operating system primitives that we know about to copy the file; this
* may be quicker than a naive block copy.
*/
- if (checksum_ctx->type != CHECKSUM_TYPE_NONE)
+ if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
{
char *strategy_name = NULL;
void (*strategy_implementation) (const char *, const char *) = NULL;
diff --git a/src/bin/pg_combinebackup/load_manifest.c b/src/bin/pg_combinebackup/load_manifest.c
index d06c3ffe0f..ad32323c9c 100644
--- a/src/bin/pg_combinebackup/load_manifest.c
+++ b/src/bin/pg_combinebackup/load_manifest.c
@@ -110,7 +110,7 @@ load_backup_manifest(char *backup_directory)
snprintf(pathname, MAXPGPATH, "%s/backup_manifest", backup_directory);
if ((fd = open(pathname, O_RDONLY | PG_BINARY, 0)) < 0)
{
- if (errno == EEXIST)
+ if (errno == ENOENT)
{
pg_log_warning("\"%s\" does not exist", pathname);
return NULL;
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 6eb705c959..63dcbf329d 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -221,8 +221,15 @@ main(int argc, char *argv[])
/* Sanity-check backup_label files, and get the contents of the last one. */
last_backup_label = check_backup_label_files(n_backups, argv + optind);
+ /*
+ * We'll need the pathnames to the prior backups. By "prior" we mean all
+ * but the last one listed on the command line.
+ */
+ n_prior_backups = argc - optind - 1;
+ prior_backup_dirs = argv + optind;
+
/* Load backup manifests. */
- manifests = load_backup_manifests(n_backups, argv + optind);
+ manifests = load_backup_manifests(n_backups, prior_backup_dirs);
/* Figure out which tablespaces are going to be included in the output. */
last_input_dir = argv[argc - 1];
@@ -248,7 +255,16 @@ main(int argc, char *argv[])
/* If we need to write a backup_manifest, prepare to do so. */
if (!opt.dry_run && !opt.no_manifest)
+ {
mwriter = create_manifest_writer(opt.output);
+
+ /*
+ * Verify that we have a backup manifest for the final backup; else we
+ * won't have the WAL ranges for the resulting manifest.
+ */
+ if (manifests[n_prior_backups] == NULL)
+ pg_fatal("can't generate a manifest because no manifest is available for the final input backup");
+ }
else
mwriter = NULL;
@@ -263,13 +279,6 @@ main(int argc, char *argv[])
opt.manifest_checksums, mwriter);
}
- /*
- * We'll need the pathnames to the prior backups. By "prior" we mean all
- * but the last one listed on the command line.
- */
- n_prior_backups = argc - optind - 1;
- prior_backup_dirs = argv + optind;
-
/* Process everything that's not part of a user-defined tablespace. */
pg_log_debug("processing backup directory \"%s\"", last_input_dir);
process_directory_recursively(InvalidOid, last_input_dir, opt.output,
diff --git a/src/include/postmaster/walsummarizer.h b/src/include/postmaster/walsummarizer.h
index 4a6792e5f9..ebc95bd326 100644
--- a/src/include/postmaster/walsummarizer.h
+++ b/src/include/postmaster/walsummarizer.h
@@ -26,6 +26,7 @@ extern void WalSummarizerMain(void) pg_attribute_noreturn();
extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli,
bool *lsn_is_exact);
extern void SetWalSummarizerLatch(void);
-extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout);
+extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout,
+ XLogRecPtr *pending_lsn);
#endif