*** pgsql/src/backend/access/transam/xlog.c 2009/02/07 10:49:36 1.330 --- pgsql/src/backend/access/transam/xlog.c 2009/02/18 15:58:40 1.331 *************** *** 7,13 **** * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * ! * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.329 2009/01/23 11:19:34 heikki Exp $ * *------------------------------------------------------------------------- */ --- 7,13 ---- * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * ! * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.330 2009/02/07 10:49:36 heikki Exp $ * *------------------------------------------------------------------------- */ *************** *** 36,41 **** --- 36,42 ---- #include "catalog/pg_control.h" #include "catalog/pg_type.h" #include "funcapi.h" + #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" *************** *** 47,52 **** --- 48,54 ---- #include "storage/smgr.h" #include "storage/spin.h" #include "utils/builtins.h" + #include "utils/flatfiles.h" #include "utils/guc.h" #include "utils/ps_status.h" #include "pg_trace.h" *************** CheckpointStatsData CheckpointStats; *** 119,130 **** */ TimeLineID ThisTimeLineID = 0; ! /* Are we doing recovery from XLOG? */ bool InRecovery = false; /* Are we recovering using offline XLOG archives? */ static bool InArchiveRecovery = false; /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; --- 121,147 ---- */ TimeLineID ThisTimeLineID = 0; ! /* ! * Are we doing recovery from XLOG? ! * ! * This is only ever true in the startup process, even if the system is still ! * in recovery. Prior to 8.4, all activity during recovery were carried out ! * by Startup process. This local variable continues to be used in functions ! 
* that need to act differently when called from a redo function (e.g skip ! * WAL logging). To check whether the system is in recovery regardless of what ! * process you're running in, use RecoveryInProgress(). ! */ bool InRecovery = false; /* Are we recovering using offline XLOG archives? */ static bool InArchiveRecovery = false; + /* + * Local copy of SharedRecoveryInProgress variable. True actually means "not + * known, need to check the shared state" + */ + static bool LocalRecoveryInProgress = true; + /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; *************** static char *recoveryRestoreCommand = NU *** 133,139 **** static bool recoveryTarget = false; static bool recoveryTargetExact = false; static bool recoveryTargetInclusive = true; - static bool recoveryLogRestartpoints = false; static TransactionId recoveryTargetXid; static TimestampTz recoveryTargetTime; static TimestampTz recoveryLastXTime = 0; --- 150,155 ---- *************** static XLogRecPtr RedoRecPtr; *** 242,250 **** * ControlFileLock: must be held to read/update control file or create * new log file. * ! * CheckpointLock: must be held to do a checkpoint (ensures only one ! * checkpointer at a time; currently, with all checkpoints done by the ! * bgwriter, this is just pro forma). * *---------- */ --- 258,265 ---- * ControlFileLock: must be held to read/update control file or create * new log file. * ! * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures ! * only one checkpointer at a time) * *---------- */ *************** typedef struct XLogCtlData *** 313,318 **** --- 328,352 ---- int XLogCacheBlck; /* highest allocated xlog buffer index */ TimeLineID ThisTimeLineID; + /* + * SharedRecoveryInProgress indicates if we're still in crash or archive + * recovery. It's checked by RecoveryInProgress(). + */ + bool SharedRecoveryInProgress; + + /* + * During recovery, we keep a copy of the latest checkpoint record + * here. 
Used by the background writer when it wants to create + * a restartpoint. + * + * Protected by info_lck. + */ + XLogRecPtr lastCheckPointRecPtr; + CheckPoint lastCheckPoint; + + /* end+1 of the last record replayed (or being replayed) */ + XLogRecPtr replayEndRecPtr; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; *************** static XLogRecPtr ReadRecPtr; /* start o *** 387,395 **** --- 421,441 ---- static XLogRecPtr EndRecPtr; /* end+1 of last record read */ static XLogRecord *nextRecord = NULL; static TimeLineID lastPageTLI = 0; + static XLogRecPtr minRecoveryPoint; /* local copy of ControlFile->minRecoveryPoint */ + static bool updateMinRecoveryPoint = true; static bool InRedo = false; + /* + * Flag set by interrupt handlers for later service in the redo loop. + */ + static volatile sig_atomic_t shutdown_requested = false; + /* + * Flag set when executing a restore command, to tell SIGTERM signal handler + * that it's safe to just proc_exit(0). + */ + static volatile sig_atomic_t in_restore_command = false; + static void XLogArchiveNotify(const char *xlog); static void XLogArchiveNotifySeg(uint32 log, uint32 seg); *************** static void PreallocXlogFiles(XLogRecPtr *** 420,425 **** --- 466,472 ---- static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr); static void ValidateXLOGDirectoryStructure(void); static void CleanupBackupHistory(void); + static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force); static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode); static bool ValidXLOGHeader(XLogPageHeader hdr, int emode); static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt); *************** XLogInsert(RmgrId rmid, uint8 info, XLog *** 484,489 **** --- 531,540 ---- bool doPageWrites; bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); + /* cross-check on whether we should be here or not */ + if (RecoveryInProgress()) + elog(FATAL, "cannot make new WAL entries during 
recovery"); + /* info's high bits are reserved for use by me */ if (info & XLR_INFO_MASK) elog(PANIC, "invalid xlog info mask %02X", info); *************** XLogSetAsyncCommitLSN(XLogRecPtr asyncCo *** 1718,1723 **** --- 1769,1831 ---- } /* + * Advance minRecoveryPoint in control file. + * + * If we crash during recovery, we must reach this point again before the + * database is consistent. + * + * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint + * is only updated if it's not already greater than or equal to 'lsn'. + */ + static void + UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) + { + /* Quick check using our local copy of the variable */ + if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint))) + return; + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + /* update local copy */ + minRecoveryPoint = ControlFile->minRecoveryPoint; + + /* + * An invalid minRecoveryPoint means that we need to recover all the WAL, + * ie. crash recovery. Don't update the control file in that case. + */ + if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0) + updateMinRecoveryPoint = false; + else if (force || XLByteLT(minRecoveryPoint, lsn)) + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + XLogRecPtr newMinRecoveryPoint; + + /* + * To avoid having to update the control file too often, we update it + * all the way to the last record being replayed, even though 'lsn' + * would suffice for correctness. 
+ */ + SpinLockAcquire(&xlogctl->info_lck); + newMinRecoveryPoint = xlogctl->replayEndRecPtr; + SpinLockRelease(&xlogctl->info_lck); + + /* update control file */ + if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint)) + { + ControlFile->minRecoveryPoint = newMinRecoveryPoint; + UpdateControlFile(); + minRecoveryPoint = newMinRecoveryPoint; + + ereport(DEBUG2, + (errmsg("updated min recovery point to %X/%X", + minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff))); + } + } + LWLockRelease(ControlFileLock); + } + + /* * Ensure that all XLOG data through the given position is flushed to disk. * * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not *************** XLogFlush(XLogRecPtr record) *** 1729,1737 **** XLogRecPtr WriteRqstPtr; XLogwrtRqst WriteRqst; ! /* Disabled during REDO */ ! if (InRedo) return; /* Quick exit if already known flushed */ if (XLByteLE(record, LogwrtResult.Flush)) --- 1837,1851 ---- XLogRecPtr WriteRqstPtr; XLogwrtRqst WriteRqst; ! /* ! * During REDO, we don't try to flush the WAL, but update minRecoveryPoint ! * instead. ! */ ! if (RecoveryInProgress()) ! { ! UpdateMinRecoveryPoint(record, false); return; + } /* Quick exit if already known flushed */ if (XLByteLE(record, LogwrtResult.Flush)) *************** XLogFlush(XLogRecPtr record) *** 1818,1826 **** * the bad page is encountered again during recovery then we would be * unable to restart the database at all! (This scenario has actually * happened in the field several times with 7.1 releases. Note that we ! * cannot get here while InRedo is true, but if the bad page is brought in ! * and marked dirty during recovery then CreateCheckPoint will try to ! * flush it at the end of recovery.) * * The current approach is to ERROR under normal conditions, but only * WARNING during recovery, so that the system can be brought up even if --- 1932,1940 ---- * the bad page is encountered again during recovery then we would be * unable to restart the database at all! 
(This scenario has actually * happened in the field several times with 7.1 releases. Note that we ! * cannot get here while RecoveryInProgress(), but if the bad page is ! * brought in and marked dirty during recovery then if a checkpoint were ! * performed at the end of recovery it will try to flush it. * * The current approach is to ERROR under normal conditions, but only * WARNING during recovery, so that the system can be brought up even if *************** XLogBackgroundFlush(void) *** 1857,1862 **** --- 1971,1980 ---- XLogRecPtr WriteRqstPtr; bool flexible = true; + /* XLOG doesn't need flushing during recovery */ + if (RecoveryInProgress()) + return; + /* read LogwrtResult and update local state */ { /* use volatile pointer to prevent code rearrangement */ *************** XLogAsyncCommitFlush(void) *** 1928,1933 **** --- 2046,2055 ---- /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; + /* There's no asynchronously committed transactions during recovery */ + if (RecoveryInProgress()) + return; + SpinLockAcquire(&xlogctl->info_lck); WriteRqstPtr = xlogctl->asyncCommitLSN; SpinLockRelease(&xlogctl->info_lck); *************** XLogAsyncCommitFlush(void) *** 1944,1949 **** --- 2066,2075 ---- bool XLogNeedsFlush(XLogRecPtr record) { + /* XLOG doesn't need flushing during recovery */ + if (RecoveryInProgress()) + return false; + /* Quick exit if already known flushed */ if (XLByteLE(record, LogwrtResult.Flush)) return false; *************** RestoreArchivedFile(char *path, const ch *** 2619,2627 **** --- 2745,2766 ---- xlogRestoreCmd))); /* + * Set in_restore_command to tell the signal handler that we should exit + * right away on SIGTERM. We know that we're in a safe point to do that. + * Check if we had already received the signal, so that we don't miss a + * shutdown request received just before this. 
+ */ + in_restore_command = true; + if (shutdown_requested) + proc_exit(0); + + /* * Copy xlog from archival storage to XLOGDIR */ rc = system(xlogRestoreCmd); + + in_restore_command = false; + if (rc == 0) { /* *************** RestoreArchivedFile(char *path, const ch *** 2674,2687 **** * assume that recovery is complete and start up the database!) It's * essential to abort on child SIGINT and SIGQUIT, because per spec * system() ignores SIGINT and SIGQUIT while waiting; if we see one of ! * those it's a good bet we should have gotten it too. Aborting on other ! * signals such as SIGTERM seems a good idea as well. * * Per the Single Unix Spec, shells report exit status > 128 when a called * command died on a signal. Also, 126 and 127 are used to report * problems such as an unfindable command; treat those as fatal errors * too. */ signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125; ereport(signaled ? FATAL : DEBUG2, --- 2813,2836 ---- * assume that recovery is complete and start up the database!) It's * essential to abort on child SIGINT and SIGQUIT, because per spec * system() ignores SIGINT and SIGQUIT while waiting; if we see one of ! * those it's a good bet we should have gotten it too. ! * ! * On SIGTERM, assume we have received a fast shutdown request, and exit ! * cleanly. It's pure chance whether we receive the SIGTERM first, or the ! * child process. If we receive it first, the signal handler will call ! * proc_exit(0), otherwise we do it here. If we or the child process ! * received SIGTERM for any other reason than a fast shutdown request, ! * postmaster will perform an immediate shutdown when it sees us exiting ! * unexpectedly. * * Per the Single Unix Spec, shells report exit status > 128 when a called * command died on a signal. Also, 126 and 127 are used to report * problems such as an unfindable command; treat those as fatal errors * too. 
*/ + if (WTERMSIG(rc) == SIGTERM) + proc_exit(0); + signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125; ereport(signaled ? FATAL : DEBUG2, *************** readRecoveryCommandFile(void) *** 4584,4601 **** ereport(LOG, (errmsg("recovery_target_inclusive = %s", tok2))); } - else if (strcmp(tok1, "log_restartpoints") == 0) - { - /* - * does nothing if a recovery_target is not also set - */ - if (!parse_bool(tok2, &recoveryLogRestartpoints)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("parameter \"log_restartpoints\" requires a Boolean value"))); - ereport(LOG, - (errmsg("log_restartpoints = %s", tok2))); - } else ereport(FATAL, (errmsg("unrecognized recovery parameter \"%s\"", --- 4733,4738 ---- *************** StartupXLOG(void) *** 4877,4883 **** XLogRecPtr RecPtr, LastRec, checkPointLoc, ! minRecoveryLoc, EndOfLog; uint32 endLogId; uint32 endLogSeg; --- 5014,5020 ---- XLogRecPtr RecPtr, LastRec, checkPointLoc, ! backupStopLoc, EndOfLog; uint32 endLogId; uint32 endLogSeg; *************** StartupXLOG(void) *** 4885,4890 **** --- 5022,5029 ---- uint32 freespace; TransactionId oldestActiveXID; + XLogCtl->SharedRecoveryInProgress = true; + /* * Read control file and check XLOG status looks valid. * *************** StartupXLOG(void) *** 4964,4970 **** recoveryTargetTLI, ControlFile->checkPointCopy.ThisTimeLineID))); ! if (read_backup_label(&checkPointLoc, &minRecoveryLoc)) { /* * When a backup_label file is present, we want to roll forward from --- 5103,5109 ---- recoveryTargetTLI, ControlFile->checkPointCopy.ThisTimeLineID))); ! if (read_backup_label(&checkPointLoc, &backupStopLoc)) { /* * When a backup_label file is present, we want to roll forward from *************** StartupXLOG(void) *** 5102,5112 **** ControlFile->prevCheckPoint = ControlFile->checkPoint; ControlFile->checkPoint = checkPointLoc; ControlFile->checkPointCopy = checkPoint; ! if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0) ! 
ControlFile->minRecoveryPoint = minRecoveryLoc; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); /* * If there was a backup label file, it's done its job and the info * has now been propagated into pg_control. We must get rid of the --- 5241,5263 ---- ControlFile->prevCheckPoint = ControlFile->checkPoint; ControlFile->checkPoint = checkPointLoc; ControlFile->checkPointCopy = checkPoint; ! if (backupStopLoc.xlogid != 0 || backupStopLoc.xrecoff != 0) ! { ! if (XLByteLT(ControlFile->minRecoveryPoint, backupStopLoc)) ! ControlFile->minRecoveryPoint = backupStopLoc; ! } ControlFile->time = (pg_time_t) time(NULL); + /* No need to hold ControlFileLock yet, we aren't up far enough */ UpdateControlFile(); + /* update our local copy of minRecoveryPoint */ + minRecoveryPoint = ControlFile->minRecoveryPoint; + + /* + * Reset pgstat data, because it may be invalid after recovery. + */ + pgstat_reset_all(); + /* * If there was a backup label file, it's done its job and the info * has now been propagated into pg_control. We must get rid of the *************** StartupXLOG(void) *** 5151,5162 **** { bool recoveryContinue = true; bool recoveryApply = true; ErrorContextCallback errcontext; InRedo = true; ! ereport(LOG, ! (errmsg("redo starts at %X/%X", ! ReadRecPtr.xlogid, ReadRecPtr.xrecoff))); /* * main redo apply loop --- 5302,5342 ---- { bool recoveryContinue = true; bool recoveryApply = true; + bool reachedMinRecoveryPoint = false; ErrorContextCallback errcontext; + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + /* Update shared replayEndRecPtr */ + SpinLockAcquire(&xlogctl->info_lck); + xlogctl->replayEndRecPtr = ReadRecPtr; + SpinLockRelease(&xlogctl->info_lck); InRedo = true; ! ! if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0) ! ereport(LOG, ! (errmsg("redo starts at %X/%X", ! ReadRecPtr.xlogid, ReadRecPtr.xrecoff))); ! else ! ereport(LOG, ! 
(errmsg("redo starts at %X/%X, consistency will be reached at %X/%X", ! ReadRecPtr.xlogid, ReadRecPtr.xrecoff, ! minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff))); ! ! /* ! * Let postmaster know we've started redo now, so that it can ! * launch bgwriter to perform restartpoints. We don't bother ! * during crash recovery as restartpoints can only be performed ! * during archive recovery. And we'd like to keep crash recovery ! * simple, to avoid introducing bugs that could keep you from ! * recovering after a crash. ! * ! * After this point, we can no longer assume that we're the only ! * process in addition to postmaster! ! */ ! if (InArchiveRecovery && IsUnderPostmaster) ! SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); /* * main redo apply loop *************** StartupXLOG(void) *** 5183,5188 **** --- 5363,5392 ---- #endif /* + * Check if we were requested to exit without finishing + * recovery. + */ + if (shutdown_requested) + proc_exit(0); + + /* + * Have we reached our safe starting point? If so, we can + * tell postmaster that the database is consistent now. + */ + if (!reachedMinRecoveryPoint && + XLByteLE(minRecoveryPoint, EndRecPtr)) + { + reachedMinRecoveryPoint = true; + if (InArchiveRecovery) + { + ereport(LOG, + (errmsg("consistent recovery state reached"))); + if (IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT); + } + } + + /* + * Have we reached our recovery target? */ if (recoveryStopsHere(record, &recoveryApply)) *************** StartupXLOG(void) *** 5207,5212 **** --- 5411,5425 ---- TransactionIdAdvance(ShmemVariableCache->nextXid); } + /* + * Update shared replayEndRecPtr before replaying this + * record, so that XLogFlush will update minRecoveryPoint + * correctly. 
+ */ + SpinLockAcquire(&xlogctl->info_lck); + xlogctl->replayEndRecPtr = EndRecPtr; + SpinLockRelease(&xlogctl->info_lck); + RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record); /* Pop the error context stack */ *************** StartupXLOG(void) *** 5250,5263 **** * Complain if we did not roll forward far enough to render the backup * dump consistent. */ ! if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint)) { if (reachedStopPoint) /* stopped because of stop request */ ereport(FATAL, ! (errmsg("requested recovery stop point is before end time of backup dump"))); else /* ran off end of WAL */ ereport(FATAL, ! (errmsg("WAL ends before end time of backup dump"))); } /* --- 5463,5476 ---- * Complain if we did not roll forward far enough to render the backup * dump consistent. */ ! if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint)) { if (reachedStopPoint) /* stopped because of stop request */ ereport(FATAL, ! (errmsg("requested recovery stop point is before consistent recovery point"))); else /* ran off end of WAL */ ereport(FATAL, ! (errmsg("WAL ends before consistent recovery point"))); } /* *************** StartupXLOG(void) *** 5352,5357 **** --- 5565,5576 ---- /* Pre-scan prepared transactions to find out the range of XIDs present */ oldestActiveXID = PrescanPreparedTransactions(); + /* + * Allow writing WAL for us, so that we can create a checkpoint record. + * But not yet for other backends! + */ + LocalRecoveryInProgress = false; + if (InRecovery) { int rmid; *************** StartupXLOG(void) *** 5372,5382 **** XLogCheckInvalidPages(); /* - * Reset pgstat data, because it may be invalid after recovery. - */ - pgstat_reset_all(); - - /* * Perform a checkpoint to update all our recovery activity to disk. 
* * Note that we write a shutdown checkpoint rather than an on-line --- 5591,5596 ---- *************** StartupXLOG(void) *** 5398,5409 **** */ InRecovery = false; ControlFile->state = DB_IN_PRODUCTION; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); /* start the archive_timeout timer running */ ! XLogCtl->Write.lastSegSwitchTime = ControlFile->time; /* initialize shared-memory copy of latest checkpoint XID/epoch */ XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch; --- 5612,5625 ---- */ InRecovery = false; + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->state = DB_IN_PRODUCTION; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); /* start the archive_timeout timer running */ ! XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL); /* initialize shared-memory copy of latest checkpoint XID/epoch */ XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch; *************** StartupXLOG(void) *** 5438,5443 **** --- 5654,5698 ---- readRecordBuf = NULL; readRecordBufSize = 0; } + + /* + * All done. Allow others to write WAL. + */ + XLogCtl->SharedRecoveryInProgress = false; + } + + /* + * Is the system still in recovery? + * + * As a side-effect, we initialize the local TimeLineID and RedoRecPtr + * variables the first time we see that recovery is finished. + */ + bool + RecoveryInProgress(void) + { + /* + * We check shared state each time only until we leave recovery mode. + * We can't re-enter recovery, so we rely on the local state variable + * after that. + */ + if (!LocalRecoveryInProgress) + return false; + else + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress; + + /* + * Initialize TimeLineID and RedoRecPtr the first time we see that + * recovery is finished. 
+ */ + if (!LocalRecoveryInProgress) + InitXLOGAccess(); + + return LocalRecoveryInProgress; + } } /* *************** InitXLOGAccess(void) *** 5569,5574 **** --- 5824,5831 ---- { /* ThisTimeLineID doesn't change so we need no lock to copy it */ ThisTimeLineID = XLogCtl->ThisTimeLineID; + Assert(ThisTimeLineID != 0); + /* Use GetRedoRecPtr to copy the RedoRecPtr safely */ (void) GetRedoRecPtr(); } *************** ShutdownXLOG(int code, Datum arg) *** 5680,5686 **** ereport(LOG, (errmsg("shutting down"))); ! CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); ShutdownCLOG(); ShutdownSUBTRANS(); ShutdownMultiXact(); --- 5937,5946 ---- ereport(LOG, (errmsg("shutting down"))); ! if (RecoveryInProgress()) ! CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); ! else ! CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); ShutdownCLOG(); ShutdownSUBTRANS(); ShutdownMultiXact(); *************** ShutdownXLOG(int code, Datum arg) *** 5693,5701 **** * Log start of a checkpoint. */ static void ! LogCheckpointStart(int flags) { ! elog(LOG, "checkpoint starting:%s%s%s%s%s%s", (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", (flags & CHECKPOINT_FORCE) ? " force" : "", --- 5953,5972 ---- * Log start of a checkpoint. */ static void ! LogCheckpointStart(int flags, bool restartpoint) { ! char *msg; ! ! /* ! * XXX: This is hopelessly untranslatable. We could call gettext_noop ! * for the main message, but what about all the flags? ! */ ! if (restartpoint) ! msg = "restartpoint starting:%s%s%s%s%s%s"; ! else ! msg = "checkpoint starting:%s%s%s%s%s%s"; ! ! elog(LOG, msg, (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", (flags & CHECKPOINT_FORCE) ? " force" : "", *************** LogCheckpointStart(int flags) *** 5708,5714 **** * Log end of a checkpoint. */ static void ! 
LogCheckpointEnd(void) { long write_secs, sync_secs, --- 5979,5985 ---- * Log end of a checkpoint. */ static void ! LogCheckpointEnd(bool restartpoint) { long write_secs, sync_secs, *************** LogCheckpointEnd(void) *** 5731,5747 **** CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); ! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " ! "%d transaction log file(s) added, %d removed, %d recycled; " ! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", ! CheckpointStats.ckpt_bufs_written, ! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, ! CheckpointStats.ckpt_segs_added, ! CheckpointStats.ckpt_segs_removed, ! CheckpointStats.ckpt_segs_recycled, ! write_secs, write_usecs / 1000, ! sync_secs, sync_usecs / 1000, ! total_secs, total_usecs / 1000); } /* --- 6002,6027 ---- CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); ! if (restartpoint) ! elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); " ! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", ! CheckpointStats.ckpt_bufs_written, ! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, ! write_secs, write_usecs / 1000, ! sync_secs, sync_usecs / 1000, ! total_secs, total_usecs / 1000); ! else ! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " ! "%d transaction log file(s) added, %d removed, %d recycled; " ! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", ! CheckpointStats.ckpt_bufs_written, ! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, ! CheckpointStats.ckpt_segs_added, ! CheckpointStats.ckpt_segs_removed, ! CheckpointStats.ckpt_segs_recycled, ! write_secs, write_usecs / 1000, ! sync_secs, sync_usecs / 1000, ! total_secs, total_usecs / 1000); } /* *************** CreateCheckPoint(int flags) *** 5772,5784 **** TransactionId *inCommitXids; int nInCommit; /* * Acquire CheckpointLock to ensure only one checkpoint happens at a time. ! * (This is just pro forma, since in the present system structure there is ! 
* only one process that is allowed to issue checkpoints at any given ! * time.) */ ! LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); /* * Prepare to accumulate statistics. --- 6052,6084 ---- TransactionId *inCommitXids; int nInCommit; + /* shouldn't happen */ + if (RecoveryInProgress()) + elog(ERROR, "can't create a checkpoint during recovery"); + /* * Acquire CheckpointLock to ensure only one checkpoint happens at a time. ! * During normal operation, bgwriter is the only process that creates ! * checkpoints, but at the end of archive recovery, the bgwriter can be ! * busy creating a restartpoint while the startup process tries to perform ! * the startup checkpoint. */ ! if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE)) ! { ! Assert(InRecovery); ! ! /* ! * A restartpoint is in progress. Wait until it finishes. This can ! * cause an extra restartpoint to be performed, but that's OK because ! * we're just about to perform a checkpoint anyway. Flushing the ! * buffers in this restartpoint can take some time, but that time is ! * saved from the upcoming checkpoint so the net effect is zero. ! */ ! ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint"))); ! RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT); ! ! LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); ! } /* * Prepare to accumulate statistics. *************** CreateCheckPoint(int flags) *** 5797,5805 **** --- 6097,6107 ---- if (shutdown) { + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->state = DB_SHUTDOWNING; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); } /* *************** CreateCheckPoint(int flags) *** 5903,5909 **** * to log anything if we decided to skip the checkpoint. */ if (log_checkpoints) ! LogCheckpointStart(flags); TRACE_POSTGRESQL_CHECKPOINT_START(flags); --- 6205,6211 ---- * to log anything if we decided to skip the checkpoint. */ if (log_checkpoints) ! 
LogCheckpointStart(flags, false); TRACE_POSTGRESQL_CHECKPOINT_START(flags); *************** CreateCheckPoint(int flags) *** 6070,6076 **** /* All real work is done, but log before releasing lock. */ if (log_checkpoints) ! LogCheckpointEnd(); TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written, NBuffers, CheckpointStats.ckpt_segs_added, --- 6372,6378 ---- /* All real work is done, but log before releasing lock. */ if (log_checkpoints) ! LogCheckpointEnd(false); TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written, NBuffers, CheckpointStats.ckpt_segs_added, *************** CheckPointGuts(XLogRecPtr checkPointRedo *** 6098,6129 **** } /* ! * Set a recovery restart point if appropriate ! * ! * This is similar to CreateCheckPoint, but is used during WAL recovery ! * to establish a point from which recovery can roll forward without ! * replaying the entire recovery log. This function is called each time ! * a checkpoint record is read from XLOG; it must determine whether a ! * restartpoint is needed or not. */ static void RecoveryRestartPoint(const CheckPoint *checkPoint) { - int elapsed_secs; int rmid; ! ! /* ! * Do nothing if the elapsed time since the last restartpoint is less than ! * half of checkpoint_timeout. (We use a value less than ! * checkpoint_timeout so that variations in the timing of checkpoints on ! * the master, or speed of transmission of WAL segments to a slave, won't ! * make the slave skip a restartpoint once it's synced with the master.) ! * Checking true elapsed time keeps us from doing restartpoints too often ! * while rapidly scanning large amounts of WAL. ! */ ! elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time; ! if (elapsed_secs < CheckPointTimeout / 2) ! return; /* * Is it safe to checkpoint? We must ask each of the resource managers --- 6400,6416 ---- } /* ! * This is used during WAL recovery to establish a point from which recovery ! * can roll forward without replaying the entire recovery log. 
This function ! * is called each time a checkpoint record is read from XLOG. It is stored ! * in shared memory, so that it can be used as a restartpoint later on. */ static void RecoveryRestartPoint(const CheckPoint *checkPoint) { int rmid; ! /* use volatile pointer to prevent code rearrangement */ ! volatile XLogCtlData *xlogctl = XLogCtl; /* * Is it safe to checkpoint? We must ask each of the resource managers *************** RecoveryRestartPoint(const CheckPoint *c *** 6145,6172 **** } /* ! * OK, force data out to disk */ ! CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE); /* ! * Update pg_control so that any subsequent crash will restart from this ! * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint ! * record itself. */ ControlFile->prevCheckPoint = ControlFile->checkPoint; ! ControlFile->checkPoint = ReadRecPtr; ! ControlFile->checkPointCopy = *checkPoint; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); ! ereport((recoveryLogRestartpoints ? LOG : DEBUG2), (errmsg("recovery restart point at %X/%X", ! checkPoint->redo.xlogid, checkPoint->redo.xrecoff))); if (recoveryLastXTime) ! ereport((recoveryLogRestartpoints ? LOG : DEBUG2), ! (errmsg("last completed transaction was at log time %s", ! timestamptz_to_str(recoveryLastXTime)))); } /* --- 6432,6559 ---- } /* ! * Copy the checkpoint record to shared memory, so that bgwriter can ! * use it the next time it wants to perform a restartpoint. */ ! SpinLockAcquire(&xlogctl->info_lck); ! XLogCtl->lastCheckPointRecPtr = ReadRecPtr; ! memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint)); ! SpinLockRelease(&xlogctl->info_lck); ! } ! ! /* ! * This is similar to CreateCheckPoint, but is used during WAL recovery ! * to establish a point from which recovery can roll forward without ! * replaying the entire recovery log. ! * ! * Returns true if a new restartpoint was established. We can only establish ! * a restartpoint if we have replayed a checkpoint record since last ! 
* restartpoint. ! */ ! bool ! CreateRestartPoint(int flags) ! { ! XLogRecPtr lastCheckPointRecPtr; ! CheckPoint lastCheckPoint; ! /* use volatile pointer to prevent code rearrangement */ ! volatile XLogCtlData *xlogctl = XLogCtl; /* ! * Acquire CheckpointLock to ensure only one restartpoint or checkpoint ! * happens at a time. */ + LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); + + /* Get a local copy of the last checkpoint record. */ + SpinLockAcquire(&xlogctl->info_lck); + lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr; + memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint)); + SpinLockRelease(&xlogctl->info_lck); + + /* + * Check that we're still in recovery mode. It's ok if we exit recovery + * mode after this check, the restart point is valid anyway. + */ + if (!RecoveryInProgress()) + { + ereport(DEBUG2, + (errmsg("skipping restartpoint, recovery has already ended"))); + LWLockRelease(CheckpointLock); + return false; + } + + /* + * If the last checkpoint record we've replayed is already our last + * restartpoint, we can't perform a new restart point. We still update + * minRecoveryPoint in that case, so that if this is a shutdown restart + * point, we won't start up earlier than before. That's not strictly + * necessary, but when we get hot standby capability, it would be rather + * weird if the database opened up for read-only connections at a + * point-in-time before the last shutdown. Such time travel is still + * possible in case of immediate shutdown, though. + * + * We don't explicitly advance minRecoveryPoint when we do create a + * restartpoint. It's assumed that flushing the buffers will do that + * as a side-effect.
+ */ + if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) || + XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo)) + { + XLogRecPtr InvalidXLogRecPtr = {0, 0}; + ereport(DEBUG2, + (errmsg("skipping restartpoint, already performed at %X/%X", + lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff))); + + UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); + LWLockRelease(CheckpointLock); + return false; + } + + if (log_checkpoints) + { + /* + * Prepare to accumulate statistics. + */ + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + LogCheckpointStart(flags, true); + } + + CheckPointGuts(lastCheckPoint.redo, flags); + + /* + * Update pg_control, using current time + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->prevCheckPoint = ControlFile->checkPoint; ! ControlFile->checkPoint = lastCheckPointRecPtr; ! ControlFile->checkPointCopy = lastCheckPoint; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* + * Currently, there is no need to truncate pg_subtrans during recovery. + * If we did do that, we will need to have called StartupSUBTRANS() + * already and then TruncateSUBTRANS() would go here. + */ + + /* All real work is done, but log before releasing lock. */ + if (log_checkpoints) + LogCheckpointEnd(true); ! ereport((log_checkpoints ? LOG : DEBUG2), (errmsg("recovery restart point at %X/%X", ! lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff))); ! if (recoveryLastXTime) ! ereport((log_checkpoints ? LOG : DEBUG2), ! (errmsg("last completed transaction was at log time %s", ! timestamptz_to_str(recoveryLastXTime)))); ! ! LWLockRelease(CheckpointLock); ! 
return true; } /* *************** RequestXLogSwitch(void) *** 6232,6237 **** --- 6619,6627 ---- /* * XLOG resource manager's routines + * + * Definitions of info values are in include/catalog/pg_control.h, though + * not all record types are related to control file processing. */ void xlog_redo(XLogRecPtr lsn, XLogRecord *record) *************** xlog_redo(XLogRecPtr lsn, XLogRecord *re *** 6278,6286 **** (int) checkPoint.ThisTimeLineID)) ereport(PANIC, (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", ! checkPoint.ThisTimeLineID, ThisTimeLineID))); ! /* Following WAL records should be run with new TLI */ ! ThisTimeLineID = checkPoint.ThisTimeLineID; } RecoveryRestartPoint(&checkPoint); --- 6668,6676 ---- (int) checkPoint.ThisTimeLineID)) ereport(PANIC, (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", ! checkPoint.ThisTimeLineID, ThisTimeLineID))); ! /* Following WAL records should be run with new TLI */ ! ThisTimeLineID = checkPoint.ThisTimeLineID; } RecoveryRestartPoint(&checkPoint); *************** CancelBackup(void) *** 7221,7223 **** --- 7611,7702 ---- } } + /* ------------------------------------------------------ + * Startup Process main entry point and signal handlers + * ------------------------------------------------------ + */ + + /* + * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster. + * + * Some backend has bought the farm, + * so we need to stop what we're doing and exit. + */ + static void + startupproc_quickdie(SIGNAL_ARGS) + { + PG_SETMASK(&BlockSig); + + /* + * DO NOT proc_exit() -- we're here because shared memory may be + * corrupted, so we don't want to try to clean up our transaction. Just + * nail the windows shut and get out of town. + * + * Note we do exit(2) not exit(0). This is to force the postmaster into a + * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random + * backend.
This is necessary precisely because we don't clean up our + * shared memory state. + */ + exit(2); + } + + + /* SIGTERM: set flag to abort redo and exit */ + static void + StartupProcShutdownHandler(SIGNAL_ARGS) + { + if (in_restore_command) + proc_exit(0); + else + shutdown_requested = true; + } + + /* Main entry point for startup process */ + void + StartupProcessMain(void) + { + /* + * If possible, make this process a group leader, so that the postmaster + * can signal any child processes too. + */ + #ifdef HAVE_SETSID + if (setsid() < 0) + elog(FATAL, "setsid() failed: %m"); + #endif + + /* + * Properly accept or ignore signals the postmaster might send us + */ + pqsignal(SIGHUP, SIG_IGN); /* ignore config file updates */ + pqsignal(SIGINT, SIG_IGN); /* ignore query cancel */ + pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */ + pqsignal(SIGQUIT, startupproc_quickdie); /* hard crash time */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, SIG_IGN); + pqsignal(SIGUSR2, SIG_IGN); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + pqsignal(SIGTTIN, SIG_DFL); + pqsignal(SIGTTOU, SIG_DFL); + pqsignal(SIGCONT, SIG_DFL); + pqsignal(SIGWINCH, SIG_DFL); + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + PG_SETMASK(&UnBlockSig); + + StartupXLOG(); + + BuildFlatFiles(false); + + /* Let postmaster know that startup is finished */ + SendPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED); + + /* exit normally */ + proc_exit(0); + }