Merge from PG master upto d5cb3bab564e0927ffac7c8729eacf181a12dd40

This is the result of the "git merge remotes/PGSQL/master" upto the said commit point. We have done some basic analysis, fixed compilation problems etc, but bulk of the logical problems in conflict resolution etc will be handled by subsequent commits.
author: Pavan Deolasee 2017-06-14 05:42:18 +0000
committer: Pavan Deolasee 2017-06-14 05:42:18 +0000
commit: 15dd5274c323fb93e4e3ea9ad2185aaaec10f79c (patch)
tree: 9dafb4c7f735d9429ea461dc792933af87493c33 /src/backend/access/transam/xlogutils.c
parent: dfbb88e3bbb526dcb204b456b9e5cfd9d10d0d0a (diff)
parent: d5cb3bab564e0927ffac7c8729eacf181a12dd40 (diff)
1 files changed, 218 insertions, 22 deletions
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 1bdbea655b..4f67dc62fb 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -2,13 +2,13 @@
  *
  * xlogutils.c
  *
- * PostgreSQL transaction log manager utility routines
+ * PostgreSQL write-ahead log manager utility routines
  *
  * This file contains support routines that are used by XLOG replay functions.
  * None of this code is used during normal system operation.
  *
  *
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * src/backend/access/transam/xlogutils.c
@@ -20,11 +20,13 @@
 #include <unistd.h>
 
 #include "miscadmin.h"
+#include "access/timeline.h"
 #include "access/xlog.h"
 #include "access/xlog_internal.h"
 #include "access/xlogutils.h"
 #include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "pgstat.h"
 #include "storage/smgr.h"
 #include "utils/guc.h"
 #include "utils/hsearch.h"
@@ -276,9 +278,9 @@ XLogCheckInvalidPages(void)
  * will complain if we don't have the lock.  In hot standby mode it's
  * definitely necessary.)
  *
- * Note: when a backup block is available in XLOG, we restore it
- * unconditionally, even if the page in the database appears newer.  This is
- * to protect ourselves against database pages that were partially or
+ * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag
+ * set, we restore it, even if the page in the database appears newer.  This
+ * is to protect ourselves against database pages that were partially or
  * incorrectly written during a crash.  We assume that the XLOG data must be
  * good because it has passed a CRC check, while the database page might not
  * be.  This will force us to replay all subsequent modifications of the page
@@ -353,9 +355,10 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 	if (!willinit && zeromode)
 		elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record");
 
-	/* If it's a full-page image, restore it. */
-	if (XLogRecHasBlockImage(record, block_id))
+	/* If it has a full-page image and it should be restored, do it. */
+	if (XLogRecBlockImageApply(record, block_id))
 	{
+		Assert(XLogRecHasBlockImage(record, block_id));
 		*buf = XLogReadBufferExtended(rnode, forknum, blkno,
 		   get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
 		page = BufferGetPage(*buf);
@@ -647,7 +650,7 @@ XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
  * always be one descriptor left open until the process ends, but never
  * more than one.
  *
- * XXX This is very similar to pg_xlogdump's XLogDumpXLogRead and to XLogRead
+ * XXX This is very similar to pg_waldump's XLogDumpXLogRead and to XLogRead
  * in walsender.c but for small differences (such as lack of elog() in
  * frontend).  Probably these should be merged at some point.
  */
@@ -661,6 +664,7 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
 	/* state maintained across calls */
 	static int	sendFile = -1;
 	static XLogSegNo sendSegNo = 0;
+	static TimeLineID sendTLI = 0;
 	static uint32 sendOff = 0;
 
 	p = buf;
@@ -676,7 +680,8 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
 		startoff = recptr % XLogSegSize;
 
 		/* Do we need to switch to a different xlog segment? */
-		if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
+		if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo) ||
+			sendTLI != tli)
 		{
 			char		path[MAXPGPATH];
 
@@ -703,6 +708,7 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
 									path)));
 			}
 			sendOff = 0;
+			sendTLI = tli;
 		}
 
 		/* Need to seek in the file? */
@@ -728,7 +734,9 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
 		else
 			segbytes = nbytes;
 
+		pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
 		readbytes = read(sendFile, p, segbytes);
+		pgstat_report_wait_end();
 		if (readbytes <= 0)
 		{
 			char		path[MAXPGPATH];
@@ -751,6 +759,137 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
 }
 
 /*
+ * Determine which timeline to read an xlog page from and set the
+ * XLogReaderState's currTLI to that timeline ID.
+ *
+ * We care about timelines in xlogreader when we might be reading xlog
+ * generated prior to a promotion, either if we're currently a standby in
+ * recovery or if we're a promoted master reading xlogs generated by the old
+ * master before our promotion.
+ *
+ * wantPage must be set to the start address of the page to read and
+ * wantLength to the amount of the page that will be read, up to
+ * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ.
+ *
+ * We switch to an xlog segment from the new timeline eagerly when on a
+ * historical timeline, as soon as we reach the start of the xlog segment
+ * containing the timeline switch.  The server copied the segment to the new
+ * timeline so all the data up to the switch point is the same, but there's no
+ * guarantee the old segment will still exist. It may have been deleted or
+ * renamed with a .partial suffix so we can't necessarily keep reading from
+ * the old TLI even though tliSwitchPoint says it's OK.
+ *
+ * We can't just check the timeline when we read a page on a different segment
+ * to the last page. We could've received a timeline switch from a cascading
+ * upstream, so the current segment ends abruptly (possibly getting renamed to
+ * .partial) and we have to switch to a new one.  Even in the middle of reading
+ * a page we could have to dump the cached page and switch to a new TLI.
+ *
+ * Because of this, callers MAY NOT assume that currTLI is the timeline that
+ * will be in a page's xlp_tli; the page may begin on an older timeline or we
+ * might be reading from historical timeline data on a segment that's been
+ * copied to a new timeline.
+ *
+ * The caller must also make sure it doesn't read past the current replay
+ * position (using GetWalRcvWriteRecPtr) if executing in recovery, so it
+ * doesn't fail to notice that the current timeline became historical. The
+ * caller must also update ThisTimeLineID with the result of
+ * GetWalRcvWriteRecPtr and must check RecoveryInProgress().
+ */
+void
+XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wantLength)
+{
+	const XLogRecPtr lastReadPage = state->readSegNo * XLogSegSize + state->readOff;
+
+	Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0);
+	Assert(wantLength <= XLOG_BLCKSZ);
+	Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ);
+
+	/*
+	 * If the desired page is currently read in and valid, we have nothing to
+	 * do.
+	 *
+	 * The caller should've ensured that it didn't previously advance readOff
+	 * past the valid limit of this timeline, so it doesn't matter if the
+	 * current TLI has since become historical.
+	 */
+	if (lastReadPage == wantPage &&
+		state->readLen != 0 &&
+		lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1))
+		return;
+
+	/*
+	 * If we're reading from the current timeline, it hasn't become historical
+	 * and the page we're reading is after the last page read, we can again
+	 * just carry on. (Seeking backwards requires a check to make sure the
+	 * older page isn't on a prior timeline).
+	 *
+	 * ThisTimeLineID might've become historical since we last looked, but the
+	 * caller is required not to read past the flush limit it saw at the time
+	 * it looked up the timeline. There's nothing we can do about it if
+	 * StartupXLOG() renames it to .partial concurrently.
+	 */
+	if (state->currTLI == ThisTimeLineID && wantPage >= lastReadPage)
+	{
+		Assert(state->currTLIValidUntil == InvalidXLogRecPtr);
+		return;
+	}
+
+	/*
+	 * If we're just reading pages from a previously validated historical
+	 * timeline and the timeline we're reading from is valid until the end of
+	 * the current segment we can just keep reading.
+	 */
+	if (state->currTLIValidUntil != InvalidXLogRecPtr &&
+		state->currTLI != ThisTimeLineID &&
+		state->currTLI != 0 &&
+		(wantPage + wantLength) / XLogSegSize < state->currTLIValidUntil / XLogSegSize)
+		return;
+
+	/*
+	 * If we reach this point we're either looking up a page for random
+	 * access, the current timeline just became historical, or we're reading
+	 * from a new segment containing a timeline switch. In all cases we need
+	 * to determine the newest timeline on the segment.
+	 *
+	 * If it's the current timeline we can just keep reading from here unless
+	 * we detect a timeline switch that makes the current timeline historical.
+	 * If it's a historical timeline we can read all the segment on the newest
+	 * timeline because it contains all the old timelines' data too. So only
+	 * one switch check is required.
+	 */
+	{
+		/*
+		 * We need to re-read the timeline history in case it's been changed
+		 * by a promotion or replay from a cascaded replica.
+		 */
+		List	   *timelineHistory = readTimeLineHistory(ThisTimeLineID);
+
+		XLogRecPtr	endOfSegment = (((wantPage / XLogSegSize) + 1) * XLogSegSize) - 1;
+
+		Assert(wantPage / XLogSegSize == endOfSegment / XLogSegSize);
+
+		/*
+		 * Find the timeline of the last LSN on the segment containing
+		 * wantPage.
+		 */
+		state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory);
+		state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory,
+												  &state->nextTLI);
+
+		Assert(state->currTLIValidUntil == InvalidXLogRecPtr ||
+			   wantPage + wantLength < state->currTLIValidUntil);
+
+		list_free_deep(timelineHistory);
+
+		elog(DEBUG3, "switched to timeline %u valid until %X/%X",
+			 state->currTLI,
+			 (uint32) (state->currTLIValidUntil >> 32),
+			 (uint32) (state->currTLIValidUntil));
+	}
+}
+
+/*
  * read_page callback for reading local xlog files
  *
  * Public because it would likely be very helpful for someone writing another
@@ -771,28 +910,85 @@ read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr,
 	int			count;
 
 	loc = targetPagePtr + reqLen;
+
+	/* Loop waiting for xlog to be available if necessary */
 	while (1)
 	{
 		/*
-		 * TODO: we're going to have to do something more intelligent about
-		 * timelines on standbys. Use readTimeLineHistory() and
-		 * tliOfPointInHistory() to get the proper LSN? For now we'll catch
-		 * that case earlier, but the code and TODO is left in here for when
-		 * that changes.
+		 * Determine the limit of xlog we can currently read to, and what the
+		 * most recent timeline is.
+		 *
+		 * RecoveryInProgress() will update ThisTimeLineID when it first
+		 * notices recovery finishes, so we only have to maintain it for the
+		 * local process until recovery ends.
 		 */
 		if (!RecoveryInProgress())
-		{
-			*pageTLI = ThisTimeLineID;
 			read_upto = GetFlushRecPtr();
-		}
 		else
-			read_upto = GetXLogReplayRecPtr(pageTLI);
+			read_upto = GetXLogReplayRecPtr(&ThisTimeLineID);
 
-		if (loc <= read_upto)
-			break;
+		*pageTLI = ThisTimeLineID;
+
+		/*
+		 * Check which timeline to get the record from.
+		 *
+		 * We have to do it each time through the loop because if we're in
+		 * recovery as a cascading standby, the current timeline might've
+		 * become historical. We can't rely on RecoveryInProgress() because in
+		 * a standby configuration like
+		 *
+		 * A => B => C
+		 *
+		 * if we're a logical decoding session on C, and B gets promoted, our
+		 * timeline will change while we remain in recovery.
+		 *
+		 * We can't just keep reading from the old timeline as the last WAL
+		 * archive in the timeline will get renamed to .partial by
+		 * StartupXLOG().
+		 *
+		 * If that happens after our caller updated ThisTimeLineID but before
+		 * we actually read the xlog page, we might still try to read from the
+		 * old (now renamed) segment and fail. There's not much we can do
+		 * about this, but it can only happen when we're a leaf of a cascading
+		 * standby whose master gets promoted while we're decoding, so a
+		 * one-off ERROR isn't too bad.
+		 */
+		XLogReadDetermineTimeline(state, targetPagePtr, reqLen);
+
+		if (state->currTLI == ThisTimeLineID)
+		{
 
-		CHECK_FOR_INTERRUPTS();
-		pg_usleep(1000L);
+			if (loc <= read_upto)
+				break;
+
+			CHECK_FOR_INTERRUPTS();
+			pg_usleep(1000L);
+		}
+		else
+		{
+			/*
+			 * We're on a historical timeline, so limit reading to the switch
+			 * point where we moved to the next timeline.
+			 *
+			 * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know
+			 * about the new timeline, so we must've received past the end of
+			 * it.
+			 */
+			read_upto = state->currTLIValidUntil;
+
+			/*
+			 * Setting pageTLI to our wanted record's TLI is slightly wrong;
+			 * the page might begin on an older timeline if it contains a
+			 * timeline switch, since its xlog segment will have been copied
+			 * from the prior timeline. This is pretty harmless though, as
+			 * nothing cares so long as the timeline doesn't go backwards.  We
+			 * should read the page header instead; FIXME someday.
+			 */
+			*pageTLI = state->currTLI;
+
+			/* No need to wait on a historical timeline */
+			break;
+		}
 	}
 
 	if (targetPagePtr + XLOG_BLCKSZ <= read_upto)
author	Pavan Deolasee	2017-06-14 05:42:18 +0000
committer	Pavan Deolasee	2017-06-14 05:42:18 +0000
commit	15dd5274c323fb93e4e3ea9ad2185aaaec10f79c (patch)
tree	9dafb4c7f735d9429ea461dc792933af87493c33 /src/backend/access/transam/xlogutils.c
parent	dfbb88e3bbb526dcb204b456b9e5cfd9d10d0d0a (diff)
parent	d5cb3bab564e0927ffac7c8729eacf181a12dd40 (diff)