Skip to content

Commit 0668719

Browse files
committed
Fix scenario where streaming standby gets stuck at a continuation record.
If a continuation record is split so that its first half has already been removed from the master, and is only present in the standby's pg_wal, and there is a recycled WAL segment in the standby server that looks like it would contain the second half, recovery would get stuck. The code in XLogPageRead() incorrectly started streaming at the beginning of the WAL record, even if we had already read the first page. Backpatch to 9.4. In principle, older versions have the same problem, but without replication slots, there was no straightforward mechanism to prevent the master from recycling old WAL that was still needed by a standby. Without such a mechanism, I think it's reasonable to assume that there's enough slack in how many old segments are kept around to not run into this, or you have a WAL archive. Reported by Jonathon Nelson. Analysis and patch by Kyotaro HORIGUCHI, with some extra comments by me. Discussion: https://www.postgresql.org/message-id/CACJqAM3xVz0JY1XFDKPP%2BJoJAjoGx%3DGNuOAshEDWCext7BFvCQ%40mail.gmail.com
1 parent d2599ec commit 0668719

File tree

3 files changed

+62
-13
lines changed

3 files changed

+62
-13
lines changed

src/backend/access/transam/xlog.c

+42-2
Original file line numberDiff line numberDiff line change
@@ -11694,6 +11694,40 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
1169411694
Assert(reqLen <= readLen);
1169511695

1169611696
*readTLI = curFileTLI;
11697+
11698+
/*
11699+
* Check the page header immediately, so that we can retry immediately if
11700+
* it's not valid. This may seem unnecessary, because XLogReadRecord()
11701+
* validates the page header anyway, and would propagate the failure up to
11702+
* ReadRecord(), which would retry. However, there's a corner case with
11703+
* continuation records, if a record is split across two pages such that
11704+
* we would need to read the two pages from different sources. For
11705+
* example, imagine a scenario where a streaming replica is started up,
11706+
* and replay reaches a record that's split across two WAL segments. The
11707+
* first page is only available locally, in pg_wal, because it's already
11708+
* been recycled in the master. The second page, however, is not present
11709+
* in pg_wal, and we should stream it from the master. There is a recycled
11710+
* WAL segment present in pg_wal, with garbage contents, however. We would
11711+
* read the first page from the local WAL segment, but when reading the
11712+
* second page, we would read the bogus, recycled, WAL segment. If we
11713+
* didn't catch that case here, we would never recover, because
11714+
* ReadRecord() would retry reading the whole record from the beginning.
11715+
*
11716+
* Of course, this only catches errors in the page header, which is what
11717+
* happens in the case of a recycled WAL segment. Other kinds of errors or
11718+
* corruption still has the same problem. But this at least fixes the
11719+
* common case, which can happen as part of normal operation.
11720+
*
11721+
* Validating the page header is cheap enough that doing it twice
11722+
* shouldn't be a big deal from a performance point of view.
11723+
*/
11724+
if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
11725+
{
11726+
/* reset any error XLogReaderValidatePageHeader() might have set */
11727+
xlogreader->errormsg_buf[0] = '\0';
11728+
goto next_record_is_invalid;
11729+
}
11730+
1169711731
return readLen;
1169811732

1169911733
next_record_is_invalid:
@@ -11828,12 +11862,18 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
1182811862
}
1182911863
else
1183011864
{
11831-
ptr = tliRecPtr;
11865+
ptr = RecPtr;
11866+
11867+
/*
11868+
* Use the record begin position to determine the
11869+
* TLI, rather than the position we're reading.
11870+
*/
1183211871
tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
1183311872

1183411873
if (curFileTLI > 0 && tli < curFileTLI)
1183511874
elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
11836-
(uint32) (ptr >> 32), (uint32) ptr,
11875+
(uint32) (tliRecPtr >> 32),
11876+
(uint32) tliRecPtr,
1183711877
tli, curFileTLI);
1183811878
}
1183911879
curFileTLI = tli;

src/backend/access/transam/xlogreader.c

+16-11
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@
2727

2828
static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength);
2929

30-
static bool ValidXLogPageHeader(XLogReaderState *state, XLogRecPtr recptr,
31-
XLogPageHeader hdr);
3230
static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
3331
XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess);
3432
static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record,
@@ -533,7 +531,6 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen)
533531
*/
534532
if (targetSegNo != state->readSegNo && targetPageOff != 0)
535533
{
536-
XLogPageHeader hdr;
537534
XLogRecPtr targetSegmentPtr = pageptr - targetPageOff;
538535

539536
readLen = state->read_page(state, targetSegmentPtr, XLOG_BLCKSZ,
@@ -545,9 +542,8 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen)
545542
/* we can be sure to have enough WAL available, we scrolled back */
546543
Assert(readLen == XLOG_BLCKSZ);
547544

548-
hdr = (XLogPageHeader) state->readBuf;
549-
550-
if (!ValidXLogPageHeader(state, targetSegmentPtr, hdr))
545+
if (!XLogReaderValidatePageHeader(state, targetSegmentPtr,
546+
state->readBuf))
551547
goto err;
552548
}
553549

@@ -584,7 +580,7 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen)
584580
/*
585581
* Now that we know we have the full header, validate it.
586582
*/
587-
if (!ValidXLogPageHeader(state, pageptr, hdr))
583+
if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr))
588584
goto err;
589585

590586
/* update read state information */
@@ -709,15 +705,19 @@ ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr)
709705
}
710706

711707
/*
712-
* Validate a page header
708+
* Validate a page header.
709+
*
710+
* Check if 'phdr' is valid as the header of the XLog page at position
711+
* 'recptr'.
713712
*/
714-
static bool
715-
ValidXLogPageHeader(XLogReaderState *state, XLogRecPtr recptr,
716-
XLogPageHeader hdr)
713+
bool
714+
XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
715+
char *phdr)
717716
{
718717
XLogRecPtr recaddr;
719718
XLogSegNo segno;
720719
int32 offset;
720+
XLogPageHeader hdr = (XLogPageHeader) phdr;
721721

722722
Assert((recptr % XLOG_BLCKSZ) == 0);
723723

@@ -805,6 +805,11 @@ ValidXLogPageHeader(XLogReaderState *state, XLogRecPtr recptr,
805805
return false;
806806
}
807807

808+
/*
809+
* Check that the address on the page agrees with what we expected.
810+
* This check typically fails when an old WAL segment is recycled,
811+
* and hasn't been overwritten with new data yet.
812+
*/
808813
if (hdr->xlp_pageaddr != recaddr)
809814
{
810815
char fname[MAXFNAMELEN];

src/include/access/xlogreader.h

+4
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,10 @@ extern void XLogReaderFree(XLogReaderState *state);
205205
extern struct XLogRecord *XLogReadRecord(XLogReaderState *state,
206206
XLogRecPtr recptr, char **errormsg);
207207

208+
/* Validate a page header */
209+
extern bool XLogReaderValidatePageHeader(XLogReaderState *state,
210+
XLogRecPtr recptr, char *phdr);
211+
208212
/* Invalidate read state */
209213
extern void XLogReaderInvalReadState(XLogReaderState *state);
210214

0 commit comments

Comments
 (0)