Skip to content

Commit 6423390

Browse files
Send new protocol keepalive messages to standby servers.
Allows streaming replication users to calculate transfer latency and apply delay via internal functions. No external functions yet.
1 parent 2ae2e9c commit 6423390

File tree

8 files changed

+258
-16
lines changed

8 files changed

+258
-16
lines changed

doc/src/sgml/protocol.sgml

+48
Original file line numberDiff line numberDiff line change
@@ -1463,6 +1463,54 @@ The commands accepted in walsender mode are:
14631463
CopyData message):
14641464
</para>
14651465

1466+
<para>
1467+
<variablelist>
1468+
<varlistentry>
1469+
<term>
1470+
Primary keepalive message (B)
1471+
</term>
1472+
<listitem>
1473+
<para>
1474+
<variablelist>
1475+
<varlistentry>
1476+
<term>
1477+
Byte1('k')
1478+
</term>
1479+
<listitem>
1480+
<para>
1481+
Identifies the message as a sender keepalive.
1482+
</para>
1483+
</listitem>
1484+
</varlistentry>
1485+
<varlistentry>
1486+
<term>
1487+
Byte8
1488+
</term>
1489+
<listitem>
1490+
<para>
1491+
The current end of WAL on the server, given in
1492+
XLogRecPtr format.
1493+
</para>
1494+
</listitem>
1495+
</varlistentry>
1496+
<varlistentry>
1497+
<term>
1498+
Byte8
1499+
</term>
1500+
<listitem>
1501+
<para>
1502+
The server's system clock at the time of transmission,
1503+
given in TimestampTz format.
1504+
</para>
1505+
</listitem>
1506+
</varlistentry>
1507+
</variablelist>
1508+
</para>
1509+
</listitem>
1510+
</varlistentry>
1511+
</variablelist>
1512+
</para>
1513+
14661514
<para>
14671515
<variablelist>
14681516
<varlistentry>

src/backend/access/transam/xlog.c

+43
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,9 @@ typedef struct XLogCtlData
452452
XLogRecPtr recoveryLastRecPtr;
453453
/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
454454
TimestampTz recoveryLastXTime;
455+
/* timestamp of when we started replaying the current chunk of WAL data,
456+
* only relevant for replication or archive recovery */
457+
TimestampTz currentChunkStartTime;
455458
/* end of the last record restored from the archive */
456459
XLogRecPtr restoreLastRecPtr;
457460
/* Are we requested to pause recovery? */
@@ -606,6 +609,7 @@ static void exitArchiveRecovery(TimeLineID endTLI,
606609
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
607610
static void recoveryPausesHere(void);
608611
static void SetLatestXTime(TimestampTz xtime);
612+
static void SetCurrentChunkStartTime(TimestampTz xtime);
609613
static void CheckRequiredParameterValues(void);
610614
static void XLogReportParameters(void);
611615
static void LocalSetXLogInsertAllowed(void);
@@ -5847,6 +5851,41 @@ GetLatestXTime(void)
58475851
return xtime;
58485852
}
58495853

5854+
/*
5855+
* Save timestamp of the next chunk of WAL records to apply.
5856+
*
5857+
* We keep this in XLogCtl, not a simple static variable, so that it can be
5858+
* seen by all backends.
5859+
*/
5860+
static void
5861+
SetCurrentChunkStartTime(TimestampTz xtime)
5862+
{
5863+
/* use volatile pointer to prevent code rearrangement */
5864+
volatile XLogCtlData *xlogctl = XLogCtl;
5865+
5866+
SpinLockAcquire(&xlogctl->info_lck);
5867+
xlogctl->currentChunkStartTime = xtime;
5868+
SpinLockRelease(&xlogctl->info_lck);
5869+
}
5870+
5871+
/*
5872+
* Fetch timestamp of latest processed commit/abort record.
5873+
* Startup process maintains an accurate local copy in XLogReceiptTime
5874+
*/
5875+
TimestampTz
5876+
GetCurrentChunkReplayStartTime(void)
5877+
{
5878+
/* use volatile pointer to prevent code rearrangement */
5879+
volatile XLogCtlData *xlogctl = XLogCtl;
5880+
TimestampTz xtime;
5881+
5882+
SpinLockAcquire(&xlogctl->info_lck);
5883+
xtime = xlogctl->currentChunkStartTime;
5884+
SpinLockRelease(&xlogctl->info_lck);
5885+
5886+
return xtime;
5887+
}
5888+
58505889
/*
58515890
* Returns time of receipt of current chunk of XLOG data, as well as
58525891
* whether it was received from streaming replication or from archives.
@@ -6390,6 +6429,7 @@ StartupXLOG(void)
63906429
xlogctl->replayEndRecPtr = ReadRecPtr;
63916430
xlogctl->recoveryLastRecPtr = ReadRecPtr;
63926431
xlogctl->recoveryLastXTime = 0;
6432+
xlogctl->currentChunkStartTime = 0;
63936433
xlogctl->recoveryPause = false;
63946434
SpinLockRelease(&xlogctl->info_lck);
63956435

@@ -9696,7 +9736,10 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
96969736
{
96979737
havedata = true;
96989738
if (!XLByteLT(*RecPtr, latestChunkStart))
9739+
{
96999740
XLogReceiptTime = GetCurrentTimestamp();
9741+
SetCurrentChunkStartTime(XLogReceiptTime);
9742+
}
97009743
}
97019744
else
97029745
havedata = false;

src/backend/replication/walreceiver.c

+46-1
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ static void XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr);
124124
static void XLogWalRcvFlush(bool dying);
125125
static void XLogWalRcvSendReply(void);
126126
static void XLogWalRcvSendHSFeedback(void);
127+
static void ProcessWalSndrMessage(XLogRecPtr walEnd, TimestampTz sendTime);
127128

128129
/* Signal handlers */
129130
static void WalRcvSigHupHandler(SIGNAL_ARGS);
@@ -218,6 +219,10 @@ WalReceiverMain(void)
218219
/* Fetch information required to start streaming */
219220
strlcpy(conninfo, (char *) walrcv->conninfo, MAXCONNINFO);
220221
startpoint = walrcv->receiveStart;
222+
223+
/* Initialise to a sanish value */
224+
walrcv->lastMsgSendTime = walrcv->lastMsgReceiptTime = GetCurrentTimestamp();
225+
221226
SpinLockRelease(&walrcv->mutex);
222227

223228
/* Arrange to clean up at walreceiver exit */
@@ -433,12 +438,28 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len)
433438
errmsg_internal("invalid WAL message received from primary")));
434439
/* memcpy is required here for alignment reasons */
435440
memcpy(&msghdr, buf, sizeof(WalDataMessageHeader));
441+
442+
ProcessWalSndrMessage(msghdr.walEnd, msghdr.sendTime);
443+
436444
buf += sizeof(WalDataMessageHeader);
437445
len -= sizeof(WalDataMessageHeader);
438-
439446
XLogWalRcvWrite(buf, len, msghdr.dataStart);
440447
break;
441448
}
449+
case 'k': /* Keepalive */
450+
{
451+
PrimaryKeepaliveMessage keepalive;
452+
453+
if (len != sizeof(PrimaryKeepaliveMessage))
454+
ereport(ERROR,
455+
(errcode(ERRCODE_PROTOCOL_VIOLATION),
456+
errmsg_internal("invalid keepalive message received from primary")));
457+
/* memcpy is required here for alignment reasons */
458+
memcpy(&keepalive, buf, sizeof(PrimaryKeepaliveMessage));
459+
460+
ProcessWalSndrMessage(keepalive.walEnd, keepalive.sendTime);
461+
break;
462+
}
442463
default:
443464
ereport(ERROR,
444465
(errcode(ERRCODE_PROTOCOL_VIOLATION),
@@ -711,3 +732,27 @@ XLogWalRcvSendHSFeedback(void)
711732
memcpy(&buf[1], &feedback_message, sizeof(StandbyHSFeedbackMessage));
712733
walrcv_send(buf, sizeof(StandbyHSFeedbackMessage) + 1);
713734
}
735+
736+
/*
737+
* Keep track of important messages from primary.
738+
*/
739+
static void
740+
ProcessWalSndrMessage(XLogRecPtr walEnd, TimestampTz sendTime)
741+
{
742+
/* use volatile pointer to prevent code rearrangement */
743+
volatile WalRcvData *walrcv = WalRcv;
744+
745+
TimestampTz lastMsgReceiptTime = GetCurrentTimestamp();
746+
747+
/* Update shared-memory status */
748+
SpinLockAcquire(&walrcv->mutex);
749+
walrcv->lastMsgSendTime = sendTime;
750+
walrcv->lastMsgReceiptTime = lastMsgReceiptTime;
751+
SpinLockRelease(&walrcv->mutex);
752+
753+
elog(DEBUG2, "sendtime %s receipttime %s replication apply delay %d transfer latency %d",
754+
timestamptz_to_str(sendTime),
755+
timestamptz_to_str(lastMsgReceiptTime),
756+
GetReplicationApplyDelay(),
757+
GetReplicationTransferLatency());
758+
}

src/backend/replication/walreceiverfuncs.c

+63
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "replication/walreceiver.h"
2929
#include "storage/pmsignal.h"
3030
#include "storage/shmem.h"
31+
#include "utils/timestamp.h"
3132

3233
WalRcvData *WalRcv = NULL;
3334

@@ -238,3 +239,65 @@ GetWalRcvWriteRecPtr(XLogRecPtr *latestChunkStart)
238239

239240
return recptr;
240241
}
242+
243+
/*
244+
* Returns the replication apply delay in ms
245+
*/
246+
int
247+
GetReplicationApplyDelay(void)
248+
{
249+
/* use volatile pointer to prevent code rearrangement */
250+
volatile WalRcvData *walrcv = WalRcv;
251+
252+
XLogRecPtr receivePtr;
253+
XLogRecPtr replayPtr;
254+
255+
long secs;
256+
int usecs;
257+
258+
SpinLockAcquire(&walrcv->mutex);
259+
receivePtr = walrcv->receivedUpto;
260+
SpinLockRelease(&walrcv->mutex);
261+
262+
replayPtr = GetXLogReplayRecPtr(NULL);
263+
264+
if (XLByteLE(receivePtr, replayPtr))
265+
return 0;
266+
267+
TimestampDifference(GetCurrentChunkReplayStartTime(),
268+
GetCurrentTimestamp(),
269+
&secs, &usecs);
270+
271+
return (((int) secs * 1000) + (usecs / 1000));
272+
}
273+
274+
/*
275+
* Returns the network latency in ms, note that this includes any
276+
* difference in clock settings between the servers, as well as timezone.
277+
*/
278+
int
279+
GetReplicationTransferLatency(void)
280+
{
281+
/* use volatile pointer to prevent code rearrangement */
282+
volatile WalRcvData *walrcv = WalRcv;
283+
284+
TimestampTz lastMsgSendTime;
285+
TimestampTz lastMsgReceiptTime;
286+
287+
long secs = 0;
288+
int usecs = 0;
289+
int ms;
290+
291+
SpinLockAcquire(&walrcv->mutex);
292+
lastMsgSendTime = walrcv->lastMsgSendTime;
293+
lastMsgReceiptTime = walrcv->lastMsgReceiptTime;
294+
SpinLockRelease(&walrcv->mutex);
295+
296+
TimestampDifference(lastMsgSendTime,
297+
lastMsgReceiptTime,
298+
&secs, &usecs);
299+
300+
ms = ((int) secs * 1000) + (usecs / 1000);
301+
302+
return ms;
303+
}

src/backend/replication/walsender.c

+27-15
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ static void ProcessStandbyMessage(void);
131131
static void ProcessStandbyReplyMessage(void);
132132
static void ProcessStandbyHSFeedbackMessage(void);
133133
static void ProcessRepliesIfAny(void);
134+
static void WalSndKeepalive(char *msgbuf);
134135

135136

136137
/* Main entry point for walsender process */
@@ -823,30 +824,24 @@ WalSndLoop(void)
823824
*/
824825
if (caughtup || pq_is_send_pending())
825826
{
826-
TimestampTz finish_time = 0;
827-
long sleeptime = -1;
827+
TimestampTz timeout = 0;
828+
long sleeptime = 10000; /* 10 s */
828829
int wakeEvents;
829830

830831
wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH |
831-
WL_SOCKET_READABLE;
832+
WL_SOCKET_READABLE | WL_TIMEOUT;
833+
832834
if (pq_is_send_pending())
833835
wakeEvents |= WL_SOCKET_WRITEABLE;
836+
else
837+
WalSndKeepalive(output_message);
834838

835839
/* Determine time until replication timeout */
836840
if (replication_timeout > 0)
837841
{
838-
long secs;
839-
int usecs;
840-
841-
finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
842+
timeout = TimestampTzPlusMilliseconds(last_reply_timestamp,
842843
replication_timeout);
843-
TimestampDifference(GetCurrentTimestamp(),
844-
finish_time, &secs, &usecs);
845-
sleeptime = secs * 1000 + usecs / 1000;
846-
/* Avoid Assert in WaitLatchOrSocket if timeout is past */
847-
if (sleeptime < 0)
848-
sleeptime = 0;
849-
wakeEvents |= WL_TIMEOUT;
844+
sleeptime = 1 + (replication_timeout / 10);
850845
}
851846

852847
/* Sleep until something happens or replication timeout */
@@ -859,7 +854,7 @@ WalSndLoop(void)
859854
* timeout ... he's supposed to reply *before* that.
860855
*/
861856
if (replication_timeout > 0 &&
862-
GetCurrentTimestamp() >= finish_time)
857+
GetCurrentTimestamp() >= timeout)
863858
{
864859
/*
865860
* Since typically expiration of replication timeout means
@@ -1627,6 +1622,23 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
16271622
return (Datum) 0;
16281623
}
16291624

1625+
static void
1626+
WalSndKeepalive(char *msgbuf)
1627+
{
1628+
PrimaryKeepaliveMessage keepalive_message;
1629+
1630+
/* Construct a new message */
1631+
keepalive_message.walEnd = sentPtr;
1632+
keepalive_message.sendTime = GetCurrentTimestamp();
1633+
1634+
elog(DEBUG2, "sending replication keepalive");
1635+
1636+
/* Prepend with the message type and send it. */
1637+
msgbuf[0] = 'k';
1638+
memcpy(msgbuf + 1, &keepalive_message, sizeof(PrimaryKeepaliveMessage));
1639+
pq_putmessage_noblock('d', msgbuf, sizeof(PrimaryKeepaliveMessage) + 1);
1640+
}
1641+
16301642
/*
16311643
* This isn't currently used for anything. Monitoring tools might be
16321644
* interested in the future, and we'll need something like this in the

src/include/access/xlog.h

+1
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,7 @@ extern XLogRecPtr GetXLogWriteRecPtr(void);
293293
extern bool RecoveryIsPaused(void);
294294
extern void SetRecoveryPause(bool recoveryPause);
295295
extern TimestampTz GetLatestXTime(void);
296+
extern TimestampTz GetCurrentChunkReplayStartTime(void);
296297

297298
extern void UpdateControlFile(void);
298299
extern uint64 GetSystemIdentifier(void);

0 commit comments

Comments
 (0)