Fix more issues with cascading replication and timeline switches.

When a standby server follows the master using WAL archive, and it chooses
a new timeline (recovery_target_timeline='latest'), it only fetches the
timeline history file for the chosen target timeline, not any other history
files that might be missing from pg_xlog. For example, if the current
timeline is 2, and we choose 4 as the new recovery target timeline, the
history file for timeline 3 is not fetched, even if it's part of this
server's history. That's enough for the standby itself - the history file
for timeline 4 includes timeline 3 as well - but if a cascading standby
server wants to recover to timeline 3, it needs the history file. To fix,
when a new recovery target timeline is chosen, try to copy any missing
history files from the archive to pg_xlog between the old and new target
timeline.

A second, similar issue was with the WAL files. When a standby recovers from
archive, and it reaches a segment that contains a switch to a new timeline,
recovery fetches only the WAL file labelled with the new timeline's ID. The
file from the new timeline contains a copy of the WAL from the old timeline
up to the point where the switch happened, and recovery recovers it from the
new file. But in streaming replication, walsender only tries to read it
from the old timeline's file. To fix, change walsender to read it from the
new file, so that it behaves the same as recovery in that sense, and doesn't
try to open the possibly nonexistent file with the old timeline's ID.
This commit is contained in:
Heikki Linnakangas 2013-01-23 10:01:04 +02:00
parent 861ad67bd9
commit 990fe3c4ed
5 changed files with 92 additions and 11 deletions

View file

@ -40,6 +40,28 @@
#include "access/xlogdefs.h"
#include "storage/fd.h"
/*
 * Restore from the archive the timeline history file of every timeline ID
 * in the half-open range ['begin', 'end'): 'begin' itself is processed,
 * 'end' is not.  Restored files are meant to end up in pg_xlog.
 *
 * Timeline 1 is always skipped.  A history file that could not be restored
 * is passed over without any further handling here, so this is a
 * best-effort operation and nothing is reported back to the caller.
 */
void
restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
{
	TimeLineID	tli;

	for (tli = begin; tli < end; tli++)
	{
		char		restoredPath[MAXPGPATH];
		char		historyName[MAXFNAMELEN];

		if (tli != 1)
		{
			TLHistoryFileName(historyName, tli);

			/* Keep the file only when the restore attempt succeeded. */
			if (RestoreArchivedFile(restoredPath, historyName,
									"RECOVERYHISTORY", 0, false))
				KeepFileRestoredFromArchive(restoredPath, historyName);
		}
	}
}
/*
* Try to read a timeline's history file.
*

View file

@ -3276,8 +3276,8 @@ rescanLatestTimeLine(void)
bool found;
ListCell *cell;
TimeLineID newtarget;
TimeLineID oldtarget = recoveryTargetTLI;
TimeLineHistoryEntry *currentTle = NULL;
/* use volatile pointer to prevent code rearrangement */
newtarget = findNewestTimeLine(recoveryTargetTLI);
if (newtarget == recoveryTargetTLI)
@ -3336,6 +3336,12 @@ rescanLatestTimeLine(void)
list_free_deep(expectedTLEs);
expectedTLEs = newExpectedTLEs;
/*
* As in StartupXLOG(), try to ensure we have all the history files
* between the old target and new target in pg_xlog.
*/
restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
ereport(LOG,
(errmsg("new target timeline is %u",
recoveryTargetTLI)));
@ -4993,6 +4999,20 @@ StartupXLOG(void)
*/
ThisTimeLineID = checkPoint.ThisTimeLineID;
/*
* Copy any missing timeline history files between 'now' and the
* recovery target timeline from archive to pg_xlog. While we don't need
* those files ourselves - the history file of the recovery target
* timeline covers all the previous timelines in the history too - a
* cascading standby server might be interested in them. Or, if you
* archive the WAL from this server to a different archive than the
* master, it'd be good for all the history files to get archived there
* after failover, so that you can use one of the old timelines as a
* PITR target. Timeline history files are small, so it's better to copy
* them unnecessarily than not copy them and regret later.
*/
restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
lastFullPageWrites = checkPoint.fullPageWrites;
RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;

View file

@ -110,6 +110,9 @@ static int sendFile = -1;
static XLogSegNo sendSegNo = 0;
static uint32 sendOff = 0;
/* Timeline ID of the currently open file */
static TimeLineID curFileTimeLine = 0;
/*
* These variables keep track of the state of the timeline we're currently
* sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
@ -1201,8 +1204,8 @@ WalSndKill(int code, Datum arg)
* always be one descriptor left open until the process ends, but never
* more than one.
*/
void
XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
static void
XLogRead(char *buf, XLogRecPtr startptr, Size count)
{
char *p;
XLogRecPtr recptr;
@ -1222,7 +1225,7 @@ retry:
startoff = recptr % XLogSegSize;
if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo) || sendTimeLine != tli)
if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
{
char path[MAXPGPATH];
@ -1230,9 +1233,45 @@ retry:
if (sendFile >= 0)
close(sendFile);
sendTimeLine = tli;
XLByteToSeg(recptr, sendSegNo);
XLogFilePath(path, sendTimeLine, sendSegNo);
/*-------
* When reading from a historic timeline, and there is a timeline
* switch within this segment, read from the WAL segment belonging
* to the new timeline.
*
* For example, imagine that this server is currently on timeline
* 5, and we're streaming timeline 4. The switch from timeline 4
* to 5 happened at 0/13002088. In pg_xlog, we have these files:
*
* ...
* 000000040000000000000012
* 000000040000000000000013
* 000000050000000000000013
* 000000050000000000000014
* ...
*
* In this situation, when requested to send the WAL from
* segment 0x13, on timeline 4, we read the WAL from file
* 000000050000000000000013. Archive recovery prefers files from
* newer timelines, so if the segment was restored from the
* archive on this server, the file belonging to the old timeline,
* 000000040000000000000013, might not exist. Their contents are
* equal up to the switchpoint, because at a timeline switch, the
* used portion of the old segment is copied to the new file.
*-------
*/
curFileTimeLine = sendTimeLine;
if (sendTimeLineIsHistoric)
{
XLogSegNo endSegNo;
XLByteToSeg(sendTimeLineValidUpto, endSegNo);
if (sendSegNo == endSegNo)
curFileTimeLine = sendTimeLineNextTLI;
}
XLogFilePath(path, curFileTimeLine, sendSegNo);
sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
if (sendFile < 0)
@ -1246,7 +1285,7 @@ retry:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("requested WAL segment %s has already been removed",
XLogFileNameP(sendTimeLine, sendSegNo))));
XLogFileNameP(curFileTimeLine, sendSegNo))));
else
ereport(ERROR,
(errcode_for_file_access(),
@ -1263,7 +1302,7 @@ retry:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not seek in log segment %s to offset %u: %m",
XLogFileNameP(sendTimeLine, sendSegNo),
XLogFileNameP(curFileTimeLine, sendSegNo),
startoff)));
sendOff = startoff;
}
@ -1280,7 +1319,7 @@ retry:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read from log segment %s, offset %u, length %lu: %m",
XLogFileNameP(sendTimeLine, sendSegNo),
XLogFileNameP(curFileTimeLine, sendSegNo),
sendOff, (unsigned long) segbytes)));
}
@ -1524,7 +1563,7 @@ XLogSend(bool *caughtup)
* calls.
*/
enlargeStringInfo(&output_message, nbytes);
XLogRead(&output_message.data[output_message.len], sendTimeLine, startptr, nbytes);
XLogRead(&output_message.data[output_message.len], startptr, nbytes);
output_message.len += nbytes;
output_message.data[output_message.len] = '\0';

View file

@ -35,6 +35,7 @@ extern TimeLineID findNewestTimeLine(TimeLineID startTLI);
extern void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
XLogRecPtr switchpoint, char *reason);
extern void writeTimeLineHistoryFile(TimeLineID tli, char *content, int size);
extern void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end);
extern bool tliInHistory(TimeLineID tli, List *expectedTLIs);
extern TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history);
extern XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history,

View file

@ -95,7 +95,6 @@ extern WalSndCtlData *WalSndCtl;
extern void WalSndSetState(WalSndState state);
extern void XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count);
/*
* Internal functions for parsing the replication grammar, in repl_gram.y and