From 990fe3c4edfdabf4f56aa9a403a11f53006d0dd7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 23 Jan 2013 10:01:04 +0200 Subject: [PATCH] Fix more issues with cascading replication and timeline switches. When a standby server follows the master using a WAL archive, and it chooses a new timeline (recovery_target_timeline='latest'), it only fetches the timeline history file for the chosen target timeline, not any other history files that might be missing from pg_xlog. For example, if the current timeline is 2, and we choose 4 as the new recovery target timeline, the history file for timeline 3 is not fetched, even if it's part of this server's history. That's enough for the standby itself - the history file for timeline 4 includes timeline 3 as well - but if a cascading standby server wants to recover to timeline 3, it needs the history file. To fix, when a new recovery target timeline is chosen, try to copy from the archive to pg_xlog any history files between the old and new target timelines that are missing. A second, similar issue was with the WAL files. When a standby recovers from archive, and it reaches a segment that contains a switch to a new timeline, recovery fetches only the WAL file labelled with the new timeline's ID. The file from the new timeline contains a copy of the WAL from the old timeline up to the point where the switch happened, and recovery recovers it from the new file. But in streaming replication, walsender only tries to read it from the old timeline's file. To fix, change walsender to read it from the new file, so that it behaves the same as recovery in that sense, and doesn't try to open the possibly nonexistent file with the old timeline's ID. 
--- src/backend/access/transam/timeline.c | 22 ++++++++ src/backend/access/transam/xlog.c | 22 +++++++- src/backend/replication/walsender.c | 57 +++++++++++++++++---- src/include/access/timeline.h | 1 + src/include/replication/walsender_private.h | 1 - 5 files changed, 92 insertions(+), 11 deletions(-) diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index ad4f3162c5..51b37ca8f8 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -40,6 +40,28 @@ #include "access/xlogdefs.h" #include "storage/fd.h" +/* + * Copies timeline history files with IDs in the range [begin, end) from the + * archive to pg_xlog. Timeline 1 is skipped; unrestorable files are ignored. + */ +void +restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end) +{ + char path[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + TimeLineID tli; + + for (tli = begin; tli < end; tli++) + { + if (tli == 1) + continue; + + TLHistoryFileName(histfname, tli); + if (RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false)) + KeepFileRestoredFromArchive(path, histfname); + } +} + /* * Try to read a timeline's history file. * diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 9ad9227179..d316c97926 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3276,8 +3276,8 @@ rescanLatestTimeLine(void) bool found; ListCell *cell; TimeLineID newtarget; + TimeLineID oldtarget = recoveryTargetTLI; TimeLineHistoryEntry *currentTle = NULL; - /* use volatile pointer to prevent code rearrangement */ newtarget = findNewestTimeLine(recoveryTargetTLI); if (newtarget == recoveryTargetTLI) @@ -3336,6 +3336,12 @@ rescanLatestTimeLine(void) list_free_deep(expectedTLEs); expectedTLEs = newExpectedTLEs; + /* + * As in StartupXLOG(), try to ensure we have all the history files + * between the old target and new target in pg_xlog. 
+ */ + restoreTimeLineHistoryFiles(oldtarget + 1, newtarget); + ereport(LOG, (errmsg("new target timeline is %u", recoveryTargetTLI))); @@ -4993,6 +4999,20 @@ StartupXLOG(void) */ ThisTimeLineID = checkPoint.ThisTimeLineID; + /* + * Copy any missing timeline history files between 'now' and the + * recovery target timeline from archive to pg_xlog. While we don't need + * those files ourselves - the history file of the recovery target + * timeline covers all the previous timelines in the history too - a + * cascading standby server might be interested in them. Or, if you + * archive the WAL from this server to a different archive than the + * master, it'd be good for all the history files to get archived there + * after failover, so that you can use one of the old timelines as a + * PITR target. Timeline history files are small, so it's better to copy + * them unnecessarily than not copy them and regret later. + */ + restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI); + lastFullPageWrites = checkPoint.fullPageWrites; RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index ba138e73da..10e4050696 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -110,6 +110,9 @@ static int sendFile = -1; static XLogSegNo sendSegNo = 0; static uint32 sendOff = 0; +/* Timeline ID of the currently open file */ +static TimeLineID curFileTimeLine = 0; + /* * These variables keep track of the state of the timeline we're currently * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, @@ -1201,8 +1204,8 @@ WalSndKill(int code, Datum arg) * always be one descriptor left open until the process ends, but never * more than one. 
*/ -void -XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) +static void +XLogRead(char *buf, XLogRecPtr startptr, Size count) { char *p; XLogRecPtr recptr; @@ -1222,7 +1225,7 @@ retry: startoff = recptr % XLogSegSize; - if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo) || sendTimeLine != tli) + if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo)) { char path[MAXPGPATH]; @@ -1230,9 +1233,45 @@ retry: if (sendFile >= 0) close(sendFile); - sendTimeLine = tli; XLByteToSeg(recptr, sendSegNo); - XLogFilePath(path, sendTimeLine, sendSegNo); + + /*------- + * When reading from a historic timeline, and there is a timeline + * switch within this segment, read from the WAL segment belonging + * to the new timeline. + * + * For example, imagine that this server is currently on timeline + * 5, and we're streaming timeline 4. The switch from timeline 4 + * to 5 happened at 0/13002088. In pg_xlog, we have these files: + * + * ... + * 000000040000000000000012 + * 000000040000000000000013 + * 000000050000000000000013 + * 000000050000000000000014 + * ... + * + * In this situation, when requested to send the WAL from + * segment 0x13, on timeline 4, we read the WAL from file + * 000000050000000000000013. Archive recovery prefers files from + * newer timelines, so if the segment was restored from the + * archive on this server, the file belonging to the old timeline, + * 000000040000000000000013, might not exist. Their contents are + * equal up to the switchpoint, because at a timeline switch, the + * used portion of the old segment is copied to the new file. 
+ *------- + */ + curFileTimeLine = sendTimeLine; + if (sendTimeLineIsHistoric) + { + XLogSegNo endSegNo; + + XLByteToSeg(sendTimeLineValidUpto, endSegNo); + if (sendSegNo == endSegNo) + curFileTimeLine = sendTimeLineNextTLI; + } + + XLogFilePath(path, curFileTimeLine, sendSegNo); sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0); if (sendFile < 0) @@ -1246,7 +1285,7 @@ retry: ereport(ERROR, (errcode_for_file_access(), errmsg("requested WAL segment %s has already been removed", - XLogFileNameP(sendTimeLine, sendSegNo)))); + XLogFileNameP(curFileTimeLine, sendSegNo)))); else ereport(ERROR, (errcode_for_file_access(), @@ -1263,7 +1302,7 @@ retry: ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek in log segment %s to offset %u: %m", - XLogFileNameP(sendTimeLine, sendSegNo), + XLogFileNameP(curFileTimeLine, sendSegNo), startoff))); sendOff = startoff; } @@ -1280,7 +1319,7 @@ retry: ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from log segment %s, offset %u, length %lu: %m", - XLogFileNameP(sendTimeLine, sendSegNo), + XLogFileNameP(curFileTimeLine, sendSegNo), sendOff, (unsigned long) segbytes))); } @@ -1524,7 +1563,7 @@ XLogSend(bool *caughtup) * calls. 
*/ enlargeStringInfo(&output_message, nbytes); - XLogRead(&output_message.data[output_message.len], sendTimeLine, startptr, nbytes); + XLogRead(&output_message.data[output_message.len], startptr, nbytes); output_message.len += nbytes; output_message.data[output_message.len] = '\0'; diff --git a/src/include/access/timeline.h b/src/include/access/timeline.h index 7d45fcad8a..2e5e9a42a3 100644 --- a/src/include/access/timeline.h +++ b/src/include/access/timeline.h @@ -35,6 +35,7 @@ extern TimeLineID findNewestTimeLine(TimeLineID startTLI); extern void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, XLogRecPtr switchpoint, char *reason); extern void writeTimeLineHistoryFile(TimeLineID tli, char *content, int size); +extern void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end); extern bool tliInHistory(TimeLineID tli, List *expectedTLIs); extern TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history); extern XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, diff --git a/src/include/replication/walsender_private.h b/src/include/replication/walsender_private.h index 8f479fda7e..7eaa21b9f7 100644 --- a/src/include/replication/walsender_private.h +++ b/src/include/replication/walsender_private.h @@ -95,7 +95,6 @@ extern WalSndCtlData *WalSndCtl; extern void WalSndSetState(WalSndState state); -extern void XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count); /* * Internal functions for parsing the replication grammar, in repl_gram.y and