From 4039d7d0b227953e45982558757d2ed126b55fab Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Tue, 2 Jul 2024 12:13:04 -0400 Subject: [PATCH 1/2] ra_lib:sync_file/1: Add 'read' to 'file:open/2' options From the [file:open/2] docs: > `write` - The file is opened for writing. It is created if it does not > exist. If the file exists and `write` is not combined with `read`, the > file is truncated. Without this change, checkpoints which are promoted to snapshots end up as empty snapshot files. [file:open/2]: https://www.erlang.org/doc/apps/kernel/file.html#open/2 Co-authored-by: Karl Nilsson --- src/ra_lib.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ra_lib.erl b/src/ra_lib.erl index 4895aa72..7133b25e 100644 --- a/src/ra_lib.erl +++ b/src/ra_lib.erl @@ -349,7 +349,7 @@ write_file(Name, IOData, Sync) -> -spec sync_file(file:name_all()) -> ok | file_err(). sync_file(Name) -> - case file:open(Name, [binary, write, raw]) of + case file:open(Name, [binary, read, write, raw]) of {ok, Fd} -> sync_and_close_fd(Fd); Err -> From f6c61e4730d3bf8cf395113023b665fecaaf63f6 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Tue, 2 Jul 2024 12:36:40 -0400 Subject: [PATCH 2/2] Add another server restart to coordination_SUITE:recover_from_checkpoint This ensures that we can recover from a snapshot which was promoted from a checkpoint. The restart earlier in the test case ensures that we can recover from a checkpoint but that doesn't provide any insight into checkpoint promotion. Without the fix for `ra_lib:sync_file/1` in the parent commit, this case fails on the new calls to `ra:restart_server/2`. 
--- test/coordination_SUITE.erl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/coordination_SUITE.erl b/test/coordination_SUITE.erl index db9bdf57..f5182579 100644 --- a/test/coordination_SUITE.erl +++ b/test/coordination_SUITE.erl @@ -814,6 +814,16 @@ recover_from_checkpoint(Config) -> Follower2Idx =:= 8 end, 20), + %% Restart the servers: the servers should be able to recover from the + %% snapshot which was promoted from a checkpoint. + [ok = ra:stop_server(?SYS, ServerId) || ServerId <- ServerIds], + [ok = ra:restart_server(?SYS, ServerId) || ServerId <- ServerIds], + [{ok, {_CurrentIdx, _CheckpointIdx = 8}, _Leader} = + ra:local_query(ServerId, fun(State) -> + maps:get(checkpoint_index, State, + undefined) + end) || ServerId <- ServerIds], + stop_nodes(ServerIds), ok.