From 4039d7d0b227953e45982558757d2ed126b55fab Mon Sep 17 00:00:00 2001
From: Michael Davis <mcarsondavis@gmail.com>
Date: Tue, 2 Jul 2024 12:13:04 -0400
Subject: [PATCH 1/2] ra_lib:sync_file/1: Add 'read' to 'file:open/2' options

From the [file:open/2] docs:

> `write` - The file is opened for writing. It is created if it does not
> exist. If the file exists and `write` is not combined with `read`, the
> file is truncated.

Without this change, checkpoints which are promoted to snapshots end up
as empty snapshot files.

[file:open/2]: https://www.erlang.org/doc/apps/kernel/file.html#open/2

Co-authored-by: Karl Nilsson <kjnilsson@gmail.com>
---
 src/ra_lib.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ra_lib.erl b/src/ra_lib.erl
index 4895aa72..7133b25e 100644
--- a/src/ra_lib.erl
+++ b/src/ra_lib.erl
@@ -349,7 +349,7 @@ write_file(Name, IOData, Sync) ->
 -spec sync_file(file:name_all()) ->
     ok | file_err().
 sync_file(Name) ->
-    case file:open(Name, [binary, write, raw]) of
+    case file:open(Name, [binary, read, write, raw]) of
         {ok, Fd} ->
             sync_and_close_fd(Fd);
         Err ->

From f6c61e4730d3bf8cf395113023b665fecaaf63f6 Mon Sep 17 00:00:00 2001
From: Michael Davis <mcarsondavis@gmail.com>
Date: Tue, 2 Jul 2024 12:36:40 -0400
Subject: [PATCH 2/2] Add another server restart to
 coordination_SUITE:recover_from_checkpoint

This ensures that we can recover from a snapshot which was promoted from
a checkpoint. The restart earlier in the test case ensures that we can
recover from a checkpoint but that doesn't provide any insight into
checkpoint promotion.

Without the fix for `ra_lib:sync_file/1` in the parent commit, this case
fails on the new calls to `ra:restart_server/2`.
---
 test/coordination_SUITE.erl | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/coordination_SUITE.erl b/test/coordination_SUITE.erl
index db9bdf57..f5182579 100644
--- a/test/coordination_SUITE.erl
+++ b/test/coordination_SUITE.erl
@@ -814,6 +814,16 @@ recover_from_checkpoint(Config) ->
                 Follower2Idx =:= 8
       end, 20),
 
+    %% Restart the servers: the servers should be able to recover from the
+    %% snapshot which was promoted from a checkpoint.
+    [ok = ra:stop_server(?SYS, ServerId) || ServerId <- ServerIds],
+    [ok = ra:restart_server(?SYS, ServerId) || ServerId <- ServerIds],
+    [{ok, {_CurrentIdx, _CheckpointIdx = 8}, _Leader} =
+       ra:local_query(ServerId, fun(State) ->
+                                        maps:get(checkpoint_index, State,
+                                                 undefined)
+                                end) || ServerId <- ServerIds],
+
     stop_nodes(ServerIds),
     ok.