From 09e1029a306a1486f7b4ac4ca3dd708c6d646bd0 Mon Sep 17 00:00:00 2001
From: franzi448 <177637518+franzi448@users.noreply.github.com>
Date: Tue, 20 Aug 2024 14:13:29 +0200
Subject: [PATCH 1/2] Update real_world.md produce_or_load parameter (#422)

---
 docs/src/real_world.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/real_world.md b/docs/src/real_world.md
index ebd81006..2064f3f8 100644
--- a/docs/src/real_world.md
+++ b/docs/src/real_world.md
@@ -254,9 +254,9 @@ end
 
 N = 2000; T = 2000.0
 data, file = produce_or_load(
-    datadir("mushrooms", "toy"), # path
+    simulation, # function
     @dict(N, T), # container
-    simulation; # function
+    datadir("mushrooms", "toy"), # path
     prefix = "fig5_toyparams" # prefix for savename
 )
 @unpack toypar_h = data

From ac0fe7f3c2c869db0e9400ab862cd2e5438e6c57 Mon Sep 17 00:00:00 2001
From: NuclearPowerNerd <58567518+NuclearPowerNerd@users.noreply.github.com>
Date: Fri, 23 Aug 2024 05:51:17 -0400
Subject: [PATCH 2/2] implement `load_function` kwarg for collect_results!
 (#424)

The changes in this branch are a follow-up to a previous pull request
based on commit 6e6ff07 in PR #421. In that PR there were issues
with whitespace changes inadvertently coming from the autoformatter
in vscode. Reverting the whitespace-only changes proved to be more
difficult than anticipated.

So to resolve this, this branch was created and a new PR will be created
from it. The whitespace issues are gone but all the feedback and changes
from the original PR are retained.

The commit makes the following changes.
 - add the `load_function` kwarg to `collect_results`. This allows
customizing how data is loaded from file before being processed into a
dataframe by `collect_results`.
 - add a test to `update_results_tests.jl`
 - update docstring of `collect_results`
 - increase package version to 2.16.0
 - update `CHANGELOG.md`

All tests passed, 589 of 589.
---
 CHANGELOG.md                 |  4 ++++
 Project.toml                 |  2 +-
 src/result_collection.jl     | 17 +++++++++--------
 test/update_results_tests.jl | 16 ++++++++++++++++
 4 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 065c8dab..5cfbf21c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# 2.16.0
+
+ - Add `load_function` keyword argument to `collect_results` to customize how data is loaded from file before being converted to a dataframe by `collect_results`
+
 # 2.15.0
 
  - Add `wload_kwargs` to `produce_or_load` to allow passing kwargs to `wload`
diff --git a/Project.toml b/Project.toml
index d806621b..c50f70f5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "DrWatson"
 uuid = "634d3b9d-ee7a-5ddf-bec9-22491ea816e1"
 repo = "https://github.com/JuliaDynamics/DrWatson.jl.git"
-version = "2.15.0"
+version = "2.16.0"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
diff --git a/src/result_collection.jl b/src/result_collection.jl
index 42ce4791..c75fcf62 100644
--- a/src/result_collection.jl
+++ b/src/result_collection.jl
@@ -50,6 +50,7 @@ See also [`collect_results`](@ref).
 * `black_list = [:gitcommit, :gitpatch, :script]`: List of keys not to include from result-file.
 * `special_list = []`: List of additional (derived) key-value pairs
   to put in `df` as explained below.
+* `load_function = wload`: Function used to load data from file. Defaults to `wload`. A custom load function is useful, for example, if you store results as a struct and want the fields of the struct to form the columns of the dataframe. Because the struct is saved to file as a one-element dictionary, the dataframe would otherwise have only a single column. To work around this, convert the struct to a dictionary by specifying `load_function = (filename) -> struct2dict(wload(filename)["mykey"])`. This way `collect_results` will receive a `Dict` whose keys are the fields of the struct.
 
 `special_list` is a `Vector` where each entry
 is a derived quantity to be included in `df`. There are two types of entries.
@@ -90,6 +91,7 @@ function collect_results!(filename, folder;
     newfile = false, # keyword only for defining collect_results without !
     rinclude = [r""],
     rexclude = [r"^\b$"],
+    load_function = wload,
     kwargs...)
 
     @assert all(eltype(r) <: Regex for r in (rinclude, rexclude)) "Elements of `rinclude` and `rexclude` must be Regex expressions."
@@ -100,7 +102,7 @@ function collect_results!(filename, folder;
         mtimes = Dict{String,Float64}()
     else
         verbose && @info "Loading existing result collection..."
-        data = wload(filename)
+        data = load_function(filename)
         df = data["df"]
         # Check if we have pre-recorded mtimes (if not this could be because of an old results database).
         if "mtime" ∈ keys(data)
@@ -170,7 +172,7 @@ function collect_results!(filename, folder;
         mtimes[file] = mtime_file
 
         fpath = rpath === nothing ? file : joinpath(rpath, file)
-        df_new = to_data_row(FileIO.query(fpath); kwargs...)
+        df_new = to_data_row(FileIO.query(fpath); load_function=load_function, kwargs...)
         #add filename
         df_new[!, :path] .= file
         if replace_entry
@@ -231,18 +233,17 @@ is_valid_file(file, valid_filetypes) =
     any(endswith(file, v) for v in valid_filetypes)
 
 # Use wload per default when nothing else is available
-function to_data_row(file::File; kwargs...)
+function to_data_row(file::File; load_function=wload, kwargs...)
     fpath = filename(file)
     @debug "Opening $(filename(file)) with fallback wload."
-    return to_data_row(wload(fpath), fpath; kwargs...)
+    return to_data_row(load_function(fpath), fpath; kwargs...)
 end
 # Specialize for JLD2 files, can do much faster mmapped access
-function to_data_row(file::File{format"JLD2"}; kwargs...)
+function to_data_row(file::File{format"JLD2"}; load_function=(filename) -> JLD2.jldopen(filename, "r"), kwargs...)
     fpath = filename(file)
     @debug "Opening $(filename(file)) with jldopen."
-    JLD2.jldopen(filename(file), "r") do data
-        return to_data_row(data, fpath; kwargs...)
-    end
+    data = load_function(fpath)
+    return to_data_row(data, fpath; kwargs...)
 end
 function to_data_row(data, file;
         white_list = collect(keys(data)),
diff --git a/test/update_results_tests.jl b/test/update_results_tests.jl
index e7849908..aad2bd26 100644
--- a/test/update_results_tests.jl
+++ b/test/update_results_tests.jl
@@ -64,6 +64,22 @@ cres_relpath = collect_results!(relpathname, folder;
     rpath = projectdir())
 @info all(startswith.(cres[!,"path"], "data"))
 
+struct dummy
+    a::Float64
+    b::Int64
+    c::Matrix{Float64}
+end
+_dummy_matrix = rand(3,3)
+_dummy = dummy(1.0, 1, _dummy_matrix)
+wsave(datadir("dummy.jld2"), "dummy", _dummy)
+
+actual_dataframe = collect_results(datadir(), rinclude=[r"dummy.jld2"], load_function=(filename) -> struct2dict(wload(filename)["dummy"]))
+_dataframe_vector = Vector{Union{Missing, Matrix{Float64}}}(undef, 1)
+_dataframe_vector[1] = _dummy_matrix
+expected_dataframe = DataFrame(a = 1.0, b = 1, c = _dataframe_vector, path = datadir("dummy.jld2"))
+
+@test actual_dataframe == expected_dataframe
+
 ###############################################################################
 #                           Trailing slash in foldername                      #
 ###############################################################################