Speed up loading Results (multiprocessing + ignore summary files)

burggraaff · Mar 19, 2024 · 5c31ff0
1 parent f1e497d
commit 5c31ff0
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 8 deletions.
diff --git a/fpcup/io.py b/fpcup/io.py
@@ -1,7 +1,7 @@
 """
 Functions for file input and output.
 """
-from functools import cache
+from functools import cache, partial
 from multiprocessing import Pool
 from os import makedirs
 from pathlib import Path
@@ -19,7 +19,9 @@
 from .constants import CRS_AMERSFOORT
 from .model import Result, Summary
 
+# Constants
 _SAMPLE_LENGTH = 10
+_THRESHOLD_PARALLEL_LOADING = 1000
 
 def save_ensemble_results(results: Iterable[Result], savefolder: PathOrStr, *,
                           progressbar=True, leave_progressbar=True) -> None:
@@ -95,8 +97,10 @@ def load_ensemble_summary_from_folder(folder: PathOrStr, *,
     return summary
 
 
+_load_ensemble_result_simple = partial(Result.from_file, run_id=None, include_summary=False)
 def load_ensemble_results_from_folder(folder: PathOrStr, run_ids: Optional[Iterable[PathOrStr]]=None, *,
-                                      extension=".wout", sample=False, progressbar=True, leave_progressbar=True) -> list[Result]:
+                                      extension=".wout", sample=False,
+                                      progressbar=True, leave_progressbar=True) -> list[Result]:
     """
     Load the result files in a given folder.
     By default, load all files in the folder. If `run_ids` is specified, load only those files.
@@ -125,10 +129,10 @@ def load_ensemble_results_from_folder(folder: PathOrStr, run_ids: Optional[Itera
 
     # Load the files with an optional progressbar
     filenames = tqdm(filenames, total=n_results, desc="Loading outputs", unit="files", disable=not progressbar, leave=leave_progressbar)
-    # if n_results < 1000:
-    results = [Result.from_file(filename) for filename in filenames]
-    # else:
-    #     with Pool() as p:
-    #         results = list(p.imap_unordered(Result.from_file, filenames, chunksize=100))
+    if n_results < _THRESHOLD_PARALLEL_LOADING:
+        results = list(map(_load_ensemble_result_simple, filenames))
+    else:
+        with Pool() as p:
+            results = list(p.imap_unordered(_load_ensemble_result_simple, filenames, chunksize=25))
 
     return results
diff --git a/fpcup/model.py b/fpcup/model.py
@@ -313,7 +313,7 @@ def from_model(cls, model: Engine, run_data: RunData, **kwargs):
 
     @classmethod
     def from_file(cls, filename: PathOrStr, *,
-                  run_id: Optional[str]=None, include_summary=True, **kwargs):
+                  run_id: Optional[str]=None, include_summary=False, **kwargs):
         """
         Load an output file.
         If a run_id is not provided, use the filename stem.