From e559799e0952b4c2e9fdfc974b18a03ec1f21f46 Mon Sep 17 00:00:00 2001 From: Rishabh Sharma Date: Mon, 14 Oct 2024 15:51:43 -0700 Subject: [PATCH] model and dataset logging for raytune using CMF (#212) --- cmflib/cmf_ray_logger.py | 10 +++++++++- docs/api/public/cmf_ray_logger.md | 29 ++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/cmflib/cmf_ray_logger.py b/cmflib/cmf_ray_logger.py index 7146d012..ff50d904 100644 --- a/cmflib/cmf_ray_logger.py +++ b/cmflib/cmf_ray_logger.py @@ -5,7 +5,7 @@ class CmfRayLogger(Callback): #id_count = 1 - def __init__(self, pipeline_name, file_path, pipeline_stage): + def __init__(self, pipeline_name, file_path, pipeline_stage, data_dir = None): """ pipeline_name: The name of the CMF Pipelibe file_path: The path to metadata file @@ -16,6 +16,7 @@ def __init__(self, pipeline_name, file_path, pipeline_stage): self.pipeline_stage = pipeline_stage self.cmf_obj = {} self.cmf_run = {} + self.data_dir = data_dir def on_trial_start(self, iteration, trials, trial, **info): trial_id = trial.trial_id @@ -28,6 +29,8 @@ def on_trial_start(self, iteration, trials, trial, **info): custom_properties = {'Configuration': trial_config}) #self.execution_id[trial_id] = CmfRayLogger.id_count #CmfRayLogger.id_count+=1 + if self.data_dir: + _ = self.cmf_obj[trial_id].log_dataset(url = str(self.data_dir), event = "input") def on_trial_result(self, iteration, trials, trial, result, **info): trial_id = trial.trial_id @@ -58,6 +61,11 @@ def on_trial_complete(self, iteration, trials, trial, **info): _ = self.cmf_obj[trial_id].log_execution_metrics(metrics_name = f"Trial_{trial_id}_Result", custom_properties = {'Result': trial_result}) + if 'model_path' in trial_result: + _ = self.cmf_obj[trial_id].log_model(path = trial_result['model_path'], + event = 'input', + model_name = f"{trial_id}_model") + def on_trial_error(self, iteration, trials, trial, **info): trial_id = trial.trial_id trial_config = trial.config diff 
--git a/docs/api/public/cmf_ray_logger.md b/docs/api/public/cmf_ray_logger.md index 4447e28a..36f81fe2 100644 --- a/docs/api/public/cmf_ray_logger.md +++ b/docs/api/public/cmf_ray_logger.md @@ -25,11 +25,13 @@ Create an instance of CmfRayLogger by providing the following parameters: * pipeline_name: A string representing the name of the CMF pipeline. * file_path: The file path to the metadata file associated with the CMF pipeline. * pipeline_stage: The name of the current stage of the CMF pipeline. +* data_dir (optional): A directory path where trial data should be logged. If the path is within the CMF directory, it should be relative. If it is outside, it must be an absolute path. Default value is `None`. Example of instantiation: ```python -logger = cmf_ray_logger.CmfRayLogger(pipeline_name, file_path, pipeline_stage) +logger = cmf_ray_logger.CmfRayLogger(pipeline_name, file_path, pipeline_stage, data_dir) ``` +Here, the `data_dir` argument is used to log the dataset at the start of each trial. Ensure that this path is relative if within the CMF directory and absolute if external to the CMF directory. ## Integration with Ray Tune @@ -50,8 +52,22 @@ tune.run( ) ``` +## Model Logging +`CmfRayLogger` can now log the model during trials. To enable this, the `train.report` method must include a special key: `"model_path"`. The value of `"model_path"` should be a relative path pointing to the saved model within the CMF directory. + +Important: Ensure that the `"model_path"` is relative, as the DVC wrapper expects all paths nested within the CMF directory to be relative. +```Python +train.report({ + "accuracy": 0.95, + "loss": 0.05, + "model_path": "models/example_model.pth" +}) +``` + + + ## Output -During each trial, `CmfRayLogger` will automatically create a CMF object with attributes set as `pipeline_name`, `pipeline_stage`, and the CMF execution as `trial_id`. It captures the trial's output and logs it under the metric key `'Output'`. 
+During each trial, `CmfRayLogger` will automatically create a CMF object with attributes set as `pipeline_name`, `pipeline_stage`, and the CMF execution as `trial_id`. It captures the trial's output and logs it under the metric key `'Output'`. Additionally, it logs the dataset at the start of each trial (if `data_dir` is specified) and logs the model based on the `"model_path"` key in `train.report`. ## Example Here is a complete example of how to use `CmfRayLogger` with Ray Tune: @@ -61,7 +77,7 @@ from cmf import cmf_ray_logger from ray import tune # Initialize the logger -logger = cmf_ray_logger.CmfRayLogger("ExamplePipeline", "/path/to/metadata.json", "Stage1") +logger = cmf_ray_logger.CmfRayLogger("ExamplePipeline", "/path/to/metadata.json", "Stage1", "path/to/data_dir") # Configuration for tuning config = { @@ -74,4 +90,11 @@ tune.run( config=config, callbacks=[logger] ) + +# Reporting within your trainable function +train.report({ + "accuracy": 0.95, + "loss": 0.05, + "model_path": "path/to/models/example_model.pth" +}) ``` \ No newline at end of file