From 277163871264e63c59bc21a19451a8fca8e06b0e Mon Sep 17 00:00:00 2001 From: Jeffrey Lim Date: Fri, 8 Mar 2024 09:48:27 +0100 Subject: [PATCH 1/9] Fix pyproject.toml --- pyproject.toml | 12 ++++++------ requirements.txt | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6e54b23..8f59573 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,20 +17,20 @@ classifiers = [ ] dependencies= [ # Machine Learning Libraries - "numpy>=1.24.4", + "numpy~=1.26.4", # Parallel Processing Libraries - "dask>=2023.12.0", + "dask~=2023.12.0", # Data Processing Libraries - "pandas>=1.3.3", + "pandas~=1.3.3", # Parquet - "pyarrow>=6.0.0", + "pyarrow~=6.0.0", # PyTorch - "torch>=2.1.2+cu118", + "torch~=2.2.1", # Agogos - "agogos>=0.2.0", + "agogos~=0.2.0", ] diff --git a/requirements.txt b/requirements.txt index bfea61f..b7cbcd5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -agogos>=0.2.0 +agogos==0.2.0 annotated-types==0.6.0 certifi==2024.2.2 charset-normalizer==3.3.2 @@ -23,7 +23,7 @@ pandas==2.2.1 partd==1.4.1 pillow==10.2.0 pluggy==1.4.0 -pyarrow>=6.0.0 +pyarrow==6.0.0 pytest==8.0.2 pytest-cov==4.1.0 PyYAML==6.0.1 From 50d29f8a6476e1dced56ef7af22381c1f0f99282 Mon Sep 17 00:00:00 2001 From: schobbejak Date: Fri, 8 Mar 2024 10:29:23 +0100 Subject: [PATCH 2/9] Add log_to_terminal for loading cache --- epochalyst/pipeline/model/training/training.py | 3 +++ epochalyst/pipeline/model/training/training_block.py | 3 +++ epochalyst/pipeline/model/transformation/transformation.py | 3 +++ .../pipeline/model/transformation/transformation_block.py | 3 +++ 4 files changed, 12 insertions(+) diff --git a/epochalyst/pipeline/model/training/training.py b/epochalyst/pipeline/model/training/training.py index 8c625fd..bc79946 100644 --- a/epochalyst/pipeline/model/training/training.py +++ b/epochalyst/pipeline/model/training/training.py @@ -25,6 +25,9 @@ def train( and self._cache_exists(name=self.get_hash() + "x", cache_args=cache_args) and self._cache_exists(name=self.get_hash() + "y", cache_args=cache_args) ): + self.log_to_terminal( + f"Cache exists for training pipeline with hash: {self.get_hash()}. Using the cache." + ) x = self._get_cache(name=self.get_hash() + "x", cache_args=cache_args) y = self._get_cache(name=self.get_hash() + "y", cache_args=cache_args) return x, y diff --git a/epochalyst/pipeline/model/training/training_block.py b/epochalyst/pipeline/model/training/training_block.py index 3252c8e..bd4872c 100644 --- a/epochalyst/pipeline/model/training/training_block.py +++ b/epochalyst/pipeline/model/training/training_block.py @@ -70,6 +70,9 @@ def train( and self._cache_exists(name=self.get_hash() + "x", cache_args=cache_args) and self._cache_exists(name=self.get_hash() + "y", cache_args=cache_args) ): + self.log_to_terminal( + f"Cache exists for {self.__class__} with hash: {self.get_hash()}. Using the cache." 
+ ) x = self._get_cache(name=self.get_hash() + "x", cache_args=cache_args) y = self._get_cache(name=self.get_hash() + "y", cache_args=cache_args) return x, y diff --git a/epochalyst/pipeline/model/transformation/transformation.py b/epochalyst/pipeline/model/transformation/transformation.py index 77dcddb..689eed8 100644 --- a/epochalyst/pipeline/model/transformation/transformation.py +++ b/epochalyst/pipeline/model/transformation/transformation.py @@ -26,6 +26,9 @@ def transform( """ if cache_args and self._cache_exists(self.get_hash(), cache_args): + self.log_to_terminal( + f"Cache exists for {self.title} with hash: {self.get_hash()}. Using the cache." + ) return self._get_cache(self.get_hash(), cache_args) if self.steps: diff --git a/epochalyst/pipeline/model/transformation/transformation_block.py b/epochalyst/pipeline/model/transformation/transformation_block.py index bb11dfd..0254fe2 100644 --- a/epochalyst/pipeline/model/transformation/transformation_block.py +++ b/epochalyst/pipeline/model/transformation/transformation_block.py @@ -79,6 +79,9 @@ def transform( if cache_args and self._cache_exists( name=self.get_hash(), cache_args=cache_args ): + self.log_to_terminal( + f"Cache exists for {self.__class__} with hash: {self.get_hash()}. Using the cache." + ) return self._get_cache(name=self.get_hash(), cache_args=cache_args) data = self.custom_transform(data, **transform_args) From 4d55f7f1e9c039486e817764c1de5d35f6329820 Mon Sep 17 00:00:00 2001 From: schobbejak Date: Fri, 8 Mar 2024 10:41:46 +0100 Subject: [PATCH 3/9] Add log_to_terminal method to test classes --- .gitignore | 2 +- tests/pipeline/model/training/test_training.py | 3 +++ tests/pipeline/model/training/test_training_block.py | 3 +++ tests/pipeline/model/transformation/test_transformation.py | 3 +++ .../pipeline/model/transformation/test_transformation_block.py | 3 +++ 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1aa502a..4510c5e 100644 --- a/.gitignore +++ b/.gitignore @@ -216,4 +216,4 @@ plots/images/train coverage_re/ # Ignore vscode settings -.vscode/ \ No newline at end of file +.vscode/ diff --git a/tests/pipeline/model/training/test_training.py b/tests/pipeline/model/training/test_training.py index feee1e7..42d264a 100644 --- a/tests/pipeline/model/training/test_training.py +++ b/tests/pipeline/model/training/test_training.py @@ -43,6 +43,9 @@ class CustomTrainingPipeline(TrainingPipeline): def log_to_debug(self, message: str) -> None: return None + def log_to_terminal(self, message: str) -> None: + return None + t1 = TestTrainingBlock() t2 = TestTrainingBlock() diff --git a/tests/pipeline/model/training/test_training_block.py b/tests/pipeline/model/training/test_training_block.py index e4ab074..4d42be2 100644 --- a/tests/pipeline/model/training/test_training_block.py +++ b/tests/pipeline/model/training/test_training_block.py @@ -43,6 +43,9 @@ def custom_predict(self, x: int) -> int: def log_to_debug(self, message: str) -> None: return None + def log_to_terminal(self, message: str) -> None: + return None + tb = TestTrainingBlockImpl() cache_args = { "output_data_type": "numpy_array", diff --git a/tests/pipeline/model/transformation/test_transformation.py b/tests/pipeline/model/transformation/test_transformation.py index b46fc19..7b9a9bf 100644 --- a/tests/pipeline/model/transformation/test_transformation.py +++ b/tests/pipeline/model/transformation/test_transformation.py @@ -36,6 +36,9 @@ class CustomTransformationPipeline(TransformationPipeline): def log_to_debug(self, 
message: str) -> None: return None + def log_to_terminal(self, message: str) -> None: + return None + t1 = TestTransformationBlock() t2 = TestTransformationBlock() diff --git a/tests/pipeline/model/transformation/test_transformation_block.py b/tests/pipeline/model/transformation/test_transformation_block.py index fa580b3..7a3cb2e 100644 --- a/tests/pipeline/model/transformation/test_transformation_block.py +++ b/tests/pipeline/model/transformation/test_transformation_block.py @@ -63,6 +63,9 @@ def custom_transform(self, data: np.ndarray[int], **transform_args) -> int: def log_to_debug(self, message: str) -> None: return None + def log_to_terminal(self, message: str) -> None: + return None + tb = TestTransformationBlockImpl() cache_args = { "output_data_type": "numpy_array", From f65d0adc10117f5760a6a1f3e742792e0cda60a6 Mon Sep 17 00:00:00 2001 From: schobbejak Date: Fri, 8 Mar 2024 10:43:57 +0100 Subject: [PATCH 4/9] Upgrade protocol of pickle dump --- epochalyst/_core/_caching/_cacher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/epochalyst/_core/_caching/_cacher.py b/epochalyst/_core/_caching/_cacher.py index a772ea1..2d38313 100644 --- a/epochalyst/_core/_caching/_cacher.py +++ b/epochalyst/_core/_caching/_cacher.py @@ -287,7 +287,11 @@ def _store_cache( elif storage_type == ".pkl": # Store the pickle file self.log_to_debug(f"Storing pickle file to {storage_path + name + '.pkl'}") - pickle.dump(data, open(storage_path + name + ".pkl", "wb")) + pickle.dump( + data, + open(storage_path + name + ".pkl", "wb"), + protocol=pickle.HIGHEST_PROTOCOL, + ) else: self.log_to_debug(f"Invalid storage type: {storage_type}") raise ValueError( From e08f1c3f573c4546457d4b5eeb0ec57f4b8e2b5c Mon Sep 17 00:00:00 2001 From: schobbejak Date: Fri, 8 Mar 2024 11:54:46 +0100 Subject: [PATCH 5/9] Update documentation for TorchTrainer and TransformationPipeline --- .../pipeline/model/training/torch_trainer.py | 90 ++++++++++++++----- .../model/transformation/transformation.py | 47 +++++++++- 2 files changed, 114 insertions(+), 23 deletions(-) diff --git a/epochalyst/pipeline/model/training/torch_trainer.py b/epochalyst/pipeline/model/training/torch_trainer.py index 34d510a..356e51b 100644 --- a/epochalyst/pipeline/model/training/torch_trainer.py +++ b/epochalyst/pipeline/model/training/torch_trainer.py @@ -24,30 +24,49 @@ class TorchTrainer(TrainingBlock): """Abstract class for torch trainers, override necessary functions for custom implementation. - To use this block, you must inherit from it and implement the following methods: - - `log_to_terminal` - - `log_to_debug` - - `log_to_warning` - - `log_to_external` - - `external_define_metric` - - :param model: The model to train. - :param optimizer: Optimizer to use for training. - :param criterion: Criterion to use for training. - :param scheduler: Scheduler to use for training. - :param epochs: Number of epochs - :param batch_size: Batch size - :param patience: Patience for early stopping - :param test_size: Relative size of the test set - + ### Parameters: + - `model` (nn.Module): The model to train. + - `optimizer` (functools.partial[Optimizer]): Optimizer to use for training. + - `criterion` (nn.Module): Criterion to use for training. + - `scheduler` (Callable[[Optimizer], LRScheduler] | None): Scheduler to use for training. 
+ - `epochs` (int): Number of epochs + - `batch_size` (int): Batch size + - `patience` (int): Patience for early stopping + - `test_size` (float): Relative size of the test set + + ### Methods: ```python - ------------------------------ + @abstractmethod + def log_to_terminal(self, message: str) -> None: + # Logs to terminal if implemented + + @abstractmethod + def log_to_debug(self, message: str) -> None: + # Logs to debugger if implemented + + @abstractmethod + def log_to_warning(self, message: str) -> None: + # Logs to warning if implemented + + @abstractmethod + def log_to_external(self, message: dict[str, Any], **kwargs: Any) -> None: + # Logs to external site + + @abstractmethod + def external_define_metric(self, metric: str, metric_type: str) -> None: + # Defines an external metric - def train(x: npt.NDArray[np.float32], y: npt.NDArray[np.float32], train_indices: list[int], test_indices: list[int], cache_size: int = -1, save_model: bool = True) -> tuple[npt.NDArray[np.float32], npt.NDArray[np.float32]]: - # Train the model. + def train(self, x: Any, y: Any, cache_args: dict[str, Any] = {}, **train_args: Any) -> tuple[Any, Any]: + # Applies caching and calls custom_train, overridding removes caching - def predict(x: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]: - # Predict on the test data. + def predict(self, x: Any, cache_args: dict[str, Any] = {}, **pred_args: Any) -> Any: + # Applies caching and calls custom_predict, overridding removes caching + + def custom_train(self, x: Any, y: Any, **train_args: Any) -> tuple[Any, Any]: + # Implements torch training. If you are going to override this method and not use any other functionality, inherit from TrainingBlock. + + def custom_predict(self, x: Any, **pred_args: Any) -> Any: + # Implements torch prediction. If you are going to override this method and not use any other functionality, inherit from TrainingBlock. def predict_on_loader(loader: DataLoader[tuple[Tensor, ...]]) -> npt.NDArray[np.float32]: # Predict using a dataloader. @@ -64,6 +83,35 @@ def create_dataloaders(train_dataset: Dataset[tuple[Tensor, ...]], test_dataset: def update_model_directory(model_directory: str) -> None: # Update the model directory for caching (default: tm). ``` + + ### Usage: + ```python + from epochalyst.pipeline.model.training.torch_trainer import TorchTrainer + from torch import nn + from torch.optim import Adam + from torch.optim.lr_scheduler import StepLR + from torch.nn import MSELoss + + class MyTorchTrainer(TorchTrainer): + + def log_to_terminal(self, message: str) -> None: + + .... 
+ + model = nn.Sequential(nn.Linear(1, 1)) + optimizer = functools.partial(Adam, lr=0.01) + criterion = MSELoss() + scheduler = functools.partial(StepLR, step_size=1, gamma=0.1) + epochs = 10 + batch_size = 32 + patience = 5 + test_size = 0.2 + + trainer = MyTorchTrainer(model=model, optimizer=optimizer, criterion=criterion, scheduler=scheduler, epochs=epochs, batch_size=batch_size, patience=patience, test_size=test_size) + + x, y = trainer.train(x, y) + x = trainer.predict(x) + ``` """ model: nn.Module diff --git a/epochalyst/pipeline/model/transformation/transformation.py b/epochalyst/pipeline/model/transformation/transformation.py index 77dcddb..c03e608 100644 --- a/epochalyst/pipeline/model/transformation/transformation.py +++ b/epochalyst/pipeline/model/transformation/transformation.py @@ -10,8 +10,51 @@ class TransformationPipeline(TransformingSystem, _Cacher, _Logger): """TransformationPipeline is the class used to create the pipeline for the transformation of the data. - :param steps: The steps to transform the data. Can be a list of Transformers, TransformationPipelines, or a combination of both. - :param title: The title of the pipeline. (Default: "Transformation Pipeline") + ### Parameters: + - `steps` (List[Union[Transformer, TransformationPipeline]]): The steps to transform the data. Can be a list of Transformers, TransformationPipelines, or a combination of both. + - `title` (str): The title of the pipeline. (Default: "Transformation Pipeline") + + ### Methods: + ```python + @abstractmethod + def log_to_terminal(self, message: str) -> None: # Log the message to the terminal. + + @abstractmethod + def log_to_debug(self, message: str) -> None: # Log the message to the debug file. + + @abstractmethod + def log_to_warning(self, message: str) -> None: # Log the message to the warning file. + + @abstractmethod + def log_to_external(self, message: str) -> None: # Log the message to an external file. + + @abstractmethod + def log_section_separator(self, title: str) -> None: # Log a section separator to the terminal. + + @abstractmethod + def external_define_metric(self, metric: str, metric_type: str) -> None: # Define a metric to be logged to an external file. + + def transform(self, data: Any, cache_args: dict[str, Any] = {}, **transform_args: Any) -> Any: # Transform the input data. + + def get_hash(self) -> str: # Get the hash of the pipeline. + ``` + + ### Usage: + ```python + from epochalyst.pipeline.model.transformation import TransformationPipeline + + class MyTransformationPipeline(TransformationPipeline): + def log_to_terminal(self, message: str) -> None: + print(message) + + .... + + step1 = MyTransformer1() + step2 = MyTransformer2() + pipeline = MyTransformationPipeline(steps=[step1, step2]) + + data = pipeline.transform(data) + ``` """ title: str = "Transformation Pipeline" # The title of the pipeline since transformation pipeline can be used for multiple purposes. (Feature, Label, etc.) 
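
Patches 2 and 3 belong together: once the pipelines log a cache-hit message through `log_to_terminal`, every concrete subclass, including the test doubles, has to provide that method, which is why each test class gains a no-op implementation alongside `log_to_debug`. A minimal sketch of a concrete pipeline in the spirit of those test classes (the class name is illustrative, and only the two logging hooks the tests rely on are shown):

```python
from epochalyst.pipeline.model.transformation import TransformationPipeline


class MyTransformationPipeline(TransformationPipeline):
    def log_to_debug(self, message: str) -> None:
        # The test classes use the same no-op for debug output.
        return None

    def log_to_terminal(self, message: str) -> None:
        # After patch 2, transform() calls this on a cache hit, e.g.
        # "Cache exists for Transformation Pipeline with hash: <hash>. Using the cache."
        print(message)
```
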
From aa806a5258bf1a653e2d8b24d17ea8880e57b6c7 Mon Sep 17 00:00:00 2001 From: schobbejak Date: Fri, 8 Mar 2024 11:55:07 +0100 Subject: [PATCH 6/9] Pre-commit fixes --- .gitignore | 2 +- .../pipeline/model/training/torch_trainer.py | 26 +++++++++---------- .../model/transformation/transformation.py | 18 ++++++------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 1aa502a..4510c5e 100644 --- a/.gitignore +++ b/.gitignore @@ -216,4 +216,4 @@ plots/images/train coverage_re/ # Ignore vscode settings -.vscode/ \ No newline at end of file +.vscode/ diff --git a/epochalyst/pipeline/model/training/torch_trainer.py b/epochalyst/pipeline/model/training/torch_trainer.py index 356e51b..1268048 100644 --- a/epochalyst/pipeline/model/training/torch_trainer.py +++ b/epochalyst/pipeline/model/training/torch_trainer.py @@ -37,35 +37,35 @@ class TorchTrainer(TrainingBlock): ### Methods: ```python @abstractmethod - def log_to_terminal(self, message: str) -> None: + def log_to_terminal(self, message: str) -> None: # Logs to terminal if implemented @abstractmethod - def log_to_debug(self, message: str) -> None: + def log_to_debug(self, message: str) -> None: # Logs to debugger if implemented @abstractmethod - def log_to_warning(self, message: str) -> None: + def log_to_warning(self, message: str) -> None: # Logs to warning if implemented @abstractmethod - def log_to_external(self, message: dict[str, Any], **kwargs: Any) -> None: + def log_to_external(self, message: dict[str, Any], **kwargs: Any) -> None: # Logs to external site @abstractmethod - def external_define_metric(self, metric: str, metric_type: str) -> None: + def external_define_metric(self, metric: str, metric_type: str) -> None: # Defines an external metric - def train(self, x: Any, y: Any, cache_args: dict[str, Any] = {}, **train_args: Any) -> tuple[Any, Any]: + def train(self, x: Any, y: Any, cache_args: dict[str, Any] = {}, **train_args: Any) -> tuple[Any, Any]: # Applies caching and calls custom_train, overridding removes caching - def predict(self, x: Any, cache_args: dict[str, Any] = {}, **pred_args: Any) -> Any: + def predict(self, x: Any, cache_args: dict[str, Any] = {}, **pred_args: Any) -> Any: # Applies caching and calls custom_predict, overridding removes caching - def custom_train(self, x: Any, y: Any, **train_args: Any) -> tuple[Any, Any]: - # Implements torch training. If you are going to override this method and not use any other functionality, inherit from TrainingBlock. - - def custom_predict(self, x: Any, **pred_args: Any) -> Any: + def custom_train(self, x: Any, y: Any, **train_args: Any) -> tuple[Any, Any]: + # Implements torch training. If you are going to override this method and not use any other functionality, inherit from TrainingBlock. + + def custom_predict(self, x: Any, **pred_args: Any) -> Any: # Implements torch prediction. If you are going to override this method and not use any other functionality, inherit from TrainingBlock. def predict_on_loader(loader: DataLoader[tuple[Tensor, ...]]) -> npt.NDArray[np.float32]: @@ -93,9 +93,9 @@ def update_model_directory(model_directory: str) -> None: from torch.nn import MSELoss class MyTorchTrainer(TorchTrainer): - + def log_to_terminal(self, message: str) -> None: - + .... 
model = nn.Sequential(nn.Linear(1, 1)) diff --git a/epochalyst/pipeline/model/transformation/transformation.py b/epochalyst/pipeline/model/transformation/transformation.py index c03e608..c20c540 100644 --- a/epochalyst/pipeline/model/transformation/transformation.py +++ b/epochalyst/pipeline/model/transformation/transformation.py @@ -18,35 +18,35 @@ class TransformationPipeline(TransformingSystem, _Cacher, _Logger): ```python @abstractmethod def log_to_terminal(self, message: str) -> None: # Log the message to the terminal. - + @abstractmethod def log_to_debug(self, message: str) -> None: # Log the message to the debug file. - + @abstractmethod def log_to_warning(self, message: str) -> None: # Log the message to the warning file. - + @abstractmethod def log_to_external(self, message: str) -> None: # Log the message to an external file. - + @abstractmethod def log_section_separator(self, title: str) -> None: # Log a section separator to the terminal. - + @abstractmethod def external_define_metric(self, metric: str, metric_type: str) -> None: # Define a metric to be logged to an external file. - + def transform(self, data: Any, cache_args: dict[str, Any] = {}, **transform_args: Any) -> Any: # Transform the input data. - + def get_hash(self) -> str: # Get the hash of the pipeline. ``` - ### Usage: + ### Usage: ```python from epochalyst.pipeline.model.transformation import TransformationPipeline class MyTransformationPipeline(TransformationPipeline): def log_to_terminal(self, message: str) -> None: print(message) - + .... step1 = MyTransformer1() From bde7400df6ecc9e637a9ad39c3539c12f7c04002 Mon Sep 17 00:00:00 2001 From: schobbejak Date: Fri, 8 Mar 2024 12:22:22 +0100 Subject: [PATCH 7/9] Update version in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f6ad3d5..8f59573 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "epochalyst" -version = "0.1.2" +version = "0.1.3" authors = [ { name = "Jasper van Selm", email = "jmvanselm@gmail.com" }, { name = "Ariel Ebersberger", email = "arielebersberger@gmail.com" }, From 75fb9788f0c3740cd69feeb8c5e54a25151f5fb7 Mon Sep 17 00:00:00 2001 From: Jeffrey Lim Date: Fri, 8 Mar 2024 13:38:03 +0100 Subject: [PATCH 8/9] Fix pyproject --- pyproject.toml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8f59573..c0dfead 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,20 +17,20 @@ classifiers = [ ] dependencies= [ # Machine Learning Libraries - "numpy~=1.26.4", + "numpy>=1.22.4", # Parallel Processing Libraries - "dask~=2023.12.0", + "dask>=2023.12.0", # Data Processing Libraries - "pandas~=1.3.3", + "pandas>=1.3.0", # Parquet - "pyarrow~=6.0.0", + "pyarrow>=6.0.0", # PyTorch - "torch~=2.2.1", + "torch>=2.1.0", # Agogos - "agogos~=0.2.0", + "agogos>=0.2.1", ] From 7e6011cb95209c1f7381252fd297d0ddf0781208 Mon Sep 17 00:00:00 2001 From: schobbejak Date: Fri, 8 Mar 2024 13:56:28 +0100 Subject: [PATCH 9/9] Update pyarrow version in requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b7cbcd5..923b87d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ pandas==2.2.1 partd==1.4.1 pillow==10.2.0 pluggy==1.4.0 -pyarrow==6.0.0 +pyarrow>=6.0.0 pytest==8.0.2 pytest-cov==4.1.0 PyYAML==6.0.1
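
The series opens by tightening every dependency to a compatible-release pin (patch 1) and closes by relaxing the same entries back to minimum-version pins (patches 8 and 9). For anyone unsure what the two operators admit, a quick check with the `packaging` library (not a dependency of this repository; used here purely for illustration) spells it out:

```python
from packaging.specifiers import SpecifierSet

tight = SpecifierSet("~=1.26.4")   # patch 1 style: >=1.26.4 and ==1.26.*
loose = SpecifierSet(">=1.22.4")   # patch 8 style: any release from 1.22.4 onwards

print("1.26.5" in tight)  # True  (patch release within the 1.26 line)
print("1.27.0" in tight)  # False (outside the compatible-release window)
print("1.27.0" in loose)  # True
print("2.0.0" in loose)   # True  (major upgrades are allowed through)
```
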