diff --git a/README.md b/README.md index 1820dfe..d966c52 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ conda activate penguins-env 3. Run the project ``` -kedro run +kedro run ``` ## PyCharm Setup diff --git a/src/penguins/__main__.py b/src/penguins/__main__.py index 36bdd3d..4e243a0 100644 --- a/src/penguins/__main__.py +++ b/src/penguins/__main__.py @@ -6,7 +6,7 @@ from kedro.framework.cli.utils import KedroCliError, load_entry_points from kedro.framework.project import configure_project - +from kedro.framework.cli.project import run as kedro_framework_cli_project_run def _find_run_command(package_name): try: @@ -21,9 +21,7 @@ def _find_run_command(package_name): # use run command from installed plugin if it exists return run # use run command from `kedro.framework.cli.project` - from kedro.framework.cli.project import run - - return run + return kedro_framework_cli_project_run # fail badly if cli.py exists, but has no `cli` in it if not hasattr(project_cli, "cli"): raise KedroCliError(f"Cannot load commands from {package_name}.cli") @@ -34,9 +32,10 @@ def _find_run_command_in_plugins(plugins): for group in plugins: if "run" in group.commands: return group.commands["run"] - + return None def main(*args, **kwargs): + """Program entry point""" package_name = Path(__file__).parent.name configure_project(package_name) run = _find_run_command(package_name) diff --git a/src/penguins/pipelines/modeling/nodes.py b/src/penguins/pipelines/modeling/nodes.py index 8323cf9..63a1c7a 100644 --- a/src/penguins/pipelines/modeling/nodes.py +++ b/src/penguins/pipelines/modeling/nodes.py @@ -1,4 +1,5 @@ -from typing import Dict, Tuple +"""Nodes for the modeling pipeline.""" +from typing import Tuple import mlflow import pandas as pd @@ -6,17 +7,19 @@ from sklearn.model_selection import train_test_split -def split_data(data: pd.DataFrame, parameters: Dict) -> Tuple: +def split_data(data: pd.DataFrame) -> Tuple: + """Split data into train and test sets.""" train, test = 
train_test_split(data, test_size=0.2) - return test, test + return train, test def train_model(train: pd.DataFrame, test: pd.DataFrame) -> TabularPredictor: + """Train a model on the given data.""" mlflow.set_experiment("penguins") classificator = TabularPredictor(label="species", log_to_file=False, problem_type="multiclass", eval_metric="accuracy") classificator.fit(train, time_limit=120) - y_pred = classificator.evaluate(test) + classificator.evaluate(test) for key, value in classificator.fit_summary()["model_performance"].items(): mlflow.log_metric(f"{key}_accuracy", value) return classificator diff --git a/src/penguins/pipelines/modeling/pipeline.py b/src/penguins/pipelines/modeling/pipeline.py index 04c885d..98897be 100644 --- a/src/penguins/pipelines/modeling/pipeline.py +++ b/src/penguins/pipelines/modeling/pipeline.py @@ -5,14 +5,15 @@ from kedro.pipeline import Pipeline, node, pipeline -from .nodes import split_data, train_model +from penguins.pipelines.modeling.nodes import split_data, train_model -def create_pipeline(**kwargs) -> Pipeline: +def create_pipeline() -> Pipeline: + """Create the kedro modeling pipeline.""" return pipeline([ node( func=split_data, - inputs=["model_input_table", "params:model_options"], + inputs=["model_input_table"], outputs=["train", "test"], name="split_data_node", ), diff --git a/src/penguins/pipelines/preprocessing/nodes.py b/src/penguins/pipelines/preprocessing/nodes.py index 8d53d9f..5d13b83 100644 --- a/src/penguins/pipelines/preprocessing/nodes.py +++ b/src/penguins/pipelines/preprocessing/nodes.py @@ -1,3 +1,4 @@ +"""Nodes for the preprocessing pipeline.""" import pandas as pd from sklearn.preprocessing import LabelEncoder @@ -5,6 +6,7 @@ def preprocess_penguins( penguins: pd.DataFrame ) -> pd.DataFrame: + """Preprocess the penguins data by encoding categorical columns.""" # to encode island_encoder = LabelEncoder() penguins["island"] = island_encoder.fit_transform(penguins["island"]) @@ -24,7 +26,8 @@ def 
preprocess_penguins( def create_model_input_table( - preprocess_penguins: pd.DataFrame + preprocessed_penguins: pd.DataFrame ) -> pd.DataFrame: - model_input_table = preprocess_penguins.dropna() + """Create a model input table by dropping rows with missing values.""" + model_input_table = preprocessed_penguins.dropna() return model_input_table diff --git a/src/penguins/pipelines/preprocessing/pipeline.py b/src/penguins/pipelines/preprocessing/pipeline.py index 0ccf07f..b6f2d90 100644 --- a/src/penguins/pipelines/preprocessing/pipeline.py +++ b/src/penguins/pipelines/preprocessing/pipeline.py @@ -5,10 +5,11 @@ from kedro.pipeline import Pipeline, node, pipeline -from .nodes import preprocess_penguins, create_model_input_table +from penguins.pipelines.preprocessing.nodes import preprocess_penguins, create_model_input_table -def create_pipeline(**kwargs) -> Pipeline: +def create_pipeline() -> Pipeline: + """Create the kedro preprocessing pipeline.""" return pipeline([ node( func=preprocess_penguins, diff --git a/src/penguins/pipelines/serving/nodes.py b/src/penguins/pipelines/serving/nodes.py index 7e52a00..d2ef9e7 100644 --- a/src/penguins/pipelines/serving/nodes.py +++ b/src/penguins/pipelines/serving/nodes.py @@ -1,3 +1,4 @@ +"""Serving pipeline nodes""" import io import pickle @@ -5,7 +6,12 @@ from autogluon.tabular import TabularPredictor -def save_data(data: pd.DataFrame, classificator: TabularPredictor, encoders: pickle.OBJ) -> pd.DataFrame: +def save_data( + data: pd.DataFrame, + classificator: TabularPredictor, + encoders: dict + ) -> pd.DataFrame: + """Saves model data to a file""" df = pd.read_csv(io.StringIO(data), sep=",") df["island"] = encoders["island"].transform(df["island"]) diff --git a/src/penguins/pipelines/serving/pipeline.py b/src/penguins/pipelines/serving/pipeline.py index b0e58ca..f3e56ae 100644 --- a/src/penguins/pipelines/serving/pipeline.py +++ b/src/penguins/pipelines/serving/pipeline.py @@ -5,10 +5,11 @@ from kedro.pipeline import 
Pipeline, node, pipeline -from .nodes import save_data +from penguins.pipelines.serving.nodes import save_data -def create_pipeline(**kwargs) -> Pipeline: +def create_pipeline() -> Pipeline: + """Create the kedro serving pipeline.""" return pipeline([ node( func=save_data, diff --git a/src/penguins/settings.py b/src/penguins/settings.py index 6309d6f..63f103a 100644 --- a/src/penguins/settings.py +++ b/src/penguins/settings.py @@ -24,7 +24,8 @@ # Class that manages how configuration is loaded. from kedro.config import OmegaConfigLoader # noqa: import-outside-toplevel -CONFIG_LOADER_CLASS = OmegaConfigLoader +# Kedro looks up this exact setting name to pick the config loader class. +CONFIG_LOADER_CLASS = OmegaConfigLoader # Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor. # CONFIG_LOADER_ARGS = { # "config_patterns": { diff --git a/src/tests/test_run.py b/src/tests/test_run.py index dbd7c36..2a9f6d2 100644 --- a/src/tests/test_run.py +++ b/src/tests/test_run.py @@ -11,7 +11,7 @@ from pathlib import Path import pytest -from kedro.config import ConfigLoader +from kedro.config import OmegaConfigLoader from kedro.framework.context import KedroContext from kedro.framework.hooks import _create_hook_manager from kedro.framework.project import settings @@ -19,15 +19,17 @@ @pytest.fixture def config_loader(): - return ConfigLoader(conf_source=str(Path.cwd() / settings.CONF_SOURCE)) + """Load the config file for tests""" + return OmegaConfigLoader(conf_source=str(Path.cwd() / settings.CONF_SOURCE)) @pytest.fixture -def project_context(config_loader): +def project_context(config_loader): + """Introduce project context so tests behave like in a real project.""" return KedroContext( package_name="penguins", project_path=Path.cwd(), - config_loader=config_loader, + config_loader=config_loader, hook_manager=_create_hook_manager(), ) @@ -36,5 +38,10 @@ def project_context(config_loader): # and should be replaced with the ones testing the project # functionality class TestProjectContext: - def 
test_project_path(self, project_context): - assert project_context.project_path == Path.cwd() + """Example test for project context""" + def test_project_path(self, project_context): + """Example test for project path""" + assert project_context.project_path == Path.cwd() + def test_package_name(self, project_context): + """Example test for package name""" + assert project_context.package_name == "penguins"