From 84137138525fce2fca0dba9b4eea5cc2c9a75658 Mon Sep 17 00:00:00 2001 From: lis-r-barreto Date: Sun, 23 Jun 2024 18:43:14 -0300 Subject: [PATCH] feat: Add dagshub token --- .github/workflows/pipeline.yml | 21 ----------- app/train.py | 64 ++++++++++++++++++---------------- tests/test_train.py | 24 ++++++------- 3 files changed, 46 insertions(+), 63 deletions(-) diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml index 1b26fca..958cbc0 100644 --- a/.github/workflows/pipeline.yml +++ b/.github/workflows/pipeline.yml @@ -31,27 +31,6 @@ jobs: run: | pytest tests/test_train.py - check_python_code_style: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - with: - fetch-depth: 0 - - - name: Set up Python 3.11 - uses: actions/setup-python@v1 - with: - python-version: 3.11 - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - - name: Validate Python Code Style - run: | - pycodestyle . - train_pipeline: runs-on: ubuntu-latest needs: [test_train] diff --git a/app/train.py b/app/train.py index aabb282..fa719b8 100644 --- a/app/train.py +++ b/app/train.py @@ -24,7 +24,7 @@ def reset_seeds(): Returns: None """ - os.environ['PYTHONHASHSEED'] = str(42) + os.environ["PYTHONHASHSEED"] = str(42) tf.random.set_seed(42) np.random.seed(42) random.seed(42) @@ -39,7 +39,8 @@ def read_data(): y (pandas.Series): The target vector of shape (n_samples,). """ data = pd.read_csv( - 'https://raw.githubusercontent.com/lis-r-barreto/mlops-mlflow-classification-experiment/main/data/fetal_health_reduced.csv') + "https://raw.githubusercontent.com/lis-r-barreto/mlops-mlflow-classification-experiment/main/data/fetal_health_reduced.csv" + ) X = data.drop(["fetal_health"], axis=1) y = data["fetal_health"] return X, y @@ -65,10 +66,9 @@ def process_data(X, y): X_df = scaler.fit_transform(X) X_df = pd.DataFrame(X_df, columns=columns_names) - X_train, X_test, y_train, y_test = train_test_split(X_df, - y, - test_size=0.3, - random_state=42) + X_train, X_test, y_train, y_test = train_test_split( + X_df, y, test_size=0.3, random_state=42 + ) y_train = y_train - 1 y_test = y_test - 1 @@ -89,13 +89,15 @@ def create_model(X): reset_seeds() model = Sequential() model.add(InputLayer(input_shape=(X.shape[1],))) - model.add(Dense(10, activation='relu')) - model.add(Dense(10, activation='relu')) - model.add(Dense(3, activation='softmax')) - - model.compile(loss='sparse_categorical_crossentropy', - optimizer='adam', - metrics=['accuracy']) + model.add(Dense(10, activation="relu")) + model.add(Dense(10, activation="relu")) + model.add(Dense(3, activation="softmax")) + + model.compile( + loss="sparse_categorical_crossentropy", + optimizer="adam", + metrics=["accuracy"], + ) return model @@ -119,17 +121,23 @@ def config_mlflow(): Returns: None """ - mlflow_username = os.getenv('MLFLOW_TRACKING_USERNAME') - mlflow_password = os.getenv('MLFLOW_TRACKING_PASSWORD') + # mlflow_username = os.getenv("MLFLOW_TRACKING_USERNAME") + mlflow_username = os.getenv("lis-r-barreto") + # mlflow_password = os.getenv("MLFLOW_TRACKING_PASSWORD") + mlflow_password = os.getenv("4e756087b8703385f28b2787fca9832416cc0c83") if not mlflow_username or not mlflow_password: - raise ValueError("MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD environment variables must be set") - os.environ['MLFLOW_TRACKING_USERNAME'] = mlflow_username - os.environ['MLFLOW_TRACKING_PASSWORD'] = mlflow_password - mlflow.set_tracking_uri('https://dagshub.com/lis-r-barreto/mlops-mlflow-experiments.mlflow') + raise ValueError( + "MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD environment variables must be set" + ) + os.environ["MLFLOW_TRACKING_USERNAME"] = mlflow_username + os.environ["MLFLOW_TRACKING_PASSWORD"] = mlflow_password + mlflow.set_tracking_uri( + "https://dagshub.com/lis-r-barreto/mlops-mlflow-experiments.mlflow" + ) - mlflow.tensorflow.autolog(log_models=True, - log_input_examples=True, - log_model_signatures=True) + mlflow.tensorflow.autolog( + log_models=True, log_input_examples=True, log_model_signatures=True + ) def train_model(model, X_train, y_train, is_train=True): @@ -147,15 +155,11 @@ def train_model(model, X_train, y_train, is_train=True): Returns: None """ - with mlflow.start_run(run_name='experiment_fetal_health') as run: - model.fit(X_train, - y_train, - epochs=50, - validation_split=0.2, - verbose=3) + with mlflow.start_run(run_name="experiment_fetal_health") as run: + model.fit(X_train, y_train, epochs=50, validation_split=0.2, verbose=3) if is_train: - run_uri = f'runs:/{run.info.run_id}' - mlflow.register_model(run_uri, 'fetal_health') + run_uri = f"runs:/{run.info.run_id}" + mlflow.register_model(run_uri, "fetal_health") if __name__ == "__main__": diff --git a/tests/test_train.py b/tests/test_train.py index c345acc..5484040 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -2,9 +2,7 @@ import pytest from tensorflow.keras.models import Sequential -from app.train import (read_data, - create_model, - train_model) +from app.train import read_data, create_model, train_model @pytest.fixture @@ -16,11 +14,13 @@ def sample_data(): pandas.DataFrame: A DataFrame containing sample data with three columns: 'feature1', 'feature2', and 'fetal_health'. """ - data = pd.DataFrame({ - 'feature1': [1, 2, 3, 4, 5], - 'feature2': [6, 7, 8, 9, 10], - 'fetal_health': [1, 1, 2, 3, 2] - }) + data = pd.DataFrame( + { + "feature1": [1, 2, 3, 4, 5], + "feature2": [6, 7, 8, 9, 10], + "fetal_health": [1, 1, 2, 3, 2], + } + ) return data @@ -66,9 +66,9 @@ def test_train_model(sample_data): Returns: None """ - X = sample_data.drop(['fetal_health'], axis=1) - y = sample_data['fetal_health'] - 1 + X = sample_data.drop(["fetal_health"], axis=1) + y = sample_data["fetal_health"] - 1 model = create_model(X) train_model(model, X, y, is_train=False) - assert model.history.history['loss'][-1] > 0 - assert model.history.history['val_loss'][-1] > 0 + assert model.history.history["loss"][-1] > 0 + assert model.history.history["val_loss"][-1] > 0