pirocheto committed Nov 22, 2023
1 parent 33ecf95 commit 3b90ea5
Showing 5 changed files with 42 additions and 24 deletions.
6 changes: 6 additions & 0 deletions dvc.yaml
@@ -1,5 +1,11 @@
params:
  - dvclive/params.yaml
metrics:
  - dvclive/metrics.json
plots:
  - dvclive/plots/metrics:
      x: step
artifacts:
  model:
    path: dvclive/model/model.pkl
    type: model
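
(For context: these entries point at files that DVCLive writes during a run. A minimal sketch of the calls that would produce them, assuming the dvclive 3.x Python API and its default dvclive/ output directory; the parameter and metric values below are purely illustrative.)

from dvclive import Live

with Live() as live:  # writes under the dvclive/ directory by default
    live.log_param("cls", "MultinomialNB")  # -> dvclive/params.yaml
    for step in range(3):
        live.log_metric("accuracy", 0.90 + 0.01 * step)  # -> dvclive/plots/metrics/accuracy.tsv
        live.next_step()  # advances the step used as the plots' x-axis
    # On exit, a summary of the latest values is written to dvclive/metrics.json.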
14 changes: 1 addition & 13 deletions dvclive/model/params.yaml
@@ -1,13 +1 @@
cls__estimator__C: 8.809631367836817e-05
cls__estimator__loss: squared_hinge
cls__estimator__tol: 1.5261822420077893e-05
tfidf__char__lowercase: false
tfidf__char__ngram_range: !!python/tuple
- 1
- 2
tfidf__char__use_idf: false
tfidf__word__lowercase: false
tfidf__word__ngram_range: !!python/tuple
- 1
- 2
tfidf__word__use_idf: false
{}
1 change: 1 addition & 0 deletions dvclive/params.yaml
@@ -0,0 +1 @@
cls: MultinomialNB
39 changes: 31 additions & 8 deletions scripts/compare_classifiers.py
@@ -1,6 +1,10 @@
import pickle
from pathlib import Path

import dvc.api
import numpy as np
import pandas as pd
import yaml
from rich.pretty import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
@@ -16,6 +20,13 @@

DATA_PATH = "data/data.csv"

classifiers = [
    ("svm", LinearSVC(dual="auto")),
    ("lr", LogisticRegression()),
    ("knn", KNeighborsClassifier()),
    ("nb", MultinomialNB()),
]


# Function to load data
def load_data(path):
@@ -27,14 +38,6 @@ def load_data(path):
    return X_train, y_train


classifiers = [
    ("svm", LinearSVC(dual="auto")),
    ("lr", LogisticRegression()),
    ("knn", KNeighborsClassifier()),
    ("nb", MultinomialNB()),
]


# Function to print the best trial results
def print_best_exps(n=10):
    pd.set_option("display.max_columns", None)
@@ -60,6 +63,8 @@ def main():
    for exp_name, classifier in classifiers:
        print(f"Experiment '{exp_name}' in progress...")
        with Live(exp_name=exp_name) as live:
            live.log_param("cls", classifier.__class__.__name__)

            tfidf = FeatureUnion(
                [
                    ("word", TfidfVectorizer()),
@@ -86,6 +91,24 @@
                ],
            )

            # Create a directory to save the model
            model_dir = Path(live.dir) / "model"
            model_dir.mkdir(exist_ok=True)

            # Save the model to a pickle file
            model_path = model_dir / "model.pkl"
            model_path.write_bytes(pickle.dumps(model))

            # Log the model as an artifact using dvclive
            live.log_artifact(model_path, type="model", cache=False)

            # Save parameters to a YAML file
            params_path = model_dir / "params.yaml"
            with open(params_path, "w") as fp:
                yaml.dump({}, fp)

            live.log_artifact(params_path, cache=False)

            for name, values in scores.items():
                if name.startswith("test_"):
                    name = name.replace("test_", "")
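
(The script imports dvc.api; a hedged example of how a model artifact logged this way could later be read back from the repository. The call below is illustrative usage, not part of this commit.)

import pickle

import dvc.api

# Illustrative: open the tracked pickle from the repo and deserialize it.
# The path matches the artifact declared in dvc.yaml above.
with dvc.api.open("dvclive/model/model.pkl", mode="rb") as f:
    model = pickle.load(f)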
6 changes: 3 additions & 3 deletions scripts/optimize_model.py
@@ -108,16 +108,16 @@ def __call__(self, trial) -> Any:
            model_path = model_dir / "model.pkl"
            model_path.write_bytes(pickle.dumps(model))

            # Log the model as an artifact using dvclive
            live.log_artifact(model_path, type="model", cache=False)

            # Save parameters to a YAML file
            params_path = model_dir / "params.yaml"
            with open(params_path, "w") as fp:
                yaml.dump(params, fp)

            live.log_artifact(params_path, cache=False)

            # Log the model as an artifact using dvclive
            live.log_artifact(model_path, type="model", cache=False)

            scores = cross_validate(
                model,
                self.X_train,
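
(Once the experiments have run, their params and metrics can be compared programmatically, in the spirit of print_best_exps() in compare_classifiers.py. A hedged sketch, assuming DVC 3.x exposes dvc.api.exp_show(); the resulting column names depend on what was logged.)

import dvc.api
import pandas as pd

# Assumed API: dvc.api.exp_show() returns one dict per experiment,
# suitable for building a DataFrame of params and metrics.
exps = dvc.api.exp_show()
df = pd.DataFrame(exps)
print(df.head(10))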
