From 96251f61c4f521e7a6766309051567773fde2e5a Mon Sep 17 00:00:00 2001
From: JunhaoQiu <56094690+qchiujunhao@users.noreply.github.com>
Date: Thu, 13 Jun 2024 19:13:57 -0400
Subject: [PATCH] Regression (#3)
* add regression
restructure classes
* better html report
---
.gitignore | 3 +-
tools/base_model_trainer.py | 164 ++++++++++++++++++++++++++++++++
tools/dashboard.py | 66 ++++++++++++-
tools/pycaret_classification.py | 129 +++----------------------
tools/pycaret_regression.py | 23 +++++
tools/pycaret_train.py | 8 +-
tools/pycaret_train.xml | 1 +
7 files changed, 272 insertions(+), 122 deletions(-)
create mode 100644 tools/base_model_trainer.py
create mode 100644 tools/pycaret_regression.py
diff --git a/.gitignore b/.gitignore
index 806aca1..8315d31 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ env
*.txt
*.pkl
*.html
-*.ipynb
\ No newline at end of file
+*.ipynb
+__pycache__
\ No newline at end of file
diff --git a/tools/base_model_trainer.py b/tools/base_model_trainer.py
new file mode 100644
index 0000000..1d6a2cd
--- /dev/null
+++ b/tools/base_model_trainer.py
@@ -0,0 +1,164 @@
+import sys
+import pandas as pd
+import os
+import logging
+import base64
+
+logging.basicConfig(level=logging.DEBUG)
+LOG = logging.getLogger(__name__)
+
+class BaseModelTrainer:
+ def __init__(self, input_file, target_col, output_dir):
+     """Remember the run configuration and reset all pipeline state.
+
+     Args:
+         input_file: Path of the delimited text file read by ``load_data``.
+         target_col: 1-based position of the target column; ``load_data``
+             converts it with ``int``.
+         output_dir: Directory that receives the CSV and HTML report files.
+     """
+     # Caller-supplied configuration.
+     self.input_file, self.target_col, self.output_dir = input_file, target_col, output_dir
+     # Experiment object; subclasses assign it after calling this initializer.
+     self.exp = None
+     # Filled in by load_data().
+     self.data = self.target = None
+     # Filled in by train_model().
+     self.best_model = self.results = None
+     # Maps plot name -> saved image path; filled by the subclass generate_plots().
+     self.plots = {}
+
+ def load_data(self):
+     """Read the input table, normalise column names and resolve the target.
+
+     The delimiter is detected by pandas (``sep=None`` with the python
+     engine). Every ``.`` in a column name is replaced by ``_``, then
+     ``self.target`` is set to the name of the column at the 1-based
+     position ``self.target_col``. Missing numeric values are filled with
+     the median of their column.
+
+     Raises:
+         ValueError: If ``self.target_col`` cannot be converted to ``int``.
+         IndexError: If the position is outside ``1..number of columns``.
+     """
+     LOG.info(f"Loading data from {self.input_file}")
+     self.data = pd.read_csv(self.input_file, sep=None, engine='python')
+     # Rename BEFORE resolving the target: otherwise a target column whose
+     # name contains '.' would keep its old name in self.target and no
+     # longer match any column. regex=False makes '.' a literal on every
+     # pandas version instead of relying on the version-dependent default.
+     self.data.columns = self.data.columns.str.replace('.', '_', regex=False)
+     names = self.data.columns.to_list()
+     position = int(self.target_col)
+     # Positions are 1-based; 0 or a negative value would otherwise silently
+     # select a column counted from the end.
+     if not 1 <= position <= len(names):
+         raise IndexError(
+             f"target_col must be between 1 and {len(names)}, got {position}"
+         )
+     self.target = names[position - 1]
+     self.data = self.data.fillna(self.data.median(numeric_only=True))
+
+ def setup_pycaret(self):
+     """Call ``setup`` on the experiment with the loaded data and target.
+
+     The session id is fixed to 123 and both experiment logging and system
+     logging are switched off.
+     """
+     LOG.info("Initializing PyCaret")
+     setup_options = dict(
+         target=self.target,
+         session_id=123,
+         html=True,
+         log_experiment=False,
+         system_log=False,
+     )
+     self.exp.setup(self.data, **setup_options)
+
+ def train_model(self):
+     """Compare candidate models and keep the winner and the results table.
+
+     ``self.best_model`` receives the return value of ``compare_models()``;
+     ``self.results`` receives what ``pull()`` returns immediately afterwards
+     and is later written out as the comparison table of the report.
+     """
+     LOG.info("Training and selecting the best model")
+     experiment = self.exp
+     self.best_model = experiment.compare_models()
+     self.results = experiment.pull()
+
+ def save_model(self):
+     """Persist the selected model through the experiment's ``save_model``."""
+     LOG.info("Saving the model")
+     # NOTE(review): PyCaret's save_model presumably appends ".pkl" itself, so
+     # this name may end up as "model.pkl.pkl" - confirm it is the file name
+     # the tool wrapper expects.
+     model_name = "model.pkl"
+     self.exp.save_model(self.best_model, model_name)
+
+ def generate_plots(self):
+     """Hook for producing plots; concrete trainers must override it.
+
+     Raises:
+         NotImplementedError: Always, in this base class.
+     """
+     message = "Subclasses should implement this method"
+     raise NotImplementedError(message)
+
+ def encode_image_to_base64(self, img_path):
+     """Return the content of the file at *img_path* as base64 text.
+
+     Args:
+         img_path: Path of the file to read in binary mode.
+
+     Returns:
+         The base64 encoding of the file bytes, decoded to ``str`` as UTF-8.
+     """
+     with open(img_path, 'rb') as handle:
+         raw_bytes = handle.read()
+     encoded = base64.b64encode(raw_bytes)
+     return encoded.decode('utf-8')
+
+ def save_html_report(self):
+ """Write the model summary, the comparison table and the HTML report.
+
+ Three files are created inside ``self.output_dir``: ``best_model.csv``
+ (parameter/value pairs from ``best_model.get_params()``),
+ ``comparison_results.csv`` (``self.results``) and
+ ``comparison_result.html`` (the report assembled below).
+ """
+ LOG.info("Saving HTML report")
+
+ # The report title shows the class name of the selected estimator.
+ model_name = type(self.best_model).__name__
+
+ # Save model summary
+ best_model_params = pd.DataFrame(self.best_model.get_params().items(), columns=['Parameter', 'Value'])
+ best_model_params.to_csv(os.path.join(self.output_dir, 'best_model.csv'), index=False)
+
+ # Save comparison results
+ self.results.to_csv(os.path.join(self.output_dir, "comparison_results.csv"))
+
+ # Read and encode plot images
+ # NOTE(review): in this copy of the patch the markup inside the two
+ # template strings below looks stripped (no tags are visible and
+ # encoded_image is not referenced) - confirm against the committed file
+ # before relying on the text shown here.
+ plots_html = ""
+ for plot_name, plot_path in self.plots.items():
+ encoded_image = self.encode_image_to_base64(plot_path)
+ plots_html += f"""
+
+
{plot_name.capitalize()}
+
+
+ """
+
+ # Generate HTML content
+ html_content = f"""
+
+
+
+
+
+ PyCaret Model Training Report
+
+
+
+
+
PyCaret Model Training Report
+
Best Model: {model_name}
+
+ Parameter | Value |
+ {best_model_params.to_html(index=False, header=False, classes='table')}
+
+
Comparison Results
+
+ {self.results.to_html(index=False, classes='table')}
+
+
Plots
+ {plots_html}
+
+
+
+ """
+
+ # The report is written in text mode with the platform default encoding.
+ with open(os.path.join(self.output_dir, "comparison_result.html"), "w") as file:
+ file.write(html_content)
+
+ def save_dashboard(self):
+     """Hook for saving the explainer dashboard; concrete trainers must override it.
+
+     Raises:
+         NotImplementedError: Always, in this base class.
+     """
+     message = "Subclasses should implement this method"
+     raise NotImplementedError(message)
+
+ def run(self):
+     """Execute every pipeline stage once, in a fixed order.
+
+     Order: load data, set up the experiment, train, save the model,
+     generate plots, write the HTML report, save the dashboard. An
+     exception raised by a stage propagates and stops the remaining stages.
+     """
+     pipeline = (
+         self.load_data,
+         self.setup_pycaret,
+         self.train_model,
+         self.save_model,
+         self.generate_plots,
+         self.save_html_report,
+         self.save_dashboard,
+     )
+     for stage in pipeline:
+         stage()
diff --git a/tools/dashboard.py b/tools/dashboard.py
index fb9e6eb..557e93f 100644
--- a/tools/dashboard.py
+++ b/tools/dashboard.py
@@ -5,7 +5,7 @@
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
-def generate_dashboard(
+def generate_classifier_explainer_dashboard(
exp,
estimator,
display_format: str = "dash",
@@ -76,4 +76,68 @@ def generate_dashboard(
)
return ExplainerDashboard(
explainer, mode=display_format, contributions=False, whatif=False, **dashboard_kwargs
+ )
+
+def generate_regression_explainer_dashboard(
+    exp,
+    estimator,
+    display_format: str = "dash",
+    dashboard_kwargs: Optional[Dict[str, Any]] = None,
+    run_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
+):
+    """
+    Build an ExplainerDashboard for a trained regression model.
+
+    This function is adapted from pycaret.regression.oop.dashboard(). The
+    dashboard is implemented using ExplainerDashboard
+    (explainerdashboard.readthedocs.io). Unlike a call that also serves the
+    dashboard, this function only constructs and returns it.
+
+
+    exp: experiment object
+        Must expose ``X_test_transformed`` and ``y_test_transformed``; these
+        are the data handed to ``RegressionExplainer``.
+
+
+    estimator: scikit-learn compatible object
+        Trained model object
+
+
+    display_format: str, default = 'dash'
+        Render mode for the dashboard, passed on as ``mode``. The default is
+        set to ``dash`` which will render a dashboard in browser. There are
+        four possible options:
+
+        - 'dash' - displays the dashboard in browser
+        - 'inline' - displays the dashboard in the jupyter notebook cell.
+        - 'jupyterlab' - displays the dashboard in jupyterlab pane.
+        - 'external' - displays the dashboard in a separate tab. (use in Colab)
+
+
+    dashboard_kwargs: dict, default = {} (empty dict)
+        Dictionary of arguments passed to the ``ExplainerDashboard`` class, in
+        addition to the fixed ``contributions=False``, ``whatif=False``,
+        ``shap_interaction=False`` and ``decision_trees=False``.
+
+
+    run_kwargs: dict, default = {} (empty dict)
+        Currently unused: the dashboard is returned without calling its
+        ``run`` method. The parameter is kept so existing callers keep working.
+
+
+    **kwargs:
+        Additional keyword arguments to pass to the ``RegressionExplainer``
+        class.
+
+
+    Returns:
+        ExplainerDashboard
+    """
+
+    dashboard_kwargs = dashboard_kwargs or {}
+
+    # Imported lazily so the module can be imported without explainerdashboard.
+    from explainerdashboard import ExplainerDashboard, RegressionExplainer
+
+    # Replace the characters dash does not accept in column names
+    # (`.`, `{`, `}`) with a double underscore, in a single pass per name.
+    unsafe_chars = str.maketrans({".": "__", "{": "__", "}": "__"})
+    X_test_df = exp.X_test_transformed.copy()
+    X_test_df.columns = [col.translate(unsafe_chars) for col in X_test_df.columns]
+    explainer = RegressionExplainer(
+        estimator, X_test_df, exp.y_test_transformed, **kwargs
+    )
+    return ExplainerDashboard(
+        explainer,
+        mode=display_format,
+        contributions=False,
+        whatif=False,
+        shap_interaction=False,
+        decision_trees=False,
+        **dashboard_kwargs,
     )
\ No newline at end of file
diff --git a/tools/pycaret_classification.py b/tools/pycaret_classification.py
index e65fb0f..d382cd1 100644
--- a/tools/pycaret_classification.py
+++ b/tools/pycaret_classification.py
@@ -1,130 +1,23 @@
-import sys
-import pandas as pd
+from base_model_trainer import BaseModelTrainer
from pycaret.classification import ClassificationExperiment
-import os
import logging
-from dashboard import generate_dashboard
-from jinja_report.generate_report import main as generate_report
-import base64
+from dashboard import generate_classifier_explainer_dashboard
-logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
-class ModelTrainer:
+class ClassificationModelTrainer(BaseModelTrainer):
def __init__(self, input_file, target_col, output_dir):
+ super().__init__(input_file, target_col, output_dir)
self.exp = ClassificationExperiment()
- self.input_file = input_file
- self.target_col = target_col
- self.output_dir = output_dir
- self.data = None
- self.target = None
- self.best_model = None
- self.results = None
- self.plots = {}
-
- def load_data(self):
- LOG.info(f"Loading data from {self.input_file}")
- self.data = pd.read_csv(self.input_file, sep=None, engine='python')
- names = self.data.columns.to_list()
- self.target = names[int(self.target_col)-1]
- self.data = self.data.fillna(self.data.median(numeric_only=True))
- self.data.columns = self.data.columns.str.replace('.', '_')
-
- def setup_pycaret(self):
- LOG.info("Initializing PyCaret")
- self.exp.setup(self.data, target=self.target,
- session_id=123, html=True,
- log_experiment=False, system_log=False)
-
- def train_model(self):
- LOG.info("Training and selecting the best model")
- self.best_model = self.exp.compare_models()
- self.results = self.exp.pull()
-
- def save_model(self):
- LOG.info("Saving the model")
- self.exp.save_model(self.best_model, "model")
+
+ def save_dashboard(self):
+ LOG.info("Saving explainer dashboard")
+ dashboard = generate_classifier_explainer_dashboard(self.exp, self.best_model)
+ dashboard.save_html("dashboard.html")
def generate_plots(self):
LOG.info("Generating and saving plots")
- # Generate PyCaret plots
- plots = ['auc', 'confusion_matrix',
- 'threshold',
- 'pr', 'error',
- 'class_report', 'learning',
- 'calibration', 'vc',
- 'dimension',
- 'manifold', 'rfe',
- 'feature', 'feature_all']
+ plots = ['auc', 'confusion_matrix', 'threshold', 'pr', 'error', 'class_report', 'learning', 'calibration', 'vc', 'dimension', 'manifold', 'rfe', 'feature', 'feature_all']
for plot_name in plots:
- plot_path = self.exp.plot_model(self.best_model, plot=plot_name,
- save=True)
+ plot_path = self.exp.plot_model(self.best_model, plot=plot_name, save=True)
self.plots[plot_name] = plot_path
-
- def encode_image_to_base64(self, img_path):
- with open(img_path, 'rb') as img_file:
- return base64.b64encode(img_file.read()).decode('utf-8')
-
- def save_html_report(self):
- LOG.info("Saving HTML report")
-
- model_name = type(self.best_model).__name__
-
- report_data = {
- "title": "PyCaret Model Training Report",
- 'Best Model': [
- {
- 'type': 'table',
- 'src': os.path.join(self.output_dir, 'best_model.csv'),
- 'label': f'Best Model: {model_name}'
- }
- ],
- 'Comparison Results': [
- {
- 'type': 'table',
- 'src': os.path.join(self.output_dir, 'comparison_results.csv'),
- 'label': 'Comparison Result
The scoring grid with average cross-validation scores'
- }
- ],
- "Plots": []
- }
-
- # Save model summary
- best_model_params = pd.DataFrame(self.best_model.get_params().items(), columns=['Parameter', 'Value'])
- best_model_params.to_csv(os.path.join(self.output_dir, 'best_model.csv'), index=False)
-
- # Save comparison results
- self.results.to_csv(os.path.join(self.output_dir, "comparison_results.csv"))
-
- # Add plots to the report data
- for plot_name, plot_path in self.plots.items():
- encoded_image = self.encode_image_to_base64(plot_path)
- report_data['Plots'].append({
- 'type': 'html',
- 'src': f'data:image/png;base64,{encoded_image}',
- 'label': plot_name.capitalize()
- })
-
- generate_report(inputs=report_data, outfile=os.path.join(self.output_dir, "comparison_result.html"))
-
- def save_dashboard(self):
- LOG.info("Saving explainer dashboard")
- dashboard = generate_dashboard(self.exp, self.best_model)
- dashboard.save_html("dashboard.html")
-
- def run(self):
- self.load_data()
- self.setup_pycaret()
- self.train_model()
- self.save_model()
- self.generate_plots()
- self.save_html_report()
- self.save_dashboard()
-
-if __name__ == "__main__":
- input_file = sys.argv[1]
- target_col = sys.argv[2]
- output_dir = sys.argv[3]
-
- trainer = ModelTrainer(input_file, target_col, output_dir)
- trainer.run()
diff --git a/tools/pycaret_regression.py b/tools/pycaret_regression.py
new file mode 100644
index 0000000..f04700b
--- /dev/null
+++ b/tools/pycaret_regression.py
@@ -0,0 +1,23 @@
+from base_model_trainer import BaseModelTrainer
+from pycaret.regression import RegressionExperiment
+from dashboard import generate_regression_explainer_dashboard
+import logging
+
+LOG = logging.getLogger(__name__)
+
+class RegressionModelTrainer(BaseModelTrainer):
+    """Trainer that runs the base pipeline with a PyCaret ``RegressionExperiment``."""
+
+    def __init__(self, input_file, target_col, output_dir):
+        """Forward the arguments to the base class and create the experiment.
+
+        Args:
+            input_file: Path of the input table.
+            target_col: 1-based position of the target column.
+            output_dir: Directory for the report files.
+        """
+        super().__init__(input_file, target_col, output_dir)
+        self.exp = RegressionExperiment()
+
+    def save_dashboard(self):
+        """Build the regression explainer dashboard and save it as HTML.
+
+        The file name ``dashboard.html`` is relative, so the file is written
+        to the current working directory.
+        """
+        LOG.info("Saving explainer dashboard")
+        dashboard = generate_regression_explainer_dashboard(self.exp, self.best_model)
+        dashboard.save_html("dashboard.html")
+
+    def generate_plots(self):
+        """Render the regression plots and record the path of each saved image.
+
+        Each successfully saved plot is stored in ``self.plots`` under its
+        plot name. A plot that cannot be produced for the selected model is
+        logged and skipped, so the remaining plots and the report are still
+        generated.
+        """
+        LOG.info("Generating and saving plots")
+        plots = ['residuals', 'error', 'cooks', 'learning', 'vc', 'manifold', 'rfe', 'feature']
+        for plot_name in plots:
+            try:
+                plot_path = self.exp.plot_model(self.best_model, plot=plot_name, save=True)
+            except Exception:
+                # Deliberately broad: which plots an estimator supports depends
+                # on the model picked by compare_models (e.g. importance-based
+                # plots need coef_ or feature_importances_), and one
+                # unsupported plot must not abort the whole run.
+                LOG.warning("Skipping plot '%s': it could not be generated", plot_name, exc_info=True)
+                continue
+            self.plots[plot_name] = plot_path
diff --git a/tools/pycaret_train.py b/tools/pycaret_train.py
index cc31bbe..acfcb46 100644
--- a/tools/pycaret_train.py
+++ b/tools/pycaret_train.py
@@ -1,7 +1,8 @@
import sys
import logging
-from pycaret_classification import ModelTrainer
+from pycaret_classification import ClassificationModelTrainer
+from pycaret_regression import RegressionModelTrainer
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
@@ -12,5 +13,8 @@
model_type = sys.argv[4]
if model_type == "classification":
- trainer = ModelTrainer(input_file, target_col, output_dir)
+ trainer = ClassificationModelTrainer(input_file, target_col, output_dir)
+ trainer.run()
+elif model_type == "regression":
+ trainer = RegressionModelTrainer(input_file, target_col, output_dir)
trainer.run()
\ No newline at end of file
diff --git a/tools/pycaret_train.xml b/tools/pycaret_train.xml
index dfca2f7..cf5b555 100644
--- a/tools/pycaret_train.xml
+++ b/tools/pycaret_train.xml
@@ -13,6 +13,7 @@
+