From 96251f61c4f521e7a6766309051567773fde2e5a Mon Sep 17 00:00:00 2001 From: JunhaoQiu <56094690+qchiujunhao@users.noreply.github.com> Date: Thu, 13 Jun 2024 19:13:57 -0400 Subject: [PATCH] Regression (#3) * add regression restructure classes * better html report --- .gitignore | 3 +- tools/base_model_trainer.py | 164 ++++++++++++++++++++++++++++++++ tools/dashboard.py | 66 ++++++++++++- tools/pycaret_classification.py | 129 +++---------------------- tools/pycaret_regression.py | 23 +++++ tools/pycaret_train.py | 8 +- tools/pycaret_train.xml | 1 + 7 files changed, 272 insertions(+), 122 deletions(-) create mode 100644 tools/base_model_trainer.py create mode 100644 tools/pycaret_regression.py diff --git a/.gitignore b/.gitignore index 806aca1..8315d31 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ env *.txt *.pkl *.html -*.ipynb \ No newline at end of file +*.ipynb +__pycache__ \ No newline at end of file diff --git a/tools/base_model_trainer.py b/tools/base_model_trainer.py new file mode 100644 index 0000000..1d6a2cd --- /dev/null +++ b/tools/base_model_trainer.py @@ -0,0 +1,164 @@ +import sys +import pandas as pd +import os +import logging +import base64 + +logging.basicConfig(level=logging.DEBUG) +LOG = logging.getLogger(__name__) + +class BaseModelTrainer: + def __init__(self, input_file, target_col, output_dir): + self.exp = None # This will be set in the subclass + self.input_file = input_file + self.target_col = target_col + self.output_dir = output_dir + self.data = None + self.target = None + self.best_model = None + self.results = None + self.plots = {} + + def load_data(self): + LOG.info(f"Loading data from {self.input_file}") + self.data = pd.read_csv(self.input_file, sep=None, engine='python') + names = self.data.columns.to_list() + self.target = names[int(self.target_col)-1] + self.data = self.data.fillna(self.data.median(numeric_only=True)) + self.data.columns = self.data.columns.str.replace('.', '_') + + def setup_pycaret(self): + 
LOG.info("Initializing PyCaret") + self.exp.setup(self.data, target=self.target, + session_id=123, html=True, + log_experiment=False, system_log=False) + + def train_model(self): + LOG.info("Training and selecting the best model") + self.best_model = self.exp.compare_models() + self.results = self.exp.pull() + + def save_model(self): + LOG.info("Saving the model") + self.exp.save_model(self.best_model, "model.pkl") + + def generate_plots(self): + raise NotImplementedError("Subclasses should implement this method") + + def encode_image_to_base64(self, img_path): + with open(img_path, 'rb') as img_file: + return base64.b64encode(img_file.read()).decode('utf-8') + + def save_html_report(self): + LOG.info("Saving HTML report") + + model_name = type(self.best_model).__name__ + + # Save model summary + best_model_params = pd.DataFrame(self.best_model.get_params().items(), columns=['Parameter', 'Value']) + best_model_params.to_csv(os.path.join(self.output_dir, 'best_model.csv'), index=False) + + # Save comparison results + self.results.to_csv(os.path.join(self.output_dir, "comparison_results.csv")) + + # Read and encode plot images + plots_html = "" + for plot_name, plot_path in self.plots.items(): + encoded_image = self.encode_image_to_base64(plot_path) + plots_html += f""" +
+

{plot_name.capitalize()}

+ {plot_name} +
+ """ + + # Generate HTML content + html_content = f""" + + + + + + PyCaret Model Training Report + + + +
+

PyCaret Model Training Report

+

Best Model: {model_name}

+ + + {best_model_params.to_html(index=False, header=False, classes='table')} +
ParameterValue
+

Comparison Results

+ + {self.results.to_html(index=False, classes='table')} +
+

Plots

+ {plots_html} +
+ + + """ + + with open(os.path.join(self.output_dir, "comparison_result.html"), "w") as file: + file.write(html_content) + + def save_dashboard(self): + raise NotImplementedError("Subclasses should implement this method") + + def run(self): + self.load_data() + self.setup_pycaret() + self.train_model() + self.save_model() + self.generate_plots() + self.save_html_report() + self.save_dashboard() diff --git a/tools/dashboard.py b/tools/dashboard.py index fb9e6eb..557e93f 100644 --- a/tools/dashboard.py +++ b/tools/dashboard.py @@ -5,7 +5,7 @@ logging.basicConfig(level=logging.DEBUG) LOG = logging.getLogger(__name__) -def generate_dashboard( +def generate_classifier_explainer_dashboard( exp, estimator, display_format: str = "dash", @@ -76,4 +76,68 @@ def generate_dashboard( ) return ExplainerDashboard( explainer, mode=display_format, contributions=False, whatif=False, **dashboard_kwargs + ) + +def generate_regression_explainer_dashboard( + exp, + estimator, + display_format: str = "dash", + dashboard_kwargs: Optional[Dict[str, Any]] = None, + run_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ): + """ + This function is changed from pycaret.regression.oop.dashboard() + + This function generates the interactive dashboard for a trained model. The + dashboard is implemented using ExplainerDashboard (explainerdashboard.readthedocs.io) + + + estimator: scikit-learn compatible object + Trained model object + + + display_format: str, default = 'dash' + Render mode for the dashboard. The default is set to ``dash`` which will + render a dashboard in browser. There are four possible options: + + - 'dash' - displays the dashboard in browser + - 'inline' - displays the dashboard in the jupyter notebook cell. + - 'jupyterlab' - displays the dashboard in jupyterlab pane. + - 'external' - displays the dashboard in a separate tab. (use in Colab) + + + dashboard_kwargs: dict, default = {} (empty dict) + Dictionary of arguments passed to the ``ExplainerDashboard`` class. 
+ + + run_kwargs: dict, default = {} (empty dict) + Dictionary of arguments passed to the ``run`` method of ``ExplainerDashboard``. + + + **kwargs: + Additional keyword arguments to pass to the + ``RegressionExplainer`` class. + + + Returns: + ExplainerDashboard + """ + + dashboard_kwargs = dashboard_kwargs or {} + run_kwargs = run_kwargs or {} + + from explainerdashboard import ExplainerDashboard, RegressionExplainer + + # Replacing chars which dash doesn't accept in column names: `.`, `{`, `}` + X_test_df = exp.X_test_transformed.copy() + X_test_df.columns = [ + col.replace(".", "__").replace("{", "__").replace("}", "__") + for col in X_test_df.columns + ] + explainer = RegressionExplainer( + estimator, X_test_df, exp.y_test_transformed, **kwargs + ) + return ExplainerDashboard( + explainer, mode=display_format, contributions=False, whatif=False, shap_interaction=False, decision_trees=False, **dashboard_kwargs ) \ No newline at end of file diff --git a/tools/pycaret_classification.py b/tools/pycaret_classification.py index e65fb0f..d382cd1 100644 --- a/tools/pycaret_classification.py +++ b/tools/pycaret_classification.py @@ -1,130 +1,23 @@ -import sys -import pandas as pd +from base_model_trainer import BaseModelTrainer from pycaret.classification import ClassificationExperiment -import os import logging -from dashboard import generate_dashboard -from jinja_report.generate_report import main as generate_report -import base64 +from dashboard import generate_classifier_explainer_dashboard -logging.basicConfig(level=logging.DEBUG) LOG = logging.getLogger(__name__) -class ModelTrainer: +class ClassificationModelTrainer(BaseModelTrainer): def __init__(self, input_file, target_col, output_dir): + super().__init__(input_file, target_col, output_dir) self.exp = ClassificationExperiment() - self.input_file = input_file - self.target_col = target_col - self.output_dir = output_dir - self.data = None - self.target = None - self.best_model = None - 
self.results = None - self.plots = {} - - def load_data(self): - LOG.info(f"Loading data from {self.input_file}") - self.data = pd.read_csv(self.input_file, sep=None, engine='python') - names = self.data.columns.to_list() - self.target = names[int(self.target_col)-1] - self.data = self.data.fillna(self.data.median(numeric_only=True)) - self.data.columns = self.data.columns.str.replace('.', '_') - - def setup_pycaret(self): - LOG.info("Initializing PyCaret") - self.exp.setup(self.data, target=self.target, - session_id=123, html=True, - log_experiment=False, system_log=False) - - def train_model(self): - LOG.info("Training and selecting the best model") - self.best_model = self.exp.compare_models() - self.results = self.exp.pull() - - def save_model(self): - LOG.info("Saving the model") - self.exp.save_model(self.best_model, "model") + + def save_dashboard(self): + LOG.info("Saving explainer dashboard") + dashboard = generate_classifier_explainer_dashboard(self.exp, self.best_model) + dashboard.save_html("dashboard.html") def generate_plots(self): LOG.info("Generating and saving plots") - # Generate PyCaret plots - plots = ['auc', 'confusion_matrix', - 'threshold', - 'pr', 'error', - 'class_report', 'learning', - 'calibration', 'vc', - 'dimension', - 'manifold', 'rfe', - 'feature', 'feature_all'] + plots = ['auc', 'confusion_matrix', 'threshold', 'pr', 'error', 'class_report', 'learning', 'calibration', 'vc', 'dimension', 'manifold', 'rfe', 'feature', 'feature_all'] for plot_name in plots: - plot_path = self.exp.plot_model(self.best_model, plot=plot_name, - save=True) + plot_path = self.exp.plot_model(self.best_model, plot=plot_name, save=True) self.plots[plot_name] = plot_path - - def encode_image_to_base64(self, img_path): - with open(img_path, 'rb') as img_file: - return base64.b64encode(img_file.read()).decode('utf-8') - - def save_html_report(self): - LOG.info("Saving HTML report") - - model_name = type(self.best_model).__name__ - - report_data = { - "title": 
"PyCaret Model Training Report", - 'Best Model': [ - { - 'type': 'table', - 'src': os.path.join(self.output_dir, 'best_model.csv'), - 'label': f'Best Model: {model_name}' - } - ], - 'Comparison Results': [ - { - 'type': 'table', - 'src': os.path.join(self.output_dir, 'comparison_results.csv'), - 'label': 'Comparison Result
The scoring grid with average cross-validation scores' - } - ], - "Plots": [] - } - - # Save model summary - best_model_params = pd.DataFrame(self.best_model.get_params().items(), columns=['Parameter', 'Value']) - best_model_params.to_csv(os.path.join(self.output_dir, 'best_model.csv'), index=False) - - # Save comparison results - self.results.to_csv(os.path.join(self.output_dir, "comparison_results.csv")) - - # Add plots to the report data - for plot_name, plot_path in self.plots.items(): - encoded_image = self.encode_image_to_base64(plot_path) - report_data['Plots'].append({ - 'type': 'html', - 'src': f'data:image/png;base64,{encoded_image}', - 'label': plot_name.capitalize() - }) - - generate_report(inputs=report_data, outfile=os.path.join(self.output_dir, "comparison_result.html")) - - def save_dashboard(self): - LOG.info("Saving explainer dashboard") - dashboard = generate_dashboard(self.exp, self.best_model) - dashboard.save_html("dashboard.html") - - def run(self): - self.load_data() - self.setup_pycaret() - self.train_model() - self.save_model() - self.generate_plots() - self.save_html_report() - self.save_dashboard() - -if __name__ == "__main__": - input_file = sys.argv[1] - target_col = sys.argv[2] - output_dir = sys.argv[3] - - trainer = ModelTrainer(input_file, target_col, output_dir) - trainer.run() diff --git a/tools/pycaret_regression.py b/tools/pycaret_regression.py new file mode 100644 index 0000000..f04700b --- /dev/null +++ b/tools/pycaret_regression.py @@ -0,0 +1,23 @@ +from base_model_trainer import BaseModelTrainer +from pycaret.regression import RegressionExperiment +from dashboard import generate_regression_explainer_dashboard +import logging + +LOG = logging.getLogger(__name__) + +class RegressionModelTrainer(BaseModelTrainer): + def __init__(self, input_file, target_col, output_dir): + super().__init__(input_file, target_col, output_dir) + self.exp = RegressionExperiment() + + def save_dashboard(self): + LOG.info("Saving explainer 
dashboard") + dashboard = generate_regression_explainer_dashboard(self.exp, self.best_model) + dashboard.save_html("dashboard.html") + + def generate_plots(self): + LOG.info("Generating and saving plots") + plots = ['residuals', 'error', 'cooks', 'learning', 'vc', 'manifold', 'rfe', 'feature'] + for plot_name in plots: + plot_path = self.exp.plot_model(self.best_model, plot=plot_name, save=True) + self.plots[plot_name] = plot_path diff --git a/tools/pycaret_train.py b/tools/pycaret_train.py index cc31bbe..acfcb46 100644 --- a/tools/pycaret_train.py +++ b/tools/pycaret_train.py @@ -1,7 +1,8 @@ import sys import logging -from pycaret_classification import ModelTrainer +from pycaret_classification import ClassificationModelTrainer +from pycaret_regression import RegressionModelTrainer logging.basicConfig(level=logging.DEBUG) LOG = logging.getLogger(__name__) @@ -12,5 +13,8 @@ model_type = sys.argv[4] if model_type == "classification": - trainer = ModelTrainer(input_file, target_col, output_dir) + trainer = ClassificationModelTrainer(input_file, target_col, output_dir) + trainer.run() +elif model_type == "regression": + trainer = RegressionModelTrainer(input_file, target_col, output_dir) trainer.run() \ No newline at end of file diff --git a/tools/pycaret_train.xml b/tools/pycaret_train.xml index dfca2f7..cf5b555 100644 --- a/tools/pycaret_train.xml +++ b/tools/pycaret_train.xml @@ -13,6 +13,7 @@ +