diff --git a/LR confusion matrix.png b/LR confusion matrix.png
new file mode 100644
index 0000000..7b371f4
Binary files /dev/null and b/LR confusion matrix.png differ
diff --git a/SVM confusion matrix.png b/SVM confusion matrix.png
new file mode 100644
index 0000000..a54c68e
Binary files /dev/null and b/SVM confusion matrix.png differ
diff --git a/output.png b/output.png
new file mode 100644
index 0000000..09a1f7f
Binary files /dev/null and b/output.png differ
diff --git a/src/classifier.py b/src/classifier.py
index ff0f0ff..733cf5c 100644
--- a/src/classifier.py
+++ b/src/classifier.py
@@ -1,26 +1,25 @@
 import numpy as np
-import pandas as pd
 import matplotlib.pyplot as plt
-from sklearn.svm import SVC, LinearSVC
-from sklearn.tree import DecisionTreeClassifier
+from sklearn.svm import LinearSVC
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import RobustScaler, StandardScaler
 from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
 from sklearn.model_selection import RandomizedSearchCV
-from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
+from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay
 from xgboost import XGBClassifier
 
 def fit_svm_classifier(X, y):
-    pipeline = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5, class_weight="balanced"))
+    pipeline = make_pipeline(RobustScaler(), LinearSVC(random_state=0, tol=1e-5, class_weight="balanced"))
     pipeline.fit(X, y)
     return pipeline
 
 def fit_logistic_regression(X, y):
     # It is called Logistic Regression but it is really a classifier (source: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
-    regression = LogisticRegression(class_weight= "balanced", solver="newton-cholesky")
-    regression.fit(X, y)
-    return regression
+
+    regression = LogisticRegression(class_weight="balanced", solver="newton-cholesky", max_iter=5000, n_jobs=-1)
+    pipeline = make_pipeline(StandardScaler(), regression)
+    pipeline.fit(X, y)
+    # Return the fitted pipeline, not the bare estimator, so that callers
+    # predict on scaled features too.
+    return pipeline
 
 def fit_random_forest(X, y):
@@ -29,34 +28,36 @@ def fit_random_forest(X, y):
     return forest
 
 def run_and_compare(train_X, train_y, test_x, test_y):
-    print(f"baseline: {f1_score(test_y, np.zeros(test_y.shape))}")
+    print(f"baseline balanced accuracy: {balanced_accuracy_score(test_y, np.zeros(test_y.shape))}")
     print("Running Support Vector Classifier ....")
     svm = fit_svm_classifier(train_X, train_y)
-    svm_f1 = f1_score(test_y, svm.predict(test_x), average = "weighted")
-    print(f"SVM f1-score: {svm_f1}")
-    plot_confusion_matrix(test_y, svm.predict(test_x))
+    svm_balanced_accuracy = balanced_accuracy_score(test_y, svm.predict(test_x))
+    print(f"SVM balanced accuracy: {svm_balanced_accuracy}")
+    plot_confusion_matrix(test_y, svm.predict(test_x), title="Support Vector Machine Confusion Matrix")
 
     print("Running Logistic Classifier ....")
     logit = fit_logistic_regression(train_X, train_y)
-    logit_f1 = f1_score(test_y, logit.predict(test_x))
-    print(f"Logistic Classifier f1-score: {logit_f1}")
-    plot_confusion_matrix(test_y, logit.predict(test_x))
+    logit_balanced_accuracy = balanced_accuracy_score(test_y, logit.predict(test_x))
+    print(f"Logistic Classifier balanced accuracy: {logit_balanced_accuracy}")
+    plot_confusion_matrix(test_y, logit.predict(test_x), title="Logistic Regression Confusion Matrix")
 
-    print("Running Random Forest Classifier ....")
-    forest = fit_random_forest(train_X, train_y)
-    forest_predictions = forest.predict(test_x)
-    forest_f1 = f1_score(test_y, forest_predictions)
-    print(f"DT f1-score: {forest_f1}")
-    plot_confusion_matrix(test_y, forest_predictions)
+    # print("Running Random Forest Classifier ....")
+    # forest = fit_random_forest(train_X, train_y)
+    # forest_predictions = forest.predict(test_x)
+    # forest_f1 = f1_score(test_y, forest_predictions)
+    # print(f"DT f1-score: {forest_f1}")
+    # plot_confusion_matrix(test_y, forest_predictions)
 
 def tune_hyperparameters(X, y, parameters, model):
     searcher = RandomizedSearchCV(model, parameters, scoring = "balanced_accuracy")
     searcher.fit(X, y)
     return searcher.best_params_, searcher.best_estimator_
 
-def plot_confusion_matrix(ground_truth, predictions):
-    confusion_array = confusion_matrix(ground_truth, predictions)
+def plot_confusion_matrix(ground_truth, predictions, title="Confusion Matrix"):
+    confusion_array = confusion_matrix(ground_truth, predictions, normalize="true")
     disp = ConfusionMatrixDisplay(confusion_matrix=confusion_array)
     disp.plot()
+    disp.ax_.set_title(title)
     plt.show()
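
A minimal sketch of how run_and_compare might be driven; the train_test_split call, the stratify option, and the X/y names are illustrative assumptions, not part of this patch:

    from sklearn.model_selection import train_test_split
    from classifier import run_and_compare

    # Hypothetical entry point: X is the cleaned feature matrix, y the binary target.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=0)
    run_and_compare(X_train, y_train, X_test, y_test)

Stratifying the split keeps the class ratio comparable between train and test, which matters here since the metrics and class_weight settings are all chosen for an imbalanced target.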
diff --git a/src/data-processing.ipynb b/src/data-processing.ipynb
index 158f0d7..012b96a 100644
--- a/src/data-processing.ipynb
+++ b/src/data-processing.ipynb
@@ -184,6 +184,16 @@
     "df[:] = imputer.fit_transform(df)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from get_feature_importance import get_feature_imp\n",
+    "get_feature_imp(df, target)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/src/get_feature_importance.py b/src/get_feature_importance.py
new file mode 100644
index 0000000..ded7ddc
--- /dev/null
+++ b/src/get_feature_importance.py
@@ -0,0 +1,13 @@
+from sklearn.tree import DecisionTreeClassifier
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+def get_feature_imp(X_train, y_train):
+    # Fit a single decision tree and plot its impurity-based feature importances.
+    tree = DecisionTreeClassifier()
+    tree.fit(X_train, y_train)
+    importances = tree.feature_importances_
+    feat_importances = pd.DataFrame(importances, index=X_train.columns, columns=["Importance"])
+    feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
+    feat_importances.plot(kind='barh', figsize=(80,60))
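
get_feature_imp only plots the importances. A sketch of reusing the same idea to select features instead; the top_features name and the 0.01 cutoff are arbitrary illustrations, not something this patch implements:

    from sklearn.tree import DecisionTreeClassifier
    import pandas as pd

    def top_features(X_train, y_train, threshold=0.01):
        # Fit the same kind of single decision tree and keep only the columns
        # whose impurity-based importance clears the (illustrative) cutoff.
        tree = DecisionTreeClassifier(random_state=0)
        tree.fit(X_train, y_train)
        imp = pd.Series(tree.feature_importances_, index=X_train.columns)
        return imp[imp > threshold].sort_values(ascending=False).index.tolist()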
diff --git a/src/util.py b/src/util.py
index 34c4933..d874c55 100644
--- a/src/util.py
+++ b/src/util.py
@@ -1,5 +1,5 @@
 import pandas as pd
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import RobustScaler
 from sklearn.decomposition import PCA
 
 # ignore_columns = ['APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG',
@@ -21,7 +21,8 @@ ignore_columns = ['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4',
                   'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
                   'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12',
                   'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
-                  'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
+                  'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
+                  'FLAG_MOBIL', 'FLAG_CONT_MOBILE', 'FLAG_EMP_PHONE']
 
 def defaultClean(df: pd.DataFrame) -> None:
     df.drop(ignore_columns, axis=1, inplace=True)
@@ -32,7 +33,7 @@ def dropColumnsBetween(df: pd.DataFrame, start: str, end: str) -> None:
 
 def run_pca(df: pd.DataFrame, explained_var: float) -> pd.DataFrame:
-    # Standardize the data
-    X_std = StandardScaler().fit_transform(df)
+    # Scale the data (RobustScaler uses median and IQR, so it is less sensitive to outliers)
+    X_std = RobustScaler().fit_transform(df)
     # running PCA
     pca = PCA(n_components=explained_var)
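
For reference, passing a float in (0, 1) as n_components makes PCA keep the smallest number of components whose cumulative explained variance reaches that fraction. A usage sketch, assuming df is the already-imputed numeric DataFrame and run_pca returns the reduced frame per its annotation (names illustrative):

    from util import defaultClean, run_pca

    defaultClean(df)             # drops ignore_columns in place
    reduced = run_pca(df, 0.95)  # components covering ~95% of the variance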