Skip to content

Commit

Permalink
added some new changes
Browse files Browse the repository at this point in the history
  • Loading branch information
hashgupta committed Nov 13, 2023
1 parent d4dc844 commit 33e6c03
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 27 deletions.
Binary file added LR confusion matrix.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added SVM confusion matrix.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added output.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
47 changes: 23 additions & 24 deletions src/classifier.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,25 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from xgboost import XGBClassifier


def fit_svm_classifier(X, y):
    """Fit a linear SVM classifier on (X, y).

    The features are passed through a RobustScaler first (less sensitive to
    outliers than StandardScaler), and class_weight="balanced" compensates
    for class imbalance in the target.

    Returns the fitted sklearn Pipeline (scaler + LinearSVC).
    """
    # NOTE: the rendered diff showed a stale StandardScaler pipeline line
    # immediately overwritten by this one; only the final version is kept.
    pipeline = make_pipeline(RobustScaler(), LinearSVC(random_state=0, tol=1e-5, class_weight="balanced"))
    pipeline.fit(X, y)
    return pipeline

def fit_logistic_regression(X, y):
    """Fit a scaled logistic-regression classifier on (X, y).

    It is called Logistic Regression but it is really a classifier
    (source: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

    Returns the fitted sklearn Pipeline (StandardScaler + LogisticRegression).
    """
    # n_jobs has no effect for the newton-cholesky solver — TODO confirm and
    # drop it if sklearn emits a warning.
    regression = LogisticRegression(class_weight="balanced", solver="newton-cholesky", max_iter=5000, n_jobs=-1)
    pipeline = make_pipeline(StandardScaler(), regression)
    pipeline.fit(X, y)
    # BUG FIX: the original returned the bare `regression` estimator even
    # though it was fitted inside the pipeline, so callers would predict on
    # UNSCALED features. Return the pipeline so the scaler is applied at
    # predict time too (callers only use .predict, which Pipeline provides).
    return pipeline

def fit_random_forest(X, y):
Expand All @@ -29,34 +28,34 @@ def fit_random_forest(X, y):
return forest

def run_and_compare(train_X, train_y, test_x, test_y):
    """Train each classifier, print its balanced accuracy on the test set,
    and plot its confusion matrix.

    The baseline is the balanced accuracy of always predicting class 0.
    """
    print(f"baseline balanced-accuracy-score: {balanced_accuracy_score(test_y, np.zeros(test_y.shape))}")

    print("Running Support Vector Classifier ....")
    svm = fit_svm_classifier(train_X, train_y)
    # Predict once and reuse — the original called svm.predict(test_x) twice.
    svm_predictions = svm.predict(test_x)
    svm_balanced_accuracy = balanced_accuracy_score(test_y, svm_predictions)
    print(f"SVM balanced accuracy: {svm_balanced_accuracy}")
    plot_confusion_matrix(test_y, svm_predictions, title="Support Vector Machine Confusion Matrix")

    print("Running Logistic Classifier ....")
    logit = fit_logistic_regression(train_X, train_y)
    logit_predictions = logit.predict(test_x)
    logit_balanced_accuracy = balanced_accuracy_score(test_y, logit_predictions)
    print(f"Logistic Classifier balanced accuracy: {logit_balanced_accuracy}")
    plot_confusion_matrix(test_y, logit_predictions, title="Logistic Regression Confusion Matrix")

    # The random-forest comparison was disabled in this commit; the dead
    # commented-out block has been removed (fit_random_forest still exists
    # above if it needs to be re-enabled).

def tune_hyperparameters(X, y, parameters, model):
    """Run a randomized hyperparameter search over `parameters` for `model`,
    scored by balanced accuracy, and return (best_params, best_estimator).
    """
    # RandomizedSearchCV.fit returns self, so construction and fitting chain.
    fitted_search = RandomizedSearchCV(model, parameters, scoring="balanced_accuracy").fit(X, y)
    return fitted_search.best_params_, fitted_search.best_estimator_

def plot_confusion_matrix(ground_truth, predictions):
confusion_array = confusion_matrix(ground_truth, predictions)
def plot_confusion_matrix(ground_truth, predictions, title = "Confusion Matrix"):
    """Display a row-normalized confusion matrix for the given predictions.

    Parameters:
        ground_truth: true labels.
        predictions: predicted labels, same length as ground_truth.
        title: figure title — NOTE(review): not applied anywhere in the
            lines visible here; confirm it is used further down in the
            function (e.g. via plt.title(title)) or it is a dead parameter.
    """
    # normalize="true" scales each row to sum to 1, so cells read as
    # per-class recall fractions rather than raw counts.
    confusion_array = confusion_matrix(ground_truth, predictions, normalize="true")
    disp = ConfusionMatrixDisplay(confusion_matrix=confusion_array)
    disp.plot()
    plt.show()
Expand Down
10 changes: 10 additions & 0 deletions src/data-processing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,16 @@
"df[:] = imputer.fit_transform(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from get_feature_importance import get_feature_imp\n",
"get_feature_imp(df, target)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
13 changes: 13 additions & 0 deletions src/get_feature_importance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import pandas as pd



def get_feature_imp(X_train, y_train):
    """Fit a decision tree on (X_train, y_train) and plot its feature
    importances as a horizontal bar chart, most important feature first.
    """
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    ranking = pd.DataFrame(
        model.feature_importances_,
        index=X_train.columns,
        columns=["Importance"],
    )
    ranking.sort_values(by="Importance", ascending=False, inplace=True)
    # Very large figure so every feature label stays legible.
    ranking.plot(kind="barh", figsize=(80, 60))
7 changes: 4 additions & 3 deletions src/util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

# ignore_columns = ['APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG',
Expand All @@ -21,7 +21,8 @@
# Columns dropped by defaultClean: the FLAG_DOCUMENT_* indicators plus three
# phone-availability flags added in this commit. (The rendered diff showed
# the old closing line interleaved with the new continuation, breaking the
# list literal; this is the reconstructed new version.)
ignore_columns = ['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
                  'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
                  'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
                  'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
                  "FLAG_MOBIL", "FLAG_CONT_MOBILE", "FLAG_EMP_PHONE"]

def defaultClean(df: pd.DataFrame) -> None:
    """Drop the columns listed in module-level `ignore_columns` from *df*,
    mutating it in place (returns None).

    NOTE(review): the excerpt is collapsed right after this line — confirm
    the function body does not continue below.
    """
    df.drop(ignore_columns, axis=1, inplace=True)
Expand All @@ -32,7 +33,7 @@ def dropColumnsBetween(df: pd.DataFrame, start: str, end: str) -> None:

def run_pca(df: pd.DataFrame, explained_var: float) -> pd.DataFrame:
    """Scale *df* and reduce it with PCA keeping `explained_var` of the
    variance (PCA accepts a float n_components as an explained-variance
    ratio).

    NOTE(review): the body continues past this excerpt — the transform and
    return statement are not visible here.
    """
    # Standardize the data
    # NOTE(review): the next two lines are a rendered-diff artifact — the old
    # StandardScaler call shown next to its RobustScaler replacement. Only
    # the RobustScaler line should exist in the real file (StandardScaler is
    # no longer imported by this module after the commit).
    X_std = StandardScaler().fit_transform(df)
    X_std = RobustScaler().fit_transform(df)

    # running PCA
    pca = PCA(n_components=explained_var)
Expand Down

0 comments on commit 33e6c03

Please sign in to comment.