make changes that hopefully work
hashgupta committed Dec 5, 2023
1 parent 38811c7 commit 5f3b4fb
Showing 2 changed files with 20 additions and 111 deletions.
src/classifier.py (22 changes: 4 additions & 18 deletions)
@@ -1,39 +1,25 @@
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import RobustScaler, StandardScaler
-from sklearn.linear_model import LogisticRegression, SGDClassifier
+from sklearn.preprocessing import RobustScaler
+from sklearn.linear_model import SGDClassifier
 from sklearn.model_selection import RandomizedSearchCV
 from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay


 def fit_svm_classifier(X, y):
-    pipeline = make_pipeline(RobustScaler(), SGDClassifier(loss = "hinge", random_state=0, tol=1e-5, class_weight="balanced", max_iter=10000, alpha = 0.05, early_stopping = True))
+    pipeline = make_pipeline(RobustScaler(), SGDClassifier(loss = "hinge", random_state=0, tol=1e-6, class_weight="balanced", max_iter=10000))
     pipeline.fit(X, y)
     return pipeline

-def fit_logistic_regression(X, y):
-    # It is called Logistic Regression but it is really a classifier (source: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
-
-    regression = LogisticRegression(l1_ratio = 0.5, class_weight= "balanced", max_iter=10000, penalty = "elasticnet", solver="saga")
-    pipeline = make_pipeline(StandardScaler(), regression)
-    pipeline.fit(X, y)
-    return regression
-
 def run_and_compare(train_X, train_y, test_x, test_y):
-    # print(f"baseline balanced-accuracy-score: {balanced_accuracy_score(test_y, np.zeros(test_y.shape))}")
+    print(f"baseline balanced-accuracy-score: {balanced_accuracy_score(test_y, np.zeros(test_y.shape))}")

     print("Running Support Vector Classifier ....")
     svm = fit_svm_classifier(train_X, train_y)
     svm_balanced_accuracy = balanced_accuracy_score(test_y, svm.predict(test_x))
     print(f"SVM balanced accuracy: {svm_balanced_accuracy}")
     plot_confusion_matrix(test_y, svm.predict(test_x), title="Support Vector Machine Confusion Matrix")

-    print("Running Logistic Classifier ....")
-    logit = fit_logistic_regression(train_X, train_y)
-    logit_balanced_accuracy = balanced_accuracy_score(test_y, logit.predict(test_x))
-    print(f"Logistic Classifier balanced accuracy: {logit_balanced_accuracy}")
-    plot_confusion_matrix(test_y, logit.predict(test_x), title="Logistic Regression Confusion Matrix")
-
 def tune_hyperparameters(X, y, parameters, model):
     searcher = RandomizedSearchCV(model, parameters, scoring = "balanced_accuracy")
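The tightened hunk above hard-codes the SGDClassifier settings and drops the hand-picked alpha=0.05. Since the file already imports RandomizedSearchCV and wraps it in tune_hyperparameters, the regularization strength could be searched rather than guessed. A minimal sketch, with an invented search space (the distributions are illustrative, not values from the commit):

    from scipy.stats import loguniform
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import RobustScaler

    # make_pipeline names steps after the lowercased class name, so the
    # SGDClassifier parameters are addressed as "sgdclassifier__<param>".
    pipeline = make_pipeline(
        RobustScaler(),
        SGDClassifier(loss="hinge", random_state=0, class_weight="balanced", max_iter=10000),
    )

    # Illustrative distributions, not values from the commit.
    parameters = {
        "sgdclassifier__alpha": loguniform(1e-5, 1e-1),
        "sgdclassifier__tol": [1e-4, 1e-5, 1e-6],
    }

    searcher = RandomizedSearchCV(pipeline, parameters, scoring="balanced_accuracy")
    # searcher.fit(X, y) would then yield searcher.best_params_ in place of
    # the hand-tuned alpha that this commit removes.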
src/data-processing.ipynb (109 changes: 16 additions & 93 deletions)
@@ -12,7 +12,7 @@
"import numpy as np\n",
"import plotly.express as px\n",
"from sklearn import model_selection, linear_model\n",
"from sklearn.impute import KNNImputer, SimpleImputer\n",
"from sklearn.impute import SimpleImputer\n",
"import util"
]
},
@@ -46,7 +46,14 @@
"metadata": {},
"outputs": [],
"source": [
"ccb = pd.read_csv('data/credit_card_balance.csv')"
"bureau = pd.read_csv('data/bureau.csv')\n",
"bureau.head(15)\n",
"bureau = bureau[bureau[\"CREDIT_CURRENCY\"] == \"currency 1\"]\n",
"bureau.drop(columns = [\"SK_ID_BUREAU\", \"CREDIT_ACTIVE\", \"CREDIT_CURRENCY\", \"DAYS_CREDIT\", \"CREDIT_DAY_OVERDUE\", \"DAYS_CREDIT_ENDDATE\", \"DAYS_ENDDATE_FACT\", \"CREDIT_TYPE\", \"DAYS_CREDIT_UPDATE\", \"AMT_ANNUITY\"], inplace=True)\n",
"bureau.fillna(0, inplace=True)\n",
"bureau[\"CREDIT_BUREAU_APPLICATION_COUNT\"] = 1\n",
"bureau = bureau.groupby(\"SK_ID_CURR\").agg(\"sum\")\n",
"df = df.merge(bureau, how=\"left\", on=\"SK_ID_CURR\")"
]
},
{
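In the new cell above, every bureau row first gets CREDIT_BUREAU_APPLICATION_COUNT = 1, so the groupby("SK_ID_CURR").agg("sum") both totals the numeric columns and turns the constant into a per-applicant record count. A toy sketch with invented values, where AMT_CREDIT_SUM stands in for the surviving numeric columns:

    import pandas as pd

    # Applicant 101 has two bureau records, applicant 102 has one.
    bureau = pd.DataFrame({
        "SK_ID_CURR": [101, 101, 102],
        "AMT_CREDIT_SUM": [10000.0, 5000.0, 2000.0],
    })
    bureau["CREDIT_BUREAU_APPLICATION_COUNT"] = 1

    agg = bureau.groupby("SK_ID_CURR").agg("sum")
    print(agg)
    # SK_ID_CURR 101 -> AMT_CREDIT_SUM 15000.0, CREDIT_BUREAU_APPLICATION_COUNT 2
    # SK_ID_CURR 102 -> AMT_CREDIT_SUM  2000.0, CREDIT_BUREAU_APPLICATION_COUNT 1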
@@ -55,35 +62,7 @@
"metadata": {},
"outputs": [],
"source": [
"ccb = util.onehot_categorical_columns(ccb, ['NAME_CONTRACT_STATUS'])\n",
"ccb = ccb.groupby('SK_ID_CURR', as_index=False).agg('mean')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ccb.loc[ccb['SK_ID_CURR'] == 378907]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = df.merge(ccb, how='left', on='SK_ID_CURR')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.drop(columns = [\"SK_ID_CURR\", \"SK_ID_PREV\"], inplace=True)"
"df.drop(columns = [\"SK_ID_CURR\"], inplace=True)"
]
},
{
@@ -134,34 +113,6 @@
"col_descriptions.head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"| Column Name | DType | What it Means | How to Handle |\n",
"| --- | --- | --- | --- |\n",
"| AMT_ANNUITY | float | Monthly payment for the loan | I would find the mean of AMT_ANNUITY/AMT_CREDIT and then multiply that mean ratio by the AMT_CREDIT for the rows that don't have a NaN AMT_ANNUITY |\n",
"| AMT_GOODS_PRICE | float | Price of the Good that people get a loan for | Most times, AMT_CREDIT is either slightly greater than or equal to AMT_GOODS_PRICE, so I would just set NaNs in AMT_GOODS_PRICE to whatever the AMT_CREDIT is for that row|\n",
"| NAME_TYPE_SUITE | String | Who was accompanying the person when they were applying for the loan | Either just assume they were unnacompanied, use the mode, or use K-Means |\n",
"| OWN_CAR_AGE | float | Age of the car they owned | Fill NaNs w/ 0 or -1, since a person without a car will have a NaN here. Filling with 0 as it could conflate with those who have a brand new car, but -1 could be mistinterpreted as a very very new car or something |\n",
"| OCCUPATION_TYPE | String | What the person's occupation is | Not sure, probably should leave to K-Means |\n",
"| CNT_FAM_MEMBERS | float | How many family members the person has | K-Means or mode |\n",
"| EXT_SOURCE_1 | float | Normalized score from external database | Linear Regression/Use other two scores. There are situations in which only one score is present, and very few where no external scores are present. |\n",
"| EXT_SOURCE_2 | float | Normalized score from external database | Linear Regression/Use other two scores. There are situations in which only one score is present, and very few where no external scores are present. |\n",
"| EXT_SOURCE_3 | float | Normalized score from external database | Linear Regression/Use other two scores. There are situations in which only one score is present, and very few where no external scores are present. |\n",
"| OBS_30_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who were observed with a possible 30 DPD (Days past due) default | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n",
"| DEF_30_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who defaulted with a 30 DPD (Days past due) | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n",
"| OBS_60_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who were observed with a possible 60 DPD (Days past due) default | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n",
"| DEF_60_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who defaulted with a 60 DPD (Days past due) | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n",
"| DAYS_LAST_PHONE_CHANGE | float | How many days before applying did the client change their phone | I think there's only one NaN so we can set it to 0 |\n",
"| AMT_REQ_CREDIT_BUREAU_HOUR | float | Amount of enquiries the client had with the Credit Bureau one hour before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n",
"| AMT_REQ_CREDIT_BUREAU_DAY | float | Amount of enquiries the client had with the Credit Bureau one day before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n",
"| AMT_REQ_CREDIT_BUREAU_WEEK | float | Amount of enquiries the client had with the Credit Bureau one week before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n",
"| AMT_REQ_CREDIT_BUREAU_MON | float | Amount of enquiries the client had with the Credit Bureau one month before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n",
"| AMT_REQ_CREDIT_BUREAU_QRT | float | Amount of enquiries the client had with the Credit Bureau three months before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n",
"| AMT_REQ_CREDIT_BUREAU_YEAR | float | Amount of enquiries the client had with the Credit Bureau one year before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -191,9 +142,7 @@
"\n",
"# Columns where we fill null with 0\n",
"zero_cols = [\"OWN_CAR_AGE\", \"DAYS_LAST_PHONE_CHANGE\", \"AMT_REQ_CREDIT_BUREAU_HOUR\", \"AMT_REQ_CREDIT_BUREAU_DAY\", \"AMT_REQ_CREDIT_BUREAU_WEEK\",\n",
" \"AMT_REQ_CREDIT_BUREAU_MON\", \"AMT_REQ_CREDIT_BUREAU_QRT\", \"AMT_REQ_CREDIT_BUREAU_YEAR\", 'NAME_CONTRACT_STATUS_Active',\n",
" 'NAME_CONTRACT_STATUS_Approved', 'NAME_CONTRACT_STATUS_Completed', 'NAME_CONTRACT_STATUS_Demand',\n",
" 'NAME_CONTRACT_STATUS_Refused', 'NAME_CONTRACT_STATUS_Sent proposal', 'NAME_CONTRACT_STATUS_Signed']\n",
" \"AMT_REQ_CREDIT_BUREAU_MON\", \"AMT_REQ_CREDIT_BUREAU_QRT\", \"AMT_REQ_CREDIT_BUREAU_YEAR\"]\n",
"# Set Nulls in zero cols to 0\n",
"for col in zero_cols:\n",
" df[col] = df[col].fillna(0)"
@@ -234,7 +183,7 @@
"metadata": {},
"outputs": [],
"source": [
"imputer = SimpleImputer(missing_values=np.NaN, strategy = \"constant\", fill_value=-1)\n",
"imputer = SimpleImputer(missing_values=np.NaN, strategy = \"constant\", fill_value=0)\n",
"df[:] = imputer.fit_transform(df)"
]
},
@@ -254,7 +203,7 @@
"outputs": [],
"source": [
"from get_feature_importance import get_feature_imp\n",
"get_feature_imp(df, target)"
"# get_feature_imp(df, target)"
]
},
{
@@ -263,16 +212,8 @@
"metadata": {},
"outputs": [],
"source": [
"# df.to_csv(\"cleaned_data.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output = util.run_pca(df, 40)"
"# output = util.run_pca(df, 60)\n",
"output = df"
]
},
{
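util.run_pca is project code the diff doesn't show, so its exact behavior is unknown here. Under the assumption that it reduces the frame to n components, a scikit-learn equivalent might look roughly like this:

    import numpy as np
    from sklearn.decomposition import PCA

    def run_pca_sketch(X: np.ndarray, n_components: int) -> np.ndarray:
        # A guess at what util.run_pca(df, 40) does; the real helper may
        # scale first or return a DataFrame instead of an array.
        return PCA(n_components=n_components, random_state=0).fit_transform(X)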
Expand All @@ -288,32 +229,14 @@
"print(\"testing size:\", len(test_X))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# non_null_ext_df = df.dropna(subset=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'])\n",
"# # non_null_ext_df = df.fillna(value={'EXT_SOURCE_1': 0.5, 'EXT_SOURCE_2': 0.5, 'EXT_SOURCE_3': 0.5})\n",
"# print(non_null_ext_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']])\n",
"# print(\"# points that have all 3 non-null ext sources:\", len(non_null_ext_df))\n",
"# non_null_ext_sample = non_null_ext_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].sample(frac=0.001)\n",
"# # extsrc_regr = linear_model.LinearRegression().fit()\n",
"# ext_src_fig = px.scatter_3d(non_null_ext_sample, x='EXT_SOURCE_1', y='EXT_SOURCE_2', z='EXT_SOURCE_3')\n",
"# ext_src_fig.show()\n",
"# print(\"correlation between sources 1 and 2 compared to source2\\n\",\n",
"# non_null_ext_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].corr()['EXT_SOURCE_2'][:], sep='')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from classifier import run_and_compare\n",
"# run_and_compare(train_X, train_y, test_X, test_y)"
"run_and_compare(train_X, train_y, test_X, test_y)"
]
}
],
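The train/test split cell sits outside the diff; only its final size print is visible above. Assuming a standard split of the notebook's output features against the target labels (the wiring of `output` and `target` is assumed, not shown in the commit), the end-to-end call re-enabled by this commit would look something like:

    from sklearn import model_selection
    from classifier import run_and_compare

    # Assumed wiring: `output` holds the features and `target` the labels,
    # both defined earlier in the notebook; the diff shows only the print.
    train_X, test_X, train_y, test_y = model_selection.train_test_split(
        output, target, test_size=0.2, random_state=0, stratify=target
    )
    print("training size:", len(train_X))
    print("testing size:", len(test_X))

    run_and_compare(train_X, train_y, test_X, test_y)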