diff --git a/src/classifier.py b/src/classifier.py index 843e3a0..0976da6 100644 --- a/src/classifier.py +++ b/src/classifier.py @@ -1,39 +1,25 @@ import numpy as np import matplotlib.pyplot as plt from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import RobustScaler, StandardScaler -from sklearn.linear_model import LogisticRegression, SGDClassifier +from sklearn.preprocessing import RobustScaler +from sklearn.linear_model import SGDClassifier from sklearn.model_selection import RandomizedSearchCV from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay def fit_svm_classifier(X, y): - pipeline = make_pipeline(RobustScaler(), SGDClassifier(loss = "hinge", random_state=0, tol=1e-5, class_weight="balanced", max_iter=10000, alpha = 0.05, early_stopping = True)) + pipeline = make_pipeline(RobustScaler(), SGDClassifier(loss = "hinge", random_state=0, tol=1e-6, class_weight="balanced", max_iter=10000)) pipeline.fit(X, y) return pipeline -def fit_logistic_regression(X, y): - # It is called Logistic Regression but it is really a classifier (source: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) - - regression = LogisticRegression(l1_ratio = 0.5, class_weight= "balanced", max_iter=10000, penalty = "elasticnet", solver="saga") - pipeline = make_pipeline(StandardScaler(), regression) - pipeline.fit(X, y) - return regression - def run_and_compare(train_X, train_y, test_x, test_y): - # print(f"baseline balanced-accuracy-score: {balanced_accuracy_score(test_y, np.zeros(test_y.shape))}") + print(f"baseline balanced-accuracy-score: {balanced_accuracy_score(test_y, np.zeros(test_y.shape))}") print("Running Support Vector Classifier ....") svm = fit_svm_classifier(train_X, train_y) svm_balanced_accuracy = balanced_accuracy_score(test_y, svm.predict(test_x)) print(f"SVM balanced accuracy: {svm_balanced_accuracy}") plot_confusion_matrix(test_y, svm.predict(test_x), title="Support Vector Machine Confusion Matrix") - - print("Running Logistic Classifier ....") - logit = fit_logistic_regression(train_X, train_y) - logit_balanced_accuracy = balanced_accuracy_score(test_y, logit.predict(test_x)) - print(f"Logistic Classifier balanced accuracy: {logit_balanced_accuracy}") - plot_confusion_matrix(test_y, logit.predict(test_x), title="Logistic Regression Confusion Matrix") def tune_hyperparameters(X, y, parameters, model): searcher = RandomizedSearchCV(model, parameters, scoring = "balanced_accuracy") diff --git a/src/data-processing.ipynb b/src/data-processing.ipynb index f5ebe3c..16da311 100644 --- a/src/data-processing.ipynb +++ b/src/data-processing.ipynb @@ -12,7 +12,7 @@ "import numpy as np\n", "import plotly.express as px\n", "from sklearn import model_selection, linear_model\n", - "from sklearn.impute import KNNImputer, SimpleImputer\n", + "from sklearn.impute import SimpleImputer\n", "import util" ] }, @@ -46,7 +46,14 @@ "metadata": {}, "outputs": [], "source": [ - "ccb = pd.read_csv('data/credit_card_balance.csv')" + "bureau = pd.read_csv('data/bureau.csv')\n", + "bureau.head(15)\n", + "bureau = bureau[bureau[\"CREDIT_CURRENCY\"] == \"currency 1\"]\n", + "bureau.drop(columns = [\"SK_ID_BUREAU\", \"CREDIT_ACTIVE\", \"CREDIT_CURRENCY\", \"DAYS_CREDIT\", \"CREDIT_DAY_OVERDUE\", \"DAYS_CREDIT_ENDDATE\", \"DAYS_ENDDATE_FACT\", \"CREDIT_TYPE\", \"DAYS_CREDIT_UPDATE\", \"AMT_ANNUITY\"], inplace=True)\n", + "bureau.fillna(0, inplace=True)\n", + "bureau[\"CREDIT_BUREAU_APPLICATION_COUNT\"] = 1\n", + "bureau = bureau.groupby(\"SK_ID_CURR\").agg(\"sum\")\n", + "df = df.merge(bureau, how=\"left\", on=\"SK_ID_CURR\")" ] }, { @@ -55,35 +62,7 @@ "metadata": {}, "outputs": [], "source": [ - "ccb = util.onehot_categorical_columns(ccb, ['NAME_CONTRACT_STATUS'])\n", - "ccb = ccb.groupby('SK_ID_CURR', as_index=False).agg('mean')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ccb.loc[ccb['SK_ID_CURR'] == 378907]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.merge(ccb, how='left', on='SK_ID_CURR')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop(columns = [\"SK_ID_CURR\", \"SK_ID_PREV\"], inplace=True)" + "df.drop(columns = [\"SK_ID_CURR\"], inplace=True)" ] }, { @@ -134,34 +113,6 @@ "col_descriptions.head(20)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "| Column Name | DType | What it Means | How to Handle |\n", - "| --- | --- | --- | --- |\n", - "| AMT_ANNUITY | float | Monthly payment for the loan | I would find the mean of AMT_ANNUITY/AMT_CREDIT and then multiply that mean ratio by the AMT_CREDIT for the rows that don't have a NaN AMT_ANNUITY |\n", - "| AMT_GOODS_PRICE | float | Price of the Good that people get a loan for | Most times, AMT_CREDIT is either slightly greater than or equal to AMT_GOODS_PRICE, so I would just set NaNs in AMT_GOODS_PRICE to whatever the AMT_CREDIT is for that row|\n", - "| NAME_TYPE_SUITE | String | Who was accompanying the person when they were applying for the loan | Either just assume they were unnacompanied, use the mode, or use K-Means |\n", - "| OWN_CAR_AGE | float | Age of the car they owned | Fill NaNs w/ 0 or -1, since a person without a car will have a NaN here. Filling with 0 as it could conflate with those who have a brand new car, but -1 could be mistinterpreted as a very very new car or something |\n", - "| OCCUPATION_TYPE | String | What the person's occupation is | Not sure, probably should leave to K-Means |\n", - "| CNT_FAM_MEMBERS | float | How many family members the person has | K-Means or mode |\n", - "| EXT_SOURCE_1 | float | Normalized score from external database | Linear Regression/Use other two scores. There are situations in which only one score is present, and very few where no external scores are present. |\n", - "| EXT_SOURCE_2 | float | Normalized score from external database | Linear Regression/Use other two scores. There are situations in which only one score is present, and very few where no external scores are present. |\n", - "| EXT_SOURCE_3 | float | Normalized score from external database | Linear Regression/Use other two scores. There are situations in which only one score is present, and very few where no external scores are present. |\n", - "| OBS_30_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who were observed with a possible 30 DPD (Days past due) default | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n", - "| DEF_30_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who defaulted with a 30 DPD (Days past due) | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n", - "| OBS_60_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who were observed with a possible 60 DPD (Days past due) default | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n", - "| DEF_60_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who defaulted with a 60 DPD (Days past due) | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n", - "| DAYS_LAST_PHONE_CHANGE | float | How many days before applying did the client change their phone | I think there's only one NaN so we can set it to 0 |\n", - "| AMT_REQ_CREDIT_BUREAU_HOUR | float | Amount of enquiries the client had with the Credit Bureau one hour before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n", - "| AMT_REQ_CREDIT_BUREAU_DAY | float | Amount of enquiries the client had with the Credit Bureau one day before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n", - "| AMT_REQ_CREDIT_BUREAU_WEEK | float | Amount of enquiries the client had with the Credit Bureau one week before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n", - "| AMT_REQ_CREDIT_BUREAU_MON | float | Amount of enquiries the client had with the Credit Bureau one month before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n", - "| AMT_REQ_CREDIT_BUREAU_QRT | float | Amount of enquiries the client had with the Credit Bureau three months before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n", - "| AMT_REQ_CREDIT_BUREAU_YEAR | float | Amount of enquiries the client had with the Credit Bureau one year before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -191,9 +142,7 @@ "\n", "# Columns where we fill null with 0\n", "zero_cols = [\"OWN_CAR_AGE\", \"DAYS_LAST_PHONE_CHANGE\", \"AMT_REQ_CREDIT_BUREAU_HOUR\", \"AMT_REQ_CREDIT_BUREAU_DAY\", \"AMT_REQ_CREDIT_BUREAU_WEEK\",\n", - " \"AMT_REQ_CREDIT_BUREAU_MON\", \"AMT_REQ_CREDIT_BUREAU_QRT\", \"AMT_REQ_CREDIT_BUREAU_YEAR\", 'NAME_CONTRACT_STATUS_Active',\n", - " 'NAME_CONTRACT_STATUS_Approved', 'NAME_CONTRACT_STATUS_Completed', 'NAME_CONTRACT_STATUS_Demand',\n", - " 'NAME_CONTRACT_STATUS_Refused', 'NAME_CONTRACT_STATUS_Sent proposal', 'NAME_CONTRACT_STATUS_Signed']\n", + " \"AMT_REQ_CREDIT_BUREAU_MON\", \"AMT_REQ_CREDIT_BUREAU_QRT\", \"AMT_REQ_CREDIT_BUREAU_YEAR\"]\n", "# Set Nulls in zero cols to 0\n", "for col in zero_cols:\n", " df[col] = df[col].fillna(0)" @@ -234,7 +183,7 @@ "metadata": {}, "outputs": [], "source": [ - "imputer = SimpleImputer(missing_values=np.NaN, strategy = \"constant\", fill_value=-1)\n", + "imputer = SimpleImputer(missing_values=np.NaN, strategy = \"constant\", fill_value=0)\n", "df[:] = imputer.fit_transform(df)" ] }, @@ -254,7 +203,7 @@ "outputs": [], "source": [ "from get_feature_importance import get_feature_imp\n", - "get_feature_imp(df, target)" + "# get_feature_imp(df, target)" ] }, { @@ -263,16 +212,8 @@ "metadata": {}, "outputs": [], "source": [ - "# df.to_csv(\"cleaned_data.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output = util.run_pca(df, 40)" + "# output = util.run_pca(df, 60)\n", + "output = df" ] }, { @@ -288,24 +229,6 @@ "print(\"testing size:\", len(test_X))" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# non_null_ext_df = df.dropna(subset=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'])\n", - "# # non_null_ext_df = df.fillna(value={'EXT_SOURCE_1': 0.5, 'EXT_SOURCE_2': 0.5, 'EXT_SOURCE_3': 0.5})\n", - "# print(non_null_ext_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']])\n", - "# print(\"# points that have all 3 non-null ext sources:\", len(non_null_ext_df))\n", - "# non_null_ext_sample = non_null_ext_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].sample(frac=0.001)\n", - "# # extsrc_regr = linear_model.LinearRegression().fit()\n", - "# ext_src_fig = px.scatter_3d(non_null_ext_sample, x='EXT_SOURCE_1', y='EXT_SOURCE_2', z='EXT_SOURCE_3')\n", - "# ext_src_fig.show()\n", - "# print(\"correlation between sources 1 and 2 compared to source2\\n\",\n", - "# non_null_ext_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].corr()['EXT_SOURCE_2'][:], sep='')" - ] - }, { "cell_type": "code", "execution_count": null, @@ -313,7 +236,7 @@ "outputs": [], "source": [ "from classifier import run_and_compare\n", - "# run_and_compare(train_X, train_y, test_X, test_y)" + "run_and_compare(train_X, train_y, test_X, test_y)" ] } ],