From 5f3b4fbb1d61bee41c2622aaeb0e681dccb36fd5 Mon Sep 17 00:00:00 2001 From: "Gupta, Yash" Date: Mon, 4 Dec 2023 21:45:46 -0500 Subject: [PATCH] make changes that hopefully work --- src/classifier.py | 22 ++------ src/data-processing.ipynb | 109 ++++++-------------------------------- 2 files changed, 20 insertions(+), 111 deletions(-) diff --git a/src/classifier.py b/src/classifier.py index 843e3a0..0976da6 100644 --- a/src/classifier.py +++ b/src/classifier.py @@ -1,39 +1,25 @@ import numpy as np import matplotlib.pyplot as plt from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import RobustScaler, StandardScaler -from sklearn.linear_model import LogisticRegression, SGDClassifier +from sklearn.preprocessing import RobustScaler +from sklearn.linear_model import SGDClassifier from sklearn.model_selection import RandomizedSearchCV from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay def fit_svm_classifier(X, y): - pipeline = make_pipeline(RobustScaler(), SGDClassifier(loss = "hinge", random_state=0, tol=1e-5, class_weight="balanced", max_iter=10000, alpha = 0.05, early_stopping = True)) + pipeline = make_pipeline(RobustScaler(), SGDClassifier(loss = "hinge", random_state=0, tol=1e-6, class_weight="balanced", max_iter=10000)) pipeline.fit(X, y) return pipeline -def fit_logistic_regression(X, y): - # It is called Logistic Regression but it is really a classifier (source: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) - - regression = LogisticRegression(l1_ratio = 0.5, class_weight= "balanced", max_iter=10000, penalty = "elasticnet", solver="saga") - pipeline = make_pipeline(StandardScaler(), regression) - pipeline.fit(X, y) - return regression - def run_and_compare(train_X, train_y, test_x, test_y): - # print(f"baseline balanced-accuracy-score: {balanced_accuracy_score(test_y, np.zeros(test_y.shape))}") + print(f"baseline balanced-accuracy-score: {balanced_accuracy_score(test_y, np.zeros(test_y.shape))}") print("Running Support Vector Classifier ....") svm = fit_svm_classifier(train_X, train_y) svm_balanced_accuracy = balanced_accuracy_score(test_y, svm.predict(test_x)) print(f"SVM balanced accuracy: {svm_balanced_accuracy}") plot_confusion_matrix(test_y, svm.predict(test_x), title="Support Vector Machine Confusion Matrix") - - print("Running Logistic Classifier ....") - logit = fit_logistic_regression(train_X, train_y) - logit_balanced_accuracy = balanced_accuracy_score(test_y, logit.predict(test_x)) - print(f"Logistic Classifier balanced accuracy: {logit_balanced_accuracy}") - plot_confusion_matrix(test_y, logit.predict(test_x), title="Logistic Regression Confusion Matrix") def tune_hyperparameters(X, y, parameters, model): searcher = RandomizedSearchCV(model, parameters, scoring = "balanced_accuracy") diff --git a/src/data-processing.ipynb b/src/data-processing.ipynb index f5ebe3c..16da311 100644 --- a/src/data-processing.ipynb +++ b/src/data-processing.ipynb @@ -12,7 +12,7 @@ "import numpy as np\n", "import plotly.express as px\n", "from sklearn import model_selection, linear_model\n", - "from sklearn.impute import KNNImputer, SimpleImputer\n", + "from sklearn.impute import SimpleImputer\n", "import util" ] }, @@ -46,7 +46,14 @@ "metadata": {}, "outputs": [], "source": [ - "ccb = pd.read_csv('data/credit_card_balance.csv')" + "bureau = pd.read_csv('data/bureau.csv')\n", + "bureau.head(15)\n", + "bureau = bureau[bureau[\"CREDIT_CURRENCY\"] == \"currency 1\"]\n", + 
"bureau.drop(columns = [\"SK_ID_BUREAU\", \"CREDIT_ACTIVE\", \"CREDIT_CURRENCY\", \"DAYS_CREDIT\", \"CREDIT_DAY_OVERDUE\", \"DAYS_CREDIT_ENDDATE\", \"DAYS_ENDDATE_FACT\", \"CREDIT_TYPE\", \"DAYS_CREDIT_UPDATE\", \"AMT_ANNUITY\"], inplace=True)\n", + "bureau.fillna(0, inplace=True)\n", + "bureau[\"CREDIT_BUREAU_APPLICATION_COUNT\"] = 1\n", + "bureau = bureau.groupby(\"SK_ID_CURR\").agg(\"sum\")\n", + "df = df.merge(bureau, how=\"left\", on=\"SK_ID_CURR\")" ] }, { @@ -55,35 +62,7 @@ "metadata": {}, "outputs": [], "source": [ - "ccb = util.onehot_categorical_columns(ccb, ['NAME_CONTRACT_STATUS'])\n", - "ccb = ccb.groupby('SK_ID_CURR', as_index=False).agg('mean')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ccb.loc[ccb['SK_ID_CURR'] == 378907]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.merge(ccb, how='left', on='SK_ID_CURR')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop(columns = [\"SK_ID_CURR\", \"SK_ID_PREV\"], inplace=True)" + "df.drop(columns = [\"SK_ID_CURR\"], inplace=True)" ] }, { @@ -134,34 +113,6 @@ "col_descriptions.head(20)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "| Column Name | DType | What it Means | How to Handle |\n", - "| --- | --- | --- | --- |\n", - "| AMT_ANNUITY | float | Monthly payment for the loan | I would find the mean of AMT_ANNUITY/AMT_CREDIT and then multiply that mean ratio by the AMT_CREDIT for the rows that don't have a NaN AMT_ANNUITY |\n", - "| AMT_GOODS_PRICE | float | Price of the Good that people get a loan for | Most times, AMT_CREDIT is either slightly greater than or equal to AMT_GOODS_PRICE, so I would just set NaNs in AMT_GOODS_PRICE to whatever the AMT_CREDIT is for that row|\n", - "| NAME_TYPE_SUITE | String | Who was accompanying the person when they were applying for the loan | Either just assume they were unnacompanied, use the mode, or use K-Means |\n", - "| OWN_CAR_AGE | float | Age of the car they owned | Fill NaNs w/ 0 or -1, since a person without a car will have a NaN here. Filling with 0 as it could conflate with those who have a brand new car, but -1 could be mistinterpreted as a very very new car or something |\n", - "| OCCUPATION_TYPE | String | What the person's occupation is | Not sure, probably should leave to K-Means |\n", - "| CNT_FAM_MEMBERS | float | How many family members the person has | K-Means or mode |\n", - "| EXT_SOURCE_1 | float | Normalized score from external database | Linear Regression/Use other two scores. There are situations in which only one score is present, and very few where no external scores are present. |\n", - "| EXT_SOURCE_2 | float | Normalized score from external database | Linear Regression/Use other two scores. There are situations in which only one score is present, and very few where no external scores are present. |\n", - "| EXT_SOURCE_3 | float | Normalized score from external database | Linear Regression/Use other two scores. There are situations in which only one score is present, and very few where no external scores are present. 
|\n", - "| OBS_30_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who were observed with a possible 30 DPD (Days past due) default | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n", - "| DEF_30_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who defaulted with a 30 DPD (Days past due) | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n", - "| OBS_60_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who were observed with a possible 60 DPD (Days past due) default | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n", - "| DEF_60_CNT_SOCIAL_CIRCLE | float | Amount of people in person's social circle who defaulted with a 60 DPD (Days past due) | NaN means there was no observation, so I would make an \"assumption\" of 0, but you could also use K-Means I think |\n", - "| DAYS_LAST_PHONE_CHANGE | float | How many days before applying did the client change their phone | I think there's only one NaN so we can set it to 0 |\n", - "| AMT_REQ_CREDIT_BUREAU_HOUR | float | Amount of enquiries the client had with the Credit Bureau one hour before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n", - "| AMT_REQ_CREDIT_BUREAU_DAY | float | Amount of enquiries the client had with the Credit Bureau one day before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n", - "| AMT_REQ_CREDIT_BUREAU_WEEK | float | Amount of enquiries the client had with the Credit Bureau one week before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n", - "| AMT_REQ_CREDIT_BUREAU_MON | float | Amount of enquiries the client had with the Credit Bureau one month before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n", - "| AMT_REQ_CREDIT_BUREAU_QRT | float | Amount of enquiries the client had with the Credit Bureau three months before application | If there's no data, we probably should assume 0. We can't really guess at how many enquiries a client had and when they had them |\n", - "| AMT_REQ_CREDIT_BUREAU_YEAR | float | Amount of enquiries the client had with the Credit Bureau one year before application | If there's no data, we probably should assume 0. 
We can't really guess at how many enquiries a client had and when they had them |\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -191,9 +142,7 @@ "\n", "# Columns where we fill null with 0\n", "zero_cols = [\"OWN_CAR_AGE\", \"DAYS_LAST_PHONE_CHANGE\", \"AMT_REQ_CREDIT_BUREAU_HOUR\", \"AMT_REQ_CREDIT_BUREAU_DAY\", \"AMT_REQ_CREDIT_BUREAU_WEEK\",\n", - " \"AMT_REQ_CREDIT_BUREAU_MON\", \"AMT_REQ_CREDIT_BUREAU_QRT\", \"AMT_REQ_CREDIT_BUREAU_YEAR\", 'NAME_CONTRACT_STATUS_Active',\n", - " 'NAME_CONTRACT_STATUS_Approved', 'NAME_CONTRACT_STATUS_Completed', 'NAME_CONTRACT_STATUS_Demand',\n", - " 'NAME_CONTRACT_STATUS_Refused', 'NAME_CONTRACT_STATUS_Sent proposal', 'NAME_CONTRACT_STATUS_Signed']\n", + " \"AMT_REQ_CREDIT_BUREAU_MON\", \"AMT_REQ_CREDIT_BUREAU_QRT\", \"AMT_REQ_CREDIT_BUREAU_YEAR\"]\n", "# Set Nulls in zero cols to 0\n", "for col in zero_cols:\n", " df[col] = df[col].fillna(0)" @@ -234,7 +183,7 @@ "metadata": {}, "outputs": [], "source": [ - "imputer = SimpleImputer(missing_values=np.NaN, strategy = \"constant\", fill_value=-1)\n", + "imputer = SimpleImputer(missing_values=np.NaN, strategy = \"constant\", fill_value=0)\n", "df[:] = imputer.fit_transform(df)" ] }, @@ -254,7 +203,7 @@ "outputs": [], "source": [ "from get_feature_importance import get_feature_imp\n", - "get_feature_imp(df, target)" + "# get_feature_imp(df, target)" ] }, { @@ -263,16 +212,8 @@ "metadata": {}, "outputs": [], "source": [ - "# df.to_csv(\"cleaned_data.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output = util.run_pca(df, 40)" + "# output = util.run_pca(df, 60)\n", + "output = df" ] }, { @@ -288,24 +229,6 @@ "print(\"testing size:\", len(test_X))" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# non_null_ext_df = df.dropna(subset=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'])\n", - "# # non_null_ext_df = df.fillna(value={'EXT_SOURCE_1': 0.5, 'EXT_SOURCE_2': 0.5, 'EXT_SOURCE_3': 0.5})\n", - "# print(non_null_ext_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']])\n", - "# print(\"# points that have all 3 non-null ext sources:\", len(non_null_ext_df))\n", - "# non_null_ext_sample = non_null_ext_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].sample(frac=0.001)\n", - "# # extsrc_regr = linear_model.LinearRegression().fit()\n", - "# ext_src_fig = px.scatter_3d(non_null_ext_sample, x='EXT_SOURCE_1', y='EXT_SOURCE_2', z='EXT_SOURCE_3')\n", - "# ext_src_fig.show()\n", - "# print(\"correlation between sources 1 and 2 compared to source2\\n\",\n", - "# non_null_ext_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].corr()['EXT_SOURCE_2'][:], sep='')" - ] - }, { "cell_type": "code", "execution_count": null, @@ -313,7 +236,7 @@ "outputs": [], "source": [ "from classifier import run_and_compare\n", - "# run_and_compare(train_X, train_y, test_X, test_y)" + "run_and_compare(train_X, train_y, test_X, test_y)" ] } ],
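
Note (not part of the patch): below is a minimal, self-contained sketch of the model configuration this patch settles on in src/classifier.py — a RobustScaler + hinge-loss SGDClassifier (a linear SVM) with balanced class weights, scored with balanced accuracy against a majority-class baseline, as run_and_compare() now does. The synthetic imbalanced dataset and the train/test split below are illustrative assumptions, not data or code from this repository.

# Sketch only: reproduces the pipeline hyperparameters added in this patch
# (loss="hinge", tol=1e-6, class_weight="balanced", max_iter=10000) on
# synthetic data; swap in the project's cleaned feature matrix to use it.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import balanced_accuracy_score

# Imbalanced toy problem standing in for the credit-default data (assumption).
X, y = make_classification(n_samples=5000, n_features=20,
                           weights=[0.92, 0.08], random_state=0)
train_X, test_X, train_y, test_y = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=0)

pipeline = make_pipeline(
    RobustScaler(),
    SGDClassifier(loss="hinge", random_state=0, tol=1e-6,
                  class_weight="balanced", max_iter=10000),
)
pipeline.fit(train_X, train_y)

# Majority-class baseline vs. the fitted model, mirroring run_and_compare().
print("baseline balanced accuracy:",
      balanced_accuracy_score(test_y, np.zeros_like(test_y)))
print("SVM balanced accuracy:",
      balanced_accuracy_score(test_y, pipeline.predict(test_X)))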