-
Notifications
You must be signed in to change notification settings - Fork 2
/
LoanRiskAnalysis_InterestRate.Rmd
233 lines (183 loc) · 9.23 KB
/
LoanRiskAnalysis_InterestRate.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
---
title: "LoanRiskAnalysis_InterestRate"
author: "Liang Tan"
output:
pdf_document: default
html_document: default
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Read data
```{r tidy=TRUE}
loan <- read.csv('loan.csv', stringsAsFactors = FALSE)
loanT <- loan
num.NA <- sort(sapply(loan, function(x) sum(is.na(x))), decreasing = TRUE)
remain.col = names(num.NA)[(num.NA < 0.8 * dim(loan)[1])]
delete.col = names(num.NA)[(num.NA >= 0.8 * dim(loan)[1])]
delete.col
```
# Feature engineering and selection
User feature selection
addr_state, emp_title, member_id, zip_code is removed
emp_length, home_ownership is reserved
```{r tidy=TRUE}
#encode home_ownership
loan$home_ownership <- ifelse(loan$home_ownership %in% c('ANY', 'NONE', 'OTHER'), 'OTHER',
loan$home_ownership)
#encode state information with the help of int_rate
int_state <- by(loan, loan$addr_state, function(x) {
return(mean(x$int_rate))
})
loan$state_mean_int <-
ifelse(loan$addr_state %in% names(int_state)[which(int_state <=
quantile(int_state, 0.25))], 'low',
ifelse(loan$addr_state %in% names(int_state)[which(int_state <=
quantile(int_state, 0.5))],'lowmedium',
ifelse(loan$addr_state %in% names(int_state)[which(int_state <= quantile(int_state, 0.75))],
'mediumhigh', 'high')))
select.features_1 <- c('home_ownership', 'state_mean_int')
```
Financial feature selection
combine annual_inc and annual_inc_joint, dti and dti_joint, verification_status and verification_status_joint based on joint condition
```{r tidy=TRUE}
loan$dti <- ifelse(!is.na(loan$dti_joint), loan$dti_joint, loan$dti)
loan$annual_inc <- ifelse(!is.na(loan$annual_inc_joint), loan$annual_inc_joint, loan$annual_inc)
loan$annual_inc[which(is.na(loan$annual_inc))] <- median(loan$annual_inc, na.rm = T)
loan$verification_status <- ifelse(loan$application_type == 'JOINT', loan$verification_status_joint, loan$verification_status)
select.features_2 <- c('dti', 'annual_inc', 'verification_status')
```
Credit scores feature selection
inq_fi, inq_last_12m is removed for over 80% NA values.
The earliest_cr_line and last_credit_pull_d are reserved
```{r tidy=TRUE}
select.features_3 <- c('earliest_cr_line', 'last_credit_pull_d')
```
credit lines feature selection
all_util, open_acc_6m, total_cu_tl, open_il_6m, open_il_12m, open_il_24m, open_rv_12m, open_rv_24m, max_bal_bc, mths_since_last_record, il_util, mths_since_rcnt_il, total_bal_il, max_bal_bc are removed for over 80% NA values
policy_code and url are removed for irrelavance
total_acc, tot_cur_bal, open_acc, acc_now_delinq, delinq_2yrs, mths_since_last_delinq, collections_12_mths_ex_med, tot_coll_amt, pub_rec, mths_since_last_major_derog, revol_util, total_rev_hi_lim are reserved
```{r tidy=TRUE}
#mean and median are similar so I use mean for na
loan$total_acc[which(is.na(loan$total_acc))] <- mean(loan$total_acc, na.rm = T)
#mean of tot_cur_bal is more influenced by large value so I use median
loan$tot_cur_bal[which(is.na(loan$tot_cur_bal))] <- median(loan$tot_cur_bal, na.rm = T)
#mean and median are similar so I use mean for na
loan$open_acc[which(is.na(loan$open_acc))] <- mean(loan$open_acc, na.rm = T)
#acc_now_delinq is int number, so I use median for na
loan$acc_now_delinq[which(is.na(loan$acc_now_delinq))] <- median(loan$acc_now_delinq, na.rm = T)
#delinq_2yrs is int number, so I use median for na
loan$delinq_2yrs[which(is.na(loan$delinq_2yrs))] <- median(loan$delinq_2yrs, na.rm = T)
#mths_since_last_delinq is int number, so I use median for na
loan$mths_since_last_delinq[which(is.na(loan$mths_since_last_delinq))] <- median(loan$mths_since_last_delinq, na.rm = T)
#collections_12_mths_ex_med is int number, so I use median for na
loan$collections_12_mths_ex_med[which(is.na(loan$collections_12_mths_ex_med))] <- median(loan$collections_12_mths_ex_med, na.rm = T)
#tot_coll_amt is int number, so I use median for na
loan$tot_coll_amt[which(is.na(loan$tot_coll_amt))] <- median(loan$tot_coll_amt, na.rm = T)
#pub_rec is int number, so I use median for na
loan$pub_rec[which(is.na(loan$pub_rec))] <- median(loan$pub_rec, na.rm = T)
#mths_since_last_major_derog is int number, so I use median for na
loan$mths_since_last_major_derog[which(is.na(loan$mths_since_last_major_derog))] <- median(loan$mths_since_last_major_derog, na.rm = T)
#mean and median is similar so I use mean for revol_util na values
loan$revol_util[which(is.na(loan$revol_util))] <- mean(loan$revol_util, na.rm = T)
#total_rev_hi_lim is int number, so I use median for na
loan$total_rev_hi_lim[which(is.na(loan$total_rev_hi_lim))] <- median(loan$total_rev_hi_lim, na.rm = T)
select.features_4 <- c('total_acc', 'tot_cur_bal', 'open_acc', 'acc_now_delinq', 'delinq_2yrs',
'mths_since_last_delinq', 'collections_12_mths_ex_med', 'tot_coll_amt',
'pub_rec', 'mths_since_last_major_derog', 'revol_util',
'total_rev_hi_lim')
```
loan feature selection
desc, id, title, issue_d, are removed
loan_amnt, application_type, purpose, term and initial_list_status are reserved
```{r tidy=TRUE}
select.features_5 <- c('loan_amnt', 'application_type', 'purpose',
'term', 'initial_list_status')
```
loan payment feature selection
last_pymnt_amnt, last_pymnt_d, next_pymnt_d, total_pymnt, total_pymnt_inv, total_rec_int, total_rec_late_fee, total_rec_prncp are inrrelative here
installment, funded_amnt, funded_amnt_inv, pymnt_plan, recoveries
collection_recovery_fee, out_prncp, out_prncp_inv are reserved
```{r tidy=TRUE}
select.features_6 <- c('installment', 'funded_amnt', 'funded_amnt_inv', 'pymnt_plan',
'recoveries', 'collection_recovery_fee',
'out_prncp', 'out_prncp_inv')
```
grade and int_rate are used as well
```{r tidy=TRUE}
select.features <- c(select.features_1, select.features_2, select.features_3, select.features_4,
select.features_5, select.features_6, 'int_rate')
loan <- loan[select.features]
```
scale all numeric variables
```{r tidy=TRUE}
select.features.num <- names(loan[, sapply(loan[, 1:32], is.numeric)])
loan.scale <- loan
loan.scale[, select.features.num] <- scale(loan.scale[, select.features.num])
```
check the level of all category variables
```{r tidy=TRUE}
select.features.cate <- names(loan.scale[, sapply(loan.scale, is.character)])
n_levels <- sort(sapply(loan.scale[select.features.cate], function(x) {nlevels(as.factor(x))}), decreasing = TRUE)
print(n_levels)
```
The level number of 'earliest_cr_line' and 'last_credit_pull_d' is too large. Further treatment needs applying.
```{r tidy=TRUE}
anova_test <- aov(int_rate ~ earliest_cr_line, data = loan.scale)
summary(anova_test)
```
The ANOVA test shows this feature is important so I can't delete it. Therefore, I will transfer it into years only.
```{r tidy=TRUE}
library("zoo")
loan.scale$earliest_cr_line <- format(as.Date(as.yearmon(loan.scale$earliest_cr_line, "%B-%Y")), "%Y")
length(unique(loan.scale$earliest_cr_line))
```
Now the levels of earliest_cr_line are reduced to 68.
```{r tidy=TRUE}
anova_test <- aov(int_rate ~ last_credit_pull_d, data = loan.scale)
summary(anova_test)
```
The ANOVA test shows this feature is important so I can't delete it. Therefore, I will transfer it into years only.
```{r tidy=TRUE}
loan.scale$last_credit_pull_d <- format(as.Date(as.yearmon(loan.scale$last_credit_pull_d, "%B-%Y")), "%Y")
length(unique(loan.scale$last_credit_pull_d))
```
Now the levels of last_credit_pull_d are reduced to 11.
# Build model to predict the loan interest_rate
train, test data set selection
```{r tidy=TRUE}
set.seed(1)
train.ind <- sample(1:dim(loan.scale)[1], 0.8 * dim(loan)[1])
train <- loan.scale[train.ind, ]
test <- loan.scale[-train.ind, ]
```
build regression model
```{r tidy=TRUE}
mod <- lm(int_rate ~ ., data = train)
print(summary(mod))
```
Based on the summary information, I notice some features are not significant in building linear regression. So I decided to add Lasso regularization to penalize them.
```{r tidy=TRUE}
library(glmnet)
drops <- c("last_credit_pull_d","earliest_cr_line","funded_amnt_inv","pymnt_plan", "int_rate")
ind <- train[, !(names(train) %in% drops)]
ind <- model.matrix( ~., ind)
dep <- train[, 'int_rate']
#Use cross validation to tune parameters
linear.cvfit <- cv.glmnet(ind, dep, family = 'gaussian', alpha = 1.0)
plot(linear.cvfit)
```
Choose optimus parameters for this linear regression model.
```{r tidy=TRUE}
print(paste('The optimus lambda for model is', round(linear.cvfit$lambda.1se, 5)))
print(coef(linear.cvfit, s = "lambda.1se"))
```
make predictions for test data set
```{r tidy=TRUE}
library(hydroGOF)
ind <- test[, !(names(test) %in% drops)]
ind <- model.matrix( ~., ind)
cv.pred <- predict(linear.cvfit, s=linear.cvfit$lambda.1se, newx=ind)
print(paste0("The mean square error is: ", round(mse(cv.pred[,1], test$int_rate),4), "%"))
print(paste0("The mean absolute error is: ", round(mae(cv.pred[,1], test$int_rate),4), "%"))
```