# Default Predictor Project
# Forecasting Next Year's Personal Loan Behaviour
# All theory was extracted from Google, I'm not a PhD, bro
# First, tell R where our working directory is
# Make sure all the CSV files are in that folder before starting
# Note: this path will almost certainly be different on your PC
setwd("C:/Users/andre/Downloads")
library(ggplot2) #-- nice charts
library(tseries) #-- time series tests (adf.test, pp.test)
library(forecast) #-- ARIMA, auto.arima, forecast
library(tidyverse) #-- data manipulation, data visualization, stringr, readr
library(gridExtra) #-- for arranging plots
library(dplyr) #-- data manipulation verbs (also loaded by tidyverse)
library(highcharter) #-- interactive charts (hchart)
library(tidyr) #-- reshaping data: gather (also loaded by tidyverse)
library(magrittr) #-- %<>% compound-assignment pipe (mutate in place)
library(reshape2) #-- reshaping data
library(lubridate) #-- date parsing (dmy)
library(quantmod) #-- quantitative financial modelling framework
library(rugarch) #-- univariate GARCH (generalized autoregressive conditional heteroskedasticity)
library(rmgarch) #-- multivariate GARCH
library(xts) #-- extensible time series: merge, cbind, rbind, lag, diff, coredata
library(PerformanceAnalytics) #-- performance and risk analysis of financial instruments or portfolios
# Forecasting next year's balances below
# The data are already in tabular form and almost clean
# Loan data downloaded from the CNBV (Comisión Nacional Bancaria y de Valores):
# personal loan balances for Scotiabank Mexico
mx <- read.csv("DefaultPrediction_personales.csv", header = TRUE, sep = ",")
# MES is read in as character; parse it into a proper date with lubridate::dmy(),
# using magrittr's %<>% to mutate mx in place
mx %<>%
mutate(MES = lubridate::dmy(MES))
# Check the class of each column
sapply(mx, class)
# Strip the thousands-separator commas so the values can become numbers,
# then convert from character to numeric.
# Note: str_replace_all() (not str_replace()) so every comma is removed, not just the first one.
mx %<>%
mutate(VIGENTE = str_replace_all(VIGENTE, ",", ""))
mx %<>%
mutate(VENCIDA = str_replace_all(VENCIDA, ",", ""))
# Transform. From string to numeric.
mx$VIGENTE = as.numeric(mx$VIGENTE)
mx$VENCIDA = as.numeric(mx$VENCIDA)
# check types again
sapply(mx, class)
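# (Sketch, optional) An alternative to the comma-stripping above: readr::parse_number()
# drops grouping characters and converts to numeric in one step. readr ships with the
# tidyverse; this small self-contained example on literal strings just illustrates the idea.
readr::parse_number(c("1,234,567.89", "987,654")) # strips grouping commas, returns numeric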
# Create a dataframe with variable & value
df <- mx %>%
select(MES,VIGENTE,VENCIDA) %>%
gather(key = "variable", value = "value", -MES)
# look at your new dataframe
head(df,10)
# Make a chart to see how personal loans behaved in our balance sheet
p <- ggplot(df,
aes(x = MES,
y = value,
col = variable)) +
geom_line()
p + ggtitle("Evolución del Balance de Préstamos Personales Scotiabank") +
xlab("Periodo") + ylab("MXN Millones")
# Performing loans (cartera vigente)
# transform the loan balance into a time series AND take the month-over-month change
# then, decompose the signal
cartera_vigente <- diff(ts(mx$VIGENTE, start=c(2016,8), frequency=12))
plot(decompose(cartera_vigente, "multiplicative"))
# Non-performing loans (cartera vencida)
# transform the loan balance into a time series AND take the month-over-month change
# then, decompose the signal
cartera_vencida <- diff(ts(mx$VENCIDA, start=c(2016,8), frequency=12))
plot(decompose(cartera_vencida, "multiplicative"))
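# (Sketch) The differenced series can go negative, so an additive decomposition is usually
# a safer reading than a multiplicative one. stats::stl() gives a loess-based, additive
# decomposition of the same signals, assuming at least two full years of monthly data:
plot(stl(cartera_vigente, s.window = "periodic"))
plot(stl(cartera_vencida, s.window = "periodic"))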
# bind both time series and put them in a chart
xh <- cbind(cartera_vigente,cartera_vencida)
hchart(xh) %>%
hc_title(text = "Cambios de cartera vigente y vencida Scotiabank m/m (MXN millones)")
# Stationarity testing
# A time series is said to be "stationary" if it has no trend, exhibits constant variance
# over time, and has a constant autocorrelation structure over time.
# One way to test whether a time series is stationary is to perform an augmented
# Dickey-Fuller test, which uses the following null and alternative hypotheses:
# H0: The time series is non-stationary. In other words, it has some time-dependent structure and does not have constant variance over time.
# HA: The time series is stationary.
# A stationary process (i.e. one that generates a stationary time series) has stable
# statistical properties, which is what makes forecasting a likely outcome possible.
# Seasonality refers to a series whose distribution changes predictably through time.
# Non-performing loans
pp.test(cartera_vencida) # Voila! Stationary (p-value = 0.01), helped by the extra observations
adf.test(cartera_vencida) # Voila! Stationary again (p-value = 0.02); p >= 0.05 would indicate non-stationarity
# We still see outliers, but at least the distribution looks roughly normal and centered at 0
boxplot(cartera_vencida) # fairly normal around zero
# Repeat for performing loans
pp.test(cartera_vigente) # Voila! Stationary here too (p-value = 0.01)
adf.test(cartera_vigente) # The ADF test, however, suggests non-stationarity (p-value = 0.27)
# We still see outliers, but at least the distribution looks roughly normal and centered at 0
boxplot(cartera_vigente) # fairly normal
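# (Sketch) Since the ADF test on cartera_vigente still points to non-stationarity,
# forecast::ndiffs() can suggest how many (further) differences a series needs:
ndiffs(cartera_vigente) # estimated number of additional differences for the differenced series
ndiffs(ts(mx$VIGENTE, start = c(2016, 8), frequency = 12)) # same check on the raw, undifferenced balance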
#---- STATIONARITY CHECKED, NOW LET'S GET AMBITIOUS!!!
acf(cartera_vencida, lag.max=20) # correlations & geometric decay
pacf(cartera_vencida, lag.max=20)
acf(cartera_vigente, lag.max=20) # I fear: noise in here, what do you say?
pacf(cartera_vigente, lag.max=20)
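# (Sketch) forecast::ggtsdisplay() shows the series together with its ACF and PACF in a
# single figure, which makes the "geometric decay vs. just noise" question easier to judge:
ggtsdisplay(cartera_vencida, lag.max = 20)
ggtsdisplay(cartera_vigente, lag.max = 20)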
# Let's use auto.arima; this works well most of the time
#1. The input series should be stationary.
#2. Because ARIMA uses past values to predict future values, the series' structure should not change over time.
modelo_c_vencida <-auto.arima(cartera_vencida)
modelo_c_vigente <-auto.arima(cartera_vigente)
#look at the summary
summary(modelo_c_vencida)
summary(modelo_c_vigente)
# p = 1: number of autoregressive (AR) terms
# d = 1: number of times the series was differenced to make it stationary
# q = 1: number of lagged forecast errors (MA terms)
# ARIMA(p, d, q) = ARIMA(1, 1, 1) - the AR, I and MA orders
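# (Sketch) auto.arima() searches over (p, d, q) automatically; forecast::Arima() lets you
# fit a specific order by hand to compare against it. modelo_manual is just an illustrative
# name, and ARIMA(1,1,1) is the order discussed in the comments above:
modelo_manual <- Arima(cartera_vencida, order = c(1, 1, 1))
summary(modelo_manual)
c(manual = modelo_manual$aic, auto = modelo_c_vencida$aic) # lower AIC = better fit, roughly speaking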
# Ljung-Box test
# The Ljung-Box test uses the following hypotheses:
# H0: The residuals are independently distributed.
# HA: The residuals are not independently distributed; they exhibit serial correlation.
# Ideally, we would like to fail to reject the null hypothesis.
# That is, we would like to see the p-value of the test be greater than 0.05 because
# this means the residuals for our time series model are independent,
# which is often an assumption we make when creating a model
Box.test(residuals(modelo_c_vencida), type = "Ljung-Box")
# Fortunately the p-value of ~0.47 > 0.05, so the residuals look independently distributed!
Box.test(residuals(modelo_c_vigente), type = "Ljung-Box")
# Fortunately the p-value of ~0.67 > 0.05, so the residuals look independently distributed!
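# (Sketch) forecast::checkresiduals() bundles the residual time plot, the residual ACF and
# a Ljung-Box test in one call, a quick complement to the Box.test() calls above:
checkresiduals(modelo_c_vencida)
checkresiduals(modelo_c_vigente)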
# The Q-Q plot, or quantile-quantile plot, is a graphical tool to help us assess
# if a set of data plausibly came from some theoretical distribution such as a Normal
# or exponential. Basically, we want those points to fall along a straight line.
qqnorm(modelo_c_vencida$residuals)
qqline(modelo_c_vencida$residuals)
qqnorm(modelo_c_vigente$residuals)
qqline(modelo_c_vigente$residuals)
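# (Sketch) As a numeric complement to the Q-Q plots, the Shapiro-Wilk test (base stats)
# checks residual normality; p > 0.05 means we cannot reject normality:
shapiro.test(as.numeric(residuals(modelo_c_vencida)))
shapiro.test(as.numeric(residuals(modelo_c_vigente)))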
#Forecast
pronostico_vencida <- forecast(modelo_c_vencida,h=11)
pronostico_vencida
plot(pronostico_vencida, ylab="Cambio m/m (MXN millones)", xlim=c(2016,2024))
#Looks neat
multi_forecast_vencida <- modelo_c_vencida %>%
forecast(h = 11)
multi_forecast_vencida %>%
autoplot()
#Forecast
pronostico_vigente <- forecast(modelo_c_vigente,h=11)
pronostico_vigente
plot(pronostico_vigente, ylab="Cambio m/m (MXN millones)", xlim=c(2016,2024))
#Looks neat
multi_forecast_vigente <- modelo_c_vigente %>%
forecast(h = 11)
multi_forecast_vigente %>%
autoplot()
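# (Sketch) forecast::accuracy() reports in-sample fit metrics (RMSE, MAE, MAPE, ...) for
# each model, a quick sanity check before trusting the 11-month forecasts above:
accuracy(modelo_c_vencida)
accuracy(modelo_c_vigente)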
print("the_end")