# --------------------------------------------------------------------------------------------------------------------------
#
# I recently finished reading Statistical Rethinking by McElreath. The reviews are true: it's a great book.
# As a bonus, it comes with 20 hours of related lectures on YouTube taking you through the content.
#
# Statistical Rethinking is an introduction to statistical modelling using Bayesian methods. To distill it
# down to a one-liner, Bayesian methods give you a distribution for model parameters, in contrast to
# frequentist methods, which produce point estimates of parameter values. This comes in handy for expressing
# uncertainty in inference or prediction.
#
# As stated in the preface, the book takes the philosophy that you will understand something much better
# if you can see it working and implemented in an algorithm, i.e., in code. The first computational
# method introduced is grid approximation. In this post I'm going to look at replicating both the
# calculation of log likelihood and grid approximation.
#
# GRID APPROXIMATION
#
# We can achieve an excellent approximation of the continuous posterior distribution by considering
# only a finite grid of parameter values. At any particular value of a parameter, it is a simple
# matter to compute the posterior probability: just multiply the prior probability OF p by the
# likelihood AT p, repeating this procedure for each value in the grid - McElreath p.40.
#
# --------------------------------------------------------------------------------------------------------------------------
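# Before re-performing the book's normal-distribution example, here is a minimal single-parameter sketch
# of the recipe above, along the lines of the book's globe-tossing example (6 waters in 9 tosses). The
# grid size and flat prior are choices for illustration only.
p_grid <- seq(from = 0, to = 1, length.out = 100) # finite grid of candidate values for p
prior <- rep(1, 100) # flat prior OF p
likelihood <- dbinom(6, size = 9, prob = p_grid) # likelihood of the data AT each p
unstd_posterior <- likelihood * prior # prior times likelihood
posterior <- unstd_posterior / sum(unstd_posterior) # standardise so the grid sums to 1
plot(p_grid, posterior, type = "l", xlab = "p", ylab = "posterior probability")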
# RE-PERFORMING EXAMPLE FROM MCELREATH
# Fitting a normal distribution to data using grid approximation. This is equivalent to fitting
# a regression model that contains only an intercept.
# Required packages
library(rethinking)
# Data
data(Howell1)
d <- Howell1
d2 <- d[d$age >= 18, ]
# Plot using rethinking function
dens(d2)
# Base R plot (here of the age column)
plot(density(d2$age))
# Re-performing code block 4.16, estimating the posterior distribution using grid approximation.
# I'm not the first to look at this, see also:
# https://poissonisfish.com/2019/05/01/bayesian-models-in-r/
# and here
# https://rileyking.netlify.app/post/bayesian-modeling-of-censored-and-uncensored-fatigue-data-in-r/
# and here https://allendowney.github.io/ThinkBayes2/hospital.html.
# Create grid for combinations of mu and sigma for which the log-likelihood is calculated over
mu_list <- seq(from = 150, to = 160, length.out = 100)
sigma_list <- seq(from = 7, to = 9, length.out = 100)
post <- expand.grid(mu = mu_list, sigma = sigma_list)
# Compute the log-likelihood
# For each of the 10,000 combinations of mu and sigma, calculate the sum of the log probability
# density across the 352 values of height using 'dnorm'.
# What exactly is 'dnorm' doing? Background: https://seankross.com/notes/dpqr/
# The function dnorm returns the value of the probability density function of the normal distribution
# given a value x and parameters mu and sigma. 'dnorm' gives us the "height" of the pdf of the normal
# distribution at whatever value of x we provide. The height of the pdf represents the relative
# probability of observing that value of x, assuming the data are normally distributed with the
# supplied mu and sigma.
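# A quick sketch (not from the book) of what 'dnorm' returns: its value should match the normal pdf
# written out by hand; the x, mu and sigma below are arbitrary values chosen for illustration.
x_chk <- 150; mu_chk <- 155; sd_chk <- 8
dnorm(x = x_chk, mean = mu_chk, sd = sd_chk)
exp(-(x_chk - mu_chk)^2 / (2 * sd_chk^2)) / (sd_chk * sqrt(2 * pi)) # the same value, by hand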
# Vectorised using sapply
post$ll <- sapply(
  X = 1:nrow(post),
  FUN = function(i) sum(dnorm(x = d2$height, mean = post$mu[i], sd = post$sigma[i], log = TRUE))
)
# Repeating the above sapply construction of log-likelihood, using a for loop
ll_via_loop <- vector(mode = "double", length = nrow(post))
for (mu_sigma in 1:nrow(post)) {
  ll_height <- vector(mode = "double", length = nrow(d2))
  for (height in 1:nrow(d2)) {
    ll_height[height] <- dnorm(x = d2$height[height], mean = post$mu[mu_sigma], sd = post$sigma[mu_sigma], log = TRUE)
  }
  ll_via_loop[mu_sigma] <- sum(ll_height)
}
# Assign to df
post$ll2 <- ll_via_loop
# Check we are returning the same results
sum(abs(post$ll - post$ll2))
# Calculating the product of the likelihood and priors (adding, since we are using logs), i.e. the
# unstandardised posterior
post$prod <-
  post$ll + # log-likelihood per above
  dnorm(x = post$mu, mean = 178, sd = 20, log = TRUE) + # prior for the mean
  dunif(x = post$sigma, min = 0, max = 50, log = TRUE) # prior for the standard deviation
# We now have an unstandardised posterior. This is scaled with the code below to produce a
# "relative posterior probability" - see McElreath p.562
post$prob <- exp(post$prod - max(post$prod))
# Note that this does not sum to 1
sum(post$prob)
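# One way to use the relative posterior probability (a sketch along the lines of the book's R code 4.17):
# sample rows of the grid in proportion to post$prob, then look at the sampled parameter values.
sample_rows <- sample(1:nrow(post), size = 1e4, replace = TRUE, prob = post$prob)
sample_mu <- post$mu[sample_rows]
sample_sigma <- post$sigma[sample_rows]
dens(sample_mu)
dens(sample_sigma)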
# Plot
contour_xyz(post$mu, post$sigma, post$prob)
image_xyz(post$mu, post$sigma, post$prob)
# The parameters associated with the maximum relative posterior probability
post[which.max(post$prob), ]
# The same model fitted with 'quap' (quadratic approximation)
m6 <- quap(
  alist(
    height ~ dnorm( mu , sigma ) ,
    mu ~ dnorm( 178 , 20 ) ,
    sigma ~ dunif( min = 0 , max = 50 )
  ) , data = d2 )
precis(m6)
# The mean and standard deviation from the grid approximation above correspond to those in
# the output of quap.
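# A rough numerical check (my own, not from the book): posterior means of mu and sigma from the grid,
# weighting each grid point by its normalised relative posterior probability, should sit close to the
# means reported by precis(m6).
w <- post$prob / sum(post$prob)
c(mu = sum(post$mu * w), sigma = sum(post$sigma * w))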
# SIDETRACK: AN INTUITIVE EXAMPLE OF LOG-LIKELIHOOD -----------------------------------------------
# Some data
data <- c(148, 149, 150, 151, 152)
# The mean and standard deviation
mean(data) # 150
sd(data) # 1.58
# LL1 - calculate the log-likelihood, first using the exact mean and sd, then with differing means and
# standard deviations
sum(dnorm(x = data, mean = 150, sd = 1.58, log = TRUE))
sum(dnorm(x = data, mean = 150, sd = 10, log = TRUE))
sum(dnorm(x = data, mean = 160, sd = 1.58, log = TRUE))
# The maximum log-likelihood is returned by the first example using the exact mean and standard deviation.
# This represents the relative plausibility of seeing the data assuming the parameters - McElreath p 37.
# Put another way, the likelihood is the relative number of ways that a set of parameters (mu & sigma)
# can produce the data - McElreath p.27. We measure the ways parameters can produce the data by assuming
# the data come from a distribution (normal in this case), and comparing the density of the data under
# different sets of parameters (this is what dnorm does).
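# A small illustration of "relative plausibility" (my own sketch, using the same data as above):
# exponentiating the difference between two log-likelihoods gives how many times more plausible the
# data are under one set of parameters than under another.
ll_true <- sum(dnorm(x = data, mean = 150, sd = 1.58, log = TRUE))
ll_off <- sum(dnorm(x = data, mean = 160, sd = 1.58, log = TRUE))
exp(ll_true - ll_off) # likelihood ratio in favour of the true parameters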
# Another intuitive example of likelihood can be found here - https://rpsychologist.com/likelihood/.
# The same example immediately above, presented in a different manner. A data frame is created with the
# same data. Columns for the mean and sd are included in order to calculate the log-likelihood. The mu and
# sigma associated with param_set = 'a' are the true mu and sigma.
df <- data.frame(
  param_set = rep(c('a', 'b', 'c'), each = 5),
  mu = c(rep(150, 10), rep(160, 5)),
  sigma = c(rep(1.6, 5), rep(7, 5), rep(1.6, 5)),
  dat = rep(148:152, 3)
)
# Calculate the log-likelihood for each data point ('dnorm' is vectorised, so no loop is needed)
df$log_lik <- dnorm(x = df$dat, mean = df$mu, sd = df$sigma, log = TRUE)
# The maximum log-likelihood is the one associated with the true mean and sd.
aggregate(. ~ param_set, data = df[,c('param_set', 'log_lik')], sum)
# This can be replicated in Excel using the NORM.DIST and LN functions.
# --------------------------------------------------------------------------------------------------------------------------
#
# AN ALTERNATE DERIVATION OF LOG-LIKELIHOOD
#
# MLE from https://stats.stackexchange.com/questions/112451/maximum-likelihood-estimation-mle-in-layman-terms
# This example calculates the likelihood using a function, and then optimises the parameters of that function
# to maximise the log-likelihood. This method CANNOT be used on its own in a Bayesian context to calculate
# posterior probabilities, since that requires "multiplying the prior probability OF p by the likelihood AT p"
# for any particular value of p.
# --------------------------------------------------------------------------------------------------------------------------
# Create mock data using parameters that we will then try to recover
n <- 200
alpha <- 9
beta <- 2
sigma <- 0.8
data <- data.frame(x = runif(n, 1, 10))
data$y <- alpha + beta*data$x + rnorm(n, 0, sigma)
plot(data$x, data$y)
# The guts of the function (for inspection)
theta <- c(alpha, beta, sigma)
y <- data$y
X <- cbind(1, data$x) # column of 1's for matrix multiplication
n <- nrow(X) # 200
k <- ncol(X) # 2
beta <- theta[1:k] # 9 2
sigma2 <- theta[k+1]^2 # Convert sd to variance
e <- y - X %*% beta # Matrix multiplication for error
logl <- -0.5 * n * log(2*pi) - 0.5 * n * log(sigma2) - ((t(e) %*% e) / (2*sigma2)) # the iid normal log-likelihood in matrix form
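# A hedged check, using the objects just created: the matrix form above should equal the sum of log
# normal densities returned by dnorm, since both are the iid normal log-likelihood
# -n/2*log(2*pi) - n/2*log(sigma2) - sum(e^2)/(2*sigma2).
all.equal(as.numeric(logl), sum(dnorm(y, mean = as.vector(X %*% beta), sd = sqrt(sigma2), log = TRUE)))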
# The function itself
linear.lik <- function(theta, y, X) {
  n <- nrow(X)
  k <- ncol(X)
  beta <- theta[1:k]
  sigma2 <- theta[k+1]^2
  e <- y - X %*% beta
  logl <- -0.5*n*log(2*pi) - 0.5*n*log(sigma2) - ((t(e) %*% e) / (2*sigma2))
  return(-logl) # negative log-likelihood, since optim minimises by default
}
# Evaluate the log-likelihood over a grid of parameter values - effectively a grid approximation of
# the likelihood surface
surface <- list()
k <- 0
for (alpha in seq(5, 10, 0.2)) {
  for (beta in seq(0, 5, 0.2)) {
    for (sigma in seq(0.1, 2, 0.1)) {
      k <- k + 1
      logL <- linear.lik(theta = c(alpha, beta, sigma), y = data$y, X = cbind(1, data$x))
      surface[[k]] <- data.frame(alpha = alpha, beta = beta, sigma = sigma, logL = -logL)
    }
  }
}
# To dataframe
surface <- do.call(rbind, surface)
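# A quick check (my own): the grid point with the highest log-likelihood should sit near the parameters
# used to generate the data (alpha = 9, beta = 2, sigma = 0.8), up to the coarseness of the grid.
surface[which.max(surface$logL), ]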
# Plot log-likelihood
library(lattice)
wireframe(logL ~ beta*sigma, surface, shade = TRUE)
wireframe(logL ~ alpha*sigma, surface, shade = TRUE)
# Find MLE using optimisation
linear.MLE <- optim(
  fn = linear.lik, par = c(1, 1, 1),
  lower = c(-Inf, -Inf, 1e-8), upper = c(Inf, Inf, Inf),
  hessian = TRUE, y = data$y, X = cbind(1, data$x), method = "L-BFGS-B"
)
linear.MLE$par
# Compare to true parameters
theta
# PARAMETER ESTIMATION USING BASE R LINEAR MODEL
lm(y ~ x, data = data)
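# A hedged cross-check (my own): the maximised log-likelihood found by optim should be close to logLik()
# of the lm fit, since lm's logLik() is also evaluated at the maximum-likelihood estimates (including the
# MLE of sigma, which divides the residual sum of squares by n rather than n - k).
-linear.MLE$value # optim minimised the negative log-likelihood
logLik(lm(y ~ x, data = data))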
# GRID APPROXIMATION FOR LOG-LIKELIHOOD - SAME DATA AS ABOVE---------------------------------------
# The initial grid approximation performed above did not include a linear model; the example below
# does.
# Grid - we will use the 'surface' data frame created above
post2 <- surface
# Log-likelihood
post2$ll <- sapply(
  X = 1:nrow(post2),
  FUN = function(i) sum(dnorm(
    x = data$y, # note that the dependent variable goes here
    mean = post2$alpha[i] + (post2$beta[i] * data$x),
    sd = post2$sigma[i],
    log = TRUE
  ))
)
# Posterior (unstandardised): log-likelihood plus log priors
post2$prod1 <-
  post2$ll + # log-likelihood per above
  dnorm(x = post2$alpha, mean = 0, sd = 5, log = TRUE) + # prior for alpha
  dnorm(x = post2$beta, mean = 0, sd = 5, log = TRUE) + # prior for beta
  dexp(x = post2$sigma, rate = 1, log = TRUE) # prior for the standard deviation
# Relative posterior probability
post2$prob1 <- exp(post2$prod1 - max(post2$prod1))
# MLE
post2[which.max(post2$ll), ]
# Check the equivalence of the two log-likelihood calculations (the dnorm sum above versus the
# matrix-form 'linear.lik' used to build 'surface')
sum(abs(post2$ll - post2$logL))
# Summarise the likelihood and posterior by averaging over the other parameters at each value of beta
ll_pr_by_beta <- aggregate(. ~ beta, data = post2, mean)
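# A rough visual of the marginal for beta (my own sketch): because the grid is regular, the average of
# the relative posterior probability at each value of beta is proportional to the summed (marginal)
# posterior, so the plot shows where the mass for the slope sits.
plot(ll_pr_by_beta$beta, ll_pr_by_beta$prob1, type = "b",
     xlab = "beta", ylab = "mean relative posterior probability")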
# SIMULATING DATA ---------------------------------------------------------------------------------
# https://www.jamesuanhoro.com/post/2018/05/07/simulating-data-from-regression-models/
library(MASS)
n <- 50
set.seed(18050518)
dat <- data.frame(xc = rnorm(n), xb = rbinom(n, 1, 0.5))
dat <- within(dat, y <- rpois(n, exp(0.5 * xc + xb)))
# Note that the one-step construction above and the two-step construction below are equivalent
dat <- within(dat, y1 <- exp(0.5 * xc + xb)) # first compute the Poisson mean...
dat$y2 <- rpois(n, dat$y1) # ...then draw from the Poisson distribution with that mean
summary(fit.p <- glm(y ~ xc + xb, poisson, dat))
# Simulate coefficients from the model, assuming they arise from a multivariate normal distribution
coefs <- mvrnorm(n = 1e4, mu = coefficients(fit.p), Sigma = vcov(fit.p)) # same idea as rethinking::extract.samples
# Check the model coefficients against the means of those simulated, see also McElreath p.90
coefficients(fit.p)
colMeans(coefs)
# Check model standard errors against those simulated
sqrt(diag(vcov(fit.p)))
apply(coefs, 2, sd)
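# A further hedged comparison (my own): 95% intervals from the simulated coefficients, which assume
# multivariate normality, against confint(), which profiles the likelihood, so the two should be close
# but not identical.
t(apply(coefs, 2, quantile, probs = c(0.025, 0.975)))
confint(fit.p)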
# Simulate data from the model: one simulated outcome vector per sampled coefficient vector
sim.dat <- matrix(nrow = n, ncol = nrow(coefs))
fit.p.mat <- model.matrix(fit.p) # same as adding a column of 1's (labelled "(Intercept)") to the predictors
for (i in 1:nrow(coefs)) {
  sim.dat[, i] <- rpois(n, exp(fit.p.mat %*% coefs[i, ]))
}
c(mean(dat$y), var(dat$y)) # Mean and variance of original outcome
c(mean(colMeans(sim.dat)), mean(apply(sim.dat, 2, var))) # average of mean and var of 10,000 simulated outcomes
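# A simple posterior-predictive style check (my own addition, using sim.dat from above): compare the
# observed mean of y with the distribution of the simulated means.
hist(colMeans(sim.dat), main = "Means of simulated outcomes", xlab = "mean(y)")
abline(v = mean(dat$y), col = "red", lwd = 2)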
# SPARE CODE --------------------------------------------------------------------------------------
# LINEAR REGRESSION USING GRID APPROXIMATION
# Re-performing the model below ('m6.2') using a custom grid approximation. Note that this relies on the
# data frame 'd3', which is created in the "Linear regression example from the book" block further down,
# so run that block first.
# This is a useful reference (p.70) - https://cran.r-project.org/doc/contrib/Robinson-icebreaker.pdf
# Useful resource for grid approximation with more than two parameters (p.18+) - http://patricklam.org/teaching/grid_print.pdf
# Make a grid for the parameters
post1 <- expand.grid(
  alpha = seq(from = 0, to = 2, length.out = 40),
  beta = seq(from = 0, to = 3, length.out = 40),
  sigma = seq(from = 0.001, to = 1, length.out = 40)
)
post1$ll <- sapply(
  X = 1:nrow(post1),
  FUN = function(i) sum(dnorm(
    x = d3$height,
    mean = post1$alpha[i] + (post1$beta[i] * d3$leg_left), # the linear model: height regressed on leg_left
    sd = post1$sigma[i],
    log = TRUE
  ))
)
# Calculating the product of the likelihood and priors (adding, since we are using logs), i.e. the
# unstandardised posterior
post1$prod <-
  post1$ll + # log-likelihood per above
  dnorm(x = post1$alpha, mean = 1, sd = 3, log = TRUE) + # prior for alpha
  dnorm(x = post1$beta, mean = 2, sd = 4, log = TRUE) + # prior for beta
  dunif(x = post1$sigma, min = 0, max = 5, log = TRUE) # prior for the standard deviation
# Standardise
post1$prob <- exp(post1$prod - max(post1$prod))
# The parameters with the highest relative posterior probability on the grid (maximum a posteriori)
post1[which.max(post1$prob), ]
# Linear regression example from the book
set.seed(909)
N <- 100
height <- rnorm(N, 10, 2) # total height of each
leg_prop <- runif(N, 0.4, 0.5) # leg length as proportion of height, roughly 50%
leg_left <- leg_prop * height + rnorm(N, 0, 0.02) # leg as proportion + error
leg_right <- leg_prop * height + rnorm(N, 0, 0.02) # leg as proportion + error
d3 <- data.frame(height, leg_left)
## R code 6.7, estimated using quadratic approximation with 'quap'
m6.2 <- quap(
  alist(
    height ~ dnorm( mu , sigma ) ,
    mu <- a + bl * leg_left ,
    a ~ dnorm( 10 , 100 ) ,
    bl ~ dnorm( 2 , 10 ) ,
    sigma ~ dexp( 1 )
  ) , data = d3 )
precis(m6.2)
# Re-perform with base R lm OLS
lm(height ~ leg_left, data = d3)
library(rethinking)
options(mc.cores = parallel::detectCores())
# Simulate career choice among three careers using a softmax over income-based scores (a multinomial
# example from a later chapter of the book)
N <- 500
income <-c(1,2,5)
score <- 0.5 * income
p <- softmax(score[1], score[2], score[3])
career <- rep(NA, N)
set.seed(34302)
for (i in 1:N) career[i] <- sample(1:3, size = 1, prob = p)
# Plot a beta distribution parameterised by its mean (pbar) and concentration (theta) via rethinking's
# dbeta2, as used for beta-binomial models later in the book
pbar <- 0.5
theta <- 1
curve(dbeta2(x, pbar, theta), from = 0, to = 1, xlab = 'probability', ylab = 'Density')