# 15. Outcome regression and propensity scores{-}
```{r setup, include=FALSE}
knitr::opts_chunk$set(collapse = TRUE, comment = '#>')
```
## Program 15.1
- Estimating the average causal effect within levels of confounders under the assumption of effect-measure modification by smoking intensity ONLY
- Data from NHEFS
```{r, results='hide', message=FALSE, warning=FALSE}
library(here)
```
```{r}
#install.packages("readxl") # install package if required
library("readxl")
nhefs <- read_excel(here("data", "NHEFS.xls"))
nhefs$cens <- ifelse(is.na(nhefs$wt82), 1, 0)
# regression on covariates, allowing for some effect modification
fit <- glm(wt82_71 ~ qsmk + sex + race + age + I(age*age) + as.factor(education)
           + smokeintensity + I(smokeintensity*smokeintensity) + smokeyrs
           + I(smokeyrs*smokeyrs) + as.factor(exercise) + as.factor(active)
           + wt71 + I(wt71*wt71) + I(qsmk*smokeintensity), data=nhefs)
summary(fit)
# (step 1) build the contrast matrix with all zeros
# this function builds the blank matrix
# install.packages("multcomp") # install packages if necessary
library("multcomp")
makeContrastMatrix <- function(model, nrow, names) {
  m <- matrix(0, nrow = nrow, ncol = length(coef(model)))
  colnames(m) <- names(coef(model))
  rownames(m) <- names
  return(m)
}
K1 <-
  makeContrastMatrix(
    fit,
    2,
    c(
      'Effect of Quitting Smoking at Smokeintensity of 5',
      'Effect of Quitting Smoking at Smokeintensity of 40'
    )
  )
# (step 2) fill in the relevant non-zero elements
K1[1:2, 'qsmk'] <- 1
K1[1:2, 'I(qsmk * smokeintensity)'] <- c(5, 40)
# (step 3) check the contrast matrix
K1
# (step 4) estimate the contrasts, get tests and confidence intervals for them
estimates1 <- glht(fit, K1)
summary(estimates1)
confint(estimates1)
# regression on covariates, not allowing for effect modification
fit2 <- glm(wt82_71 ~ qsmk + sex + race + age + I(age*age) + as.factor(education)
            + smokeintensity + I(smokeintensity*smokeintensity) + smokeyrs
            + I(smokeyrs*smokeyrs) + as.factor(exercise) + as.factor(active)
            + wt71 + I(wt71*wt71), data=nhefs)
summary(fit2)
```
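The contrasts in `K1` can also be computed by hand from the fitted coefficients and their variance-covariance matrix, which is essentially what `glht()` does (note that `confint()` on a `glht` object returns simultaneous intervals by default, so its limits can be slightly wider than these unadjusted Wald intervals). A minimal base-R sketch using the `fit` and `K1` objects defined above:

```{r}
# Wald estimates and 95% CIs for the linear combinations K1 %*% beta
beta <- coef(fit)
vcov.fit <- vcov(fit)
estimate <- as.vector(K1 %*% beta)
std.err <- sqrt(diag(K1 %*% vcov.fit %*% t(K1)))
data.frame(row.names = rownames(K1),
           estimate,
           conf.low = estimate - qnorm(0.975) * std.err,
           conf.high = estimate + qnorm(0.975) * std.err)
```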
## Program 15.2
- Estimating and plotting the propensity score
- Data from NHEFS
```{r, out.width="85%", fig.align='center'}
fit3 <- glm(qsmk ~ sex + race + age + I(age*age) + as.factor(education)
            + smokeintensity + I(smokeintensity*smokeintensity) + smokeyrs
            + I(smokeyrs*smokeyrs) + as.factor(exercise) + as.factor(active)
            + wt71 + I(wt71*wt71), data=nhefs, family=binomial())
summary(fit3)
nhefs$ps <- predict(fit3, nhefs, type="response")
summary(nhefs$ps[nhefs$qsmk==0])
summary(nhefs$ps[nhefs$qsmk==1])
# plotting the estimated propensity score
# install.packages("ggplot2") # install packages if necessary
# install.packages("dplyr")
library("ggplot2")
library("dplyr")
nhefs <- nhefs %>% mutate(qsmklabel = ifelse(qsmk == 1,
                                             yes = 'Quit Smoking 1971-1982',
                                             no = 'Did Not Quit Smoking 1971-1982'))

ggplot(nhefs, aes(x = ps, fill = qsmklabel, color = qsmklabel)) +
  geom_density(alpha = 0.2) +
  xlab('Probability of Quitting Smoking During Follow-up') +
  ggtitle('Propensity Score Distribution by Treatment Group') +
  theme(legend.position = 'bottom', legend.direction = 'vertical',
        legend.title = element_blank())

# alternative plot with histograms
ggplot(nhefs, aes(x = ps, fill = as.factor(qsmk), color = as.factor(qsmk))) +
  geom_histogram(alpha = 0.3, position = 'identity', bins=15) +
  facet_grid(as.factor(qsmk) ~ .) +
  xlab('Probability of Quitting Smoking During Follow-up') +
  ggtitle('Propensity Score Distribution by Treatment Group') +
  scale_fill_discrete('') +
  scale_color_discrete('') +
  theme(legend.position = 'bottom', legend.direction = 'vertical')
```
```{r, eval=FALSE, out.width="85%", fig.align='center'}
# attempt to reproduce plot from the book
nhefs %>%
  mutate(ps.grp = round(ps/0.05) * 0.05) %>%
  group_by(qsmk, ps.grp) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  mutate(n2 = ifelse(qsmk == 0, yes = n, no = -1*n)) %>%
  ggplot(aes(x = ps.grp, y = n2, fill = as.factor(qsmk))) +
  geom_bar(stat = 'identity', position = 'identity') +
  geom_text(aes(label = n, x = ps.grp, y = n2 + ifelse(qsmk == 0, 8, -8))) +
  xlab('Probability of Quitting Smoking During Follow-up') +
  ylab('N') +
  ggtitle('Propensity Score Distribution by Treatment Group') +
  scale_fill_discrete('') +
  scale_x_continuous(breaks = seq(0, 1, 0.05)) +
  theme(legend.position = 'bottom', legend.direction = 'vertical',
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank())
```
## Program 15.3
- Stratification on the propensity score
- Data from NHEFS
```{r}
# calculation of deciles
nhefs$ps.dec <- cut(nhefs$ps,
                    breaks=c(quantile(nhefs$ps, probs=seq(0,1,0.1))),
                    labels=1:10,
                    include.lowest=TRUE)
#install.packages("psych") # install package if required
library("psych")
describeBy(nhefs$ps, list(nhefs$ps.dec, nhefs$qsmk))
# function to create deciles easily
# (quantcut() comes from the gtools package; run library("gtools") before calling decile())
decile <- function(x) {
  return(factor(quantcut(x, seq(0, 1, 0.1), labels = FALSE)))
}
# stratification on PS deciles, allowing for effect modification
# (t-test of the outcome by qsmk within each decile)
for (deciles in c(1:10)) {
  print(t.test(wt82_71~qsmk, data=nhefs[which(nhefs$ps.dec==deciles),]))
}
# regression on PS deciles, not allowing for effect modification
fit.psdec <- glm(wt82_71 ~ qsmk + as.factor(ps.dec), data = nhefs)
summary(fit.psdec)
confint.lm(fit.psdec)
```
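The `decile()` helper defined above is not used in the rest of this program; assuming the gtools package (which provides `quantcut()`) is installed, a quick cross-tabulation shows that it closely reproduces the deciles obtained with `cut()`:

```{r, eval=FALSE}
# install.packages("gtools") # install package if required
library("gtools")
# compare the helper's deciles with ps.dec computed above
table(decile(nhefs$ps), nhefs$ps.dec)
```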
## Program 15.4
- Standardization using the propensity score
- Data from NHEFS
```{r}
#install.packages("boot") # install package if required
library("boot")
# standardization by propensity score, agnostic regarding effect modification
std.ps <- function(data, indices) {
  d <- data[indices, ] # bootstrap sample
  # calculating propensity scores in the bootstrap sample
  ps.fit <- glm(qsmk ~ sex + race + age + I(age*age)
                + as.factor(education) + smokeintensity
                + I(smokeintensity*smokeintensity) + smokeyrs
                + I(smokeyrs*smokeyrs) + as.factor(exercise)
                + as.factor(active) + wt71 + I(wt71*wt71),
                data=d, family=binomial())
  d$pscore <- predict(ps.fit, d, type="response")
  # create a dataset with 3 copies of each subject
  d$interv <- -1 # 1st copy: equal to original one
  d0 <- d        # 2nd copy: treatment set to 0, outcome to missing
  d0$interv <- 0
  d0$qsmk <- 0
  d0$wt82_71 <- NA
  d1 <- d        # 3rd copy: treatment set to 1, outcome to missing
  d1$interv <- 1
  d1$qsmk <- 1
  d1$wt82_71 <- NA
  d.onesample <- rbind(d, d0, d1) # combining datasets
  # outcome model with treatment, propensity score, and their product term
  std.fit <- glm(wt82_71 ~ qsmk + pscore + I(qsmk*pscore), data=d.onesample)
  d.onesample$predicted_meanY <- predict(std.fit, d.onesample)
  # estimate the mean outcome in the groups interv=-1, interv=0, and interv=1,
  # and the difference between interv=1 and interv=0
  return(c(mean(d.onesample$predicted_meanY[d.onesample$interv==-1]),
           mean(d.onesample$predicted_meanY[d.onesample$interv==0]),
           mean(d.onesample$predicted_meanY[d.onesample$interv==1]),
           mean(d.onesample$predicted_meanY[d.onesample$interv==1])-
             mean(d.onesample$predicted_meanY[d.onesample$interv==0])))
}
# bootstrap
results <- boot(data=nhefs, statistic=std.ps, R=5) # R=5 keeps this quick to run; use a much larger R (e.g. 500+) in practice
# generating confidence intervals
se <- c(sd(results$t[,1]), sd(results$t[,2]),
        sd(results$t[,3]), sd(results$t[,4]))
mean <- results$t0
ll <- mean - qnorm(0.975)*se
ul <- mean + qnorm(0.975)*se
bootstrap <- data.frame(parameter = c("Observed", "No Treatment", "Treatment",
                                      "Treatment - No Treatment"),
                        mean, se, ll, ul)
bootstrap
```
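The `boot` package can also produce interval estimates directly from the `results` object; for example, a normal-approximation interval for the fourth component (the treatment minus no-treatment difference). `boot.ci()` subtracts the estimated bootstrap bias, so its limits can differ slightly from the manual calculation above, and with only `R=5` replicates any interval is unstable:

```{r}
# normal-approximation CI for component 4 (Treatment - No Treatment)
boot.ci(results, type = "norm", index = 4)
```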
```{r}
# regression on the propensity score (linear term)
model6 <- glm(wt82_71 ~ qsmk + ps, data = nhefs)
summary(model6)
# standardization on the propensity score
# (step 1) create two new datasets, one with all treated and one with all untreated
treated <- nhefs
treated$qsmk <- 1
untreated <- nhefs
untreated$qsmk <- 0
# (step 2) predict values for everyone in each new dataset based on above model
treated$pred.y <- predict(model6, treated)
untreated$pred.y <- predict(model6, untreated)
# (step 3) compare the mean predicted weight change had everyone been treated vs. had everyone been untreated
mean1 <- mean(treated$pred.y, na.rm = TRUE)
mean0 <- mean(untreated$pred.y, na.rm = TRUE)
mean1
mean0
mean1 - mean0
# (step 4) bootstrap a confidence interval
# number of bootstraps
nboot <- 100
# set up a matrix to store results
boots <- data.frame(i = 1:nboot,
                    mean1 = NA,
                    mean0 = NA,
                    difference = NA)
# loop to perform the bootstrapping
nhefs <- subset(nhefs, !is.na(ps) & !is.na(wt82_71)) # restrict to subjects with non-missing ps and outcome
for(i in 1:nboot) {
  # sample with replacement
  sampl <- nhefs[sample(1:nrow(nhefs), nrow(nhefs), replace = TRUE), ]
  # fit the model in the bootstrap sample
  bootmod <- glm(wt82_71 ~ qsmk + ps, data = sampl)
  # create new datasets
  sampl.treated <- sampl %>%
    mutate(qsmk = 1)
  sampl.untreated <- sampl %>%
    mutate(qsmk = 0)
  # predict values
  sampl.treated$pred.y <- predict(bootmod, sampl.treated)
  sampl.untreated$pred.y <- predict(bootmod, sampl.untreated)
  # output results
  boots[i, 'mean1'] <- mean(sampl.treated$pred.y, na.rm = TRUE)
  boots[i, 'mean0'] <- mean(sampl.untreated$pred.y, na.rm = TRUE)
  boots[i, 'difference'] <- boots[i, 'mean1'] - boots[i, 'mean0']
  # once the loop is done, print the results
  if(i == nboot) {
    cat('95% CI for the causal mean difference\n')
    cat(mean(boots$difference) - 1.96*sd(boots$difference),
        ',',
        mean(boots$difference) + 1.96*sd(boots$difference))
  }
}
```
A more flexible and elegant way to do this is to write a function to perform the model fitting, prediction, bootstrapping, and reporting all at once.
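For example, a sketch of such a function (the name `std_ps_boot` and its arguments are illustrative; it assumes the `ps` column created in Program 15.2 and the `boot` package loaded above):

```{r, eval=FALSE}
std_ps_boot <- function(data, nboot = 100) {
  # statistic passed to boot(): refit the outcome model on the bootstrap
  # sample and standardize over that sample
  stat <- function(d, indices) {
    s <- d[indices, ]
    mod <- glm(wt82_71 ~ qsmk + ps, data = s)
    s.treated <- s
    s.treated$qsmk <- 1
    s.untreated <- s
    s.untreated$qsmk <- 0
    mean1 <- mean(predict(mod, s.treated), na.rm = TRUE)
    mean0 <- mean(predict(mod, s.untreated), na.rm = TRUE)
    c(mean1 = mean1, mean0 = mean0, difference = mean1 - mean0)
  }
  out <- boot(data = data, statistic = stat, R = nboot)
  se <- apply(out$t, 2, sd)
  data.frame(estimate = out$t0,
             std.error = se,
             conf.low = out$t0 - qnorm(0.975) * se,
             conf.high = out$t0 + qnorm(0.975) * se)
}
std_ps_boot(nhefs, nboot = 100)
```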