-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcov_balancing_propensity_score_further_try.R
119 lines (106 loc) · 7.06 KB
/
cov_balancing_propensity_score_further_try.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
library(tidyverse)
library(CBPS)
library(rstanarm)
# This script trains some Bayesian logistic regression models used in the final submission (see the same file name
# with "_cv" included for some cross-validation: compared to the other similar script, some extra covariates are added
# (further explanations below remain the same)
#################################################################
# Read data, some model specific feature engineering
#################################################################
adat1 <- read_csv(file="/files/feat_bjoern_trial.csv") %>%
dplyr::select(-row_id, -starts_with("intsponsor"), -starts_with("intpers"), -intidentifiedsites) %>%
group_by(DrugKey, indicationkey) %>%
summarize_if(is.numeric, median) %>% # categorical: names(sort(summary(as_factor(c(0,1,1,2,3))), decreasing = T))[1]
ungroup() %>%
dplyr::select(-newta)
adat2 <- read_csv(file="/files/feat_bjoern_trial.csv") %>%
dplyr::select(-row_id, -starts_with("intsponsor"), -starts_with("intpers"), -intidentifiedsites) %>%
#mutate(newta = ifelse(!is.na(newta), newta, ifelse( strDiseaseType == "[Type 2 Diabetes|NAFLD]", )))
group_by(DrugKey, indicationkey) %>%
summarize(newta = as.numeric(names(sort(summary(as_factor(newta)), decreasing = T, na.last = T)[1]))) %>%
ungroup()
adat <- left_join(adat1, adat2, by=c("DrugKey", "indicationkey")) %>%
ungroup() %>%
mutate(prediction_task = case_when(
insulin + fluvacc>0 ~ 1,
is_a_generic>0 ~ 2,
time_since_first_outcome==0 & prior_approval==0 ~ 3,
prior_approval == 1 ~ 4,
time_since_first_outcome>0 ~ 5,
TRUE ~ 6),
efficacy_assessed = ifelse(is.na(efficacy_assessed),0 , efficacy_assessed),
pk_aspects_na = 1L*is.na(pk_aspects),
pk_aspects = ifelse(is.na(pk_aspects),mean(pk_aspects, na.rm=T) , pk_aspects),
ta7byorph = ta7*orphan_indication,
myclass = as_factor(1L*is.na(outcome)),
dtmeanclu50_t345 = scale( dtmeanclu50 * (prediction_task)>=3),
dtmeancll50_t345 = scale( dtmeancll50 * (prediction_task)>=3),
tameanclu50_t345 = scale( tameanclu50* (prediction_task)>=3),
tameancll50_t345 = scale( tameancll50* (prediction_task)>=3),
dtmeanclu50_t345n = scale( dtmeanclu50 * (prediction_task>=3)),
dtmeancll50_t345n = scale( dtmeancll50 * (prediction_task>=3)),
tameanclu50_t345n = scale( tameanclu50* (prediction_task>=3)),
tameancll50_t345n = scale( tameancll50* (prediction_task>=3)),
orphan_indication_t345 = orphan_indication * (prediction_task>=3),
rel_ph2_size_dis_t345 = rel_ph2_size_dis * (prediction_task>=3),
ta7_t345 = ta7 * (prediction_task>=3),
ta7_orph_t345 = ta7*any_oncology_orphan_designation * (prediction_task>=3),
newta_t345 = newta * (prediction_task>=3),
monoclonal_no_inn_t35 = monoclonal_no_inn * (prediction_task %in% c(3,5)),
unwilling_to_pay_12k_t35 = unwilling_to_pay_12k * (prediction_task %in% c(3,5)),
prediction_task = as_factor(prediction_task),
newta = as_factor(newta),
mean_trialendscore = scale(mean_trialendscore),
rel_ph2_size_dis_t345 = scale(rel_ph2_size_dis_t345)
) %>%
mutate_if(is.numeric , replace_na, replace = 0) %>%
left_join( distinct(dplyr::select(read_csv(file="/files/feat_bjoern_trial.csv"),
DrugKey, indicationkey, GenericName, strDiseaseType)), by=c("DrugKey", "indicationkey")) %>%
ungroup()
####################################################
# Obtain Covariate-balancing propensity score
####################################################
cbps1 <- CBPS( myclass ~ newta1 + newta2 + newta3 + newta4 + newta5 + newta6 + newta7 + newta8 + newta9 + newta10 +
newta11 + newta12 + newta13 + newta14 + newta15 + newta16 + newta17 + newta18 + newta19 + newta20 +
+ newta21 + newta22 + newta23 + pk_aspects + time_since_first_outcome +
prediction_task*(mean_trialendscore + spons4) + tameancll50 + tameanclu50 + orphan_indication + any_oncology_orphan_designation +
ta7 + ta7*any_oncology_orphan_designation + any_nononc_orphan_designation +
+ rel_ph2_size_dis + safety + mean_trialendscore + monoclonal_no_inn + pk_aspects + after2007 +
drugtype1 + drugtype2 + drugtype4 + drugtype5 + drugtype6 + drugtype7 + drugtype8 + drugtype9 + unwilling_to_pay_12k_t35 +
+ fluvacc + insulin + rel_ph2_size_dis_t345 + monoclonal_no_inn_t35 + tameanclu50_t345n + tameancll50_t345n,
data=filter(adat, phaseendyear<2018))
adat <- adat %>%
mutate( propensity = 0)
adat$propensity[adat$phaseendyear<2018] = cbps1$fitted.values
adat %>%
ggplot(aes(x=phaseendyear, y=propensity, col=as_factor(myclass))) + geom_jitter(alpha=0.6)
######################################################
# Bayesian logistic regression fitting using rstanarm
######################################################
## Main model used in final sumbission, used more MCMC samples than other fits below (seems to help a little)
# Fit Bayesian random effects logistic regression model with somewhat informative priors
# even limit to 2010 or later
sglmerfit1b <- stan_glmer(outcome ~ (1 | newta_t345) + prediction_task*mean_trialendscore + rel_ph2_size_dis_t345 + monoclonal_no_inn_t35 + ta7 +
tameanclu50_t345n + unwilling_to_pay_12k_t35,
# + ta7_orph_t345 + orphan_indication_t345#spons4) + # fluvacc + insulin +
family=binomial(link="logit"),
data=filter(adat, myclass==0 & phaseendyear>=2010),
weights = propensity,
chains=4, cores=4,
iter=10000,
seed=2020,
prior_intercept=normal(-2.5, 0.5, autoscale=FALSE), # The mean of the prior corresponds to about 7.5% approvals,
# (last year of training data has 6.45%, but previous year higher)
# SD of 0.5 leads to + SD being ~ 18% (reasonable somewhat informed prior)
prior_aux = exponential(1, autoscale=FALSE),
prior = student_t( # Follow prior recommendations of https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations
# for student_t priors, but make them somehwat informative (location=0.5 or -0.5 if I have a good guess,
# -2.5 for stuff where I more or less assume deterministic relationship)
# Things I have opionions on:
# bad: drug with previous failed indications, poor Phase 2 outcome (e.g. stopped for safety)
# very bad: no INN requsted for a monoclonal antibody
# good: large Phase 2 by standards of the disease area
df=4,
location=c(0,0,0,-0.5, -0.5,0.5,-2.5,0, 0.5,-0.5, -0.5,-0.5,-0.5,-0.5), #-0.5
scale=1))
write_rds(sglmerfit1b, "/home/desktop1/Documents/holzhbj1/novartis-dsai-challenge-starter-kit/sglmerfit1f.rds", compress = "bz2")