-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_read_data_redcap.R
344 lines (325 loc) · 16.5 KB
/
1_read_data_redcap.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# 1_read_data_redcap.R
# read the LOCATE data from REDCap using the API; calculate additional variables; add information from study scans
# May 2023
library(dplyr)
library(tidyr)
library(janitor)
library(redcapAPI) # to directly get data from REDCap
library(RCurl) # for making the API request
library(jsonlite) # for converting JSON data
library(stringr)
library(eq5d)
library(readxl)
library(openxlsx) # for exporting data for checking
source('99_functions.R')
source('API/config.R') # for key
## Section 1: REDCap data dictionary ##
dictionary = read.csv('data/LOCATENAFLD_DataDictionary_2023-05-04.csv', stringsAsFactors = FALSE) %>%
clean_names() %>%
filter(!str_detect(variable_field_name, pattern='_instr_'), # remove instructions that are not variables
!variable_field_name == 'audit_score_result') # remove this derived variable
# get variables of particular types:
dates = filter(dictionary, text_validation_type_or_show_slider_number=='date_dmy') %>% pull(variable_field_name)
numbers1 = filter(dictionary, text_validation_type_or_show_slider_number=='number') %>% pull(variable_field_name)
numbers2 = filter(dictionary, str_detect(string=variable_field_name, pattern='eq5d_')) %>% pull(variable_field_name)
numbers = union(numbers1, numbers2)
yes_no1 = filter(dictionary, choices_calculations_or_slider_labels=='1, Yes | 0, No') %>% pull(variable_field_name)
yes_no2 = filter(dictionary, field_type=='yesno') %>% pull(variable_field_name)
yes_no = c(yes_no1, yes_no2)
yes_no_unclear = filter(dictionary, choices_calculations_or_slider_labels=='1, Yes | 0, No | 2, Unclear') %>% pull(variable_field_name)
drinks = filter(dictionary, choices_calculations_or_slider_labels=='0, Never | 1, Less than monthly | 2, Monthly | 3, Weekly | 4, Daily or almost daily') %>% pull(variable_field_name)
audit = c('how_often','standard_drinks_day','six_or_more','not_stop_once_started','failed_normal_expectations','morning_drinking','guilt_remorse','unable_to_remember','you_someone_else_injured','relative_hp_concerned')
## Section 2: study scans data
load('data/0_scans.RData')
scans = select(scans, record_id, exam_date, e_median_k_pa) # just keep the exam date and KPA
##
## Section 3: get data from REDCap via API ##
rcon = redcapConnection(url = api_url, token = api_token_locate)
result = exportRecords(rcon, forms = NULL)
## process and tidy-up the data
data = clean_names(result) %>%
select(-name, -address, -phone_number, -participant_contacted, -interest_to_participate, -patient_gp, -ends_with('complete'), -pregnant, -leave_area) %>% # variable not needed; no patients were pregnant; none leave area
separate(record_id, into=c('centre','num'), sep = '-', remove=FALSE) %>% # extract centre number; keep original record id variable
mutate(centrename = centre_name(centre),
year_referral = as.numeric(format(date_of_referral, '%Y')),
year_today = as.numeric(format(date_today, '%Y'))) %>%
mutate(referral_time = as.numeric(date_returned - date_of_referral), # time between randomised and GP referral
bmi_calc = weight / ((height/100)^2), # calculated BMI
bmi_calc = as.numeric(bmi_calc), # remove label
body_mass_index = as.numeric(body_mass_index), # remove label
bmi = coalesce(body_mass_index, bmi_calc), # is sometimes missing for both, so combine two sources
year_now = coalesce(year_referral, year_today),
year_born = ifelse(is.na(year_born_baseline) == TRUE, year_born_referral, year_born_baseline), # get year born from two source
age = year_now - year_born,
age = as.numeric(age)) %>% # remove label
select(-starts_with('year_'), -body_mass_index, -bmi_calc, -audit_score_result) # drop some variables not needed
## combine two tick box results into one variable from referral letter
data = mutate(data,
bmi_6m = case_when(
bmi_6m_0 == 'Checked' ~ 1,
bmi_6m_1 == 'Checked' ~ 0,
bmi_6m_0 == 'Unchecked' & bmi_6m_1 == 'Unchecked' ~ NA_real_
),
fbc_6m = case_when(
fbc_6m_0 == 'Checked' ~ 1,
fbc_6m_1 == 'Checked' ~ 0,
fbc_6m_0 == 'Unchecked' & fbc_6m_1 == 'Unchecked' ~ NA_real_
),
liver_6m = case_when(
liver_6m_0 == 'Checked' ~ 1,
liver_6m_1 == 'Checked' ~ 0,
liver_6m_0 == 'Unchecked' & liver_6m_1 == 'Unchecked' ~ NA_real_
),
hep_6m = case_when(
hep_6m_0 == 'Checked' ~ 1,
hep_6m_1 == 'Checked' ~ 0,
hep_6m_0 == 'Unchecked' & hep_6m_1 == 'Unchecked' ~ NA_real_
),
iron_6m = case_when(
iron_6m_0 == 'Checked' ~ 1,
iron_6m_1 == 'Checked' ~ 0,
iron_6m_0 == 'Unchecked' & iron_6m_1 == 'Unchecked' ~ NA_real_
),
us_6m = case_when(
us_6m_0 == 'Checked' ~ 1,
us_6m_1 == 'Checked' ~ 0,
us_6m_0 == 'Unchecked' & us_6m_1 == 'Unchecked' ~ NA_real_
)) %>%
select(-ends_with('_0'), -'bmi_6m_1', -'fbc_6m_1', -'liver_6m_1', -'hep_6m_1', -'iron_6m_1', -'us_6m_1') # can't use general '_1' because of data in notes form
# convert variables
data = mutate(data,
across(any_of(dates), as.Date), # convert dates
across(any_of(numbers), as.numeric), # convert to numbers
across(any_of(yes_no), yesno), # convert yes/no
how_often = audit_score(how_often), # audit not working in `across`
standard_drinks_day = audit_score(standard_drinks_day),
six_or_more = audit_score(six_or_more),
not_stop_once_started = audit_score(not_stop_once_started),
failed_normal_expectations = audit_score(failed_normal_expectations),
morning_drinking = audit_score(morning_drinking),
guilt_remorse = audit_score(guilt_remorse),
unable_to_remember = audit_score(unable_to_remember),
you_someone_else_injured = audit_score(you_someone_else_injured),
relative_hp_concerned = audit_score(relative_hp_concerned)
)
## export liver size for hand cleaning, just do once ...
export_liver = function(){
export = select(data, record_id, liver_size) %>%
filter(!is.na(liver_size),
liver_size != 'not stated') %>%
mutate(size = NA)
wb <- createWorkbook()
addWorksheet(wb, 1)
writeData(wb, sheet = 1, export)
saveWorkbook(wb, "data/liver_size.xlsx", overwrite = TRUE)
}
# ... now read in cleaned liver data and merge with original data
data = select(data, -liver_size) # remove liver size from original data
cleaned = read_excel('data/liver_size_24042023.xlsx', sheet='for_r') %>%
clean_names() %>%
select(-liver_size) %>% # remove old version
rename('liver_size' = 'size_cm') # rename new version
data = mutate(data, record_id = as.character(record_id)) # for merge
data = left_join(data, cleaned, by='record_id')
## AUDIT alcohol score
audit_score = select(data, record_id, all_of(audit)) %>%
mutate_if(is.factor, as.numeric) %>%
pivot_longer(cols = all_of(audit)) %>%
filter(!is.na(value)) %>% # ignore missing, create score from what is available
group_by(record_id) %>%
summarise(n = n(), audit = sum(value)) %>%
ungroup() %>%
filter(n >= 4) %>% # must have answered 4 or more questions (most were fully complete)
select(-n)
data = left_join(data, audit_score, by='record_id')
## calculate EQ-5D-3L
# a) at baseline
scores.df = select(data, record_id, contains('eq5d')) %>%
rename('MO' = 'eq5d_mb_b', # mobility
'SC' = 'eq5d_sc_b', # self-care
'UA' = 'eq5d_ua_b', # usual activities
'PD' = 'eq5d_pd_b', # pain/discomfort
'AD' = 'eq5d_ad_b') %>% # anxiety/depression
select(record_id, MO, SC, UA, PD, AD) %>%
drop_na() # remove all missing
scores.df.baseline = mutate(scores.df,
eq5d_b = eq5d(scores.df, country="Australia", version="3L", type="TTO"),
eq5d_b = eq5d_b*100 # make eq-5d on 0 to 100 scale
) %>%
select(record_id, eq5d_b)
# b) at follow-up
scores.df = select(data, record_id, contains('eq5d')) %>%
rename('MO' = 'eq5d_mb_12m', # mobility
'SC' = 'eq5d_sc_12m', # self-care
'UA' = 'eq5d_ua_12m', # usual activities
'PD' = 'eq5d_pd_12m', # pain/discomfort
'AD' = 'eq5d_ad_12m') %>% # anxiety/depression
select(record_id, MO, SC, UA, PD, AD) %>%
drop_na() # remove all missing
#
scores.df.fu = mutate(scores.df,
eq5d_12m = eq5d(scores.df, country="Australia", version="3L", type="TTO"),
eq5d_12m = eq5d_12m*100 # make eq-5d on 0 to 100 scale
) %>%
select(record_id, eq5d_12m)
# add scores back to data
data = left_join(data, scores.df.baseline, by='record_id')
data = left_join(data, scores.df.fu, by='record_id') #
## secondary outcome, date of successful fibroscan,
# take earliest date from patient notes or study data; keep kpa data associated with scan
notes = select(data, record_id, fibroscan_date_2, kpa) %>%
mutate(kpa = as.numeric(kpa)) %>%
rename('exam_date' = 'fibroscan_date_2')
both_scans = rename(scans, 'kpa' = 'e_median_k_pa') %>%
bind_rows(notes, .id = 'source') %>%
filter(!is.na(exam_date), !is.na(kpa)) %>% # must have date and KPA result
arrange(record_id, exam_date) %>%
group_by(record_id) %>%
slice(1) %>% # take earliest reading
ungroup() %>%
select(-source) %>% # no longer needed
rename('KPA' = 'kpa',
'scan_date' = 'exam_date')
# now add this information to the main data
data = left_join(data, both_scans, by = "record_id") %>%
mutate(kpa = as.numeric(kpa),
kpa2 = coalesce(kpa, KPA)) %>% # take whichever KPA reading is available
select(-kpa, -KPA) %>%
rename('kpa' = 'kpa2')
## primary outcome using fibroscan date and KPA data
# first flag missing scan dates
miss = filter(data,
fibroscan == 'Yes (successful)' &
!is.na(randomised), # must have been randomised
is.na(fibroscan_date_2)) %>%
select(record_id)
miss # should be empty
# clinically significant fibrosis is over 8
data = mutate(data,
primary_date = case_when(
is.na(scan_date) ~ NA_real_,
is.na(kpa) ~ NA_real_,
!is.na(scan_date) & !is.na(kpa) & kpa < 8 ~ NA_real_,
!is.na(scan_date) & !is.na(kpa) & kpa >= 8 ~ as.numeric(scan_date) # 8 and above is high risk
),
primary_date = as.Date(primary_date, origin='1970-01-01'))
#
## fixes to exclusions and a few other small edits
data = mutate(data,
record_id = as.character(record_id), # remove label for merging
# randomised but excluded - set randomised to blank
randomised = as.character(randomised),
randomised = ifelse(!is.na(randomised) & !is.na(exclude_reason), NA, randomised),
# a few more who never returned questionnaire but not marked in REDCap
exclude_reason = as.character(exclude_reason),
exclude_reason = ifelse(is.na(randomised) == TRUE & withdrew_or_excluded ==TRUE, 'Could not be contacted' , exclude_reason),
withdrew_reason = as.character(withdrew_reason),
reason = coalesce(exclude_reason, withdrew_reason),
reason = ifelse(withdrew_or_excluded == 'Neither', 'Could not be contacted', reason), # add people who could not be contacted
phone_time = as.numeric(phone_date - date_returned)/(365.25/12) ) # time between randomisation and telephone follow-up (months)
## fixes to form (patient notes) data
# ED presentation
index = data$ed_pres_1 < data$date_returned # presentations before randomised
index[is.na(index)] = FALSE
data$ed_pres_1[index] = data$ed_pres_2[index] # shift date from next ED presentation ...
data$ed_pres_2[index] = NA # ... and blank second
data$ed_reason_1[index] = data$ed_reason_2[index] # shift reason from next ED presentation ...
data$ed_reason_2[index] = NA #... and blank second
if(data$ed_presentations[index] == 'Yes - once'){
data$ed_presentations[index] = 'No'
}
### fixes to employment as we did not have retired as a category
## baseline employment
data = mutate(data, employment = as.character(employment)) # change to character
index = str_detect(tolower(data$employment_other), 'retired|pension|super')
index[is.na(index)] = FALSE
data$employment_other[index] = ''
data$employment[index] = 'Retired'
# more consistent and shift to category
index = str_detect(tolower(data$employment_other), '\\bcarer\\b')
data$employment[index] = 'Carer'
data$employment_other[index] = ''
# more consistent and shift to category
index = str_detect(tolower(data$employment_other), '\\bfull.time\\b')
data$employment[index] = 'Full-time'
data$employment_other[index] = ''
# more consistent and shift to category
index = str_detect(tolower(data$employment_other), 'casual|supply')
data$employment[index] = 'Part-time'
data$employment_other[index] = ''
# more consistent and shift to category
index = str_detect(data$employment_other, 'Stay at home mother')
data$employment[index] = 'Home duties'
data$employment_other[index] = ''
## 12 months employment
data = mutate(data, employment_12 = as.character(employment_12)) # change to character
index = str_detect(tolower(data$employment_other_12), 'retired|pension|super')
index[is.na(index)] = FALSE
data$employment_other_12[index] = ''
data$employment_12[index] = 'Retired'
# fix typo
index = str_detect(data$employment_other_12, 'Volunteer worjk')
data$employment_other_12[index] = 'Volunteer work'
# more consistent
index = str_detect(data$employment_other_12, 'Contract work')
data$employment_other_12[index] = 'Contract work'
# more consistent and shift to category
index = str_detect(data$employment_other_12, 'Carer')
data$employment_12[index] = 'Carer'
data$employment_other_12[index] = ''
# more consistent and shift to category
index = str_detect(data$employment_other_12, 'Stay at home mother')
data$employment_12[index] = 'Home duties'
data$employment_other_12[index] = ''
# more consistent and shift to category
index = str_detect(tolower(data$employment_other_12), 'casual|supply')
data$employment_12[index] = 'Part-time'
data$employment_other_12[index] = ''
## add per protocol for new model of care; they must have had a scan
data = left_join(data, scans, by='record_id') %>% # scans data loaded above
mutate(pp = case_when(
randomised == 'Usual care' ~ TRUE, # include all as PP does not apply because "treatment" is usual care, so it is whatever they did
randomised == 'New model of care' & !is.na(exam_date) ~ TRUE, # completed the treatment protocol originally allocated
randomised == 'New model of care' & is.na(exam_date) ~ FALSE
))
#with(data, table(randomised, is.na(exam_date))) # check
## if age/sex is missing, use referral data
data = mutate(data,
sex = as.character(sex),
sex_referral = as.character(sex_referral),
sex = ifelse(is.na(sex)==TRUE, sex_referral, sex)) %>%
select(-sex_referral)
## Edit from Ingrid: combine 'Yes, dietician' and 'Yes, both', 'Yes, nutritionist'
data = mutate(data,
dietician_baseline = case_when(
dietician_baseline == 'No' ~ 'No',
dietician_baseline == 'Yes, both' ~ 'Yes',
dietician_baseline == 'Yes, dietician' ~ 'Yes',
dietician_baseline == 'Yes, nutritionist' ~ 'Yes'
),
dietician_12m = case_when(
dietician_12m == 'No' ~ 'No',
dietician_12m == 'Yes, both' ~ 'Yes',
dietician_12m == 'Yes, dietician' ~ 'Yes',
dietician_12m == 'Yes, nutritionist' ~ 'Yes'
))
## there were no admissions to hospital, so can delete date variables
data = select(data,
-'admission_date_1',
-'discharge_1',
-'hospitalisation_usage',
-'admission_date_2',
-'discharge_2',
-'hospitalisation_usage_2',
-'admission_date_3',
-'discharge_3',
-'hospitalisation_usage_3')
dictionary = filter(dictionary,
!variable_field_name %in% c('admission_date_1', 'discharge_1',
'hospitalisation_usage', 'admission_date_2', 'discharge_2',
'hospitalisation_usage_2', 'admission_date_3', 'discharge_3',
'hospitalisation_usage_3','sex_referral'))
## save ##
save(data, dictionary, file = 'data/1_redcap_data.RData')
#
# filter(data, centre=='122', randomised==1) %>% pull(record_id)