-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.R
102 lines (92 loc) · 3.67 KB
/
data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#' ---
#' title: "RAI Main Analysis"
#' author: "Wilson, Bokov, Shireman"
#' date: "08/15/2017"
#' ---
#'
#' Please read this file through before trying to run it. The comments tell
#' you what you need to edit in order to proceed.
#'
source('global.R');
.currentscript <- parent.frame(2)$ofile;
if(is.null(.currentscript)) .currentscript <- 'RUN_FROM_INTERACTIVE_SESSION';
tself(scriptname=.currentscript);
project_seed <- 20180803;
#' ## Load data if it exists
#'
#' (useful later, right now don't bother saving sessions)
#'if(session %in% list.files()) load(session);
dat0 <- tread(inputdata,read_tsv,na=c('(null)',''));
#' Read in the data dictionary
dct0 <- tread(dctfile,read_csv,na = '');
colnames(dat0) <- tolower(colnames(dat0));
#' If you want to use some other set of columns as indices that is
#' specific to the version of the data you are using, declare
#' a different `alist` in your config.R, but it's better to not
#' mess with this.
if(!exists('l_indices')) l_indices <- alist(
pnum=patient_num,vnum=encounter_num
);
#' Create copy of original dataset
dat1 <- do.call(mutate,c(list(dat0),l_indices));
#' Create custom synonyms for 'TRUE' if needed
l_truthy_default <- eval(formals(truthy.default)$truewords);
#l_truthy_default <- c("Yes", l_truthy_default);
l_missing <- c(NA,'Unknown','unknown','UNKNOWN');
#' Creating training/testing/validation samples
#'
#' As long as the seed is the same, all random values will be generated the same
#' reproducibly.
tseed(project_seed);
#' Randomly to training, testing, or validation sets
pat_samples <- split(dat1$pnum,sample(c('train','test','val')
,size=nrow(dat1),rep=T));
#' ## Transform certain columns
#
#' Make sex/gender a factor
#
#' TODO: do this dynamically via new c_ group for all columns that are safe to
#' directly convert to factors
#dat1$gender <- factor(dat1$gender);
#' ## Create binned versions of certain numeric vars.
#
#' (commented out until we can put a c_num2bin or something into dct0)
# dat1[,paste0('bin_',cnum2bin)] <- sapply(dat1[,cnum2bin],function(ii){
# qii <- c(0,quantile(ii,c(.25,.5,.75),na.rm=T),Inf);
# cut(ii,breaks = qii);
# })
#' ## Create response variables
#
#' ## Transform Rows
#
#' ## Sort the rows
#
#' Creating an object to use as the lookup argument for `mapnames()``
#' TODO: change the NSQIP_NAMES column name to a more generic one
#dat1namelookup <- with(dct0,setNames(dataset_column_names
# ,ifelse(is.na(NSQIP_NAMES)
# ,dataset_column_names
# ,NSQIP_NAMES)));
#' ### Make several subsets of dat1 all at once
#
#' for later use to make multiple versions of the same table and
#' multiple versions of the same graph,
#' ### Create a version of the dataset that only has each patient's 1st encounter
#'
dat2 <- group_by(dat1,pnum) %>% summarise_all(first);
#' Each name is a legal variable name for that subset, the value
#' assigned to it is an R expression that can be evaluated in the
#' scope of `dat1` and will return a `TRUE`/`FALSE` vector
subs_criteria <- alist(
SUBSETNAME00=0==1
,SUBSETNAME01=0==1
);
#' Creates a hierarchy of lists containing various subsets of interest
sbs0 <- sapply(list(all=dat1,index=dat2),function(xx) do.call(ssply,c(list(dat=xx),subs_criteria[-1])),simplify=F);
#' Subsetting by the earlier randomly assigned train and test groups
sbs0$train <- lapply(sbs0$all,subset,pnum%in%pat_samples$train);
sbs0$test <- lapply(sbs0$all,subset,pnum%in%pat_samples$test);
#' ## Save all the processed data to an rdata file
#'
#' ...which includes the audit trail
tsave(file=paste0(.currentscript,'.rdata'),list=ls());