-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathentrypoint.R
324 lines (298 loc) · 14.1 KB
/
entrypoint.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#!/usr/local/bin/Rscript
# Greeting users
dht::greeting()
## load libraries without messages or warnings
withr::with_message_sink("/dev/null", library(daymetr))
withr::with_message_sink("/dev/null", library(tidyverse))
withr::with_message_sink("/dev/null", library(terra))
withr::with_message_sink("/dev/null", library(gtools))
withr::with_message_sink("/dev/null", library(data.table))
withr::with_message_sink("/dev/null", library(dht))
doc <- '
Usage:
entrypoint.R <filename>
entrypoint.R (-h | --help)
Options:
-h --help Show this screen
filename name of csv file
'
opt <- docopt::docopt(doc)
# Writing functions
# Creating function to import and process the input data
import_data <- function(.csv_filename = opt$filename, .min_lon = min_lon, .max_lon = max_lon, .min_lat = min_lat, .max_lat = max_lat, .year_start = year_start, .year_end = year_end) {
# Checking that the input data is a CSV file
if (!str_detect(.csv_filename, ".csv$")) {
stop(call. = FALSE, 'Input file must be a CSV.')
}
# Reading in the input data
input_data <- fread(.csv_filename, header = TRUE, sep = ",", colClasses = c(id = "character", lat = "numeric", lon = "numeric",
enc_admit_date = "IDate", start_date = "character",
end_date = "character", address_number = "integer",
census_tract = "character", census_tract_date = "IDate"))
input_data <- as_tibble(input_data)
# Creating a row_index variable in the input data that is just the row number
input_data <- input_data %>%
mutate(row_index = 1:nrow(input_data)) %>%
relocate(row_index)
# Ensuring that an id column is in the input data, and quitting if not
tryCatch({
check_for_column(input_data, column_name = "id")
}, error = function(e) {
print(e)
stop(call. = FALSE)
}, warning = function(w) {
print(w)
stop(call. = FALSE)
})
# Ensuring that numeric lat and lon are in the input data, and quitting if not
tryCatch({
check_for_column(input_data, column_name = "lat")
check_for_column(input_data, column_name = "lon")
check_for_column(input_data, column_name = "lat", column = input_data$lat, column_type = "numeric")
check_for_column(input_data, column_name = "lon", column = input_data$lon, column_type = "numeric")
}, error = function(e) {
print(e)
stop(call. = FALSE)
}, warning = function(w) {
print(w)
stop(call. = FALSE)
})
# Filtering out rows in the input data where lat or lon are missing
input_data <- input_data %>%
filter(!is.na(lat) & !is.na(lon))
# Throwing an error if no observations are remaining
if (nrow(input_data) == 0) {
stop(call. = FALSE, 'Zero observations where lat and lon are not missing.')
}
# Removing observations where the address is outside of the bounding box of Daymet data
input_data <- input_data %>%
filter(lat >= .min_lat & lat <= .max_lat & lon >= .min_lon & lon <= .max_lon)
# Throwing an error if no observations are remaining
if (nrow(input_data) == 0) {
stop(call. = FALSE, 'Zero observations where the lat and lon coordinates are within the bounding box of Daymet data.')
}
# Ensuring that start_date and end_date are in the input data as dates, and quitting if not
tryCatch({
check_for_column(input_data, column_name = "start_date")
check_for_column(input_data, column_name = "end_date")
input_data$start_date <- check_dates(input_data$start_date, allow_missing = TRUE)
input_data$end_date <- check_dates(input_data$end_date, allow_missing = TRUE)
}, error = function(e) {
print(e)
stop(call. = FALSE)
}, warning = function(w) {
print(w)
stop(call. = FALSE)
})
# Filtering out rows in the input data where start_date or end_date are missing
input_data <- input_data %>%
filter(!is.na(start_date) & !is.na(end_date))
# Throwing an error if no observations are remaining
if (nrow(input_data) == 0) {
stop(call. = FALSE, 'Zero observations where the user-supplied event dates are not entirely missing.')
}
# Checking that end_date is after start_date
tryCatch({
check_end_after_start_date(input_data$start_date, input_data$end_date)
}, error = function(e) {
print(e)
stop(call. = FALSE)
}, warning = function(w) {
print(w)
stop(call. = FALSE)
})
# Expanding the dates between start_date and end_date into a daily series
input_data <- expand_dates(input_data, by = "day") %>%
select(-start_date, -end_date)
# Filtering out any rows in the input data where the date is before the first year of downloaded Daymet data
input_data <- input_data %>%
filter(!(year(date) < .year_start))
# Throwing an error if no observations are remaining
if (nrow(input_data) == 0) {
stop(call. = FALSE, 'Zero observations where the start_date is within or after the first year of downloaded Daymet data.')
}
# Filtering out any rows in the input data where the date is after the last year of downloaded Daymet data
input_data <- input_data %>%
filter(!(year(date) > .year_end))
# Throwing an error if no observations are remaining
if (nrow(input_data) == 0) {
stop(call. = FALSE, 'Zero observations where the end_date is within or before the last year of downloaded Daymet data.')
}
# Removing any columns in the input data where everything is NA
input_data <- input_data %>%
select_if(~ !all(is.na(.)))
# Separating the row_index, address coordinates, and dates out into their own dataset
addresses <- input_data %>%
select(row_index, lat, lon, date)
# Separating the row_index and any other columns out into their own dataset
extra_columns <- input_data %>%
select(-lat, -lon, -date) %>%
distinct()
extra_columns <- as.data.table(extra_columns)
# Converting the input addresses to a SpatVector
coords <- vect(addresses, geom = c("lon", "lat"), crs = "+proj=longlat +ellips=WGS84")
# Returning a list of objects needed later
out <- list("addresses" = addresses, "extra_columns" = extra_columns, "coords" = coords)
return(out)
}
# Create function to load the Daymet NetCDF data
daymet_load <- function() {
# Loading the NetCDF files downloaded from Daymet as a SpatRaster raster stack
netcdf_list <- list.files(path = "/app", pattern = "_ncss.nc$")
# Initializing a time dictionary
time_dict <- tibble(number = 1:365)
for (i in 1:length(netcdf_list)) {
# Extracting the year and Daymet variable from the file to be loaded in
yr <- str_extract(netcdf_list[i], "[0-9]{4}")
dm_var <- unlist(str_split(netcdf_list[i], "_"))[1]
# Creating a vector of layer names
layer_names <- as.character(1:365)
layer_names <- paste0(dm_var, "_", layer_names, "_", yr)
# Loading the Daymet data
daymet_load <- rast(paste0("/app/", netcdf_list[i]))
names(daymet_load) <- layer_names
# Creating a dictionary to link numbers 1–365 to a date in a year (time dictionary)
origin <- as_date(paste0(yr, "-01-01")) - 1 # Numbers count days since origin
time_dict <- time_dict %>%
mutate(year = yr,
date := as_date(number, origin = origin))
# Stacking the Daymet data rasters and time dictionary, and tracking the year and Daymet variable
if (i == 1) {
daymet_data <- daymet_load
time_dictionary <- time_dict
yr_list <- list(yr)
dm_var_list <- list(dm_var)
} else {
daymet_data <- c(daymet_data, daymet_load)
time_dictionary <- rbind(time_dictionary, time_dict)
yr_list[[length(yr_list) + 1]] <- yr
dm_var_list[[length(dm_var_list) + 1]] <- dm_var
}
}
time_dictionary <- time_dictionary %>%
arrange(number, year) %>%
distinct()
time_dictionary <- as.data.table(time_dictionary)
# Extracting the year start and year end of the Daymet data that was loaded in
yr_list <- unique(yr_list)
year_start <- as.numeric(min(unlist(yr_list)))
year_end <- as.numeric(max(unlist(yr_list)))
# Extracting the Daymet variables of the Daymet data that was loaded in
dm_var_list <- unique(dm_var_list)
daymet_variables <- paste(unlist(dm_var_list), collapse = ",")
# Extracting the minimum and maximum longitude and latitude of the Daymet data that was loaded in
proj_daymet_data <- project(daymet_data, "+proj=longlat +ellips=WGS84")
min_lon <- unname(ext(proj_daymet_data)[1])
max_lon <- unname(ext(proj_daymet_data)[2])
min_lat <- unname(ext(proj_daymet_data)[3])
max_lat <- unname(ext(proj_daymet_data)[4])
# Returning a list of objects needed later
out <- list("time_dictionary" = time_dictionary, "daymet_data" = daymet_data, "year_start" = year_start, "year_end" = year_end, "daymet_variables" = daymet_variables, "min_lon" = min_lon, "max_lon" = max_lon, "min_lat" = min_lat, "max_lat" = max_lat)
return(out)
}
# Loading the Daymet NetCDF data
daymet_load_out <- daymet_load()
time_dictionary <- daymet_load_out$time_dictionary
daymet_data <- daymet_load_out$daymet_data
year_start <- daymet_load_out$year_start
year_end <- daymet_load_out$year_end
daymet_variables <- daymet_load_out$daymet_variables
min_lon <- daymet_load_out$min_lon
max_lon <- daymet_load_out$max_lon
min_lat <- daymet_load_out$min_lat
max_lat <- daymet_load_out$max_lat
rm(daymet_load_out)
# Importing and processing the input data
import_data_out <- import_data()
addresses <- import_data_out$addresses
extra_columns <- import_data_out$extra_columns
coords <- import_data_out$coords
rm(import_data_out)
# Changing the coordinate reference system of the input addresses so they match that of Daymet
new_crs <- crs(daymet_data, proj = TRUE)
proj_coords <- project(coords, new_crs)
rm(coords)
# Finding the Daymet raster cell numbers that match the input address coordinates
addresses <- addresses %>%
mutate(cell = unname(cells(daymet_data, proj_coords)[, "cell"]))
rm(proj_coords)
# Removing any input address observations where the Daymet cell raster number is missing
addresses <- addresses %>%
filter(!is.na(cell)) %>%
select(-c(lat, lon))
addresses <- as.data.table(addresses)
# Throwing an error if no observations are remaining
if (nrow(addresses) == 0) {
stop(call. = FALSE, 'Zero observations where the input address coordinates fell within Daymet raster cells.')
}
# Taking care of leap years, per Daymet conventions (12/31 is switched to 12/30)
addresses$date <- if_else(leap_year(addresses$date) & month(addresses$date) == 12 & day(addresses$date) == 31,
addresses$date - 1,
addresses$date)
# Converting the Daymet SpatRaster raster stack to a data table, with cell numbers
daymet_data_dt <- as.data.frame(daymet_data, cells = TRUE)
daymet_data_dt <- as.data.table(daymet_data_dt)
rm(daymet_data)
# Transposing the Daymet data table, one Daymet variable at a time
transpose_daymet <- function(.daymet_data_dt = daymet_data_dt, dm_var) {
daymet_data_dt_var <- .daymet_data_dt %>%
select(c("cell", starts_with(dm_var)))
daymet_data_dt_var <- melt(daymet_data_dt_var, id.vars = "cell", variable.name = "number_year", value.name = dm_var)
daymet_data_dt_var <- daymet_data_dt_var %>%
mutate(number_year = str_remove_all(number_year, paste0(dm_var, "_")))
return(daymet_data_dt_var)
}
daymet_variables <- str_remove_all(daymet_variables, " ")
daymet_variables <- str_split(daymet_variables, ",", simplify = TRUE)
for (i in 1:length(daymet_variables)) {
if (i == 1) {
daymet_data_long <- transpose_daymet(dm_var = daymet_variables[i])
}
else {
daymet_data_long <- daymet_data_long[transpose_daymet(dm_var = daymet_variables[i]), on = c(cell = "cell", number_year = "number_year")]
}
}
rm(daymet_data_dt)
# Rounding the Daymet variables to two decimal places
daymet_data_long <- daymet_data_long %>%
mutate(across(where(is.numeric) & matches(daymet_variables), ~ round(., 2)))
rm(daymet_variables)
# Splitting out number_year in daymet_data_long
daymet_data_long <- daymet_data_long %>%
separate_wider_delim(number_year, delim = "_", names = c("number", "year"), cols_remove = TRUE)
# Matching the Daymet day numbers to the dates they correspond to
daymet_data_long <- daymet_data_long %>%
mutate(number = as.integer(number),
year = as.integer(year))
daymet_data_long <- as.data.table(daymet_data_long)
time_dictionary <- time_dictionary[, number := as.integer(number)]
time_dictionary <- time_dictionary[, year := as.integer(year)]
daymet_data_long <- daymet_data_long[time_dictionary, on = c(number = "number", year = "year")]
daymet_data_long <- daymet_data_long[, c("number", "year") := NULL]
rm(time_dictionary)
# Linking the Daymet data cells to the input address coordinate cells across all dates
main_dataset <- daymet_data_long[addresses, on = c(cell = "cell", date = "date")]
main_dataset <- main_dataset[, "cell" := NULL]
setcolorder(main_dataset, c("row_index", "date"))
rm(daymet_data_long, addresses)
# Merging in the extra columns
main_dataset <- extra_columns[main_dataset, on = c(row_index = "row_index")]
main_dataset <- main_dataset[, "row_index" := NULL]
rm(extra_columns)
# Removing any rows with NA
main_dataset <- main_dataset %>%
na.omit()
# Sorting and de-duplicating the final results (duplicates could have resulted from leap years or repeat dates)
main_dataset <- main_dataset %>%
mutate(sort1 = factor(id, ordered = TRUE, levels = unique(mixedsort(id))),
sort2 = factor(date, ordered = TRUE, levels = unique(mixedsort(date)))) %>%
arrange(sort1, sort2) %>%
select(-c(sort1, sort2)) %>%
distinct()
# Renaming the id variable to patid
main_dataset <- main_dataset %>%
rename("patid" = "id")
# Writing the results out as a CSV file
csv_out <- paste0(unlist(str_split(opt$filename, ".csv"))[1], "_daymet", ".csv")
fwrite(main_dataset, csv_out, na = "", row.names = FALSE)
rm(list = ls(all.names = TRUE))