diff --git a/DESCRIPTION b/DESCRIPTION index 1f413c7..71093bf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: dynamite Title: Bayesian Modeling and Causal Inference for Multivariate Longitudinal Data -Version: 1.4.7 +Version: 1.4.8 Authors@R: c( person("Santtu", "Tikka", email = "santtuth@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-4039-4342")), diff --git a/NEWS.md b/NEWS.md index bf4cd82..8292000 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# dynamite 1.4.8 + + * Made several performance improvements to data parsing. + * `dynamite()` will now retain the original column order of `data` in all circumstances. + # dynamite 1.4.7 * Added a note on priors vignette regarding default priors for $\tau$ parameters. diff --git a/R/dynamite.R b/R/dynamite.R index 43c812d..dc72a33 100644 --- a/R/dynamite.R +++ b/R/dynamite.R @@ -45,7 +45,8 @@ #' needs the actual `CmdStan` software. See https://mc-stan.org/cmdstanr/ for #' details. #' @param verbose \[`logical(1)`]\cr All warnings and messages are suppressed -#' if set to `FALSE`. Defaults to `TRUE`. +#' if set to `FALSE`. Defaults to `TRUE`. Setting this to `FALSE` will also +#' disable checks for perfect collinearity in the model matrix. #' @param verbose_stan \[`logical(1)`]\cr This is the `verbose` argument for #' [rstan::sampling()]. Defaults to `FALSE`. #' @param stanc_options \[`list()`]\cr This is the `stanc_options` argument @@ -846,9 +847,9 @@ parse_data <- function(dformula, data, group_var, time_var, verbose) { "Non-finite values were found in variable{?s} {.var {data_names[!finite_cols]}} of {.arg data}." ) + data.table::setkeyv(data, c(group_var, time_var)) data <- fill_time(data, group_var, time_var) drop_unused(dformula, data, group_var, time_var) - data.table::setkeyv(data, c(group_var, time_var)) data } @@ -1129,8 +1130,10 @@ fill_time <- function(data, group_var, time_var) { n_group <- length(group) time_duplicated <- logical(n_group) time_missing <- logical(n_group) + group_bounds <- c(0, data[, max(.I), by = group_var]$V1) for (i in seq_len(n_group)) { - idx_group <- which(data_groups == group[i]) + idx_group <- seq(group_bounds[i] + 1, group_bounds[i + 1]) + #idx_group <- which(data_groups == group[i]) sub <- data[idx_group, ] time_duplicated[i] <- any(duplicated(sub[[time_var]])) time_missing[i] <- !identical(sub[[time_var]], full_time) @@ -1148,26 +1151,27 @@ fill_time <- function(data, group_var, time_var) { all(time_ivals[!is.na(time_ivals)] %% time_scale == 0), "Observations must occur at regular time intervals." ) - # time_missing <- data[, # !identical(time_var, full_time), # by = group_var, # env = list(time_var = time_var, full_time = full_time) # ]$V1 if (any(time_missing)) { + data_names <- names(data) full_data_template <- data.table::as.data.table( expand.grid( - time = full_time, - group = unique(data[[group_var]]) + group = unique(data[[group_var]]), + time = full_time ) ) - names(full_data_template) <- c(time_var, group_var) + names(full_data_template) <- c(group_var, time_var) data <- data.table::merge.data.table( full_data_template, data, - by = c(time_var, group_var), + by = c(group_var, time_var), all.x = TRUE ) + data.table::setcolorder(data, data_names) } data } diff --git a/R/predict_helpers.R b/R/predict_helpers.R index 578f4b2..757d4d0 100644 --- a/R/predict_helpers.R +++ b/R/predict_helpers.R @@ -123,13 +123,13 @@ parse_newdata <- function(dformulas, newdata, data, type, eval_type, newdata[[i]] <- factor(newdata[[i]], levels = l_orig) } } + data.table::setDT(newdata, key = c(group_var, time_var)) newdata <- fill_time_predict( newdata, group_var, time_var, time_scale = original_times[2L] - original_times[1L] ) - data.table::setDT(newdata, key = c(group_var, time_var)) clear_names <- intersect(names(newdata), clear_names) if (length(clear_names) > 0L) { # TODO no need check length when data.table package is updated @@ -270,8 +270,9 @@ fill_time_predict <- function(data, group_var, time_var, time_scale) { has_gaps = logical(n_group) ) time_missing <- logical(n_group) + group_bounds <- c(0, data[, max(.I), by = group_var]$V1) for (i in seq_len(n_group)) { - idx_group <- which(data_groups == group[i]) + idx_group <- seq(group_bounds[i] + 1, group_bounds[i + 1]) sub <- data[idx_group, ] time_duplicated[i] <- any(duplicated(sub[[time_var]])) time_groups$has_missing[i] <- !identical(sub[[time_var]], full_time) @@ -288,7 +289,6 @@ fill_time_predict <- function(data, group_var, time_var, time_scale) { ) ) if (length(time) > 1L) { - original_order <- colnames(data) # time_groups <- data[, # { # has_missing = !identical(time_var, full_time) @@ -303,6 +303,7 @@ fill_time_predict <- function(data, group_var, time_var, time_scale) { # ) # ] if (any(time_groups$has_missing)) { + data_names <- colnames(data) if (any(time_groups$has_gaps)) { warning_(c( "Time index variable {.var {time_var}} of {.arg newdata} has gaps:", @@ -312,16 +313,17 @@ fill_time_predict <- function(data, group_var, time_var, time_scale) { )) } full_data_template <- data.table::as.data.table(expand.grid( - time = full_time, - group = unique(data[[group_var]]) + group = unique(data[[group_var]]), + time = full_time )) - names(full_data_template) <- c(time_var, group_var) + names(full_data_template) <- c(group_var, time_var) data <- data.table::merge.data.table( full_data_template, data, - by = c(time_var, group_var), + by = c(group_var, time_var), all.x = TRUE ) + data.table::setcolorder(data, data_names) } } data diff --git a/R/prepare_stan_input.R b/R/prepare_stan_input.R index dc8385d..fe4a470 100644 --- a/R/prepare_stan_input.R +++ b/R/prepare_stan_input.R @@ -81,13 +81,16 @@ prepare_stan_input <- function(dformula, data, group_var, time_var, ) N <- n_unique(group) K <- ncol(model_matrix) - X <- aperm( - array( - as.numeric(unlist(split(model_matrix, gl(T_full, 1L, N * T_full)))), - dim = c(N, K, T_full) - ), - c(3L, 1L, 2L) - )[T_idx, , , drop = FALSE] + #X <- aperm( + # array( + # as.numeric(unlist(split(model_matrix, gl(T_full, 1L, N * T_full)))), + # dim = c(N, K, T_full) + # ), + # c(3L, 1L, 2L) + #)[T_idx, , , drop = FALSE] + X <- model_matrix[,] + dim(X) <- c(T_full, N, K) + X <- X[T_idx, , , drop = FALSE] x_tmp <- X[1L, , , drop = FALSE] sd_x <- pmax( setNames(apply(X, 3L, sd, na.rm = TRUE), colnames(model_matrix)), @@ -107,12 +110,14 @@ prepare_stan_input <- function(dformula, data, group_var, time_var, for (i in seq_len(n_channels)) { y <- resp[i] y_name <- resp_names[i] - y_split <- split( - data[, .SD, .SDcols = c(y, group_var)], - by = group_var, - keep.by = FALSE - ) - Y <- array(as.numeric(unlist(y_split)), dim = c(T_full, N)) + #y_split <- split( + # data[, .SD, .SDcols = c(y, group_var)], + # by = group_var, + # keep.by = FALSE + #) + #Y <- array(as.numeric(unlist(y_split)), dim = c(T_full, N)) + Y <- as.numeric(data[[y]]) + dim(Y) <- c(T_full, N) Y <- Y[T_idx, , drop = FALSE] tmp <- initialize_univariate_channel( dformula = dformula[[i]], @@ -323,8 +328,10 @@ initialize_univariate_channel <- function(dformula, specials, fixed_pars, for (spec in formula_special_funs) { if (!is.null(specials[[spec]])) { spec_idx <- seq.int(fixed + 1L, T_full) - spec_split <- split(specials[[spec]], group) - spec_array <- array(as.numeric(unlist(spec_split)), dim = c(T_full, N)) + #spec_split <- split(specials[[spec]], group) + spec_array <- as.numeric(specials[[spec]]) + dim(spec_array) <- c(T_full, N) + #spec_array <- array(as.numeric(unlist(spec_split)), dim = c(T_full, N)) spec_na <- spec_na | is.na(spec_array[spec_idx, , drop = FALSE]) spec_name <- paste0(spec, "_", y_name) sampling[[spec_name]] <- ifelse_( diff --git a/R/sysdata.rda b/R/sysdata.rda index a74638f..3e43fbd 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/data/categorical_example_fit.rda b/data/categorical_example_fit.rda index ed5514b..1032173 100644 Binary files a/data/categorical_example_fit.rda and b/data/categorical_example_fit.rda differ diff --git a/data/gaussian_example_fit.rda b/data/gaussian_example_fit.rda index ff2cc29..8bc3901 100644 Binary files a/data/gaussian_example_fit.rda and b/data/gaussian_example_fit.rda differ diff --git a/data/gaussian_simulation_fit.rda b/data/gaussian_simulation_fit.rda index fd71ae7..1c99127 100644 Binary files a/data/gaussian_simulation_fit.rda and b/data/gaussian_simulation_fit.rda differ diff --git a/data/multichannel_example_fit.rda b/data/multichannel_example_fit.rda index 8bee57a..8fce23a 100644 Binary files a/data/multichannel_example_fit.rda and b/data/multichannel_example_fit.rda differ diff --git a/tests/testthat/test-edgecases.R b/tests/testthat/test-edgecases.R index 5ee2813..fb0977a 100644 --- a/tests/testthat/test-edgecases.R +++ b/tests/testthat/test-edgecases.R @@ -10,8 +10,8 @@ individuals <- 5 total_obs <- timepoints * individuals test_data <- data.frame( - time = 1:timepoints, group = gl(individuals, timepoints), + time = 1:timepoints, offset = sample(50:100, size = total_obs, replace = TRUE), trials = sample(50:100, size = total_obs, replace = TRUE) ) |> @@ -487,6 +487,8 @@ test_that("data expansion to full time scale works", { expected_data_single <- droplevels(expected_data_single) expected_data_single$trials <- NULL expected_data_single$offset <- NULL + expected_data_single$group <- NULL + expected_data_single$.group <- 1L data.table::setDT(expected_data_single, key = c("time")) expect_equal(fit_single$data, expected_data_single, ignore_attr = TRUE) })