Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve speed of function fit_parameters_loop #9

Open
wants to merge 1 commit into
base: devel
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 56 additions & 37 deletions R/proDA.R
Original file line number Diff line number Diff line change
Expand Up @@ -313,36 +313,46 @@ fit_parameters_loop <- function(Y, model_matrix, location_prior_df,
}
# Initialization
n_samples <- ncol(Y)


mM_t <- t(model_matrix)
seq_Y <- seq_len(nrow(Y))
seq_mM <- seq_len(nrow(model_matrix))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you check how big the speed-up due to those two changes is. And if it is not signifcant revert the changes?

The additional variables seq_Y and seq_mM makes the code more difficult to understand, because they are two additional things that you need to keep in your head when you try to understand what is going at the end of the function.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

microbenchmark::microbenchmark(
        Pred_init_var = {
            seq_Y <- seq_len(nrow(Y))
            seq_mM <- seq_len(nrow(model_matrix))
            Pred_init_var <- mply_dbl(seq_Y, function(i){
               coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
               vapply(seq_mM, function(j)
                   mM_t[, j] %*% coef_var_matrix_i %*% model_matrix[j,],
                  FUN.VALUE = 0.0)
             }, ncol=n_samples)
          },
         Pred_init_var2 = mply_dbl(seq_len(nrow(Y)), function(i){
              coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
               vapply(seq_len(nrow(model_matrix)), function(j)
                  mM_t[, j] %*% coef_var_matrix_i %*% model_matrix[j,],
                  FUN.VALUE = 0.0)
           }, ncol=n_samples)
   )

Unit: milliseconds
         expr     min       lq     mean   median       uq     max neval cld
 Pred_init_var 42.2649 46.95005 51.42409 50.29355 54.08615 76.6705   100   a
Pred_init_var2 44.1800 48.31820 52.47180 50.84440 54.41230 78.1734   100   a

A slight increase here!

The following two expressions are in the convergence loop:

microbenchmark::microbenchmark(
        
        Pred_var_unreg = {
            mply_dbl(seq_Y, function(i){
                coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
                vapply(seq_mM, function(j)
                    mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                    FUN.VALUE = 0.0)
            }, ncol=n_samples)     
        },  
        Pred_var_unreg2 = {
            mply_dbl(seq_len(nrow(Y)), function(i){
                coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
                vapply(seq_len(nrow(model_matrix)), function(j)
                    mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                    FUN.VALUE = 0.0)
            }, ncol=n_samples)     
        }, times = 200)
        
        Unit: milliseconds
       expr      min       lq     mean   median       uq      max neval cld
    Pred_var_unreg 250.7857 272.2618 294.7421 291.3625 305.8123 903.5610   200   a
   Pred_var_unreg2 252.7301 276.2538 300.5611 289.7970 305.4967 796.0543   200   a

There are again some higher max values (not sure why it happens)

 microbenchmark::microbenchmark(
        Pred_var_reg = {mply_dbl(seq_Y, function(i){
            coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
            vapply(seq_mM, function(j)
                mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                FUN.VALUE = 0.0)
        }, ncol=ncol(Y))},
        Pred_var_reg2 = {mply_dbl(seq_len(nrow(Y)), function(i){
            coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
            vapply(seq_len(nrow(model_matrix)), function(j)
                mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                FUN.VALUE = 0.0)
        }, ncol=ncol(Y))}, 
        times = 200)

 Unit: milliseconds
      expr      min       lq     mean   median       uq      max neval cld
 Pred_var_reg 241.0048 260.8333 279.9133 266.5539 293.6949 383.3302   200  a 
 Pred_var_reg2 245.4748 261.0977 290.8885 275.2618 308.8479 771.8275   200   b

Again the version with pre-calculated values is slightly faster (and now the old version shows these distortions).

I guess these are only slight improvements anyway coming from pre-calculating these values. It's on you if you think the old version with respect to code maintenance is superior.


Y_compl <- Y
Y_compl[is.na(Y)] <- rnorm(sum(is.na(Y)), mean=quantile(Y, probs=0.1, na.rm=TRUE), sd=sd(Y,na.rm=TRUE)/5)
res_init <- lapply(seq_len(nrow(Y)), function(i){
pd_lm.fit(Y_compl[i, ], model_matrix,
y_na <- is.na(Y)
Y_compl[y_na] <- rnorm(sum(y_na), mean=quantile(Y, probs=0.1, na.rm=TRUE), sd=sd(Y,na.rm=TRUE)/5)

split_Y_compl <- split(Y_compl, factor(rownames(Y_compl), rownames(Y_compl)))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you pass rownames(Y_compl) to factor twice?

What happens if the rownames are not unique? (I know one shouldn't make objects with duplicated rownames, but you can, which means that people probably do.)

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mat <- matrix(1:20, nrow=5, ncol = 4)
rownames(mat) <- paste0("row_", 1:5)
rownames(mat)[3] <- "row_2"
split(mat, factor(rownames(mat)))
#> $row_1
#> [1]  1  6 11 16
#> 
#> $row_2
#> [1]  2  3  7  8 12 13 17 18
#> 
#> $row_4
#> [1]  4  9 14 19
#> 
#> $row_5
#> [1]  5 10 15 20

Created on 2021-01-27 by the reprex package (v0.3.0)

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please validate that split is actually faster than lapply(), because it makes the code more difficult to understand and maintain.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, maybe it's not a good idea to use split here given the points you addressed. I will revert this.

names(split_Y_compl) <- NULL
res_init <- lapply(split_Y_compl, function(i){
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you choose a more descriptive variable name. For me i always is an index. I think for example y_i would be better :)

pd_lm.fit(i, model_matrix,
dropout_curve_position = rep(NA, n_samples),
dropout_curve_scale =rep(NA, n_samples),
verbose=verbose)
})
Pred_init <- msply_dbl(res_init, function(x) x$coefficients) %*% t(model_matrix)
Pred_init_var <- mply_dbl(seq_len(nrow(Y)), function(i){
vapply(seq_len(nrow(model_matrix)), function(j)
t(model_matrix[j,]) %*% res_init[[i]]$coef_variance_matrix %*% model_matrix[j,],

Pred_init <- msply_dbl(res_init, function(x) x$coefficients) %*% mM_t
Pred_init_var <- mply_dbl(seq_Y, function(i){
coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
vapply(seq_mM, function(j)
mM_t[, j] %*% coef_var_matrix_i %*% model_matrix[j,],
FUN.VALUE = 0.0)
}, ncol=ncol(Y))
s2_init <- vapply(res_init, function(x) x[["s2"]], 0.0)
df_init <- vapply(res_init, function(x) x[["df"]], 0.0)

if(moderate_location){
lp <- location_prior(model_matrix,
Pred_reg = Pred_init,
Pred_unreg = Pred_init,
Pred_var_reg = Pred_init_var,
Pred_var_unreg = Pred_init_var)
Pred_reg = Pred_init,
Pred_unreg = Pred_init,
Pred_var_reg = Pred_init_var,
Pred_var_unreg = Pred_init_var)
mu0 <- lp$mu0
sigma20 <- lp$sigma20
}else{
mu0 <- NA_real_
sigma20 <- NA_real_
}

dc <- dropout_curves(Y, model_matrix, Pred_init, Pred_init_var)
rho <- dc$rho
zetainv <- dc$zetainv
Expand All @@ -354,26 +364,31 @@ fit_parameters_loop <- function(Y, model_matrix, location_prior_df,
tau20 <- NA_real_
df0_inv <- NA_real_
}

last_round_params <- list(mu0, sigma20, rho, zetainv, tau20, df0_inv)
converged <- FALSE
iter <- 1
error <- NA
res_reg <- res_init
res_unreg <- res_init

## create split_Y
split_Y <- split(Y, factor(rownames(Y), rownames(Y)))
names(split_Y) <- NULL

while(! converged && iter <= max_iter){
if(verbose){
message(paste0("Starting iter: ", iter))
}

res_unreg <- lapply(seq_len(nrow(Y)), function(i){
pd_lm.fit(Y[i, ], model_matrix,
res_unreg <- lapply(split_Y, function(i){
pd_lm.fit(i, model_matrix,
dropout_curve_position = rho, dropout_curve_scale = 1/zetainv,
verbose=verbose)
})
if(moderate_location || moderate_variance){
res_reg <- lapply(seq_len(nrow(Y)), function(i){
pd_lm.fit(Y[i, ], model_matrix,
res_reg <- lapply(split_Y, function(i){
pd_lm.fit(i, model_matrix,
dropout_curve_position = rho, dropout_curve_scale = 1/zetainv,
location_prior_mean = mu0, location_prior_scale = sigma20,
variance_prior_scale = tau20, variance_prior_df = 1/df0_inv,
Expand All @@ -383,23 +398,28 @@ fit_parameters_loop <- function(Y, model_matrix, location_prior_df,
}else{
res_reg <- res_unreg
}

Pred_unreg <- msply_dbl(res_unreg, function(x) x$coefficients) %zero_dom_mat_mult% t(model_matrix)
Pred_reg <- msply_dbl(res_reg, function(x) x$coefficients) %zero_dom_mat_mult% t(model_matrix)
Pred_var_unreg <- mply_dbl(seq_len(nrow(Y)), function(i){
vapply(seq_len(nrow(model_matrix)), function(j)
t(model_matrix[j,]) %zero_dom_mat_mult% res_unreg[[i]]$coef_variance_matrix %zero_dom_mat_mult% model_matrix[j,],

Pred_unreg <- msply_dbl(res_unreg, function(x) x$coefficients) %zero_dom_mat_mult% mM_t
Pred_reg <- msply_dbl(res_reg, function(x) x$coefficients) %zero_dom_mat_mult% mM_t

Pred_var_unreg <- mply_dbl(seq_Y, function(i){
coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is probably a good idea, getting stuff from a list can be quite slow :)

vapply(seq_mM, function(j)
mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i %zero_dom_mat_mult% model_matrix[j,],
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

t(model_matrix[j,]) and mM_t[, j] are not the same!

Small example to illustrate:

mat <- matrix(1:32, nrow=8, ncol = 4)
tmat <- t(mat)

# treated as a row vector
t(mat[3,])
#>      [,1] [,2] [,3] [,4]
#> [1,]    3   11   19   27

# treated as a column Vector!
tmat[,3]
#> [1]  3 11 19 27

Created on 2021-01-27 by the reprex package (v0.3.0)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's true, but anyway they yield the same result:

Pred_init_var <- mply_dbl(seq_Y, function(i){
    coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
    vapply(seq_mM, function(j)
        mM_t[, j] %*% coef_var_matrix_i %*% model_matrix[j,],
        FUN.VALUE = 0.0)
}, ncol=ncol(Y))

Pred_init_var2 <- mply_dbl(seq_Y, function(i){
    coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
    vapply(seq_mM, function(j)
        t(model_matrix[j,]) %*% coef_var_matrix_i %*% model_matrix[j,],
        FUN.VALUE = 0.0)
}, ncol=ncol(Y))

identical(Pred_init_var2, Pred_init_var)
[1] TRUE

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

microbenchmark::microbenchmark(
    Pred_init_var = mply_dbl(seq_Y, function(i){
        coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
       vapply(seq_mM, function(j)
         mM_t[, j] %*% coef_var_matrix_i %*% model_matrix[j,],
         FUN.VALUE = 0.0)
    }, ncol=ncol(Y)),


    Pred_init_var2 = mply_dbl(seq_Y, function(i){
         coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
        vapply(seq_mM, function(j)
            t(model_matrix[j,]) %*% coef_var_matrix_i %*% model_matrix[j,],
            FUN.VALUE = 0.0)
     }, ncol=ncol(Y))
    )

Unit: milliseconds
       expr     min       lq     mean   median       uq      max neval cld
  Pred_init_var 43.9988 47.41380 51.77982 50.67965 55.03035  75.4580   100  a 
 Pred_init_var2 72.4167 78.46865 84.72939 83.47805 87.49775 129.3417   100   b

FUN.VALUE = 0.0)
}, ncol=ncol(Y))
Pred_var_reg <- mply_dbl(seq_len(nrow(Y)), function(i){
vapply(seq_len(nrow(model_matrix)), function(j)
t(model_matrix[j,]) %zero_dom_mat_mult% res_reg[[i]]$coef_variance_matrix %zero_dom_mat_mult% model_matrix[j,],

Pred_var_reg <- mply_dbl(seq_Y, function(i){
coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
vapply(seq_mM, function(j)
mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i %zero_dom_mat_mult% model_matrix[j,],
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same problem as above. Could this be the reason that bench::mark() detects different results?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the bench::mark() gives different results above even when using the same function proDA::fit_parameters_loop (see above).

  microbenchmark::microbenchmark(
        Pred_var_unreg = mply_dbl(seq_Y, function(i){
            coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
            vapply(seq_mM, function(j)
                mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                FUN.VALUE = 0.0)
        }, ncol=ncol(Y)),
        Pred_var_unreg2 = mply_dbl(seq_Y, function(i){
            coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
            vapply(seq_mM, function(j)
                t(model_matrix[j,]) %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                FUN.VALUE = 0.0)
        }, ncol=ncol(Y))
    )


Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval cld
 Pred_var_unreg 250.6602 263.1104 281.4725 267.6236 279.7790 704.7041   100  a 
 Pred_var_unreg2 296.5720 310.2032 333.9179 317.6835 342.1173 965.6829   100   b

FUN.VALUE = 0.0)
}, ncol=ncol(Y))

s2_unreg <- vapply(res_unreg, function(x) x[["s2"]], 0.0)
df_unreg <-vapply(res_unreg, function(x) x[["df"]], 0.0)


if(moderate_location){
lp <- location_prior(model_matrix,
Pred_reg = Pred_reg,
Expand All @@ -417,37 +437,36 @@ fit_parameters_loop <- function(Y, model_matrix, location_prior_df,
tau20 <- vp$tau20
df0_inv <- vp$df0_inv
}

error <- sum(mapply(function(new, old) {
sum(new - old, na.rm=TRUE)/length(new)
sum(new - old, na.rm = TRUE)/length(new)
}, list(mu0, sigma20, rho, zetainv, tau20, df0_inv), last_round_params)^2)
if (error < epsilon) {
if(verbose){
message("Converged!")
}
converged <- TRUE
}

last_round_params <- list(mu0, sigma20, rho, zetainv, tau20, df0_inv)
if(verbose){
log_parameters(last_round_params)
message(paste0("Error: ", sprintf("%.2g", error)), "\n")
}
iter <- iter + 1
}



convergence <- list(successful = converged, iterations = iter-1, error = error)
names(last_round_params) <- c("location_prior_mean", "location_prior_scale",
"dropout_curve_position", "dropout_curve_scale",
"variance_prior_scale", "variance_prior_df")
last_round_params[["dropout_curve_scale"]] <- 1/last_round_params[["dropout_curve_scale"]]
last_round_params[["variance_prior_df"]] <- 1/last_round_params[["variance_prior_df"]]

list(hyper_parameters = last_round_params,
convergence = convergence,
feature_parameters = lapply(res_reg, function(x) x[c("coefficients", "coef_variance_matrix", "n_approx", "df", "s2", "n_obs")]))

}


Expand Down