
improve speed of function fit_parameters_loop #9

Open · wants to merge 1 commit into base: devel
Conversation

@tnaake (Contributor) commented Jan 26, 2021

Hi @const-ae

Following the same logic as pull request #8, I had another look at the function fit_parameters_loop and slightly improved its performance. It is now slightly faster than the original version (see below; I hope it is worth checking).

However, there is a problem that prevents directly checking whether the two versions yield the same result; see:

 bench::mark(
    x1 = proDA:::fit_parameters_loop(Y = sub_sample_mat, model_matrix = model_matrix, 
                         location_prior_df = 3, moderate_location = TRUE,
                         moderate_variance = TRUE, max_iter = 20, epsilon = 1e-03),
    x2 = proDA:::fit_parameters_loop(Y = sub_sample_mat, model_matrix = model_matrix, 
                          location_prior_df = 3, moderate_location = TRUE,
                          moderate_variance = TRUE, max_iter = 20, epsilon = 1e-03)
)

Error: Each result must equal the first result:
`x1` does not equal `x2`

I guess this is due to some random number generation!

For now, see bench::mark with check = FALSE:

 bench::mark(
    x1 = proDA:::fit_parameters_loop(Y = sub_sample_mat, model_matrix = model_matrix, 
                                 location_prior_df = 3, moderate_location = TRUE,
                                 moderate_variance = TRUE, max_iter = 20, epsilon = 1e-03), 
    x2 = fit_parameters_loop(Y = sub_sample_mat, model_matrix = model_matrix, 
                                location_prior_df = 3, moderate_location = TRUE,
                                moderate_variance = TRUE, max_iter = 20, epsilon = 1e-03), 
    max_iterations = 10, check = FALSE)

  # A tibble: 2 x 13
  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory          time    gc       
   <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>          <list>  <list>   
 1 x1             48s     48s    0.0209     954MB     3.00     1   144        48s <NULL> <Rprofmem[,3] ~ <bch:t~ <tibble ~
 2 x2           46.5s   46.5s    0.0215     950MB     2.21     1   103      46.5s <NULL> <Rprofmem[,3] ~ <bch:t~ <tibble ~

Warning message:
Some expressions had a GC in every iteration; so filtering is disabled. 
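If the difference really is the RNG (the imputation step below fills missing values with rnorm draws), resetting the seed before each expression should make check = TRUE usable again. A minimal base-R sketch of the idea, where noisy_fit is a hypothetical stand-in for fit_parameters_loop:

```r
# Sketch: reset the RNG state before each expression so that an otherwise
# stochastic computation becomes reproducible and comparable.
# noisy_fit is a hypothetical stand-in for fit_parameters_loop.
noisy_fit <- function(y) mean(y) + rnorm(1, sd = 0.01)
y <- 1:100

r1 <- { set.seed(42); noisy_fit(y) }
r2 <- { set.seed(42); noisy_fit(y) }
identical(r1, r2)
#> [1] TRUE
```

The same pattern wrapped around the two fit_parameters_loop calls would let bench::mark() compare their results directly.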

Short script to generate the input variables:

library(proDA)
data <- generate_synthetic_data(n_proteins = 3000, n_conditions = 2)
design <- data$groups
data <- data$Y
col_data <- NULL
reference_level <- NULL
data_is_log_transformed <- TRUE
n_subsample = nrow(data)

if (!is.matrix(data) && !is(data, "SummarizedExperiment")) {
    if (canCoerce(data, "SummarizedExperiment")) {
        data <- as(data, "SummarizedExperiment")
    } else {
        stop("Cannot handle data of class ", class(data),
             ". It must be of type matrix or it should be possible to cast it to SummarizedExperiment")
    }
}
n_samples <- ncol(data)
n_rows <- nrow(data)
if (is.matrix(design)) {
    model_matrix <- design
    design_formula <- NULL
} else if ((is.vector(design) || is.factor(design))) {
    if (length(design) != n_samples) {
        stop(paste0("The specified design vector length (", 
                    length(design), ") does not match ", "the number of samples: ", 
                    n_samples))
    }
    model_matrix <- proDA:::convert_chr_vec_to_model_matrix(design, 
                                                    reference_level)
    design_formula <- NULL
} else if (inherits(design, "formula")) {
    if (design == formula(~1) && is.null(col_data)) {
        col_data <- as.data.frame(matrix(numeric(0), nrow = n_samples))
    }
    compl_col_data <- if (is(data, "SummarizedExperiment")) {
        if (is.null(col_data)) {
            colData(data)
        } else cbind(col_data, colData(data))
    } else {
        col_data
    }
    model_matrix <- convert_formula_to_model_matrix(design, 
                                                    compl_col_data, reference_level)
    design_formula <- design
} else {
    stop(paste0("design argument of class ", class(design),
                " is not supported. Please ", "specify a `model_matrix`, a `character vector`, or a `formula`."))
}
rownames(model_matrix) <- colnames(data)

if (is.matrix(data)) {
    if (any(!is.na(data) & data == 0)) {
        if (any(is.na(data))) {
            warning(paste0("The data contains a mix of ", 
                           sum(!is.na(data) & data == 0), " exact zeros ", 
                           "and ", sum(is.na(data)), " NA's. Will treat the zeros as valid input and not replace them with NA's."))
        } else {
            warning(paste0("The data contains ", sum(!is.na(data) & 
                                                         data == 0), " exact zeros and no NA's.", 
                           "Replacing all exact zeros with NA's."))
            data[!is.na(data) & data == 0] <- NA
        }
    }
    if (!data_is_log_transformed) {
        data <- log2(data)
    }
    data_mat <- data
} else if (is(data, "SummarizedExperiment")) {
    if (any(!is.na(assay(data)) & assay(data) == 0)) {
        if (any(is.na(assay(data)))) {
            warning(paste0("The data contains a mix of ", 
                           sum(!is.na(assay(data)) & assay(data) == 0), 
                           " exact zeros ", "and ", sum(is.na(assay(data))), 
                           " NA's. Will treat the zeros as valid input and not replace them with NA's."))
        } else {
            warning(paste0("The data contains ", sum(!is.na(assay(data)) & 
                                                         assay(data) == 0), " exact zeros and no NA's.", 
                           "Replacing all exact zeros with NA's."))
            assay(data)[!is.na(assay(data)) & assay(data) == 
                            0] <- NA
        }
    }
    if (!data_is_log_transformed) {
        assay(data) <- log2(assay(data))
    }
    data_mat <- assay(data)
    assays(data)[seq_len(length(assays(data)) - 1) + 1] <- NULL
} else {
    stop("data of type ", class(data), " is not supported.")
}

n_subsample <- min(nrow(data_mat), n_subsample)
sub_sample_mat <- data_mat[seq_len(n_subsample), , drop = FALSE]

Surprisingly, these updates (I guess most of the gain comes from the lapply changes) result in considerable speed improvements.

@tnaake changed the title from "change function fit_parameters_loop" to "improve speed of function fit_parameters_loop" on Jan 26, 2021
@const-ae (Owner) commented:

Hi Thomas,

thanks. I'll take a look right away.


I am a bit confused, because first you say

I had another look at the function fit_parameters_loop and slightly improved its performance

but in the end

these updates [...] result in considerable speed improvements.

Both times, you talk about the benchmark (2 seconds / 5% speed-up), right?


However, there is a problem that prevents directly checking whether the two versions yield the same result, see [...] I guess this is due to some random number generation!

I am a bit concerned about this, can you investigate this a bit more and quantify how much the results change?


I'll also make a few inline comments about some specific things.

Best,
Constantin

Pred_var_unreg <- mply_dbl(seq_Y, function(i){
    coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
    vapply(seq_mM, function(j)
        mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i %zero_dom_mat_mult% model_matrix[j,],
@const-ae (Owner) commented:

t(model_matrix[j,]) and mM_t[, j] are not the same!

Small example to illustrate:

mat <- matrix(1:32, nrow=8, ncol = 4)
tmat <- t(mat)

# treated as a row vector
t(mat[3,])
#>      [,1] [,2] [,3] [,4]
#> [1,]    3   11   19   27

# treated as a column Vector!
tmat[,3]
#> [1]  3 11 19 27

Created on 2021-01-27 by the reprex package (v0.3.0)
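The two forms nonetheless coincide inside a matrix product, because `%*%` promotes a plain vector on the left to a row vector (and on the right to a column vector) whenever that makes the dimensions conformable. A quick base-R check of that promotion, reusing the matrix from the reprex above:

```r
mat <- matrix(1:32, nrow = 8, ncol = 4)
tmat <- t(mat)
S <- diag(4)  # any conformable 4x4 matrix

# explicit 1x4 row matrix on the left
a <- t(mat[3, ]) %*% S %*% mat[3, ]
# plain vector on the left: %*% promotes it to a 1x4 row vector
b <- tmat[, 3] %*% S %*% mat[3, ]

identical(a, b)
#> [1] TRUE
```

So while the objects differ (matrix vs. vector), any quadratic form built with `%*%` gives the same 1x1 result either way.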

@tnaake (Contributor, Author) commented:

That's true, but they yield the same result anyway:

Pred_init_var <- mply_dbl(seq_Y, function(i){
    coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
    vapply(seq_mM, function(j)
        mM_t[, j] %*% coef_var_matrix_i %*% model_matrix[j,],
        FUN.VALUE = 0.0)
}, ncol=ncol(Y))

Pred_init_var2 <- mply_dbl(seq_Y, function(i){
    coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
    vapply(seq_mM, function(j)
        t(model_matrix[j,]) %*% coef_var_matrix_i %*% model_matrix[j,],
        FUN.VALUE = 0.0)
}, ncol=ncol(Y))

identical(Pred_init_var2, Pred_init_var)
[1] TRUE

@tnaake (Contributor, Author) commented:

microbenchmark::microbenchmark(
    Pred_init_var = mply_dbl(seq_Y, function(i){
        coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
        vapply(seq_mM, function(j)
            mM_t[, j] %*% coef_var_matrix_i %*% model_matrix[j,],
            FUN.VALUE = 0.0)
    }, ncol=ncol(Y)),
    Pred_init_var2 = mply_dbl(seq_Y, function(i){
        coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
        vapply(seq_mM, function(j)
            t(model_matrix[j,]) %*% coef_var_matrix_i %*% model_matrix[j,],
            FUN.VALUE = 0.0)
    }, ncol=ncol(Y))
)

Unit: milliseconds
       expr     min       lq     mean   median       uq      max neval cld
  Pred_init_var 43.9988 47.41380 51.77982 50.67965 55.03035  75.4580   100  a 
 Pred_init_var2 72.4167 78.46865 84.72939 83.47805 87.49775 129.3417   100   b

Pred_var_reg <- mply_dbl(seq_Y, function(i){
    coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
    vapply(seq_mM, function(j)
        mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i %zero_dom_mat_mult% model_matrix[j,],
@const-ae (Owner) commented:

same problem as above. Could this be the reason that bench::mark() detects different results?

@tnaake (Contributor, Author) commented:

Well, bench::mark() gives different results even when both expressions call the same function proDA:::fit_parameters_loop (see the first benchmark above).

  microbenchmark::microbenchmark(
        Pred_var_unreg = mply_dbl(seq_Y, function(i){
            coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
            vapply(seq_mM, function(j)
                mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                FUN.VALUE = 0.0)
        }, ncol=ncol(Y)),
        Pred_var_unreg2 = mply_dbl(seq_Y, function(i){
            coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
            vapply(seq_mM, function(j)
                t(model_matrix[j,]) %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                FUN.VALUE = 0.0)
        }, ncol=ncol(Y))
    )


Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval cld
 Pred_var_unreg 250.6602 263.1104 281.4725 267.6236 279.7790 704.7041   100  a 
 Pred_var_unreg2 296.5720 310.2032 333.9179 317.6835 342.1173 965.6829   100   b

Pred_reg <- msply_dbl(res_reg, function(x) x$coefficients) %zero_dom_mat_mult% mM_t

Pred_var_unreg <- mply_dbl(seq_Y, function(i){
    coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
@const-ae (Owner) commented:

I think this is probably a good idea, getting stuff from a list can be quite slow :)
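The hoisting pattern in question, sketched on a toy list (res here is a hypothetical stand-in for res_reg; the only change is fetching the list element once per row instead of once per inner iteration):

```r
# Sketch: hoist a repeated list lookup out of the inner vapply loop.
res <- replicate(100, list(coef_variance_matrix = diag(4)), simplify = FALSE)

inner_lookup <- function(i) {
  # list element fetched on every inner iteration
  vapply(1:4, function(j) res[[i]]$coef_variance_matrix[j, j], 0.0)
}
hoisted <- function(i) {
  coef_var_matrix_i <- res[[i]]$coef_variance_matrix  # fetched once per row
  vapply(1:4, function(j) coef_var_matrix_i[j, j], 0.0)
}

identical(inner_lookup(50), hoisted(50))
#> [1] TRUE
```

Same result, one `[[`/`$` chain per row instead of one per inner iteration.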


mM_t <- t(model_matrix)
seq_Y <- seq_len(nrow(Y))
seq_mM <- seq_len(nrow(model_matrix))
@const-ae (Owner) commented:

Can you check how big the speed-up due to those two changes is? And if it is not significant, revert the changes?

The additional variables seq_Y and seq_mM make the code more difficult to understand, because they are two additional things that you need to keep in your head when you try to understand what is going on at the end of the function.

@tnaake (Contributor, Author) commented:

microbenchmark::microbenchmark(
    Pred_init_var = {
        seq_Y <- seq_len(nrow(Y))
        seq_mM <- seq_len(nrow(model_matrix))
        Pred_init_var <- mply_dbl(seq_Y, function(i){
            coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
            vapply(seq_mM, function(j)
                mM_t[, j] %*% coef_var_matrix_i %*% model_matrix[j,],
                FUN.VALUE = 0.0)
        }, ncol=n_samples)
    },
    Pred_init_var2 = mply_dbl(seq_len(nrow(Y)), function(i){
        coef_var_matrix_i <- res_init[[i]]$coef_variance_matrix
        vapply(seq_len(nrow(model_matrix)), function(j)
            mM_t[, j] %*% coef_var_matrix_i %*% model_matrix[j,],
            FUN.VALUE = 0.0)
    }, ncol=n_samples)
)

Unit: milliseconds
         expr     min       lq     mean   median       uq     max neval cld
 Pred_init_var 42.2649 46.95005 51.42409 50.29355 54.08615 76.6705   100   a
Pred_init_var2 44.1800 48.31820 52.47180 50.84440 54.41230 78.1734   100   a

A slight speed-up here!

The following two expressions are in the convergence loop:

microbenchmark::microbenchmark(
        
        Pred_var_unreg = {
            mply_dbl(seq_Y, function(i){
                coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
                vapply(seq_mM, function(j)
                    mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                    FUN.VALUE = 0.0)
            }, ncol=n_samples)     
        },  
        Pred_var_unreg2 = {
            mply_dbl(seq_len(nrow(Y)), function(i){
                coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
                vapply(seq_len(nrow(model_matrix)), function(j)
                    mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                    FUN.VALUE = 0.0)
            }, ncol=n_samples)     
        }, times = 200)
        
        Unit: milliseconds
       expr      min       lq     mean   median       uq      max neval cld
    Pred_var_unreg 250.7857 272.2618 294.7421 291.3625 305.8123 903.5610   200   a
   Pred_var_unreg2 252.7301 276.2538 300.5611 289.7970 305.4967 796.0543   200   a

There are again some higher max values (I am not sure why this happens).

 microbenchmark::microbenchmark(
        Pred_var_reg = {mply_dbl(seq_Y, function(i){
            coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
            vapply(seq_mM, function(j)
                mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                FUN.VALUE = 0.0)
        }, ncol=ncol(Y))},
        Pred_var_reg2 = {mply_dbl(seq_len(nrow(Y)), function(i){
            coef_var_matrix_i <- res_reg[[i]]$coef_variance_matrix
            vapply(seq_len(nrow(model_matrix)), function(j)
                mM_t[, j] %zero_dom_mat_mult% coef_var_matrix_i  %zero_dom_mat_mult% model_matrix[j,],
                FUN.VALUE = 0.0)
        }, ncol=ncol(Y))}, 
        times = 200)

 Unit: milliseconds
      expr      min       lq     mean   median       uq      max neval cld
 Pred_var_reg 241.0048 260.8333 279.9133 266.5539 293.6949 383.3302   200  a 
 Pred_var_reg2 245.4748 261.0977 290.8885 275.2618 308.8479 771.8275   200   b

Again the version with pre-calculated values is slightly faster (and now the old version shows these distortions).

I guess these are only slight improvements anyway, coming from pre-calculating these values. It's up to you to decide whether the old version is preferable with respect to code maintenance.

y_na <- is.na(Y)
Y_compl[y_na] <- rnorm(sum(y_na), mean=quantile(Y, probs=0.1, na.rm=TRUE), sd=sd(Y,na.rm=TRUE)/5)

split_Y_compl <- split(Y_compl, factor(rownames(Y_compl), rownames(Y_compl)))
@const-ae (Owner) commented:

Why do you pass rownames(Y_compl) to factor twice?

What happens if the rownames are not unique? (I know one shouldn't make objects with duplicated rownames, but you can, which means that people probably do.)

@const-ae (Owner) commented:

mat <- matrix(1:20, nrow=5, ncol = 4)
rownames(mat) <- paste0("row_", 1:5)
rownames(mat)[3] <- "row_2"
split(mat, factor(rownames(mat)))
#> $row_1
#> [1]  1  6 11 16
#> 
#> $row_2
#> [1]  2  3  7  8 12 13 17 18
#> 
#> $row_4
#> [1]  4  9 14 19
#> 
#> $row_5
#> [1]  5 10 15 20

Created on 2021-01-27 by the reprex package (v0.3.0)

@const-ae (Owner) commented:

Can you please validate that split is actually faster than lapply(), because it makes the code more difficult to understand and maintain.

@tnaake (Contributor, Author) commented:

Agreed, maybe it's not a good idea to use split here given the points you raised. I will revert this.
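For reference, the plain lapply alternative (passing rownames(Y_compl) as the levels argument presumably just preserves the original row order, since factor() would otherwise sort its levels alphabetically). With unique rownames the two give the same per-row vectors; the lapply form also stays correct when rownames are duplicated or NULL:

```r
# Sketch: split() over a row-name factor vs. a plain lapply over row
# indices. With unique rownames both yield the same per-row chunks.
Y_compl <- matrix(1:20, nrow = 5, ncol = 4)
rownames(Y_compl) <- paste0("row_", 1:5)

by_split  <- unname(split(Y_compl, factor(rownames(Y_compl), rownames(Y_compl))))
by_lapply <- lapply(seq_len(nrow(Y_compl)), function(i) Y_compl[i, ])

identical(by_split, by_lapply)
#> [1] TRUE
```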


split_Y_compl <- split(Y_compl, factor(rownames(Y_compl), rownames(Y_compl)))
names(split_Y_compl) <- NULL
res_init <- lapply(split_Y_compl, function(i){
@const-ae (Owner) commented:

Can you choose a more descriptive variable name? For me, i is always an index. I think, for example, y_i would be better :)

@tnaake (Contributor, Author) commented Mar 17, 2021

I have found another bunch of code that could be slightly optimised:

res_init <- lapply(seq_len(nrow(Y)), function(i){
    pd_lm.fit(Y_compl[i, ], model_matrix,
              dropout_curve_position = rep(NA, n_samples),
              dropout_curve_scale = rep(NA, n_samples),
              verbose=verbose)
})

could become:

rep_NA <- rep(NA, n_samples)
res_init <- lapply(seq_len(nrow(Y)), function(i){
    pd_lm.fit(Y_compl[i, ], model_matrix,
              dropout_curve_position = rep_NA,
              dropout_curve_scale = rep_NA,
              verbose=verbose)
})

Benchmarking the change with microbenchmark shows a slight decrease in run time:

microbenchmark::microbenchmark(
    res_init = {
        rep_NA <- rep(NA, n_samples)
        lapply(split_Y_compl, function(i){
            pd_lm.fit(i, model_matrix,
                      dropout_curve_position = rep_NA,
                      dropout_curve_scale = rep_NA,
                      verbose=verbose)
        })},
    res_init2 = lapply(split_Y_compl, function(i){
        pd_lm.fit(i, model_matrix,
                  dropout_curve_position = rep(NA, n_samples),
                  dropout_curve_scale = rep(NA, n_samples),
                  verbose=verbose)
    })
)


Unit: seconds
  expr      min       lq     mean   median       uq      max neval cld
 res_init 2.490822 2.743049 2.980568 2.896272 3.209700 4.115103   100  a 
 res_init2 2.519873 2.766047 3.080926 2.958154 3.381294 4.317780   100   b
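One note on the hoisted rep_NA: sharing a single vector across iterations should be safe, because under R's copy-on-modify semantics a callee cannot alter the caller's vector in place (assuming pd_lm.fit follows normal R semantics). A toy check, with touch() as a hypothetical callee that writes to its argument:

```r
# Sketch: a hoisted, shared vector is safe under copy-on-modify.
rep_NA <- rep(NA, 4)
touch <- function(x) { x[1] <- 0; x }  # hypothetical callee that writes to x

modified <- touch(rep_NA)
modified[1]   # the callee modified its own copy
#> [1] 0
rep_NA        # the hoisted vector is untouched
#> [1] NA NA NA NA
```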
