Merge pull request #216 from jpquast/developer

Release 0.7.0
jpquast · Feb 17, 2024 · c831779 · c831779
2 parents 32d501e + aa88248
commit c831779
Show file tree

Hide file tree

Showing 104 changed files with 2,733 additions and 1,803 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -14,3 +14,4 @@
 ^cran-comments\.md$
 ^CRAN-RELEASE$
 ^CRAN-SUBMISSION$
+^revdep$
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -4,7 +4,8 @@ on:
   push:
     branches: [main, master]
   pull_request:
-    branches: [main, master]
+    branches:
+      - '*'
 
 name: R-CMD-check
 
@@ -43,9 +44,15 @@ jobs:
 
       - uses: r-lib/actions/setup-r-dependencies@v2
         with:
-          extra-packages: any::rcmdcheck
+          extra-packages:
+              any::rcmdcheck
           needs: check
 
+      #  run: |
+     # - name: Install remotes and lme4 package
+       #   Rscript -e 'install.packages("remotes", lib=Sys.getenv("R_LIB_FOR_PAK"))'
+        #  Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")'
+
       - uses: r-lib/actions/check-r-package@v2
         with:
           upload-snapshots: true
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -18,9 +18,9 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
 
-      - uses: r-lib/actions/setup-pandoc@v1
+      - uses: r-lib/actions/setup-pandoc@v2
 
       - name: Query dependencies
         run: |
@@ -44,7 +44,7 @@ jobs:
         shell: Rscript {0}
 
       - name: Test coverage
-        env: 
+        env:
            TEST_PROTTI: true
            BUILD_VIGNETTE: true
         run: covr::codecov()

diff --git a/.gitignore b/.gitignore
@@ -43,3 +43,5 @@ inst/doc
 doc
 Meta
 docs
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: protti
 Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools
-Version: 0.6.0.9000
+Version: 0.7.0
 Authors@R: 
     c(person(given = "Jan-Philipp",
            family = "Quast",

diff --git a/NAMESPACE b/NAMESPACE
@@ -12,6 +12,7 @@ export(calculate_kegg_enrichment)
 export(calculate_protein_abundance)
 export(calculate_sequence_coverage)
 export(calculate_treatment_enrichment)
+export(correct_lip_for_abundance)
 export(create_queue)
 export(create_structure_contact_map)
 export(create_synthetic_data)
@@ -145,6 +146,7 @@ importFrom(rlang,enquo)
 importFrom(rlang,ensym)
 importFrom(rlang,expr)
 importFrom(rlang,new_formula)
+importFrom(rlang,sym)
 importFrom(stats,median)
 importFrom(stats,na.omit)
 importFrom(stats,p.adjust)

diff --git a/NEWS.md b/NEWS.md
@@ -1,9 +1,27 @@
-# protti 0.6.0.9000
+# protti 0.7.0
+
+## New features
+
+* `correct_lip_for_abundance()` was added. It corrects LiP-peptides for changes in protein abundance and calculates their significance using a t-test. The function is based on the [MSstatsLiP](https://www.bioconductor.org/packages/release/bioc/html/MSstatsLiP.html) package developed by the Vitek Lab. Big thanks to @FehrAaron for implementing it! 
+* `qc_cvs()` received a new argument called `max_cv` that specifies the maximum CV that should be included in the plot.
+* `peptide_profile_plot()` received a new argument called `complete_sample`. If set to `TRUE`, each protein gets assigned all sample names that are found in the input data. This ensures that the plot always contains all samples on the x-axis even if there are no measured intensities for a specific sample. The default is `FALSE`, which is the original behaviour of the function.
+* `volcano_plot()` received the `colour` argument that allows the user to provide custom colours for points.
+* Increased the speed of `find_peptide()` and `assign_peptide_type()` by only computing on the smallest possible subset of data before joining back to the original data frame.
+* `calculate_treatment_enrichment()` can now be applied on data frames with multiple different groups. The enrichment will be calculated for each group separately. If the data is plotted, each group is displayed in a separate facet. The group is provided to the new `group` argument.
+* `qc_pca()`: If the condition argument is numeric a colour gradient is used instead.
 
 ## Bug fixes
 
-* `plot_volcano()` now also works interactively if there are no significant hits.
-* `fetch_chebi()` fixed an issue cased by `na_if()` that changed its behaviour after the recent `dplyr` update.
+* `volcano_plot()` now also works interactively if there are no significant hits.
+* `fetch_chebi()`: fixed an issue caused by `na_if()` that changed its behaviour after the recent `dplyr` update.
+* `qc_proteome_coverage()`: fixed the label order of fractions of proteins detected and not detected in the proteome. Fixes issue #194.
+* `calculate_protein_abundance()` now correctly retains columns if `for_plot = TRUE`. Previously the columns to retain were not joined considering the precursor column, which led to duplications of information where it did not belong. Fixes issue #197.
+* `fetch_kegg()` now returns the pathway name correctly again.
+* `qc_intensity_distribution()`, `qc_median_intensities()`, `qc_charge_states()`, `qc_contaminants()`, `qc_missed_cleavages()`, `qc_peptide_type()`, `qc_ids()`: If the provided sample column is of type factor, the level order won't be overwritten anymore.
+* `fit_drc_4p()`: If there are no correlations, an empty data frame is returned to prevent errors in `parallel_fit_drc_4p()`.
+* `calculate_sequence_coverage()` does not fail anymore if a protein only contains `NA` peptide sequences.
+* `qc_sequence_coverage()` does not return a plot anymore if `plot = FALSE`. This fixes issue #207.
+* `qc_data_completeness()`: if the sample column was of type `factor`, the function did not properly facet the data when the `digestion` argument was provided. Now we filter out all 0% completeness values that come from factor levels that are not present in the subsetted data.
 
 # protti 0.6.0
 

diff --git a/R/assign_missingness.R b/R/assign_missingness.R
@@ -30,18 +30,17 @@
 #' @return A data frame that contains the reference condition paired with each treatment condition.
 #' The \code{comparison} column contains the comparison name for the specific treatment/reference
 #' pair. The \code{missingness} column reports the type of missingness.
-#' \itemize{
-#' \item{"complete": }{No missing values for every replicate of this reference/treatment pair for
-#' the specific grouping variable.}
-#' \item{"MNAR": }{Missing not at random. All replicates of either the reference or treatment
-#' condition have missing values for the specific grouping variable.}
-#' \item{"MAR": }{Missing at random. At least n-1 replicates have missing values for the
-#' reference/treatment pair for the specific grouping varible.}
-#' \item{NA: }{The comparison is not complete enough to fall into any other category. It will not
+#' * "complete": No missing values for every replicate of this reference/treatment pair for
+#' the specific grouping variable.
+#' * "MNAR": Missing not at random. All replicates of either the reference or treatment
+#' condition have missing values for the specific grouping variable.
+#' * "MAR": Missing at random. At least n-1 replicates have missing values for the
+#' reference/treatment pair for the specific grouping variable.
+#' * NA: The comparison is not complete enough to fall into any other category. It will not
 #' be imputed if imputation is performed. For statistical significance testing these comparisons
 #' are filtered out after the test and prior to p-value adjustment. This can be prevented by setting
-#' `filter_NA_missingness = FALSE` in the `calculate_diff_abundance()` function.}
-#' }
+#' `filter_NA_missingness = FALSE` in the `calculate_diff_abundance()` function.
+#'
 #' The type of missingness has an influence on the way values are imputed if imputation is
 #' performed subsequently using the `impute()` function. How each type of missingness is
 #' specifically imputed can be found in the function description. The type of missingness
@@ -128,7 +127,7 @@ from the conditions and assigned their missingness. The created comparisons are:
 
   # create dataframe that contains all combinations to be tested
   all_combinations <- all_combinations %>%
-    tidyr::pivot_longer(cols = c(.data$V1, .data$V2), names_to = "name", values_to = rlang::as_name(rlang::enquo(condition))) %>%
+    tidyr::pivot_longer(cols = c("V1", "V2"), names_to = "name", values_to = rlang::as_name(rlang::enquo(condition))) %>%
     dplyr::select(-.data$name) %>%
     dplyr::group_by({{ condition }}) %>%
     dplyr::mutate(comparison = list(.data$combinations)) %>%
@@ -143,7 +142,7 @@ from the conditions and assigned their missingness. The created comparisons are:
     dplyr::mutate(n_replicates = dplyr::n()) %>%
     dplyr::ungroup() %>%
     dplyr::left_join(all_combinations, by = rlang::as_name(rlang::enquo(condition))) %>%
-    tidyr::unnest(.data$comparison)
+    tidyr::unnest("comparison")
 
   # check if there are any unequal replicate comparisons
   unequal_replicates <- data_prep %>%
@@ -152,25 +151,26 @@ from the conditions and assigned their missingness. The created comparisons are:
     dplyr::group_by(.data$comparison) %>%
     dplyr::mutate(n = dplyr::n()) %>%
     dplyr::filter(.data$n > 1) %>%
-    dplyr::mutate(n_replicates = paste0(.data$n_replicates, collapse = "/")) 
-  
-  if(any(unequal_replicates$n > 2)){
+    dplyr::mutate(n_replicates = paste0(.data$n_replicates, collapse = "/"))
+
+  if (any(unequal_replicates$n > 2)) {
     stop(
       "\n",
       strwrap('Some created comparisons seem to have more than two unequal number of replicates.
               This usually only happens if the wrong grouping variable was selected. Please check this!
               The grouping variable should split the dataset so that each sample of each condition only
-              appears once for each element of the grouping. E.g. grouping peptide: Each peptide should 
+              appears once for each element of the grouping. E.g. grouping peptide: Each peptide should
               only have sample_1 associated once with condition_1 and not twice or more often. If in this
               case grouping "protein" was inadvertently selected a protein might have multiple peptides, each
               containing sample_1 of condition_1, which means it appears more than once (appears as many times
               as there are peptides per protein). This means each condition can have an unequal number of replicates
-              that is as high as the max number of proteins, which is not the correct calculation for replicates.', 
-              prefix = "\n", initial = ""), "\n"
+              that is as high as the max number of proteins, which is not the correct calculation for replicates.',
+        prefix = "\n", initial = ""
+      ), "\n"
     )
   }
-  
-    unequal_replicates <- unequal_replicates %>% 
+
+  unequal_replicates <- unequal_replicates %>%
     dplyr::distinct(.data$n_replicates, .data$comparison)
 
   if (nrow(unequal_replicates) != 0) {
@@ -190,21 +190,21 @@ from the conditions and assigned their missingness. The created comparisons are:
     )) %>%
     split(.$comparison) %>%
     purrr::map_df(.f = ~ .x %>%
-      tidyr::pivot_wider(names_from = .data$type, values_from = c(.data$n_detect, .data$n_replicates)) %>%
+      tidyr::pivot_wider(names_from = "type", values_from = c("n_detect", "n_replicates")) %>%
       dplyr::group_by({{ grouping }}) %>%
-      tidyr::fill(.data$n_detect_treated, .data$n_detect_control, .data$n_replicates_treated, .data$n_replicates_control, .direction = "updown") %>%
+      tidyr::fill("n_detect_treated", "n_detect_control", "n_replicates_treated", "n_replicates_control", .direction = "updown") %>%
       dplyr::ungroup() %>%
       dplyr::mutate(missingness = dplyr::case_when(
         .data$n_detect_control == .data$n_replicates_control &
           .data$n_detect_treated == .data$n_replicates_treated ~ "complete",
-        .data$n_detect_control <= floor(n_replicates_control * 0.2) &
+        .data$n_detect_control <= floor(n_replicates_control * completeness_MNAR) &
           .data$n_detect_treated == .data$n_replicates_treated ~ "MNAR",
         .data$n_detect_control == .data$n_replicates_control &
-          .data$n_detect_treated <= floor(n_replicates_treated * 0.2) ~ "MNAR",
-        .data$n_detect_control >= max(floor(.data$n_replicates_control * 0.7), 1) &
-          .data$n_detect_treated >= max(floor(.data$n_replicates_control * 0.7), 1) ~ "MAR"
+          .data$n_detect_treated <= floor(n_replicates_treated * completeness_MNAR) ~ "MNAR",
+        .data$n_detect_control >= max(floor(.data$n_replicates_control * completeness_MAR), 1) &
+          .data$n_detect_treated >= max(floor(.data$n_replicates_control * completeness_MAR), 1) ~ "MAR"
       ))) %>%
-    dplyr::select(-c(.data$n_detect_control, .data$n_detect_treated, .data$n_replicates_control, .data$n_replicates_treated)) %>%
+    dplyr::select(-c("n_detect_control", "n_detect_treated", "n_replicates_control", "n_replicates_treated")) %>%
     # Arrange by grouping but in a numeric order of the character vector.
     dplyr::arrange(factor({{ grouping }}, levels = unique(stringr::str_sort({{ grouping }}, numeric = TRUE))))
 

diff --git a/R/assign_peptide_type.R b/R/assign_peptide_type.R
@@ -55,6 +55,7 @@ assign_peptide_type <- function(data,
                                 last_aa = last_aa,
                                 aa_after = aa_after) {
   data %>%
+    dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}) %>%
     dplyr::mutate(N_term_tryp = dplyr::if_else({{ aa_before }} == "" |
       {{ aa_before }} == "K" |
       {{ aa_before }} == "R",
@@ -72,5 +73,10 @@ assign_peptide_type <- function(data,
       .data$N_term_tryp + .data$C_term_tryp == 1 ~ "semi-tryptic",
       .data$N_term_tryp + .data$C_term_tryp == 0 ~ "non-tryptic"
     )) %>%
-    dplyr::select(-.data$N_term_tryp, -.data$C_term_tryp)
+    dplyr::select(-.data$N_term_tryp, -.data$C_term_tryp) %>%
+    dplyr::right_join(data, by = c(
+      rlang::as_name(rlang::enquo(aa_before)),
+      rlang::as_name(rlang::enquo(last_aa)),
+      rlang::as_name(rlang::enquo(aa_after))
+    ))
 }
diff --git a/R/barcode_plot.R b/R/barcode_plot.R
@@ -102,14 +102,15 @@ barcode_plot <- function(data,
   # Create plot
   data %>%
     ggplot2::ggplot() +
-    ggplot2::geom_rect(ggplot2::aes(
-      ymin = -2.5,
-      ymax = 2.5,
-      xmax = {{ end_position }} / {{ protein_length }} * 100,
-      xmin = ({{ start_position }} - 1) / {{ protein_length }} * 100,
-      fill = {{ colouring }}
-    ),
-    size = 0.7
+    ggplot2::geom_rect(
+      ggplot2::aes(
+        ymin = -2.5,
+        ymax = 2.5,
+        xmax = {{ end_position }} / {{ protein_length }} * 100,
+        xmin = ({{ start_position }} - 1) / {{ protein_length }} * 100,
+        fill = {{ colouring }}
+      ),
+      size = 0.7
     ) +
     ggplot2::scale_fill_manual(values = c(
       "#999999", "#5680C1", "#B96DAD", "#64CACA", "#81ABE9", "#F6B8D1", "#99F1E4", "#9AD1FF", "#548BDF", "#A55098", "#3EB6B6",

diff --git a/R/calculate_aa_scores.R b/R/calculate_aa_scores.R
@@ -59,7 +59,7 @@ calculate_aa_scores <- function(data,
     dplyr::mutate(score = -log10({{ adj_pval }}) * abs({{ diff }})) %>%
     dplyr::rowwise() %>%
     dplyr::mutate(residue = list(seq({{ start_position }}, {{ end_position }}))) %>%
-    tidyr::unnest(.data$residue) %>%
+    tidyr::unnest("residue") %>%
     dplyr::group_by({{ protein }}, .data$residue) %>%
     dplyr::mutate(amino_acid_score = mean(.data$score)) %>%
     dplyr::distinct({{ protein }}, .data$residue, .data$amino_acid_score)