diff --git a/DESCRIPTION b/DESCRIPTION index 9b857d43..53f20b81 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: protti Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools -Version: 0.9.1 +Version: 0.9.1.9000 Authors@R: c(person(given = "Jan-Philipp", family = "Quast", diff --git a/NEWS.md b/NEWS.md index f4e23d7e..fa529c28 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,13 @@ +# protti 0.9.1.9000 + +## Additional Changes + +* `assign_peptide_type` now takes the `start` argument, containing the start position of a peptide. If a protein does not have any peptide starting at position `1` and there is a peptide starting at position `2`, this peptide will be considered "tryptic" at the N-terminus. This is because the initial Methionine is likely missing due to processing for every copy of the protein and therefore position `2` is the true N-terminus. + # protti 0.9.1 ## Bug fixes + * `try_query()` now correctly handles errors that don't return a response object. We also handle gzip decompression problems better since some databases compressed responses were not handled correctly. # protti 0.9.0 diff --git a/R/assign_peptide_type.R b/R/assign_peptide_type.R index a219d610..defe7360 100644 --- a/R/assign_peptide_type.R +++ b/R/assign_peptide_type.R @@ -24,7 +24,9 @@ peptide_type <- function(...) { #' peptide is located at the N- or C-terminus of a protein and fulfills the criterium to be #' fully-tryptic otherwise, it is also considered as fully-tryptic. Peptides that only fulfill the #' criterium on one terminus are semi-tryptic peptides. Lastly, peptides that are not fulfilling -#' the criteria for both termini are non-tryptic peptides. +#' the criteria for both termini are non-tryptic peptides. In addition, peptides that miss the initial +#' Methionine of a protein are considered "tryptic" at that site if there is no other peptide +#' starting at position 1 for that protein. #' #' @param data a data frame containing at least information about the preceding and C-terminal #' amino acids of peptides. @@ -34,49 +36,90 @@ peptide_type <- function(...) { #' acid as one letter code. #' @param aa_after a character column in the \code{data} data frame that contains the following amino #' acid as one letter code. +#' @param protein_id a character column in the \code{data} data frame that contains the protein +#' accession numbers. +#' @param start a numeric column in the \code{data} data frame that contains the start position of +#' each peptide within the corresponding protein. This is used to check if the protein is consistently +#' missing the initial Methionine, making peptides starting at position 2 "tryptic" on that site. #' #' @return A data frame that contains the input data and an additional column with the peptide #' type information. #' @import dplyr #' @importFrom magrittr %>% #' @importFrom rlang .data +#' @importFrom stringr str_detect #' @export #' #' @examples #' data <- data.frame( -#' aa_before = c("K", "S", "T"), -#' last_aa = c("R", "K", "Y"), -#' aa_after = c("T", "R", "T") +#' aa_before = c("K", "M", "", "M", "S", "M", "-"), +#' last_aa = c("R", "K", "R", "R", "Y", "K", "K"), +#' aa_after = c("T", "R", "T", "R", "T", "R", "T"), +#' protein_id = c("P1", "P1", "P3", "P3", "P2", "P2", "P2"), +#' start = c(38, 2, 1, 2, 10, 2, 1) #' ) #' -#' assign_peptide_type(data, aa_before, last_aa, aa_after) +#' assign_peptide_type(data, aa_before, last_aa, aa_after, protein_id, start) assign_peptide_type <- function(data, aa_before = aa_before, last_aa = last_aa, - aa_after = aa_after) { - data %>% - dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}) %>% - dplyr::mutate(N_term_tryp = dplyr::if_else({{ aa_before }} == "" | - {{ aa_before }} == "K" | - {{ aa_before }} == "R", - TRUE, - FALSE + aa_after = aa_after, + protein_id = protein_id, + start = start) { + # Check if there's any peptide starting at position 1 for each protein + start_summary <- data %>% + dplyr::group_by({{ protein_id }}) %>% + dplyr::summarize(has_start_1 = any({{ start }} == 1), .groups = "drop") + + peptide_data <- data %>% + dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}, {{ protein_id }}, {{ start }}, .keep_all = TRUE) %>% + dplyr::left_join(start_summary, by = rlang::as_name(rlang::enquo(protein_id))) %>% + # Determine N-terminal trypticity + dplyr::mutate(N_term_tryp = dplyr::if_else( + !stringr::str_detect({{ aa_before }}, "[A-Y]") | {{ aa_before }} == "K" | {{ aa_before }} == "R", + TRUE, + FALSE )) %>% - dplyr::mutate(C_term_tryp = dplyr::if_else({{ last_aa }} == "K" | - {{ last_aa }} == "R" | - {{ aa_after }} == "", - TRUE, - FALSE + # Determine C-terminal trypticity + dplyr::mutate(C_term_tryp = dplyr::if_else( + {{ last_aa }} == "K" | {{ last_aa }} == "R" | !stringr::str_detect({{ aa_after }}, "[A-Y]"), + TRUE, + FALSE )) %>% + # Assign peptide type based on N-term and C-term trypticity dplyr::mutate(pep_type = dplyr::case_when( - .data$N_term_tryp + .data$C_term_tryp == 2 ~ "fully-tryptic", - .data$N_term_tryp + .data$C_term_tryp == 1 ~ "semi-tryptic", - .data$N_term_tryp + .data$C_term_tryp == 0 ~ "non-tryptic" + .data$N_term_tryp & .data$C_term_tryp ~ "fully-tryptic", + .data$N_term_tryp | .data$C_term_tryp ~ "semi-tryptic", + TRUE ~ "non-tryptic" + )) %>% + # Reassign semi-tryptic peptides at position 2 to fully-tryptic if no start == 1 + dplyr::mutate(pep_type = dplyr::if_else( + .data$pep_type == "semi-tryptic" & {{ start }} == 2 & !.data$has_start_1 & .data$C_term_tryp, + "fully-tryptic", + .data$pep_type + )) %>% + # Reassign non-tryptic peptides at position 2 to semi-tryptic if no start == 1 + dplyr::mutate(pep_type = dplyr::if_else( + .data$pep_type == "non-tryptic" & {{ start }} == 2 & !.data$has_start_1 & !.data$C_term_tryp, + "fully-tryptic", + .data$pep_type )) %>% - dplyr::select(-.data$N_term_tryp, -.data$C_term_tryp) %>% - dplyr::right_join(data, by = c( - rlang::as_name(rlang::enquo(aa_before)), - rlang::as_name(rlang::enquo(last_aa)), - rlang::as_name(rlang::enquo(aa_after)) - )) + # Drop unnecessary columns + dplyr::select(-c("N_term_tryp", "C_term_tryp", "has_start_1")) + + # Join back to original data to return the full result + result <- data %>% + dplyr::left_join( + peptide_data %>% + dplyr::select({{ aa_before }}, {{ last_aa }}, {{ aa_after }}, {{ protein_id }}, {{ start }}, "pep_type"), + by = c( + rlang::as_name(rlang::enquo(aa_before)), + rlang::as_name(rlang::enquo(last_aa)), + rlang::as_name(rlang::enquo(aa_after)), + rlang::as_name(rlang::enquo(protein_id)), + rlang::as_name(rlang::enquo(start)) + ) + ) + + return(result) } diff --git a/R/qc_cvs.R b/R/qc_cvs.R index 713303d3..3ffa1ff2 100644 --- a/R/qc_cvs.R +++ b/R/qc_cvs.R @@ -122,7 +122,7 @@ The function does not handle log2 transformed data.", dplyr::mutate({{ condition }} := forcats::fct_expand({{ condition }}, "combined")) %>% dplyr::mutate({{ condition }} := replace({{ condition }}, .data$type == "cv_combined", "combined")) %>% dplyr::mutate({{ condition }} := forcats::fct_relevel({{ condition }}, "combined")) %>% - dplyr::select(-.data$type) %>% + dplyr::select(-"type") %>% dplyr::group_by({{ condition }}) %>% dplyr::mutate(median = stats::median(.data$values)) %>% dplyr::distinct() diff --git a/man/assign_peptide_type.Rd b/man/assign_peptide_type.Rd index 5f0da6ee..baff1587 100644 --- a/man/assign_peptide_type.Rd +++ b/man/assign_peptide_type.Rd @@ -8,7 +8,9 @@ assign_peptide_type( data, aa_before = aa_before, last_aa = last_aa, - aa_after = aa_after + aa_after = aa_after, + protein_id = protein_id, + start = start ) } \arguments{ @@ -23,6 +25,13 @@ acid as one letter code.} \item{aa_after}{a character column in the \code{data} data frame that contains the following amino acid as one letter code.} + +\item{protein_id}{a character column in the \code{data} data frame that contains the protein +accession numbers.} + +\item{start}{a numeric column in the \code{data} data frame that contains the start position of +each peptide within the corresponding protein. This is used to check if the protein is consistently +missing the initial Methionine, making peptides starting at position 2 "tryptic" on that site.} } \value{ A data frame that contains the input data and an additional column with the peptide @@ -34,14 +43,18 @@ Peptides with preceeding and C-terminal lysine or arginine are considered fully- peptide is located at the N- or C-terminus of a protein and fulfills the criterium to be fully-tryptic otherwise, it is also considered as fully-tryptic. Peptides that only fulfill the criterium on one terminus are semi-tryptic peptides. Lastly, peptides that are not fulfilling -the criteria for both termini are non-tryptic peptides. +the criteria for both termini are non-tryptic peptides. In addition, peptides that miss the initial +Methionine of a protein are considered "tryptic" at that site if there is no other peptide +starting at position 1 for that protein. } \examples{ data <- data.frame( - aa_before = c("K", "S", "T"), - last_aa = c("R", "K", "Y"), - aa_after = c("T", "R", "T") + aa_before = c("K", "M", "", "M", "S", "M", "-"), + last_aa = c("R", "K", "R", "R", "Y", "K", "K"), + aa_after = c("T", "R", "T", "R", "T", "R", "T"), + protein_id = c("P1", "P1", "P3", "P3", "P2", "P2", "P2"), + start = c(38, 2, 1, 2, 10, 2, 1) ) -assign_peptide_type(data, aa_before, last_aa, aa_after) +assign_peptide_type(data, aa_before, last_aa, aa_after, protein_id, start) } diff --git a/tests/testthat/test-auxiliary_functions.R b/tests/testthat/test-auxiliary_functions.R index a3012779..c08c0aa9 100644 --- a/tests/testthat/test-auxiliary_functions.R +++ b/tests/testthat/test-auxiliary_functions.R @@ -11,14 +11,18 @@ if (Sys.getenv("TEST_PROTTI") == "true") { }) protein <- fetch_uniprot(uniprot_ids = "P36578") + protein2 <- fetch_uniprot(uniprot_ids = "P00925") if (!is.null(protein)) { data <- tibble::tibble( - protein_id = rep("P36578", 3), - protein_sequence = rep(protein$sequence, 3), + protein_id = c(rep("P36578", 3), rep("P00925", 3)), + protein_sequence = c(rep(protein$sequence, 3), rep(protein2$sequence, 3)), peptide = c( stringr::str_sub(protein$sequence, start = 87, end = 97), stringr::str_sub(protein$sequence, start = 59, end = 71), - stringr::str_sub(protein$sequence, start = 10, end = 18) + stringr::str_sub(protein$sequence, start = 10, end = 18), + stringr::str_sub(protein2$sequence, start = 5, end = 10), + stringr::str_sub(protein2$sequence, start = 2, end = 15), + stringr::str_sub(protein2$sequence, start = 10, end = 15) ) ) @@ -29,12 +33,18 @@ if (Sys.getenv("TEST_PROTTI") == "true") { protein_sequence = protein_sequence, peptide_sequence = peptide ) %>% - peptide_type(aa_before = aa_before, last_aa = last_aa)) + peptide_type( + aa_before = aa_before, + last_aa = last_aa, + aa_after = aa_after, + protein_id = protein_id, + start = start + )) }) expect_is(assigned_types, "data.frame") - expect_equal(nrow(assigned_types), 3) + expect_equal(nrow(assigned_types), 6) expect_equal(ncol(assigned_types), 9) - expect_equal(assigned_types$pep_type, c("fully-tryptic", "semi-tryptic", "non-tryptic")) + expect_equal(assigned_types$pep_type, c("fully-tryptic", "semi-tryptic", "non-tryptic", "non-tryptic", "fully-tryptic", "fully-tryptic")) }) assigned_types <- data %>% @@ -42,13 +52,18 @@ if (Sys.getenv("TEST_PROTTI") == "true") { protein_sequence = protein_sequence, peptide_sequence = peptide ) %>% - assign_peptide_type(aa_before = aa_before, last_aa = last_aa) + assign_peptide_type( + aa_before = aa_before, + last_aa = last_aa, + aa_after = aa_after, + protein_id = protein_id + ) test_that("find_peptide and assign_peptide_type work", { expect_is(assigned_types, "data.frame") - expect_equal(nrow(assigned_types), 3) + expect_equal(nrow(assigned_types), 6) expect_equal(ncol(assigned_types), 9) - expect_equal(assigned_types$pep_type, c("fully-tryptic", "semi-tryptic", "non-tryptic")) + expect_equal(assigned_types$pep_type, c("fully-tryptic", "semi-tryptic", "non-tryptic", "non-tryptic", "fully-tryptic", "fully-tryptic")) }) test_that("deprecated sequence_coverage works", { @@ -60,9 +75,9 @@ if (Sys.getenv("TEST_PROTTI") == "true") { )) }) expect_is(coverage, "data.frame") - expect_equal(nrow(coverage), 3) + expect_equal(nrow(coverage), 6) expect_equal(ncol(coverage), 10) - expect_equal(unique(round(coverage$coverage, digits = 1)), 7.7) + expect_equal(unique(round(coverage$coverage, digits = 1)), c(7.7, 3.2)) }) coverage <- calculate_sequence_coverage( @@ -73,14 +88,14 @@ if (Sys.getenv("TEST_PROTTI") == "true") { test_that("calculate_sequence_coverage works", { expect_is(coverage, "data.frame") - expect_equal(nrow(coverage), 3) + expect_equal(nrow(coverage), 6) expect_equal(ncol(coverage), 10) - expect_equal(unique(round(coverage$coverage, digits = 1)), 7.7) + expect_equal(unique(round(coverage$coverage, digits = 1)), c(7.7, 3.2)) }) plot_data <- coverage %>% dplyr::mutate( - fold_change = c(3, -0.4, 2.1), + fold_change = c(3, -0.4, 2.1, 0.1, -0.1, 0.2), protein_length = nchar(protein_sequence) ) @@ -106,7 +121,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { end_position = end, protein_length = protein_length, coverage = coverage, - protein_id = protein_id, + facet = protein_id, colouring = pep_type ) expect_is(p, "ggplot") diff --git a/vignettes/data_analysis_single_dose_treatment_workflow.Rmd b/vignettes/data_analysis_single_dose_treatment_workflow.Rmd index 8cb8a6d4..95c0d397 100644 --- a/vignettes/data_analysis_single_dose_treatment_workflow.Rmd +++ b/vignettes/data_analysis_single_dose_treatment_workflow.Rmd @@ -202,7 +202,8 @@ data_filtered_uniprot <- data_filtered_proteotypic %>% assign_peptide_type( aa_before = aa_before, last_aa = last_aa, - aa_after = aa_after + aa_after = aa_after, + protein_id = pg_protein_accessions ) %>% calculate_sequence_coverage( protein_sequence = sequence,