jpquast · elena-krismer · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024 · Aug 26, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: protti
 Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools
-Version: 0.9.0
+Version: 0.9.0.9000
 Authors@R: 
     c(person(given = "Jan-Philipp",
            family = "Quast",
@@ -43,7 +43,7 @@ Imports:
     methods,
     R.utils,
     stats
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Suggests: 
     testthat,
     covr,
@@ -67,7 +67,9 @@ Suggests:
     iq,
     scales,
     farver,
-    ggforce
+    ggforce,
+    xml2,
+    jsonlite
 Depends: 
     R (>= 4.0)
 URL: https://github.com/jpquast/protti, https://jpquast.github.io/protti/

diff --git a/NAMESPACE b/NAMESPACE
@@ -134,6 +134,7 @@ importFrom(purrr,pluck)
 importFrom(purrr,pmap)
 importFrom(purrr,reduce)
 importFrom(purrr,set_names)
+importFrom(readr,read_csv)
 importFrom(readr,read_tsv)
 importFrom(readr,write_csv)
 importFrom(readr,write_tsv)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# protti 0.9.0.9000
+
+## Additional Changes
+
+* `assign_peptide_type` now takes the `protein_id` and `start` arguments, containing the protein accession and the start position of each peptide within its protein. If a protein does not have any peptide starting at position `1` but has a peptide starting at position `2`, this peptide is considered "tryptic" at its N-terminus. This is because the initial methionine has most likely been removed by processing from every copy of the protein, so position `2` is the true N-terminus.
+
 # protti 0.9.0
 
 ## New features 

diff --git a/R/assign_peptide_type.R b/R/assign_peptide_type.R
@@ -24,7 +24,9 @@ peptide_type <- function(...) {
 #' peptide is located at the N- or C-terminus of a protein and fulfills the criterium to be
 #' fully-tryptic otherwise, it is also considered as fully-tryptic. Peptides that only fulfill the
 #' criterium on one terminus are semi-tryptic peptides. Lastly, peptides that are not fulfilling
-#' the criteria for both termini are non-tryptic peptides.
+#' the criteria for both termini are non-tryptic peptides. In addition, a peptide that starts at
+#' position 2 of a protein (i.e. lacks the initial methionine) is considered "tryptic" at its
+#' N-terminus if no peptide of that protein starts at position 1.
 #'
 #' @param data a data frame containing at least information about the preceding and C-terminal
 #' amino acids of peptides.
@@ -34,49 +36,90 @@ peptide_type <- function(...) {
 #' acid as one letter code.
 #' @param aa_after a character column in the \code{data} data frame that contains the following amino
 #' acid as one letter code.
+#' @param protein_id a character column in the \code{data} data frame that contains the protein
+#' accession numbers.
+#' @param start a numeric column in the \code{data} data frame that contains the start position of
+#' each peptide within the corresponding protein. It is used to check whether any peptide of a protein
+#' starts at position 1; if none does, peptides starting at position 2 are "tryptic" at the N-terminus.
 #'
 #' @return A data frame that contains the input data and an additional column with the peptide
 #' type information.
 #' @import dplyr
 #' @importFrom magrittr %>%
 #' @importFrom rlang .data
+#' @importFrom stringr str_detect
 #' @export
 #'
 #' @examples
 #' data <- data.frame(
-#'   aa_before = c("K", "S", "T"),
-#'   last_aa = c("R", "K", "Y"),
-#'   aa_after = c("T", "R", "T")
+#'   aa_before = c("K", "M", "", "M", "S", "M", "-"),
+#'   last_aa = c("R", "K", "R", "R", "Y", "K", "K"),
+#'   aa_after = c("T", "R", "T", "R", "T", "R", "T"),
+#'   protein_id = c("P1", "P1", "P3", "P3", "P2", "P2", "P2"),
+#'   start = c(38, 2, 1, 2, 10, 2, 1)
 #' )
 #'
-#' assign_peptide_type(data, aa_before, last_aa, aa_after)
+#' assign_peptide_type(data, aa_before, last_aa, aa_after, protein_id, start)
 assign_peptide_type <- function(data,
                                 aa_before = aa_before,
                                 last_aa = last_aa,
-                                aa_after = aa_after) {
-  data %>%
-    dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}) %>%
-    dplyr::mutate(N_term_tryp = dplyr::if_else({{ aa_before }} == "" |
-      {{ aa_before }} == "K" |
-      {{ aa_before }} == "R",
-    TRUE,
-    FALSE
+                                aa_after = aa_after,
+                                protein_id = protein_id,
+                                start = start) {
+  # Check if there's any peptide starting at position 1 for each protein
+  start_summary <- data %>%
+    dplyr::group_by({{ protein_id }}) %>%
+    dplyr::summarize(has_start_1 = any({{ start }} == 1), .groups = "drop")
+
+  peptide_data <- data %>%
+    dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}, {{ protein_id }}, {{ start }}, .keep_all = TRUE) %>%
+    dplyr::left_join(start_summary, by = rlang::as_name(rlang::enquo(protein_id))) %>%
+    # Determine N-terminal trypticity
+    dplyr::mutate(N_term_tryp = dplyr::if_else(
+      !stringr::str_detect({{ aa_before }}, "[A-Y]") | {{ aa_before }} == "K" | {{ aa_before }} == "R",
+      TRUE,
+      FALSE
     )) %>%
-    dplyr::mutate(C_term_tryp = dplyr::if_else({{ last_aa }} == "K" |
-      {{ last_aa }} == "R" |
-      {{ aa_after }} == "",
-    TRUE,
-    FALSE
+    # Determine C-terminal trypticity
+    dplyr::mutate(C_term_tryp = dplyr::if_else(
+      {{ last_aa }} == "K" | {{ last_aa }} == "R" | !stringr::str_detect({{ aa_after }}, "[A-Y]"),
+      TRUE,
+      FALSE
     )) %>%
+    # Assign peptide type based on N-term and C-term trypticity
     dplyr::mutate(pep_type = dplyr::case_when(
-      .data$N_term_tryp + .data$C_term_tryp == 2 ~ "fully-tryptic",
-      .data$N_term_tryp + .data$C_term_tryp == 1 ~ "semi-tryptic",
-      .data$N_term_tryp + .data$C_term_tryp == 0 ~ "non-tryptic"
+      .data$N_term_tryp & .data$C_term_tryp ~ "fully-tryptic",
+      .data$N_term_tryp | .data$C_term_tryp ~ "semi-tryptic",
+      TRUE ~ "non-tryptic"
+    )) %>%
+    # Reassign semi-tryptic peptides at position 2 to fully-tryptic if no start == 1
+    dplyr::mutate(pep_type = dplyr::if_else(
+      .data$pep_type == "semi-tryptic" & {{ start }} == 2 & !.data$has_start_1 & .data$C_term_tryp,
+      "fully-tryptic",
+      .data$pep_type
+    )) %>%
+    # Reassign non-tryptic peptides at position 2 to semi-tryptic if no start == 1
+    dplyr::mutate(pep_type = dplyr::if_else(
+      .data$pep_type == "non-tryptic" & {{ start }} == 2 & !.data$has_start_1 & !.data$C_term_tryp,
+      "semi-tryptic",
+      .data$pep_type
     )) %>%
-    dplyr::select(-.data$N_term_tryp, -.data$C_term_tryp) %>%
-    dplyr::right_join(data, by = c(
-      rlang::as_name(rlang::enquo(aa_before)),
-      rlang::as_name(rlang::enquo(last_aa)),
-      rlang::as_name(rlang::enquo(aa_after))
-    ))
+    # Drop unnecessary columns
+    dplyr::select(-c("N_term_tryp", "C_term_tryp", "has_start_1"))
+
+  # Join back to original data to return the full result
+  result <- data %>%
+    dplyr::left_join(
+      peptide_data %>%
+        dplyr::select({{ aa_before }}, {{ last_aa }}, {{ aa_after }}, {{ protein_id }}, {{ start }}, "pep_type"),
+      by = c(
+        rlang::as_name(rlang::enquo(aa_before)),
+        rlang::as_name(rlang::enquo(last_aa)),
+        rlang::as_name(rlang::enquo(aa_after)),
+        rlang::as_name(rlang::enquo(protein_id)),
+        rlang::as_name(rlang::enquo(start))
+      )
+    )
+
+  return(result)
 }
diff --git a/R/qc_cvs.R b/R/qc_cvs.R
@@ -122,7 +122,7 @@ The function does not handle log2 transformed data.",
         dplyr::mutate({{ condition }} := forcats::fct_expand({{ condition }}, "combined")) %>%
         dplyr::mutate({{ condition }} := replace({{ condition }}, .data$type == "cv_combined", "combined")) %>%
         dplyr::mutate({{ condition }} := forcats::fct_relevel({{ condition }}, "combined")) %>%
-        dplyr::select(-.data$type) %>%
+        dplyr::select(-"type") %>%
         dplyr::group_by({{ condition }}) %>%
         dplyr::mutate(median = stats::median(.data$values)) %>%
         dplyr::distinct()

diff --git a/R/try_query.R b/R/try_query.R
@@ -13,7 +13,6 @@
 #' @param type a character value that specifies the type of data at the target URL. Options are
 #' all options that can be supplied to httr::content, these include e.g.
 #' "text/tab-separated-values", "application/json" and "txt/csv". Default is "text/tab-separated-values".
-#' Default is "tab-separated-values".
 #' @param timeout a numeric value that specifies the maximum request time. Default is 60 seconds.
 #' @param accept a character value that specifies the type of data that should be sent by the API if
 #' it uses content negotiation. The default is NULL and it should only be set for APIs that use
@@ -22,6 +21,7 @@
 #'
 #' @importFrom curl has_internet
 #' @importFrom httr GET timeout http_error message_for_status http_status content accept
+#' @importFrom readr read_tsv read_csv
 #'
 #' @return A data frame that contains the table from the url.
 try_query <-
@@ -88,7 +88,38 @@ try_query <-
     # Change variable to not show progress if readr is used
     options(readr.show_progress = FALSE)
 
-    result <- suppressMessages(httr::content(query_result, type = type, encoding = "UTF-8", ...))
+    # Retrieve the content as raw bytes using httr::content
+    raw_content <- httr::content(query_result, type = "raw")
+    # Check for gzip magic number (1f 8b) before decompression
+    compressed <- length(raw_content) >= 2 && raw_content[1] == as.raw(0x1f) && raw_content[2] == as.raw(0x8b)
+
+    # Check if the content is gzip compressed
+    if (!is.null(query_result$headers[["content-encoding"]]) && query_result$headers[["content-encoding"]] == "gzip" && compressed) {
+      # Decompress the raw content using base R's `memDecompress`
+      decompressed_content <- memDecompress(raw_content, type = "gzip")
+
+      # Convert the raw bytes to a character string
+      text_content <- rawToChar(decompressed_content)
+
+      # Read the decompressed content based on the specified type
+      if (type == "text/tab-separated-values") {
+        result <- readr::read_tsv(text_content, ...)
+      } else if (type == "text/html") {
+        result <- xml2::read_html(text_content, ...)
+      } else if (type == "text/xml") {
+        result <- xml2::read_xml(text_content, ...)
+      } else if (type == "text/csv" || type == "txt/csv") {
+        result <- readr::read_csv(text_content, ...)
+      } else if (type == "application/json") {
+        result <- jsonlite::fromJSON(text_content, ...) # Using jsonlite for JSON parsing
+      } else if (type == "text") {
+        result <- text_content # Return raw text as-is
+      } else {
+        stop("Unsupported content type: ", type)
+      }
+    } else {
+      result <- suppressMessages(httr::content(query_result, type = type, encoding = "UTF-8", ...))
+    }
 
     return(result)
   }
diff --git a/man/assign_peptide_type.Rd b/man/assign_peptide_type.Rd
diff --git a/man/try_query.Rd b/man/try_query.Rd