From 25f9abbd65f46a96cb1287e1b81a67f10a3c6a2d Mon Sep 17 00:00:00 2001
From: jpquast
Date: Fri, 17 May 2024 11:31:06 +0200
Subject: [PATCH 01/35] fix bug and improve calculate_go_enrichment

---
 NAMESPACE                      |  1 +
 NEWS.md                        | 12 ++++++++++++
 R/calculate_go_enrichment.R    | 34 ++++++++++++++++++++++++++++------
 man/calculate_go_enrichment.Rd | 11 +++++++++++
 4 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 09754811..c6c13c2a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -147,6 +147,7 @@ importFrom(rlang,ensym)
 importFrom(rlang,expr)
 importFrom(rlang,new_formula)
 importFrom(rlang,sym)
+importFrom(scales,number_format)
 importFrom(stats,median)
 importFrom(stats,na.omit)
 importFrom(stats,p.adjust)
diff --git a/NEWS.md b/NEWS.md
index b49386b8..490e7387 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,15 @@
+# protti 0.8.0.9000
+
+## New features
+
+* `calculate_go_enrichment()` gained two additional arguments.
+  * `replace_long_name`: a logical argument that specifies if GO term names above 50 characters should be replaced by the GO ID instead for the plot. This ensures that the plotting area doesn't become too small due to the long name. The default is `TRUE`.
+  * `label_move_frac`: a numeric argument between 0 and 1 that specifies which labels should be moved outside of the bar. The default is 0.2, which means that the labels of all bars that have a size of 20% or less of the largest bar are moved to the right of the bar. This prevents labels from overlapping with the bar boundaries.
+
+## Bug fixes
+
+* `calculate_go_enrichment()` now correctly uses the total number of provided proteins for the contingency table. Previously, it incorrectly considered only proteins with a GO annotation for the enrichment analysis.
+
 # protti 0.8.0
 
 ## New features
diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R
index 8db67eaa..febb25df 100644
--- a/R/calculate_go_enrichment.R
+++ b/R/calculate_go_enrichment.R
@@ -82,6 +82,13 @@ go_enrichment <- function(...) {
 #' determines if the enrichment analysis should be performed in order to check for both enrichemnt and
 #' deenrichemnt or only one of the two. This affects the statistics performed and therefore also the displayed
 #' plot.
+#' @param replace_long_name a logical argument that specifies if GO term names above 50 characters should
+#' be replaced by the GO ID instead for the plot. This ensures that the plotting area doesn't become
+#' too small due to the long name. The default is `TRUE`.
+#' @param label_move_frac a numeric argument between 0 and 1 that specifies which labels should be
+#' moved outside of the bar. The default is 0.2, which means that the labels of all bars that have a size
+#' of 20% or less of the largest bar are moved to the right of the bar. This prevents labels from
+#' overlapping with the bar boundaries.
 #' @param min_n_detected_proteins_in_process is a numeric argument that specifies the minimum number of
 #' detected proteins required for a GO term to be displayed in the plot. The default is 1, meaning
 #' no filtering of the plotted data is performed. This argument does not affect any computations or
@@ -111,6 +118,7 @@
 #' @importFrom rlang .data !! 
ensym #' @importFrom magrittr %>% #' @importFrom purrr map +#' @importFrom scales number_format #' @export #' #' @examples @@ -217,6 +225,8 @@ calculate_go_enrichment <- function(data, heatmap_fill_colour_rev = TRUE, label = TRUE, enrichment_type = "all", + replace_long_name = TRUE, + label_move_frac = 0.2, min_n_detected_proteins_in_process = 1, plot_cutoff = "adj_pval top10") { # to avoid note about no global variable binding. Usually this can be avoided with @@ -331,7 +341,6 @@ if you used the right organism ID.", prefix = "\n", initial = "")) # group argument is not missing cont_table <- go_data %>% - tidyr::drop_na(.data$go_id, {{ is_significant }}) %>% { # group argument is missing if (group_missing) { dplyr::group_by(., {{ is_significant }}) @@ -355,7 +364,8 @@ if you used the right organism ID.", prefix = "\n", initial = "")) } } %>% tidyr::complete(.data$go_id, tidyr::nesting(!!rlang::ensym(is_significant), n_sig), fill = list(n_has_process = 0)) %>% - dplyr::ungroup() + dplyr::ungroup() %>% + tidyr::drop_na(.data$go_id) if (group_missing) { @@ -437,6 +447,11 @@ if you used the right organism ID.", prefix = "\n", initial = "")) filtered_result_table <- result_table %>% dplyr::ungroup() %>% dplyr::filter(.data$n_detected_proteins_in_process >= min_n_detected_proteins_in_process) + + if(replace_long_name){ + filtered_result_table <- filtered_result_table %>% + mutate(term = ifelse(nchar(.data$term) > 50, .data$go_id, .data$term)) + } if (!missing(group) & y_axis_free & plot_style == "barplot") { # arrange table by group and go term for plot @@ -449,11 +464,14 @@ if you used the right organism ID.", prefix = "\n", initial = "")) if (stringr::str_detect(plot_cutoff, pattern = "top")) { split_cutoff <- stringr::str_split(plot_cutoff, pattern = " ", simplify = TRUE) type <- split_cutoff[1] - top <- stringr::str_extract(split_cutoff[2], pattern = "\\d+") + top <- as.numeric(stringr::str_extract(split_cutoff[2], pattern = "\\d+")) plot_input <- filtered_result_table %>% dplyr::ungroup() %>% dplyr::mutate(neg_log_sig = -log10(!!rlang::ensym(type))) %>% - dplyr::slice(1:top) + dplyr::group_by({{ group }}) %>% + dplyr::mutate(n = 1:dplyr::n()) %>% + dplyr::filter(n <= top) + # dplyr::slice(1:top) } else { split_cutoff <- stringr::str_split(plot_cutoff, pattern = " ", simplify = TRUE) type <- split_cutoff[1] @@ -463,6 +481,10 @@ if you used the right organism ID.", prefix = "\n", initial = "")) dplyr::mutate(neg_log_sig = -log10(!!rlang::ensym(type))) %>% dplyr::filter(!!rlang::ensym(type) <= threshold) } + + # move label if bar is less than 20% (default) of largest bar + plot_input <- plot_input %>% + mutate(hjust = ifelse((.data$neg_log_sig / max(.data$neg_log_sig)) < label_move_frac, -0.15, 1.05)) if (plot_style == "barplot") { # Check if ggforce package is available. If not prompt user to install it. 
@@ -509,13 +531,13 @@ if you used the right organism ID.", prefix = "\n", initial = "")) "%)" ), y = .data$neg_log_sig - 0.1, - hjust = 1 + hjust = .data$hjust ) ) } } + ggplot2::scale_fill_manual(values = c(Deenriched = barplot_fill_colour[1], Enriched = barplot_fill_colour[2])) + - ggplot2::scale_y_continuous(breaks = seq(0, 100, 1)) + + ggplot2::scale_y_continuous(labels = scales::number_format(accuracy = 1)) + ggplot2::coord_flip() + { if (!missing(group)) { diff --git a/man/calculate_go_enrichment.Rd b/man/calculate_go_enrichment.Rd index 90b06303..884bff16 100644 --- a/man/calculate_go_enrichment.Rd +++ b/man/calculate_go_enrichment.Rd @@ -23,6 +23,8 @@ calculate_go_enrichment( heatmap_fill_colour_rev = TRUE, label = TRUE, enrichment_type = "all", + replace_long_name = TRUE, + label_move_frac = 0.2, min_n_detected_proteins_in_process = 1, plot_cutoff = "adj_pval top10" ) @@ -99,6 +101,15 @@ determines if the enrichment analysis should be performed in order to check for deenrichemnt or only one of the two. This affects the statistics performed and therefore also the displayed plot.} +\item{replace_long_name}{a logical argument that specifies if GO term names above 50 characters should +be replaced by the GO ID instead for the plot. This ensures that the plotting area doesn't become +too small due to the long name. The default is \code{TRUE}.} + +\item{label_move_frac}{a numeric argument between 0 and 1 that specifies which labels should be +moved outside of the bar. The default is 0.2, which means that the labels of all bars that have a size +of 20\% or less of the largest bar are moved to the right of the bar. This prevents labels from +overlapping with the bar boundaries.} + \item{min_n_detected_proteins_in_process}{is a numeric argument that specifies the minimum number of detected proteins required for a GO term to be displayed in the plot. The default is 1, meaning no filtering of the plotted data is performed. This argument does not affect any computations or From 14d8c487deb7ac2442c64f71211aef51928580c4 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 17 May 2024 09:43:35 +0000 Subject: [PATCH 02/35] Style code (GHA) --- R/calculate_go_enrichment.R | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index febb25df..12dd462e 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -82,12 +82,12 @@ go_enrichment <- function(...) { #' determines if the enrichment analysis should be performed in order to check for both enrichemnt and #' deenrichemnt or only one of the two. This affects the statistics performed and therefore also the displayed #' plot. -#' @param replace_long_name a logical argument that specifies if GO term names above 50 characters should -#' be replaced by the GO ID instead for the plot. This ensures that the plotting area doesn't become +#' @param replace_long_name a logical argument that specifies if GO term names above 50 characters should +#' be replaced by the GO ID instead for the plot. This ensures that the plotting area doesn't become #' too small due to the long name. The default is `TRUE`. -#' @param label_move_frac a numeric argument between 0 and 1 that specifies which labels should be +#' @param label_move_frac a numeric argument between 0 and 1 that specifies which labels should be #' moved outside of the bar. 
The default is 0.2, which means that the labels of all bars that have a size -#' of 20% or less of the largest bar are moved to the right of the bar. This prevents labels from +#' of 20% or less of the largest bar are moved to the right of the bar. This prevents labels from #' overlapping with the bar boundaries. #' @param min_n_detected_proteins_in_process is a numeric argument that specifies the minimum number of #' detected proteins required for a GO term to be displayed in the plot. The default is 1, meaning @@ -364,7 +364,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) } } %>% tidyr::complete(.data$go_id, tidyr::nesting(!!rlang::ensym(is_significant), n_sig), fill = list(n_has_process = 0)) %>% - dplyr::ungroup() %>% + dplyr::ungroup() %>% tidyr::drop_na(.data$go_id) @@ -447,9 +447,9 @@ if you used the right organism ID.", prefix = "\n", initial = "")) filtered_result_table <- result_table %>% dplyr::ungroup() %>% dplyr::filter(.data$n_detected_proteins_in_process >= min_n_detected_proteins_in_process) - - if(replace_long_name){ - filtered_result_table <- filtered_result_table %>% + + if (replace_long_name) { + filtered_result_table <- filtered_result_table %>% mutate(term = ifelse(nchar(.data$term) > 50, .data$go_id, .data$term)) } @@ -468,10 +468,10 @@ if you used the right organism ID.", prefix = "\n", initial = "")) plot_input <- filtered_result_table %>% dplyr::ungroup() %>% dplyr::mutate(neg_log_sig = -log10(!!rlang::ensym(type))) %>% - dplyr::group_by({{ group }}) %>% - dplyr::mutate(n = 1:dplyr::n()) %>% + dplyr::group_by({{ group }}) %>% + dplyr::mutate(n = 1:dplyr::n()) %>% dplyr::filter(n <= top) - # dplyr::slice(1:top) + # dplyr::slice(1:top) } else { split_cutoff <- stringr::str_split(plot_cutoff, pattern = " ", simplify = TRUE) type <- split_cutoff[1] @@ -481,9 +481,9 @@ if you used the right organism ID.", prefix = "\n", initial = "")) dplyr::mutate(neg_log_sig = -log10(!!rlang::ensym(type))) %>% dplyr::filter(!!rlang::ensym(type) <= threshold) } - + # move label if bar is less than 20% (default) of largest bar - plot_input <- plot_input %>% + plot_input <- plot_input %>% mutate(hjust = ifelse((.data$neg_log_sig / max(.data$neg_log_sig)) < label_move_frac, -0.15, 1.05)) if (plot_style == "barplot") { From 733b6e9b41ce4d4891d689d838b54d1e826cb3c8 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 17 May 2024 14:45:53 +0200 Subject: [PATCH 03/35] remove line --- R/calculate_go_enrichment.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index 12dd462e..0568b808 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -471,7 +471,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) dplyr::group_by({{ group }}) %>% dplyr::mutate(n = 1:dplyr::n()) %>% dplyr::filter(n <= top) - # dplyr::slice(1:top) + } else { split_cutoff <- stringr::str_split(plot_cutoff, pattern = " ", simplify = TRUE) type <- split_cutoff[1] From aa7bf3ef9957d02169b164c1d8ae9dc6891d7354 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 17 May 2024 15:00:39 +0200 Subject: [PATCH 04/35] remove scales from import --- R/calculate_go_enrichment.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index 0568b808..eac60b1e 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -118,7 +118,6 @@ go_enrichment <- function(...) { #' @importFrom rlang .data !! 
ensym #' @importFrom magrittr %>% #' @importFrom purrr map -#' @importFrom scales number_format #' @export #' #' @examples From 851c1cd7aa94f134d522e6eec44b3f881b05a100 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 17 May 2024 13:02:56 +0000 Subject: [PATCH 05/35] Style code (GHA) --- R/calculate_go_enrichment.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index eac60b1e..b03c7aa1 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -470,7 +470,6 @@ if you used the right organism ID.", prefix = "\n", initial = "")) dplyr::group_by({{ group }}) %>% dplyr::mutate(n = 1:dplyr::n()) %>% dplyr::filter(n <= top) - } else { split_cutoff <- stringr::str_split(plot_cutoff, pattern = " ", simplify = TRUE) type <- split_cutoff[1] From eed10729287460174f0fe12ef2ff5561a3ce9d64 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 17 May 2024 15:29:42 +0200 Subject: [PATCH 06/35] fix bug for good --- NAMESPACE | 1 - 1 file changed, 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index c6c13c2a..09754811 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -147,7 +147,6 @@ importFrom(rlang,ensym) importFrom(rlang,expr) importFrom(rlang,new_formula) importFrom(rlang,sym) -importFrom(scales,number_format) importFrom(stats,median) importFrom(stats,na.omit) importFrom(stats,p.adjust) From 22093824d916ca5edbf33d9013a12676ed576b7c Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 17 May 2024 16:25:19 +0200 Subject: [PATCH 07/35] fix test --- R/calculate_go_enrichment.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index b03c7aa1..6caef2df 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -447,7 +447,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) dplyr::ungroup() %>% dplyr::filter(.data$n_detected_proteins_in_process >= min_n_detected_proteins_in_process) - if (replace_long_name) { + if (replace_long_name & missing(go_data) & !missing(go_annotations_uniprot)) { filtered_result_table <- filtered_result_table %>% mutate(term = ifelse(nchar(.data$term) > 50, .data$go_id, .data$term)) } From 75e5a9d7a3ef253fa30e1da6bdc061904f4a2543 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 17 May 2024 17:26:00 +0200 Subject: [PATCH 08/35] fix another bug --- R/calculate_go_enrichment.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index 6caef2df..14931340 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -447,7 +447,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) dplyr::ungroup() %>% dplyr::filter(.data$n_detected_proteins_in_process >= min_n_detected_proteins_in_process) - if (replace_long_name & missing(go_data) & !missing(go_annotations_uniprot)) { + if (replace_long_name & !missing(go_annotations_uniprot)) { filtered_result_table <- filtered_result_table %>% mutate(term = ifelse(nchar(.data$term) > 50, .data$go_id, .data$term)) } From fa4d048e0fd83b6eefb0f2f536873496e65dd217 Mon Sep 17 00:00:00 2001 From: jpquast Date: Sat, 18 May 2024 12:06:08 +0200 Subject: [PATCH 09/35] fix example --- R/calculate_go_enrichment.R | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index 14931340..05dc6e33 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -252,24 
+252,23 @@ calculate_go_enrichment <- function(data, 'adj_pval top5', 'pval 0.05'") } - if (length(unique(dplyr::pull(data, {{ protein_id }}))) != nrow(data)) { - data <- data %>% - dplyr::ungroup() %>% - { - # conditional grouping - if (!group_missing) { - dplyr::distinct(., {{ protein_id }}, {{ is_significant }}, {{ go_annotations_uniprot }}, {{ group }}) %>% - dplyr::group_by({{ protein_id }}, {{ group }}) - } else { - dplyr::distinct(., {{ protein_id }}, {{ is_significant }}, {{ go_annotations_uniprot }}) %>% - dplyr::group_by({{ protein_id }}) - } - } %>% - dplyr::mutate({{ is_significant }} := ifelse(sum({{ is_significant }}, na.rm = TRUE) > 0, TRUE, FALSE)) %>% - # do this to remove accidental double annotations - dplyr::ungroup() %>% - dplyr::distinct() - } + # Clean-up data + data <- data %>% + dplyr::ungroup() %>% + { + # conditional grouping + if (!group_missing) { + dplyr::distinct(., {{ protein_id }}, {{ is_significant }}, {{ go_annotations_uniprot }}, {{ group }}) %>% + dplyr::group_by({{ protein_id }}, {{ group }}) + } else { + dplyr::distinct(., {{ protein_id }}, {{ is_significant }}, {{ go_annotations_uniprot }}) %>% + dplyr::group_by({{ protein_id }}) + } + } %>% + dplyr::mutate({{ is_significant }} := ifelse(sum({{ is_significant }}, na.rm = TRUE) > 0, TRUE, FALSE)) %>% + # do this to remove accidental double annotations + dplyr::ungroup() %>% + dplyr::distinct() if (!missing(go_annotations_uniprot)) { if (!stringr::str_detect( From d11176ab3ff05be6bbfb0140d3891bad31f6c75d Mon Sep 17 00:00:00 2001 From: jpquast Date: Tue, 21 May 2024 22:21:43 +0200 Subject: [PATCH 10/35] Add dplyr:: dependency --- R/calculate_go_enrichment.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index 05dc6e33..c2aa15d0 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -402,7 +402,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) go_id = .y ) ) %>% - mutate({{ group }} := .y) + dplyr::mutate({{ group }} := .y) } ) } @@ -448,7 +448,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) if (replace_long_name & !missing(go_annotations_uniprot)) { filtered_result_table <- filtered_result_table %>% - mutate(term = ifelse(nchar(.data$term) > 50, .data$go_id, .data$term)) + dplyr::mutate(term = ifelse(nchar(.data$term) > 50, .data$go_id, .data$term)) } if (!missing(group) & y_axis_free & plot_style == "barplot") { From 01bca2812727e68a217a2182b620b1179fdd6aa0 Mon Sep 17 00:00:00 2001 From: jpquast Date: Tue, 21 May 2024 22:23:24 +0200 Subject: [PATCH 11/35] Add one more dplyr:: --- R/calculate_go_enrichment.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index c2aa15d0..6e37539c 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -481,7 +481,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) # move label if bar is less than 20% (default) of largest bar plot_input <- plot_input %>% - mutate(hjust = ifelse((.data$neg_log_sig / max(.data$neg_log_sig)) < label_move_frac, -0.15, 1.05)) + dplyr::mutate(hjust = ifelse((.data$neg_log_sig / max(.data$neg_log_sig)) < label_move_frac, -0.15, 1.05)) if (plot_style == "barplot") { # Check if ggforce package is available. If not prompt user to install it. 
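Before the retry work in the next patches, a note on the label-placement rule that PATCH 01-11 introduce and then clean up: the logic reduces to a single mutate() call. A minimal, self-contained sketch follows; the tibble is hypothetical toy data, and neg_log_sig stands in for the -log10 adjusted p-values that the real function computes from the enrichment test.

    library(dplyr)

    # hypothetical GO results standing in for calculate_go_enrichment() output
    plot_input <- tibble::tibble(
      term        = c("GO:0006412", "GO:0006457", "GO:0006979"),
      neg_log_sig = c(12, 4, 1.5)
    )

    label_move_frac <- 0.2 # default of the new argument

    plot_input <- plot_input %>%
      mutate(
        # labels of bars shorter than label_move_frac (20%) of the largest bar
        # are moved outside the bar (hjust = -0.15); all other labels stay
        # right-aligned inside the bar (hjust = 1.05)
        hjust = ifelse(neg_log_sig / max(neg_log_sig) < label_move_frac, -0.15, 1.05)
      )
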
From e539bfcca73d4cd7d585b62a7d233fcbb98a4377 Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 22 May 2024 21:11:12 +0200 Subject: [PATCH 12/35] Adjust query of uniprot info --- R/fetch_uniprot.R | 16 ++++++++++++++-- R/try_query.R | 18 ++++++++++++++---- man/fetch_uniprot.Rd | 2 ++ man/try_query.Rd | 4 +++- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/R/fetch_uniprot.R b/R/fetch_uniprot.R index bda3d5ee..6cdc6bea 100644 --- a/R/fetch_uniprot.R +++ b/R/fetch_uniprot.R @@ -53,6 +53,8 @@ fetch_uniprot <- "xref_pdb" ), batchsize = 200, + max_tries = 10, + timeout = 10, show_progress = TRUE) { if (!curl::has_internet()) { message("No internet connection.") @@ -125,7 +127,12 @@ They were fetched and the original input ID can be found in the "input_id" colum collapsed_columns )) - query <- try_query(query_url, progress = FALSE, show_col_types = FALSE) + query <- try_query(query_url, + max_tries = max_tries, + try_if_timeout = TRUE, + timeout = timeout, + progress = FALSE, + show_col_types = FALSE) if (show_progress == TRUE) { pb$tick() @@ -200,7 +207,12 @@ They were fetched and the original input ID can be found in the "input_id" colum collapsed_columns )) - new_result <- try_query(new_query_url, progress = FALSE, show_col_types = FALSE) + new_result <- try_query(new_query_url, + max_tries = max_tries, + try_if_timeout = TRUE, + timeout = timeout, + progress = FALSE, + show_col_types = FALSE) # If a problem occurs at this step NULL is returned. if (!methods::is(new_result, "data.frame")) { message(new_result) diff --git a/R/try_query.R b/R/try_query.R index 27e1619e..0167fbe3 100644 --- a/R/try_query.R +++ b/R/try_query.R @@ -7,7 +7,11 @@ #' @param url a character value of an URL to the website that contains the table that should be #' downloaded. #' @param max_tries a numeric value that specifies the number of times the function tries to download -#' the data in case an error occurs. +#' the data in case an error occurs. By default does not try again if database times out see `try_if_timout`. +#' @param try_if_timeout a logical value specifying if the function should try again in case of a timeout. +#' The default is `FALSE`, which prevents queries from taking very long even though the database is not +#' responding at the time. In some cases it makes sense to set this to `TRUE`, increase the number +#' of tries and reduce the timeout time. #' @param silent a logical value that specifies if individual messages are printed after each try #' that failed. #' @param type a character value that specifies the type of data at the target URL. Options are @@ -25,7 +29,7 @@ #' #' @return A data frame that contains the table from the url. try_query <- - function(url, max_tries = 5, silent = TRUE, type = "text/tab-separated-values", timeout = 60, accept = NULL, ...) { + function(url, max_tries = 5, try_if_timeout = FALSE, silent = TRUE, type = "text/tab-separated-values", timeout = 60, accept = NULL, ...) { # Check if there is an internet connection first if (!curl::has_internet()) { if (!silent) message("No internet connection.") @@ -34,12 +38,18 @@ try_query <- query_result <- "empty" try_n <- 0 + # Note: The handling of retries in case of a timeout could be adjusted in the future. + # For now I introduced try_if_timeout, but potentially it makes sense to always retry even + # in case of a timeout. Then however, the number of retries needs to be adjusted for some + # functions that retrieve a lot of data. 
while (!is(query_result, "response") & try_n < max_tries & + # this ifelse stops requery if they timeout except for if try_if_timeout is TRUE !ifelse(is(query_result, "character"), - stringr::str_detect(query_result, pattern = "Timeout was reached"), + stringr::str_detect(query_result, pattern = "Timeout was reached") & !try_if_timeout, FALSE - )) { # this ifelse stops requery if the timeout is too low. + ) + ) { if (!missing(accept)) { # with accept set query_result <- tryCatch(httr::GET(url, httr::accept(accept), httr::timeout(timeout)), diff --git a/man/fetch_uniprot.Rd b/man/fetch_uniprot.Rd index 6f38424b..ef789a97 100644 --- a/man/fetch_uniprot.Rd +++ b/man/fetch_uniprot.Rd @@ -10,6 +10,8 @@ fetch_uniprot( "xref_string", "go_f", "go_p", "go_c", "cc_interaction", "ft_act_site", "ft_binding", "cc_cofactor", "cc_catalytic_activity", "xref_pdb"), batchsize = 200, + max_tries = 5, + timeout = 60, show_progress = TRUE ) } diff --git a/man/try_query.Rd b/man/try_query.Rd index 3f320514..20af7061 100644 --- a/man/try_query.Rd +++ b/man/try_query.Rd @@ -19,7 +19,9 @@ try_query( downloaded.} \item{max_tries}{a numeric value that specifies the number of times the function tries to download -the data in case an error occurs.} +the data in case an error occurs. If a timeout occurs the function will not try again. This prevents +queries from taking very long even though the database is slow or not responding at the time. In this +case it is better to increase the timeout time.} \item{silent}{a logical value that specifies if individual messages are printed after each try that failed.} From f2ade7b0a93aa5b7d74bb05180bcd554c0768778 Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 22 May 2024 19:13:01 +0000 Subject: [PATCH 13/35] Style code (GHA) --- R/fetch_uniprot.R | 22 ++++++++++++---------- R/try_query.R | 2 +- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/R/fetch_uniprot.R b/R/fetch_uniprot.R index 6cdc6bea..02f764e1 100644 --- a/R/fetch_uniprot.R +++ b/R/fetch_uniprot.R @@ -128,11 +128,12 @@ They were fetched and the original input ID can be found in the "input_id" colum )) query <- try_query(query_url, - max_tries = max_tries, - try_if_timeout = TRUE, - timeout = timeout, - progress = FALSE, - show_col_types = FALSE) + max_tries = max_tries, + try_if_timeout = TRUE, + timeout = timeout, + progress = FALSE, + show_col_types = FALSE + ) if (show_progress == TRUE) { pb$tick() @@ -208,11 +209,12 @@ They were fetched and the original input ID can be found in the "input_id" colum )) new_result <- try_query(new_query_url, - max_tries = max_tries, - try_if_timeout = TRUE, - timeout = timeout, - progress = FALSE, - show_col_types = FALSE) + max_tries = max_tries, + try_if_timeout = TRUE, + timeout = timeout, + progress = FALSE, + show_col_types = FALSE + ) # If a problem occurs at this step NULL is returned. 
if (!methods::is(new_result, "data.frame")) { message(new_result) diff --git a/R/try_query.R b/R/try_query.R index 0167fbe3..9cb586f2 100644 --- a/R/try_query.R +++ b/R/try_query.R @@ -49,7 +49,7 @@ try_query <- stringr::str_detect(query_result, pattern = "Timeout was reached") & !try_if_timeout, FALSE ) - ) { + ) { if (!missing(accept)) { # with accept set query_result <- tryCatch(httr::GET(url, httr::accept(accept), httr::timeout(timeout)), From 2fbecb66ba5775f9f8507c7511cfc5c784553e38 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 23 May 2024 15:30:02 +0200 Subject: [PATCH 14/35] Add arguments to fetch functions --- R/fetch_alphafold_aligned_error.R | 23 ++++++++++++++----- R/fetch_alphafold_prediction.R | 13 +++++++++++ R/fetch_mobidb.R | 13 +++++++++-- R/fetch_quickgo.R | 33 ++++++++++++++++++++++++---- R/fetch_uniprot.R | 10 ++++++--- R/fetch_uniprot_proteome.R | 19 ++++++++++++++-- R/try_query.R | 25 ++++++++------------- man/fetch_alphafold_aligned_error.Rd | 10 ++++++--- man/fetch_alphafold_prediction.Rd | 4 ++++ man/fetch_mobidb.Rd | 14 +++++++++++- man/fetch_quickgo.Rd | 8 +++++++ man/fetch_uniprot.Rd | 9 ++++++-- man/fetch_uniprot_proteome.Rd | 14 +++++++++++- man/try_query.Rd | 4 +--- 14 files changed, 157 insertions(+), 42 deletions(-) diff --git a/R/fetch_alphafold_aligned_error.R b/R/fetch_alphafold_aligned_error.R index d370b77a..4d77e872 100644 --- a/R/fetch_alphafold_aligned_error.R +++ b/R/fetch_alphafold_aligned_error.R @@ -9,8 +9,10 @@ #' should be fetched. #' @param error_cutoff a numeric value specifying the maximum position error (in Angstroms) that should be retained. #' setting this value to a low number reduces the size of the retrieved data. Default is 20. -#' @param timeout a numeric value specifying the time in seconds until the download of an organism -#' archive times out. The default is 3600 seconds. +#' @param timeout a numeric value specifying the time in seconds until the download times out. +#' The default is 30 seconds. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. The default is 1. #' @param return_data_frame a logical value; if `TRUE` a data frame instead of a list #' is returned. It is recommended to only use this if information for few proteins is retrieved. #' Default is `FALSE`. 
@@ -49,7 +51,8 @@ #' } fetch_alphafold_aligned_error <- function(uniprot_ids = NULL, error_cutoff = 20, - timeout = 3600, + timeout = 30, + max_tries = 1, return_data_frame = FALSE, show_progress = TRUE) { if (!curl::has_internet()) { @@ -62,7 +65,7 @@ fetch_alphafold_aligned_error <- function(uniprot_ids = NULL, batches <- purrr::map( .x = uniprot_ids, - .f = ~ paste0("https://alphafold.ebi.ac.uk/files/AF-", .x, "-F1-predicted_aligned_error_v3.json") + .f = ~ paste0("https://alphafold.ebi.ac.uk/files/AF-", .x, "-F1-predicted_aligned_error_v4.json") ) names(batches) <- uniprot_ids @@ -79,13 +82,19 @@ fetch_alphafold_aligned_error <- function(uniprot_ids = NULL, .f = ~ { # query information from database query <- try_query(.x, - type = "application/json" + type = "application/json", + timeout = timeout, + max_tries = max_tries ) if (show_progress == TRUE) { pb$tick() } + if (methods::is(query, "character")) { + return(invisible(query)) + } + initial_result <- query[[1]][["predicted_aligned_error"]] final_result <- purrr::imap_dfr( @@ -114,6 +123,10 @@ fetch_alphafold_aligned_error <- function(uniprot_ids = NULL, ) %>% dplyr::distinct() + if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))){ + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } + message("The following IDs have not been retrieved correctly:") message(paste0(utils::capture.output(error_table), collapse = "\n")) } diff --git a/R/fetch_alphafold_prediction.R b/R/fetch_alphafold_prediction.R index 7b435258..1c9bd1d7 100644 --- a/R/fetch_alphafold_prediction.R +++ b/R/fetch_alphafold_prediction.R @@ -16,6 +16,8 @@ #' Available version can be found here: https://ftp.ebi.ac.uk/pub/databases/alphafold/ #' @param timeout a numeric value specifying the time in seconds until the download of an organism #' archive times out. The default is 3600 seconds. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. The default is 1. #' @param return_data_frame a logical value that specifies if true, a data frame instead of a list #' is returned. It is recommended to only use this if information for few proteins is retrieved. #' Default is FALSE. @@ -74,6 +76,7 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, organism_name = NULL, version = "v4", timeout = 3600, + max_tries = 1, return_data_frame = FALSE, show_progress = TRUE) { if (!curl::has_internet()) { @@ -205,6 +208,10 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, old <- options(timeout = timeout) on.exit(options(old)) + # This does not fail gracefully. I could not figure out how to + # catch the error and the two warnings in case of a timeout. + # The error would not be caught while only the first warning that does not + # contain the timeout information was caught. utils::download.file(url, destfile = paste0(tempdir(), "/alphafold.tar")) utils::untar( @@ -338,6 +345,8 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, # query information from database query <- try_query(.x, type = "text/tab-separated-values", + timeout = timeout, + max_tries = max_tries, col_names = FALSE, quote = "", show_col_types = FALSE, @@ -430,6 +439,10 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, ) %>% dplyr::distinct() + if (any(stringr::str_detect(unique(error_table$error), pattern = "Timeout"))){ + message('The retrieval of data timed out. Consider increasing the "timeout" or "max_tries" argument. 
\n') + } + message("The following IDs have not been retrieved correctly.") message(paste0(utils::capture.output(error_table), collapse = "\n")) } diff --git a/R/fetch_mobidb.R b/R/fetch_mobidb.R index 9e22712a..3fc5e941 100644 --- a/R/fetch_mobidb.R +++ b/R/fetch_mobidb.R @@ -9,6 +9,10 @@ #' argument is mutually exclusive to the \code{uniprot_ids} argument. #' @param show_progress a logical value; if `TRUE` a progress bar will be shown. #' Default is `TRUE`. +#' @param timeout a numeric value specifying the time in seconds until the download of an organism +#' archive times out. The default is 60 seconds. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. The default is 2. #' #' @return A data frame that contains start and end positions for disordered and flexible protein #' regions. The \code{feature} column contains information on the source of this @@ -30,7 +34,7 @@ #' uniprot_ids = c("P0A799", "P62707") #' ) #' } -fetch_mobidb <- function(uniprot_ids = NULL, organism_id = NULL, show_progress = TRUE) { +fetch_mobidb <- function(uniprot_ids = NULL, organism_id = NULL, show_progress = TRUE, timeout = 60, max_tries = 2) { if (!curl::has_internet()) { message("No internet connection.") return(invisible(NULL)) @@ -113,7 +117,8 @@ to uniprot standards and were skipped from fetching: ", # query information from database query <- try_query( url = .x, - timeout = 60 + timeout = timeout, + max_tries = max_tries ) if (show_progress == TRUE) { @@ -135,6 +140,10 @@ to uniprot standards and were skipped from fetching: ", ) %>% dplyr::distinct() + if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))){ + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } + message("The following IDs have not been retrieved correctly.") message(paste0(utils::capture.output(error_table), collapse = "\n")) } diff --git a/R/fetch_quickgo.R b/R/fetch_quickgo.R index 23a5c9e5..42c548e0 100644 --- a/R/fetch_quickgo.R +++ b/R/fetch_quickgo.R @@ -21,6 +21,10 @@ #' a slim go set should be generated. This argument should only be provided if slims are retrieved. #' @param relations_slims an optional character vector that specifies the relations of GO IDs that should be #' considered for the generation of the slim dataset. This argument should only be provided if slims are retrieved. +#' @param timeout a numeric value specifying the time in seconds until the download times out. +#' The default is 1200 seconds. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. The default is 2. #' @param show_progress a logical value that indicates if a progress bar will be shown. #' Default is TRUE. 
#' @@ -67,6 +71,8 @@ fetch_quickgo <- function(type = "annotations", ontology_annotations = "all", go_id_slims = NULL, relations_slims = c("is_a", "part_of", "regulates", "occurs_in"), + timeout = 1200, + max_tries = 2, show_progress = TRUE) { type <- match.arg(type, c("annotations", "terms", "slims")) @@ -119,7 +125,7 @@ fetch_quickgo <- function(type = "annotations", start_time <- Sys.time() } - query_result <- try_query(url, timeout = 1200, accept = "text/tsv") + query_result <- try_query(url, timeout = timeout, max_tries = max_tries, accept = "text/tsv") if (show_progress == TRUE) { message("DONE", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)")) @@ -127,6 +133,9 @@ fetch_quickgo <- function(type = "annotations", if (methods::is(query_result, "character")) { message(query_result) + if (stringr::str_detect(query_result, pattern = "Timeout")){ + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } return(invisible(NULL)) } @@ -167,11 +176,16 @@ fetch_quickgo <- function(type = "annotations", url <- paste0(base_url, page) test_query <- try_query(url, - type = "application/json" + type = "application/json", + timeout = timeout, + max_tries = max_tries ) if (methods::is(test_query, "character")) { - message(test_query) + message(test_query, "\n") + if (stringr::str_detect(test_query, pattern = "Timeout")){ + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } return(invisible(NULL)) } @@ -195,6 +209,8 @@ fetch_quickgo <- function(type = "annotations", ) query <- try_query(query_url, type = "application/json", + timeout = timeout, + max_tries = max_tries, simplifyDataFrame = TRUE ) @@ -213,6 +229,11 @@ fetch_quickgo <- function(type = "annotations", if (!is.null(error_vector)) { message(paste(unique(error_vector), collapse = ", ")) + + if (any(stringr::str_detect(unique(error_vector), pattern = "Timeout"))){ + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } + return(invisible(NULL)) } else { query_result <- query_result_list %>% @@ -321,12 +342,16 @@ fetch_quickgo <- function(type = "annotations", query_result <- try_query(url, type = "application/json", - timeout = 1200, + timeout = timeout, + max_tries = max_tries, simplifyDataFrame = TRUE ) if (methods::is(query_result, "character")) { message(query_result) + if (stringr::str_detect(query_result, pattern = "Timeout")){ + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } return(invisible(NULL)) } else { query_result <- query_result[["results"]] %>% diff --git a/R/fetch_uniprot.R b/R/fetch_uniprot.R index 02f764e1..8dd03b03 100644 --- a/R/fetch_uniprot.R +++ b/R/fetch_uniprot.R @@ -8,6 +8,9 @@ #' cross-referenced database provide the database name with the prefix "xref_", e.g. `"xref_pdb"`) #' @param batchsize a numeric value that specifies the number of proteins processed in a single #' single query. Default and max value is 200. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. +#' @param timeout a numeric value that specifies the maximum request time per try. Default is 20 seconds. #' @param show_progress a logical value that determines if a progress bar will be shown. Default #' is TRUE. 
#'
@@ -54,7 +57,7 @@ fetch_uniprot <-
            ),
            batchsize = 200,
            max_tries = 10,
-           timeout = 10,
+           timeout = 20,
            show_progress = TRUE) {
     if (!curl::has_internet()) {
       message("No internet connection.")
       return(invisible(NULL))
@@ -129,7 +132,6 @@ They were fetched and the original input ID can be found in the "input_id" colum
       query <- try_query(query_url,
         max_tries = max_tries,
-        try_if_timeout = TRUE,
         timeout = timeout,
         progress = FALSE,
         show_col_types = FALSE
@@ -155,6 +157,9 @@ They were fetched and the original input ID can be found in the "input_id" colum
 
     message("The following IDs have not been retrieved correctly.")
     message(paste0(utils::capture.output(error_table), collapse = "\n"))
+    if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))){
+      message('Consider increasing the "timeout" or "max_tries" argument. \n')
+    }
   }
 
   # only keep data in output and transform to data.frame
@@ -210,7 +215,6 @@ They were fetched and the original input ID can be found in the "input_id" colum
       new_result <- try_query(new_query_url,
         max_tries = max_tries,
-        try_if_timeout = TRUE,
         timeout = timeout,
         progress = FALSE,
         show_col_types = FALSE
diff --git a/R/fetch_uniprot_proteome.R b/R/fetch_uniprot_proteome.R
index fea8eeb6..c30beef1 100644
--- a/R/fetch_uniprot_proteome.R
+++ b/R/fetch_uniprot_proteome.R
@@ -11,6 +11,10 @@
 #' able to efficiently retrieve the information. If more information is needed, \code{fetch_uniprot()}
 #' can be used with the IDs retrieved by this function.
 #' @param reviewed a logical value that determines if only reviewed protein entries will be retrieved.
+#' @param timeout a numeric value specifying the time in seconds until the download times out.
+#' The default is 120 seconds.
+#' @param max_tries a numeric value that specifies the number of times the function tries to download
+#' the data in case an error occurs. The default is 2.
 #'
 #' @return A data frame that contains all protein metadata specified in \code{columns} for the
 #' organism of choice.
@@ -24,7 +28,15 @@
 fetch_uniprot_proteome <-
   function(organism_id,
            columns = c("accession"),
-           reviewed = TRUE) {
+           reviewed = TRUE,
+           timeout = 120,
+           max_tries = 2) {
+
+    if (!curl::has_internet()) {
+      message("No internet connection.")
+      return(invisible(NULL))
+    }
+
     if (length(organism_id) == 0) {
       stop("No valid organism ID found.")
     }
@@ -47,9 +59,12 @@ fetch_uniprot_proteome <-
       "&format=tsv&fields=",
       collapsed_columns
     ))
-    result <- try_query(query_url, progress = FALSE, show_col_types = FALSE)
+    result <- try_query(query_url, timeout = timeout, max_tries = max_tries, progress = FALSE, show_col_types = FALSE)
     # result can either be a data.frame or it is a character string with the error message
     if (!methods::is(result, "data.frame")) {
+      if (stringr::str_detect(result, pattern = "Timeout")){
+        message('The data retrieval timed out. Consider increasing the "timeout" or "max_tries" argument. \n')
+      }
       return(invisible(result))
     }
     colnames(result) <- column_names
diff --git a/R/try_query.R b/R/try_query.R
index 9cb586f2..6fc7b505 100644
--- a/R/try_query.R
+++ b/R/try_query.R
@@ -7,11 +7,7 @@
 #' @param url a character value of an URL to the website that contains the table that should be
 #' downloaded.
 #' @param max_tries a numeric value that specifies the number of times the function tries to download
-#' the data in case an error occurs. By default does not try again if database times out see `try_if_timout`.
-#' @param try_if_timeout a logical value specifying if the function should try again in case of a timeout. 
-#' The default is `FALSE`, which prevents queries from taking very long even though the database is not -#' responding at the time. In some cases it makes sense to set this to `TRUE`, increase the number -#' of tries and reduce the timeout time. +#' the data in case an error occurs. Default is 5. #' @param silent a logical value that specifies if individual messages are printed after each try #' that failed. #' @param type a character value that specifies the type of data at the target URL. Options are @@ -29,7 +25,13 @@ #' #' @return A data frame that contains the table from the url. try_query <- - function(url, max_tries = 5, try_if_timeout = FALSE, silent = TRUE, type = "text/tab-separated-values", timeout = 60, accept = NULL, ...) { + function(url, max_tries = 5, silent = TRUE, type = "text/tab-separated-values", timeout = 60, accept = NULL, ...) { + + # Check timeout time + if (timeout < 1){ + stop("The timeout cannot be less than 1 second.") + } + # Check if there is an internet connection first if (!curl::has_internet()) { if (!silent) message("No internet connection.") @@ -38,17 +40,8 @@ try_query <- query_result <- "empty" try_n <- 0 - # Note: The handling of retries in case of a timeout could be adjusted in the future. - # For now I introduced try_if_timeout, but potentially it makes sense to always retry even - # in case of a timeout. Then however, the number of retries needs to be adjusted for some - # functions that retrieve a lot of data. while (!is(query_result, "response") & - try_n < max_tries & - # this ifelse stops requery if they timeout except for if try_if_timeout is TRUE - !ifelse(is(query_result, "character"), - stringr::str_detect(query_result, pattern = "Timeout was reached") & !try_if_timeout, - FALSE - ) + try_n < max_tries ) { if (!missing(accept)) { # with accept set diff --git a/man/fetch_alphafold_aligned_error.Rd b/man/fetch_alphafold_aligned_error.Rd index 31791692..2629fdfb 100644 --- a/man/fetch_alphafold_aligned_error.Rd +++ b/man/fetch_alphafold_aligned_error.Rd @@ -7,7 +7,8 @@ fetch_alphafold_aligned_error( uniprot_ids = NULL, error_cutoff = 20, - timeout = 3600, + timeout = 30, + max_tries = 1, return_data_frame = FALSE, show_progress = TRUE ) @@ -19,8 +20,11 @@ should be fetched.} \item{error_cutoff}{a numeric value specifying the maximum position error (in Angstroms) that should be retained. setting this value to a low number reduces the size of the retrieved data. Default is 20.} -\item{timeout}{a numeric value specifying the time in seconds until the download of an organism -archive times out. The default is 3600 seconds.} +\item{timeout}{a numeric value specifying the time in seconds until the download times out. +The default is 30 seconds.} + +\item{max_tries}{a numeric value that specifies the number of times the function tries to download +the data in case an error occurs. The default is 1.} \item{return_data_frame}{a logical value; if \code{TRUE} a data frame instead of a list is returned. It is recommended to only use this if information for few proteins is retrieved. 
diff --git a/man/fetch_alphafold_prediction.Rd b/man/fetch_alphafold_prediction.Rd index 99b2141a..dec68285 100644 --- a/man/fetch_alphafold_prediction.Rd +++ b/man/fetch_alphafold_prediction.Rd @@ -9,6 +9,7 @@ fetch_alphafold_prediction( organism_name = NULL, version = "v4", timeout = 3600, + max_tries = 1, return_data_frame = FALSE, show_progress = TRUE ) @@ -31,6 +32,9 @@ Available version can be found here: https://ftp.ebi.ac.uk/pub/databases/alphafo \item{timeout}{a numeric value specifying the time in seconds until the download of an organism archive times out. The default is 3600 seconds.} +\item{max_tries}{a numeric value that specifies the number of times the function tries to download +the data in case an error occurs. The default is 1.} + \item{return_data_frame}{a logical value that specifies if true, a data frame instead of a list is returned. It is recommended to only use this if information for few proteins is retrieved. Default is FALSE.} diff --git a/man/fetch_mobidb.Rd b/man/fetch_mobidb.Rd index ff59791e..eaa0ff10 100644 --- a/man/fetch_mobidb.Rd +++ b/man/fetch_mobidb.Rd @@ -4,7 +4,13 @@ \alias{fetch_mobidb} \title{Fetch protein disorder and mobility information from MobiDB} \usage{ -fetch_mobidb(uniprot_ids = NULL, organism_id = NULL, show_progress = TRUE) +fetch_mobidb( + uniprot_ids = NULL, + organism_id = NULL, + show_progress = TRUE, + timeout = 60, + max_tries = 2 +) } \arguments{ \item{uniprot_ids}{optional, a character vector of UniProt identifiers for which information @@ -16,6 +22,12 @@ argument is mutually exclusive to the \code{uniprot_ids} argument.} \item{show_progress}{a logical value; if \code{TRUE} a progress bar will be shown. Default is \code{TRUE}.} + +\item{timeout}{a numeric value specifying the time in seconds until the download of an organism +archive times out. The default is 60 seconds.} + +\item{max_tries}{a numeric value that specifies the number of times the function tries to download +the data in case an error occurs. The default is 2.} } \value{ A data frame that contains start and end positions for disordered and flexible protein diff --git a/man/fetch_quickgo.Rd b/man/fetch_quickgo.Rd index 0f0da3ac..fed8c218 100644 --- a/man/fetch_quickgo.Rd +++ b/man/fetch_quickgo.Rd @@ -11,6 +11,8 @@ fetch_quickgo( ontology_annotations = "all", go_id_slims = NULL, relations_slims = c("is_a", "part_of", "regulates", "occurs_in"), + timeout = 1200, + max_tries = 2, show_progress = TRUE ) } @@ -36,6 +38,12 @@ a slim go set should be generated. This argument should only be provided if slim \item{relations_slims}{an optional character vector that specifies the relations of GO IDs that should be considered for the generation of the slim dataset. This argument should only be provided if slims are retrieved.} +\item{timeout}{a numeric value specifying the time in seconds until the download times out. +The default is 1200 seconds.} + +\item{max_tries}{a numeric value that specifies the number of times the function tries to download +the data in case an error occurs. The default is 2.} + \item{show_progress}{a logical value that indicates if a progress bar will be shown. 
Default is TRUE.}
}
diff --git a/man/fetch_uniprot.Rd b/man/fetch_uniprot.Rd
index ef789a97..0afed05f 100644
--- a/man/fetch_uniprot.Rd
+++ b/man/fetch_uniprot.Rd
@@ -10,8 +10,8 @@ fetch_uniprot(
     "xref_string", "go_f", "go_p", "go_c", "cc_interaction", "ft_act_site",
     "ft_binding", "cc_cofactor", "cc_catalytic_activity", "xref_pdb"),
   batchsize = 200,
-  max_tries = 5,
-  timeout = 60,
+  max_tries = 10,
+  timeout = 20,
   show_progress = TRUE
 )
}
@@ -25,6 +25,11 @@ cross-referenced database provide the database name with the prefix "xref_", e.g
\item{batchsize}{a numeric value that specifies the number of proteins processed in a single
single query. Default and max value is 200.}

+\item{max_tries}{a numeric value that specifies the number of times the function tries to download
+the data in case an error occurs.}
+
+\item{timeout}{a numeric value that specifies the maximum request time per try. Default is 20 seconds.}
+
\item{show_progress}{a logical value that determines if a progress bar will be shown. Default
is TRUE.}
}
diff --git a/man/fetch_uniprot_proteome.Rd b/man/fetch_uniprot_proteome.Rd
index 4b93b769..ab06d6d0 100644
--- a/man/fetch_uniprot_proteome.Rd
+++ b/man/fetch_uniprot_proteome.Rd
@@ -4,7 +4,13 @@
\alias{fetch_uniprot_proteome}
\title{Fetch proteome data from UniProt}
\usage{
-fetch_uniprot_proteome(organism_id, columns = c("accession"), reviewed = TRUE)
+fetch_uniprot_proteome(
+  organism_id,
+  columns = c("accession"),
+  reviewed = TRUE,
+  timeout = 120,
+  max_tries = 2
+)
}
\arguments{
\item{organism_id}{a numeric value that specifies the NCBI taxonomy identifier (TaxId) for an
@@ -18,6 +24,12 @@ able to efficiently retrieve the information. If more information is needed, \co
can be used with the IDs retrieved by this function.}

\item{reviewed}{a logical value that determines if only reviewed protein entries will be retrieved.}
+
+\item{timeout}{a numeric value specifying the time in seconds until the download times out.
+The default is 120 seconds.}
+
+\item{max_tries}{a numeric value that specifies the number of times the function tries to download
+the data in case an error occurs. The default is 2.}
}
\value{
A data frame that contains all protein metadata specified in \code{columns} for the
diff --git a/man/try_query.Rd b/man/try_query.Rd
index 3f320514..20af7061 100644
--- a/man/try_query.Rd
+++ b/man/try_query.Rd
@@ -19,9 +19,7 @@ try_query(
downloaded.}

\item{max_tries}{a numeric value that specifies the number of times the function tries to download
-the data in case an error occurs. If a timeout occurs the function will not try again. This prevents
-queries from taking very long even though the database is slow or not responding at the time. In this
-case it is better to increase the timeout time.}
+the data in case an error occurs. 
Default is 5.} \item{silent}{a logical value that specifies if individual messages are printed after each try that failed.} From 083cef503c157f4b23069351e98ae9578d3eb8b1 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 23 May 2024 13:33:13 +0000 Subject: [PATCH 15/35] Style code (GHA) --- R/fetch_alphafold_aligned_error.R | 2 +- R/fetch_alphafold_prediction.R | 2 +- R/fetch_mobidb.R | 2 +- R/fetch_quickgo.R | 8 ++++---- R/fetch_uniprot.R | 2 +- R/fetch_uniprot_proteome.R | 3 +-- R/try_query.R | 3 +-- 7 files changed, 10 insertions(+), 12 deletions(-) diff --git a/R/fetch_alphafold_aligned_error.R b/R/fetch_alphafold_aligned_error.R index 4d77e872..100d5e98 100644 --- a/R/fetch_alphafold_aligned_error.R +++ b/R/fetch_alphafold_aligned_error.R @@ -123,7 +123,7 @@ fetch_alphafold_aligned_error <- function(uniprot_ids = NULL, ) %>% dplyr::distinct() - if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))){ + if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))) { message('Consider increasing the "timeout" or "max_tries" argument. \n') } diff --git a/R/fetch_alphafold_prediction.R b/R/fetch_alphafold_prediction.R index 1c9bd1d7..98a16196 100644 --- a/R/fetch_alphafold_prediction.R +++ b/R/fetch_alphafold_prediction.R @@ -439,7 +439,7 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, ) %>% dplyr::distinct() - if (any(stringr::str_detect(unique(error_table$error), pattern = "Timeout"))){ + if (any(stringr::str_detect(unique(error_table$error), pattern = "Timeout"))) { message('The retrieval of data timed out. Consider increasing the "timeout" or "max_tries" argument. \n') } diff --git a/R/fetch_mobidb.R b/R/fetch_mobidb.R index 3fc5e941..7d6f1014 100644 --- a/R/fetch_mobidb.R +++ b/R/fetch_mobidb.R @@ -140,7 +140,7 @@ to uniprot standards and were skipped from fetching: ", ) %>% dplyr::distinct() - if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))){ + if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))) { message('Consider increasing the "timeout" or "max_tries" argument. \n') } diff --git a/R/fetch_quickgo.R b/R/fetch_quickgo.R index 42c548e0..302f42b4 100644 --- a/R/fetch_quickgo.R +++ b/R/fetch_quickgo.R @@ -133,7 +133,7 @@ fetch_quickgo <- function(type = "annotations", if (methods::is(query_result, "character")) { message(query_result) - if (stringr::str_detect(query_result, pattern = "Timeout")){ + if (stringr::str_detect(query_result, pattern = "Timeout")) { message('Consider increasing the "timeout" or "max_tries" argument. \n') } return(invisible(NULL)) @@ -183,7 +183,7 @@ fetch_quickgo <- function(type = "annotations", if (methods::is(test_query, "character")) { message(test_query, "\n") - if (stringr::str_detect(test_query, pattern = "Timeout")){ + if (stringr::str_detect(test_query, pattern = "Timeout")) { message('Consider increasing the "timeout" or "max_tries" argument. \n') } return(invisible(NULL)) @@ -230,7 +230,7 @@ fetch_quickgo <- function(type = "annotations", if (!is.null(error_vector)) { message(paste(unique(error_vector), collapse = ", ")) - if (any(stringr::str_detect(unique(error_vector), pattern = "Timeout"))){ + if (any(stringr::str_detect(unique(error_vector), pattern = "Timeout"))) { message('Consider increasing the "timeout" or "max_tries" argument. 
\n') } @@ -349,7 +349,7 @@ fetch_quickgo <- function(type = "annotations", if (methods::is(query_result, "character")) { message(query_result) - if (stringr::str_detect(query_result, pattern = "Timeout")){ + if (stringr::str_detect(query_result, pattern = "Timeout")) { message('Consider increasing the "timeout" or "max_tries" argument. \n') } return(invisible(NULL)) diff --git a/R/fetch_uniprot.R b/R/fetch_uniprot.R index 8dd03b03..1eb18c7b 100644 --- a/R/fetch_uniprot.R +++ b/R/fetch_uniprot.R @@ -157,7 +157,7 @@ They were fetched and the original input ID can be found in the "input_id" colum message("The following IDs have not been retrieved correctly.") message(paste0(utils::capture.output(error_table), collapse = "\n")) - if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))){ + if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))) { message('Consider increasing the "timeout" or "max_tries" argument. \n') } } diff --git a/R/fetch_uniprot_proteome.R b/R/fetch_uniprot_proteome.R index c30beef1..72355489 100644 --- a/R/fetch_uniprot_proteome.R +++ b/R/fetch_uniprot_proteome.R @@ -31,7 +31,6 @@ fetch_uniprot_proteome <- reviewed = TRUE, timeout = 120, max_tries = 2) { - if (!curl::has_internet()) { message("No internet connection.") return(invisible(NULL)) @@ -62,7 +61,7 @@ fetch_uniprot_proteome <- result <- try_query(query_url, timeout = timeout, max_tries = max_tries, progress = FALSE, show_col_types = FALSE) # result can either be a data.frame or it is a character string with the error message if (!methods::is(result, "data.frame")) { - if (stringr::str_detect(result, pattern = "Timeout")){ + if (stringr::str_detect(result, pattern = "Timeout")) { message('The data retrieval timed out. Consider increasing the "timeout" or "max_tries" argument. \n') } return(invisible(result)) diff --git a/R/try_query.R b/R/try_query.R index 6fc7b505..d1c9f952 100644 --- a/R/try_query.R +++ b/R/try_query.R @@ -26,9 +26,8 @@ #' @return A data frame that contains the table from the url. try_query <- function(url, max_tries = 5, silent = TRUE, type = "text/tab-separated-values", timeout = 60, accept = NULL, ...) { - # Check timeout time - if (timeout < 1){ + if (timeout < 1) { stop("The timeout cannot be less than 1 second.") } From 438d23fcbe600211993c3d75de6f383c289b9926 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Thu, 6 Jun 2024 13:21:18 +0200 Subject: [PATCH 16/35] #256 include min_n_peptides --- NEWS.md | 3 +++ R/calculate_protein_abundance.R | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 1cf61eac..ab4210f8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # protti 0.8.9000 +## New features +* Enhanced Flexibility in Protein Quantification: Introduced the `min_n_peptides` parameter to the `calculate_protein_abundance()` function. This allows users to specify the minimum number of peptides per protein needed for analysis. Default is set at three peptides. + ## Bug fixes * `fetch_uniprot()` previously had an issue where it incorrectly identified certain IDs as UniProt IDs, such as ENSEMBL IDs. For example, it would incorrectly interpret `"CON_ENSEMBL:ENSBTAP00000037665"` as `"P00000"`. To address this, the function now requires that UniProt IDs are not preceded or followed by letters or digits. This means that UniProt IDs should be recognized only if they stand alone or are separated by non-alphanumeric characters. 
For instance, in the string `"P02545;P20700"`, both `"P02545"` and `"P20700"` are correctly identified as UniProt IDs because they are separated by a semicolon and not attached to any other letters or digits. diff --git a/R/calculate_protein_abundance.R b/R/calculate_protein_abundance.R index d85f3a75..5fb71220 100644 --- a/R/calculate_protein_abundance.R +++ b/R/calculate_protein_abundance.R @@ -14,6 +14,9 @@ #' to more than three precursors. The quantification is done on the precursor level. #' @param intensity_log2 a numeric column in the \code{data} data frame that contains log2 #' transformed precursor intensities. +#' @param min_n_peptides An integer specifying the minimum number of peptides required +#' for a protein to be included in the analysis. The default value is 3, which means +#' proteins with fewer than three unique peptides will be excluded from the analysis. #' @param method a character value specifying with which method protein quantities should be #' calculated. Possible options include \code{"sum"}, which takes the sum of all precursor #' intensities as the protein abundance. Another option is \code{"iq"}, which performs protein @@ -109,6 +112,7 @@ calculate_protein_abundance <- function(data, precursor, peptide, intensity_log2, + min_n_peptides = 3, method = "sum", for_plot = FALSE, retain_columns = NULL) { @@ -127,7 +131,7 @@ calculate_protein_abundance <- function(data, tidyr::drop_na() %>% dplyr::group_by({{ protein_id }}, {{ sample }}) %>% dplyr::mutate(n_peptides = dplyr::n_distinct(!!rlang::ensym(peptide))) %>% - dplyr::filter(.data$n_peptides > 2) %>% + dplyr::filter(.data$n_peptides >= min_n_peptides) %>% dplyr::select(-"n_peptides") %>% dplyr::ungroup() From 5fb21960a3a73d1a3d9fd57621265da2ed68c7ab Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 14 Jun 2024 11:47:16 +0200 Subject: [PATCH 17/35] Update try_query --- R/try_query.R | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/R/try_query.R b/R/try_query.R index d1c9f952..8016a00b 100644 --- a/R/try_query.R +++ b/R/try_query.R @@ -33,12 +33,13 @@ try_query <- # Check if there is an internet connection first if (!curl::has_internet()) { - if (!silent) message("No internet connection.") + if (!silent) message("\nNo internet connection.") return(invisible("No internet connection")) } query_result <- "empty" try_n <- 0 + retry_delay <- 3 # seconds while (!is(query_result, "response") & try_n < max_tries ) { @@ -55,26 +56,27 @@ try_query <- warning = function(w) conditionMessage(w) ) } - try_n <- try_n + 1 - if (!silent & !is(query_result, "response")) { - message(paste0(try_n, ". try failed!")) - } - if (!is(query_result, "response")) { - Sys.sleep(3) + + if (inherits(query_result, "response")) { + break + } else if (stringr::str_detect(query_result, "Timeout was reached")) { + if (!silent) message(paste0("\nAttempt ", try_n + 1, "/", max_tries, " failed: Timeout was reached. Retrying...")) + } else { + if (!silent) message(paste0("\nAttempt ", try_n + 1, "/", max_tries, " failed: ", query_result, ". 
Retrying...")) } + + try_n <- try_n + 1 + + Sys.sleep(retry_delay) + retry_delay <- retry_delay * 2 # Exponential backoff } # Check again if there is an internet connection, if not then the correct error is returned if (!curl::has_internet()) { - if (!silent) message("No internet connection.") + if (!silent) message("\nNo internet connection.") return(invisible("No internet connection")) } - if (!is(query_result, "response")) { - if (!silent) message(query_result) - return(invisible(query_result)) - } - if (httr::http_error(query_result)) { if (!silent) httr::message_for_status(query_result) return(invisible(httr::http_status(query_result)$message)) From 884826eca23de866a4c2188ca0e224faa18ad92b Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 14 Jun 2024 11:47:29 +0200 Subject: [PATCH 18/35] Fix test --- tests/testthat/test-structure_functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-structure_functions.R b/tests/testthat/test-structure_functions.R index 149ac32d..31fdd1c4 100644 --- a/tests/testthat/test-structure_functions.R +++ b/tests/testthat/test-structure_functions.R @@ -21,7 +21,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { expect_is(positions_structure, "data.frame") # test if position structure is in certain range as db can be updated expect_gte(nrow(positions_structure), 569) - expect_lte(nrow(positions_structure), 600) + expect_lte(nrow(positions_structure), 700) expect_equal(ncol(positions_structure), 17) }) From c97f3d8cb214065a0347d77d393bde9bf1bb2046 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 14 Jun 2024 11:47:53 +0200 Subject: [PATCH 19/35] Update fetch functions --- R/fetch_alphafold_prediction.R | 4 ++-- R/fetch_uniprot.R | 2 ++ R/fetch_uniprot_proteome.R | 4 ++-- man/fetch_alphafold_prediction.Rd | 4 ++-- man/fetch_uniprot_proteome.Rd | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/R/fetch_alphafold_prediction.R b/R/fetch_alphafold_prediction.R index 98a16196..8251bd63 100644 --- a/R/fetch_alphafold_prediction.R +++ b/R/fetch_alphafold_prediction.R @@ -17,7 +17,7 @@ #' @param timeout a numeric value specifying the time in seconds until the download of an organism #' archive times out. The default is 3600 seconds. #' @param max_tries a numeric value that specifies the number of times the function tries to download -#' the data in case an error occurs. The default is 1. +#' the data in case an error occurs. The default is 5. This only applies if `uniprot_ids` were provided. #' @param return_data_frame a logical value that specifies if true, a data frame instead of a list #' is returned. It is recommended to only use this if information for few proteins is retrieved. #' Default is FALSE. 
@@ -76,7 +76,7 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, organism_name = NULL, version = "v4", timeout = 3600, - max_tries = 1, + max_tries = 5, return_data_frame = FALSE, show_progress = TRUE) { if (!curl::has_internet()) { diff --git a/R/fetch_uniprot.R b/R/fetch_uniprot.R index 1eb18c7b..85e7d837 100644 --- a/R/fetch_uniprot.R +++ b/R/fetch_uniprot.R @@ -133,6 +133,7 @@ They were fetched and the original input ID can be found in the "input_id" colum query <- try_query(query_url, max_tries = max_tries, timeout = timeout, + silent = FALSE, progress = FALSE, show_col_types = FALSE ) @@ -216,6 +217,7 @@ They were fetched and the original input ID can be found in the "input_id" colum new_result <- try_query(new_query_url, max_tries = max_tries, timeout = timeout, + silent = FALSE, progress = FALSE, show_col_types = FALSE ) diff --git a/R/fetch_uniprot_proteome.R b/R/fetch_uniprot_proteome.R index 72355489..0ca51cd7 100644 --- a/R/fetch_uniprot_proteome.R +++ b/R/fetch_uniprot_proteome.R @@ -30,7 +30,7 @@ fetch_uniprot_proteome <- columns = c("accession"), reviewed = TRUE, timeout = 120, - max_tries = 2) { + max_tries = 5) { if (!curl::has_internet()) { message("No internet connection.") return(invisible(NULL)) @@ -58,7 +58,7 @@ fetch_uniprot_proteome <- "&format=tsv&fields=", collapsed_columns )) - result <- try_query(query_url, timeout = timeout, max_tries = max_tries, progress = FALSE, show_col_types = FALSE) + result <- try_query(query_url, timeout = timeout, max_tries = max_tries, silent = FALSE, progress = FALSE, show_col_types = FALSE) # result can either be a data.frame or it is a character string with the error message if (!methods::is(result, "data.frame")) { if (stringr::str_detect(result, pattern = "Timeout")) { diff --git a/man/fetch_alphafold_prediction.Rd b/man/fetch_alphafold_prediction.Rd index dec68285..20d731ac 100644 --- a/man/fetch_alphafold_prediction.Rd +++ b/man/fetch_alphafold_prediction.Rd @@ -9,7 +9,7 @@ fetch_alphafold_prediction( organism_name = NULL, version = "v4", timeout = 3600, - max_tries = 1, + max_tries = 5, return_data_frame = FALSE, show_progress = TRUE ) @@ -33,7 +33,7 @@ Available version can be found here: https://ftp.ebi.ac.uk/pub/databases/alphafo archive times out. The default is 3600 seconds.} \item{max_tries}{a numeric value that specifies the number of times the function tries to download -the data in case an error occurs. The default is 1.} +the data in case an error occurs. The default is 5. This only applies if \code{uniprot_ids} were provided.} \item{return_data_frame}{a logical value that specifies if true, a data frame instead of a list is returned. It is recommended to only use this if information for few proteins is retrieved. diff --git a/man/fetch_uniprot_proteome.Rd b/man/fetch_uniprot_proteome.Rd index ab06d6d0..1859d1c3 100644 --- a/man/fetch_uniprot_proteome.Rd +++ b/man/fetch_uniprot_proteome.Rd @@ -9,7 +9,7 @@ fetch_uniprot_proteome( columns = c("accession"), reviewed = TRUE, timeout = 120, - max_tries = 2 + max_tries = 5 ) } \arguments{ From 27ba11adf5e4408592d20966b47f2e4a868be7c7 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 14 Jun 2024 11:48:07 +0200 Subject: [PATCH 20/35] Update vignette and NEWS --- NEWS.md | 8 ++++++++ .../data_analysis_dose_response_workflow.Rmd | 15 ++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index e705950c..0c109d60 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,12 +5,20 @@ * `calculate_go_enrichment()` got additional arguments. 
  * `replace_long_name`: a logical argument that specifies if GO term names above 50 characters should be replaced by the GO ID instead for the plot. This ensures that the plotting area doesn't become too small due to the long name. The default is `TRUE`.
  * `label_move_frac`: a numeric argument between 0 and 1 that specifies which labels should be moved outside of the bar. The default is 0.2, which means that the labels of all bars that have a size of 20% or less of the largest bar are moved to the right of the bar. This prevents labels from overlapping with the bar boundaries.
+* `fetch_alphafold_aligned_error()`, `fetch_alphafold_prediction()`, `fetch_mobidb()`, `fetch_quickgo()`, `fetch_uniprot()` and `fetch_uniprot_proteome()` got additional arguments:
+  * `timeout`: a numeric value specifying the time in seconds until the download times out.
+  * `max_tries`: a numeric value that specifies the number of times the function tries to download the data in case an error occurs.
+
 
 ## Bug fixes
 
 * `fetch_uniprot()` previously had an issue where it incorrectly identified certain IDs as UniProt IDs, such as ENSEMBL IDs. For example, it would incorrectly interpret `"CON_ENSEMBL:ENSBTAP00000037665"` as `"P00000"`. To address this, the function now requires that UniProt IDs are not preceded or followed by letters or digits. This means that UniProt IDs should be recognized only if they stand alone or are separated by non-alphanumeric characters. For instance, in the string `"P02545;P20700"`, both `"P02545"` and `"P20700"` are correctly identified as UniProt IDs because they are separated by a semicolon and not attached to any other letters or digits.
 * `calculate_go_enrichment()` now correctly uses the total number of provided proteins for the contingency table. Previously it falsely only considered proteins with a GO annotation for the enrichment analysis.
 
+## Additional Changes
+
+* `fetch_uniprot()` and `fetch_uniprot_proteome()` are more resistant to database connection issues. They also give more informative messages as to why the data could not be retrieved.
+
 # protti 0.8.0
 
 ## New features
diff --git a/vignettes/data_analysis_dose_response_workflow.Rmd b/vignettes/data_analysis_dose_response_workflow.Rmd
index 977ffb01..d8481d62 100644
--- a/vignettes/data_analysis_dose_response_workflow.Rmd
+++ b/vignettes/data_analysis_dose_response_workflow.Rmd
@@ -44,7 +44,7 @@ A typical dose-response experiment contains multiple samples that were treated w
 
 For limited proteolysis-coupled to mass spectrometry ([LiP-MS](https://www.nature.com/articles/nbt.2999)) data, dose-response curves have been used previously for the identification of drug binding sites in complex proteomes ([Piazza 2020](https://www.nature.com/articles/s41467-020-18071-x)). Since LiP-MS data is analysed on the peptide or precursor* level, using additional information about peptide behaviour from multiple conditions reduces false discovery rate.
 
-_*A peptide precursor is the actual molecular unit that was detected on the mass spectrometer. This is a peptide with one specific charge state and its modification(s)._
+_A peptide precursor is the actual molecular unit that was detected on the mass spectrometer. This is a peptide with one specific charge state and its modification(s)._
 
 # Getting started
 
@@ -166,6 +166,8 @@ $b$ is the Hill's coefficient (e.g. the negative slope at the inflection point)
 
 The output of `fit_drc_4p()` provides extensive information on the goodness of fit. Furthermore, the function filters and ranks fits based on several criteria which include completeness of data and a significance cutoff based on an adjusted p-value obtained from ANOVA. For details about the filtering steps you can read the function documentation by calling `?fit_drc_4p`. If you only care about your potential hits (and exclude precursors/peptides/proteins that do not pass the filtering e.g. due to too few observations), you can choose `filter = "pre"`, which filters the data before model fitting. This speeds up the process because fewer models need to be fit. If you want to perform enrichment analysis later on you should keep the default `filter = "post"`. This will fit all possible models and then only annotate models based on whether they passed or failed the filtering step (`passed_filter`).
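For orientation, the four-parameter log-logistic model that `fit_drc_4p()` fits can be written down in a few lines of R. The parameterisation below follows the common `drc` convention and is only meant as an illustration; the exact internal parameterisation used by `fit_drc_4p()` may differ.

```{r four_parameter_model_sketch, eval = FALSE}
# Four-parameter log-logistic function in the drc-style parameterisation:
# c = lower asymptote, d = upper asymptote, e = inflection point (ED50) and
# b = Hill's coefficient (the negative slope at the inflection point).
four_param_log_logistic <- function(dose, b, c, d, e) {
  c + (d - c) / (1 + exp(b * (log(dose) - log(e))))
}
```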
+Since **protti** version `0.8.0` we recommend you use the `n_replicate_completeness` and `n_condition_completeness` arguments in order to define cutoffs for completeness of replicates in a given number of conditions. This makes the output of the function reproducible independently of the input data. With the provided cutoffs below we define that at least 2 replicates need to be detected in 4 conditions. Any precursor with lower completeness than that will not be considered and will be removed from the data.
+
 ```{r model_fit, eval = test_protti}
 fit <- data_normalised %>%
   fit_drc_4p(
@@ -174,6 +176,8 @@ fit <- data_normalised %>%
     response = normalised_intensity_log2,
     dose = r_condition,
     filter = "post",
+    n_replicate_completeness = 2, # specify this based on your data and cutoff criteria
+    n_condition_completeness = 4, # specify this based on your data and cutoff criteria
     retain_columns = c(pg_protein_accessions)
   )
 
@@ -196,7 +200,9 @@ parallel_fit <- data_normalised %>%
     response = normalised_intensity_log2,
     dose = r_condition,
     retain_columns = c(pg_protein_accessions),
-    n_cores = 3
+    n_cores = 3,
+    n_replicate_completeness = 2, # specify this based on your data and cutoff criteria
+    n_condition_completeness = 4 # specify this based on your data and cutoff criteria
   )
 
 # remove workers again after you are done
@@ -230,7 +236,7 @@ fit %>%
 
 After model fitting, you can easily visualise model fits by plotting them with the `drc_4p_plot()` function. You can provide one or more precursor IDs to the `targets` argument or just return plots for all precursors with `targets = "all"`. Make sure that the number of plots is reasonable to return within R. You can also directly export your plots by setting `export = TRUE`.
 
-Note: The output of `fit_drc_4p()` includes two columns with data frames containing the model and original data points required for the plot. These data frames contain the original names for `dose` and `response` that you provided to `fit_drc_4p()`.
+_Note: The output of `fit_drc_4p()` includes two columns with data frames containing the model and original data points required for the plot. These data frames contain the original names for `dose` and `response` that you provided to `fit_drc_4p()`._
 
 ```{r model_plot, eval = test_protti, fig.align = "center", fig.width = 7, fig.height = 5, message = FALSE, warning = FALSE}
 # Model plotting
@@ -240,10 +246,13 @@ drc_4p_plot(fit,
   response = normalised_intensity_log2,
   targets = "_VFDVELLKLE_.2",
   unit = "pM",
+  x_axis_limits = c(10, 1e+08),
   export = FALSE
 )
 ```
 
+There are additional arguments that help you customise your plot. You can for example specify the axis limits with `x_axis_limits`. This should generally be set to the lowest non-zero and the highest concentration, e.g. `x_axis_limits = c(10, 1e+08)`. This way you can more easily see if extreme concentrations are completely missing from the curve. The `colours` argument allows you to specify your own colours for the points, confidence interval and curve.
# Further analysis

After models are fit, you can have a deeper look into the hits you obtained. **protti** provides several functions that help you to identify patterns in your data.

From 8164356052980e29786ec51ce717add7bcbb48eb Mon Sep 17 00:00:00 2001
From: jpquast
Date: Sun, 16 Jun 2024 13:48:18 +0200
Subject: [PATCH 21/35] Correct chebi URLs

---
 R/fetch_chebi.R | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/R/fetch_chebi.R b/R/fetch_chebi.R
index c79a2f53..38ae2478 100644
--- a/R/fetch_chebi.R
+++ b/R/fetch_chebi.R
@@ -35,7 +35,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) {
   if (relation == TRUE) {
     chebi_relation_result <- tryCatch(
       httr::GET(
-        "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv",
+        "https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv",
         httr::timeout(timeout)
       ),
       error = function(e) conditionMessage(e),
@@ -76,7 +76,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) {
   # Download compound data
   chebi_chemical_data_result <- tryCatch(
     httr::GET(
-      "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/chemical_data.tsv",
+      "https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/chemical_data.tsv",
       httr::timeout(timeout)
     ),
     error = function(e) conditionMessage(e),
@@ -100,7 +100,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) {
     show_col_types = FALSE
   ))
 
-  chebi_compounds_download <- tryCatch(readLines(con <- gzcon(url("ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/compounds.tsv.gz", method = "libcurl"))),
+  chebi_compounds_download <- tryCatch(readLines(con <- gzcon(url("https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/compounds.tsv.gz", method = "libcurl"))),
     error = function(e) conditionMessage(e),
     warning = function(w) conditionMessage(w)
   )
@@ -118,7 +118,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) {
     return(invisible(NULL))
   }
 
-  chebi_names_download <- tryCatch(readLines(con <- gzcon(url("ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/names.tsv.gz", method = "libcurl"))),
+  chebi_names_download <- tryCatch(readLines(con <- gzcon(url("https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/names.tsv.gz", method = "libcurl"))),
     error = function(e) conditionMessage(e),
     warning = function(w) conditionMessage(w)
   )
@@ -138,7 +138,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) {
 
   chebi_accession_result <- tryCatch(
     httr::GET(
-      "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/database_accession.tsv",
+      "https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/database_accession.tsv",
       httr::timeout(timeout)
     ),
     error = function(e) conditionMessage(e),
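As a standalone illustration of the download pattern that the patch above switches from FTP to HTTPS, the following sketch mirrors the `tryCatch()` wrappers used in `fetch_chebi()`. Parsing the response body with `readr::read_tsv()` is an assumption about how the text is consumed downstream, not a copy of the package code.

```r
library(httr)
library(readr)

# Fetch one ChEBI flat file over HTTPS with a timeout; errors and warnings
# are captured and returned as character messages instead of aborting.
chebi_relation_result <- tryCatch(
  GET(
    "https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv",
    timeout(60)
  ),
  error = function(e) conditionMessage(e),
  warning = function(w) conditionMessage(w)
)

# A "response" object means the request went through; anything else is a message.
if (inherits(chebi_relation_result, "response") && !http_error(chebi_relation_result)) {
  chebi_relation <- read_tsv(content(chebi_relation_result, as = "text"), show_col_types = FALSE)
}
```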
From c55f5be16c0b9bf2030eae4497b9e15a8bb0bacd Mon Sep 17 00:00:00 2001
From: jpquast
Date: Sun, 16 Jun 2024 16:12:13 +0200
Subject: [PATCH 22/35] Document function

---
 NEWS.md                            | 3 ---
 man/calculate_protein_abundance.Rd | 5 +++++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 760c14de..7348ee30 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -8,9 +8,6 @@
 * `fetch_alphafold_aligned_error()`, `fetch_alphafold_prediction()`, `fetch_mobidb()`, `fetch_quickgo()`, `fetch_uniprot()` and `fetch_uniprot_proteome()` got additional arguments:
   * `timeout`: a numeric value specifying the time in seconds until the download times out.
   * `max_tries`: a numeric value that specifies the number of times the function tries to download the data in case an error occurs.
-
-
-## New features
 * Enhanced Flexibility in Protein Quantification: Introduced the `min_n_peptides` parameter to the `calculate_protein_abundance()` function. This allows users to specify the minimum number of peptides per protein needed for analysis. Default is set at three peptides.
 
 ## Bug fixes
diff --git a/man/calculate_protein_abundance.Rd b/man/calculate_protein_abundance.Rd
index cab700fd..7fac88ed 100644
--- a/man/calculate_protein_abundance.Rd
+++ b/man/calculate_protein_abundance.Rd
@@ -11,6 +11,7 @@ calculate_protein_abundance(
   precursor,
   peptide,
   intensity_log2,
+  min_n_peptides = 3,
   method = "sum",
   for_plot = FALSE,
   retain_columns = NULL
@@ -33,6 +34,10 @@ to more than three precursors. The quantification is done on the precursor level
 \item{intensity_log2}{a numeric column in the \code{data} data frame that contains log2
 transformed precursor intensities.}
 
+\item{min_n_peptides}{An integer specifying the minimum number of peptides required
+for a protein to be included in the analysis. The default value is 3, which means
+proteins with fewer than three unique peptides will be excluded from the analysis.}
+
 \item{method}{a character value specifying with which method protein quantities should be
 calculated. Possible options include \code{"sum"}, which takes the sum of all precursor
 intensities as the protein abundance. Another option is \code{"iq"}, which performs protein

From 27ba11adf5e4408592d20966b47f2e4a868be7c7 Mon Sep 17 00:00:00 2001
From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com>
Date: Wed, 10 Jul 2024 13:07:37 +0200
Subject: [PATCH 23/35] handle plot_network/get interactions more gracefully
 #259

---
 R/analyse_functional_network.R | 78 ++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 33 deletions(-)

diff --git a/R/analyse_functional_network.R b/R/analyse_functional_network.R
index fdc878d8..3a6acb7d 100644
--- a/R/analyse_functional_network.R
+++ b/R/analyse_functional_network.R
@@ -114,6 +114,7 @@ analyse_functional_network <- function(data,
                                        binds_treatment = NULL,
                                        halo_color = NULL,
                                        plot = TRUE) {
+
   if (!requireNamespace("STRINGdb", quietly = TRUE)) {
     message(strwrap("Package \"STRINGdb\" is needed for this function to work. Please install it.",
       prefix = "\n", initial = ""
@@ -121,16 +122,32 @@ analyse_functional_network <- function(data,
     return(invisible(NULL))
   }
 
-  STRINGdb <- get("STRINGdb", envir = loadNamespace("STRINGdb"))
+  # Ensure data frame is not empty and columns exist
+  if (nrow(data) == 0) {
+    stop("The input data frame is empty.")
+  }
+
+  required_columns <- c(ensym(protein_id), ensym(string_id))
+  missing_columns <- required_columns[!required_columns %in% colnames(data)]
+  if (length(missing_columns) > 0) {
+    stop("The following required columns are missing from the input data frame: ",
+      paste(missing_columns, collapse = ", "))
+  }
+
+  if (plot && nrow(data) > 400) {
+    stop("Please only provide the top 400 significant proteins for plots! 
STRING cannot plot more at once.") + } + data <- data %>% dplyr::distinct({{ protein_id }}, {{ string_id }}, {{ binds_treatment }}) - if (length(unique(dplyr::pull(data, !!ensym(protein_id)))) != nrow(data)) { stop(strwrap("Please provide unique annotations for each protein! The number of proteins -does not match the number of rows in your data.", prefix = "\n", initial = "")) + does not match the number of rows in your data.", prefix = "\n", initial = "")) } + STRINGdb <- get("STRINGdb", envir = loadNamespace("STRINGdb")) + string_db <- STRINGdb$new( version = version, species = organism_id, # Check on String database to get the right code (E.coli K12: 511145) @@ -147,36 +164,31 @@ does not match the number of rows in your data.", prefix = "\n", initial = "")) payload_id <- NULL if (!missing(binds_treatment)) { - if (missing(halo_color)) { - coloring <- input %>% - dplyr::filter({{ binds_treatment }}) %>% - dplyr::mutate(color = "#5680C1") - } else { - coloring <- input %>% - dplyr::filter({{ binds_treatment }}) %>% - dplyr::mutate(color = halo_color) - } + coloring <- input %>% + dplyr::filter({{ binds_treatment }}) %>% + dplyr::mutate(color = if (missing(halo_color)) "#5680C1" else halo_color) payload_id <- string_db$post_payload(dplyr::pull(coloring, {{ string_id }}), - colors = coloring$color - ) - } - if (plot == TRUE) { - if (length(unique(dplyr::pull(data, !!ensym(protein_id)))) > 400) { - stop(strwrap("Please only provide the top 400 significant proteins for plots! String -cannot plot more at once.", prefix = "\n", initial = "")) - } - string_db$plot_network(string_ids, payload_id = payload_id) - } else { - mapping <- input %>% - dplyr::distinct({{ protein_id }}, {{ string_id }}) - - interactions <- string_db$get_interactions(string_ids) %>% - dplyr::left_join(mapping, by = c("from" = rlang::as_name(rlang::enquo(string_id)))) %>% - dplyr::rename(from_protein = {{ protein_id }}) %>% - dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>% - dplyr::rename(to_protein = {{ protein_id }}) %>% - dplyr::distinct() - - return(interactions) + colors = coloring$color) } + + + tryCatch({ + if (plot) { + string_db$plot_network(string_ids, payload_id = payload_id) + } else { + mapping <- input %>% + dplyr::distinct({{ protein_id }}, {{ string_id }}) + + interactions <- string_db$get_interactions(string_ids) %>% + dplyr::left_join(mapping, by = c("from" = rlang::as_name(rlang::enquo(string_id)))) %>% + dplyr::rename(from_protein = {{ protein_id }}) %>% + dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>% + dplyr::rename(to_protein = {{ protein_id }}) %>% + dplyr::distinct() + return(interactions) + } + }, error = function(e) { + message("An error occurred during the interaction network analysis: ", e$message) + return(invisible(NULL)) + }) } From 66856510224f9fc029f933b055ea5cd02dd22e6e Mon Sep 17 00:00:00 2001 From: elena-krismer Date: Wed, 10 Jul 2024 11:16:36 +0000 Subject: [PATCH 24/35] Style code (GHA) --- R/analyse_functional_network.R | 45 +++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/R/analyse_functional_network.R b/R/analyse_functional_network.R index 3a6acb7d..8c068b82 100644 --- a/R/analyse_functional_network.R +++ b/R/analyse_functional_network.R @@ -114,7 +114,6 @@ analyse_functional_network <- function(data, binds_treatment = NULL, halo_color = NULL, plot = TRUE) { - if (!requireNamespace("STRINGdb", quietly = TRUE)) { message(strwrap("Package \"STRINGdb\" is 
needed for this function to work. Please install it.", prefix = "\n", initial = "" @@ -130,8 +129,10 @@ analyse_functional_network <- function(data, required_columns <- c(ensym(protein_id), ensym(string_id)) missing_columns <- required_columns[!required_columns %in% colnames(data)] if (length(missing_columns) > 0) { - stop("The following required columns are missing from the input data frame: ", - paste(missing_columns, collapse = ", ")) + stop( + "The following required columns are missing from the input data frame: ", + paste(missing_columns, collapse = ", ") + ) } if (plot && nrow(data) > 400) { @@ -168,27 +169,31 @@ analyse_functional_network <- function(data, dplyr::filter({{ binds_treatment }}) %>% dplyr::mutate(color = if (missing(halo_color)) "#5680C1" else halo_color) payload_id <- string_db$post_payload(dplyr::pull(coloring, {{ string_id }}), - colors = coloring$color) + colors = coloring$color + ) } - tryCatch({ - if (plot) { - string_db$plot_network(string_ids, payload_id = payload_id) - } else { - mapping <- input %>% - dplyr::distinct({{ protein_id }}, {{ string_id }}) - - interactions <- string_db$get_interactions(string_ids) %>% - dplyr::left_join(mapping, by = c("from" = rlang::as_name(rlang::enquo(string_id)))) %>% - dplyr::rename(from_protein = {{ protein_id }}) %>% - dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>% - dplyr::rename(to_protein = {{ protein_id }}) %>% - dplyr::distinct() - return(interactions) + tryCatch( + { + if (plot) { + string_db$plot_network(string_ids, payload_id = payload_id) + } else { + mapping <- input %>% + dplyr::distinct({{ protein_id }}, {{ string_id }}) + + interactions <- string_db$get_interactions(string_ids) %>% + dplyr::left_join(mapping, by = c("from" = rlang::as_name(rlang::enquo(string_id)))) %>% + dplyr::rename(from_protein = {{ protein_id }}) %>% + dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>% + dplyr::rename(to_protein = {{ protein_id }}) %>% + dplyr::distinct() + return(interactions) } - }, error = function(e) { + }, + error = function(e) { message("An error occurred during the interaction network analysis: ", e$message) return(invisible(NULL)) - }) + } + ) } From a0756d8ec16a28989cd6ba60493f1b38ffa6e62a Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Wed, 10 Jul 2024 19:19:34 +0200 Subject: [PATCH 25/35] #244 upgrade to 12.0 --- R/analyse_functional_network.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/analyse_functional_network.R b/R/analyse_functional_network.R index 3a6acb7d..67e66063 100644 --- a/R/analyse_functional_network.R +++ b/R/analyse_functional_network.R @@ -40,7 +40,7 @@ network_analysis <- #' \href{https://string-db.org/cgi/input?sessionId=bpvps5GS2As6&input_page_show_search=on}{here}. #' H. sapiens: 9606, S. cerevisiae: 4932, E. coli: 511145. #' @param version a character value that specifies the version of STRINGdb to be used. -#' Default is 11.5. +#' Default is 12.0. #' @param score_threshold a numeric value specifying the interaction score that based on #' \href{https://string-db.org/cgi/info?sessionId=bBP5N4cIf0PA&footer_active_subpage=scores}{STRING} #' has to be between 0 and 1000. 
A score closer to 1000 is related to a higher confidence for the
@@ -109,7 +109,7 @@ analyse_functional_network <- function(data,
                                        protein_id,
                                        string_id,
                                        organism_id,
-                                       version = "11.5",
+                                       version = "12.0",
                                        score_threshold = 900,
                                        binds_treatment = NULL,
                                        halo_color = NULL,

From eef58a229eec25bf2460f297a93bd04b164d8a29 Mon Sep 17 00:00:00 2001
From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com>
Date: Thu, 11 Jul 2024 08:46:16 +0200
Subject: [PATCH 26/35] fix halo color error

---
 R/analyse_functional_network.R | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/R/analyse_functional_network.R b/R/analyse_functional_network.R
index b3ed2f70..b322054d 100644
--- a/R/analyse_functional_network.R
+++ b/R/analyse_functional_network.R
@@ -165,11 +165,16 @@ analyse_functional_network <- function(data,
   payload_id <- NULL
 
   if (!missing(binds_treatment)) {
+    if (is.null(halo_color)) {
+      halo_color <- "#5680C1"
+    }
+
     coloring <- input %>%
       dplyr::filter({{ binds_treatment }}) %>%
-      dplyr::mutate(color = if (missing(halo_color)) "#5680C1" else halo_color)
+      dplyr::mutate(color = halo_color)
+
     payload_id <- string_db$post_payload(dplyr::pull(coloring, {{ string_id }}),
-      colors = coloring$color
+                                         colors = coloring$color
     )
   }

From 3c8c8445ef41f2e192b0df55e3635db1e7921b51 Mon Sep 17 00:00:00 2001
From: elena-krismer
Date: Thu, 11 Jul 2024 06:48:04 +0000
Subject: [PATCH 27/35] Style code (GHA)

---
 R/analyse_functional_network.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/analyse_functional_network.R b/R/analyse_functional_network.R
index b322054d..2ca9c6fc 100644
--- a/R/analyse_functional_network.R
+++ b/R/analyse_functional_network.R
@@ -174,7 +174,7 @@ analyse_functional_network <- function(data,
       dplyr::mutate(color = halo_color)
 
     payload_id <- string_db$post_payload(dplyr::pull(coloring, {{ string_id }}),
-                                         colors = coloring$color
+      colors = coloring$color
     )
   }

From 80497bf974d22d3cb26ec6d720baaf7c053cf428 Mon Sep 17 00:00:00 2001
From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com>
Date: Thu, 11 Jul 2024 08:57:06 +0200
Subject: [PATCH 28/35] update news.md

---
 NEWS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 7348ee30..e81ae894 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -19,6 +19,8 @@
 
 * `fetch_uniprot()` and `fetch_uniprot_proteome()` are more resistant to database connection issues. They also give more informative messages as to why the data could not be retrieved. Fixes issue #252.
 * `qc_cvs()` now properly works if the column supplied to the `condition` argument is a factor. Fixes issue #254.
+* The `analyse_functional_network()` function now includes enhanced error handling to ensure it fails gracefully in case of any issues. Fixes issue #259.
+* The default `version` parameter for `analyse_functional_network()` has been updated to 12.0, aligning with the latest STRINGdb version. Fixes issue #244.
 
 # protti 0.8.0
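To make the changes from the last few patches concrete, here is a hypothetical call to the updated function. The input data frame and its values are invented for illustration; the argument names follow the function signature shown in the diffs above.

```r
library(protti)

# Hypothetical input: one row per protein with its STRING identifier and a
# logical column marking proteins that bind the treatment.
network_input <- data.frame(
  protein_id = c("P0A799", "P0A6F5"),
  string_id = c("511145.b2926", "511145.b4143"),
  binds_treatment = c(TRUE, FALSE)
)

analyse_functional_network(
  network_input,
  protein_id = protein_id,
  string_id = string_id,
  organism_id = 511145, # E. coli K12, as noted in the code comments above
  version = "12.0",
  score_threshold = 900,
  binds_treatment = binds_treatment,
  halo_color = "#5680C1",
  plot = TRUE
)
```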
From 8a01882b533dc7056a916df4cd12d4d86664eab1 Mon Sep 17 00:00:00 2001
From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com>
Date: Thu, 11 Jul 2024 09:48:53 +0200
Subject: [PATCH 29/35] fix documentation mismatch

---
 man/analyse_functional_network.Rd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/man/analyse_functional_network.Rd b/man/analyse_functional_network.Rd
index 8cfe0446..cbe56557 100644
--- a/man/analyse_functional_network.Rd
+++ b/man/analyse_functional_network.Rd
@@ -9,7 +9,7 @@ analyse_functional_network(
   protein_id,
   string_id,
   organism_id,
-  version = "11.5",
+  version = "12.0",
   score_threshold = 900,
   binds_treatment = NULL,
   halo_color = NULL,
@@ -34,7 +34,7 @@ obtained from
 H. sapiens: 9606, S. cerevisiae: 4932, E. coli: 511145.}
 
 \item{version}{a character value that specifies the version of STRINGdb to be used.
-Default is 11.5.}
+Default is 12.0.}
 
 \item{score_threshold}{a numeric value specifying the interaction score that based on
 \href{https://string-db.org/cgi/info?sessionId=bBP5N4cIf0PA&footer_active_subpage=scores}{STRING}

From 45fe5eb6a08d7ca43b31b59b3cdef40c2a7ba8b9 Mon Sep 17 00:00:00 2001
From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com>
Date: Fri, 12 Jul 2024 11:56:02 +0200
Subject: [PATCH 30/35] check for internet connection

---
 R/analyse_functional_network.R | 45 ++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/R/analyse_functional_network.R b/R/analyse_functional_network.R
index 2ca9c6fc..6689b189 100644
--- a/R/analyse_functional_network.R
+++ b/R/analyse_functional_network.R
@@ -114,6 +114,7 @@ analyse_functional_network <- function(data,
                                        binds_treatment = NULL,
                                        halo_color = NULL,
                                        plot = TRUE) {
+
   if (!requireNamespace("STRINGdb", quietly = TRUE)) {
     message(strwrap("Package \"STRINGdb\" is needed for this function to work. Please install it.",
       prefix = "\n", initial = ""
@@ -128,6 +129,7 @@ analyse_functional_network <- function(data,
 
   required_columns <- c(ensym(protein_id), ensym(string_id))
   missing_columns <- required_columns[!required_columns %in% colnames(data)]
+
   if (length(missing_columns) > 0) {
     stop(
       "The following required columns are missing from the input data frame: ",
@@ -142,20 +144,38 @@ analyse_functional_network <- function(data,
   data <- data %>%
     dplyr::distinct({{ protein_id }}, {{ string_id }}, {{ binds_treatment }})
 
+
   if (length(unique(dplyr::pull(data, !!ensym(protein_id)))) != nrow(data)) {
     stop(strwrap("Please provide unique annotations for each protein! 
The number of proteins does not match the number of rows in your data.", prefix = "\n", initial = "")) } + if (!curl::has_internet()) { + message("No internet connection.") + return(invisible(NULL)) + } + STRINGdb <- get("STRINGdb", envir = loadNamespace("STRINGdb")) - string_db <- STRINGdb$new( - version = version, - species = organism_id, # Check on String database to get the right code (E.coli K12: 511145) - score_threshold = score_threshold, # Cutoff score to consider something an interaction - input_directory = "" + string_db <- tryCatch( + { + STRINGdb$new( + version = version, + species = organism_id, # Check on String database to get the right code (E.coli K12: 511145) + score_threshold = score_threshold, # Cutoff score to consider something an interaction + input_directory = "" + ) + }, + error = function(e) { + e$message + } ) + if (is.character(string_db)) { + message("An error occurred during the interaction network analysis: ", string_db) + return(invisible(NULL)) + } + input <- data %>% dplyr::mutate({{ string_id }} := stringr::str_extract({{ string_id }}, pattern = ".+[^;]")) %>% tidyr::drop_na({{ string_id }}) @@ -165,7 +185,7 @@ analyse_functional_network <- function(data, payload_id <- NULL if (!missing(binds_treatment)) { - if (is.null(halo_color)) { + if (missing(halo_color)) { halo_color <- "#5680C1" } @@ -179,7 +199,7 @@ analyse_functional_network <- function(data, } - tryCatch( + interactions <- tryCatch( { if (plot) { string_db$plot_network(string_ids, payload_id = payload_id) @@ -193,12 +213,17 @@ analyse_functional_network <- function(data, dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>% dplyr::rename(to_protein = {{ protein_id }}) %>% dplyr::distinct() - return(interactions) } }, error = function(e) { - message("An error occurred during the interaction network analysis: ", e$message) - return(invisible(NULL)) + e$message } ) + + if (is.character(interactions)) { + message("An error occurred during the interaction network analysis: ", interactions) + return(invisible(NULL)) + } else { + return(interactions) + } } From 07175c0c35e49e18079da571bc5df9802180954f Mon Sep 17 00:00:00 2001 From: elena-krismer Date: Fri, 12 Jul 2024 09:57:26 +0000 Subject: [PATCH 31/35] Style code (GHA) --- R/analyse_functional_network.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/analyse_functional_network.R b/R/analyse_functional_network.R index 6689b189..ed1c0727 100644 --- a/R/analyse_functional_network.R +++ b/R/analyse_functional_network.R @@ -114,7 +114,6 @@ analyse_functional_network <- function(data, binds_treatment = NULL, halo_color = NULL, plot = TRUE) { - if (!requireNamespace("STRINGdb", quietly = TRUE)) { message(strwrap("Package \"STRINGdb\" is needed for this function to work. 
Please install it.", prefix = "\n", initial = "" From 8ace407f6365b837a1a3729e828ae9db916b532a Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Fri, 12 Jul 2024 13:04:41 +0200 Subject: [PATCH 32/35] catch the warning --- R/analyse_functional_network.R | 53 ++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/R/analyse_functional_network.R b/R/analyse_functional_network.R index 6689b189..e0097f14 100644 --- a/R/analyse_functional_network.R +++ b/R/analyse_functional_network.R @@ -159,18 +159,25 @@ analyse_functional_network <- function(data, string_db <- tryCatch( { - STRINGdb$new( - version = version, - species = organism_id, # Check on String database to get the right code (E.coli K12: 511145) - score_threshold = score_threshold, # Cutoff score to consider something an interaction - input_directory = "" + withCallingHandlers( + expr = { + STRINGdb$new( + version = version, + species = organism_id, # Check on String database to get the right code (E.coli K12: 511145) + score_threshold = score_threshold, # Cutoff score to consider something an interaction + input_directory = "" + ) + }, + warning = function(w) { + message("A warning occurred during STRINGdb object creation: ", conditionMessage(w)) + invokeRestart("muffleWarning") + } ) }, error = function(e) { e$message } ) - if (is.character(string_db)) { message("An error occurred during the interaction network analysis: ", string_db) return(invisible(NULL)) @@ -201,19 +208,27 @@ analyse_functional_network <- function(data, interactions <- tryCatch( { - if (plot) { - string_db$plot_network(string_ids, payload_id = payload_id) - } else { - mapping <- input %>% - dplyr::distinct({{ protein_id }}, {{ string_id }}) - - interactions <- string_db$get_interactions(string_ids) %>% - dplyr::left_join(mapping, by = c("from" = rlang::as_name(rlang::enquo(string_id)))) %>% - dplyr::rename(from_protein = {{ protein_id }}) %>% - dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>% - dplyr::rename(to_protein = {{ protein_id }}) %>% - dplyr::distinct() - } + withCallingHandlers( + expr = { + if (plot) { + string_db$plot_network(string_ids, payload_id = payload_id) + } else { + mapping <- input %>% + dplyr::distinct({{ protein_id }}, {{ string_id }}) + + interactions <- string_db$get_interactions(string_ids) %>% + dplyr::left_join(mapping, by = c("from" = rlang::as_name(rlang::enquo(string_id)))) %>% + dplyr::rename(from_protein = {{ protein_id }}) %>% + dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>% + dplyr::rename(to_protein = {{ protein_id }}) %>% + dplyr::distinct() + } + }, + warning = function(w) { + message("A warning occurred during the interaction network analysis: ", conditionMessage(w)) + invokeRestart("muffleWarning") + } + ) }, error = function(e) { e$message From d46a31c4bd8874ab416201c8353535f9877a3ef6 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 12 Jul 2024 12:54:50 +0100 Subject: [PATCH 33/35] Revert "Merge branch 'master' into developer" This reverts commit 4b13c009825f45ce4b015d717ce170b35aa2bc94, reversing changes made to 1826a6dc80478bd41acd2967ba1028f4c85d5094. 
--- .github/workflows/format-code.yml | 75 +++++++++++++++++++++++++++++++ DESCRIPTION | 2 +- R/calculate_sequence_coverage.R | 1 - R/fetch_uniprot.R | 22 ++++----- R/find_peptide.R | 8 ++-- R/find_peptide_in_structure.R | 2 +- R/qc_cvs.R | 23 +++++----- man/qc_cvs.Rd | 2 +- 8 files changed, 105 insertions(+), 30 deletions(-) create mode 100644 .github/workflows/format-code.yml diff --git a/.github/workflows/format-code.yml b/.github/workflows/format-code.yml new file mode 100644 index 00000000..5546cd0c --- /dev/null +++ b/.github/workflows/format-code.yml @@ -0,0 +1,75 @@ +on: + push: + paths: ["**.[rR]", "**.[qrR]md", "**.[rR]markdown", "**.[rR]nw", "**.[rR]profile"] + +name: Style +env: + GITHUB_ACTOR: "actions-user" + +jobs: + style: + runs-on: ubuntu-latest + permissions: + contents: write + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup R + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - name: Install dependencies + uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::styler, any::roxygen2 + needs: styler + + - name: Enable styler cache + run: styler::cache_activate() + shell: Rscript {0} + + - name: Determine cache location + id: styler-location + run: | + cat( + "location=", + styler::cache_info(format = "tabular")$location, + "\n", + file = Sys.getenv("GITHUB_OUTPUT"), + append = TRUE, + sep = "" + ) + shell: Rscript {0} + + - name: Cache styler + uses: actions/cache@v4 + with: + path: ${{ steps.styler-location.outputs.location }} + key: ${{ runner.os }}-styler-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-styler- + ${{ runner.os }}- + + - name: Style + run: styler::style_pkg() + shell: Rscript {0} + + - name: Commit and push changes + run: | + if FILES_TO_COMMIT=($(git diff-index --name-only ${{ github.sha }} \ + | egrep --ignore-case '\.(R|[qR]md|Rmarkdown|Rnw|Rprofile)$')) + then + git config --local user.name "$GITHUB_ACTOR" + git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + git commit ${FILES_TO_COMMIT[*]} -m "Style code (GHA)" + git pull --ff-only + git push origin + else + echo "No changes to commit." + fi diff --git a/DESCRIPTION b/DESCRIPTION index 9e1775f4..ba41caf0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: protti Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools -Version: 0.8.0 +Version: 0.8.0.9000 Authors@R: c(person(given = "Jan-Philipp", family = "Quast", diff --git a/R/calculate_sequence_coverage.R b/R/calculate_sequence_coverage.R index e670b1b0..7d43fd93 100644 --- a/R/calculate_sequence_coverage.R +++ b/R/calculate_sequence_coverage.R @@ -77,5 +77,4 @@ calculate_sequence_coverage <- data %>% dplyr::left_join(result, by = c(rlang::as_name(rlang::enquo(protein_sequence)), groups)) - } diff --git a/R/fetch_uniprot.R b/R/fetch_uniprot.R index cb471d4f..85e7d837 100644 --- a/R/fetch_uniprot.R +++ b/R/fetch_uniprot.R @@ -78,16 +78,16 @@ fetch_uniprot <- non_conform_ids <- uniprot_ids[!id_test] # if non_conform_ids contain IDs they are extracted and fetched. 
contains_valid_id <- non_conform_ids[stringr::str_detect(non_conform_ids, - pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}" + pattern = "(?% dplyr::mutate(accession = stringr::str_extract_all(.data$input_id, - pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}" + pattern = "(?% tidyr::unnest("accession") %>% dplyr::distinct() @@ -95,7 +95,7 @@ fetch_uniprot <- uniprot_ids_filtered <- unique(c(uniprot_ids_filtered, valid_id_annotations$accession)) non_identifiable_id <- non_conform_ids[!stringr::str_detect(non_conform_ids, - pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}" + pattern = "(?% - dplyr::left_join(valid_id_annotations, by = "input_id") %>% - dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession)) + original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>% + dplyr::left_join(valid_id_annotations, by = "input_id") %>% + dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession)) - result <- result %>% - dplyr::right_join(original_ids, by = "accession") %>% - dplyr::relocate(.data$accession, .data$input_id) + result <- result %>% + dplyr::right_join(original_ids, by = "accession") %>% + dplyr::relocate("accession", "input_id") return(result) } @@ -242,7 +242,7 @@ They were fetched and the original input ID can be found in the "input_id" colum result <- result %>% dplyr::right_join(original_ids, by = "accession") %>% - dplyr::relocate(.data$accession, .data$input_id) + dplyr::relocate("accession", "input_id") result } diff --git a/R/find_peptide.R b/R/find_peptide.R index 4abb9be2..5091a321 100644 --- a/R/find_peptide.R +++ b/R/find_peptide.R @@ -52,8 +52,8 @@ find_peptide <- end = .data$end + 1 )) - data %>% dplyr::left_join(result, c( - rlang::as_name(rlang::enquo(protein_sequence)), - rlang::as_name(rlang::enquo(peptide_sequence)) - )) + data %>% dplyr::left_join(result, c( + rlang::as_name(rlang::enquo(protein_sequence)), + rlang::as_name(rlang::enquo(peptide_sequence)) + )) } diff --git a/R/find_peptide_in_structure.R b/R/find_peptide_in_structure.R index 7b3af2c4..3fbbee52 100644 --- a/R/find_peptide_in_structure.R +++ b/R/find_peptide_in_structure.R @@ -193,7 +193,7 @@ find_peptide_in_structure <- function(peptide_data, {{ end }} > .data$ref_end_seq_id)) %>% dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>% dplyr::mutate(n_peptides = dplyr::n_distinct({{ peptide }})) %>% - tidyr::drop_na(.data$pdb_ids) %>% + tidyr::drop_na("pdb_ids") %>% dplyr::mutate(n_peptides_in_structure = sum(.data$peptide_in_pdb)) %>% dplyr::ungroup() %>% dplyr::mutate( diff --git a/R/qc_cvs.R b/R/qc_cvs.R index f1b7db21..713303d3 100644 --- a/R/qc_cvs.R +++ b/R/qc_cvs.R @@ -6,7 +6,7 @@ #' information on conditions and intensity values for each peptide, precursor or protein. #' @param grouping a character column in the \code{data} data frame that contains the grouping #' variables (e.g. peptides, precursors or proteins). -#' @param condition a column in the \code{data} data frame that contains condition information +#' @param condition a character or factor column in the \code{data} data frame that contains condition information #' (e.g. "treated" and "control"). #' @param intensity a numeric column in the \code{data} data frame that contains the corresponding #' raw or untransformed normalised intensity values for each peptide or precursor. 
@@ -119,10 +119,11 @@ The function does not handle log2 transformed data.", dplyr::distinct({{ condition }}, {{ grouping }}, .data$cv_combined, .data$cv) %>% tidyr::drop_na() %>% tidyr::pivot_longer(cols = starts_with("cv"), names_to = "type", values_to = "values") %>% - dplyr::mutate(type = ifelse(.data$type == "cv", {{ condition }}, "combined")) %>% - dplyr::mutate(type = forcats::fct_relevel(as.factor(.data$type), "combined")) %>% - dplyr::select(-{{ condition }}) %>% - dplyr::group_by(.data$type) %>% + dplyr::mutate({{ condition }} := forcats::fct_expand({{ condition }}, "combined")) %>% + dplyr::mutate({{ condition }} := replace({{ condition }}, .data$type == "cv_combined", "combined")) %>% + dplyr::mutate({{ condition }} := forcats::fct_relevel({{ condition }}, "combined")) %>% + dplyr::select(-.data$type) %>% + dplyr::group_by({{ condition }}) %>% dplyr::mutate(median = stats::median(.data$values)) %>% dplyr::distinct() @@ -137,9 +138,9 @@ The function does not handle log2 transformed data.", plot <- ggplot2::ggplot(result) + ggplot2::geom_boxplot( aes( - x = .data$type, + x = {{ condition }}, y = .data$values, - fill = .data$type + fill = {{ condition }} ), na.rm = TRUE ) + @@ -165,7 +166,7 @@ The function does not handle log2 transformed data.", } if (plot_style == "density") { plot <- ggplot2::ggplot(result) + - ggplot2::geom_density(ggplot2::aes(x = .data$values, col = .data$type), size = 1, na.rm = TRUE) + + ggplot2::geom_density(ggplot2::aes(x = .data$values, col = {{ condition }}), size = 1, na.rm = TRUE) + ggplot2::labs( title = "Coefficients of variation", x = "Coefficient of variation [%]", @@ -174,10 +175,10 @@ The function does not handle log2 transformed data.", ) + ggplot2::scale_x_continuous(limits = c(0, max_cv)) + geom_vline( - data = dplyr::distinct(result, .data$median, .data$type), + data = dplyr::distinct(result, .data$median, {{ condition }}), ggplot2::aes( xintercept = median, - col = .data$type + col = {{ condition }} ), size = 1, linetype = "dashed", @@ -198,7 +199,7 @@ The function does not handle log2 transformed data.", return(plot) } if (plot_style == "violin") { - plot <- ggplot2::ggplot(result, aes(x = .data$type, y = .data$values, fill = .data$type)) + + plot <- ggplot2::ggplot(result, aes(x = {{ condition }}, y = .data$values, fill = {{ condition }})) + ggplot2::geom_violin(na.rm = TRUE) + ggplot2::geom_boxplot(width = 0.15, fill = "white", na.rm = TRUE, alpha = 0.6) + ggplot2::labs( diff --git a/man/qc_cvs.Rd b/man/qc_cvs.Rd index ae47c047..b2525af6 100644 --- a/man/qc_cvs.Rd +++ b/man/qc_cvs.Rd @@ -21,7 +21,7 @@ information on conditions and intensity values for each peptide, precursor or pr \item{grouping}{a character column in the \code{data} data frame that contains the grouping variables (e.g. peptides, precursors or proteins).} -\item{condition}{a column in the \code{data} data frame that contains condition information +\item{condition}{a character or factor column in the \code{data} data frame that contains condition information (e.g. 
"treated" and "control").} \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding From fbf265ecb3fd8abceec8c70b96c802a40a6b8da5 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 12 Jul 2024 17:50:15 +0100 Subject: [PATCH 34/35] Bump version and update cran-comments --- DESCRIPTION | 2 +- NEWS.md | 2 +- cran-comments.md | 10 ++++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ba41caf0..0faf4c66 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: protti Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools -Version: 0.8.0.9000 +Version: 0.9.0 Authors@R: c(person(given = "Jan-Philipp", family = "Quast", diff --git a/NEWS.md b/NEWS.md index e81ae894..f947cda2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# protti 0.8.0.9000 +# protti 0.9.0 ## New features diff --git a/cran-comments.md b/cran-comments.md index 9c3f09ea..29084767 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,11 +1,13 @@ ## Submission -* We added new features and fixed bugs. +* We specifically addressed and fixed the issue raised by Prof. Brian Ripley: + * The `analyse_functional_network()` function did not fail gracefully. +* Additinally we added new features and fixed bugs. ## Test environments -* macOS-latest (on GitHub actions), R 4.3.3 -* windows-latest (on GitHub actions), R 4.3.3 -* ubuntu-20.04 (on GitHub actions), R 4.3.3 +* macOS-latest (on GitHub actions), R 4.4.1 +* windows-latest (on GitHub actions), R 4.4.1 +* ubuntu-20.04 (on GitHub actions), R 4.4.1 * ubuntu-20.04 (on GitHub actions), r-devel * windows-ix86+x86_64 (win-builder), r-devel * fedora-clang-devel (R-hub), r-devel From 645ce604d1cb62b2a3994b3893503d0e0ffe153b Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 12 Jul 2024 17:52:54 +0100 Subject: [PATCH 35/35] Provide more info in cran-comments --- cran-comments.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cran-comments.md b/cran-comments.md index 29084767..9356edf4 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -2,7 +2,8 @@ * We specifically addressed and fixed the issue raised by Prof. Brian Ripley: * The `analyse_functional_network()` function did not fail gracefully. -* Additinally we added new features and fixed bugs. + * We implemented a `try_catch()` that specifically rescues the cases in which the `STRINGdb` package does not fail gracefully. This fixes the issue. +* Additionally we added new features and fixed bugs. ## Test environments * macOS-latest (on GitHub actions), R 4.4.1