diff --git a/.github/workflows/format-code.yml b/.github/workflows/format-code.yml new file mode 100644 index 00000000..5546cd0c --- /dev/null +++ b/.github/workflows/format-code.yml @@ -0,0 +1,75 @@ +on: + push: + paths: ["**.[rR]", "**.[qrR]md", "**.[rR]markdown", "**.[rR]nw", "**.[rR]profile"] + +name: Style +env: + GITHUB_ACTOR: "actions-user" + +jobs: + style: + runs-on: ubuntu-latest + permissions: + contents: write + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup R + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - name: Install dependencies + uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::styler, any::roxygen2 + needs: styler + + - name: Enable styler cache + run: styler::cache_activate() + shell: Rscript {0} + + - name: Determine cache location + id: styler-location + run: | + cat( + "location=", + styler::cache_info(format = "tabular")$location, + "\n", + file = Sys.getenv("GITHUB_OUTPUT"), + append = TRUE, + sep = "" + ) + shell: Rscript {0} + + - name: Cache styler + uses: actions/cache@v4 + with: + path: ${{ steps.styler-location.outputs.location }} + key: ${{ runner.os }}-styler-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-styler- + ${{ runner.os }}- + + - name: Style + run: styler::style_pkg() + shell: Rscript {0} + + - name: Commit and push changes + run: | + if FILES_TO_COMMIT=($(git diff-index --name-only ${{ github.sha }} \ + | egrep --ignore-case '\.(R|[qR]md|Rmarkdown|Rnw|Rprofile)$')) + then + git config --local user.name "$GITHUB_ACTOR" + git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + git commit ${FILES_TO_COMMIT[*]} -m "Style code (GHA)" + git pull --ff-only + git push origin + else + echo "No changes to commit." + fi diff --git a/DESCRIPTION b/DESCRIPTION index 9e1775f4..0faf4c66 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: protti Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools -Version: 0.8.0 +Version: 0.9.0 Authors@R: c(person(given = "Jan-Philipp", family = "Quast", diff --git a/NEWS.md b/NEWS.md index b49386b8..f947cda2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,27 @@ +# protti 0.9.0 + +## New features + +* `calculate_go_enrichment()` got additional arguments. + * `replace_long_name`: a logical argument that specifies if GO term names above 50 characters should be replaced by the GO ID instead for the plot. This ensures that the plotting area doesn't become too small due to the long name. The default is `TRUE`. + * `label_move_frac`: a numeric argument between 0 and 1 that specifies which labels should be moved outside of the bar. The default is 0.2, which means that the labels of all bars that have a size of 20% or less of the largest bar are moved to the right of the bar. This prevents labels from overlapping with the bar boundaries. +* `fetch_alphafold_aligned_error()`, `fetch_alphafold_prediction()`, `fetch_mobidb()`, `fetch_quickgo()`, `fetch_uniprot()` and `fetch_uniprot_proteome()` got additional arguments: + * `timeout`: a numeric value specifying the time in seconds until the download times out. + * `max_tries`: a numeric value that specifies the number of times the function tries to download the data in case an error occurs. +* Enhanced Flexibility in Protein Quantification: Introduced the `min_n_peptides` parameter to the `calculate_protein_abundance()` function. 
This allows users to specify the minimum number of peptides per protein needed for analysis. Default is set at three peptides. + +## Bug fixes + +* `fetch_uniprot()` previously had an issue where it incorrectly identified certain IDs as UniProt IDs, such as ENSEMBL IDs. For example, it would incorrectly interpret `"CON_ENSEMBL:ENSBTAP00000037665"` as `"P00000"`. To address this, the function now requires that UniProt IDs are not preceded or followed by letters or digits. This means that UniProt IDs should be recognized only if they stand alone or are separated by non-alphanumeric characters. For instance, in the string `"P02545;P20700"`, both `"P02545"` and `"P20700"` are correctly identified as UniProt IDs because they are separated by a semicolon and not attached to any other letters or digits. Fixes issue #245. +* `calculate_go_enrichment()` now correctly uses the total number of provided proteins for the contingency table. Previously it falsely only considered proteins with a GO annotation for the enrichment analysis. + +## Additional Changes + +* `fetch_uniprot()` and `fetch_uniprot_proteome()` are more resistant to database connection issues. They also give more informative messages as to why the data could not be retrieved. Fixes issue #252. +* `qc_csv()` now properly works if the column supplied to the `condition` argument is a factor. Fixes issue #254. +* The `analyse_functional_network()` function now includes enhanced error handling to ensure it fails gracefully in case of any issues. Fixes issue #259. +* The default `version` parameter for `analyse_functional_network()` has been updated to 12.0, aligning with the latest STRINGdb version. Fixes issue #244. + # protti 0.8.0 ## New features diff --git a/R/analyse_functional_network.R b/R/analyse_functional_network.R index fdc878d8..e433383d 100644 --- a/R/analyse_functional_network.R +++ b/R/analyse_functional_network.R @@ -40,7 +40,7 @@ network_analysis <- #' \href{https://string-db.org/cgi/input?sessionId=bpvps5GS2As6&input_page_show_search=on}{here}. #' H. sapiens: 9606, S. cerevisiae: 4932, E. coli: 511145. #' @param version a character value that specifies the version of STRINGdb to be used. -#' Default is 11.5. +#' Default is 12.0. #' @param score_threshold a numeric value specifying the interaction score that based on #' \href{https://string-db.org/cgi/info?sessionId=bBP5N4cIf0PA&footer_active_subpage=scores}{STRING} #' has to be between 0 and 1000. A score closer to 1000 is related to a higher confidence for the @@ -109,7 +109,7 @@ analyse_functional_network <- function(data, protein_id, string_id, organism_id, - version = "11.5", + version = "12.0", score_threshold = 900, binds_treatment = NULL, halo_color = NULL, @@ -121,22 +121,66 @@ analyse_functional_network <- function(data, return(invisible(NULL)) } - STRINGdb <- get("STRINGdb", envir = loadNamespace("STRINGdb")) + # Ensure data frame is not empty and columns exist + if (nrow(data) == 0) { + stop("The input data frame is empty.") + } + + required_columns <- c(ensym(protein_id), ensym(string_id)) + missing_columns <- required_columns[!required_columns %in% colnames(data)] + + if (length(missing_columns) > 0) { + stop( + "The following required columns are missing from the input data frame: ", + paste(missing_columns, collapse = ", ") + ) + } + + if (plot && nrow(data) > 400) { + stop("Please only provide the top 400 significant proteins for plots! 
STRING cannot plot more at once.") + } + data <- data %>% dplyr::distinct({{ protein_id }}, {{ string_id }}, {{ binds_treatment }}) if (length(unique(dplyr::pull(data, !!ensym(protein_id)))) != nrow(data)) { stop(strwrap("Please provide unique annotations for each protein! The number of proteins -does not match the number of rows in your data.", prefix = "\n", initial = "")) + does not match the number of rows in your data.", prefix = "\n", initial = "")) } - string_db <- STRINGdb$new( - version = version, - species = organism_id, # Check on String database to get the right code (E.coli K12: 511145) - score_threshold = score_threshold, # Cutoff score to consider something an interaction - input_directory = "" + if (!curl::has_internet()) { + message("No internet connection.") + return(invisible(NULL)) + } + + STRINGdb <- get("STRINGdb", envir = loadNamespace("STRINGdb")) + + string_db <- tryCatch( + { + withCallingHandlers( + expr = { + STRINGdb$new( + version = version, + species = organism_id, # Check on String database to get the right code (E.coli K12: 511145) + score_threshold = score_threshold, # Cutoff score to consider something an interaction + input_directory = "" + ) + }, + warning = function(w) { + message("A warning occurred during STRINGdb object creation: ", conditionMessage(w)) + invokeRestart("muffleWarning") + } + ) + }, + error = function(e) { + e$message + } ) + if (is.character(string_db)) { + message("An error occurred during the interaction network analysis: ", string_db) + return(invisible(NULL)) + } input <- data %>% dplyr::mutate({{ string_id }} := stringr::str_extract({{ string_id }}, pattern = ".+[^;]")) %>% @@ -148,35 +192,52 @@ does not match the number of rows in your data.", prefix = "\n", initial = "")) if (!missing(binds_treatment)) { if (missing(halo_color)) { - coloring <- input %>% - dplyr::filter({{ binds_treatment }}) %>% - dplyr::mutate(color = "#5680C1") - } else { - coloring <- input %>% - dplyr::filter({{ binds_treatment }}) %>% - dplyr::mutate(color = halo_color) + halo_color <- "#5680C1" } + + coloring <- input %>% + dplyr::filter({{ binds_treatment }}) %>% + dplyr::mutate(color = halo_color) + payload_id <- string_db$post_payload(dplyr::pull(coloring, {{ string_id }}), colors = coloring$color ) } - if (plot == TRUE) { - if (length(unique(dplyr::pull(data, !!ensym(protein_id)))) > 400) { - stop(strwrap("Please only provide the top 400 significant proteins for plots! 
String -cannot plot more at once.", prefix = "\n", initial = "")) - } - string_db$plot_network(string_ids, payload_id = payload_id) - } else { - mapping <- input %>% - dplyr::distinct({{ protein_id }}, {{ string_id }}) - interactions <- string_db$get_interactions(string_ids) %>% - dplyr::left_join(mapping, by = c("from" = rlang::as_name(rlang::enquo(string_id)))) %>% - dplyr::rename(from_protein = {{ protein_id }}) %>% - dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>% - dplyr::rename(to_protein = {{ protein_id }}) %>% - dplyr::distinct() + interactions <- tryCatch( + { + withCallingHandlers( + expr = { + if (plot) { + string_db$plot_network(string_ids, payload_id = payload_id) + } else { + mapping <- input %>% + dplyr::distinct({{ protein_id }}, {{ string_id }}) + + interactions <- string_db$get_interactions(string_ids) %>% + dplyr::left_join(mapping, by = c("from" = rlang::as_name(rlang::enquo(string_id)))) %>% + dplyr::rename(from_protein = {{ protein_id }}) %>% + dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>% + dplyr::rename(to_protein = {{ protein_id }}) %>% + dplyr::distinct() + } + }, + warning = function(w) { + message("A warning occurred during the interaction network analysis: ", conditionMessage(w)) + invokeRestart("muffleWarning") + } + ) + }, + error = function(e) { + e$message + } + ) + + if (is.character(interactions)) { + message("An error occurred during the interaction network analysis: ", interactions) + return(invisible(NULL)) + } else { return(interactions) } } diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index 8db67eaa..6e37539c 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -82,6 +82,13 @@ go_enrichment <- function(...) { #' determines if the enrichment analysis should be performed in order to check for both enrichemnt and #' deenrichemnt or only one of the two. This affects the statistics performed and therefore also the displayed #' plot. +#' @param replace_long_name a logical argument that specifies if GO term names above 50 characters should +#' be replaced by the GO ID instead for the plot. This ensures that the plotting area doesn't become +#' too small due to the long name. The default is `TRUE`. +#' @param label_move_frac a numeric argument between 0 and 1 that specifies which labels should be +#' moved outside of the bar. The default is 0.2, which means that the labels of all bars that have a size +#' of 20% or less of the largest bar are moved to the right of the bar. This prevents labels from +#' overlapping with the bar boundaries. #' @param min_n_detected_proteins_in_process is a numeric argument that specifies the minimum number of #' detected proteins required for a GO term to be displayed in the plot. The default is 1, meaning #' no filtering of the plotted data is performed. This argument does not affect any computations or @@ -217,6 +224,8 @@ calculate_go_enrichment <- function(data, heatmap_fill_colour_rev = TRUE, label = TRUE, enrichment_type = "all", + replace_long_name = TRUE, + label_move_frac = 0.2, min_n_detected_proteins_in_process = 1, plot_cutoff = "adj_pval top10") { # to avoid note about no global variable binding. 
Usually this can be avoided with @@ -243,24 +252,23 @@ calculate_go_enrichment <- function(data, 'adj_pval top5', 'pval 0.05'") } - if (length(unique(dplyr::pull(data, {{ protein_id }}))) != nrow(data)) { - data <- data %>% - dplyr::ungroup() %>% - { - # conditional grouping - if (!group_missing) { - dplyr::distinct(., {{ protein_id }}, {{ is_significant }}, {{ go_annotations_uniprot }}, {{ group }}) %>% - dplyr::group_by({{ protein_id }}, {{ group }}) - } else { - dplyr::distinct(., {{ protein_id }}, {{ is_significant }}, {{ go_annotations_uniprot }}) %>% - dplyr::group_by({{ protein_id }}) - } - } %>% - dplyr::mutate({{ is_significant }} := ifelse(sum({{ is_significant }}, na.rm = TRUE) > 0, TRUE, FALSE)) %>% - # do this to remove accidental double annotations - dplyr::ungroup() %>% - dplyr::distinct() - } + # Clean-up data + data <- data %>% + dplyr::ungroup() %>% + { + # conditional grouping + if (!group_missing) { + dplyr::distinct(., {{ protein_id }}, {{ is_significant }}, {{ go_annotations_uniprot }}, {{ group }}) %>% + dplyr::group_by({{ protein_id }}, {{ group }}) + } else { + dplyr::distinct(., {{ protein_id }}, {{ is_significant }}, {{ go_annotations_uniprot }}) %>% + dplyr::group_by({{ protein_id }}) + } + } %>% + dplyr::mutate({{ is_significant }} := ifelse(sum({{ is_significant }}, na.rm = TRUE) > 0, TRUE, FALSE)) %>% + # do this to remove accidental double annotations + dplyr::ungroup() %>% + dplyr::distinct() if (!missing(go_annotations_uniprot)) { if (!stringr::str_detect( @@ -331,7 +339,6 @@ if you used the right organism ID.", prefix = "\n", initial = "")) # group argument is not missing cont_table <- go_data %>% - tidyr::drop_na(.data$go_id, {{ is_significant }}) %>% { # group argument is missing if (group_missing) { dplyr::group_by(., {{ is_significant }}) @@ -355,7 +362,8 @@ if you used the right organism ID.", prefix = "\n", initial = "")) } } %>% tidyr::complete(.data$go_id, tidyr::nesting(!!rlang::ensym(is_significant), n_sig), fill = list(n_has_process = 0)) %>% - dplyr::ungroup() + dplyr::ungroup() %>% + tidyr::drop_na(.data$go_id) if (group_missing) { @@ -394,7 +402,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) go_id = .y ) ) %>% - mutate({{ group }} := .y) + dplyr::mutate({{ group }} := .y) } ) } @@ -438,6 +446,11 @@ if you used the right organism ID.", prefix = "\n", initial = "")) dplyr::ungroup() %>% dplyr::filter(.data$n_detected_proteins_in_process >= min_n_detected_proteins_in_process) + if (replace_long_name & !missing(go_annotations_uniprot)) { + filtered_result_table <- filtered_result_table %>% + dplyr::mutate(term = ifelse(nchar(.data$term) > 50, .data$go_id, .data$term)) + } + if (!missing(group) & y_axis_free & plot_style == "barplot") { # arrange table by group and go term for plot # this ensures that the terms are in the right order for a facet plot with a free axis @@ -449,11 +462,13 @@ if you used the right organism ID.", prefix = "\n", initial = "")) if (stringr::str_detect(plot_cutoff, pattern = "top")) { split_cutoff <- stringr::str_split(plot_cutoff, pattern = " ", simplify = TRUE) type <- split_cutoff[1] - top <- stringr::str_extract(split_cutoff[2], pattern = "\\d+") + top <- as.numeric(stringr::str_extract(split_cutoff[2], pattern = "\\d+")) plot_input <- filtered_result_table %>% dplyr::ungroup() %>% dplyr::mutate(neg_log_sig = -log10(!!rlang::ensym(type))) %>% - dplyr::slice(1:top) + dplyr::group_by({{ group }}) %>% + dplyr::mutate(n = 1:dplyr::n()) %>% + dplyr::filter(n <= top) } else { split_cutoff <- 
stringr::str_split(plot_cutoff, pattern = " ", simplify = TRUE) type <- split_cutoff[1] @@ -464,6 +479,10 @@ if you used the right organism ID.", prefix = "\n", initial = "")) dplyr::filter(!!rlang::ensym(type) <= threshold) } + # move label if bar is less than 20% (default) of largest bar + plot_input <- plot_input %>% + dplyr::mutate(hjust = ifelse((.data$neg_log_sig / max(.data$neg_log_sig)) < label_move_frac, -0.15, 1.05)) + if (plot_style == "barplot") { # Check if ggforce package is available. If not prompt user to install it. if (!requireNamespace("ggforce", quietly = TRUE)) { @@ -509,13 +528,13 @@ if you used the right organism ID.", prefix = "\n", initial = "")) "%)" ), y = .data$neg_log_sig - 0.1, - hjust = 1 + hjust = .data$hjust ) ) } } + ggplot2::scale_fill_manual(values = c(Deenriched = barplot_fill_colour[1], Enriched = barplot_fill_colour[2])) + - ggplot2::scale_y_continuous(breaks = seq(0, 100, 1)) + + ggplot2::scale_y_continuous(labels = scales::number_format(accuracy = 1)) + ggplot2::coord_flip() + { if (!missing(group)) { diff --git a/R/calculate_protein_abundance.R b/R/calculate_protein_abundance.R index d85f3a75..5fb71220 100644 --- a/R/calculate_protein_abundance.R +++ b/R/calculate_protein_abundance.R @@ -14,6 +14,9 @@ #' to more than three precursors. The quantification is done on the precursor level. #' @param intensity_log2 a numeric column in the \code{data} data frame that contains log2 #' transformed precursor intensities. +#' @param min_n_peptides An integer specifying the minimum number of peptides required +#' for a protein to be included in the analysis. The default value is 3, which means +#' proteins with fewer than three unique peptides will be excluded from the analysis. #' @param method a character value specifying with which method protein quantities should be #' calculated. Possible options include \code{"sum"}, which takes the sum of all precursor #' intensities as the protein abundance. Another option is \code{"iq"}, which performs protein @@ -109,6 +112,7 @@ calculate_protein_abundance <- function(data, precursor, peptide, intensity_log2, + min_n_peptides = 3, method = "sum", for_plot = FALSE, retain_columns = NULL) { @@ -127,7 +131,7 @@ calculate_protein_abundance <- function(data, tidyr::drop_na() %>% dplyr::group_by({{ protein_id }}, {{ sample }}) %>% dplyr::mutate(n_peptides = dplyr::n_distinct(!!rlang::ensym(peptide))) %>% - dplyr::filter(.data$n_peptides > 2) %>% + dplyr::filter(.data$n_peptides >= min_n_peptides) %>% dplyr::select(-"n_peptides") %>% dplyr::ungroup() diff --git a/R/calculate_sequence_coverage.R b/R/calculate_sequence_coverage.R index e670b1b0..7d43fd93 100644 --- a/R/calculate_sequence_coverage.R +++ b/R/calculate_sequence_coverage.R @@ -77,5 +77,4 @@ calculate_sequence_coverage <- data %>% dplyr::left_join(result, by = c(rlang::as_name(rlang::enquo(protein_sequence)), groups)) - } diff --git a/R/fetch_alphafold_aligned_error.R b/R/fetch_alphafold_aligned_error.R index d370b77a..100d5e98 100644 --- a/R/fetch_alphafold_aligned_error.R +++ b/R/fetch_alphafold_aligned_error.R @@ -9,8 +9,10 @@ #' should be fetched. #' @param error_cutoff a numeric value specifying the maximum position error (in Angstroms) that should be retained. #' setting this value to a low number reduces the size of the retrieved data. Default is 20. -#' @param timeout a numeric value specifying the time in seconds until the download of an organism -#' archive times out. The default is 3600 seconds. 
+#' @param timeout a numeric value specifying the time in seconds until the download times out. +#' The default is 30 seconds. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. The default is 1. #' @param return_data_frame a logical value; if `TRUE` a data frame instead of a list #' is returned. It is recommended to only use this if information for few proteins is retrieved. #' Default is `FALSE`. @@ -49,7 +51,8 @@ #' } fetch_alphafold_aligned_error <- function(uniprot_ids = NULL, error_cutoff = 20, - timeout = 3600, + timeout = 30, + max_tries = 1, return_data_frame = FALSE, show_progress = TRUE) { if (!curl::has_internet()) { @@ -62,7 +65,7 @@ fetch_alphafold_aligned_error <- function(uniprot_ids = NULL, batches <- purrr::map( .x = uniprot_ids, - .f = ~ paste0("https://alphafold.ebi.ac.uk/files/AF-", .x, "-F1-predicted_aligned_error_v3.json") + .f = ~ paste0("https://alphafold.ebi.ac.uk/files/AF-", .x, "-F1-predicted_aligned_error_v4.json") ) names(batches) <- uniprot_ids @@ -79,13 +82,19 @@ fetch_alphafold_aligned_error <- function(uniprot_ids = NULL, .f = ~ { # query information from database query <- try_query(.x, - type = "application/json" + type = "application/json", + timeout = timeout, + max_tries = max_tries ) if (show_progress == TRUE) { pb$tick() } + if (methods::is(query, "character")) { + return(invisible(query)) + } + initial_result <- query[[1]][["predicted_aligned_error"]] final_result <- purrr::imap_dfr( @@ -114,6 +123,10 @@ fetch_alphafold_aligned_error <- function(uniprot_ids = NULL, ) %>% dplyr::distinct() + if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))) { + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } + message("The following IDs have not been retrieved correctly:") message(paste0(utils::capture.output(error_table), collapse = "\n")) } diff --git a/R/fetch_alphafold_prediction.R b/R/fetch_alphafold_prediction.R index 7b435258..8251bd63 100644 --- a/R/fetch_alphafold_prediction.R +++ b/R/fetch_alphafold_prediction.R @@ -16,6 +16,8 @@ #' Available version can be found here: https://ftp.ebi.ac.uk/pub/databases/alphafold/ #' @param timeout a numeric value specifying the time in seconds until the download of an organism #' archive times out. The default is 3600 seconds. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. The default is 5. This only applies if `uniprot_ids` were provided. #' @param return_data_frame a logical value that specifies if true, a data frame instead of a list #' is returned. It is recommended to only use this if information for few proteins is retrieved. #' Default is FALSE. @@ -74,6 +76,7 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, organism_name = NULL, version = "v4", timeout = 3600, + max_tries = 5, return_data_frame = FALSE, show_progress = TRUE) { if (!curl::has_internet()) { @@ -205,6 +208,10 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, old <- options(timeout = timeout) on.exit(options(old)) + # This does not fail gracefully. I could not figure out how to + # catch the error and the two warnings in case of a timeout. + # The error would not be caught while only the first warning that does not + # contain the timeout information was caught. 
utils::download.file(url, destfile = paste0(tempdir(), "/alphafold.tar")) utils::untar( @@ -338,6 +345,8 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, # query information from database query <- try_query(.x, type = "text/tab-separated-values", + timeout = timeout, + max_tries = max_tries, col_names = FALSE, quote = "", show_col_types = FALSE, @@ -430,6 +439,10 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, ) %>% dplyr::distinct() + if (any(stringr::str_detect(unique(error_table$error), pattern = "Timeout"))) { + message('The retrieval of data timed out. Consider increasing the "timeout" or "max_tries" argument. \n') + } + message("The following IDs have not been retrieved correctly.") message(paste0(utils::capture.output(error_table), collapse = "\n")) } diff --git a/R/fetch_chebi.R b/R/fetch_chebi.R index c79a2f53..38ae2478 100644 --- a/R/fetch_chebi.R +++ b/R/fetch_chebi.R @@ -35,7 +35,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { if (relation == TRUE) { chebi_relation_result <- tryCatch( httr::GET( - "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv", + "https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv", httr::timeout(timeout) ), error = function(e) conditionMessage(e), @@ -76,7 +76,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { # Download compound data chebi_chemical_data_result <- tryCatch( httr::GET( - "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/chemical_data.tsv", + "https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/chemical_data.tsv", httr::timeout(timeout) ), error = function(e) conditionMessage(e), @@ -100,7 +100,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { show_col_types = FALSE )) - chebi_compounds_download <- tryCatch(readLines(con <- gzcon(url("ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/compounds.tsv.gz", method = "libcurl"))), + chebi_compounds_download <- tryCatch(readLines(con <- gzcon(url("https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/compounds.tsv.gz", method = "libcurl"))), error = function(e) conditionMessage(e), warning = function(w) conditionMessage(w) ) @@ -118,7 +118,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { return(invisible(NULL)) } - chebi_names_download <- tryCatch(readLines(con <- gzcon(url("ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/names.tsv.gz", method = "libcurl"))), + chebi_names_download <- tryCatch(readLines(con <- gzcon(url("https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/names.tsv.gz", method = "libcurl"))), error = function(e) conditionMessage(e), warning = function(w) conditionMessage(w) ) @@ -138,7 +138,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { chebi_accession_result <- tryCatch( httr::GET( - "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/database_accession.tsv", + "https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/database_accession.tsv", httr::timeout(timeout) ), error = function(e) conditionMessage(e), diff --git a/R/fetch_mobidb.R b/R/fetch_mobidb.R index 9e22712a..7d6f1014 100644 --- a/R/fetch_mobidb.R +++ b/R/fetch_mobidb.R @@ -9,6 +9,10 @@ #' argument is mutually exclusive to the \code{uniprot_ids} argument. #' @param show_progress a logical value; if `TRUE` a progress bar will be shown. #' Default is `TRUE`. 
+#' @param timeout a numeric value specifying the time in seconds until the download times out. +#' The default is 60 seconds. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. The default is 2. #' #' @return A data frame that contains start and end positions for disordered and flexible protein #' regions. The \code{feature} column contains information on the source of this #' @@ -30,7 +34,7 @@ #' uniprot_ids = c("P0A799", "P62707") #' ) #' } -fetch_mobidb <- function(uniprot_ids = NULL, organism_id = NULL, show_progress = TRUE) { +fetch_mobidb <- function(uniprot_ids = NULL, organism_id = NULL, show_progress = TRUE, timeout = 60, max_tries = 2) { if (!curl::has_internet()) { message("No internet connection.") return(invisible(NULL)) } @@ -113,7 +117,8 @@ to uniprot standards and were skipped from fetching: ", # query information from database query <- try_query( url = .x, - timeout = 60 + timeout = timeout, + max_tries = max_tries ) if (show_progress == TRUE) { @@ -135,6 +140,10 @@ to uniprot standards and were skipped from fetching: ", ) %>% dplyr::distinct() + if (any(stringr::str_detect(error_table$error, pattern = "Timeout"))) { + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } + message("The following IDs have not been retrieved correctly.") message(paste0(utils::capture.output(error_table), collapse = "\n")) } diff --git a/R/fetch_quickgo.R b/R/fetch_quickgo.R index 23a5c9e5..302f42b4 100644 --- a/R/fetch_quickgo.R +++ b/R/fetch_quickgo.R @@ -21,6 +21,10 @@ #' a slim go set should be generated. This argument should only be provided if slims are retrieved. #' @param relations_slims an optional character vector that specifies the relations of GO IDs that should be #' considered for the generation of the slim dataset. This argument should only be provided if slims are retrieved. +#' @param timeout a numeric value specifying the time in seconds until the download times out. +#' The default is 1200 seconds. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. The default is 2. #' @param show_progress a logical value that indicates if a progress bar will be shown. #' Default is TRUE. #' @@ -67,6 +71,8 @@ fetch_quickgo <- function(type = "annotations", ontology_annotations = "all", go_id_slims = NULL, relations_slims = c("is_a", "part_of", "regulates", "occurs_in"), + timeout = 1200, + max_tries = 2, show_progress = TRUE) { type <- match.arg(type, c("annotations", "terms", "slims")) @@ -119,7 +125,7 @@ fetch_quickgo <- function(type = "annotations", start_time <- Sys.time() } - query_result <- try_query(url, timeout = 1200, accept = "text/tsv") + query_result <- try_query(url, timeout = timeout, max_tries = max_tries, accept = "text/tsv") if (show_progress == TRUE) { message("DONE", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)")) @@ -127,6 +133,9 @@ fetch_quickgo <- function(type = "annotations", if (methods::is(query_result, "character")) { message(query_result) + if (stringr::str_detect(query_result, pattern = "Timeout")) { + message('Consider increasing the "timeout" or "max_tries" argument. 
\n') + } return(invisible(NULL)) } @@ -167,11 +176,16 @@ fetch_quickgo <- function(type = "annotations", url <- paste0(base_url, page) test_query <- try_query(url, - type = "application/json" + type = "application/json", + timeout = timeout, + max_tries = max_tries ) if (methods::is(test_query, "character")) { - message(test_query) + message(test_query, "\n") + if (stringr::str_detect(test_query, pattern = "Timeout")) { + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } return(invisible(NULL)) } @@ -195,6 +209,8 @@ fetch_quickgo <- function(type = "annotations", ) query <- try_query(query_url, type = "application/json", + timeout = timeout, + max_tries = max_tries, simplifyDataFrame = TRUE ) @@ -213,6 +229,11 @@ fetch_quickgo <- function(type = "annotations", if (!is.null(error_vector)) { message(paste(unique(error_vector), collapse = ", ")) + + if (any(stringr::str_detect(unique(error_vector), pattern = "Timeout"))) { + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } + return(invisible(NULL)) } else { query_result <- query_result_list %>% @@ -321,12 +342,16 @@ fetch_quickgo <- function(type = "annotations", query_result <- try_query(url, type = "application/json", - timeout = 1200, + timeout = timeout, + max_tries = max_tries, simplifyDataFrame = TRUE ) if (methods::is(query_result, "character")) { message(query_result) + if (stringr::str_detect(query_result, pattern = "Timeout")) { + message('Consider increasing the "timeout" or "max_tries" argument. \n') + } return(invisible(NULL)) } else { query_result <- query_result[["results"]] %>% diff --git a/R/fetch_uniprot.R b/R/fetch_uniprot.R index d896ca84..85e7d837 100644 --- a/R/fetch_uniprot.R +++ b/R/fetch_uniprot.R @@ -8,6 +8,9 @@ #' cross-referenced database provide the database name with the prefix "xref_", e.g. `"xref_pdb"`) #' @param batchsize a numeric value that specifies the number of proteins processed in a single #' single query. Default and max value is 200. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. +#' @param timeout a numeric value that specifies the maximum request time per try. Default is 20 seconds. #' @param show_progress a logical value that determines if a progress bar will be shown. Default #' is TRUE. #' @@ -53,6 +56,8 @@ fetch_uniprot <- "xref_pdb" ), batchsize = 200, + max_tries = 10, + timeout = 20, show_progress = TRUE) { if (!curl::has_internet()) { message("No internet connection.") @@ -73,16 +78,16 @@ fetch_uniprot <- non_conform_ids <- uniprot_ids[!id_test] # if non_conform_ids contain IDs they are extracted and fetched. 
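  # For example, "P02545;P20700" yields both "P02545" and "P20700", while
  # "CON_ENSEMBL:ENSBTAP00000037665" is no longer misread as "P00000", because the
  # lookarounds in the pattern below require that an ID is not directly preceded
  # or followed by letters or digits (see issue #245).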
contains_valid_id <- non_conform_ids[stringr::str_detect(non_conform_ids, - pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}" + pattern = "(?<![A-Za-z0-9])[OPQ][0-9][A-Z0-9]{3}[0-9](?![A-Za-z0-9])|(?<![A-Za-z0-9])[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![A-Za-z0-9])" )] valid_id_annotations <- data.frame(input_id = contains_valid_id) %>% dplyr::mutate(accession = stringr::str_extract_all(.data$input_id, - pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}" + pattern = "(?<![A-Za-z0-9])[OPQ][0-9][A-Z0-9]{3}[0-9](?![A-Za-z0-9])|(?<![A-Za-z0-9])[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![A-Za-z0-9])" )) %>% tidyr::unnest("accession") %>% dplyr::distinct() @@ -90,7 +95,7 @@ fetch_uniprot <- uniprot_ids_filtered <- unique(c(uniprot_ids_filtered, valid_id_annotations$accession)) non_identifiable_id <- non_conform_ids[!stringr::str_detect(non_conform_ids, - pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}" + pattern = "(?<![A-Za-z0-9])[OPQ][0-9][A-Z0-9]{3}[0-9](?![A-Za-z0-9])|(?<![A-Za-z0-9])[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![A-Za-z0-9])" )] - original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>% - dplyr::left_join(valid_id_annotations, by = "input_id") %>% - dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession)) + original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>% + dplyr::left_join(valid_id_annotations, by = "input_id") %>% + dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession)) - result <- result %>% - dplyr::right_join(original_ids, by = "accession") %>% - dplyr::relocate(.data$accession, .data$input_id) + result <- result %>% + dplyr::right_join(original_ids, by = "accession") %>% + dplyr::relocate("accession", "input_id") return(result) } @@ -200,7 +214,13 @@ They were fetched and the original input ID can be found in the "input_id" colum collapsed_columns )) - new_result <- try_query(new_query_url, progress = FALSE, show_col_types = FALSE) + new_result <- try_query(new_query_url, + max_tries = max_tries, + timeout = timeout, + silent = FALSE, + progress = FALSE, + show_col_types = FALSE + ) # If a problem occurs at this step NULL is returned. if (!methods::is(new_result, "data.frame")) { message(new_result) @@ -222,7 +242,7 @@ They were fetched and the original input ID can be found in the "input_id" colum result <- result %>% dplyr::right_join(original_ids, by = "accession") %>% - dplyr::relocate(.data$accession, .data$input_id) + dplyr::relocate("accession", "input_id") result } diff --git a/R/fetch_uniprot_proteome.R b/R/fetch_uniprot_proteome.R index fea8eeb6..0ca51cd7 100644 --- a/R/fetch_uniprot_proteome.R +++ b/R/fetch_uniprot_proteome.R @@ -11,6 +11,10 @@ #' able to efficiently retrieve the information. If more information is needed, \code{fetch_uniprot()} #' can be used with the IDs retrieved by this function. #' @param reviewed a logical value that determines if only reviewed protein entries will be retrieved. +#' @param timeout a numeric value specifying the time in seconds until the download times out. +#' The default is 120 seconds. +#' @param max_tries a numeric value that specifies the number of times the function tries to download +#' the data in case an error occurs. The default is 5. #' #' @return A data frame that contains all protein metadata specified in \code{columns} for the #' organism of choice. 
@@ -24,7 +28,14 @@ fetch_uniprot_proteome <- function(organism_id, columns = c("accession"), - reviewed = TRUE) { + reviewed = TRUE, + timeout = 120, + max_tries = 5) { + if (!curl::has_internet()) { + message("No internet connection.") + return(invisible(NULL)) + } + if (length(organism_id) == 0) { stop("No valid organism ID found.") } @@ -47,9 +58,12 @@ fetch_uniprot_proteome <- "&format=tsv&fields=", collapsed_columns )) - result <- try_query(query_url, progress = FALSE, show_col_types = FALSE) + result <- try_query(query_url, timeout = timeout, max_tries = max_tries, silent = FALSE, progress = FALSE, show_col_types = FALSE) # result can either be a data.frame or it is a character string with the error message if (!methods::is(result, "data.frame")) { + if (stringr::str_detect(result, pattern = "Timeout")) { + message('The data retrieval timed out. Consider increasing the "timeout" or "max_tries" argument. \n') + } return(invisible(result)) } colnames(result) <- column_names diff --git a/R/find_peptide.R b/R/find_peptide.R index 4abb9be2..5091a321 100644 --- a/R/find_peptide.R +++ b/R/find_peptide.R @@ -52,8 +52,8 @@ find_peptide <- end = .data$end + 1 )) - data %>% dplyr::left_join(result, c( - rlang::as_name(rlang::enquo(protein_sequence)), - rlang::as_name(rlang::enquo(peptide_sequence)) - )) + data %>% dplyr::left_join(result, c( + rlang::as_name(rlang::enquo(protein_sequence)), + rlang::as_name(rlang::enquo(peptide_sequence)) + )) } diff --git a/R/find_peptide_in_structure.R b/R/find_peptide_in_structure.R index 7b3af2c4..3fbbee52 100644 --- a/R/find_peptide_in_structure.R +++ b/R/find_peptide_in_structure.R @@ -193,7 +193,7 @@ find_peptide_in_structure <- function(peptide_data, {{ end }} > .data$ref_end_seq_id)) %>% dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>% dplyr::mutate(n_peptides = dplyr::n_distinct({{ peptide }})) %>% - tidyr::drop_na(.data$pdb_ids) %>% + tidyr::drop_na("pdb_ids") %>% dplyr::mutate(n_peptides_in_structure = sum(.data$peptide_in_pdb)) %>% dplyr::ungroup() %>% dplyr::mutate( diff --git a/R/qc_cvs.R b/R/qc_cvs.R index f1b7db21..713303d3 100644 --- a/R/qc_cvs.R +++ b/R/qc_cvs.R @@ -6,7 +6,7 @@ #' information on conditions and intensity values for each peptide, precursor or protein. #' @param grouping a character column in the \code{data} data frame that contains the grouping #' variables (e.g. peptides, precursors or proteins). -#' @param condition a column in the \code{data} data frame that contains condition information +#' @param condition a character or factor column in the \code{data} data frame that contains condition information #' (e.g. "treated" and "control"). #' @param intensity a numeric column in the \code{data} data frame that contains the corresponding #' raw or untransformed normalised intensity values for each peptide or precursor. 
@@ -119,10 +119,11 @@ The function does not handle log2 transformed data.", dplyr::distinct({{ condition }}, {{ grouping }}, .data$cv_combined, .data$cv) %>% tidyr::drop_na() %>% tidyr::pivot_longer(cols = starts_with("cv"), names_to = "type", values_to = "values") %>% - dplyr::mutate(type = ifelse(.data$type == "cv", {{ condition }}, "combined")) %>% - dplyr::mutate(type = forcats::fct_relevel(as.factor(.data$type), "combined")) %>% - dplyr::select(-{{ condition }}) %>% - dplyr::group_by(.data$type) %>% + dplyr::mutate({{ condition }} := forcats::fct_expand({{ condition }}, "combined")) %>% + dplyr::mutate({{ condition }} := replace({{ condition }}, .data$type == "cv_combined", "combined")) %>% + dplyr::mutate({{ condition }} := forcats::fct_relevel({{ condition }}, "combined")) %>% + dplyr::select(-.data$type) %>% + dplyr::group_by({{ condition }}) %>% dplyr::mutate(median = stats::median(.data$values)) %>% dplyr::distinct() @@ -137,9 +138,9 @@ The function does not handle log2 transformed data.", plot <- ggplot2::ggplot(result) + ggplot2::geom_boxplot( aes( - x = .data$type, + x = {{ condition }}, y = .data$values, - fill = .data$type + fill = {{ condition }} ), na.rm = TRUE ) + @@ -165,7 +166,7 @@ The function does not handle log2 transformed data.", } if (plot_style == "density") { plot <- ggplot2::ggplot(result) + - ggplot2::geom_density(ggplot2::aes(x = .data$values, col = .data$type), size = 1, na.rm = TRUE) + + ggplot2::geom_density(ggplot2::aes(x = .data$values, col = {{ condition }}), size = 1, na.rm = TRUE) + ggplot2::labs( title = "Coefficients of variation", x = "Coefficient of variation [%]", @@ -174,10 +175,10 @@ The function does not handle log2 transformed data.", ) + ggplot2::scale_x_continuous(limits = c(0, max_cv)) + geom_vline( - data = dplyr::distinct(result, .data$median, .data$type), + data = dplyr::distinct(result, .data$median, {{ condition }}), ggplot2::aes( xintercept = median, - col = .data$type + col = {{ condition }} ), size = 1, linetype = "dashed", @@ -198,7 +199,7 @@ The function does not handle log2 transformed data.", return(plot) } if (plot_style == "violin") { - plot <- ggplot2::ggplot(result, aes(x = .data$type, y = .data$values, fill = .data$type)) + + plot <- ggplot2::ggplot(result, aes(x = {{ condition }}, y = .data$values, fill = {{ condition }})) + ggplot2::geom_violin(na.rm = TRUE) + ggplot2::geom_boxplot(width = 0.15, fill = "white", na.rm = TRUE, alpha = 0.6) + ggplot2::labs( diff --git a/R/try_query.R b/R/try_query.R index 27e1619e..8016a00b 100644 --- a/R/try_query.R +++ b/R/try_query.R @@ -7,7 +7,7 @@ #' @param url a character value of an URL to the website that contains the table that should be #' downloaded. #' @param max_tries a numeric value that specifies the number of times the function tries to download -#' the data in case an error occurs. +#' the data in case an error occurs. Default is 5. #' @param silent a logical value that specifies if individual messages are printed after each try #' that failed. #' @param type a character value that specifies the type of data at the target URL. Options are @@ -26,20 +26,23 @@ #' @return A data frame that contains the table from the url. try_query <- function(url, max_tries = 5, silent = TRUE, type = "text/tab-separated-values", timeout = 60, accept = NULL, ...) 
{ + # Check timeout time + if (timeout < 1) { + stop("The timeout cannot be less than 1 second.") + } + # Check if there is an internet connection first if (!curl::has_internet()) { - if (!silent) message("No internet connection.") + if (!silent) message("\nNo internet connection.") return(invisible("No internet connection")) } query_result <- "empty" try_n <- 0 + retry_delay <- 3 # seconds while (!is(query_result, "response") & - try_n < max_tries & - !ifelse(is(query_result, "character"), - stringr::str_detect(query_result, pattern = "Timeout was reached"), - FALSE - )) { # this ifelse stops requery if the timeout is too low. + try_n < max_tries + ) { if (!missing(accept)) { # with accept set query_result <- tryCatch(httr::GET(url, httr::accept(accept), httr::timeout(timeout)), @@ -53,26 +56,27 @@ try_query <- warning = function(w) conditionMessage(w) ) } - try_n <- try_n + 1 - if (!silent & !is(query_result, "response")) { - message(paste0(try_n, ". try failed!")) - } - if (!is(query_result, "response")) { - Sys.sleep(3) + + if (inherits(query_result, "response")) { + break + } else if (stringr::str_detect(query_result, "Timeout was reached")) { + if (!silent) message(paste0("\nAttempt ", try_n + 1, "/", max_tries, " failed: Timeout was reached. Retrying...")) + } else { + if (!silent) message(paste0("\nAttempt ", try_n + 1, "/", max_tries, " failed: ", query_result, ". Retrying...")) } + + try_n <- try_n + 1 + + Sys.sleep(retry_delay) + retry_delay <- retry_delay * 2 # Exponential backoff } # Check again if there is an internet connection, if not then the correct error is returned if (!curl::has_internet()) { - if (!silent) message("No internet connection.") + if (!silent) message("\nNo internet connection.") return(invisible("No internet connection")) } - if (!is(query_result, "response")) { - if (!silent) message(query_result) - return(invisible(query_result)) - } - if (httr::http_error(query_result)) { if (!silent) httr::message_for_status(query_result) return(invisible(httr::http_status(query_result)$message)) diff --git a/cran-comments.md b/cran-comments.md index 9c3f09ea..9356edf4 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,11 +1,14 @@ ## Submission -* We added new features and fixed bugs. +* We specifically addressed and fixed the issue raised by Prof. Brian Ripley: + * The `analyse_functional_network()` function did not fail gracefully. + * We implemented a `tryCatch()` that specifically rescues the cases in which the `STRINGdb` package does not fail gracefully. This fixes the issue. +* Additionally, we added new features and fixed bugs. ## Test environments -* macOS-latest (on GitHub actions), R 4.3.3 -* windows-latest (on GitHub actions), R 4.3.3 -* ubuntu-20.04 (on GitHub actions), R 4.3.3 +* macOS-latest (on GitHub actions), R 4.4.1 +* windows-latest (on GitHub actions), R 4.4.1 +* ubuntu-20.04 (on GitHub actions), R 4.4.1 * ubuntu-20.04 (on GitHub actions), r-devel * windows-ix86+x86_64 (win-builder), r-devel * fedora-clang-devel (R-hub), r-devel diff --git a/man/analyse_functional_network.Rd b/man/analyse_functional_network.Rd index 8cfe0446..cbe56557 100644 --- a/man/analyse_functional_network.Rd +++ b/man/analyse_functional_network.Rd @@ -9,7 +9,7 @@ analyse_functional_network( protein_id, string_id, organism_id, - version = "11.5", + version = "12.0", score_threshold = 900, binds_treatment = NULL, halo_color = NULL, @@ -34,7 +34,7 @@ obtained from H. sapiens: 9606, S. cerevisiae: 4932, E. 
coli: 511145.} \item{version}{a character value that specifies the version of STRINGdb to be used. -Default is 11.5.} +Default is 12.0.} \item{score_threshold}{a numeric value specifying the interaction score that based on \href{https://string-db.org/cgi/info?sessionId=bBP5N4cIf0PA&footer_active_subpage=scores}{STRING} diff --git a/man/calculate_go_enrichment.Rd b/man/calculate_go_enrichment.Rd index 90b06303..884bff16 100644 --- a/man/calculate_go_enrichment.Rd +++ b/man/calculate_go_enrichment.Rd @@ -23,6 +23,8 @@ calculate_go_enrichment( heatmap_fill_colour_rev = TRUE, label = TRUE, enrichment_type = "all", + replace_long_name = TRUE, + label_move_frac = 0.2, min_n_detected_proteins_in_process = 1, plot_cutoff = "adj_pval top10" ) @@ -99,6 +101,15 @@ determines if the enrichment analysis should be performed in order to check for deenrichemnt or only one of the two. This affects the statistics performed and therefore also the displayed plot.} +\item{replace_long_name}{a logical argument that specifies if GO term names above 50 characters should +be replaced by the GO ID instead for the plot. This ensures that the plotting area doesn't become +too small due to the long name. The default is \code{TRUE}.} + +\item{label_move_frac}{a numeric argument between 0 and 1 that specifies which labels should be +moved outside of the bar. The default is 0.2, which means that the labels of all bars that have a size +of 20\% or less of the largest bar are moved to the right of the bar. This prevents labels from +overlapping with the bar boundaries.} + \item{min_n_detected_proteins_in_process}{is a numeric argument that specifies the minimum number of detected proteins required for a GO term to be displayed in the plot. The default is 1, meaning no filtering of the plotted data is performed. This argument does not affect any computations or diff --git a/man/calculate_protein_abundance.Rd b/man/calculate_protein_abundance.Rd index cab700fd..7fac88ed 100644 --- a/man/calculate_protein_abundance.Rd +++ b/man/calculate_protein_abundance.Rd @@ -11,6 +11,7 @@ calculate_protein_abundance( precursor, peptide, intensity_log2, + min_n_peptides = 3, method = "sum", for_plot = FALSE, retain_columns = NULL @@ -33,6 +34,10 @@ to more than three precursors. The quantification is done on the precursor level \item{intensity_log2}{a numeric column in the \code{data} data frame that contains log2 transformed precursor intensities.} +\item{min_n_peptides}{An integer specifying the minimum number of peptides required +for a protein to be included in the analysis. The default value is 3, which means +proteins with fewer than three unique peptides will be excluded from the analysis.} + \item{method}{a character value specifying with which method protein quantities should be calculated. Possible options include \code{"sum"}, which takes the sum of all precursor intensities as the protein abundance. Another option is \code{"iq"}, which performs protein diff --git a/man/fetch_alphafold_aligned_error.Rd b/man/fetch_alphafold_aligned_error.Rd index 31791692..2629fdfb 100644 --- a/man/fetch_alphafold_aligned_error.Rd +++ b/man/fetch_alphafold_aligned_error.Rd @@ -7,7 +7,8 @@ fetch_alphafold_aligned_error( uniprot_ids = NULL, error_cutoff = 20, - timeout = 3600, + timeout = 30, + max_tries = 1, return_data_frame = FALSE, show_progress = TRUE ) @@ -19,8 +20,11 @@ should be fetched.} \item{error_cutoff}{a numeric value specifying the maximum position error (in Angstroms) that should be retained. 
setting this value to a low number reduces the size of the retrieved data. Default is 20.} -\item{timeout}{a numeric value specifying the time in seconds until the download of an organism -archive times out. The default is 3600 seconds.} +\item{timeout}{a numeric value specifying the time in seconds until the download times out. +The default is 30 seconds.} + +\item{max_tries}{a numeric value that specifies the number of times the function tries to download +the data in case an error occurs. The default is 1.} \item{return_data_frame}{a logical value; if \code{TRUE} a data frame instead of a list is returned. It is recommended to only use this if information for few proteins is retrieved. diff --git a/man/fetch_alphafold_prediction.Rd b/man/fetch_alphafold_prediction.Rd index 99b2141a..20d731ac 100644 --- a/man/fetch_alphafold_prediction.Rd +++ b/man/fetch_alphafold_prediction.Rd @@ -9,6 +9,7 @@ fetch_alphafold_prediction( organism_name = NULL, version = "v4", timeout = 3600, + max_tries = 5, return_data_frame = FALSE, show_progress = TRUE ) @@ -31,6 +32,9 @@ Available version can be found here: https://ftp.ebi.ac.uk/pub/databases/alphafo \item{timeout}{a numeric value specifying the time in seconds until the download of an organism archive times out. The default is 3600 seconds.} +\item{max_tries}{a numeric value that specifies the number of times the function tries to download +the data in case an error occurs. The default is 5. This only applies if \code{uniprot_ids} were provided.} + \item{return_data_frame}{a logical value that specifies if true, a data frame instead of a list is returned. It is recommended to only use this if information for few proteins is retrieved. Default is FALSE.} diff --git a/man/fetch_mobidb.Rd b/man/fetch_mobidb.Rd index ff59791e..eaa0ff10 100644 --- a/man/fetch_mobidb.Rd +++ b/man/fetch_mobidb.Rd @@ -4,7 +4,13 @@ \alias{fetch_mobidb} \title{Fetch protein disorder and mobility information from MobiDB} \usage{ -fetch_mobidb(uniprot_ids = NULL, organism_id = NULL, show_progress = TRUE) +fetch_mobidb( + uniprot_ids = NULL, + organism_id = NULL, + show_progress = TRUE, + timeout = 60, + max_tries = 2 +) } \arguments{ \item{uniprot_ids}{optional, a character vector of UniProt identifiers for which information @@ -16,6 +22,12 @@ argument is mutually exclusive to the \code{uniprot_ids} argument.} \item{show_progress}{a logical value; if \code{TRUE} a progress bar will be shown. Default is \code{TRUE}.} + +\item{timeout}{a numeric value specifying the time in seconds until the download times out. +The default is 60 seconds.} + +\item{max_tries}{a numeric value that specifies the number of times the function tries to download +the data in case an error occurs. The default is 2.} } \value{ A data frame that contains start and end positions for disordered and flexible protein diff --git a/man/fetch_quickgo.Rd b/man/fetch_quickgo.Rd index 0f0da3ac..fed8c218 100644 --- a/man/fetch_quickgo.Rd +++ b/man/fetch_quickgo.Rd @@ -11,6 +11,8 @@ fetch_quickgo( ontology_annotations = "all", go_id_slims = NULL, relations_slims = c("is_a", "part_of", "regulates", "occurs_in"), + timeout = 1200, + max_tries = 2, show_progress = TRUE ) } @@ -36,6 +38,12 @@ a slim go set should be generated. This argument should only be provided if slim \item{relations_slims}{an optional character vector that specifies the relations of GO IDs that should be considered for the generation of the slim dataset. 
This argument should only be provided if slims are retrieved.} +\item{timeout}{a numeric value specifying the time in seconds until the download times out. +The default is 1200 seconds.} + +\item{max_tries}{a numeric value that specifies the number of times the function tries to download +the data in case an error occurs. The default is 2.} + \item{show_progress}{a logical value that indicates if a progress bar will be shown. Default is TRUE.} } diff --git a/man/fetch_uniprot.Rd b/man/fetch_uniprot.Rd index 6f38424b..0afed05f 100644 --- a/man/fetch_uniprot.Rd +++ b/man/fetch_uniprot.Rd @@ -10,6 +10,8 @@ fetch_uniprot( "xref_string", "go_f", "go_p", "go_c", "cc_interaction", "ft_act_site", "ft_binding", "cc_cofactor", "cc_catalytic_activity", "xref_pdb"), batchsize = 200, + max_tries = 10, + timeout = 20, show_progress = TRUE ) } @@ -23,6 +25,11 @@ cross-referenced database provide the database name with the prefix "xref_", e.g \item{batchsize}{a numeric value that specifies the number of proteins processed in a single single query. Default and max value is 200.} +\item{max_tries}{a numeric value that specifies the number of times the function tries to download +the data in case an error occurs.} + +\item{timeout}{a numeric value that specifies the maximum request time per try. Default is 20 seconds.} + \item{show_progress}{a logical value that determines if a progress bar will be shown. Default is TRUE.} } diff --git a/man/fetch_uniprot_proteome.Rd b/man/fetch_uniprot_proteome.Rd index 4b93b769..1859d1c3 100644 --- a/man/fetch_uniprot_proteome.Rd +++ b/man/fetch_uniprot_proteome.Rd @@ -4,7 +4,13 @@ \alias{fetch_uniprot_proteome} \title{Fetch proteome data from UniProt} \usage{ -fetch_uniprot_proteome(organism_id, columns = c("accession"), reviewed = TRUE) +fetch_uniprot_proteome( + organism_id, + columns = c("accession"), + reviewed = TRUE, + timeout = 120, + max_tries = 5 +) } \arguments{ \item{organism_id}{a numeric value that specifies the NCBI taxonomy identifier (TaxId) for an @@ -18,6 +24,12 @@ able to efficiently retrieve the information. If more information is needed, \co can be used with the IDs retrieved by this function.} \item{reviewed}{a logical value that determines if only reviewed protein entries will be retrieved.} + +\item{timeout}{a numeric value specifying the time in seconds until the download times out. +The default is 120 seconds.} + +\item{max_tries}{a numeric value that specifies the number of times the function tries to download +the data in case an error occurs. The default is 5.} } \value{ A data frame that contains all protein metadata specified in \code{columns} for the diff --git a/man/qc_cvs.Rd b/man/qc_cvs.Rd index ae47c047..b2525af6 100644 --- a/man/qc_cvs.Rd +++ b/man/qc_cvs.Rd @@ -21,7 +21,7 @@ information on conditions and intensity values for each peptide, precursor or pr \item{grouping}{a character column in the \code{data} data frame that contains the grouping variables (e.g. peptides, precursors or proteins).} -\item{condition}{a column in the \code{data} data frame that contains condition information +\item{condition}{a character or factor column in the \code{data} data frame that contains condition information (e.g. 
"treated" and "control").} \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding diff --git a/man/try_query.Rd b/man/try_query.Rd index 3f320514..90d61467 100644 --- a/man/try_query.Rd +++ b/man/try_query.Rd @@ -19,7 +19,7 @@ try_query( downloaded.} \item{max_tries}{a numeric value that specifies the number of times the function tries to download -the data in case an error occurs.} +the data in case an error occurs. Default is 5.} \item{silent}{a logical value that specifies if individual messages are printed after each try that failed.} diff --git a/tests/testthat/test-structure_functions.R b/tests/testthat/test-structure_functions.R index 149ac32d..31fdd1c4 100644 --- a/tests/testthat/test-structure_functions.R +++ b/tests/testthat/test-structure_functions.R @@ -21,7 +21,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { expect_is(positions_structure, "data.frame") # test if position structure is in certain range as db can be updated expect_gte(nrow(positions_structure), 569) - expect_lte(nrow(positions_structure), 600) + expect_lte(nrow(positions_structure), 700) expect_equal(ncol(positions_structure), 17) }) diff --git a/vignettes/data_analysis_dose_response_workflow.Rmd b/vignettes/data_analysis_dose_response_workflow.Rmd index 977ffb01..d8481d62 100644 --- a/vignettes/data_analysis_dose_response_workflow.Rmd +++ b/vignettes/data_analysis_dose_response_workflow.Rmd @@ -44,7 +44,7 @@ A typical dose-response experiment contains multiple samples that were treated w For limited proteolysis-coupled to mass spectrometry ([LiP-MS](https://www.nature.com/articles/nbt.2999)) data, dose-response curves have been used previously for the identification of drug binding sites in complex proteomes ([Piazza 2020](https://www.nature.com/articles/s41467-020-18071-x)). Since LiP-MS data is analysed on the peptide or precursor* level, using additional information about peptide behaviour from multiple conditions reduces false discovery rate. -_*A peptide precursor is the actual molecular unit that was detected on the mass spectrometer. This is a peptide with one specific charge state and its modification(s)._ +_A peptide precursor is the actual molecular unit that was detected on the mass spectrometer. This is a peptide with one specific charge state and its modification(s)._ # Getting started @@ -166,6 +166,8 @@ $b$ is the Hill's coefficient (e.g. the negative slope at the inflection point) The output of `fit_drc_4p()` provides extensive information on the goodness of fit. Furthermore, the function filters and ranks fits based on several criteria which include completeness of data and a significance cutoff based on an adjusted p-value obtained from ANOVA. For details about the filtering steps you can read the function documentation by calling `?fit_drc_4p`. If you only care about your potential hits (and exclude precursors/peptides/proteins that do not pass the filtering e.g. due to too few observations), you can choose `filter = "pre"`, which filters the data before model fitting. This speeds up the process because less models need to be fit. If you want to perform enrichment analysis later on you should keep the default `filter = "post"`. This will fit all possible models and then only annotate models based on whether they passed or failed the filtering step (`passed_filter`). 
+Since **protti** version `0.8.0`, we recommend using the `n_replicate_completeness` and `n_condition_completeness` arguments to define cutoffs for the completeness of replicates in a given number of conditions. This makes the output of the function reproducible independently of the input data. With the cutoffs provided below, we require that a precursor is detected in at least 2 replicates in each of 4 conditions. Any precursor that does not reach this completeness is removed from the data. + ```{r model_fit, eval = test_protti} fit <- data_normalised %>% fit_drc_4p( sample = r_file_name, grouping = eot_id, response = normalised_intensity_log2, dose = r_condition, filter = "post", + n_replicate_completeness = 2, # specify this based on your data and cutoff criteria + n_condition_completeness = 4, # specify this based on your data and cutoff criteria retain_columns = c(pg_protein_accessions) ) @@ -196,7 +200,9 @@ parallel_fit <- data_normalised %>% response = normalised_intensity_log2, dose = r_condition, retain_columns = c(pg_protein_accessions), - n_cores = 3 + n_cores = 3, + n_replicate_completeness = 2, # specify this based on your data and cutoff criteria + n_condition_completeness = 4 # specify this based on your data and cutoff criteria ) # remove workers again after you are done @@ -230,7 +236,7 @@ fit %>% After model fitting, you can easily visualise model fits by plotting them with the `drc_4p_plot()` function. You can provide one or more precursor IDs to the `targets` argument or just return plots for all precursors with `targets = "all"`. Make sure that the number of plots is reasonable to return within R. You can also directly export your plots by setting `export = TRUE`. -Note: The output of `fit_drc_4p()` includes two columns with data frames containing the model and original data points required for the plot. These data frames contain the original names for `dose` and `response` that you provided to `fit_drc_4p()`. +_Note: The output of `fit_drc_4p()` includes two columns with data frames containing the model and original data points required for the plot. These data frames contain the original names for `dose` and `response` that you provided to `fit_drc_4p()`._ ```{r model_plot, eval = test_protti, fig.align = "center", fig.width = 7, fig.height = 5, message = FALSE, warning = FALSE} # Model plotting drc_4p_plot(fit, response = normalised_intensity_log2, targets = "_VFDVELLKLE_.2", unit = "pM", + x_axis_limits = c(10, 1e+08), export = FALSE ) ``` +There are additional arguments that help you customise your plot. For example, you can specify the axis limits with `x_axis_limits`. This should generally be set to the lowest non-zero and the highest concentration, e.g. `x_axis_limits = c(10, 1e+08)`. This makes it easier to see if extreme concentrations are completely missing from the curve. The `colours` argument allows you to specify your own colours for the points, confidence interval and curve. + # Further analysis After models are fit, you can have a deeper look into the hits you obtained. **protti** provides several functions that help you to identify patterns in your data.
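
+As a sketch of one such downstream step, you could test your hits for gene ontology (GO) term enrichment with `calculate_go_enrichment()`, which gained the `replace_long_name` and `label_move_frac` arguments in this release. The example below assumes that you previously fetched GO annotations with `fetch_uniprot()` into a data frame called `uniprot_go` (an illustrative name) that contains `accession` and `go_f` columns:

```{r, eval = FALSE}
# Hypothetical downstream analysis: GO term enrichment of dose-response hits.
fit %>%
  dplyr::left_join(uniprot_go, by = c("pg_protein_accessions" = "accession")) %>%
  calculate_go_enrichment(
    protein_id = pg_protein_accessions,
    is_significant = passed_filter,
    go_annotations_uniprot = go_f,
    replace_long_name = TRUE, # GO term names above 50 characters are replaced by their ID
    label_move_frac = 0.2 # labels of bars at 20% or less of the largest bar are moved outside
  )
```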