From afe2aefb422ec6335786eb7663c885b506b48934 Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 5 Apr 2023 14:48:29 +0200 Subject: [PATCH 01/71] Fix qc_proteome_coverage label order --- NEWS.md | 1 + R/qc_proteome_coverage.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 692faa2a..e2fd22bf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ * `plot_volcano()` now also works interactively if there are no significant hits. * `fetch_chebi()` fixed an issue cased by `na_if()` that changed its behaviour after the recent `dplyr` update. +* `qc_proteome_coverage()` fixed the label order of fractions proteins detected and not detected in the proteome. # protti 0.6.0 diff --git a/R/qc_proteome_coverage.R b/R/qc_proteome_coverage.R index 50161fa1..56622872 100644 --- a/R/qc_proteome_coverage.R +++ b/R/qc_proteome_coverage.R @@ -96,7 +96,7 @@ qc_proteome_coverage <- function(data, ggplot2::scale_fill_manual( values = c("proteins_detected" = "#5680C1", "proteins_undetected" = "#B96DAD"), name = "Proteins", - labels = c("Detected", "Not detected") + labels = c("Not detected", "Detected") ) + ggplot2::geom_text( data = proteome_coverage %>% dplyr::filter(.data$percentage > 5), From 6e7d37b5eb264d596a15c8d794a169ea7da87f0b Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 5 Apr 2023 15:08:10 +0200 Subject: [PATCH 02/71] Fix calculate_protein_abundance() documentation --- R/calculate_protein_abundance.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/calculate_protein_abundance.R b/R/calculate_protein_abundance.R index 0e0725b2..69922e20 100644 --- a/R/calculate_protein_abundance.R +++ b/R/calculate_protein_abundance.R @@ -23,7 +23,7 @@ #' used. Default is \code{"iq"}. #' @param for_plot a logical value indicating whether the result should be only protein intensities #' or protein intensities together with precursor intensities that can be used for plotting using -#' \code{qc_protein_abundance}. 
Default is \code{FALSE}. +#' \code{peptide_profile_plot()}. Default is \code{FALSE}. #' @param retain_columns a vector indicating if certain columns should be retained from the input #' data frame. Default is not retaining additional columns \code{retain_columns = NULL}. Specific #' columns can be retained by providing their names (not in quotations marks, just like other @@ -31,7 +31,7 @@ #' #' @return If \code{for_plot = FALSE}, protein abundances are returned, if \code{for_plot = TRUE} #' also precursor intensities are returned in a data frame. The later output is ideal for plotting -#' with \code{qc_protein_abundance} and can be filtered to only include protein abundances. +#' with \code{peptide_profile_plot()} and can be filtered to only include protein abundances. #' #' @import dplyr #' @import progress From e6f5e6b9f8fec45a07e71de78e5c848e2c9270ab Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 5 Apr 2023 15:08:41 +0200 Subject: [PATCH 03/71] Add max_cv argument to qc_cvs --- NEWS.md | 4 ++++ R/qc_cvs.R | 17 ++++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index e2fd22bf..7acc8b27 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # protti 0.6.0.9000 +## New features + +* `qc_cvs()` received a new argument called `max_cv` that specifies the maximum CV that should be included in the plot. + ## Bug fixes * `plot_volcano()` now also works interactively if there are no significant hits. diff --git a/R/qc_cvs.R b/R/qc_cvs.R index cbe4cd6e..0a20a264 100644 --- a/R/qc_cvs.R +++ b/R/qc_cvs.R @@ -14,6 +14,8 @@ #' @param plot_style a character value that indicates the plotting style. \code{plot_style = "boxplot"} #' plots a boxplot, whereas \code{plot_style = "density"} plots the CV density distribution. #' \code{plot_style = "violin"} returns a violin plot. Default is \code{plot_style = "density"}. 
+#' @param max_cv a numeric value that specifies the maximum percentage of CVs that should be included +#' in the returned plot. The default value is `max_cv = 200`. #' #' @return Either a data frame with the median CVs in % or a plot showing the distribution of the CVs #' is returned. @@ -68,7 +70,8 @@ qc_cvs <- condition, intensity, plot = TRUE, - plot_style = "density") { + plot_style = "density", + max_cv = 200) { protti_colours <- "placeholder" # assign a placeholder to prevent a missing global variable warning utils::data("protti_colours", envir = environment()) # then overwrite it with real data if (plot == FALSE) { @@ -123,11 +126,11 @@ The function does not handle log2 transformed data.", dplyr::mutate(median = stats::median(.data$values)) %>% dplyr::distinct() - if (max(result$values) > 200) { + if (max(result$values) > max_cv) { cv_too_high <- result %>% - dplyr::filter(.data$values > 200) %>% + dplyr::filter(.data$values > max_cv) %>% nrow() - warning(paste(cv_too_high), " values were exluded from the plot (CV > 200 %)") + warning(paste(cv_too_high), " values were exluded from the plot (CV > ", max_cv, " %)") } if (plot_style == "boxplot") { @@ -144,7 +147,7 @@ The function does not handle log2 transformed data.", y = "Coefficient of variation [%]", fill = "Condition" ) + - ggplot2::scale_y_continuous(limits = c(0, 200)) + + ggplot2::scale_y_continuous(limits = c(0, max_cv)) + ggplot2::scale_fill_manual(values = c("grey", protti_colours)) + ggplot2::theme_bw() + ggplot2::theme( @@ -168,7 +171,7 @@ The function does not handle log2 transformed data.", y = "Density", color = "Condition" ) + - ggplot2::scale_x_continuous(limits = c(0, 200)) + + ggplot2::scale_x_continuous(limits = c(0, max_cv)) + geom_vline( data = dplyr::distinct(result, .data$median, .data$type), ggplot2::aes( @@ -203,7 +206,7 @@ The function does not handle log2 transformed data.", y = "Coefficient of variation [%]", fill = "Condition" ) + - ggplot2::scale_y_continuous(limits = c(0, 
200)) + + ggplot2::scale_y_continuous(limits = c(0, max_cv)) + ggplot2::scale_fill_manual(values = c("grey", protti_colours)) + ggplot2::theme_bw() + ggplot2::theme( From 9ad0f847c61eaaf4e6e47774542ff39e12d9d09b Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 5 Apr 2023 16:12:47 +0200 Subject: [PATCH 04/71] Fix calculate_protein_abundance retain_columns --- NEWS.md | 5 +++-- R/calculate_protein_abundance.R | 9 ++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7acc8b27..1dc61973 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,8 +7,9 @@ ## Bug fixes * `plot_volcano()` now also works interactively if there are no significant hits. -* `fetch_chebi()` fixed an issue cased by `na_if()` that changed its behaviour after the recent `dplyr` update. -* `qc_proteome_coverage()` fixed the label order of fractions proteins detected and not detected in the proteome. +* `fetch_chebi()`: fixed an issue cased by `na_if()` that changed its behaviour after the recent `dplyr` update. +* `qc_proteome_coverage()`: fixed the label order of fractions proteins detected and not detected in the proteome. +* `calculate_protein_abundance()` now correctly retains columns if `for_plot = TRUE`. Previously the columns to retain were not joined considering the precursor column, which lead to duplications of information where it did not belong. 
# protti 0.6.0 diff --git a/R/calculate_protein_abundance.R b/R/calculate_protein_abundance.R index 69922e20..6424cd1a 100644 --- a/R/calculate_protein_abundance.R +++ b/R/calculate_protein_abundance.R @@ -116,6 +116,7 @@ calculate_protein_abundance <- function(data, # Filter out any proteins with less than 3 peptides input <- data %>% + dplyr::ungroup() %>% dplyr::distinct( {{ sample }}, {{ protein_id }}, @@ -127,7 +128,7 @@ calculate_protein_abundance <- function(data, dplyr::group_by({{ protein_id }}, {{ sample }}) %>% dplyr::mutate(n_peptides = dplyr::n_distinct(!!rlang::ensym(peptide))) %>% dplyr::filter(.data$n_peptides > 2) %>% - dplyr::select(-.data$n_peptides) %>% + dplyr::select(-"n_peptides") %>% dplyr::ungroup() if (method == "sum") { @@ -227,15 +228,13 @@ calculate_protein_abundance <- function(data, !!enquo(retain_columns), colnames(combined)[!colnames(combined) %in% c( - rlang::as_name(rlang::enquo(intensity_log2)), - rlang::as_name(rlang::enquo(precursor)) + rlang::as_name(rlang::enquo(intensity_log2)) )] ) %>% dplyr::distinct() %>% dplyr::right_join(combined, by = colnames(combined)[!colnames(combined) %in% c( - rlang::as_name(rlang::enquo(intensity_log2)), - rlang::as_name(rlang::enquo(precursor)) + rlang::as_name(rlang::enquo(intensity_log2)) )]) return(combined) From cb76aa4f0fe566cfb8a40a9363cf7d8a9190d969 Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 5 Apr 2023 17:22:27 +0200 Subject: [PATCH 05/71] Add complete_sample to peptide_profile_plot --- NEWS.md | 1 + R/peptide_profile_plot.R | 27 +++++++++++++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1dc61973..532ad463 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ ## New features * `qc_cvs()` received a new argument called `max_cv` that specifies the maximum CV that should be included in the plot. +* `peptide_profile_plot()` received a new argument called `complete_sample`. 
If set to `TRUE`, each protein gets assigned all sample names that are found in the input data. This ensures that the plot always contains all samples on the x-axis even if there are no measured intensities for a specific sample. The default is `FALSE`, which is the original behaviour of the function. ## Bug fixes diff --git a/R/peptide_profile_plot.R b/R/peptide_profile_plot.R index e713f2ac..fbd40346 100644 --- a/R/peptide_profile_plot.R +++ b/R/peptide_profile_plot.R @@ -32,6 +32,8 @@ plot_peptide_profiles <- function(...) { #' @param targets a character vector that specifies elements of the grouping column which should #' be plotted. This can also be \code{"all"} if plots for all groups should be created. Depending #' on the number of elements in your grouping column this can be many plots. +#' @param complete_sample a logical value that indicates if samples that are completely missing for +#' a given protein should be anyway shown on the x-axis of the plot. The default value is `FALSE`. #' @param protein_abundance_plot a logical value. If the input for this plot comes directly from #' \code{calculate_protein_abundance} this argument can be set to \code{TRUE}. This displays all #' peptides in gray, while the protein abundance is displayed in green. @@ -41,7 +43,7 @@ plot_peptide_profiles <- function(...) { #' @param export a logical value that indicates if plots should be exported as PDF. The output #' directory will be the current working directory. The name of the file can be chosen using the #' \code{export_name} argument. -#' @param export_name A character vector that provides the name of the exported file if +#' @param export_name a character vector that provides the name of the exported file if #' \code{export = TRUE}. #' #' @return A list of peptide profile plots. 
@@ -118,6 +120,7 @@ peptide_profile_plot <- function(data, intensity_log2, grouping, targets, + complete_sample = FALSE, protein_abundance_plot = FALSE, interactive = FALSE, export = FALSE, @@ -127,19 +130,27 @@ peptide_profile_plot <- function(data, protti_colours <- "placeholder" # assign a placeholder to prevent a missing global variable warning utils::data("protti_colours", envir = environment()) # then overwrite it with real data if (missing(targets)) stop("Please provide at least one target to plot!") + + input <- data %>% + dplyr::distinct({{ sample }}, {{ peptide }}, {{ intensity_log2 }}, {{ grouping }}) %>% + tidyr::drop_na({{ intensity_log2 }}) + + if (complete_sample){ + input <- input %>% + tidyr::complete({{ sample }}, {{ grouping }}) %>% + tidyr::fill({{ peptide }}, .direction = "downup") + } + if (!("all" %in% targets)) { - input <- data %>% - dplyr::distinct({{ sample }}, {{ peptide }}, {{ intensity_log2 }}, {{ grouping }}) %>% - tidyr::drop_na({{ intensity_log2 }}) %>% + input <- input %>% dplyr::filter({{ grouping }} %in% targets) %>% split(dplyr::pull(., !!ensym(grouping))) } + if ("all" %in% targets) { groups <- length(unique(dplyr::pull(data, {{ grouping }}))) message("Splitting into ", groups, " groups and returning ", groups, " plots.") - input <- data %>% - dplyr::distinct({{ sample }}, {{ peptide }}, {{ intensity_log2 }}, {{ grouping }}) %>% - tidyr::drop_na({{ intensity_log2 }}) %>% + input <- input %>% split(dplyr::pull(., !!ensym(grouping))) } pb <- progress::progress_bar$new( @@ -197,7 +208,7 @@ peptide_profile_plot <- function(data, suppressWarnings(print(plot)) grDevices::dev.off() } else { - return(plot) + return(suppressWarnings(suppressMessages(print(plot)))) } } if (interactive == TRUE) { From 2ab10eca7481c6d1b1674abc045d76d4bd02288b Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 5 Apr 2023 17:35:09 +0200 Subject: [PATCH 06/71] Documentation update --- man/calculate_protein_abundance.Rd | 4 ++-- man/peptide_profile_plot.Rd | 6 
+++++- man/qc_cvs.Rd | 6 +++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/man/calculate_protein_abundance.Rd b/man/calculate_protein_abundance.Rd index 668df8bf..cab700fd 100644 --- a/man/calculate_protein_abundance.Rd +++ b/man/calculate_protein_abundance.Rd @@ -43,7 +43,7 @@ used. Default is \code{"iq"}.} \item{for_plot}{a logical value indicating whether the result should be only protein intensities or protein intensities together with precursor intensities that can be used for plotting using -\code{qc_protein_abundance}. Default is \code{FALSE}.} +\code{peptide_profile_plot()}. Default is \code{FALSE}.} \item{retain_columns}{a vector indicating if certain columns should be retained from the input data frame. Default is not retaining additional columns \code{retain_columns = NULL}. Specific @@ -53,7 +53,7 @@ column names, but in a vector).} \value{ If \code{for_plot = FALSE}, protein abundances are returned, if \code{for_plot = TRUE} also precursor intensities are returned in a data frame. The later output is ideal for plotting -with \code{qc_protein_abundance} and can be filtered to only include protein abundances. +with \code{peptide_profile_plot()} and can be filtered to only include protein abundances. } \description{ Determines relative protein abundances from ion quantification. Only proteins with at least diff --git a/man/peptide_profile_plot.Rd b/man/peptide_profile_plot.Rd index 1f800b6a..1f1c5777 100644 --- a/man/peptide_profile_plot.Rd +++ b/man/peptide_profile_plot.Rd @@ -11,6 +11,7 @@ peptide_profile_plot( intensity_log2, grouping, targets, + complete_sample = FALSE, protein_abundance_plot = FALSE, interactive = FALSE, export = FALSE, @@ -35,6 +36,9 @@ the data should be split. This can be for example protein IDs.} be plotted. This can also be \code{"all"} if plots for all groups should be created. 
Depending on the number of elements in your grouping column this can be many plots.} +\item{complete_sample}{a logical value that indicates if samples that are completely missing for +a given protein should be anyway shown on the x-axis of the plot. The default value is \code{FALSE}.} + \item{protein_abundance_plot}{a logical value. If the input for this plot comes directly from \code{calculate_protein_abundance} this argument can be set to \code{TRUE}. This displays all peptides in gray, while the protein abundance is displayed in green.} @@ -47,7 +51,7 @@ plots cannot be exported either.} directory will be the current working directory. The name of the file can be chosen using the \code{export_name} argument.} -\item{export_name}{A character vector that provides the name of the exported file if +\item{export_name}{a character vector that provides the name of the exported file if \code{export = TRUE}.} } \value{ diff --git a/man/qc_cvs.Rd b/man/qc_cvs.Rd index 63141229..ae47c047 100644 --- a/man/qc_cvs.Rd +++ b/man/qc_cvs.Rd @@ -10,7 +10,8 @@ qc_cvs( condition, intensity, plot = TRUE, - plot_style = "density" + plot_style = "density", + max_cv = 200 ) } \arguments{ @@ -31,6 +32,9 @@ raw or untransformed normalised intensity values for each peptide or precursor.} \item{plot_style}{a character value that indicates the plotting style. \code{plot_style = "boxplot"} plots a boxplot, whereas \code{plot_style = "density"} plots the CV density distribution. \code{plot_style = "violin"} returns a violin plot. Default is \code{plot_style = "density"}.} + +\item{max_cv}{a numeric value that specifies the maximum percentage of CVs that should be included +in the returned plot. 
The default value is \code{max_cv = 200}.} } \value{ Either a data frame with the median CVs in \% or a plot showing the distribution of the CVs From 015f1db19d10eac128da75803d53488685a9b6e1 Mon Sep 17 00:00:00 2001 From: jpquast Date: Tue, 11 Apr 2023 14:12:47 +0200 Subject: [PATCH 07/71] Fix bug in fetch_kegg --- NEWS.md | 1 + R/fetch_kegg.R | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/NEWS.md b/NEWS.md index 532ad463..78e74ebc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,7 @@ * `fetch_chebi()`: fixed an issue cased by `na_if()` that changed its behaviour after the recent `dplyr` update. * `qc_proteome_coverage()`: fixed the label order of fractions proteins detected and not detected in the proteome. * `calculate_protein_abundance()` now correctly retains columns if `for_plot = TRUE`. Previously the columns to retain were not joined considering the precursor column, which lead to duplications of information where it did not belong. +* `fetch_kegg()` now returns the pathway name again correctly. 
# protti 0.6.0 diff --git a/R/fetch_kegg.R b/R/fetch_kegg.R index 4f2c94a4..e1233f0a 100644 --- a/R/fetch_kegg.R +++ b/R/fetch_kegg.R @@ -34,6 +34,10 @@ fetch_kegg <- function(species) { return(invisible(NULL)) } colnames(result_link) <- c("kegg_id", "pathway_id") + result_link$pathway_id <- stringr::str_replace_all(result_link$pathway_id, + pattern = "path:", + replacement = "" + ) # download pathway_id names url_name <- paste("https://rest.kegg.jp/list/pathway", species, sep = "/") result_name <- try_query(url_name, col_names = FALSE, progress = FALSE, show_col_types = FALSE) @@ -42,6 +46,7 @@ fetch_kegg <- function(species) { return(invisible(NULL)) } colnames(result_name) <- c("pathway_id", "pathway_name") + # download kegg_id to uniprot_id conversion url_conv <- paste("https://rest.kegg.jp/conv/uniprot", species, sep = "/") result_conv <- try_query(url_conv, col_names = FALSE, progress = FALSE, show_col_types = FALSE) From 0cc4fe03a3128f5e0c1b95ee9b2697cdeca1bbc4 Mon Sep 17 00:00:00 2001 From: jpquast Date: Tue, 11 Apr 2023 17:08:17 +0200 Subject: [PATCH 08/71] Add colour variable to volcano_plot --- NEWS.md | 3 ++- R/volcano_plot.R | 25 ++++++++++++++++--- man/volcano_plot.Rd | 7 ++++++ ...t-fetch_extract_and_enrichment_functions.R | 2 +- tests/testthat/test-structure_functions.R | 2 +- 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 78e74ebc..7c797a65 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,10 +4,11 @@ * `qc_cvs()` received a new argument called `max_cv` that specifies the maximum CV that should be included in the plot. * `peptide_profile_plot()` received a new argument called `complete_sample`. If set to `TRUE`, each protein gets assigned all sample names that are found in the input data. This ensures that the plot always contains all samples on the x-axis even if there are no measured intensities for a specific sample. The default is `FALSE`, which is the original behaviour of the function. 
+* `volcano_plot()` received the `colour` argument that allows the user to provide custom colours for points. ## Bug fixes -* `plot_volcano()` now also works interactively if there are no significant hits. +* `volcano_plot()` now also works interactively if there are no significant hits. * `fetch_chebi()`: fixed an issue cased by `na_if()` that changed its behaviour after the recent `dplyr` update. * `qc_proteome_coverage()`: fixed the label order of fractions proteins detected and not detected in the proteome. * `calculate_protein_abundance()` now correctly retains columns if `for_plot = TRUE`. Previously the columns to retain were not joined considering the precursor column, which lead to duplications of information where it did not belong. diff --git a/R/volcano_plot.R b/R/volcano_plot.R index 36ceae26..e141ec86 100644 --- a/R/volcano_plot.R +++ b/R/volcano_plot.R @@ -54,6 +54,11 @@ volcano_protti <- function(...) { #' "-log10(q-value)". #' @param legend_label optional, a character value that specifies the legend label. Default is #' "Target". +#' @param colour optional, a character vector containing colours that should be used to colour +#' points according to the selected method. IMPORTANT: the first value in the vector is the +#' default point colour, the additional values specify colouring of target or significant points. +#' E.g. `c("grey60", "#5680C1")` to achieve the same colouring as the default for the "significant" +#' method. #' @param log2FC_cutoff optional, a numeric value that specifies the log2 transformed fold change #' cutoff used for the vertical lines, which can be used to assess the significance of changes. #' Default value is 1. 
@@ -148,12 +153,24 @@ volcano_plot <- function(data, x_axis_label = "log2(fold change)", y_axis_label = "-log10(p-value)", legend_label = "Target", + colour = NULL, log2FC_cutoff = 1, significance_cutoff = 0.01, interactive = FALSE) { protti_colours <- "placeholder" # assign a placeholder to prevent a missing global variable warning utils::data("protti_colours", envir = environment()) # then overwrite it with real data + if (!missing(colour)){ + if(length(colour) < 2){ + stop("Please provide more colours!") + } + background <- colour[1] + additional_colour <- colour[-1] + } else { + background <- "grey60" + additional_colour <- protti_colours + } + data <- data %>% tidyr::drop_na({{ log2FC }}, {{ significance }}) @@ -207,7 +224,7 @@ volcano_plot <- function(data, x = {{ log2FC }}, y = -1 * log10({{ significance }}) ), - colour = "grey60" + colour = background ) + geom_point( data = dplyr::filter(data, .data$target == TRUE), @@ -248,7 +265,7 @@ volcano_plot <- function(data, legend.text = ggplot2::element_text(size = 15), strip.background = element_blank() ) + - ggplot2::scale_color_manual(values = protti_colours) + + ggplot2::scale_color_manual(values = additional_colour) + scale_x_continuous(breaks = seq( round(-1 * max(abs(dplyr::pull(data, {{ log2FC }})), na.rm = TRUE) - 0.5, 0), round(max(abs(dplyr::pull(data, {{ log2FC }})), na.rm = TRUE) + 0.5, 0), 1 @@ -278,7 +295,7 @@ volcano_plot <- function(data, x = {{ log2FC }}, y = -log10({{ significance }}) ), - colour = "grey60" + colour = background ) + geom_point( data = dplyr::filter(data, (abs({{ log2FC }}) > log2FC_cutoff) & ({{ significance }} < .data$mean_adjusted_cutoff)), @@ -287,7 +304,7 @@ volcano_plot <- function(data, y = -log10({{ significance }}) ), size = 3, - colour = "#5680C1" + colour = additional_colour[1] ) + labs( title = title, diff --git a/man/volcano_plot.Rd b/man/volcano_plot.Rd index 79a296bf..1eea9168 100644 --- a/man/volcano_plot.Rd +++ b/man/volcano_plot.Rd @@ -18,6 +18,7 @@ 
volcano_plot( x_axis_label = "log2(fold change)", y_axis_label = "-log10(p-value)", legend_label = "Target", + colour = NULL, log2FC_cutoff = 1, significance_cutoff = 0.01, interactive = FALSE @@ -70,6 +71,12 @@ is "Volcano plot".} \item{legend_label}{optional, a character value that specifies the legend label. Default is "Target".} +\item{colour}{optional, a character vector containing colours that should be used to colour +points according to the selected method. IMPORTANT: the first value in the vector is the +default point colour, the additional values specify colouring of target or significant points. +E.g. \code{c("grey60", "#5680C1")} to achieve the same colouring as the default for the "significant" +method.} + \item{log2FC_cutoff}{optional, a numeric value that specifies the log2 transformed fold change cutoff used for the vertical lines, which can be used to assess the significance of changes. Default value is 1.} diff --git a/tests/testthat/test-fetch_extract_and_enrichment_functions.R b/tests/testthat/test-fetch_extract_and_enrichment_functions.R index a745b5aa..7c45410a 100644 --- a/tests/testthat/test-fetch_extract_and_enrichment_functions.R +++ b/tests/testthat/test-fetch_extract_and_enrichment_functions.R @@ -453,7 +453,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { annotations <- fetch_quickgo(type = "annotations", id = uniprot_ids, ontology = "molecular_function") test_that("fetch_quickgo works", { expect_is(annotations, "data.frame") - expect_equal(nrow(annotations), 21) + expect_equal(nrow(annotations), 24) expect_equal(ncol(annotations), 15) terms <- fetch_quickgo(type = "terms") diff --git a/tests/testthat/test-structure_functions.R b/tests/testthat/test-structure_functions.R index 579f8d30..f7892c31 100644 --- a/tests/testthat/test-structure_functions.R +++ b/tests/testthat/test-structure_functions.R @@ -19,7 +19,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { test_that("find_peptide_in_structure works", { expect_is(positions_structure, 
"data.frame") - expect_equal(nrow(positions_structure), 409) + expect_equal(nrow(positions_structure), 451) expect_equal(ncol(positions_structure), 17) }) From 17c0498cbac1ea6f25f1451525045a0cd1dc0f4c Mon Sep 17 00:00:00 2001 From: jpquast Date: Tue, 11 Apr 2023 17:08:35 +0200 Subject: [PATCH 09/71] Correct .data$ issue in some functions --- R/assign_missingness.R | 10 ++-- R/calculate_diff_abundance.R | 24 ++++----- R/create_structure_contact_map.R | 84 ++++++++++++++++---------------- R/create_synthetic_data.R | 18 +++---- R/filter_cv.R | 2 +- R/fit_drc_4p.R | 12 ++--- R/impute.R | 2 +- 7 files changed, 76 insertions(+), 76 deletions(-) diff --git a/R/assign_missingness.R b/R/assign_missingness.R index 5885cbff..d6f6e49e 100644 --- a/R/assign_missingness.R +++ b/R/assign_missingness.R @@ -128,7 +128,7 @@ from the conditions and assigned their missingness. The created comparisons are: # create dataframe that contains all combinations to be tested all_combinations <- all_combinations %>% - tidyr::pivot_longer(cols = c(.data$V1, .data$V2), names_to = "name", values_to = rlang::as_name(rlang::enquo(condition))) %>% + tidyr::pivot_longer(cols = c("V1", "V2"), names_to = "name", values_to = rlang::as_name(rlang::enquo(condition))) %>% dplyr::select(-.data$name) %>% dplyr::group_by({{ condition }}) %>% dplyr::mutate(comparison = list(.data$combinations)) %>% @@ -143,7 +143,7 @@ from the conditions and assigned their missingness. The created comparisons are: dplyr::mutate(n_replicates = dplyr::n()) %>% dplyr::ungroup() %>% dplyr::left_join(all_combinations, by = rlang::as_name(rlang::enquo(condition))) %>% - tidyr::unnest(.data$comparison) + tidyr::unnest("comparison") # check if there are any unequal replicate comparisons unequal_replicates <- data_prep %>% @@ -190,9 +190,9 @@ from the conditions and assigned their missingness. 
The created comparisons are: )) %>% split(.$comparison) %>% purrr::map_df(.f = ~ .x %>% - tidyr::pivot_wider(names_from = .data$type, values_from = c(.data$n_detect, .data$n_replicates)) %>% + tidyr::pivot_wider(names_from = "type", values_from = c("n_detect", "n_replicates")) %>% dplyr::group_by({{ grouping }}) %>% - tidyr::fill(.data$n_detect_treated, .data$n_detect_control, .data$n_replicates_treated, .data$n_replicates_control, .direction = "updown") %>% + tidyr::fill("n_detect_treated", "n_detect_control", "n_replicates_treated", "n_replicates_control", .direction = "updown") %>% dplyr::ungroup() %>% dplyr::mutate(missingness = dplyr::case_when( .data$n_detect_control == .data$n_replicates_control & @@ -204,7 +204,7 @@ from the conditions and assigned their missingness. The created comparisons are: .data$n_detect_control >= max(floor(.data$n_replicates_control * 0.7), 1) & .data$n_detect_treated >= max(floor(.data$n_replicates_control * 0.7), 1) ~ "MAR" ))) %>% - dplyr::select(-c(.data$n_detect_control, .data$n_detect_treated, .data$n_replicates_control, .data$n_replicates_treated)) %>% + dplyr::select(-c("n_detect_control", "n_detect_treated", "n_replicates_control", "n_replicates_treated")) %>% # Arrange by grouping but in a numeric order of the character vector. dplyr::arrange(factor({{ grouping }}, levels = unique(stringr::str_sort({{ grouping }}, numeric = TRUE)))) diff --git a/R/calculate_diff_abundance.R b/R/calculate_diff_abundance.R index 3f879f7b..34c5b247 100644 --- a/R/calculate_diff_abundance.R +++ b/R/calculate_diff_abundance.R @@ -253,7 +253,7 @@ Please provide a valid reference condition.", prefix = "\n", initial = "")) "treated" )) %>% dplyr::select(-{{ condition }}) %>% - tidyr::pivot_wider(names_from = .data$type, values_from = .data$intensity, values_fill = list(NA)) + tidyr::pivot_wider(names_from = "type", values_from = "intensity", values_fill = list(NA)) message("DONE", appendLF = TRUE) message("[2/2] Calculate t-tests ... 
", appendLF = FALSE) @@ -303,7 +303,7 @@ Please provide a valid reference condition.", prefix = "\n", initial = "")) } )) %>% dplyr::mutate(diff = ifelse(diff == "NaN", NA, diff)) %>% - dplyr::select(-c(.data$control, .data$treated)) %>% + dplyr::select(-c("control", "treated")) %>% dplyr::left_join(t_test_missingness_obs, by = c(rlang::as_name(rlang::enquo(grouping)), "comparison")) # save all p-values that are NA to join back after adjustment. Otherwise adjustment is wrong since @@ -408,7 +408,7 @@ missingness type is assigned.\n The created comparisons are: \n", prefix = "\n", n1 = .data$n_control, n2 = .data$n_treated )) %>% - tidyr::drop_na(.data$pval) %>% + tidyr::drop_na("pval") %>% dplyr::group_by(.data$comparison) %>% dplyr::mutate(adj_pval = stats::p.adjust(.data$pval, method = p_adj_method)) %>% dplyr::arrange(.data$adj_pval, .data$pval) @@ -580,13 +580,13 @@ missingness type is assigned.\n The created comparisons are: \n", prefix = "\n", ) %>% dplyr::mutate(comparison = stringr::str_replace_all({{ comparison }}, pattern = "^x|(?<=_vs_)x", replacement = "")) %>% dplyr::rename( - diff = .data$logFC, - CI_2.5 = .data$CI.L, - CI_97.5 = .data$CI.R, - t_statistic = .data$t, - avg_abundance = .data$AveExpr, - pval = .data$P.Value, - adj_pval = .data$adj.P.Val + diff = "logFC", + CI_2.5 = "CI.L", + CI_97.5 = "CI.R", + t_statistic = "t", + avg_abundance = "AveExpr", + pval = "P.Value", + adj_pval = "adj.P.Val" ) %>% dplyr::left_join(moderated_t_test_missingness, by = c(rlang::as_name(rlang::enquo(grouping)), "comparison")) @@ -747,8 +747,8 @@ missingness type is assigned.\n The created comparisons are: \n", prefix = "\n", .f = ~ dplyr::mutate(.x, comparison = str_replace_all(.y, pattern = "`", replacement = "")) ) %>% purrr::map_dfr(~ dplyr::mutate(.x, adj_pval = p.adjust(.data$pval, method = p_adj_method))) %>% - dplyr::select(-.data$n_obs, -.data$n_approx) %>% - dplyr::rename({{ grouping }} := .data$name, std_error = .data$se) %>% + dplyr::select(-c("n_obs", 
"n_approx")) %>% + dplyr::rename({{ grouping }} := "name", std_error = "se") %>% dplyr::left_join(proDA_missingness, by = c(rlang::as_name(rlang::enquo(grouping)), "comparison")) message("DONE", appendLF = TRUE) diff --git a/R/create_structure_contact_map.R b/R/create_structure_contact_map.R index 774503cc..cdb8756a 100644 --- a/R/create_structure_contact_map.R +++ b/R/create_structure_contact_map.R @@ -259,7 +259,7 @@ Please always provide a chain ID for your start and end positions."), ), extra = "drop" ) %>% - dplyr::select(-c(.data$X1, .data$x1, .data$x2, .data$x3, .data$x4)) %>% + dplyr::select(-c("X1", "x1", "x2", "x3", "x4")) %>% dplyr::group_by(.data$label_asym_id, .data$label_atom_id, .data$label_comp_id) %>% dplyr::mutate(label_seq_id = ifelse(.data$label_seq_id == ".", 1:n(), @@ -279,17 +279,17 @@ Please always provide a chain ID for your start and end positions."), ) %>% dplyr::filter(.data$pdb_model_number %in% pdb_model_number_selection) %>% dplyr::select( - .data$label_id, - .data$x, - .data$y, - .data$z, - .data$label_comp_id, - .data$label_seq_id, - .data$label_asym_id, - .data$auth_comp_id, - .data$auth_seq_id, - .data$auth_asym_id, - .data$id + "label_id", + "x", + "y", + "z", + "label_comp_id", + "label_seq_id", + "label_asym_id", + "auth_comp_id", + "auth_seq_id", + "auth_asym_id", + "id" ) %>% dplyr::mutate(retain_pattern = stringr::str_replace_all( paste(.data$id, .data$auth_asym_id, .data$auth_seq_id, sep = "_"), @@ -426,17 +426,17 @@ Please always provide a chain ID for your start and end positions."), structures <- .x %>% dplyr::filter(.data$pdb_model_number %in% pdb_model_number_selection) %>% dplyr::select( - .data$label_id, - .data$x, - .data$y, - .data$z, - .data$label_comp_id, - .data$label_seq_id, - .data$label_asym_id, - .data$auth_comp_id, - .data$auth_seq_id, - .data$auth_asym_id, - .data$pdb_id + "label_id", + "x", + "y", + "z", + "label_comp_id", + "label_seq_id", + "label_asym_id", + "auth_comp_id", + "auth_seq_id", + 
"auth_asym_id", + "pdb_id" ) %>% dplyr::mutate(retain_pattern = stringr::str_replace_all( paste(.data$pdb_id, .data$auth_asym_id, .data$auth_seq_id, sep = "_"), @@ -481,19 +481,19 @@ Please always provide a chain ID for your start and end positions."), } predictions <- .x %>% dplyr::select( - .data$label_id, - .data$x, - .data$y, - .data$z, - .data$label_comp_id, - .data$label_seq_id, - .data$label_asym_id, - .data$auth_comp_id, - .data$auth_seq_id, - .data$auth_asym_id, - .data$uniprot_id, - .data$prediction_score, - .data$score_quality + "label_id", + "x", + "y", + "z", + "label_comp_id", + "label_seq_id", + "label_asym_id", + "auth_comp_id", + "auth_seq_id", + "auth_asym_id", + "uniprot_id", + "prediction_score", + "score_quality" ) %>% dplyr::mutate(retain_pattern = stringr::str_replace_all( paste(.data$uniprot_id, .data$auth_asym_id, .data$auth_seq_id, sep = "_"), @@ -561,11 +561,11 @@ Please always provide a chain ID for your start and end positions."), current_structure1 <- .y %>% dplyr::filter(.data$should_be_retained) %>% - dplyr::select(-c(.data$x, .data$y, .data$z, .data$should_be_retained, .data$should_be_retained2, .data$retain_pattern)) + dplyr::select(-c("x", "y", "z", "should_be_retained", "should_be_retained2", "retain_pattern")) current_structure2 <- .y %>% dplyr::filter(.data$should_be_retained2) %>% - dplyr::select(-c(.data$x, .data$y, .data$z, .data$should_be_retained, .data$should_be_retained2, .data$retain_pattern)) + dplyr::select(-c("x", "y", "z", "should_be_retained", "should_be_retained2", "retain_pattern")) current_structure_minimum1 <- .y %>% dplyr::filter(.data$should_be_retained) %>% @@ -597,7 +597,7 @@ Please always provide a chain ID for your start and end positions."), dplyr::left_join(current_structure_minimum1, by = c("var1" = "label_id")) %>% dplyr::left_join(current_structure_minimum2, by = c("var2" = "label_id")) %>% dplyr::mutate(distance = sqrt((.data$x.x - .data$x.y)^2 + (.data$y.x - .data$y.y)^2 + (.data$z.x - 
.data$z.y)^2)) %>% - dplyr::select(.data$var1, .data$var2, .data$distance) %>% + dplyr::select("var1", "var2", "distance") %>% dplyr::filter(.data$distance <= distance_cutoff) %>% dplyr::left_join(current_structure1 %>% dplyr::select(-.data$id), by = c("var1" = "label_id")) %>% dplyr::left_join(current_structure2, by = c("var2" = "label_id"), suffix = c("_var1", "_var2")) @@ -628,13 +628,13 @@ Please always provide a chain ID for your start and end positions."), dplyr::mutate(min_distance_residue = suppressWarnings(min(.data$distance))) %>% dplyr::ungroup() %>% dplyr::rename( - label_id_var1 = .data$var1, - label_id_var2 = .data$var2 + label_id_var1 = "var1", + label_id_var2 = "var2" ) if (return_min_residue_distance == TRUE) { residue_distance <- residue_distance %>% - dplyr::select(-c(.data$label_id_var1, .data$label_id_var2, .data$distance)) %>% + dplyr::select(-c("label_id_var1", "label_id_var2", "distance")) %>% dplyr::distinct() } diff --git a/R/create_synthetic_data.R b/R/create_synthetic_data.R index 6d926ab3..b59b78ce 100644 --- a/R/create_synthetic_data.R +++ b/R/create_synthetic_data.R @@ -285,15 +285,15 @@ create_synthetic_data <- function(n_proteins, dplyr::ungroup() %>% dplyr::mutate(peptide_intensity = .data$peptide_intensity + .data$offset) %>% dplyr::select(-c( - .data$peptide_intensity_mean, - .data$replicate_sd, - .data$n, - .data$n_change_peptide, - .data$effect, - .data$effect_total, - .data$b, - .data$c, - .data$offset + "peptide_intensity_mean", + "replicate_sd", + "n", + "n_change_peptide", + "effect", + "effect_total", + "b", + "c", + "offset" )) # formula for inflection point and slope sampling roughly simulates # the behaviour of real data. They have been figured out by trial and error. 
diff --git a/R/filter_cv.R b/R/filter_cv.R index 49c8d33d..accd5fa6 100644 --- a/R/filter_cv.R +++ b/R/filter_cv.R @@ -72,7 +72,7 @@ filter_cv <- function(data, dplyr::group_by({{ grouping }}) %>% dplyr::mutate(cv_count = sum(.data$cv_small, na.rm = TRUE)) %>% dplyr::filter(.data$cv_count >= {{ min_conditions }}) %>% - dplyr::select(-c(.data$cv_small, .data$cv_count)) %>% + dplyr::select(-c("cv_small", "cv_count")) %>% dplyr::ungroup() n_groups_end <- length(unique(dplyr::pull(peptide_list, {{ grouping }}))) diff --git a/R/fit_drc_4p.R b/R/fit_drc_4p.R index f635a762..b5595d98 100644 --- a/R/fit_drc_4p.R +++ b/R/fit_drc_4p.R @@ -176,7 +176,7 @@ fit_drc_4p <- function(data, sd = sd({{ response }}, na.rm = TRUE) ) %>% dplyr::distinct({{ grouping }}, {{ dose }}, .data$mean_ratio, .data$sd, .data$n) %>% - tidyr::drop_na(.data$mean_ratio, .data$sd) %>% + tidyr::drop_na("mean_ratio", "sd") %>% anova_protti({{ grouping }}, {{ dose }}, .data$mean_ratio, .data$sd, .data$n) %>% dplyr::distinct({{ grouping }}, .data$pval) %>% dplyr::mutate(anova_adj_pval = stats::p.adjust(.data$pval, method = "BH")) %>% @@ -512,14 +512,14 @@ fit_drc_4p <- function(data, output <- correlation_output %>% dplyr::left_join(line_fit, by = rlang::as_name(rlang::enquo(grouping))) %>% dplyr::group_by({{ grouping }}) %>% - tidyr::nest(plot_curve = c(.data$dose, .data$Prediction, .data$Lower, .data$Upper)) %>% + tidyr::nest(plot_curve = c("dose", "Prediction", "Lower", "Upper")) %>% dplyr::left_join(plot_points, by = rlang::as_name(rlang::enquo(grouping))) %>% tidyr::nest(plot_points = c({{ response }}, {{ dose }})) %>% dplyr::rename( - hill_coefficient = .data$`hill:(Intercept)`, - min_model = .data$`min_value:(Intercept)`, - max_model = .data$`max_value:(Intercept)`, - ec_50 = .data$`ec_50:(Intercept)` + hill_coefficient = "hill:(Intercept)", + min_model = "min_value:(Intercept)", + max_model = "max_value:(Intercept)", + ec_50 = "ec_50:(Intercept)" ) %>% dplyr::ungroup() %>% 
dplyr::arrange(dplyr::desc(.data$correlation)) diff --git a/R/impute.R b/R/impute.R index 1a77d3b4..5bfd7393 100644 --- a/R/impute.R +++ b/R/impute.R @@ -142,7 +142,7 @@ impute <- function(data, {{ intensity_log2 }} )) %>% dplyr::mutate(imputed = is.na({{ intensity_log2 }}) & !is.na(.data$imputed_intensity)) %>% - dplyr::select(-.data$impute, -.data$mean, -.data$sd, -.data$min, -.data$noise_mean) %>% + dplyr::select(-c("impute", "mean", "sd", "min", "noise_mean")) %>% dplyr::arrange(factor({{ grouping }}, levels = unique(stringr::str_sort({{ grouping }}, numeric = TRUE)))) if (missing(retain_columns)) { From 160d1417a7ad8a5bddae8e61cdda596c0dec2c20 Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 19 Apr 2023 11:27:40 +0200 Subject: [PATCH 10/71] Fix bug in qc_plot functions --- .gitignore | 2 ++ NEWS.md | 2 ++ R/qc_intensity_distribution.R | 2 +- R/qc_median_intensities.R | 12 ++++++++---- man/qc_median_intensities.Rd | 2 +- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 23345d35..96d5d919 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,5 @@ inst/doc doc Meta docs +/doc/ +/Meta/ diff --git a/NEWS.md b/NEWS.md index 7c797a65..60562621 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,8 @@ * `qc_proteome_coverage()`: fixed the label order of fractions proteins detected and not detected in the proteome. * `calculate_protein_abundance()` now correctly retains columns if `for_plot = TRUE`. Previously the columns to retain were not joined considering the precursor column, which lead to duplications of information where it did not belong. * `fetch_kegg()` now returns the pathway name again correctly. +* `qc_intensity_distribution()`: If the provided sample column is of type factor, the level order won't be overwritten anymore. +* `qc_median_intensities()`: If the provided sample column is of type factor, the level order won't be overwritten anymore. 
# protti 0.6.0 diff --git a/R/qc_intensity_distribution.R b/R/qc_intensity_distribution.R index ccc52803..58d96ad8 100644 --- a/R/qc_intensity_distribution.R +++ b/R/qc_intensity_distribution.R @@ -59,7 +59,7 @@ qc_intensity_distribution <- function(data, dplyr::distinct({{ sample }}, {{ grouping }}, {{ intensity_log2 }}) %>% tidyr::drop_na({{ intensity_log2 }}) - if (!missing(sample) && is(dplyr::pull(input, {{ sample }}), "factor")) { + if (!missing(sample) && is(dplyr::pull(input, {{ sample }}), "character")) { input <- input %>% dplyr::mutate({{ sample }} := factor({{ sample }}, levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) diff --git a/R/qc_median_intensities.R b/R/qc_median_intensities.R index 87c70110..a2f9f033 100644 --- a/R/qc_median_intensities.R +++ b/R/qc_median_intensities.R @@ -3,7 +3,7 @@ #' Median intensities per run are returned either as a plot or a table. #' #' @param data a data frame that contains at least the input variables. -#' @param sample a character column in the \code{data} data frame that contains the sample name. +#' @param sample a character or factor column in the \code{data} data frame that contains the sample name. #' @param grouping a character column in the \code{data} data frame that contains either precursor or #' peptide identifiers. #' @param intensity a numeric column in the \code{data} data frame that contains intensity values. 
@@ -69,10 +69,14 @@ qc_median_intensities <- function(data, return(table) } + if (is(dplyr::pull(table, {{ sample }}), "character")) { + table <- table %>% + mutate({{ sample }} := factor({{ sample }}, + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + )) + } + plot <- table %>% - mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) - )) %>% ggplot2::ggplot(ggplot2::aes({{ sample }}, .data$median_intensity, group = 1)) + ggplot2::geom_line(size = 1) + ggplot2::labs(title = "Medians of run intensities", x = "", y = "Intensity") + diff --git a/man/qc_median_intensities.Rd b/man/qc_median_intensities.Rd index 86aacc5e..bbb1033f 100644 --- a/man/qc_median_intensities.Rd +++ b/man/qc_median_intensities.Rd @@ -16,7 +16,7 @@ qc_median_intensities( \arguments{ \item{data}{a data frame that contains at least the input variables.} -\item{sample}{a character column in the \code{data} data frame that contains the sample name.} +\item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.} \item{grouping}{a character column in the \code{data} data frame that contains either precursor or peptide identifiers.} From ee4fee3f1a1a30dfb8a6f27c02ca4b700ad0acd2 Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 19 Apr 2023 18:01:27 +0200 Subject: [PATCH 11/71] Bugfix qc_ids --- R/qc_ids.R | 12 ++++++++---- man/qc_ids.Rd | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/R/qc_ids.R b/R/qc_ids.R index b7ee7b68..07ed3279 100644 --- a/R/qc_ids.R +++ b/R/qc_ids.R @@ -5,7 +5,7 @@ #' counted as IDs. #' #' @param data a data frame containing at least sample names and precursor/peptide/protein IDs. -#' @param sample a character column in the \code{data} data frame that contains the sample name. +#' @param sample a character or factor column in the \code{data} data frame that contains the sample name. 
#' @param grouping a character column in the \code{data} data frame that contains either precursor or #' peptide identifiers. #' @param intensity a character column in the \code{data} data frame that contains raw or log2 @@ -99,11 +99,15 @@ or set remove_na_intensities to FALSE", dplyr::distinct() %>% dplyr::ungroup() - if (plot == TRUE) { - plot <- result %>% + if (is(dplyr::pull(input, {{ sample }}), "character")) { + result <- result %>% dplyr::mutate({{ sample }} := factor({{ sample }}, levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) - )) %>% + )) + } + + if (plot == TRUE) { + plot <- result %>% ggplot2::ggplot(aes(x = {{ sample }}, y = .data$count, fill = {{ condition }})) + ggplot2::geom_col(col = "black", size = 1) + { diff --git a/man/qc_ids.Rd b/man/qc_ids.Rd index 50428fc1..eac6d231 100644 --- a/man/qc_ids.Rd +++ b/man/qc_ids.Rd @@ -19,7 +19,7 @@ qc_ids( \arguments{ \item{data}{a data frame containing at least sample names and precursor/peptide/protein IDs.} -\item{sample}{a character column in the \code{data} data frame that contains the sample name.} +\item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.} \item{grouping}{a character column in the \code{data} data frame that contains either precursor or peptide identifiers.} From 8b5992afb8a6a6d779e8f27e2715c727d9a140a6 Mon Sep 17 00:00:00 2001 From: jpquast Date: Wed, 19 Apr 2023 18:09:33 +0200 Subject: [PATCH 12/71] Fix bug in qc_ids 2 --- R/qc_ids.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/qc_ids.R b/R/qc_ids.R index 07ed3279..0207d437 100644 --- a/R/qc_ids.R +++ b/R/qc_ids.R @@ -99,7 +99,7 @@ or set remove_na_intensities to FALSE", dplyr::distinct() %>% dplyr::ungroup() - if (is(dplyr::pull(input, {{ sample }}), "character")) { + if (is(dplyr::pull(result, {{ sample }}), "character")) { result <- result %>% dplyr::mutate({{ sample }} := factor({{ sample }}, levels = unique(stringr::str_sort({{ sample }}, 
numeric = TRUE)) From 99ba98993705d3d2accae7e0efe509b52d4f1992 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 20 Apr 2023 13:33:53 +0200 Subject: [PATCH 13/71] Fix more factor column bugs --- R/qc_charge_states.R | 20 ++++++++++++++++---- R/qc_contaminants.R | 10 ++++++++-- R/qc_missed_cleavages.R | 25 ++++++++++++++++--------- R/qc_peptide_type.R | 28 +++++++++++++++------------- man/qc_charge_states.Rd | 2 +- man/qc_contaminants.Rd | 2 +- man/qc_missed_cleavages.Rd | 2 +- man/qc_peptide_type.Rd | 2 +- 8 files changed, 59 insertions(+), 32 deletions(-) diff --git a/R/qc_charge_states.R b/R/qc_charge_states.R index b7ba13c8..8881344e 100644 --- a/R/qc_charge_states.R +++ b/R/qc_charge_states.R @@ -4,7 +4,7 @@ #' #' @param data a data frame that contains at least sample names, peptide or precursor identifiers #' and missed cleavage counts for each peptide or precursor. -#' @param sample a character column in the \code{data} data frame that contains the sample name. +#' @param sample a character or factor column in the \code{data} data frame that contains the sample name. #' @param grouping a character column in the \code{data} data frame that contains either precursor or #' peptide identifiers. 
#' @param charge_states a character or numeric column in the \code{data} data frame that contains @@ -105,9 +105,15 @@ qc_charge_states <- dplyr::group_by({{ sample }}, {{ charge_states }}) %>% dplyr::summarise(charge_per = n / .data$total_peptides * 100) %>% dplyr::ungroup() %>% - dplyr::mutate({{ charge_states }} := forcats::fct_inorder(factor({{ charge_states }}))) %>% - dplyr::mutate({{ sample }} := factor({{ sample }}, levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)))) + dplyr::mutate({{ charge_states }} := forcats::fct_inorder(factor({{ charge_states }}))) + if (is(dplyr::pull(result, {{ sample }}), "character")) { + result <- result %>% + dplyr::mutate({{ sample }} := factor({{ sample }}, + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + )) + } + if (plot == FALSE) { return(result) } else { @@ -155,8 +161,14 @@ qc_charge_states <- dplyr::summarise(charge_per = .data$sum_intensity_cs / .data$total_intensity * 100) %>% dplyr::ungroup() %>% dplyr::mutate({{ charge_states }} := forcats::fct_inorder(factor({{ charge_states }}))) %>% - dplyr::mutate({{ sample }} := factor({{ sample }}, levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)))) %>% dplyr::distinct() + + if (is(dplyr::pull(result, {{ sample }}), "character")) { + result <- result %>% + dplyr::mutate({{ sample }} := factor({{ sample }}, + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + )) + } if (plot == FALSE) { return(result) diff --git a/R/qc_contaminants.R b/R/qc_contaminants.R index 515e3958..ccaff81c 100644 --- a/R/qc_contaminants.R +++ b/R/qc_contaminants.R @@ -3,7 +3,7 @@ #' Calculates the percentage of contaminating proteins as the share of total intensity. #' #' @param data a data frame that contains at least the input variables. -#' @param sample a character column in the \code{data} data frame that contains the sample names. 
+#' @param sample a character or factor column in the \code{data} data frame that contains the sample names. #' @param protein a character column in the \code{data} data frame that contains protein IDs or #' protein names. #' @param is_contaminant a logical column that indicates if the protein is a contaminant. @@ -76,9 +76,15 @@ qc_contaminants <- function(data, if (plot == FALSE) { return(result) } + + if (is(dplyr::pull(result, {{ sample }}), "character")) { + result <- result %>% + dplyr::mutate({{ sample }} := factor({{ sample }}, + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + )) + } plot_result <- result %>% - dplyr::mutate({{ sample }} := factor({{ sample }}, levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)))) %>% ggplot2::ggplot(ggplot2::aes({{ sample }}, .data$contaminant_percentage, fill = {{ protein }})) + ggplot2::geom_col(col = "black", size = 1) + ggplot2::labs( diff --git a/R/qc_missed_cleavages.R b/R/qc_missed_cleavages.R index 5e0863ad..7497b10f 100644 --- a/R/qc_missed_cleavages.R +++ b/R/qc_missed_cleavages.R @@ -6,7 +6,7 @@ #' #' @param data a data frame containing at least sample names, peptide or precursor identifiers #' and missed cleavage counts for each peptide or precursor. -#' @param sample a character column in the \code{data} data frame that contains the sample name. +#' @param sample a character or factor column in the \code{data} data frame that contains the sample name. #' @param grouping a character column in the \code{data} data frame that contains either precursor or #' peptide identifiers. 
#' @param missed_cleavages a numeric column in the \code{data} data frame that contains the counts @@ -104,10 +104,14 @@ intensities or set remove_na_intensities to FALSE", dplyr::group_by({{ sample }}, {{ missed_cleavages }}) %>% dplyr::summarise(mc_percent = n / .data$total_peptide_count * 100) %>% dplyr::ungroup() %>% - dplyr::mutate({{ missed_cleavages }} := forcats::fct_inorder(factor({{ missed_cleavages }}))) %>% - dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) - )) + dplyr::mutate({{ missed_cleavages }} := forcats::fct_inorder(factor({{ missed_cleavages }}))) + + if (is(dplyr::pull(result, {{ sample }}), "character")) { + result <- result %>% + dplyr::mutate({{ sample }} := factor({{ sample }}, + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + )) + } if (plot == FALSE) { return(result) @@ -165,12 +169,15 @@ intensities or set remove_na_intensities to FALSE", dplyr::summarise(mc_percent = .data$sum_intensity_mc / .data$total_intensity * 100) %>% dplyr::ungroup() %>% dplyr::mutate({{ missed_cleavages }} := forcats::fct_inorder(factor({{ missed_cleavages }}))) %>% - dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) - )) %>% dplyr::distinct() - + if (is(dplyr::pull(result, {{ sample }}), "character")) { + result <- result %>% + dplyr::mutate({{ sample }} := factor({{ sample }}, + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + )) + } + if (plot == FALSE) { return(result) } else { diff --git a/R/qc_peptide_type.R b/R/qc_peptide_type.R index 6f2aae07..3fadc8d2 100644 --- a/R/qc_peptide_type.R +++ b/R/qc_peptide_type.R @@ -4,7 +4,7 @@ #' non-tryptic) for each sample. #' #' @param data a data frame that contains at least the input columns. -#' @param sample a character column in the \code{data} data frame that contains the sample names. 
+#' @param sample a character or factor column in the \code{data} data frame that contains the sample names. #' @param peptide a character column in the \code{data} data frame that contains the peptide #' sequence. #' @param pep_type a character column in the \code{data} data frame that contains the peptide @@ -102,12 +102,16 @@ qc_peptide_type <- function(data, dplyr::mutate(pep_type = factor({{ pep_type }}, levels = c("fully-tryptic", "semi-tryptic", "non-tryptic") )) + + if (is(dplyr::pull(result, {{ sample }}), "character")) { + result <- result %>% + dplyr::mutate({{ sample }} := factor({{ sample }}, + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + )) + } if (plot == TRUE & interactive == FALSE) { plot <- result %>% - dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) - )) %>% ggplot2::ggplot(ggplot2::aes( x = {{ sample }}, y = .data$peptide_type_percent, @@ -141,9 +145,6 @@ qc_peptide_type <- function(data, } if (plot == TRUE & interactive == TRUE) { plot <- result %>% - dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) - )) %>% ggplot2::ggplot(ggplot2::aes({{ sample }}, .data$peptide_type_percent, fill = .data$pep_type)) + ggplot2::geom_col(col = "black", size = 1) + ggplot2::labs( @@ -189,12 +190,16 @@ qc_peptide_type <- function(data, dplyr::mutate(pep_type = factor({{ pep_type }}, levels = c("fully-tryptic", "semi-tryptic", "non-tryptic") )) + + if (is(dplyr::pull(result, {{ sample }}), "character")) { + result <- result %>% + dplyr::mutate({{ sample }} := factor({{ sample }}, + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + )) + } if (plot == TRUE & interactive == FALSE) { plot <- result %>% - dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) - )) %>% ggplot2::ggplot(ggplot2::aes( x = {{ sample }}, y = 
.data$peptide_type_percent, @@ -227,9 +232,6 @@ qc_peptide_type <- function(data, } if (plot == TRUE & interactive == TRUE) { plot <- result %>% - dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) - )) %>% ggplot2::ggplot(ggplot2::aes( x = {{ sample }}, .data$peptide_type_percent, diff --git a/man/qc_charge_states.Rd b/man/qc_charge_states.Rd index 42417da2..06fd7e3c 100644 --- a/man/qc_charge_states.Rd +++ b/man/qc_charge_states.Rd @@ -20,7 +20,7 @@ qc_charge_states( \item{data}{a data frame that contains at least sample names, peptide or precursor identifiers and missed cleavage counts for each peptide or precursor.} -\item{sample}{a character column in the \code{data} data frame that contains the sample name.} +\item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.} \item{grouping}{a character column in the \code{data} data frame that contains either precursor or peptide identifiers.} diff --git a/man/qc_contaminants.Rd b/man/qc_contaminants.Rd index fd296a42..b18736a7 100644 --- a/man/qc_contaminants.Rd +++ b/man/qc_contaminants.Rd @@ -18,7 +18,7 @@ qc_contaminants( \arguments{ \item{data}{a data frame that contains at least the input variables.} -\item{sample}{a character column in the \code{data} data frame that contains the sample names.} +\item{sample}{a character or factor column in the \code{data} data frame that contains the sample names.} \item{protein}{a character column in the \code{data} data frame that contains protein IDs or protein names.} diff --git a/man/qc_missed_cleavages.Rd b/man/qc_missed_cleavages.Rd index fc404726..98499536 100644 --- a/man/qc_missed_cleavages.Rd +++ b/man/qc_missed_cleavages.Rd @@ -20,7 +20,7 @@ qc_missed_cleavages( \item{data}{a data frame containing at least sample names, peptide or precursor identifiers and missed cleavage counts for each peptide or precursor.} -\item{sample}{a character column in the 
\code{data} data frame that contains the sample name.} +\item{sample}{a character or factor column in the \code{data} data frame that contains the sample name.} \item{grouping}{a character column in the \code{data} data frame that contains either precursor or peptide identifiers.} diff --git a/man/qc_peptide_type.Rd b/man/qc_peptide_type.Rd index 1c1a2ab7..cd5c6a6c 100644 --- a/man/qc_peptide_type.Rd +++ b/man/qc_peptide_type.Rd @@ -19,7 +19,7 @@ qc_peptide_type( \arguments{ \item{data}{a data frame that contains at least the input columns.} -\item{sample}{a character column in the \code{data} data frame that contains the sample names.} +\item{sample}{a character or factor column in the \code{data} data frame that contains the sample names.} \item{peptide}{a character column in the \code{data} data frame that contains the peptide sequence.} From b83397b59181929a5581eee6dee8ae55e0d54fd2 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 20 Apr 2023 17:50:32 +0200 Subject: [PATCH 14/71] Improved assign_peptide_type and find_peptide --- NEWS.md | 10 +++++----- R/assign_peptide_type.R | 8 +++++++- R/find_peptide.R | 5 +++++ R/peptide_profile_plot.R | 2 +- man/peptide_profile_plot.Rd | 2 +- 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 60562621..43765ec2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,16 +5,16 @@ * `qc_cvs()` received a new argument called `max_cv` that specifies the maximum CV that should be included in the plot. * `peptide_profile_plot()` received a new argument called `complete_sample`. If set to `TRUE`, each protein gets assigned all sample names that are found in the input data. This ensures that the plot always contains all samples on the x-axis even if there are no measured intensities for a specific sample. The default is `FALSE`, which is the original behaviour of the function. * `volcano_plot()` received the `colour` argument that allows the user to provide custom colours for points. 
+* Increased the speed of `find_peptide()` and `assign_peptide_type()` by only computing on the smallest possible subset of data before joining back to the original data frame. ## Bug fixes * `volcano_plot()` now also works interactively if there are no significant hits. -* `fetch_chebi()`: fixed an issue cased by `na_if()` that changed its behaviour after the recent `dplyr` update. -* `qc_proteome_coverage()`: fixed the label order of fractions proteins detected and not detected in the proteome. +* `fetch_chebi()`: fixed an issue caused by `na_if()` that changed its behaviour after the recent `dplyr` update. +* `qc_proteome_coverage()`: fixed the label order of fractions of proteins detected and not detected in the proteome. * `calculate_protein_abundance()` now correctly retains columns if `for_plot = TRUE`. Previously the columns to retain were not joined considering the precursor column, which lead to duplications of information where it did not belong. -* `fetch_kegg()` now returns the pathway name again correctly. -* `qc_intensity_distribution()`: If the provided sample column is of type factor, the level order won't be overwritten anymore. -* `qc_median_intensities()`: If the provided sample column is of type factor, the level order won't be overwritten anymore. +* `fetch_kegg()` now returns the pathway name correctly again. +* `qc_intensity_distribution()`, `qc_median_intensities()`, `qc_charge_states()`, `qc_contaminants()`, `qc_missed_cleavages()`, `qc_peptide_type()`, `qc_ids()`: If the provided sample column is of type factor, the level order won't be overwritten anymore. 
# protti 0.6.0 diff --git a/R/assign_peptide_type.R b/R/assign_peptide_type.R index 0d42face..92409c0f 100644 --- a/R/assign_peptide_type.R +++ b/R/assign_peptide_type.R @@ -55,6 +55,7 @@ assign_peptide_type <- function(data, last_aa = last_aa, aa_after = aa_after) { data %>% + dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}) %>% dplyr::mutate(N_term_tryp = dplyr::if_else({{ aa_before }} == "" | {{ aa_before }} == "K" | {{ aa_before }} == "R", @@ -72,5 +73,10 @@ assign_peptide_type <- function(data, .data$N_term_tryp + .data$C_term_tryp == 1 ~ "semi-tryptic", .data$N_term_tryp + .data$C_term_tryp == 0 ~ "non-tryptic" )) %>% - dplyr::select(-.data$N_term_tryp, -.data$C_term_tryp) + dplyr::select(-.data$N_term_tryp, -.data$C_term_tryp) %>% + dplyr::right_join(data, by = c( + rlang::as_name(rlang::enquo(aa_before)), + rlang::as_name(rlang::enquo(last_aa)), + rlang::as_name(rlang::enquo(aa_after)) + )) } diff --git a/R/find_peptide.R b/R/find_peptide.R index c4e693c9..d6ddce43 100644 --- a/R/find_peptide.R +++ b/R/find_peptide.R @@ -33,6 +33,7 @@ find_peptide <- function(data, protein_sequence, peptide_sequence) { data %>% + dplyr::distinct({{ protein_sequence }}, {{ peptide_sequence }}) %>% dplyr::mutate( start = stringr::str_locate({{ protein_sequence }}, {{ peptide_sequence }})[, 1], end = stringr::str_locate({{ protein_sequence }}, {{ peptide_sequence }})[, 2] @@ -48,5 +49,9 @@ find_peptide <- dplyr::mutate(aa_after = stringr::str_sub({{ protein_sequence }}, start = .data$end + 1, end = .data$end + 1 + )) %>% + dplyr::right_join(data, c( + rlang::as_name(rlang::enquo(protein_sequence)), + rlang::as_name(rlang::enquo(peptide_sequence)) )) } diff --git a/R/peptide_profile_plot.R b/R/peptide_profile_plot.R index fbd40346..02464de5 100644 --- a/R/peptide_profile_plot.R +++ b/R/peptide_profile_plot.R @@ -33,7 +33,7 @@ plot_peptide_profiles <- function(...) { #' be plotted. This can also be \code{"all"} if plots for all groups should be created. 
Depending #' on the number of elements in your grouping column this can be many plots. #' @param complete_sample a logical value that indicates if samples that are completely missing for -#' a given protein should be anyway shown on the x-axis of the plot. The default value is `FALSE`. +#' a given protein should be shown on the x-axis of the plot anyway. The default value is `FALSE`. #' @param protein_abundance_plot a logical value. If the input for this plot comes directly from #' \code{calculate_protein_abundance} this argument can be set to \code{TRUE}. This displays all #' peptides in gray, while the protein abundance is displayed in green. diff --git a/man/peptide_profile_plot.Rd b/man/peptide_profile_plot.Rd index 1f1c5777..3d02712d 100644 --- a/man/peptide_profile_plot.Rd +++ b/man/peptide_profile_plot.Rd @@ -37,7 +37,7 @@ be plotted. This can also be \code{"all"} if plots for all groups should be crea on the number of elements in your grouping column this can be many plots.} \item{complete_sample}{a logical value that indicates if samples that are completely missing for -a given protein should be anyway shown on the x-axis of the plot. The default value is \code{FALSE}.} +a given protein should be shown on the x-axis of the plot anyway. The default value is \code{FALSE}.} \item{protein_abundance_plot}{a logical value. If the input for this plot comes directly from \code{calculate_protein_abundance} this argument can be set to \code{TRUE}. 
This displays all From d5c6f48198d42aca44551a18e82b8b874b8d95bd Mon Sep 17 00:00:00 2001 From: jpquast Date: Mon, 24 Apr 2023 14:16:38 +0200 Subject: [PATCH 15/71] Update qc_data_completeness --- R/qc_data_completeness.R | 12 ++++++++---- man/qc_data_completeness.Rd | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/R/qc_data_completeness.R b/R/qc_data_completeness.R index 3b1172dc..0c0b0a4e 100644 --- a/R/qc_data_completeness.R +++ b/R/qc_data_completeness.R @@ -4,7 +4,7 @@ #' precursors is present in each sample. #' #' @param data a data frame containing at least the input variables. -#' @param sample a character column in the \code{data} data frame that contains the sample names. +#' @param sample a character or factor column in the \code{data} data frame that contains the sample names. #' @param grouping a character column in the \code{data} data frame that contains either precursor #' or peptide identifiers. #' @param intensity a numeric column in the \code{data} data frame that contains any intensity @@ -91,10 +91,14 @@ qc_data_completeness <- function(data, return(result) } + if (is(dplyr::pull(result, {{ sample }}), "character")) { + result <- result %>% + dplyr::mutate({{ sample }} := factor({{ sample }}, + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + )) + } + completeness_plot <- result %>% - dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) - )) %>% ggplot2::ggplot(ggplot2::aes({{ sample }}, .data$completeness)) + ggplot2::geom_col(fill = "#5680C1", col = "black", size = 1) + { diff --git a/man/qc_data_completeness.Rd b/man/qc_data_completeness.Rd index 921bc87c..3c2fc71c 100644 --- a/man/qc_data_completeness.Rd +++ b/man/qc_data_completeness.Rd @@ -17,7 +17,7 @@ qc_data_completeness( \arguments{ \item{data}{a data frame containing at least the input variables.} -\item{sample}{a character column in the \code{data} data frame that contains the 
sample names.} +\item{sample}{a character or factor column in the \code{data} data frame that contains the sample names.} \item{grouping}{a character column in the \code{data} data frame that contains either precursor or peptide identifiers.} From c24814619d8e7b2e64a4100fdd05c740c52f6a6f Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 17 Aug 2023 18:57:02 +0200 Subject: [PATCH 16/71] Add group to calculate_treatment_enrichment --- R/calculate_go_enrichment.R | 2 +- R/calculate_treatment_enrichment.R | 98 +++++++++++++++++++++------ man/calculate_go_enrichment.Rd | 2 +- man/calculate_treatment_enrichment.Rd | 6 ++ 4 files changed, 86 insertions(+), 22 deletions(-) diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index 9f1e90ed..0eb30eba 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -35,7 +35,7 @@ go_enrichment <- function(...) { #' corresponding protein has a significantly changing peptide. The input data frame may contain #' peptide level information with significance information. The function is able to extract #' protein level information from this. -#' @param group a character column in the \code{data} data frame that contains information by +#' @param group optional, character column in the \code{data} data frame that contains information by #' which the analysis should be grouped. The analysis will be performed separately for each of the #' groups. This is most likely a column that labels separate comparisons of different conditions. #' In protti the `asign_missingness()` function creates such a column automatically. diff --git a/R/calculate_treatment_enrichment.R b/R/calculate_treatment_enrichment.R index 71df748f..56d34927 100644 --- a/R/calculate_treatment_enrichment.R +++ b/R/calculate_treatment_enrichment.R @@ -34,6 +34,10 @@ treatment_enrichment <- function(...) 
{ #' @param binds_treatment a logical column in the \code{data} data frame that indicates if the #' corresponding protein binds to the treatment. This information can be obtained from different #' databases, e.g. UniProt. +#' @param group optional, character column in the \code{data} data frame that contains information by +#' which the analysis should be grouped. The analysis will be performed separately for each of the +#' groups. This is most likely a column that labels separate comparisons of different conditions. +#' In protti the `assign_missingness()` function creates such a column automatically. #' @param treatment_name a character value that indicates the treatment name. It will be included #' in the plot title. #' @param plot a logical value indicating whether the result should be plotted or returned as a @@ -50,6 +54,7 @@ treatment_enrichment <- function(...) { #' @importFrom rlang .data as_name enquo ensym !! #' @importFrom tibble column_to_rownames #' @importFrom magrittr %>% +#' @importFrom purrr map2_dfr #' @export #' #' @examples @@ -92,31 +97,79 @@ calculate_treatment_enrichment <- function(data, protein_id, is_significant, binds_treatment, + group = NULL, treatment_name, plot = TRUE) { - data <- data %>% - dplyr::distinct({{ protein_id }}, {{ is_significant }}, {{ binds_treatment }}) %>% - dplyr::group_by({{ protein_id }}) %>% - dplyr::mutate({{ is_significant }} := ifelse(sum({{ is_significant }}, na.rm = TRUE) > 0, - TRUE, - FALSE - )) %>% - dplyr::distinct() + # to avoid note about no global variable binding. + .
<- NULL + + # group by the "group" argument if provided + if(!missing(group)){ + data <- data %>% + dplyr::ungroup() %>% + dplyr::distinct({{ protein_id }}, {{ is_significant }}, {{ binds_treatment }}, {{group}}) %>% + dplyr::group_by({{ protein_id }}, {{group}}) %>% + dplyr::mutate({{ is_significant }} := ifelse(sum({{ is_significant }}, na.rm = TRUE) > 0, + TRUE, + FALSE + )) %>% + dplyr::ungroup() %>% + dplyr::distinct() + + # Create contingency table + cont_table <- data %>% + dplyr::group_by({{ binds_treatment }}, {{ is_significant }}, {{group}}) %>% + dplyr::summarize(n = dplyr::n_distinct(!!rlang::ensym(protein_id)), .groups = "drop") %>% + dplyr::group_by({{group}}) %>% + tidyr::complete({{ binds_treatment }}, {{ is_significant }}, fill = list(n = 0)) %>% + dplyr::ungroup() + + fisher_test <- cont_table %>% + split(dplyr::pull(., {{ group }})) %>% + purrr::map2_dfr(.y = names(.), + .f = ~ {ftest <- .x %>% + dplyr::select(-{{ group }}) %>% + tidyr::pivot_wider(names_from = {{ is_significant }}, values_from = .data$n) %>% + tibble::column_to_rownames(var = rlang::as_name(rlang::enquo(binds_treatment))) %>% + as.matrix() %>% + stats::fisher.test() + + data.frame(pval = ftest$p.value) %>% + dplyr::mutate({{ group }} := .y) + } + ) + + cont_table <- cont_table %>% + dplyr::left_join(fisher_test, by = rlang::as_name(rlang::enquo(group))) %>% + dplyr::arrange({{ group }}) - cont_table <- data %>% - dplyr::group_by({{ binds_treatment }}, {{ is_significant }}) %>% - dplyr::summarize(n = dplyr::n_distinct(!!rlang::ensym(protein_id)), .groups = "drop") %>% - tidyr::complete({{ binds_treatment }}, {{ is_significant }}, fill = list(n = 0)) + } else { + data <- data %>% + dplyr::ungroup() %>% + dplyr::distinct({{ protein_id }}, {{ is_significant }}, {{ binds_treatment }}) %>% + dplyr::group_by({{ protein_id }}) %>% + dplyr::mutate({{ is_significant }} := ifelse(sum({{ is_significant }}, na.rm = TRUE) > 0, + TRUE, + FALSE + )) %>% + dplyr::ungroup() %>% + 
dplyr::distinct() + # Create contingency table + cont_table <- data %>% + dplyr::group_by({{ binds_treatment }}, {{ is_significant }}) %>% + dplyr::summarize(n = dplyr::n_distinct(!!rlang::ensym(protein_id)), .groups = "drop") %>% + tidyr::complete({{ binds_treatment }}, {{ is_significant }}, fill = list(n = 0)) - fisher_test <- cont_table %>% - tidyr::pivot_wider(names_from = {{ is_significant }}, values_from = .data$n) %>% - tibble::column_to_rownames(var = rlang::as_name(rlang::enquo(binds_treatment))) %>% - as.matrix() %>% - stats::fisher.test() + fisher_test <- cont_table %>% + tidyr::pivot_wider(names_from = {{ is_significant }}, values_from = .data$n) %>% + tibble::column_to_rownames(var = rlang::as_name(rlang::enquo(binds_treatment))) %>% + as.matrix() %>% + stats::fisher.test() - cont_table <- cont_table %>% - dplyr::mutate(pval = fisher_test$p.value) + cont_table <- cont_table %>% + dplyr::mutate(pval = fisher_test$p.value) + } if (plot == FALSE) { return(cont_table) @@ -155,6 +208,9 @@ calculate_treatment_enrichment <- function(data, )) %>% ggplot2::ggplot(ggplot2::aes(.data$name, .data$value)) + ggplot2::geom_col(fill = "cornflowerblue", col = "black", size = 1.2) + + {if (!missing(group)){ + ggplot2::facet_wrap(rlang::new_formula(NULL, rlang::enquo(group))) + }} + ggplot2::labs( title = paste0( "Proteins interacting with ", @@ -188,7 +244,9 @@ calculate_treatment_enrichment <- function(data, ), axis.title.y = ggplot2::element_text( size = 15 - ) + ), + strip.text = ggplot2::element_text(size = 15), + strip.background = element_blank() ) if (plot == TRUE) { return(enrichment_plot) diff --git a/man/calculate_go_enrichment.Rd b/man/calculate_go_enrichment.Rd index df294e76..bd89aed8 100644 --- a/man/calculate_go_enrichment.Rd +++ b/man/calculate_go_enrichment.Rd @@ -30,7 +30,7 @@ corresponding protein has a significantly changing peptide. The input data frame peptide level information with significance information. 
The function is able to extract protein level information from this.} -\item{group}{a character column in the \code{data} data frame that contains information by +\item{group}{optional, character column in the \code{data} data frame that contains information by which the analysis should be grouped. The analysis will be performed separately for each of the groups. This is most likely a column that labels separate comparisons of different conditions. In protti the \code{asign_missingness()} function creates such a column automatically.} diff --git a/man/calculate_treatment_enrichment.Rd b/man/calculate_treatment_enrichment.Rd index 0ddc17ca..6d8a1f10 100644 --- a/man/calculate_treatment_enrichment.Rd +++ b/man/calculate_treatment_enrichment.Rd @@ -9,6 +9,7 @@ calculate_treatment_enrichment( protein_id, is_significant, binds_treatment, + group = NULL, treatment_name, plot = TRUE ) @@ -28,6 +29,11 @@ level information from this.} corresponding protein binds to the treatment. This information can be obtained from different databases, e.g. UniProt.} +\item{group}{optional, character column in the \code{data} data frame that contains information by +which the analysis should be grouped. The analysis will be performed separately for each of the +groups. This is most likely a column that labels separate comparisons of different conditions. +In protti the \code{assign_missingness()} function creates such a column automatically.} + \item{treatment_name}{a character value that indicates the treatment name.
It will be included in the plot title.} From 0c23e5847c89b080e9b0692f04bf73c48516f575 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 17 Aug 2023 18:57:28 +0200 Subject: [PATCH 17/71] Fix calculate_protein_abundance retain_columns --- R/calculate_protein_abundance.R | 16 +++++++++++++++- R/fit_drc_4p.R | 12 ++++++++++++ R/qc_pca.R | 16 +++++++++++++--- data-raw/metal_chebi_uniprot.R | 2 +- man/qc_pca.Rd | 2 +- 5 files changed, 42 insertions(+), 6 deletions(-) diff --git a/R/calculate_protein_abundance.R b/R/calculate_protein_abundance.R index 6424cd1a..a39d1e33 100644 --- a/R/calculate_protein_abundance.R +++ b/R/calculate_protein_abundance.R @@ -116,7 +116,7 @@ calculate_protein_abundance <- function(data, # Filter out any proteins with less than 3 peptides input <- data %>% - dplyr::ungroup() %>% + dplyr::ungroup() %>% dplyr::distinct( {{ sample }}, {{ protein_id }}, @@ -209,6 +209,19 @@ calculate_protein_abundance <- function(data, dplyr::select(-{{ precursor }}) } + if(!missing(retain_columns)){ + protein_intensity_retain <- data %>% + dplyr::select( + !!enquo(retain_columns), + colnames(combined)[!colnames(combined) %in% + c( + rlang::as_name(rlang::enquo(intensity_log2)), + rlang::as_name(rlang::enquo(precursor)) + )]) %>% + dplyr::distinct() %>% + dplyr::mutate({{ precursor }} := "protein_intensity") + } + if (!missing(retain_columns) & for_plot == FALSE) { result <- data %>% dplyr::select( @@ -232,6 +245,7 @@ calculate_protein_abundance <- function(data, )] ) %>% dplyr::distinct() %>% + dplyr::bind_rows(protein_intensity_retain) %>% dplyr::right_join(combined, by = colnames(combined)[!colnames(combined) %in% c( rlang::as_name(rlang::enquo(intensity_log2)) diff --git a/R/fit_drc_4p.R b/R/fit_drc_4p.R index b5595d98..fca3d464 100644 --- a/R/fit_drc_4p.R +++ b/R/fit_drc_4p.R @@ -156,11 +156,17 @@ fit_drc_4p <- function(data, # preprocessing of data data_prep <- data %>% + tidyr::drop_na({{ dose }}) %>% dplyr::ungroup() %>% dplyr::distinct({{ sample }}, {{ 
grouping }}, {{ response }}, {{ dose }}) %>% tidyr::complete(nesting(!!ensym(sample), !!ensym(dose)), !!ensym(grouping)) %>% dplyr::mutate({{ dose }} := as.numeric({{ dose }})) + # If the data_prep data.frame is empty, return an empty data.frame + if (nrow(data_prep) == 0){ + return(data.frame()) + } + if (filter != "none") { n_conditions <- length(unique(dplyr::pull(data_prep, {{ dose }}))) n_replicates <- length(unique(dplyr::pull(data_prep, {{ sample }}))) / n_conditions @@ -179,6 +185,7 @@ fit_drc_4p <- function(data, tidyr::drop_na("mean_ratio", "sd") %>% anova_protti({{ grouping }}, {{ dose }}, .data$mean_ratio, .data$sd, .data$n) %>% dplyr::distinct({{ grouping }}, .data$pval) %>% + tidyr::drop_na(.data$pval) %>% # remove NA pvalues before adjustment! dplyr::mutate(anova_adj_pval = stats::p.adjust(.data$pval, method = "BH")) %>% dplyr::rename(anova_pval = .data$pval) @@ -437,6 +444,11 @@ fit_drc_4p <- function(data, .f = ~ dplyr::mutate(.x, {{ grouping }} := .y) ) + # Return empty data.frame if there are no correlations. This prevents parallel_fit_drc_4p from failing. + if (nrow(correlation_output) == 0){ + return(data.frame()) + } + # creating correlation output data frame correlation_output <- correlation_output %>% diff --git a/R/qc_pca.R b/R/qc_pca.R index 31bc300c..23a1ff45 100644 --- a/R/qc_pca.R +++ b/R/qc_pca.R @@ -9,7 +9,7 @@ #' or peptide identifiers. #' @param intensity a numeric column in the \code{data} data frame that contains the corresponding #' intensity values for each peptide or precursor. -#' @param condition a column in the \code{data} data frame that contains condition information +#' @param condition a numeric or character column in the \code{data} data frame that contains condition information #' (e.g. "treated" and "control"). #' @param components a character vector indicating the two components that should be displayed in #' the plot. By default these are PC1 and PC2.
You can provide these using a character vector of @@ -106,12 +106,13 @@ qc_pca <- dplyr::mutate(dimension = factor(.data$dimension, levels = unique(stringr::str_sort(.data$dimension, numeric = TRUE)) )) + if (plot_style == "pca") { plot <- pca_df %>% ggplot2::ggplot(aes( x = !!rlang::sym(components[1]), y = !!rlang::sym(components[2]), - col = as.character({{ condition }}), + col = {{ condition }}, shape = {{ digestion }} )) + ggplot2::geom_point(size = 3) + @@ -137,7 +138,16 @@ qc_pca <- size = 4, show.legend = FALSE ) + - ggplot2::scale_color_manual(values = protti_colours) + + {if(is.numeric(unique(dplyr::pull(pca_df, {{ condition }})))){ + ggplot2::scale_color_gradientn(colours = c( + "#0D0887", "#2E0595", "#46039F", "#5C01A6", "#7201A8", "#8707A6", "#9A169F", + "#AC2694", "#BC3587", "#CA457A", "#D6556D", "#E26561", "#EB7655", "#F48849", + "#FA9B3D", "#FDAF31", "#FDC527", "#F9DC24", "#F0F921" + )) + } else { + ggplot2::scale_color_manual(values = protti_colours) + } + }+ ggplot2::theme( panel.background = element_blank(), panel.border = element_rect(colour = "black", fill = NA), diff --git a/data-raw/metal_chebi_uniprot.R b/data-raw/metal_chebi_uniprot.R index 2c8f23c1..538f54ae 100644 --- a/data-raw/metal_chebi_uniprot.R +++ b/data-raw/metal_chebi_uniprot.R @@ -73,7 +73,7 @@ metal_chebi_uniprot <- chebi %>% metal_chebi_ids_wo_formula[as.character(id)], metal_chebi_annotation[extract_formula] )) %>% - dplyr::select(-.data$extract_formula) %>% + dplyr::select(-"extract_formula") %>% dplyr::group_by(.data$id) %>% dplyr::mutate(metal_atom_id = paste0(.data$metal_atom_id, collapse = ",")) %>% dplyr::distinct() diff --git a/man/qc_pca.Rd b/man/qc_pca.Rd index 9e788979..407e6f7d 100644 --- a/man/qc_pca.Rd +++ b/man/qc_pca.Rd @@ -27,7 +27,7 @@ or peptide identifiers.} \item{intensity}{a numeric column in the \code{data} data frame that contains the corresponding intensity values for each peptide or precursor.} -\item{condition}{a column in the \code{data} data frame 
that contains condition information +\item{condition}{a numeric or character column in the \code{data} data frame that contains condition information (e.g. "treated" and "control").} \item{components}{a character vector indicating the two components that should be displayed in From 5b3da330820d8f4db4639e687e78042cff9144e9 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 17 Aug 2023 19:12:00 +0200 Subject: [PATCH 18/71] Fix NA issue in calculate_sequence_coverage --- R/calculate_sequence_coverage.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/calculate_sequence_coverage.R b/R/calculate_sequence_coverage.R index 6c938140..0d6125b0 100644 --- a/R/calculate_sequence_coverage.R +++ b/R/calculate_sequence_coverage.R @@ -33,6 +33,7 @@ sequence_coverage <- function(...) { #' @importFrom magrittr %>% #' @importFrom stringr str_count #' @importFrom rlang .data as_name enquo +#' @importFrom tidyr drop_na #' @export #' #' @examples @@ -49,6 +50,9 @@ sequence_coverage <- function(...) { calculate_sequence_coverage <- function(data, protein_sequence, peptides) { result <- data %>% + dplyr::ungroup() %>% + # drop_na prevents function from failing if a protein group contains only NA peptide sequences. 
+ tidyr::drop_na({{ peptides }}) %>% dplyr::distinct({{ protein_sequence }}, {{ peptides }}) %>% dplyr::group_by({{ protein_sequence }}) %>% find_peptide({{ protein_sequence }}, {{ peptides }}) %>% From 8f7fd64104ce6eadfbae13e70f919f8c79152c41 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 17 Aug 2023 19:19:34 +0200 Subject: [PATCH 19/71] Update test-coverage to v2 --- .github/workflows/test-coverage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 0834d63b..2291581b 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 - uses: r-lib/actions/setup-pandoc@v1 From e4cb6f273e56438ce6606d56b1fac23e57bf4204 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 18 Aug 2023 17:56:51 +0200 Subject: [PATCH 20/71] Correct pval label calculate_treatment_enrichment --- .github/workflows/test-coverage.yaml | 2 +- R/calculate_treatment_enrichment.R | 58 ++++++++++++++++++++-------- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 2291581b..5e9f40ba 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -20,7 +20,7 @@ jobs: - uses: r-lib/actions/setup-r@v2 - - uses: r-lib/actions/setup-pandoc@v1 + - uses: r-lib/actions/setup-pandoc@v2 - name: Query dependencies run: | diff --git a/R/calculate_treatment_enrichment.R b/R/calculate_treatment_enrichment.R index 56d34927..1715102f 100644 --- a/R/calculate_treatment_enrichment.R +++ b/R/calculate_treatment_enrichment.R @@ -175,6 +175,19 @@ calculate_treatment_enrichment <- function(data, return(cont_table) } + # Add p-value to group name for plot + if(!missing(group)){ + cont_table <- cont_table %>% + dplyr::mutate(group_pval = 
paste0({{ group }}, " (p-value: ", + ifelse(.data$pval < 0.01, + formatC(.data$pval, + format = "e", digits = 1 + ), + round(.data$pval, digits = 2) + ), + ")")) + } + enrichment_plot <- cont_table %>% dplyr::mutate(total = sum(.data$n)) %>% dplyr::mutate( @@ -209,24 +222,37 @@ calculate_treatment_enrichment <- function(data, ggplot2::ggplot(ggplot2::aes(.data$name, .data$value)) + ggplot2::geom_col(fill = "cornflowerblue", col = "black", size = 1.2) + {if (!missing(group)){ - ggplot2::facet_wrap(rlang::new_formula(NULL, rlang::enquo(group))) + ggplot2::facet_wrap(~ .data$group_pval) }} + - ggplot2::labs( - title = paste0( - "Proteins interacting with ", - treatment_name, - " (p-value: ", - ifelse(cont_table$pval < 0.01, - formatC(cont_table$pval, - format = "e", digits = 1 + { + if (!missing(group)){ + ggplot2::labs( + title = paste0( + "Proteins interacting with ", + treatment_name ), - round(cont_table$pval, digits = 2) - ), - ")" - ), - x = "", - y = paste("Interact with", treatment_name, "[%]") - ) + + x = "", + y = paste("Interact with", treatment_name, "[%]") + ) + } else { + ggplot2::labs( + title = paste0( + "Proteins interacting with ", + treatment_name, + " (p-value: ", + ifelse(cont_table$pval < 0.01, + formatC(cont_table$pval, + format = "e", digits = 1 + ), + round(cont_table$pval, digits = 2) + ), + ")" + ), + x = "", + y = paste("Interact with", treatment_name, "[%]") + ) + } + } + ggplot2::geom_text(aes(label = paste("n =", count)), position = position_stack(vjust = 0.5), size = 8 From 1bc111c1eba8ff501a9d23d6a2946131fffd8850 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 18 Aug 2023 19:10:17 +0200 Subject: [PATCH 21/71] Update tidyeval warnings fetch_chebi, fetch_pdb, fetch_alphafold_prediction, fetch_kegg, fetch_pdb_structure, woods_plot, fixed also a broken test. 
--- R/fetch_alphafold_prediction.R | 24 +- R/fetch_chebi.R | 82 +++--- R/fetch_kegg.R | 4 +- R/fetch_pdb.R | 330 +++++++++++----------- R/fetch_pdb_structure.R | 2 +- R/woods_plot.R | 4 +- tests/testthat/test-structure_functions.R | 2 +- 7 files changed, 226 insertions(+), 222 deletions(-) diff --git a/R/fetch_alphafold_prediction.R b/R/fetch_alphafold_prediction.R index 59bf0023..3945e8ad 100644 --- a/R/fetch_alphafold_prediction.R +++ b/R/fetch_alphafold_prediction.R @@ -282,18 +282,18 @@ fetch_alphafold_prediction <- function(uniprot_ids = NULL, ) ) %>% dplyr::select(-c( - .data$X1, - .data$x1, - .data$x2, - .data$x3, - .data$x4, - .data$x5, - .data$x6, - .data$x7, - .data$formal_charge, - .data$site_occupancy, - .data$entity_id, - .data$pdb_model_number + "X1", + "x1", + "x2", + "x3", + "x4", + "x5", + "x6", + "x7", + "formal_charge", + "site_occupancy", + "entity_id", + "pdb_model_number" )) %>% dplyr::mutate( label_id = as.numeric(.data$label_id), diff --git a/R/fetch_chebi.R b/R/fetch_chebi.R index b2572d05..a74e36d0 100644 --- a/R/fetch_chebi.R +++ b/R/fetch_chebi.R @@ -64,9 +64,9 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { chebi_relation_clean <- chebi_relation %>% dplyr::filter(.data$STATUS == "C") %>% - dplyr::select(-c(.data$ID, .data$STATUS)) %>% - dplyr::rename(incoming = .data$FINAL_ID) %>% - dplyr::rename(ID = .data$INIT_ID) %>% + dplyr::select(-c("ID", "STATUS")) %>% + dplyr::rename(incoming = "FINAL_ID") %>% + dplyr::rename(ID = "INIT_ID") %>% janitor::clean_names() return(chebi_relation_clean) @@ -164,34 +164,34 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { chebi_compounds_clean <- chebi_compounds %>% dplyr::filter(.data$STAR %in% stars) %>% dplyr::select(-c( - .data$SOURCE, - .data$NAME, - .data$STATUS, - .data$MODIFIED_ON, - .data$CREATED_BY + "SOURCE", + "NAME", + "STATUS", + "MODIFIED_ON", + "CREATED_BY" )) %>% - dplyr::mutate(dplyr::across(c(.data$PARENT_ID, .data$DEFINITION), ~ 
dplyr::na_if(.x, "null"))) %>% + dplyr::mutate(dplyr::across(c("PARENT_ID", "DEFINITION"), ~ dplyr::na_if(.x, "null"))) %>% dplyr::mutate(PARENT_ID = as.numeric(.data$PARENT_ID)) chebi_accession_clean <- chebi_accession %>% dplyr::distinct(.data$COMPOUND_ID, .data$TYPE, .data$ACCESSION_NUMBER) %>% - dplyr::rename(ID = .data$COMPOUND_ID) %>% - dplyr::rename(TYPE_ACCESSION = .data$TYPE) + dplyr::rename(ID = "COMPOUND_ID") %>% + dplyr::rename(TYPE_ACCESSION = "TYPE") chebi_chemical_data_clean <- chebi_chemical_data %>% dplyr::distinct(.data$COMPOUND_ID, .data$TYPE, .data$CHEMICAL_DATA) %>% - dplyr::rename(ID = .data$COMPOUND_ID) %>% + dplyr::rename(ID = "COMPOUND_ID") %>% tidyr::pivot_wider( - names_from = .data$TYPE, - values_from = .data$CHEMICAL_DATA, + names_from = "TYPE", + values_from = "CHEMICAL_DATA", values_fn = list ) %>% tidyr::unnest(cols = c( - .data$FORMULA, - .data$MASS, - .data$CHARGE, - .data$`MONOISOTOPIC MASS` + "FORMULA", + "MASS", + "CHARGE", + "MONOISOTOPIC MASS" )) chebi_compounds_names_clean <- chebi_compounds %>% @@ -200,7 +200,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { dplyr::mutate(dplyr::across(c(.data$NAME), ~ dplyr::na_if(.x, "null"))) %>% dplyr::filter(!is.na(.data$NAME)) %>% dplyr::mutate(TYPE_NAME = "STANDARD") %>% - dplyr::select(.data$ID, .data$TYPE_NAME, .data$NAME) + dplyr::select("ID", "TYPE_NAME", "NAME") chebi_names_clean <- chebi_names %>% dplyr::distinct(.data$COMPOUND_ID, .data$NAME, .data$TYPE) %>% @@ -209,8 +209,8 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { chebi <- chebi_compounds_clean %>% dplyr::left_join(chebi_names_clean, by = "ID") %>% - dplyr::left_join(chebi_accession_clean, by = "ID") %>% - dplyr::left_join(chebi_chemical_data_clean, by = "ID") + dplyr::left_join(chebi_accession_clean, by = "ID", relationship = "many-to-many") %>% + dplyr::left_join(chebi_chemical_data_clean, by = "ID", relationship = "many-to-many") # Add info to old compound IDs @@ 
-222,32 +222,32 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { parent_info <- chebi %>% dplyr::filter(.data$ID %in% parent_ids) %>% dplyr::select(c( - .data$ID, - .data$NAME, - .data$TYPE_NAME, - .data$DEFINITION, - .data$TYPE_ACCESSION, - .data$ACCESSION_NUMBER, - .data$FORMULA, - .data$MASS, - .data$CHARGE, - .data$`MONOISOTOPIC MASS` + "ID", + "NAME", + "TYPE_NAME", + "DEFINITION", + "TYPE_ACCESSION", + "ACCESSION_NUMBER", + "FORMULA", + "MASS", + "CHARGE", + "MONOISOTOPIC MASS" )) parent_complete <- chebi %>% dplyr::filter(!is.na(.data$PARENT_ID)) %>% dplyr::select(-c( - .data$NAME, - .data$TYPE_NAME, - .data$DEFINITION, - .data$TYPE_ACCESSION, - .data$ACCESSION_NUMBER, - .data$FORMULA, - .data$MASS, - .data$CHARGE, - .data$`MONOISOTOPIC MASS` + "NAME", + "TYPE_NAME", + "DEFINITION", + "TYPE_ACCESSION", + "ACCESSION_NUMBER", + "FORMULA", + "MASS", + "CHARGE", + "MONOISOTOPIC MASS" )) %>% - dplyr::left_join(parent_info, by = c("PARENT_ID" = "ID")) + dplyr::left_join(parent_info, by = c("PARENT_ID" = "ID"), relationship = "many-to-many") chebi <- chebi %>% dplyr::filter(is.na(.data$PARENT_ID)) %>% diff --git a/R/fetch_kegg.R b/R/fetch_kegg.R index e1233f0a..4eac7605 100644 --- a/R/fetch_kegg.R +++ b/R/fetch_kegg.R @@ -46,7 +46,7 @@ fetch_kegg <- function(species) { return(invisible(NULL)) } colnames(result_name) <- c("pathway_id", "pathway_name") - + # download kegg_id to uniprot_id conversion url_conv <- paste("https://rest.kegg.jp/conv/uniprot", species, sep = "/") result_conv <- try_query(url_conv, col_names = FALSE, progress = FALSE, show_col_types = FALSE) @@ -62,6 +62,6 @@ fetch_kegg <- function(species) { # combine datasets result <- result_link %>% dplyr::left_join(result_name, by = "pathway_id") %>% - dplyr::left_join(result_conv, by = "kegg_id") + dplyr::left_join(result_conv, by = "kegg_id", relationship = "many-to-many") result } diff --git a/R/fetch_pdb.R b/R/fetch_pdb.R index 13eae0b7..f7bb0592 100644 --- a/R/fetch_pdb.R 
+++ b/R/fetch_pdb.R @@ -303,7 +303,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { "entries.rcsb_entry_info" )) %>% tidyr::unnest("entries.exptl") %>% - dplyr::rename(structure_method = .data$method) + dplyr::rename(structure_method = "method") crystal_growth_info <- query_result_clean %>% dplyr::select("pdb_ids", "entries.exptl_crystal_grow") %>% @@ -329,7 +329,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { ) %in% colnames(crystal_growth_info)] crystal_growth_info <- crystal_growth_info %>% - dplyr::select(-should_not_be_here) %>% + dplyr::select(-all_of(should_not_be_here)) %>% dplyr::bind_cols(stats::setNames( data.frame(matrix( ncol = length(should_be_here), @@ -338,21 +338,21 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { should_be_here )) %>% dplyr::rename( - pH_crystallisation = .data$pH, - method_crystallisation = .data$method, - temp_crystallisation = .data$temp + pH_crystallisation = "pH", + method_crystallisation = "method", + temp_crystallisation = "temp" ) resolution_info <- query_result_clean %>% - dplyr::select(.data$pdb_ids, .data$resolution_combined) %>% + dplyr::select("pdb_ids", "resolution_combined") %>% tidyr::unnest(.data$resolution_combined) nmr_info <- query_result_clean %>% dplyr::select( - .data$pdb_ids, - .data$entries.pdbx_nmr_exptl, - .data$entries.pdbx_nmr_exptl_sample_conditions, - .data$entries.pdbx_nmr_refine + "pdb_ids", + "entries.pdbx_nmr_exptl", + "entries.pdbx_nmr_exptl_sample_conditions", + "entries.pdbx_nmr_refine" ) %>% tidyr::unnest(.data$entries.pdbx_nmr_exptl) %>% tidyr::unnest(.data$entries.pdbx_nmr_exptl_sample_conditions) %>% @@ -384,7 +384,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { ) %in% colnames(nmr_info)] nmr_info <- nmr_info %>% - dplyr::select(-should_not_be_here) %>% + dplyr::select(-all_of(should_not_be_here)) %>% dplyr::bind_cols(stats::setNames( data.frame(matrix( ncol = 
length(should_be_here), @@ -393,15 +393,15 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { should_be_here )) %>% dplyr::rename( - type_nmr = .data$type, - pH_nmr = .data$pH, - temp_nmr = .data$temperature, - method_nmr = .data$method, - ionic_strength_nmr = .data$ionic_strength + type_nmr = "type", + pH_nmr = "pH", + temp_nmr = "temperature", + method_nmr = "method", + ionic_strength_nmr = "ionic_strength" ) rcsb_binding_affinity <- query_result_clean %>% - dplyr::select(.data$pdb_ids, .data$entries.rcsb_binding_affinity) %>% + dplyr::select("pdb_ids", "entries.rcsb_binding_affinity") %>% tidyr::unnest(.data$entries.rcsb_binding_affinity) # make sure that the data is complete even if there is no affinity information @@ -421,7 +421,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { ) %in% colnames(rcsb_binding_affinity)] rcsb_binding_affinity <- rcsb_binding_affinity %>% - dplyr::select(-should_not_be_here) %>% + dplyr::select(-all_of(should_not_be_here)) %>% dplyr::bind_cols(stats::setNames( data.frame(matrix( ncol = length(should_be_here), @@ -430,8 +430,8 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { should_be_here )) %>% dplyr::rename( - affinity_comp_id = .data$comp_id, - affinity_value = .data$value + affinity_comp_id = "comp_id", + affinity_value = "value" ) if (show_progress == TRUE) { @@ -442,18 +442,18 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { } polymer_entities <- query_result_clean %>% - dplyr::select(.data$pdb_ids, .data$entries.polymer_entities) %>% + dplyr::select("pdb_ids", "entries.polymer_entities") %>% tidyr::unnest(.data$entries.polymer_entities) %>% dplyr::bind_cols( .$entity_poly, .$rcsb_polymer_entity_container_identifiers ) %>% dplyr::select(-c( - .data$entity_poly, - .data$rcsb_polymer_entity_container_identifiers, - .data$rcsb_entity_source_organism + "entity_poly", + "rcsb_polymer_entity_container_identifiers", + 
"rcsb_entity_source_organism" )) %>% - tidyr::unnest(.data$rcsb_polymer_entity) %>% + tidyr::unnest("rcsb_polymer_entity") %>% dplyr::rowwise() %>% dplyr::mutate(rcsb_non_std_monomers = ifelse(!is.null(unlist(.data$rcsb_non_std_monomers)), paste0(.data$rcsb_non_std_monomers, collapse = ";"), @@ -467,7 +467,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { dplyr::mutate(no_uniprots = is.null(unlist(.data$uniprots))) %>% dplyr::ungroup() %>% dplyr::filter(.data$no_uniprots) %>% - dplyr::select(-c(.data$uniprots, .data$no_uniprots)) + dplyr::select(-c("uniprots", "no_uniprots")) if (nrow(polymer_entities_no_uniprots) > 0) { polymer_entities <- polymer_entities %>% @@ -490,7 +490,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { dplyr::mutate(no_rcsb_polymer_entity_align = is.null(unlist(.data$rcsb_polymer_entity_align))) %>% dplyr::ungroup() %>% dplyr::filter(.data$no_rcsb_polymer_entity_align) %>% - dplyr::select(-c(.data$rcsb_polymer_entity_align, .data$no_rcsb_polymer_entity_align)) + dplyr::select(-c("rcsb_polymer_entity_align", "no_rcsb_polymer_entity_align")) if (nrow(polymer_entities_no_rcsb_polymer_entity_align) > 0) { polymer_entities <- polymer_entities %>% @@ -508,10 +508,10 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { uniprot_container_identifiers = .$rcsb_uniprot_container_identifiers, uniprot_protein = .$rcsb_uniprot_protein ) %>% - dplyr::select(-c(.data$rcsb_uniprot_container_identifiers, .data$rcsb_uniprot_protein)) + dplyr::select(-c("rcsb_uniprot_container_identifiers", "rcsb_uniprot_protein")) } else { polymer_entities <- polymer_entities %>% - dplyr::select(-c(.data$uniprots)) %>% + dplyr::select(-c("uniprots")) %>% dplyr::mutate( uniprot_id = NA, name = data.frame(value = NA) @@ -528,27 +528,27 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { dplyr::filter(.data$no_aligned_regions) polymer_entities <- polymer_entities %>% - 
tidyr::unnest(c(.data$aligned_regions)) %>% + tidyr::unnest(c("aligned_regions")) %>% dplyr::bind_cols(.$name) %>% - dplyr::select(-c(.data$name, .data$entry_id)) %>% - dplyr::rename(name_protein = .data$value) %>% - tidyr::unnest(c(.data$auth_asym_ids, .data$polymer_entity_instances)) %>% + dplyr::select(-c("name", "entry_id")) %>% + dplyr::rename(name_protein = "value") %>% + tidyr::unnest(c("auth_asym_ids", "polymer_entity_instances")) %>% dplyr::bind_cols( rcsb_polymer_entity_instance_container_identifiers = .$rcsb_polymer_entity_instance_container_identifiers ) %>% - dplyr::select(-c(.data$rcsb_polymer_entity_instance_container_identifiers)) + dplyr::select(-c("rcsb_polymer_entity_instance_container_identifiers")) if (nrow(polymer_entities_no_aligned_regions) > 0) { polymer_entities_no_aligned_regions <- polymer_entities_no_aligned_regions %>% - dplyr::select(-c(.data$aligned_regions, .data$no_aligned_regions)) %>% + dplyr::select(-c("aligned_regions", "no_aligned_regions")) %>% dplyr::bind_cols(.$name) %>% - dplyr::select(-c(.data$name, .data$entry_id)) %>% - dplyr::rename(name_protein = .data$value) %>% - tidyr::unnest(c(.data$auth_asym_ids, .data$polymer_entity_instances)) %>% + dplyr::select(-c("name", "entry_id")) %>% + dplyr::rename(name_protein = "value") %>% + tidyr::unnest(c("auth_asym_ids", "polymer_entity_instances")) %>% dplyr::bind_cols( rcsb_polymer_entity_instance_container_identifiers = .$rcsb_polymer_entity_instance_container_identifiers ) %>% - dplyr::select(-c(.data$rcsb_polymer_entity_instance_container_identifiers)) %>% + dplyr::select(-c("rcsb_polymer_entity_instance_container_identifiers")) %>% dplyr::mutate( entity_beg_seq_id = NA, ref_beg_seq_id = NA, @@ -561,15 +561,15 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { } } else { polymer_entities <- polymer_entities %>% - dplyr::select(-c(.data$rcsb_polymer_entity_align)) %>% + dplyr::select(-c("rcsb_polymer_entity_align")) %>% dplyr::bind_cols(.$name) %>% - 
dplyr::select(-c(.data$name, .data$entry_id)) %>% - dplyr::rename(name_protein = .data$value) %>% - tidyr::unnest(c(.data$auth_asym_ids, .data$polymer_entity_instances)) %>% + dplyr::select(-c("name", "entry_id")) %>% + dplyr::rename(name_protein = "value") %>% + tidyr::unnest(c("auth_asym_ids", "polymer_entity_instances")) %>% dplyr::bind_cols( rcsb_polymer_entity_instance_container_identifiers = .$rcsb_polymer_entity_instance_container_identifiers ) %>% - dplyr::select(-c(.data$rcsb_polymer_entity_instance_container_identifiers)) %>% + dplyr::select(-c("rcsb_polymer_entity_instance_container_identifiers")) %>% dplyr::mutate( entity_beg_seq_id = NA, ref_beg_seq_id = NA, @@ -592,7 +592,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { dplyr::mutate(no_ligands = is.null(unlist(.data$rcsb_ligand_neighbors))) %>% dplyr::ungroup() %>% dplyr::filter(.data$no_ligands) %>% - dplyr::select(-c(.data$rcsb_ligand_neighbors, .data$no_ligands)) %>% + dplyr::select(-c("rcsb_ligand_neighbors", "no_ligands")) %>% dplyr::mutate( atom_id = as.character(NA), auth_seq_id = as.integer(NA), @@ -606,20 +606,20 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { ) polymer_entities <- polymer_entities %>% - tidyr::unnest(c(.data$rcsb_ligand_neighbors)) %>% + tidyr::unnest(c("rcsb_ligand_neighbors")) %>% dplyr::bind_rows(polymer_entities_no_ligands) %>% dplyr::mutate(ligand_is_bound = ifelse(.data$ligand_is_bound == "Y", "TRUE", "FALSE")) %>% dplyr::group_by(.data$pdb_ids, .data$auth_asym_id, .data$ligand_entity_id) %>% dplyr::mutate(dplyr::across( .cols = c( - .data$atom_id, - .data$auth_seq_id, - .data$comp_id, - .data$ligand_asym_id, - .data$ligand_atom_id, - .data$ligand_comp_id, - .data$ligand_is_bound, - .data$seq_id + "atom_id", + "auth_seq_id", + "comp_id", + "ligand_asym_id", + "ligand_atom_id", + "ligand_comp_id", + "ligand_is_bound", + "seq_id" ), .fns = ~ paste0(.x, collapse = ";") )) %>% @@ -627,15 +627,15 @@ fetch_pdb <- 
function(pdb_ids, batchsize = 100, show_progress = TRUE) { dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>% dplyr::mutate(dplyr::across( .cols = c( - .data$atom_id, - .data$auth_seq_id, - .data$comp_id, - .data$ligand_asym_id, - .data$ligand_atom_id, - .data$ligand_comp_id, - .data$ligand_entity_id, - .data$ligand_is_bound, - .data$seq_id + "atom_id", + "auth_seq_id", + "comp_id", + "ligand_asym_id", + "ligand_atom_id", + "ligand_comp_id", + "ligand_entity_id", + "ligand_is_bound", + "seq_id" ), .fns = ~ paste0(.x, collapse = "|") )) %>% @@ -656,19 +656,19 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { )) %>% dplyr::ungroup() %>% dplyr::rename( - ligand_donor_atom_id = .data$atom_id, - ligand_donor_auth_seq_id = .data$auth_seq_id, - ligand_donor_id = .data$comp_id, - ligand_id = .data$ligand_comp_id, - ligand_label_asym_id = .data$ligand_asym_id, - ligand_bond_is_covalent_or_coordinating = .data$ligand_is_bound, - ligand_donor_label_seq_id = .data$seq_id + ligand_donor_atom_id = "atom_id", + ligand_donor_auth_seq_id = "auth_seq_id", + ligand_donor_id = "comp_id", + ligand_id = "ligand_comp_id", + ligand_label_asym_id = "ligand_asym_id", + ligand_bond_is_covalent_or_coordinating = "ligand_is_bound", + ligand_donor_label_seq_id = "seq_id" ) if ("rcsb_ligand_neighbors" %in% colnames(polymer_entities)) { # if none of the retrieved entries contains any ligands then this column needs to be removed manually polymer_entities <- polymer_entities %>% - select(-.data$rcsb_ligand_neighbors) + select(-"rcsb_ligand_neighbors") } # extract modified monomer information @@ -679,9 +679,9 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { } rcsb_polymer_entity_feature <- polymer_entities %>% - dplyr::select(.data$pdb_ids, .data$auth_asym_id, .data$rcsb_polymer_entity_feature) %>% - tidyr::unnest(.data$rcsb_polymer_entity_feature) %>% - tidyr::unnest(.data$feature_positions) + dplyr::select("pdb_ids", "auth_asym_id", 
"rcsb_polymer_entity_feature") %>% + tidyr::unnest("rcsb_polymer_entity_feature") %>% + tidyr::unnest("feature_positions") modified_monomer <- rcsb_polymer_entity_feature %>% dplyr::filter(.data$type %in% c("modified_monomer")) %>% @@ -706,9 +706,9 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { } rcsb_polymer_instance_feature_data <- polymer_entities %>% - dplyr::select(.data$pdb_ids, .data$auth_asym_id, .data$rcsb_polymer_instance_feature) %>% - tidyr::unnest(.data$rcsb_polymer_instance_feature) %>% - tidyr::unnest(.data$feature_positions) + dplyr::select("pdb_ids", "auth_asym_id", "rcsb_polymer_instance_feature") %>% + tidyr::unnest("rcsb_polymer_instance_feature") %>% + tidyr::unnest("feature_positions") secondary_structures <- rcsb_polymer_instance_feature_data %>% dplyr::filter(.data$name %in% c("helix", "sheet")) %>% @@ -739,7 +739,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { left_join(modified_monomer, by = c("pdb_ids", "auth_asym_id")) %>% left_join(secondary_structures, by = c("pdb_ids", "auth_asym_id")) %>% left_join(unmodeled_residues, by = c("pdb_ids", "auth_asym_id")) %>% - select(-c(.data$rcsb_polymer_instance_feature, .data$rcsb_polymer_entity_feature)) + select(-c("rcsb_polymer_instance_feature", "rcsb_polymer_entity_feature")) # Modify auth_seq_id positions that are either duplicated or missing. # Missing or duplicated entries are identified by comparing the length of auth_seq_id to the length of the sequence. 
@@ -778,7 +778,11 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { values = as.character(rep(.data$non_consecutive, .data$n_missing) + unlist(.data$replacement_values_addition)) ))) %>% dplyr::ungroup() %>% - dplyr::select(-c(.data$auth_seq_id_pdb_numeric, .data$non_consecutive, .data$n_missing, .data$replacement_values_addition, .data$replacement_positions)) + dplyr::select(-c("auth_seq_id_pdb_numeric", + "non_consecutive", + "n_missing", + "replacement_values_addition", + "replacement_positions")) } # Join corrected entries back polymer_entities <- polymer_entities %>% @@ -808,25 +812,25 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { .data$auth_to_entity_poly_seq_mapping ) %>% dplyr::rename( - auth_asym_ids = .data$auth_asym_id, - pdb_ids = .data$entry_id + auth_asym_ids = "auth_asym_id", + pdb_ids = "entry_id" ) uniprot_info <- polymer_entities %>% dplyr::distinct(.data$uniprot_id, .data$name_protein) %>% dplyr::rename( - reference_database_accession = .data$uniprot_id, - protein_name = .data$name_protein + reference_database_accession = "uniprot_id", + protein_name = "name_protein" ) polymer_entities <- polymer_entities %>% select(-c( - .data$uniprot_id, - .data$name_protein, - .data$asym_id, - .data$auth_asym_id, - .data$entry_id, - .data$auth_to_entity_poly_seq_mapping + "uniprot_id", + "name_protein", + "asym_id", + "auth_asym_id", + "entry_id", + "auth_to_entity_poly_seq_mapping" )) %>% distinct() @@ -837,30 +841,30 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { if (!all(is.na(query_result_clean$entries.nonpolymer_entities))) { nonpolymer_entities <- query_result_clean %>% - dplyr::select(.data$pdb_ids, .data$entries.nonpolymer_entities) %>% - tidyr::unnest(.data$entries.nonpolymer_entities) %>% + dplyr::select("pdb_ids", "entries.nonpolymer_entities") %>% + tidyr::unnest("entries.nonpolymer_entities") %>% dplyr::bind_cols( .$rcsb_nonpolymer_entity_container_identifiers, 
.$nonpolymer_comp ) %>% dplyr::bind_cols(.$chem_comp) %>% dplyr::select(-c( - .data$nonpolymer_comp, - .data$rcsb_nonpolymer_entity_container_identifiers, - .data$chem_comp, - .data$entry_id + "nonpolymer_comp", + "rcsb_nonpolymer_entity_container_identifiers", + "chem_comp", + "entry_id" )) %>% - tidyr::unnest(.data$auth_asym_ids) %>% + tidyr::unnest("auth_asym_ids") %>% dplyr::rename( - name_nonpolymer = .data$name, - formula_nonpolymer = .data$formula, - formula_weight_nonpolymer = .data$formula_weight, - type_nonpolymer = .data$type, - id_nonpolymer = .data$id + name_nonpolymer = "name", + formula_nonpolymer = "formula", + formula_weight_nonpolymer = "formula_weight", + type_nonpolymer = "type", + id_nonpolymer = "id" ) } else { nonpolymer_entities <- polymer_entities %>% - dplyr::select(.data$pdb_ids, .data$auth_asym_ids) %>% + dplyr::select("pdb_ids", "auth_asym_ids") %>% dplyr::mutate( name_nonpolymer = NA, formula_nonpolymer = NA, @@ -872,14 +876,14 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { additional_info <- query_result_clean %>% dplyr::select(-c( - .data$entries.nonpolymer_entities, - .data$entries.polymer_entities, - .data$entries.rcsb_binding_affinity, - .data$entries.pdbx_nmr_exptl, - .data$entries.pdbx_nmr_exptl_sample_conditions, - .data$entries.pdbx_nmr_refine, - .data$entries.exptl_crystal_grow, - .data$resolution_combined + "entries.nonpolymer_entities", + "entries.polymer_entities", + "entries.rcsb_binding_affinity", + "entries.pdbx_nmr_exptl", + "entries.pdbx_nmr_exptl_sample_conditions", + "entries.pdbx_nmr_refine", + "entries.exptl_crystal_grow", + "resolution_combined" )) if (show_progress == TRUE) { @@ -889,7 +893,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { } combined <- polymer_entities %>% - dplyr::full_join(nonpolymer_entities, by = c("pdb_ids", "auth_asym_ids")) %>% + dplyr::full_join(nonpolymer_entities, by = c("pdb_ids", "auth_asym_ids"), relationship = "many-to-many") 
%>% dplyr::left_join(rcsb_binding_affinity, by = "pdb_ids") %>% dplyr::left_join(additional_info, by = "pdb_ids") %>% dplyr::left_join(crystal_growth_info, by = "pdb_ids") %>% @@ -898,12 +902,12 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { dplyr::left_join(uniprot_info, by = "reference_database_accession") %>% dplyr::left_join(entity_instance_info, by = c("pdb_ids", "auth_asym_ids")) %>% dplyr::rename( - auth_asym_id = .data$auth_asym_ids, - label_asym_id = .data$asym_id, - pdb_sequence = .data$pdbx_seq_one_letter_code_can, - auth_seq_id_original = .data$auth_to_entity_poly_seq_mapping, - engineered_mutation = .data$pdbx_mutation, - non_std_monomer = .data$rcsb_non_std_monomers + auth_asym_id = "auth_asym_ids", + label_asym_id = "asym_id", + pdb_sequence = "pdbx_seq_one_letter_code_can", + auth_seq_id_original = "auth_to_entity_poly_seq_mapping", + engineered_mutation = "pdbx_mutation", + non_std_monomer = "rcsb_non_std_monomers" ) %>% dplyr::rowwise() %>% # make character string out of list column @@ -923,52 +927,52 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { ) ) %>% dplyr::select( - .data$pdb_ids, - .data$auth_asym_id, - .data$label_asym_id, - .data$reference_database_accession, - .data$protein_name, - .data$reference_database_name, - .data$entity_beg_seq_id, - .data$ref_beg_seq_id, - .data$length, - .data$pdb_sequence, - .data$auth_seq_id, - .data$auth_seq_id_original, - .data$engineered_mutation, - .data$modified_monomer, - .data$ligand_donor_atom_id, - .data$ligand_donor_auth_seq_id, - .data$ligand_donor_label_seq_id, - .data$ligand_donor_id, - .data$ligand_label_asym_id, - .data$ligand_atom_id, - .data$ligand_id, - .data$ligand_entity_id, - .data$ligand_bond_is_covalent_or_coordinating, - .data$secondary_structure, - .data$unmodeled_structure, - .data$id_nonpolymer, - .data$type_nonpolymer, - .data$formula_weight_nonpolymer, - .data$name_nonpolymer, - .data$formula_nonpolymer, - 
.data$experimental_method, - .data$structure_method, - .data$affinity_comp_id, - .data$affinity_value, - .data$pdbx_keywords, - .data$assembly_count, - .data$inter_mol_metalic_bond_count, - .data$pH_crystallisation, - .data$temp_crystallisation, - .data$method_crystallisation, - .data$type_nmr, - .data$ionic_strength_nmr, - .data$pH_nmr, - .data$temp_nmr, - .data$method_nmr, - .data$resolution_combined + "pdb_ids", + "auth_asym_id", + "label_asym_id", + "reference_database_accession", + "protein_name", + "reference_database_name", + "entity_beg_seq_id", + "ref_beg_seq_id", + "length", + "pdb_sequence", + "auth_seq_id", + "auth_seq_id_original", + "engineered_mutation", + "modified_monomer", + "ligand_donor_atom_id", + "ligand_donor_auth_seq_id", + "ligand_donor_label_seq_id", + "ligand_donor_id", + "ligand_label_asym_id", + "ligand_atom_id", + "ligand_id", + "ligand_entity_id", + "ligand_bond_is_covalent_or_coordinating", + "secondary_structure", + "unmodeled_structure", + "id_nonpolymer", + "type_nonpolymer", + "formula_weight_nonpolymer", + "name_nonpolymer", + "formula_nonpolymer", + "experimental_method", + "structure_method", + "affinity_comp_id", + "affinity_value", + "pdbx_keywords", + "assembly_count", + "inter_mol_metalic_bond_count", + "pH_crystallisation", + "temp_crystallisation", + "method_crystallisation", + "type_nmr", + "ionic_strength_nmr", + "pH_nmr", + "temp_nmr", + "method_nmr", + "resolution_combined" ) if (show_progress == TRUE) { diff --git a/R/fetch_pdb_structure.R b/R/fetch_pdb_structure.R index 28c8ab92..5d527eae 100644 --- a/R/fetch_pdb_structure.R +++ b/R/fetch_pdb_structure.R @@ -152,7 +152,7 @@ fetch_pdb_structure <- function(pdb_ids, return_data_frame = FALSE, show_progres "pdb_model_number" ) ) %>% - dplyr::select(-c(.data$X1, .data$x1, .data$x2, .data$x3, .data$x4)) %>% + dplyr::select(-c("X1", "x1", "x2", "x3", "x4")) %>% dplyr::group_by(.data$label_asym_id, .data$label_atom_id, .data$label_comp_id) %>% dplyr::mutate(label_seq_id = 
ifelse(.data$label_seq_id == ".", 1:n(), diff --git a/R/woods_plot.R b/R/woods_plot.R index 2714d427..9b8330cd 100644 --- a/R/woods_plot.R +++ b/R/woods_plot.R @@ -177,13 +177,13 @@ woods_plot <- function(data, yintercept = -{{ fold_change_cutoff }}, col = "blue", alpha = .8, - size = 0.7 + linewidth = 0.7 ) + ggplot2::geom_hline( yintercept = {{ fold_change_cutoff }}, col = "blue", alpha = .8, - size = 0.7 + linewidth = 0.7 ) + ggplot2::ylim( min(-2.5, dplyr::pull(data, {{ fold_change }})) - 0.5, diff --git a/tests/testthat/test-structure_functions.R b/tests/testthat/test-structure_functions.R index f7892c31..abdc9ccc 100644 --- a/tests/testthat/test-structure_functions.R +++ b/tests/testthat/test-structure_functions.R @@ -19,7 +19,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { test_that("find_peptide_in_structure works", { expect_is(positions_structure, "data.frame") - expect_equal(nrow(positions_structure), 451) + expect_equal(nrow(positions_structure), 457) expect_equal(ncol(positions_structure), 17) }) From f7bdc2416da91fac38648c9aee24b142d889ed51 Mon Sep 17 00:00:00 2001 From: jpquast Date: Sat, 19 Aug 2023 13:28:23 +0200 Subject: [PATCH 22/71] Update more tidyeval warnings Also run stylr on package to fix format --- R/assign_missingness.R | 17 +- R/assign_peptide_type.R | 4 +- R/barcode_plot.R | 17 +- R/calculate_diff_abundance.R | 6 +- R/calculate_go_enrichment.R | 136 ++++---- R/calculate_kegg_enrichment.R | 15 +- R/calculate_protein_abundance.R | 11 +- R/calculate_treatment_enrichment.R | 77 +++-- R/create_queue.R | 10 +- R/create_structure_contact_map.R | 4 +- R/create_synthetic_data.R | 58 ++-- R/extract_metal_binders.R | 183 +++++----- R/fetch_alphafold_prediction.R | 24 +- R/fetch_chebi.R | 39 ++- R/fetch_eco.R | 48 +-- R/fetch_kegg.R | 4 +- R/fetch_metal_pdb.R | 24 +- R/fetch_mobidb.R | 8 +- R/fetch_pdb.R | 23 +- R/fetch_quickgo.R | 40 +-- R/find_peptide_in_structure.R | 48 +-- R/fit_drc_4p.R | 6 +- R/map_peptides_on_structure.R | 50 +-- 
R/peptide_profile_plot.R | 12 +- R/predict_alphafold_domain.R | 8 +- R/qc_charge_states.R | 13 +- R/qc_contaminants.R | 4 +- R/qc_cvs.R | 17 +- R/qc_data_completeness.R | 4 +- R/qc_missed_cleavages.R | 13 +- R/qc_pca.R | 45 +-- R/qc_peptide_type.R | 8 +- R/qc_proteome_coverage.R | 2 +- R/qc_ranked_intensities.R | 2 +- R/randomise_queue.R | 1 - R/volcano_plot.R | 50 +-- R/woods_plot.R | 36 +- README.Rmd | 94 ++--- data-raw/metal_go_slim_subset.R | 22 +- man/calculate_go_enrichment.Rd | 67 ++-- man/calculate_kegg_enrichment.Rd | 15 +- .../data_analysis_dose_response_workflow.Rmd | 185 +++++----- ...nalysis_single_dose_treatment_workflow.Rmd | 148 ++++---- vignettes/input_preparation_workflow.Rmd | 39 +-- vignettes/protein_structure_workflow.Rmd | 326 ++++++++++-------- vignettes/quality_control_workflow.Rmd | 233 +++++++------ 46 files changed, 1164 insertions(+), 1032 deletions(-) diff --git a/R/assign_missingness.R b/R/assign_missingness.R index d6f6e49e..e2fa39d1 100644 --- a/R/assign_missingness.R +++ b/R/assign_missingness.R @@ -152,25 +152,26 @@ from the conditions and assigned their missingness. The created comparisons are: dplyr::group_by(.data$comparison) %>% dplyr::mutate(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% - dplyr::mutate(n_replicates = paste0(.data$n_replicates, collapse = "/")) - - if(any(unequal_replicates$n > 2)){ + dplyr::mutate(n_replicates = paste0(.data$n_replicates, collapse = "/")) + + if (any(unequal_replicates$n > 2)) { stop( "\n", strwrap('Some created comparisons seem to have more than two unequal number of replicates. This usually only happens if the wrong grouping variable was selected. Please check this! The grouping variable should split the dataset so that each sample of each condition only - appears once for each element of the grouping. E.g. grouping peptide: Each peptide should + appears once for each element of the grouping. E.g. 
grouping peptide: Each peptide should only have sample_1 associated once with condition_1 and not twice or more often. If in this case grouping "protein" was inadvertently selected a protein might have multiple peptides, each containing sample_1 of condition_1, which means it appears more than once (appears as many times as there are peptides per protein). This means each condition can have an unequal number of replicates - that is as high as the max number of proteins, which is not the correct calculation for replicates.', - prefix = "\n", initial = ""), "\n" + that is as high as the max number of proteins, which is not the correct calculation for replicates.', + prefix = "\n", initial = "" + ), "\n" ) } - - unequal_replicates <- unequal_replicates %>% + + unequal_replicates <- unequal_replicates %>% dplyr::distinct(.data$n_replicates, .data$comparison) if (nrow(unequal_replicates) != 0) { diff --git a/R/assign_peptide_type.R b/R/assign_peptide_type.R index 92409c0f..a219d610 100644 --- a/R/assign_peptide_type.R +++ b/R/assign_peptide_type.R @@ -55,7 +55,7 @@ assign_peptide_type <- function(data, last_aa = last_aa, aa_after = aa_after) { data %>% - dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}) %>% + dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}) %>% dplyr::mutate(N_term_tryp = dplyr::if_else({{ aa_before }} == "" | {{ aa_before }} == "K" | {{ aa_before }} == "R", @@ -73,7 +73,7 @@ assign_peptide_type <- function(data, .data$N_term_tryp + .data$C_term_tryp == 1 ~ "semi-tryptic", .data$N_term_tryp + .data$C_term_tryp == 0 ~ "non-tryptic" )) %>% - dplyr::select(-.data$N_term_tryp, -.data$C_term_tryp) %>% + dplyr::select(-.data$N_term_tryp, -.data$C_term_tryp) %>% dplyr::right_join(data, by = c( rlang::as_name(rlang::enquo(aa_before)), rlang::as_name(rlang::enquo(last_aa)), diff --git a/R/barcode_plot.R b/R/barcode_plot.R index 850069ef..2d4d3fb5 100644 --- a/R/barcode_plot.R +++ b/R/barcode_plot.R @@ -102,14 +102,15 @@ barcode_plot 
<- function(data, # Create plot data %>% ggplot2::ggplot() + - ggplot2::geom_rect(ggplot2::aes( - ymin = -2.5, - ymax = 2.5, - xmax = {{ end_position }} / {{ protein_length }} * 100, - xmin = ({{ start_position }} - 1) / {{ protein_length }} * 100, - fill = {{ colouring }} - ), - size = 0.7 + ggplot2::geom_rect( + ggplot2::aes( + ymin = -2.5, + ymax = 2.5, + xmax = {{ end_position }} / {{ protein_length }} * 100, + xmin = ({{ start_position }} - 1) / {{ protein_length }} * 100, + fill = {{ colouring }} + ), + size = 0.7 ) + ggplot2::scale_fill_manual(values = c( "#999999", "#5680C1", "#B96DAD", "#64CACA", "#81ABE9", "#F6B8D1", "#99F1E4", "#9AD1FF", "#548BDF", "#A55098", "#3EB6B6", diff --git a/R/calculate_diff_abundance.R b/R/calculate_diff_abundance.R index 34c5b247..57718363 100644 --- a/R/calculate_diff_abundance.R +++ b/R/calculate_diff_abundance.R @@ -382,8 +382,8 @@ missingness type is assigned.\n The created comparisons are: \n", prefix = "\n", } all_combinations <- all_combinations %>% - tidyr::pivot_longer(cols = c(.data$V1, .data$V2), names_to = "name", values_to = rlang::as_name(rlang::enquo(condition))) %>% - dplyr::select(-.data$name) %>% + tidyr::pivot_longer(cols = c("V1", "V2"), names_to = "name", values_to = rlang::as_name(rlang::enquo(condition))) %>% + dplyr::select(-"name") %>% dplyr::group_by({{ condition }}) %>% dplyr::mutate(comparison = list(.data$combinations)) %>% dplyr::distinct(.data$comparison, {{ condition }}) @@ -393,7 +393,7 @@ missingness type is assigned.\n The created comparisons are: \n", prefix = "\n", dplyr::distinct({{ condition }}, {{ grouping }}, {{ mean }}, {{ sd }}, {{ n_samples }}) %>% tidyr::drop_na() %>% dplyr::left_join(all_combinations, by = rlang::as_name(rlang::enquo(condition))) %>% - tidyr::unnest(.data$comparison) %>% + tidyr::unnest("comparison") %>% dplyr::rename(mean = {{ mean }}, sd = {{ sd }}, n = {{ n_samples }}) %>% dplyr::mutate({{ condition }} := ifelse({{ condition }} == 
stringr::str_extract(.data$comparison, pattern = "(?<=_vs_).+"), "control", diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index 0eb30eba..9f2deb8b 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -98,42 +98,41 @@ go_enrichment <- function(...) { #' ) #' ) #' -#' if(!is(data, "character")){ +#' if (!is(data, "character")) { +#' data <- uniprot_go_data %>% +#' mutate(significant = c( +#' rep(TRUE, 1000), +#' rep(FALSE, n() - 1000) +#' )) %>% +#' mutate(significant = ifelse( +#' str_detect( +#' go_f, +#' pattern = "ribosome" +#' ), +#' FALSE, +#' significant +#' )) #' -#' data <- uniprot_go_data %>% -#' mutate(significant = c( -#' rep(TRUE, 1000), -#' rep(FALSE, n() - 1000) -#' )) %>% -#' mutate(significant = ifelse( -#' str_detect( -#' go_f, -#' pattern = "ribosome" -#' ), -#' FALSE, -#' significant -#' )) -#' -#' # Plot gene ontology enrichment -#' calculate_go_enrichment( -#' data, -#' protein_id = accession, -#' go_annotations_uniprot = go_f, -#' is_significant = significant, -#' plot = TRUE, -#' plot_cutoff = "pval 0.01" -#' ) +#' # Plot gene ontology enrichment +#' calculate_go_enrichment( +#' data, +#' protein_id = accession, +#' go_annotations_uniprot = go_f, +#' is_significant = significant, +#' plot = TRUE, +#' plot_cutoff = "pval 0.01" +#' ) #' -#' # Calculate gene ontology enrichment -#' go_enrichment <- calculate_go_enrichment( -#' data, -#' protein_id = accession, -#' go_annotations_uniprot = go_f, -#' is_significant = significant, -#' plot = FALSE, -#' ) +#' # Calculate gene ontology enrichment +#' go_enrichment <- calculate_go_enrichment( +#' data, +#' protein_id = accession, +#' go_annotations_uniprot = go_f, +#' is_significant = significant, +#' plot = FALSE, +#' ) #' -#' head(go_enrichment, n = 10) +#' head(go_enrichment, n = 10) #' } #' } calculate_go_enrichment <- function(data, @@ -155,11 +154,11 @@ calculate_go_enrichment <- function(data, if (length(unique(dplyr::pull(data, {{ protein_id 
}}))) != nrow(data)) { # group by the "group" argument if provided - if(!missing(group)){ + if (!missing(group)) { data <- data %>% dplyr::ungroup() %>% - dplyr::distinct({{ protein_id }}, {{ is_significant }}, {{ go_annotations_uniprot }}, {{group}}) %>% - dplyr::group_by({{ protein_id }}, {{group}}) %>% + dplyr::distinct({{ protein_id }}, {{ is_significant }}, {{ go_annotations_uniprot }}, {{ group }}) %>% + dplyr::group_by({{ protein_id }}, {{ group }}) %>% dplyr::mutate({{ is_significant }} := ifelse(sum({{ is_significant }}, na.rm = TRUE) > 0, TRUE, FALSE)) %>% # do this to remove accidental double annotations dplyr::ungroup() %>% @@ -242,7 +241,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) if (!missing(go_annotations_uniprot)) go_data <- input - if (missing(group)){ + if (missing(group)) { # group argument is missing cont_table <- go_data %>% tidyr::drop_na(.data$go_id, {{ is_significant }}) %>% @@ -258,9 +257,9 @@ if you used the right organism ID.", prefix = "\n", initial = "")) fisher_test <- cont_table %>% split(dplyr::pull(., .data$go_id)) %>% purrr::map(.f = ~ dplyr::select(.x, -.data$go_id) %>% - tibble::column_to_rownames(var = rlang::as_name(enquo(is_significant))) %>% - as.matrix() %>% - fisher.test()) %>% + tibble::column_to_rownames(var = rlang::as_name(enquo(is_significant))) %>% + as.matrix() %>% + fisher.test()) %>% purrr::map2_df( .y = names(.), .f = ~ tibble::tibble( @@ -291,37 +290,40 @@ if you used the right organism ID.", prefix = "\n", initial = "")) # group argument is not missing cont_table <- go_data %>% tidyr::drop_na(.data$go_id, {{ is_significant }}) %>% - dplyr::group_by({{ is_significant }}, {{group}}) %>% + dplyr::group_by({{ is_significant }}, {{ group }}) %>% dplyr::mutate(n_sig = dplyr::n_distinct(.data$protein_id)) %>% - dplyr::group_by(.data$go_id, {{ is_significant }}, {{group}}) %>% + dplyr::group_by(.data$go_id, {{ is_significant }}, {{ group }}) %>% dplyr::mutate(n_has_process = 
dplyr::n_distinct(.data$protein_id)) %>% # count number of proteins with process for sig and non-sig proteins - dplyr::distinct(.data$go_id, {{ is_significant }}, .data$n_sig, .data$n_has_process, {{group}}) %>% - dplyr::group_by({{group}}) %>% + dplyr::distinct(.data$go_id, {{ is_significant }}, .data$n_sig, .data$n_has_process, {{ group }}) %>% + dplyr::group_by({{ group }}) %>% tidyr::complete(.data$go_id, tidyr::nesting(!!rlang::ensym(is_significant), n_sig), fill = list(n_has_process = 0)) %>% dplyr::ungroup() fisher_test <- cont_table %>% split(dplyr::pull(., {{ group }})) %>% - purrr::map2_dfr(.y = names(.), - .f = ~ { - .x %>% - dplyr::select(-{{ group }}) %>% - split(dplyr::pull(., .data$go_id)) %>% - purrr::map(.f = ~ {dplyr::select(.x, -c(.data$go_id)) %>% - tibble::column_to_rownames(var = rlang::as_name(enquo(is_significant))) %>% - as.matrix() %>% - fisher.test() - }) %>% - purrr::map2_dfr( - .y = names(.), - .f = ~ tibble::tibble( - pval = .x$p.value, - go_id = .y - ) - ) %>% - mutate({{ group }} := .y) - }) + purrr::map2_dfr( + .y = names(.), + .f = ~ { + .x %>% + dplyr::select(-{{ group }}) %>% + split(dplyr::pull(., .data$go_id)) %>% + purrr::map(.f = ~ { + dplyr::select(.x, -c(.data$go_id)) %>% + tibble::column_to_rownames(var = rlang::as_name(enquo(is_significant))) %>% + as.matrix() %>% + fisher.test() + }) %>% + purrr::map2_dfr( + .y = names(.), + .f = ~ tibble::tibble( + pval = .x$p.value, + go_id = .y + ) + ) %>% + mutate({{ group }} := .y) + } + ) result_table <- cont_table %>% dplyr::left_join(fisher_test, by = c("go_id", rlang::as_name(enquo(group)))) %>% @@ -352,7 +354,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) return(result_table) } - if (!missing(group) & y_axis_free){ + if (!missing(group) & y_axis_free) { # arrange table by group and go term for plot # this ensures that the terms are in the right order for a facet plot with a free axis result_table <- result_table %>% @@ -423,8 +425,8 @@ if you used the 
right organism ID.", prefix = "\n", initial = "")) ggplot2::scale_y_continuous(breaks = seq(0, 100, 1)) + ggplot2::coord_flip() + { - if (!missing(group)){ - if (y_axis_free){ + if (!missing(group)) { + if (y_axis_free) { ggplot2::facet_wrap(rlang::new_formula(NULL, rlang::enquo(group)), scales = "free_y") } else { ggplot2::facet_wrap(rlang::new_formula(NULL, rlang::enquo(group))) @@ -432,7 +434,7 @@ if you used the right organism ID.", prefix = "\n", initial = "")) } } + { - if (!missing(group)){ + if (!missing(group)) { # if axis were free then the special naming that ensures the right order needs to be removed again scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) } diff --git a/R/calculate_kegg_enrichment.R b/R/calculate_kegg_enrichment.R index 3a25bebc..3d8d341d 100644 --- a/R/calculate_kegg_enrichment.R +++ b/R/calculate_kegg_enrichment.R @@ -71,13 +71,14 @@ kegg_enrichment <- function(...) { #' if (!is.null(kegg_data)) { # only proceed if information was retrieved #' data <- kegg_data %>% #' group_by(uniprot_id) %>% -#' mutate(significant = rep(sample( -#' x = c(TRUE, FALSE), -#' size = 1, -#' replace = TRUE, -#' prob = c(0.2, 0.8) -#' ), -#' n = n() +#' mutate(significant = rep( +#' sample( +#' x = c(TRUE, FALSE), +#' size = 1, +#' replace = TRUE, +#' prob = c(0.2, 0.8) +#' ), +#' n = n() #' )) #' #' # Plot KEGG enrichment diff --git a/R/calculate_protein_abundance.R b/R/calculate_protein_abundance.R index a39d1e33..d85f3a75 100644 --- a/R/calculate_protein_abundance.R +++ b/R/calculate_protein_abundance.R @@ -209,15 +209,16 @@ calculate_protein_abundance <- function(data, dplyr::select(-{{ precursor }}) } - if(!missing(retain_columns)){ + if (!missing(retain_columns)) { protein_intensity_retain <- data %>% dplyr::select( !!enquo(retain_columns), colnames(combined)[!colnames(combined) %in% - c( - rlang::as_name(rlang::enquo(intensity_log2)), - rlang::as_name(rlang::enquo(precursor)) - )]) %>% + c( + rlang::as_name(rlang::enquo(intensity_log2)), + 
rlang::as_name(rlang::enquo(precursor)) + )] + ) %>% dplyr::distinct() %>% dplyr::mutate({{ precursor }} := "protein_intensity") } diff --git a/R/calculate_treatment_enrichment.R b/R/calculate_treatment_enrichment.R index 1715102f..d0fde9cf 100644 --- a/R/calculate_treatment_enrichment.R +++ b/R/calculate_treatment_enrichment.R @@ -104,53 +104,54 @@ calculate_treatment_enrichment <- function(data, . <- NULL # group by the "group" argument if provided - if(!missing(group)){ + if (!missing(group)) { data <- data %>% dplyr::ungroup() %>% - dplyr::distinct({{ protein_id }}, {{ is_significant }}, {{ binds_treatment }}, {{group}}) %>% - dplyr::group_by({{ protein_id }}, {{group}}) %>% + dplyr::distinct({{ protein_id }}, {{ is_significant }}, {{ binds_treatment }}, {{ group }}) %>% + dplyr::group_by({{ protein_id }}, {{ group }}) %>% dplyr::mutate({{ is_significant }} := ifelse(sum({{ is_significant }}, na.rm = TRUE) > 0, - TRUE, - FALSE + TRUE, + FALSE )) %>% dplyr::ungroup() %>% dplyr::distinct() # Create contingency table cont_table <- data %>% - dplyr::group_by({{ binds_treatment }}, {{ is_significant }}, {{group}}) %>% + dplyr::group_by({{ binds_treatment }}, {{ is_significant }}, {{ group }}) %>% dplyr::summarize(n = dplyr::n_distinct(!!rlang::ensym(protein_id)), .groups = "drop") %>% - dplyr::group_by({{group}}) %>% + dplyr::group_by({{ group }}) %>% tidyr::complete({{ binds_treatment }}, {{ is_significant }}, fill = list(n = 0)) %>% dplyr::ungroup() fisher_test <- cont_table %>% split(dplyr::pull(., {{ group }})) %>% - purrr::map2_dfr(.y = names(.), - .f = ~ {ftest <- .x %>% - dplyr::select(-{{ group }}) %>% - tidyr::pivot_wider(names_from = {{ is_significant }}, values_from = .data$n) %>% - tibble::column_to_rownames(var = rlang::as_name(rlang::enquo(binds_treatment))) %>% - as.matrix() %>% - stats::fisher.test() + purrr::map2_dfr( + .y = names(.), + .f = ~ { + ftest <- .x %>% + dplyr::select(-{{ group }}) %>% + tidyr::pivot_wider(names_from = {{ is_significant 
}}, values_from = .data$n) %>% + tibble::column_to_rownames(var = rlang::as_name(rlang::enquo(binds_treatment))) %>% + as.matrix() %>% + stats::fisher.test() - data.frame(pval = ftest$p.value) %>% - dplyr::mutate({{ group }} := .y) - } + data.frame(pval = ftest$p.value) %>% + dplyr::mutate({{ group }} := .y) + } ) cont_table <- cont_table %>% dplyr::left_join(fisher_test, by = rlang::as_name(rlang::enquo(group))) %>% dplyr::arrange({{ group }}) - } else { data <- data %>% dplyr::ungroup() %>% dplyr::distinct({{ protein_id }}, {{ is_significant }}, {{ binds_treatment }}) %>% dplyr::group_by({{ protein_id }}) %>% dplyr::mutate({{ is_significant }} := ifelse(sum({{ is_significant }}, na.rm = TRUE) > 0, - TRUE, - FALSE + TRUE, + FALSE )) %>% dplyr::ungroup() %>% dplyr::distinct() @@ -176,16 +177,18 @@ calculate_treatment_enrichment <- function(data, } # Add p-value to group name for plot - if(!missing(group)){ + if (!missing(group)) { cont_table <- cont_table %>% - dplyr::mutate(group_pval = paste0({{ group }}, " (p-value: ", - ifelse(.data$pval < 0.01, - formatC(.data$pval, - format = "e", digits = 1 - ), - round(.data$pval, digits = 2) - ), - ")")) + dplyr::mutate(group_pval = paste0( + {{ group }}, " (p-value: ", + ifelse(.data$pval < 0.01, + formatC(.data$pval, + format = "e", digits = 1 + ), + round(.data$pval, digits = 2) + ), + ")" + )) } enrichment_plot <- cont_table %>% @@ -221,11 +224,13 @@ calculate_treatment_enrichment <- function(data, )) %>% ggplot2::ggplot(ggplot2::aes(.data$name, .data$value)) + ggplot2::geom_col(fill = "cornflowerblue", col = "black", size = 1.2) + - {if (!missing(group)){ + { + if (!missing(group)) { ggplot2::facet_wrap(~ .data$group_pval) - }} + + } + } + { - if (!missing(group)){ + if (!missing(group)) { ggplot2::labs( title = paste0( "Proteins interacting with ", @@ -241,10 +246,10 @@ calculate_treatment_enrichment <- function(data, treatment_name, " (p-value: ", ifelse(cont_table$pval < 0.01, - formatC(cont_table$pval, - format = 
"e", digits = 1 - ), - round(cont_table$pval, digits = 2) + formatC(cont_table$pval, + format = "e", digits = 1 + ), + round(cont_table$pval, digits = 2) ), ")" ), diff --git a/R/create_queue.R b/R/create_queue.R index ef7bb74d..48c005ae 100644 --- a/R/create_queue.R +++ b/R/create_queue.R @@ -203,10 +203,10 @@ create_queue <- ) )) %>% dplyr::select(-c( - .data$t1_null, - .data$t2_null, - .data$d1_null, - .data$d2_null + "t1_null", + "t2_null", + "d1_null", + "d2_null" )) sample_name <- sample_name %>% @@ -285,7 +285,7 @@ create_queue <- result <- data %>% dplyr::mutate(`Sample Type` = "Unknown") %>% - dplyr::select(.data$`Sample Type`, .data$`File Name`) %>% + dplyr::select("Sample Type", "File Name") %>% dplyr::mutate(`Sample ID` = 1) %>% dplyr::mutate(Path = data_path) %>% dplyr::mutate(`Instrument Method` = method_path) %>% diff --git a/R/create_structure_contact_map.R b/R/create_structure_contact_map.R index cdb8756a..0f76afac 100644 --- a/R/create_structure_contact_map.R +++ b/R/create_structure_contact_map.R @@ -202,7 +202,7 @@ Please always provide a chain ID for your start and end positions."), list(NA) )) %>% dplyr::ungroup() %>% - tidyr::unnest(.data$residue) %>% + tidyr::unnest("residue") %>% dplyr::mutate(retain_pattern = stringr::str_replace_all( paste({{ id }}, {{ chain }}, .data$residue, sep = "_"), pattern = "_NA", @@ -447,7 +447,7 @@ Please always provide a chain ID for your start and end positions."), .data$retain_pattern, pattern = paste(paste0(data_retain_pattern1, "(?=$|_)"), collapse = "|") )) %>% - dplyr::rename(id = .data$pdb_id) + dplyr::rename(id = "pdb_id") if (data2_missing) { structures %>% diff --git a/R/create_synthetic_data.R b/R/create_synthetic_data.R index b59b78ce..04333d67 100644 --- a/R/create_synthetic_data.R +++ b/R/create_synthetic_data.R @@ -147,7 +147,7 @@ create_synthetic_data <- function(n_proteins, ), digits = 4 ))) %>% - tidyr::unnest(.data$peptide_intensity_mean) %>% + tidyr::unnest("peptide_intensity_mean") %>% 
dplyr::mutate(peptide = paste0( "peptide_", stringr::str_extract( .data$protein, @@ -155,20 +155,21 @@ create_synthetic_data <- function(n_proteins, ), "_", 1:dplyr::n() )) %>% - dplyr::mutate(replicate_sd = round(stats::rlnorm( - n = 1, - meanlog = mean_log_replicates, - sdlog = sd_log_replicates - ), - digits = 4 + dplyr::mutate(replicate_sd = round( + stats::rlnorm( + n = 1, + meanlog = mean_log_replicates, + sdlog = sd_log_replicates + ), + digits = 4 )) %>% - dplyr::select(-c(.data$mean, .data$sd)) + dplyr::select(-c("mean", "sd")) # sample peptide intensities for replicates and conditions proteins_replicates <- proteins %>% dplyr::group_by(.data$peptide) %>% dplyr::mutate(condition = list(sort(rep(paste0("condition_", 1:n_conditions), n_replicates)))) %>% - tidyr::unnest(c(.data$condition)) %>% + tidyr::unnest(c("condition")) %>% dplyr::mutate(sample = paste0("sample_", 1:(n_conditions * n_replicates))) %>% dplyr::ungroup() %>% dplyr::mutate(peptide_intensity = stats::rnorm( @@ -215,12 +216,12 @@ create_synthetic_data <- function(n_proteins, dplyr::ungroup() %>% dplyr::mutate(peptide_intensity = .data$peptide_intensity + .data$offset) %>% dplyr::select(-c( - .data$peptide_intensity_mean, - .data$replicate_sd, - .data$effect, - .data$n, - .data$n_change_peptide, - .data$offset + "peptide_intensity_mean", + "replicate_sd", + "effect", + "n", + "n_change_peptide", + "offset" )) } @@ -255,17 +256,18 @@ create_synthetic_data <- function(n_proteins, dplyr::group_by(.data$peptide) %>% dplyr::mutate(effect_total = rep(.data$effect_total[1], (n_replicates * n_conditions))) %>% dplyr::ungroup() %>% - dplyr::mutate(b = sample(c( - stats::rlnorm(dplyr::n(), - meanlog = 0.6, - sdlog = 0.4 + dplyr::mutate(b = sample( + c( + stats::rlnorm(dplyr::n(), + meanlog = 0.6, + sdlog = 0.4 + ), + -rlnorm(dplyr::n(), + meanlog = 0.6, + sdlog = 0.4 + ) ), - -rlnorm(dplyr::n(), - meanlog = 0.6, - sdlog = 0.4 - ) - ), - size = dplyr::n() + size = dplyr::n() )) %>% dplyr::mutate(c = 
stats::rlnorm(dplyr::n(), meanlog = log(mean(concentrations) / 2), @@ -311,12 +313,12 @@ create_synthetic_data <- function(n_proteins, .data$peptide_intensity, NA )) %>% - dplyr::select(-.data$dropout_probability) %>% + dplyr::select(-"dropout_probability") %>% dplyr::group_by(.data$peptide) %>% dplyr::mutate(isna = sum(!is.na(.data$peptide_intensity_missing))) %>% # remove peptides for which every intensity is NA after dropout dplyr::filter(.data$isna > 0) %>% - dplyr::select(-.data$isna) %>% + dplyr::select(-"isna") %>% dplyr::ungroup() if (additional_metadata == FALSE) { @@ -334,7 +336,7 @@ create_synthetic_data <- function(n_proteins, dplyr::mutate(coverage_peptide = .data$coverage / dplyr::n_distinct(.data$peptide)) %>% dplyr::group_by(.data$sample, .data$protein) %>% dplyr::mutate(coverage = sum(!is.na(.data$peptide_intensity_missing)) * .data$coverage_peptide) %>% - dplyr::select(-.data$coverage_peptide) %>% + dplyr::select(-"coverage_peptide") %>% dplyr::ungroup() # adding missed cleavage estimates based on poisson distribution diff --git a/R/extract_metal_binders.R b/R/extract_metal_binders.R index ade8b27e..bc78a5e5 100644 --- a/R/extract_metal_binders.R +++ b/R/extract_metal_binders.R @@ -307,13 +307,13 @@ extract_metal_binders <- function(data_uniprot, b_uniprot <- data_uniprot %>% dplyr::distinct(.data$accession, .data$ft_binding) %>% - tidyr::drop_na(.data$ft_binding) %>% + tidyr::drop_na("ft_binding") %>% # Extract each position dplyr::mutate(ft_binding = stringr::str_extract_all( .data$ft_binding, pattern = "BINDING.+?(?=BINDING)|BINDING.+$" )) %>% - tidyr::unnest(.data$ft_binding) %>% + tidyr::unnest("ft_binding") %>% dplyr::mutate(chebi_id = stringr::str_extract(.data$ft_binding, pattern = '(?<=/ligand_id=\\"ChEBI:CHEBI:)[^\\";]+(?=[\\";])')) %>% # Filter with the previously generated data_chebi_filtered to only keep metal ChEBI IDs dplyr::filter(.data$chebi_id %in% data_chebi_filtered$chebi_id) %>% @@ -343,7 +343,7 @@ extract_metal_binders <- 
function(data_uniprot, !all(is.na(.data$evidence)))) %>% dplyr::ungroup() %>% dplyr::mutate(evidence_split = stringr::str_split(.data$evidence, pattern = ", ")) %>% - tidyr::unnest(.data$evidence_split) %>% + tidyr::unnest("evidence_split") %>% tidyr::separate(.data$evidence_split, into = c("eco", "evidence_source"), sep = "\\|", fill = "right") %>% dplyr::distinct() %>% dplyr::mutate(eco_type = dplyr::case_when( @@ -356,7 +356,7 @@ extract_metal_binders <- function(data_uniprot, paste0("1(", .data$ligand_name, ")"), paste0(.data$ligand_identifier, "(", .data$ligand_name, ")") )) %>% - dplyr::select(-c(.data$ft_binding, .data$evidence, .data$isoform, .data$ligand_name)) %>% + dplyr::select(-c("ft_binding", "evidence", "isoform", "ligand_name")) %>% dplyr::distinct() %>% # Extract metal positions dplyr::mutate(ligand_position = stringr::str_split(.data$ligand_position, pattern = "\\.\\.")) %>% @@ -368,7 +368,7 @@ extract_metal_binders <- function(data_uniprot, as.numeric(.x) } )) %>% - tidyr::unnest(.data$ligand_position) %>% + tidyr::unnest("ligand_position") %>% # Combine the binding_mode column to prevent duplicates # The reason for duplicates is a wrong annotation in UniProt (P00081). # There are also issues with additional IDs such as E3PRJ4, Q9DHD6, P00081. 
@@ -428,7 +428,7 @@ extract_metal_binders <- function(data_uniprot, NA, .data$metal_id_part )) %>% - dplyr::select(-.data$metal_atom_id) %>% + dplyr::select(-"metal_atom_id") %>% # Combine positions dplyr::group_by(.data$accession, .data$chebi_id, .data$ligand_identifier) %>% dplyr::mutate( @@ -456,10 +456,10 @@ extract_metal_binders <- function(data_uniprot, dplyr::ungroup() %>% dplyr::distinct() %>% dplyr::rename( - metal_id_part_binding = .data$metal_id_part, - eco_binding = .data$eco, - eco_type_binding = .data$eco_type, - evidence_source_binding = .data$evidence_source + metal_id_part_binding = "metal_id_part", + eco_binding = "eco", + eco_type_binding = "eco_type", + evidence_source_binding = "evidence_source" ) %>% dplyr::mutate(source = "binding") %>% # make sure that accession column is of type "chr" even if data frame is empty @@ -477,24 +477,24 @@ extract_metal_binders <- function(data_uniprot, cofactor_uniprot <- data_uniprot %>% dplyr::distinct(.data$accession, .data$cc_cofactor) %>% - tidyr::drop_na(.data$cc_cofactor) %>% + tidyr::drop_na("cc_cofactor") %>% dplyr::mutate(cofactor_split = stringr::str_extract_all( .data$cc_cofactor, pattern = "(?<=COFACTOR:).+?(?=COFACTOR|$)" )) %>% - tidyr::unnest(.data$cofactor_split) %>% + tidyr::unnest("cofactor_split") %>% # Extract notes dplyr::mutate(note = stringr::str_extract( .data$cofactor_split, pattern = "(?<=Note\\=).+?(?=;)" )) %>% - tidyr::unnest(.data$note) %>% + tidyr::unnest("note") %>% # Split names dplyr::mutate(name_split = stringr::str_extract_all( .data$cofactor_split, pattern = "(?<=Name\\=).+?(?=Name|Note|COFACTOR|$)" )) %>% - tidyr::unnest(.data$name_split) %>% + tidyr::unnest("name_split") %>% # Extract ChEBI IDs from cc_cofactor dplyr::mutate(chebi_id = stringr::str_extract( .data$name_split, @@ -506,7 +506,7 @@ extract_metal_binders <- function(data_uniprot, dplyr::mutate(evidence = stringr::str_extract(.data$name_split, pattern = "(?<=Evidence=).+(?=;|$)")) %>% dplyr::mutate(evidence = 
stringr::str_remove_all(.data$evidence, pattern = ";|\\{|\\}")) %>% dplyr::mutate(evidence_split = stringr::str_split(.data$evidence, pattern = ", ")) %>% - tidyr::unnest(.data$evidence_split) %>% + tidyr::unnest("evidence_split") %>% tidyr::separate(.data$evidence_split, into = c("eco", "evidence_source"), sep = "\\|", fill = "right") %>% dplyr::mutate(eco = stringr::str_trim(.data$eco)) %>% dplyr::distinct() %>% @@ -517,10 +517,10 @@ extract_metal_binders <- function(data_uniprot, dplyr::mutate(eco_type = ifelse(is.na(.data$eco_type), "automatic_assertion", .data$eco_type)) %>% # dplyr::mutate(note_evidence = str_extract(.data$note, # pattern = "(?<=\\{).+(?=\\})")) %>% - dplyr::select(-c(.data$cc_cofactor, .data$cofactor_split, .data$name_split, .data$evidence)) %>% + dplyr::select(-c("cc_cofactor", "cofactor_split", "name_split", "evidence")) %>% # Add metal_id_part position using the data provided by protti dplyr::left_join(dplyr::distinct(data_chebi_filtered, .data$chebi_id, .data$metal_atom_id) %>% - dplyr::rename(metal_id_part = .data$metal_atom_id), by = "chebi_id") %>% + dplyr::rename(metal_id_part = "metal_atom_id"), by = "chebi_id") %>% # Now combine data to have one row per accession and chebi_id # First concatenate different notes for the same accession and chebi_id dplyr::group_by(.data$accession, .data$chebi_id) %>% @@ -565,7 +565,7 @@ extract_metal_binders <- function(data_uniprot, .data$cc_catalytic_activity, pattern = "(?<=CATALYTIC ACTIVITY:).+?(?=CATALYTIC ACTIVITY|$)" )) %>% - tidyr::unnest(.data$catalytic_activity_split) %>% + tidyr::unnest("catalytic_activity_split") %>% dplyr::mutate(catalytic_activity_split = stringr::str_remove( stringr::str_trim(.data$catalytic_activity_split), pattern = "Reaction=" @@ -574,13 +574,13 @@ extract_metal_binders <- function(data_uniprot, .data$catalytic_activity_split, pattern = "(?<=CHEBI:)\\d+" )) %>% - tidyr::unnest(.data$chebi_id) %>% + tidyr::unnest("chebi_id") %>% # Filter with data_chebi_filtered to 
only keep metal ChEBI IDs dplyr::filter(.data$chebi_id %in% data_chebi_filtered$chebi_id) %>% dplyr::mutate(evidence = stringr::str_extract(.data$catalytic_activity_split, pattern = "(?<=Evidence=)[^;]+(?=;)")) %>% dplyr::mutate(evidence = stringr::str_remove_all(.data$evidence, pattern = ";|\\{|\\}")) %>% dplyr::mutate(evidence_split = stringr::str_split(.data$evidence, pattern = ", ")) %>% - tidyr::unnest(.data$evidence_split) %>% + tidyr::unnest("evidence_split") %>% tidyr::separate(.data$evidence_split, into = c("eco", "evidence_source"), sep = "\\|", fill = "right") %>% dplyr::mutate(eco = stringr::str_trim(.data$eco)) %>% dplyr::distinct() %>% @@ -591,7 +591,7 @@ extract_metal_binders <- function(data_uniprot, dplyr::mutate(eco_type = ifelse(is.na(.data$eco_type), "automatic_assertion", .data$eco_type)) %>% dplyr::mutate(reaction = stringr::str_extract(.data$catalytic_activity_split, pattern = "(?<=PhysiologicalDirection=).+(?=;$)")) %>% dplyr::mutate(reaction = stringr::str_split(.data$reaction, pattern = "PhysiologicalDirection=")) %>% - tidyr::unnest(.data$reaction) %>% + tidyr::unnest("reaction") %>% dplyr::mutate(reaction = stringr::str_replace(.data$reaction, pattern = "Xref=Rhea:", replacement = "Direction")) %>% dplyr::mutate(rhea = stringr::str_extract_all(.data$catalytic_activity_split, pattern = "(?<=RHEA:)\\d+")) %>% dplyr::mutate(rhea = paste0(" RHEA:", purrr::map2_chr( @@ -610,16 +610,16 @@ extract_metal_binders <- function(data_uniprot, NA )) %>% dplyr::mutate(ec = paste0("EC:", stringr::str_extract(.data$catalytic_activity_split, pattern = "(?<=EC=)[^;]+(?=;)"), ", ", .data$rhea)) %>% - tidyr::unite(col = "reaction", c(.data$ec, .data$reaction), na.rm = TRUE, sep = ", ") %>% + tidyr::unite(col = "reaction", c("ec", "reaction"), na.rm = TRUE, sep = ", ") %>% dplyr::mutate( reaction = stringr::str_replace_all(.data$reaction, pattern = "=", replacement = ":"), reaction = stringr::str_replace_all(.data$reaction, pattern = ";", replacement = ","), 
reaction = stringr::str_replace_all(.data$reaction, pattern = "\\|", replacement = "/") ) %>% - dplyr::select(-c(.data$cc_catalytic_activity, .data$catalytic_activity_split, .data$evidence, .data$rhea)) %>% + dplyr::select(-c("cc_catalytic_activity", "catalytic_activity_split", "evidence", "rhea")) %>% # Add metal_id_part position using the data provided by protti dplyr::left_join(dplyr::distinct(data_chebi_filtered, .data$chebi_id, .data$metal_atom_id) %>% - dplyr::rename(metal_id_part = .data$metal_atom_id), by = "chebi_id") %>% + dplyr::rename(metal_id_part = "metal_atom_id"), by = "chebi_id") %>% # Now combine data to have one row per accession and chebi_id # First concatenate different evidences for the same accession, chebi_id, eco and reaction dplyr::group_by(.data$accession, .data$chebi_id, .data$eco, .data$reaction) %>% @@ -691,22 +691,23 @@ extract_metal_binders <- function(data_uniprot, # Filter GO data to only contain protein IDs also in the UniProt input dplyr::filter(.data$gene_product_id %in% data_uniprot$accession) %>% dplyr::select( - .data$gene_product_id, - .data$go_term, - .data$go_name, - .data$eco_id, - .data$reference, - .data$with_from, - .data$assigned_by + "gene_product_id", + "go_term", + "go_name", + "eco_id", + "reference", + "with_from", + "assigned_by" ) %>% dplyr::distinct() %>% dplyr::rename( - eco = .data$eco_id, - accession = .data$gene_product_id + eco = "eco_id", + accession = "gene_product_id" ) %>% # join ChEBI annotations to data dplyr::left_join(dplyr::distinct(metal_go_slim_subset, .data$slims_from_id, .data$chebi_id, .data$database, .data$metal_atom_id), - by = c("go_term" = "slims_from_id") + by = c("go_term" = "slims_from_id"), + relationship = "many-to-many" ) %>% dplyr::rename(metal_id_part = .data$metal_atom_id) %>% dplyr::mutate(eco_type = dplyr::case_when( @@ -786,7 +787,7 @@ extract_metal_binders <- function(data_uniprot, .f = ~ length(.x) > 1 ))) %>% dplyr::ungroup() %>% - tidyr::unnest(.data$most_specific_id) %>% 
+ tidyr::unnest("most_specific_id") %>% dplyr::arrange(.data$source) %>% dplyr::group_by(.data$accession, .data$most_specific_id) %>% dplyr::mutate( @@ -835,7 +836,7 @@ extract_metal_binders <- function(data_uniprot, ) %>% dplyr::ungroup() %>% dplyr::filter(!.data$appears) %>% - dplyr::select(-c(.data$appears, .data$chebi_sub_id)) %>% + dplyr::select(-c("appears", "chebi_sub_id")) %>% dplyr::distinct() %>% # unpack positions and corresponding info dplyr::mutate( @@ -850,15 +851,15 @@ extract_metal_binders <- function(data_uniprot, chebi_id_binding = stringr::str_split(.data$chebi_id_binding, pattern = "\\|") ) %>% tidyr::unnest(c( - .data$ligand_identifier, - .data$ligand_position, - .data$metal_id_part_binding, - .data$binding_mode, - .data$metal_function, - .data$eco_binding, - .data$eco_type_binding, - .data$evidence_source_binding, - .data$chebi_id_binding + "ligand_identifier", + "ligand_position", + "metal_id_part_binding", + "binding_mode", + "metal_function", + "eco_binding", + "eco_type_binding", + "evidence_source_binding", + "chebi_id_binding" )) %>% dplyr::mutate( ligand_identifier = stringr::str_split(.data$ligand_identifier, pattern = ";;;;"), @@ -872,15 +873,15 @@ extract_metal_binders <- function(data_uniprot, chebi_id_binding = stringr::str_split(.data$chebi_id_binding, pattern = ";;;;") ) %>% tidyr::unnest(c( - .data$ligand_identifier, - .data$ligand_position, - .data$metal_id_part_binding, - .data$binding_mode, - .data$metal_function, - .data$eco_binding, - .data$eco_type_binding, - .data$evidence_source_binding, - .data$chebi_id_binding + "ligand_identifier", + "ligand_position", + "metal_id_part_binding", + "binding_mode", + "metal_function", + "eco_binding", + "eco_type_binding", + "evidence_source_binding", + "chebi_id_binding" )) %>% dplyr::mutate( ligand_identifier = stringr::str_split(.data$ligand_identifier, pattern = ";;;"), @@ -894,22 +895,22 @@ extract_metal_binders <- function(data_uniprot, chebi_id_binding = 
stringr::str_split(.data$chebi_id_binding, pattern = ";;;") ) %>% tidyr::unnest(c( - .data$ligand_identifier, - .data$ligand_position, - .data$metal_id_part_binding, - .data$binding_mode, - .data$metal_function, - .data$eco_binding, - .data$eco_type_binding, - .data$evidence_source_binding, - .data$chebi_id_binding + "ligand_identifier", + "ligand_position", + "metal_id_part_binding", + "binding_mode", + "metal_function", + "eco_binding", + "eco_type_binding", + "evidence_source_binding", + "chebi_id_binding" )) %>% dplyr::mutate(ligand_position = as.numeric(.data$ligand_position)) %>% dplyr::arrange(.data$accession, .data$ligand_position) %>% dplyr::left_join(chebi_names, by = c("most_specific_id" = "id") ) %>% - dplyr::rename(most_specific_id_name = .data$name) %>% + dplyr::rename(most_specific_id_name = "name") %>% dplyr::mutate( note = ifelse(.data$note == "NA" | .data$note == "", NA, .data$note), reaction = ifelse(.data$reaction == "NA" | .data$reaction == "", NA, .data$reaction), @@ -919,23 +920,23 @@ extract_metal_binders <- function(data_uniprot, database = ifelse(.data$database == "NA" | .data$database == "", NA, .data$database) ) %>% dplyr::mutate(binding_temp_1 = stringr::str_split(.data$metal_id_part, pattern = ",")) %>% - tidyr::unnest(.data$binding_temp_1) %>% + tidyr::unnest("binding_temp_1") %>% dplyr::left_join(chebi_names, by = c("binding_temp_1" = "id") ) %>% - dplyr::rename(metal_id_part_name = .data$name) %>% - dplyr::select(-c(.data$binding_temp_1)) %>% + dplyr::rename(metal_id_part_name = "name") %>% + dplyr::select(-c("binding_temp_1")) %>% dplyr::group_by(.data$accession, .data$chebi_id, .data$ligand_identifier, .data$ligand_position) %>% dplyr::mutate(metal_id_part_name = paste0(.data$metal_id_part_name, collapse = ",")) %>% dplyr::ungroup() %>% dplyr::distinct() %>% dplyr::mutate(binding_temp_2 = stringr::str_split(.data$metal_id_part_binding, pattern = ",")) %>% - tidyr::unnest(.data$binding_temp_2) %>% + tidyr::unnest("binding_temp_2") 
%>% dplyr::left_join(chebi_names, by = c("binding_temp_2" = "id") ) %>% - dplyr::rename(metal_id_part_binding_name = .data$name) %>% - dplyr::select(-c(.data$binding_temp_2)) %>% + dplyr::rename(metal_id_part_binding_name = "name") %>% + dplyr::select(-c("binding_temp_2")) %>% dplyr::group_by(.data$accession, .data$chebi_id, .data$ligand_identifier, .data$ligand_position) %>% dplyr::mutate(metal_id_part_binding_name = paste0(.data$metal_id_part_binding_name, collapse = ",")) %>% dplyr::distinct() %>% @@ -1006,28 +1007,28 @@ extract_metal_binders <- function(data_uniprot, ) ) %>% dplyr::ungroup() %>% - dplyr::select(-c(.data$eco_binding, .data$eco_type_binding, .data$evidence_source_binding, .data$chebi_id_binding, .data$metal_id_part_binding, .data$metal_id_part_binding_name)) %>% + dplyr::select(-c("eco_binding", "eco_type_binding", "evidence_source_binding", "chebi_id_binding", "metal_id_part_binding", "metal_id_part_binding_name")) %>% dplyr::select( - .data$accession, - .data$most_specific_id, - .data$most_specific_id_name, - .data$ligand_identifier, - .data$ligand_position, - .data$binding_mode, - .data$metal_function, - .data$metal_id_part, - .data$metal_id_part_name, - .data$note, - .data$chebi_id, - .data$source, - .data$eco, - .data$eco_type, - .data$evidence_source, - .data$reaction, - .data$go_term, - .data$go_name, - .data$assigned_by, - .data$database + "accession", + "most_specific_id", + "most_specific_id_name", + "ligand_identifier", + "ligand_position", + "binding_mode", + "metal_function", + "metal_id_part", + "metal_id_part_name", + "note", + "chebi_id", + "source", + "eco", + "eco_type", + "evidence_source", + "reaction", + "go_term", + "go_name", + "assigned_by", + "database" ) if (show_progress == TRUE) { diff --git a/R/fetch_alphafold_prediction.R b/R/fetch_alphafold_prediction.R index 3945e8ad..6d0a5e49 100644 --- a/R/fetch_alphafold_prediction.R +++ b/R/fetch_alphafold_prediction.R @@ -385,18 +385,18 @@ fetch_alphafold_prediction <- 
function(uniprot_ids = NULL, ) ) %>% dplyr::select(-c( - .data$X1, - .data$x1, - .data$x2, - .data$x3, - .data$x4, - .data$x5, - .data$x6, - .data$x7, - .data$formal_charge, - .data$site_occupancy, - .data$entity_id, - .data$pdb_model_number + "X1", + "x1", + "x2", + "x3", + "x4", + "x5", + "x6", + "x7", + "formal_charge", + "site_occupancy", + "entity_id", + "pdb_model_number" )) %>% dplyr::mutate( label_id = as.numeric(.data$label_id), diff --git a/R/fetch_chebi.R b/R/fetch_chebi.R index a74e36d0..fca34f3d 100644 --- a/R/fetch_chebi.R +++ b/R/fetch_chebi.R @@ -33,12 +33,13 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { } # Retrieve relational information if (relation == TRUE) { - chebi_relation_result <- tryCatch(httr::GET( - "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv", - httr::timeout(timeout) - ), - error = function(e) conditionMessage(e), - warning = function(w) conditionMessage(w) + chebi_relation_result <- tryCatch( + httr::GET( + "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv", + httr::timeout(timeout) + ), + error = function(e) conditionMessage(e), + warning = function(w) conditionMessage(w) ) # Check again if there is an internet connection @@ -73,12 +74,13 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { } # Download compound data - chebi_chemical_data_result <- tryCatch(httr::GET( - "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/chemical_data.tsv", - httr::timeout(timeout) - ), - error = function(e) conditionMessage(e), - warning = function(w) conditionMessage(w) + chebi_chemical_data_result <- tryCatch( + httr::GET( + "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/chemical_data.tsv", + httr::timeout(timeout) + ), + error = function(e) conditionMessage(e), + warning = function(w) conditionMessage(w) ) # check if response is error @@ -134,12 +136,13 @@ fetch_chebi <- function(relation = FALSE, stars = 
c(3), timeout = 60) { return(invisible(NULL)) } - chebi_accession_result <- tryCatch(httr::GET( - "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/database_accession.tsv", - httr::timeout(timeout) - ), - error = function(e) conditionMessage(e), - warning = function(w) conditionMessage(w) + chebi_accession_result <- tryCatch( + httr::GET( + "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/database_accession.tsv", + httr::timeout(timeout) + ), + error = function(e) conditionMessage(e), + warning = function(w) conditionMessage(w) ) # check if response is error diff --git a/R/fetch_eco.R b/R/fetch_eco.R index dc5b543f..eaa3d985 100644 --- a/R/fetch_eco.R +++ b/R/fetch_eco.R @@ -133,7 +133,7 @@ fetch_eco <- function(return_relation = FALSE, history <- query_result %>% dplyr::distinct(.data$id, .data$history) %>% - tidyr::unnest(.data$history) %>% + tidyr::unnest("history") %>% dplyr::distinct(.data$id, .data$timestamp, .data$action, .data$category, .data$text) return(history) @@ -144,9 +144,9 @@ fetch_eco <- function(return_relation = FALSE, relation <- query_result %>% dplyr::distinct(.data$id, .data$children) %>% - dplyr::rename(main_id = .data$id) %>% - tidyr::unnest(.data$children) %>% - dplyr::rename(child_id = .data$id) + dplyr::rename(main_id = "id") %>% + tidyr::unnest("children") %>% + dplyr::rename(child_id = "id") return(relation) } @@ -154,36 +154,36 @@ fetch_eco <- function(return_relation = FALSE, # Unnest the data frame bit by bit to not lose information query_result_unnest_1 <- query_result %>% - dplyr::select(-c(.data$history, .data$children)) %>% - tidyr::unnest(.data$definition) %>% - dplyr::rename(main_name = .data$name) %>% - dplyr::rename(definition = .data$text) + dplyr::select(-c("history", "children")) %>% + tidyr::unnest("definition") %>% + dplyr::rename(main_name = "name") %>% + dplyr::rename(definition = "text") query_result_unnest_2 <- query_result_unnest_1 %>% - tidyr::unnest(.data$secondaryIds) %>% + 
tidyr::unnest("secondaryIds") %>% dplyr::distinct(.data$id, .data$secondaryIds) %>% - dplyr::right_join(dplyr::select(query_result_unnest_1, c(-.data$secondaryIds)), by = "id") + dplyr::right_join(dplyr::select(query_result_unnest_1, c(-"secondaryIds")), by = "id") query_result_unnest_3 <- query_result_unnest_2 %>% - tidyr::unnest(.data$xRefs) %>% + tidyr::unnest("xRefs") %>% dplyr::distinct(.data$id, .data$dbCode, .data$dbId) %>% - dplyr::right_join(dplyr::select(query_result_unnest_2, c(-.data$xRefs)), by = "id") + dplyr::right_join(dplyr::select(query_result_unnest_2, c(-"xRefs")), by = "id") result <- query_result_unnest_3 %>% - tidyr::unnest(.data$synonyms) %>% + tidyr::unnest("synonyms") %>% dplyr::distinct(.data$id, .data$name, .data$type) %>% - dplyr::right_join(dplyr::select(query_result_unnest_3, c(-.data$synonyms)), by = "id") %>% + dplyr::right_join(dplyr::select(query_result_unnest_3, c(-"synonyms")), by = "id") %>% dplyr::select( - .data$id, - .data$isObsolete, - .data$main_name, - .data$definition, - .data$comment, - .data$name, - .data$type, - .data$dbCode, - .data$dbId, - .data$secondaryIds + "id", + "isObsolete", + "main_name", + "definition", + "comment", + "name", + "type", + "dbCode", + "dbId", + "secondaryIds" ) %>% janitor::clean_names() diff --git a/R/fetch_kegg.R b/R/fetch_kegg.R index 4eac7605..26fc76ea 100644 --- a/R/fetch_kegg.R +++ b/R/fetch_kegg.R @@ -35,8 +35,8 @@ fetch_kegg <- function(species) { } colnames(result_link) <- c("kegg_id", "pathway_id") result_link$pathway_id <- stringr::str_replace_all(result_link$pathway_id, - pattern = "path:", - replacement = "" + pattern = "path:", + replacement = "" ) # download pathway_id names url_name <- paste("https://rest.kegg.jp/list/pathway", species, sep = "/") diff --git a/R/fetch_metal_pdb.R b/R/fetch_metal_pdb.R index bb51433f..9a6edebc 100644 --- a/R/fetch_metal_pdb.R +++ b/R/fetch_metal_pdb.R @@ -294,11 +294,11 @@ fetch_metal_pdb <- function(id_type = "uniprot", )), should_be_here )) 
%>% - tidyr::unnest(.data$metals) + tidyr::unnest("metals") if ("metals" %in% colnames(content_metal)) { content_metal <- content_metal %>% - dplyr::select(-c(.data$metals)) + dplyr::select(-c("metals")) } columns_metal <- c( @@ -322,14 +322,14 @@ fetch_metal_pdb <- function(id_type = "uniprot", should_be_here_metal )) %>% dplyr::rename( - auth_seq_id_metal = .data$residue_pdb_number, - auth_id_metal = .data$atom_pdb_number, - symbol_metal = .data$symbol + auth_seq_id_metal = "residue_pdb_number", + auth_id_metal = "atom_pdb_number", + symbol_metal = "symbol" ) %>% dplyr::group_by(.data$site, .data$auth_id_metal) %>% dplyr::mutate(check = length(.data$ligands[[1]])) %>% dplyr::mutate(ligands = ifelse(.data$check == 0, NA, .data$ligands)) %>% - tidyr::unnest_longer(.data$ligands) %>% + tidyr::unnest_longer("ligands") %>% dplyr::bind_cols(donors = .$ligands) columns_ligand <- c("check", "chain", "donors", "residue_pdb_number", "residue") @@ -343,9 +343,9 @@ fetch_metal_pdb <- function(id_type = "uniprot", )), should_be_here_ligand )) %>% - tidyr::unnest_longer(.data$donors) %>% + tidyr::unnest_longer("donors") %>% dplyr::bind_cols(atom = .$donors) %>% - dplyr::select(-c(.data$ligands, .data$donors, .data$check)) + dplyr::select(-c("ligands", "donors", "check")) columns_donor <- c("atom", "symbol", "atom_pdb_number", "distance") should_be_here_donor <- columns_donor[!columns_donor %in% colnames(content_donor)] @@ -359,10 +359,10 @@ fetch_metal_pdb <- function(id_type = "uniprot", should_be_here_donor )) %>% dplyr::rename( - auth_asym_id_ligand = .data$chain, - auth_seq_id_ligand = .data$residue_pdb_number, - auth_id_ligand = .data$atom_pdb_number, - auth_atom_id_ligand = .data$atom + auth_asym_id_ligand = "chain", + auth_seq_id_ligand = "residue_pdb_number", + auth_id_ligand = "atom_pdb_number", + auth_atom_id_ligand = "atom" ) %>% dplyr::ungroup() diff --git a/R/fetch_mobidb.R b/R/fetch_mobidb.R index a00c4dfc..9e22712a 100644 --- a/R/fetch_mobidb.R +++ 
b/R/fetch_mobidb.R @@ -47,21 +47,21 @@ fetch_mobidb <- function(uniprot_ids = NULL, organism_id = NULL, show_progress = # Check uniprot ID validity uniprot_ids <- stats::na.omit(uniprot_ids) id_test <- stringr::str_detect(uniprot_ids, - pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" + pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" ) non_conform_ids <- uniprot_ids[!id_test] uniprot_ids <- uniprot_ids[id_test] - + if (length(non_conform_ids) != 0) { warning(strwrap("These UniProt accession numbers did not conform to uniprot standards and were skipped from fetching: ", -prefix = "\n", initial = "" + prefix = "\n", initial = "" ), paste(non_conform_ids, collapse = ", ")) } if (length(uniprot_ids) == 0) { stop("No valid UniProt accession numbers found.") } - + if (length(uniprot_ids) < 800) { # generate url url <- paste0( diff --git a/R/fetch_pdb.R b/R/fetch_pdb.R index f7bb0592..896971d8 100644 --- a/R/fetch_pdb.R +++ b/R/fetch_pdb.R @@ -224,11 +224,12 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { stringr::str_replace_all(pattern = "\\[", replacement = "%5B") %>% stringr::str_replace_all(pattern = "\\]", replacement = "%5D") - query <- try_query(httr::modify_url("https://data.rcsb.org/graphql", - query = url_encode_query - ), - type = "application/json", - simplifyDataFrame = TRUE + query <- try_query( + httr::modify_url("https://data.rcsb.org/graphql", + query = url_encode_query + ), + type = "application/json", + simplifyDataFrame = TRUE ) if (show_progress == TRUE) { @@ -778,11 +779,13 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { values = as.character(rep(.data$non_consecutive, .data$n_missing) + unlist(.data$replacement_values_addition)) ))) %>% dplyr::ungroup() %>% - dplyr::select(-c("auth_seq_id_pdb_numeric", - "non_consecutive", - "n_missing", - "replacement_values_addition", - "replacement_positions")) + dplyr::select(-c( + 
"auth_seq_id_pdb_numeric", + "non_consecutive", + "n_missing", + "replacement_values_addition", + "replacement_positions" + )) } # Join corrected entries back polymer_entities <- polymer_entities %>% diff --git a/R/fetch_quickgo.R b/R/fetch_quickgo.R index 129a36b7..23a5c9e5 100644 --- a/R/fetch_quickgo.R +++ b/R/fetch_quickgo.R @@ -223,33 +223,33 @@ fetch_quickgo <- function(type = "annotations", pre_extracted_result <- query_result %>% janitor::clean_names() %>% - tidyr::unnest(.data$definition) %>% - dplyr::rename(main_name = .data$name) %>% - dplyr::rename(definition = .data$text) %>% - select(-c(.data$xrefs)) %>% + tidyr::unnest("definition") %>% + dplyr::rename(main_name = "name") %>% + dplyr::rename(definition = "text") %>% + select(-c("xrefs")) %>% dplyr::rename( - main_id = .data$id, - ontology = .data$aspect + main_id = "id", + ontology = "aspect" ) # synonyms (not used yet but could be added in the future) synonyms <- pre_extracted_result %>% dplyr::distinct(.data$main_id, .data$synonyms) %>% - tidyr::unnest(.data$synonyms) %>% + tidyr::unnest("synonyms") %>% dplyr::rename( - synonym = .data$name, - synonym_type = .data$type + synonym = "name", + synonym_type = "type" ) # children children <- pre_extracted_result %>% dplyr::distinct(.data$main_id, .data$children) %>% - tidyr::unnest(.data$children) %>% + tidyr::unnest("children") %>% dplyr::rename( - child_id = .data$id, - children_relation = .data$relation + child_id = "id", + children_relation = "relation" ) %>% dplyr::group_by(.data$main_id) %>% dplyr::mutate( @@ -263,19 +263,19 @@ fetch_quickgo <- function(type = "annotations", history <- pre_extracted_result %>% dplyr::distinct(.data$main_id, .data$history) %>% - tidyr::unnest(.data$history) + tidyr::unnest("history") # relations relations <- pre_extracted_result %>% dplyr::distinct(.data$main_id, .data$x_relations) %>% - tidyr::unnest(.data$x_relations) %>% + tidyr::unnest("x_relations") %>% dplyr::rename( - chebi_id = .data$id, - relations_term = 
.data$term, - database = .data$namespace, - relations_url = .data$url, - relations_relation = .data$relation + chebi_id = "id", + relations_term = "term", + database = "namespace", + relations_url = "url", + relations_relation = "relation" ) result <- pre_extracted_result %>% @@ -331,7 +331,7 @@ fetch_quickgo <- function(type = "annotations", } else { query_result <- query_result[["results"]] %>% janitor::clean_names() %>% - tidyr::unnest(.data$slims_to_ids) + tidyr::unnest("slims_to_ids") } return(query_result) diff --git a/R/find_peptide_in_structure.R b/R/find_peptide_in_structure.R index 915a4217..d450450c 100644 --- a/R/find_peptide_in_structure.R +++ b/R/find_peptide_in_structure.R @@ -164,7 +164,7 @@ find_peptide_in_structure <- function(peptide_data, .data$auth_seq_id, .data$label_asym_id ) %>% - dplyr::rename(length_pdb = .data$length) %>% + dplyr::rename(length_pdb = "length") %>% dplyr::mutate({{ uniprot_id }} := .data$reference_database_accession) %>% dplyr::mutate(length_pdb_sequence = nchar(.data$pdb_sequence)) %>% dplyr::mutate( @@ -172,19 +172,19 @@ find_peptide_in_structure <- function(peptide_data, entity_end_seq_id = as.numeric(.data$entity_beg_seq_id) + as.numeric(.data$length_pdb) - 1 ) %>% dplyr::select( - .data$pdb_ids, - .data$auth_asym_id, + "pdb_ids", + "auth_asym_id", {{ uniprot_id }}, - .data$entity_beg_seq_id, - .data$entity_end_seq_id, - .data$ref_beg_seq_id, - .data$ref_end_seq_id, - .data$pdb_sequence, - .data$length_pdb_sequence, - .data$auth_seq_id, - .data$label_asym_id + "entity_beg_seq_id", + "entity_end_seq_id", + "ref_beg_seq_id", + "ref_end_seq_id", + "pdb_sequence", + "length_pdb_sequence", + "auth_seq_id", + "label_asym_id" ) %>% - dplyr::right_join(peptide_data_prep, by = c(rlang::as_name(rlang::enquo(uniprot_id)))) %>% + dplyr::right_join(peptide_data_prep, by = c(rlang::as_name(rlang::enquo(uniprot_id))), relationship = "many-to-many") %>% dplyr::mutate(peptide_in_pdb = ({{ start }} >= .data$ref_beg_seq_id & {{ start }} 
<= .data$ref_end_seq_id) | ({{ end }} >= .data$ref_beg_seq_id & @@ -252,21 +252,21 @@ find_peptide_in_structure <- function(peptide_data, ) %>% dplyr::select( {{ uniprot_id }}, - .data$pdb_ids, - .data$auth_asym_id, - .data$label_asym_id, + "pdb_ids", + "auth_asym_id", + "label_asym_id", {{ peptide }}, - .data$peptide_seq_in_pdb, - .data$fit_type, + "peptide_seq_in_pdb", + "fit_type", {{ start }}, {{ end }}, - .data$label_seq_id_start, - .data$label_seq_id_end, - .data$auth_seq_id_start, - .data$auth_seq_id_end, - .data$auth_seq_id, - .data$n_peptides, - .data$n_peptides_in_structure + "label_seq_id_start", + "label_seq_id_end", + "auth_seq_id_start", + "auth_seq_id_end", + "auth_seq_id", + "n_peptides", + "n_peptides_in_structure" ) } # Retain also peptides in the data frame that were not found in any pdb structure or of diff --git a/R/fit_drc_4p.R b/R/fit_drc_4p.R index fca3d464..fcf75bfd 100644 --- a/R/fit_drc_4p.R +++ b/R/fit_drc_4p.R @@ -163,7 +163,7 @@ fit_drc_4p <- function(data, dplyr::mutate({{ dose }} := as.numeric({{ dose }})) # If the data_prep data.frame is empty return a data.frame that contains only the grouping and retained column - if (nrow(data_prep) == 0){ + if (nrow(data_prep) == 0) { return(data.frame()) } @@ -187,7 +187,7 @@ fit_drc_4p <- function(data, dplyr::distinct({{ grouping }}, .data$pval) %>% tidyr::drop_na(.data$pval) %>% # remove NA pvalues before adjustment! dplyr::mutate(anova_adj_pval = stats::p.adjust(.data$pval, method = "BH")) %>% - dplyr::rename(anova_pval = .data$pval) + dplyr::rename(anova_pval = "pval") # extract elements that pass anova significant threshold anova_filtered <- anova %>% @@ -445,7 +445,7 @@ fit_drc_4p <- function(data, ) # Return empty data.frame if there are no correlations. This prevents parallel_fit_drc_4p from failing. 
- if (nrow(correlation_output) == 0){ + if (nrow(correlation_output) == 0) { return(data.frame()) } diff --git a/R/map_peptides_on_structure.R b/R/map_peptides_on_structure.R index de7f1ccf..36e78112 100644 --- a/R/map_peptides_on_structure.R +++ b/R/map_peptides_on_structure.R @@ -30,15 +30,15 @@ #' the author defined chain definitions for both ".cif" and ".pdb" files. When the output of the #' \code{find_peptide_in_structure} function is used as the input for this function, this #' corresponds to the \code{auth_asym_id} column. -#' @param auth_seq_id optional, a character (or numeric) column in the \code{peptide_data} data frame -#' that contains semicolon separated positions of peptides, protein regions or amino acids in the -#' corresponding PDB structure or AlphaFold prediction. This information can be obtained from the -#' \code{find_peptide_in_structure} function. The corresponding column in the output is called -#' \code{auth_seq_id}. In case of AlphaFold predictions, UniProt positions should be used. If -#' signal positions and not stretches of amino acids are provided, the column can be numeric and +#' @param auth_seq_id optional, a character (or numeric) column in the \code{peptide_data} data frame +#' that contains semicolon separated positions of peptides, protein regions or amino acids in the +#' corresponding PDB structure or AlphaFold prediction. This information can be obtained from the +#' \code{find_peptide_in_structure} function. The corresponding column in the output is called +#' \code{auth_seq_id}. In case of AlphaFold predictions, UniProt positions should be used. If +#' signal positions and not stretches of amino acids are provided, the column can be numeric and #' does not need to contain the semicolon separator. #' @param map_value a numeric column in the \code{peptide_data} data frame that contains a value -#' associated with each peptide, protein region or amino acid. 
If one start to end position pair +#' associated with each peptide, protein region or amino acid. If one start to end position pair #' has multiple different map values, the maximum will be used. This value will be displayed as a #' colour gradient when mapped onto the structure. The value can for example be the fold change, #' p-value or score associated with each peptide, protein region or amino acid (selection). If @@ -54,9 +54,9 @@ #' Fetching and mapping onto ".cif" files takes longer than for ".pdb" files. If a structure file #' is provided in the \code{structure_file} argument, the file format is detected automatically #' and does not need to be provided. -#' @param scale_per_structure a logical value that specifies if scaling should be performed for +#' @param scale_per_structure a logical value that specifies if scaling should be performed for #' each structure independently (TRUE) or over the whole data set (FALSE). The default is TRUE, -#' which scales the scores of each structure independently so that each structure has a score +#' which scales the scores of each structure independently so that each structure has a score #' range from 50 to 100. #' @param export_location optional, a character argument specifying the path to the location in #' which the fetched and altered structure files should be saved. 
If left empty, they will be @@ -87,7 +87,7 @@ #' } #' # Load libraries #' library(dplyr) -#' +#' #' # Create example data #' peptide_data <- data.frame( #' uniprot_id = c("P0A8T7", "P0A8T7", "P60906"), @@ -100,7 +100,7 @@ #' end = c(1198, 1206, 66), #' map_value = c(70, 100, 100) #' ) -#' +#' #' # Find peptide positions in structures #' positions_structure <- find_peptide_in_structure( #' peptide_data = peptide_data, @@ -110,7 +110,7 @@ #' uniprot_id = uniprot_id, #' retain_columns = c(map_value)) %>% #' filter(pdb_ids %in% c("6UU2", "2EL9")) -#' +#' #' # Map peptides on structures #' # You can determine the preferred output location #' # with the export_location argument. Currently it @@ -125,7 +125,7 @@ #' file_format = ".pdb", #' export_location = getwd() #' ) -#' +#' #' \dontshow{ #' setwd(.old_wd) #' } @@ -164,11 +164,11 @@ map_peptides_on_structure <- function(peptide_data, tidyr::drop_na({{ auth_seq_id }}) %>% # remove observations that do not have any position # information because they did not fit a protein. - dplyr::group_by({{ uniprot_id }}, {{ pdb_id }}, {{ chain }}, {{ auth_seq_id }}) %>% - dplyr::mutate({{ map_value}} := max({{ map_value }}, na.rm = TRUE)) %>% + dplyr::group_by({{ uniprot_id }}, {{ pdb_id }}, {{ chain }}, {{ auth_seq_id }}) %>% + dplyr::mutate({{ map_value}} := max({{ map_value }}, na.rm = TRUE)) %>% # This makes sure that there is only one value per peptide. # It will always take the maximum value. 
- dplyr::ungroup() %>% + dplyr::ungroup() %>% dplyr::distinct( {{ uniprot_id }}, {{ pdb_id }}, @@ -176,12 +176,12 @@ map_peptides_on_structure <- function(peptide_data, {{ auth_seq_id }}, {{ map_value }} ) %>% - dplyr::mutate(scaling_info = ifelse(rep(scale_per_structure, dplyr::n()), + dplyr::mutate(scaling_info = ifelse(rep(scale_per_structure, dplyr::n()), {{ pdb_id }}, - "scale_overall")) %>% - # determines if scores should by scaled by structure or + "scale_overall")) %>% + # determines if scores should by scaled by structure or # one scale for the whole data set. - dplyr::group_by(.data$scaling_info) %>% + dplyr::group_by(.data$scaling_info) %>% dplyr::mutate({{ map_value }} := round( scale_protti(c({{ map_value }}), method = "01") * 50 + 50, digits = 2 @@ -190,11 +190,11 @@ map_peptides_on_structure <- function(peptide_data, group_by({{ uniprot_id }}, {{ pdb_id }}, {{ chain }}, {{ auth_seq_id }}) %>% dplyr::mutate(residue_internal = stringr::str_split({{ auth_seq_id }}, pattern = ";")) %>% dplyr::ungroup() %>% - tidyr::unnest(.data$residue_internal) %>% + tidyr::unnest("residue_internal") %>% dplyr::group_by({{ uniprot_id }}, {{ pdb_id }}, {{ chain }}, .data$residue_internal) %>% dplyr::mutate({{ map_value }} := max({{ map_value }})) %>% dplyr::ungroup() %>% - dplyr::select(-c({{ auth_seq_id }}, .data$scaling_info)) %>% + dplyr::select(-c({{ auth_seq_id }}, "scaling_info")) %>% dplyr::distinct() %>% dplyr::mutate( id = ifelse(is.na({{ pdb_id }}), {{ uniprot_id }}, {{ pdb_id }}), @@ -223,11 +223,11 @@ map_peptides_on_structure <- function(peptide_data, dplyr::distinct({{ pdb_id }}, {{ uniprot_id }}) %>% dplyr::group_by({{ pdb_id }}) %>% dplyr::mutate(name = paste({{ uniprot_id }}, collapse = "_")) %>% - dplyr::mutate(name = ifelse(nchar(.data$name) >= 240, + dplyr::mutate(name = ifelse(nchar(.data$name) >= 240, paste0(stringr::str_count(.data$name, pattern = "_") + 1, "_proteins"), .data$name ) - ) %>% + ) %>% dplyr::ungroup() %>% dplyr::mutate(name = 
paste0({{ pdb_id }}, "_", .data$name)) %>% dplyr::select(-{{ uniprot_id }}) %>% @@ -592,7 +592,7 @@ map_peptides_on_structure <- function(peptide_data, list(NA) )) %>% dplyr::ungroup() %>% - tidyr::unnest(.data$residue_internal) %>% + tidyr::unnest(.data$residue_internal) %>% dplyr::group_by({{ chain }}, .data$residue_internal) %>% dplyr::mutate({{ map_value }} := max({{ map_value }})) %>% dplyr::ungroup() %>% diff --git a/R/peptide_profile_plot.R b/R/peptide_profile_plot.R index 02464de5..be6d6d91 100644 --- a/R/peptide_profile_plot.R +++ b/R/peptide_profile_plot.R @@ -130,23 +130,23 @@ peptide_profile_plot <- function(data, protti_colours <- "placeholder" # assign a placeholder to prevent a missing global variable warning utils::data("protti_colours", envir = environment()) # then overwrite it with real data if (missing(targets)) stop("Please provide at least one target to plot!") - + input <- data %>% dplyr::distinct({{ sample }}, {{ peptide }}, {{ intensity_log2 }}, {{ grouping }}) %>% tidyr::drop_na({{ intensity_log2 }}) - - if (complete_sample){ + + if (complete_sample) { input <- input %>% - tidyr::complete({{ sample }}, {{ grouping }}) %>% + tidyr::complete({{ sample }}, {{ grouping }}) %>% tidyr::fill({{ peptide }}, .direction = "downup") } - + if (!("all" %in% targets)) { input <- input %>% dplyr::filter({{ grouping }} %in% targets) %>% split(dplyr::pull(., !!ensym(grouping))) } - + if ("all" %in% targets) { groups <- length(unique(dplyr::pull(data, {{ grouping }}))) message("Splitting into ", groups, " groups and returning ", groups, " plots.") diff --git a/R/predict_alphafold_domain.R b/R/predict_alphafold_domain.R index e429e392..c7ae0154 100644 --- a/R/predict_alphafold_domain.R +++ b/R/predict_alphafold_domain.R @@ -86,14 +86,14 @@ predict_alphafold_domain <- function(pae_list, } # Create aligned error matrix aligned_error_matrix <- .x %>% - dplyr::select(c(.data$scored_residue, .data$aligned_residue, .data$error)) %>% + 
dplyr::select(c("scored_residue", "aligned_residue", "error")) %>% # prevent the creation of Inf weights. Convert all 0 values to a low value instead. dplyr::mutate(error = ifelse(error == 0, 0.001, .data$error)) %>% tidyr::pivot_wider( - names_from = .data$scored_residue, - values_from = .data$error + names_from = "scored_residue", + values_from = "error" ) %>% - dplyr::select(-c(.data$aligned_residue)) %>% + dplyr::select(-c("aligned_residue")) %>% as.matrix() # Calculate all weights diff --git a/R/qc_charge_states.R b/R/qc_charge_states.R index 8881344e..eadeb18b 100644 --- a/R/qc_charge_states.R +++ b/R/qc_charge_states.R @@ -105,15 +105,15 @@ qc_charge_states <- dplyr::group_by({{ sample }}, {{ charge_states }}) %>% dplyr::summarise(charge_per = n / .data$total_peptides * 100) %>% dplyr::ungroup() %>% - dplyr::mutate({{ charge_states }} := forcats::fct_inorder(factor({{ charge_states }}))) + dplyr::mutate({{ charge_states }} := forcats::fct_inorder(factor({{ charge_states }}))) if (is(dplyr::pull(result, {{ sample }}), "character")) { result <- result %>% dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) )) } - + if (plot == FALSE) { return(result) } else { @@ -158,15 +158,14 @@ qc_charge_states <- dplyr::mutate(total_intensity = sum({{ intensity }})) %>% dplyr::group_by({{ sample }}, {{ charge_states }}) %>% dplyr::mutate(sum_intensity_cs = sum({{ intensity }})) %>% - dplyr::summarise(charge_per = .data$sum_intensity_cs / .data$total_intensity * 100) %>% - dplyr::ungroup() %>% + dplyr::reframe(charge_per = .data$sum_intensity_cs / .data$total_intensity * 100) %>% dplyr::mutate({{ charge_states }} := forcats::fct_inorder(factor({{ charge_states }}))) %>% dplyr::distinct() - + if (is(dplyr::pull(result, {{ sample }}), "character")) { result <- result %>% dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = 
unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) )) } diff --git a/R/qc_contaminants.R b/R/qc_contaminants.R index ccaff81c..dfa097f9 100644 --- a/R/qc_contaminants.R +++ b/R/qc_contaminants.R @@ -76,11 +76,11 @@ qc_contaminants <- function(data, if (plot == FALSE) { return(result) } - + if (is(dplyr::pull(result, {{ sample }}), "character")) { result <- result %>% dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) )) } diff --git a/R/qc_cvs.R b/R/qc_cvs.R index 0a20a264..f1b7db21 100644 --- a/R/qc_cvs.R +++ b/R/qc_cvs.R @@ -14,7 +14,7 @@ #' @param plot_style a character value that indicates the plotting style. \code{plot_style = "boxplot"} #' plots a boxplot, whereas \code{plot_style = "density"} plots the CV density distribution. #' \code{plot_style = "violin"} returns a violin plot. Default is \code{plot_style = "density"}. -#' @param max_cv a numeric value that specifies the maximum percentage of CVs that should be included +#' @param max_cv a numeric value that specifies the maximum percentage of CVs that should be included #' in the returned plot. The default value is `max_cv = 200`. 
#' #' @return Either a data frame with the median CVs in % or a plot showing the distribution of the CVs @@ -95,7 +95,7 @@ The function does not handle log2 transformed data.", dplyr::mutate(median_cv = stats::median(.data$cv)) %>% dplyr::ungroup() %>% dplyr::mutate(median_cv_combined = stats::median(.data$cv_combined)) %>% - dplyr::select(-{{ grouping }}, -.data$cv_combined, -.data$cv) %>% + dplyr::select(-{{ grouping }}, -c("cv_combined", "cv")) %>% dplyr::distinct() return(result) @@ -135,12 +135,13 @@ The function does not handle log2 transformed data.", if (plot_style == "boxplot") { plot <- ggplot2::ggplot(result) + - ggplot2::geom_boxplot(aes( - x = .data$type, - y = .data$values, - fill = .data$type - ), - na.rm = TRUE + ggplot2::geom_boxplot( + aes( + x = .data$type, + y = .data$values, + fill = .data$type + ), + na.rm = TRUE ) + ggplot2::labs( title = "Coefficients of variation", diff --git a/R/qc_data_completeness.R b/R/qc_data_completeness.R index 0c0b0a4e..f9e66e75 100644 --- a/R/qc_data_completeness.R +++ b/R/qc_data_completeness.R @@ -94,10 +94,10 @@ qc_data_completeness <- function(data, if (is(dplyr::pull(result, {{ sample }}), "character")) { result <- result %>% dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) )) } - + completeness_plot <- result %>% ggplot2::ggplot(ggplot2::aes({{ sample }}, .data$completeness)) + ggplot2::geom_col(fill = "#5680C1", col = "black", size = 1) + diff --git a/R/qc_missed_cleavages.R b/R/qc_missed_cleavages.R index 7497b10f..0c42a217 100644 --- a/R/qc_missed_cleavages.R +++ b/R/qc_missed_cleavages.R @@ -104,12 +104,12 @@ intensities or set remove_na_intensities to FALSE", dplyr::group_by({{ sample }}, {{ missed_cleavages }}) %>% dplyr::summarise(mc_percent = n / .data$total_peptide_count * 100) %>% dplyr::ungroup() %>% - dplyr::mutate({{ missed_cleavages }} := 
forcats::fct_inorder(factor({{ missed_cleavages }}))) - + dplyr::mutate({{ missed_cleavages }} := forcats::fct_inorder(factor({{ missed_cleavages }}))) + if (is(dplyr::pull(result, {{ sample }}), "character")) { result <- result %>% dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) )) } @@ -166,18 +166,17 @@ intensities or set remove_na_intensities to FALSE", dplyr::mutate(total_intensity = sum({{ intensity }})) %>% dplyr::group_by({{ sample }}, {{ missed_cleavages }}) %>% dplyr::mutate(sum_intensity_mc = sum({{ intensity }})) %>% - dplyr::summarise(mc_percent = .data$sum_intensity_mc / .data$total_intensity * 100) %>% - dplyr::ungroup() %>% + dplyr::reframe(mc_percent = .data$sum_intensity_mc / .data$total_intensity * 100) %>% dplyr::mutate({{ missed_cleavages }} := forcats::fct_inorder(factor({{ missed_cleavages }}))) %>% dplyr::distinct() if (is(dplyr::pull(result, {{ sample }}), "character")) { result <- result %>% dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) )) } - + if (plot == FALSE) { return(result) } else { diff --git a/R/qc_pca.R b/R/qc_pca.R index 23a1ff45..088a7247 100644 --- a/R/qc_pca.R +++ b/R/qc_pca.R @@ -132,22 +132,24 @@ qc_pca <- ), color = "Condition" ) + - ggrepel::geom_text_repel(aes(label = paste( - stringr::str_replace_all(as.character({{ sample }}), fixed("_"), " ") - )), - size = 4, - show.legend = FALSE + ggrepel::geom_text_repel( + aes(label = paste( + stringr::str_replace_all(as.character({{ sample }}), fixed("_"), " ") + )), + size = 4, + show.legend = FALSE ) + - {if(is.numeric(unique(dplyr::pull(pca_df, {{ condition }})))){ - ggplot2::scale_color_gradientn(colours = c( - "#0D0887", "#2E0595", "#46039F", "#5C01A6", "#7201A8", "#8707A6", "#9A169F", - "#AC2694", 
"#BC3587", "#CA457A", "#D6556D", "#E26561", "#EB7655", "#F48849", - "#FA9B3D", "#FDAF31", "#FDC527", "#F9DC24", "#F0F921" - )) - } else { - ggplot2::scale_color_manual(values = protti_colours) - } - }+ + { + if (is.numeric(unique(dplyr::pull(pca_df, {{ condition }})))) { + ggplot2::scale_color_gradientn(colours = c( + "#0D0887", "#2E0595", "#46039F", "#5C01A6", "#7201A8", "#8707A6", "#9A169F", + "#AC2694", "#BC3587", "#CA457A", "#D6556D", "#E26561", "#EB7655", "#F48849", + "#FA9B3D", "#FDAF31", "#FDC527", "#F9DC24", "#F0F921" + )) + } else { + ggplot2::scale_color_manual(values = protti_colours) + } + } + ggplot2::theme( panel.background = element_blank(), panel.border = element_rect(colour = "black", fill = NA), @@ -173,12 +175,13 @@ qc_pca <- x = "Dimension", y = "Explained variance [%]" ) + - ggplot2::geom_text(aes(label = paste0( - as.character(round(.data$percent_variance, digits = 1)), "%" - )), - size = 4, - vjust = -0.6, - hjust = -0.1 + ggplot2::geom_text( + aes(label = paste0( + as.character(round(.data$percent_variance, digits = 1)), "%" + )), + size = 4, + vjust = -0.6, + hjust = -0.1 ) + ggplot2::scale_y_continuous( limits = NULL, diff --git a/R/qc_peptide_type.R b/R/qc_peptide_type.R index 3fadc8d2..6a3d4c6e 100644 --- a/R/qc_peptide_type.R +++ b/R/qc_peptide_type.R @@ -102,11 +102,11 @@ qc_peptide_type <- function(data, dplyr::mutate(pep_type = factor({{ pep_type }}, levels = c("fully-tryptic", "semi-tryptic", "non-tryptic") )) - + if (is(dplyr::pull(result, {{ sample }}), "character")) { result <- result %>% dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) )) } @@ -190,11 +190,11 @@ qc_peptide_type <- function(data, dplyr::mutate(pep_type = factor({{ pep_type }}, levels = c("fully-tryptic", "semi-tryptic", "non-tryptic") )) - + if (is(dplyr::pull(result, {{ sample }}), "character")) { result <- result %>% 
dplyr::mutate({{ sample }} := factor({{ sample }}, - levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) + levels = unique(stringr::str_sort({{ sample }}, numeric = TRUE)) )) } diff --git a/R/qc_proteome_coverage.R b/R/qc_proteome_coverage.R index 56622872..3e5011c7 100644 --- a/R/qc_proteome_coverage.R +++ b/R/qc_proteome_coverage.R @@ -77,7 +77,7 @@ qc_proteome_coverage <- function(data, proteins_detected = .data$proteins_detected / proteome$proteins_proteome * 100 ) %>% tidyr::pivot_longer( - cols = c(.data$proteins_detected, .data$proteins_undetected), + cols = c("proteins_detected", "proteins_undetected"), names_to = "type", values_to = "percentage" ) %>% diff --git a/R/qc_ranked_intensities.R b/R/qc_ranked_intensities.R index d3952365..0ae3d707 100644 --- a/R/qc_ranked_intensities.R +++ b/R/qc_ranked_intensities.R @@ -137,7 +137,7 @@ qc_ranked_intensities <- function(data, if (plot == FALSE) { output <- input %>% - dplyr::select(-.data$intensity_plot) + dplyr::select(-"intensity_plot") # return data frame return(output) diff --git a/R/randomise_queue.R b/R/randomise_queue.R index ef496fd4..cbbab4bf 100644 --- a/R/randomise_queue.R +++ b/R/randomise_queue.R @@ -65,7 +65,6 @@ randomise_queue <- function(data = NULL, rows = NULL, export = FALSE) { - # load data interactively if no data is provided in the data argument if (is.null(data)) { path <- file.choose(".") diff --git a/R/volcano_plot.R b/R/volcano_plot.R index e141ec86..3d5a21aa 100644 --- a/R/volcano_plot.R +++ b/R/volcano_plot.R @@ -54,8 +54,8 @@ volcano_protti <- function(...) { #' "-log10(q-value)". #' @param legend_label optional, a character value that specifies the legend label. Default is #' "Target". -#' @param colour optional, a character vector containing colours that should be used to colour -#' points according to the selected method. 
IMPORTANT: the first value in the vector is the +#' @param colour optional, a character vector containing colours that should be used to colour +#' points according to the selected method. IMPORTANT: the first value in the vector is the #' default point colour, the additional values specify colouring of target or significant points. #' E.g. `c("grey60", "#5680C1")` to achieve the same colouring as the default for the "significant" #' method. @@ -160,17 +160,17 @@ volcano_plot <- function(data, protti_colours <- "placeholder" # assign a placeholder to prevent a missing global variable warning utils::data("protti_colours", envir = environment()) # then overwrite it with real data - if (!missing(colour)){ - if(length(colour) < 2){ + if (!missing(colour)) { + if (length(colour) < 2) { stop("Please provide more colours!") } background <- colour[1] additional_colour <- colour[-1] } else { background <- "grey60" - additional_colour <- protti_colours + additional_colour <- protti_colours } - + data <- data %>% tidyr::drop_na({{ log2FC }}, {{ significance }}) @@ -220,11 +220,12 @@ volcano_plot <- function(data, label1 = {{ target_column }}, label2 = {{ grouping }} )) + - geom_point(aes( - x = {{ log2FC }}, - y = -1 * log10({{ significance }}) - ), - colour = background + geom_point( + aes( + x = {{ log2FC }}, + y = -1 * log10({{ significance }}) + ), + colour = background ) + geom_point( data = dplyr::filter(data, .data$target == TRUE), @@ -241,9 +242,11 @@ volcano_plot <- function(data, y = y_axis_label, color = legend_label ) + - {if (nrow(cutoff_line) != 0){ - geom_hline(data = cutoff_line, aes(yintercept = .data$mean_adjusted_cutoff), linetype = "dashed") - }}+ + { + if (nrow(cutoff_line) != 0) { + geom_hline(data = cutoff_line, aes(yintercept = .data$mean_adjusted_cutoff), linetype = "dashed") + } + } + geom_vline(xintercept = log2FC_cutoff, linetype = "dashed") + geom_vline(xintercept = -log2FC_cutoff, linetype = "dashed") + { @@ -291,11 +294,12 @@ volcano_plot <- 
function(data, label1 = {{ target_column }}, label2 = {{ grouping }} )) + - geom_point(aes( - x = {{ log2FC }}, - y = -log10({{ significance }}) - ), - colour = background + geom_point( + aes( + x = {{ log2FC }}, + y = -log10({{ significance }}) + ), + colour = background ) + geom_point( data = dplyr::filter(data, (abs({{ log2FC }}) > log2FC_cutoff) & ({{ significance }} < .data$mean_adjusted_cutoff)), @@ -311,9 +315,11 @@ volcano_plot <- function(data, x = x_axis_label, y = y_axis_label ) + - {if (nrow(cutoff_line) != 0){ - geom_hline(data = cutoff_line, aes(yintercept = .data$mean_adjusted_cutoff), linetype = "dashed") - }}+ + { + if (nrow(cutoff_line) != 0) { + geom_hline(data = cutoff_line, aes(yintercept = .data$mean_adjusted_cutoff), linetype = "dashed") + } + } + geom_vline(xintercept = log2FC_cutoff, linetype = "dashed") + geom_vline(xintercept = -1 * log2FC_cutoff, linetype = "dashed") + { diff --git a/R/woods_plot.R b/R/woods_plot.R index 9b8330cd..b491fe8f 100644 --- a/R/woods_plot.R +++ b/R/woods_plot.R @@ -140,24 +140,26 @@ woods_plot <- function(data, plots <- purrr::map2(.x = data_facet, .y = names(data_facet), function(x, y) { pb$tick() ggplot2::ggplot(data = x) + - ggplot2::geom_rect(ggplot2::aes( - xmin = 0, - xmax = {{ protein_length }}, - ymin = -0.01, - ymax = 0.01 - ), - fill = "black" + ggplot2::geom_rect( + ggplot2::aes( + xmin = 0, + xmax = {{ protein_length }}, + ymin = -0.01, + ymax = 0.01 + ), + fill = "black" ) + - ggplot2::geom_rect(ggplot2::aes( - xmin = {{ start_position }}, - xmax = {{ end_position }}, - ymin = {{ fold_change }} - 0.2, - ymax = {{ fold_change }} + 0.2, - fill = {{ colouring }} - ), - col = "black", - size = 0.7, - alpha = 0.8 + ggplot2::geom_rect( + ggplot2::aes( + xmin = {{ start_position }}, + xmax = {{ end_position }}, + ymin = {{ fold_change }} - 0.2, + ymax = {{ fold_change }} + 0.2, + fill = {{ colouring }} + ), + col = "black", + size = 0.7, + alpha = 0.8 ) + { if (!highlight_missing) { diff --git 
a/README.Rmd b/README.Rmd index e41f5b17..47dab585 100644 --- a/README.Rmd +++ b/README.Rmd @@ -119,15 +119,17 @@ The data this function creates is similar to data obtained from a LiP-MS experim set.seed(42) # Makes example reproducible # Create synthetic data -data <- create_synthetic_data(n_proteins = 100, - frac_change = 0.05, - n_replicates = 4, - n_conditions = 2, - method = "effect_random", - additional_metadata = FALSE) - -# The method "effect_random" as opposed to "dose-response" just randomly samples -# the extend of the change of significantly changing peptides for each condition. +data <- create_synthetic_data( + n_proteins = 100, + frac_change = 0.05, + n_replicates = 4, + n_conditions = 2, + method = "effect_random", + additional_metadata = FALSE +) + +# The method "effect_random" as opposed to "dose-response" just randomly samples +# the extend of the change of significantly changing peptides for each condition. # They do not follow any trend and can go in any direction. ``` @@ -148,10 +150,12 @@ In addition to filtering and log2 transformation it is also advised to normalise _Note: If your search tool already normalised your data you should not normalise it another time._ ```{r normalisation, message=FALSE, warning=FALSE} -normalised_data <- data %>% - normalise(sample = sample, - intensity_log2 = peptide_intensity_missing, - method = "median") +normalised_data <- data %>% + normalise( + sample = sample, + intensity_log2 = peptide_intensity_missing, + method = "median" + ) ``` #### Assign Missingness @@ -161,16 +165,18 @@ The next step is to deal with missing data points. You could choose to impute mi If a certain condition has all replicates while the other one has less than 20% (adjusted downward) of total possible replicates, the case is considered to be "missing not at random" (`MNAR`). In order to be labeled "missing at random" (`MAR`) 70% (adjusted downward) of total replicates need to be present in both conditions. 
If you performed an experiment with 4 replicates that means that both conditions need to contain at least 2 observations. Comparisons that have too few observations are labeled `NA`. These will not be imputed if imputation is performed later on using the `impute()` function. You can read the exact details in the documentation of this function and also adjust the thresholds if you want to be more or less conservative with how many data points to retain. ```{r assign_missingness, message=FALSE, warning=FALSE} -data_missing <- normalised_data %>% - assign_missingness(sample = sample, - condition = condition, - grouping = peptide, - intensity = normalised_intensity_log2, - ref_condition = "condition_1", - retain_columns = c(protein, change_peptide)) - -# Next to the columns it generates, assign_missingness only contains the columns -# you provide as input in its output. If you want to retain additional columns you +data_missing <- normalised_data %>% + assign_missingness( + sample = sample, + condition = condition, + grouping = peptide, + intensity = normalised_intensity_log2, + ref_condition = "condition_1", + retain_columns = c(protein, change_peptide) + ) + +# Next to the columns it generates, assign_missingness only contains the columns +# you provide as input in its output. If you want to retain additional columns you # can provide them in the retain_columns argument. ``` @@ -183,28 +189,32 @@ For the calculation of abundance changes and the associated significances **prot The type of missingness assigned to a comparison does not have any influence on the statistical test. However, by default (can be changed) comparisons with missingness `NA` are filtered out prior to p-value adjustment. This means that in addition to imputation, the user can use missingness cutoffs also in order to define which comparisons are too incomplete to be trustworthy even if significant. 
```{r calculate_diff_abundance, message=FALSE, warning=FALSE} -result <- data_missing %>% - calculate_diff_abundance(sample = sample, - condition = condition, - grouping = peptide, - intensity_log2 = normalised_intensity_log2, - missingness = missingness, - comparison = comparison, - filter_NA_missingness = TRUE, - method = "moderated_t-test", - retain_columns = c(protein, change_peptide)) +result <- data_missing %>% + calculate_diff_abundance( + sample = sample, + condition = condition, + grouping = peptide, + intensity_log2 = normalised_intensity_log2, + missingness = missingness, + comparison = comparison, + filter_NA_missingness = TRUE, + method = "moderated_t-test", + retain_columns = c(protein, change_peptide) + ) ``` Next we can use a Volcano plot to visualize significantly changing peptides with the function `volcano_plot()`. You can choose to create an interactive plot with the `interactive` argument. Please note that this is not recommended for large datasets. ```{r volcano, fig.height = 5, fig.width = 8, message=FALSE, warning=FALSE} -result %>% - volcano_plot(grouping = peptide, - log2FC = diff, - significance = pval, - method = "target", - target_column = change_peptide, - target = TRUE, - legend_label = "Ground Truth", - significance_cutoff = c(0.05, "adj_pval")) +result %>% + volcano_plot( + grouping = peptide, + log2FC = diff, + significance = pval, + method = "target", + target_column = change_peptide, + target = TRUE, + legend_label = "Ground Truth", + significance_cutoff = c(0.05, "adj_pval") + ) ``` diff --git a/data-raw/metal_go_slim_subset.R b/data-raw/metal_go_slim_subset.R index d5f41d07..6b7cba29 100644 --- a/data-raw/metal_go_slim_subset.R +++ b/data-raw/metal_go_slim_subset.R @@ -57,7 +57,7 @@ parent_metal_ids <- c( "GO:0015625", # ABC-type ferric hydroxamate transporter activity "GO:0015345", # ferric enterobactin:proton symporter activity "GO:1903981", # enterobactin binding - "GO:0004076", # biotin synthase activity + "GO:0004076", # 
biotin synthase activity "GO:0016041", # glutamate synthase (ferredoxin) activity "GO:0018695", # 4-cresol dehydrogenase (hydroxylating) activity "GO:0018694", # p-cymene methyl hydroxylase activity @@ -320,7 +320,7 @@ terms_metal <- terms %>% filter(chebi_id %in% unique(metal_chebi$chebi_accession)) %>% filter(!main_id %in% metal_slim_subset_annotated$slims_from_id) -terms_metal_id_name <- terms_metal %>% +terms_metal_id_name <- terms_metal %>% distinct(main_id, main_name) terms_metal_paste <- paste0(paste0('"', terms_metal_id_name$main_id, '", \\# ', terms_metal_id_name$main_name), collapse = "\n") @@ -330,14 +330,16 @@ terms_metal_paste <- paste0(paste0('"', terms_metal_id_name$main_id, '", \\# ', # The igraph package is used for this go_network <- metal_slim_subset_annotated %>% - mutate(chebi_id = ifelse(!(chebi_id %in% metal_chebi$chebi_accession) | is.na(chebi_id), NA, chebi_id), - relations_relation = ifelse(is.na(chebi_id), NA, relations_relation), - relations_url = ifelse(is.na(chebi_id), NA, relations_url), - database = ifelse(is.na(chebi_id), NA, database), - relations_term = ifelse(is.na(chebi_id), NA, relations_term)) %>% - group_by(slims_from_id) %>% - filter(all(is.na(chebi_id)) | !is.na(chebi_id)) %>% - ungroup() %>% + mutate( + chebi_id = ifelse(!(chebi_id %in% metal_chebi$chebi_accession) | is.na(chebi_id), NA, chebi_id), + relations_relation = ifelse(is.na(chebi_id), NA, relations_relation), + relations_url = ifelse(is.na(chebi_id), NA, relations_url), + database = ifelse(is.na(chebi_id), NA, database), + relations_term = ifelse(is.na(chebi_id), NA, relations_term) + ) %>% + group_by(slims_from_id) %>% + filter(all(is.na(chebi_id)) | !is.na(chebi_id)) %>% + ungroup() %>% # exclude all non-metal IDs according to the above ChEBI list # this might however also exclude non-metal ChEBI IDs if they are present. 
# One can run this pipeline once commenting these arguments out and checking for new non-metal IDs diff --git a/man/calculate_go_enrichment.Rd b/man/calculate_go_enrichment.Rd index bd89aed8..5369dd1d 100644 --- a/man/calculate_go_enrichment.Rd +++ b/man/calculate_go_enrichment.Rd @@ -101,42 +101,41 @@ uniprot_go_data <- fetch_uniprot_proteome( ) ) -if(!is(data, "character")){ - -data <- uniprot_go_data \%>\% - mutate(significant = c( - rep(TRUE, 1000), - rep(FALSE, n() - 1000) - )) \%>\% - mutate(significant = ifelse( - str_detect( - go_f, - pattern = "ribosome" - ), - FALSE, - significant - )) - -# Plot gene ontology enrichment -calculate_go_enrichment( - data, - protein_id = accession, - go_annotations_uniprot = go_f, - is_significant = significant, - plot = TRUE, - plot_cutoff = "pval 0.01" -) +if (!is(data, "character")) { + data <- uniprot_go_data \%>\% + mutate(significant = c( + rep(TRUE, 1000), + rep(FALSE, n() - 1000) + )) \%>\% + mutate(significant = ifelse( + str_detect( + go_f, + pattern = "ribosome" + ), + FALSE, + significant + )) + + # Plot gene ontology enrichment + calculate_go_enrichment( + data, + protein_id = accession, + go_annotations_uniprot = go_f, + is_significant = significant, + plot = TRUE, + plot_cutoff = "pval 0.01" + ) -# Calculate gene ontology enrichment -go_enrichment <- calculate_go_enrichment( - data, - protein_id = accession, - go_annotations_uniprot = go_f, - is_significant = significant, - plot = FALSE, -) + # Calculate gene ontology enrichment + go_enrichment <- calculate_go_enrichment( + data, + protein_id = accession, + go_annotations_uniprot = go_f, + is_significant = significant, + plot = FALSE, + ) -head(go_enrichment, n = 10) + head(go_enrichment, n = 10) } } } diff --git a/man/calculate_kegg_enrichment.Rd b/man/calculate_kegg_enrichment.Rd index 26a9694d..3d2878f4 100644 --- a/man/calculate_kegg_enrichment.Rd +++ b/man/calculate_kegg_enrichment.Rd @@ -64,13 +64,14 @@ kegg_data <- fetch_kegg(species = "eco") if 
(!is.null(kegg_data)) { # only proceed if information was retrieved data <- kegg_data \%>\% group_by(uniprot_id) \%>\% - mutate(significant = rep(sample( - x = c(TRUE, FALSE), - size = 1, - replace = TRUE, - prob = c(0.2, 0.8) - ), - n = n() + mutate(significant = rep( + sample( + x = c(TRUE, FALSE), + size = 1, + replace = TRUE, + prob = c(0.2, 0.8) + ), + n = n() )) # Plot KEGG enrichment diff --git a/vignettes/data_analysis_dose_response_workflow.Rmd b/vignettes/data_analysis_dose_response_workflow.Rmd index a954a2e9..be538156 100644 --- a/vignettes/data_analysis_dose_response_workflow.Rmd +++ b/vignettes/data_analysis_dose_response_workflow.Rmd @@ -25,7 +25,7 @@ knitr::opts_chunk$set( # Introduction ```{r CRAN_comment, message=FALSE, warning=FALSE, echo=FALSE} -if (build_vignette_on_cran == FALSE){ +if (build_vignette_on_cran == FALSE) { print("!!! IMPORTANT !!!") print("This Vignette has not been built completely on CRAN due to size limitations.") print("Please check the correct version here: ") @@ -92,12 +92,14 @@ In addition to the removal of decoys we also remove any non-proteotypic peptides ```{r filter_transform_normalise, eval = test_protti} # Filter, log2 transform and normalise data -data_normalised <- rapamycin_dose_response %>% - filter(eg_is_decoy == FALSE) %>% - mutate(intensity_log2 = log2(fg_quantity)) %>% - normalise(sample = r_file_name, - intensity_log2 = intensity_log2, - method = "median") %>% +data_normalised <- rapamycin_dose_response %>% + filter(eg_is_decoy == FALSE) %>% + mutate(intensity_log2 = log2(fg_quantity)) %>% + normalise( + sample = r_file_name, + intensity_log2 = intensity_log2, + method = "median" + ) %>% filter(pep_is_proteotypic == TRUE) ``` @@ -106,14 +108,16 @@ It is also useful to check the intensity distribution of all precursors with the For experiments measured in data independent acquisition (DIA) mode, it is good to filter out very low intensity values that are not part of the distribution. 
These values are likely false assignments of peaks. If you look closely you can see a small peak around 0 corresponding to these values. In the case of our distribution we could choose a cutoff at a log2 intensity of 5, which corresponds to a raw intensity of 32. ```{r intensity_distribution, eval = test_protti, fig.align = "center", fig.width = 7, fig.height = 5} -qc_intensity_distribution(data = data_normalised, - grouping = eg_precursor_id, - intensity = normalised_intensity_log2, - plot_style = "histogram") +qc_intensity_distribution( + data = data_normalised, + grouping = eg_precursor_id, + intensity = normalised_intensity_log2, + plot_style = "histogram" +) ``` ```{r intensity_filtering, eval = test_protti} -data_normalised <- data_normalised %>% +data_normalised <- data_normalised %>% filter(normalised_intensity_log2 > 5) ``` @@ -124,11 +128,13 @@ Before we fit dose-response models to our data, we can check how samples cluster _Note: Many of **protti**'s plotting functions also have the option to display an interactive version of the plot with an `interactive` argument._ ```{r sample_correlation, eval = test_protti, fig.align = "center", fig.width = 7, fig.height = 5} -qc_sample_correlation(data = data_normalised, - sample = r_file_name, - grouping = eg_precursor_id, - intensity_log2 = normalised_intensity_log2, - condition = r_condition) +qc_sample_correlation( + data = data_normalised, + sample = r_file_name, + grouping = eg_precursor_id, + intensity_log2 = normalised_intensity_log2, + condition = r_condition +) ``` For our rapamycin data set we cannot identify any clear sample clustering based on correlation, because only a small subset of proteins was selected for this example. Furthermore, data sets in which only a few changes are expected commonly do not cluster nicely. This is because precursors/peptides/proteins have very similar intensity values across conditions and clustering is in this case based on variance.
In case of more global changes samples should usually cluster nicely. @@ -136,11 +142,13 @@ For our rapamycin data set we cannot identify any clear sample clustering based In addition to correlation based clustering a principal component analysis can be performed using the **protti** function `qc_pca()`. Similar to before, good clustering is usually dependent on the number of changing precursors (or peptides/proteins). ```{r sample_pca, eval = test_protti, fig.align = "center", fig.width = 7, fig.height = 5, warning = FALSE, message = FALSE} -qc_pca(data = data_normalised, - sample = r_file_name, - grouping = eg_precursor_id, - intensity = normalised_intensity_log2, - condition = r_condition) +qc_pca( + data = data_normalised, + sample = r_file_name, + grouping = eg_precursor_id, + intensity = normalised_intensity_log2, + condition = r_condition +) ``` # Fitting dose-response curves @@ -159,13 +167,15 @@ $b$ is the Hill's coefficient (e.g. the negative slope at the inflection point) The output of `fit_drc_4p()` provides extensive information on the goodness of fit. Furthermore, the function filters and ranks fits based on several criteria which include completeness of data and a significance cutoff based on an adjusted p-value obtained from ANOVA. For details about the filtering steps you can read the function documentation by calling `?fit_drc_4p`. If you only care about your potential hits (and exclude precursors/peptides/proteins that do not pass the filtering e.g. due to too few observations), you can choose `filter = "pre"`, which filters the data before model fitting. This speeds up the process because fewer models need to be fit. If you want to perform enrichment analysis later on you should keep the default `filter = "post"`. This will fit all possible models and then only annotate models based on whether they passed or failed the filtering step (`passed_filter`).
```{r model_fit, eval = test_protti} -fit <- data_normalised %>% - fit_drc_4p(sample = r_file_name, - grouping = eg_precursor_id, - response = normalised_intensity_log2, - dose = r_condition, - filter = "post", - retain_columns = c(pg_protein_accessions)) +fit <- data_normalised %>% + fit_drc_4p( + sample = r_file_name, + grouping = eg_precursor_id, + response = normalised_intensity_log2, + dose = r_condition, + filter = "post", + retain_columns = c(pg_protein_accessions) + ) # make sure to retain columns that you need later but that are not part of the function ``` @@ -179,13 +189,15 @@ _Note: Keep in mind that by spreading the model fitting over multiple cores, mor future::plan(future::multisession, workers = 3) # fit models in parallel -parallel_fit <- data_normalised %>% - parallel_fit_drc_4p(sample = r_file_name, - grouping = eg_precursor_id, - response = normalised_intensity_log2, - dose = r_condition, - retain_columns = c(pg_protein_accessions), - n_cores = 3) +parallel_fit <- data_normalised %>% + parallel_fit_drc_4p( + sample = r_file_name, + grouping = eg_precursor_id, + response = normalised_intensity_log2, + dose = r_condition, + retain_columns = c(pg_protein_accessions), + n_cores = 3 + ) # remove workers again after you are done future::plan(future::sequential) @@ -194,19 +206,23 @@ future::plan(future::sequential) If we examine all precursors based on their rank (calculated from correlation and ANOVA q-value), we can see that as expected most of them are FKBP12 (P62942) peptides. One benefit of analysing LiP-MS data on the precursor level (rather than peptide level) is that multiple lines of evidence for a change in the specific peptide can be used (we have data for each different charge and modification state). Therefore, it is always good to check if there are other precursors of a good scoring peptide that do not show any regulation at all. This would mean that the reason for the observed response is not based on a biological effect. 
```{r result_analysis, eval = test_protti, echo=FALSE, results='asis'} -fit %>% - filter(rank <= 20) %>% - select(rank, - score, - eg_precursor_id, - pg_protein_accessions, - anova_adj_pval, - correlation, - ec_50) %>% - mutate(anova_adj_pval = format(anova_adj_pval, digits = 3), - correlation = format(correlation, digits = 3), - ec_50 = format(ec_50, digits = 2), - score = format(score, digits = 3)) %>% +fit %>% + filter(rank <= 20) %>% + select( + rank, + score, + eg_precursor_id, + pg_protein_accessions, + anova_adj_pval, + correlation, + ec_50 + ) %>% + mutate( + anova_adj_pval = format(anova_adj_pval, digits = 3), + correlation = format(correlation, digits = 3), + ec_50 = format(ec_50, digits = 2), + score = format(score, digits = 3) + ) %>% knitr::kable(caption = "All hits") ``` @@ -219,12 +235,13 @@ Note: The output of `fit_drc_4p()` includes two columns with data frames contain ```{r model_plot, eval = test_protti, fig.align = "center", fig.width = 7, fig.height = 5, message = FALSE, warning = FALSE} # Model plotting drc_4p_plot(fit, - grouping = eg_precursor_id, - dose = r_condition, - response = normalised_intensity_log2, - targets = "_VFDVELLKLE_.2", - unit = "pM", - export = FALSE) + grouping = eg_precursor_id, + dose = r_condition, + response = normalised_intensity_log2, + targets = "_VFDVELLKLE_.2", + unit = "pM", + export = FALSE +) ``` # Further analysis @@ -249,13 +266,13 @@ unis <- unique(fit$pg_protein_accessions) uniprot <- fetch_uniprot(unis) # annotation of fit data based on information from UniProt -fit_annotated <- fit %>% +fit_annotated <- fit %>% # columns containing proteins IDs are named differently - left_join(uniprot, by = c("pg_protein_accessions" = "accession")) %>% + left_join(uniprot, by = c("pg_protein_accessions" = "accession")) %>% # mark peptides that pass the filtering - mutate(passed_filter = !is.na(rank)) %>% + mutate(passed_filter = !is.na(rank)) %>% # create new column with prior knowledge about binding partners of 
treatment - mutate(binds_treatment = pg_protein_accessions == "P62942") + mutate(binds_treatment = pg_protein_accessions == "P62942") ``` ## Enrichment and network analysis @@ -268,44 +285,50 @@ Nevertheless, we will demonstrate below how you could use some additional functi ### GO enrichment using "molecular function" annotation from UniProt calculate_go_enrichment(fit_annotated, - protein_id = pg_protein_accessions, - is_significant = passed_filter, - go_annotations_uniprot = go_f) # column obtained from UniProt + protein_id = pg_protein_accessions, + is_significant = passed_filter, + go_annotations_uniprot = go_f +) # column obtained from UniProt ### KEGG pathway enrichment -# First you need to load KEGG pathway annotations from the KEGG database -# for your specific organism of interest. In this case HeLa cells were +# First you need to load KEGG pathway annotations from the KEGG database +# for your specific organism of interest. In this case HeLa cells were # used, therefore the organism of interest is homo sapiens (hsa) kegg <- fetch_kegg(species = "hsa") # Next we need to annotate our data with KEGG pathway IDs and perform enrichment analysis -fit %>% +fit %>% # columns containing proteins IDs are named differently - left_join(kegg, by = c("pg_protein_accessions" = "uniprot_id")) %>% - calculate_kegg_enrichment(protein_id = pg_protein_accessions, - is_significant = passed_filter, - pathway_id = pathway_id, # column name from kegg data frame - pathway_name = pathway_name) # column name from kegg data frame + left_join(kegg, by = c("pg_protein_accessions" = "uniprot_id")) %>% + calculate_kegg_enrichment( + protein_id = pg_protein_accessions, + is_significant = passed_filter, + pathway_id = pathway_id, # column name from kegg data frame + pathway_name = pathway_name + ) # column name from kegg data frame ### Treatment enrichment analysis calculate_treatment_enrichment(fit_annotated, - protein_id = pg_protein_accessions, - is_significant = passed_filter, - 
binds_treatment = binds_treatment, - treatment_name = "Rapamycin") + protein_id = pg_protein_accessions, + is_significant = passed_filter, + binds_treatment = binds_treatment, + treatment_name = "Rapamycin" +) ### Network analysis -fit_annotated %>% +fit_annotated %>% filter(passed_filter == TRUE) %>% # only analyse hits that were significant - analyse_functional_network(protein_id = pg_protein_accessions, - string_id = xref_string, # column from UniProt containing STRING IDs - organism_id = 9606, - # tax ID can be found in function documentation or STRING database - binds_treatment = binds_treatment, - plot = TRUE) + analyse_functional_network( + protein_id = pg_protein_accessions, + string_id = xref_string, # column from UniProt containing STRING IDs + organism_id = 9606, + # tax ID can be found in function documentation or STRING database + binds_treatment = binds_treatment, + plot = TRUE + ) ``` diff --git a/vignettes/data_analysis_single_dose_treatment_workflow.Rmd b/vignettes/data_analysis_single_dose_treatment_workflow.Rmd index 627d02b7..ae7813c0 100644 --- a/vignettes/data_analysis_single_dose_treatment_workflow.Rmd +++ b/vignettes/data_analysis_single_dose_treatment_workflow.Rmd @@ -128,16 +128,20 @@ _Note: The use of the `filter_cv()` function is optional. 
It might remove a lot data_normalised <- rapamycin_10uM %>% filter(eg_is_decoy == FALSE) %>% mutate(intensity_log2 = log2(fg_quantity)) %>% - normalise(sample = r_file_name, - intensity_log2 = intensity_log2, - method = "median") + normalise( + sample = r_file_name, + intensity_log2 = intensity_log2, + method = "median" + ) data_filtered <- data_normalised %>% - filter_cv(grouping = eg_precursor_id, - condition = r_condition, - log2_intensity = intensity_log2, - cv_limit = 0.25, - min_conditions = 1) + filter_cv( + grouping = eg_precursor_id, + condition = r_condition, + log2_intensity = intensity_log2, + cv_limit = 0.25, + min_conditions = 1 + ) ``` ### Remove non-proteotypic peptides @@ -183,19 +187,27 @@ uniprot <- "length", "sequence" ) - ) %>% + ) %>% rename(pg_protein_accessions = accession) data_filtered_uniprot <- data_filtered_proteotypic %>% - left_join(y = uniprot, - by = "pg_protein_accessions") %>% - find_peptide(protein_sequence = sequence, - peptide_sequence = pep_stripped_sequence) %>% - assign_peptide_type(aa_before = aa_before, - last_aa = last_aa, - aa_after = aa_after) %>% - calculate_sequence_coverage(protein_sequence = sequence, - peptides = pep_stripped_sequence) + left_join( + y = uniprot, + by = "pg_protein_accessions" + ) %>% + find_peptide( + protein_sequence = sequence, + peptide_sequence = pep_stripped_sequence + ) %>% + assign_peptide_type( + aa_before = aa_before, + last_aa = last_aa, + aa_after = aa_after + ) %>% + calculate_sequence_coverage( + protein_sequence = sequence, + peptides = pep_stripped_sequence + ) ``` With the `qc_sequence_coverage()` function, you check how sequence coverage is distributed over all proteins in the sample. Usually, the center of the distribution is low due to many proteins with poor coverage. For this small data set with only 40 proteins the sequence coverage is distributed relatively evenly. 
@@ -246,13 +258,15 @@ diff_abundance_data <- data_filtered_uniprot %>% ref_condition = "control", completeness_MAR = 0.7, completeness_MNAR = 0.25, - retain_columns = c(pg_protein_accessions, - go_f, - xref_string, - start, - end, - length, - coverage) + retain_columns = c( + pg_protein_accessions, + go_f, + xref_string, + start, + end, + length, + coverage + ) ) %>% calculate_diff_abundance( sample = r_file_name, @@ -262,14 +276,16 @@ diff_abundance_data <- data_filtered_uniprot %>% missingness = missingness, comparison = comparison, method = "moderated_t-test", - retain_columns = c(pg_protein_accessions, - go_f, - xref_string, - start, - end, - length, - coverage) - ) + retain_columns = c( + pg_protein_accessions, + go_f, + xref_string, + start, + end, + length, + coverage + ) + ) ``` ### p-value distribution @@ -277,10 +293,11 @@ diff_abundance_data <- data_filtered_uniprot %>% The p-value calculated with the moderated t-test is automatically adjusted for multiple testing using the Benjamini-Hochberg correction. This assures that we keep the false discovery rate low. An assumption of this correction is however, that p-values should have an overall uniform distribution. If there is an effect in the data, there will be an increased frequency of low p-values. You can check this by using the protti function `pval_distribution_plot()`. This also helps you assess whether your p-value distribution fulfills the assumptions for your selected FDR control. The [cp4p](https://CRAN.R-project.org/package=cp4p) R package is another great way to check the assumptions underlying FDR control in quantitative experiments. 
```{r pval_distribution, eval = test_protti, message = FALSE, warning = FALSE, fig.align= "center", fig.width = 6, fig.height = 5} -pval_distribution_plot(data = diff_abundance_data, - grouping = eg_precursor_id, - pval = pval - ) +pval_distribution_plot( + data = diff_abundance_data, + grouping = eg_precursor_id, + pval = pval +) ``` For this subset of data the distribution of p-values is relatively flat and there is no large increase in values in the low p-value range (the distribution is uniform when a lot of your hypotheses are null). This is likely because, for this experiment, only a very small fraction of peptides show changes. @@ -307,10 +324,10 @@ volcano_plot( target_column = pg_protein_accessions, target = "P62942", x_axis_label = "log2(fold change) Rapamycin treated vs. untreated", - significance_cutoff = c(0.05, "adj_pval") + significance_cutoff = c(0.05, "adj_pval") ) -# The significance_cutoff argument can also just be used for a +# The significance_cutoff argument can also just be used for a # regular cutoff line by just providing the cutoff value, e.g. # significance_cutoff = 0.05 ``` @@ -334,7 +351,7 @@ barcode_plot( colouring = diff, cutoffs = c(diff = 1, adj_pval = 0.05), protein_id = pg_protein_accessions - ) +) ``` ### Woods' plot @@ -344,7 +361,6 @@ An additional way to plot LiP-MS changes is the Woods' plot. This plot will show To produce a Woods' plot we use the function `woods_plot()` and colour the peptides according to their adjusted p-values. We are highlighting significant adjusted p-values (< 0.01) with an asterisk. Peptides can also be coloured by another categorical or continuous variable. Asterisks can be added for any logical (binary) variable.
```{r woods_plot, eval = test_protti, fig.align = "center", fig.width = 6, message = FALSE, warning = FALSE} - FKBP12 <- FKBP12 %>% mutate(significant = ifelse(adj_pval < 0.01, TRUE, FALSE)) @@ -356,11 +372,11 @@ woods_plot( protein_length = length, coverage = coverage, colouring = adj_pval, - protein_id = pg_protein_accessions, + protein_id = pg_protein_accessions, facet = FALSE, fold_change_cutoff = 1, highlight = significant - ) +) ``` ### Peptide profile plots @@ -369,7 +385,7 @@ To see how the individual precursors in our target protein are changing with the If you have protein abundance data you can also use the plot to show changes in protein abundance over your treatment condition(s). By selecting multiple targets (as a vector) you can produce the plot for multiple proteins. ```{r protile_plot, eval = test_protti, fig.align = "center", fig.width = 20, fig.height = 6, message = FALSE, warning = FALSE} -FKBP12_intensity <- data_filtered_uniprot %>% +FKBP12_intensity <- data_filtered_uniprot %>% filter(pg_protein_accessions == "P62942") peptide_profile_plot( @@ -398,9 +414,9 @@ If you know all known interactors of your specific treatment you can check for a ```{r additional_functions, eval=FALSE} diff_abundance_significant <- diff_abundance_data %>% # mark significant peptides - mutate(is_significant = ifelse((adj_pval < 0.01 & abs(diff) > 1), TRUE, FALSE)) %>% + mutate(is_significant = ifelse((adj_pval < 0.01 & abs(diff) > 1), TRUE, FALSE)) %>% # mark true positive hits - mutate(binds_treatment = pg_protein_accessions == "P62942") + mutate(binds_treatment = pg_protein_accessions == "P62942") ### GO enrichment using "molecular function" annotation from UniProt @@ -416,37 +432,41 @@ calculate_go_enrichment( network_input <- diff_abundance_significant %>% filter(is_significant == TRUE) -analyse_functional_network(data = network_input, - protein_id = pg_protein_accessions, - string_id = xref_string, - binds_treatment = binds_treatment, - organism_id = 9606) 
+analyse_functional_network( + data = network_input, + protein_id = pg_protein_accessions, + string_id = xref_string, + binds_treatment = binds_treatment, + organism_id = 9606 +) ### KEGG pathway enrichment -# First you need to load KEGG pathway annotations from the KEGG database -# for your specific organism of interest. In this case HeLa cells were +# First you need to load KEGG pathway annotations from the KEGG database +# for your specific organism of interest. In this case HeLa cells were # used, therefore the organism of interest is homo sapiens (hsa) kegg <- fetch_kegg(species = "hsa") # Next we need to annotate our data with KEGG pathway IDs and perform enrichment analysis -diff_abundance_significant %>% +diff_abundance_significant %>% # columns containing proteins IDs are named differently - left_join(kegg, by = c("pg_protein_accessions" = "uniprot_id")) %>% - calculate_kegg_enrichment(protein_id = pg_protein_accessions, - is_significant = is_significant, - pathway_id = pathway_id, - pathway_name = pathway_name) + left_join(kegg, by = c("pg_protein_accessions" = "uniprot_id")) %>% + calculate_kegg_enrichment( + protein_id = pg_protein_accessions, + is_significant = is_significant, + pathway_id = pathway_id, + pathway_name = pathway_name + ) ### Treatment enrichment analysis calculate_treatment_enrichment(diff_abundance_significant, - protein_id = pg_protein_accessions, - is_significant = is_significant, - binds_treatment = binds_treatment, - treatment_name = "Rapamycin") - + protein_id = pg_protein_accessions, + is_significant = is_significant, + binds_treatment = binds_treatment, + treatment_name = "Rapamycin" +) ``` diff --git a/vignettes/input_preparation_workflow.Rmd b/vignettes/input_preparation_workflow.Rmd index 10898fad..3884abbb 100644 --- a/vignettes/input_preparation_workflow.Rmd +++ b/vignettes/input_preparation_workflow.Rmd @@ -80,7 +80,7 @@ Please make sure that the report is a .csv file. 
You can use the `read_protti()` ```{r Spectronaut, eval=FALSE} # To read in your own data you can use read_protti() -spectronaut_data <- read_protti(filename = "mydata/spectronaut.csv") +spectronaut_data <- read_protti(filename = "mydata/spectronaut.csv") ``` # MaxQuant @@ -103,10 +103,10 @@ In this section we will show you how to read in the file with `read_protti()` an ```{r MaxQuant_peptide, eval=FALSE} # To read in your own data you can use read_protti() -evidence <- read_protti(filename = "yourpath/evidence.txt") +evidence <- read_protti(filename = "yourpath/evidence.txt") evidence_proteotypic <- evidence %>% - # adds new column with logicals that are TRUE if the peptide can be assigned + # adds new column with logicals that are TRUE if the peptide can be assigned # to only one protein and FALSE if it can be assigned to multiple mutate(is_proteotypic = str_detect( string = proteins, @@ -114,7 +114,7 @@ evidence_proteotypic <- evidence %>% negate = TRUE )) %>% # adds new column with logicals indicating if peptide is coming from a potential contaminant - mutate(is_contaminant = ifelse(potential_contaminant == "+", TRUE, FALSE)) + mutate(is_contaminant = ifelse(potential_contaminant == "+", TRUE, FALSE)) # Make an annotation data frame and merge it with your data frame to obtain conditions # We are annotating sample 1-3 as controls and samples 4-6 as treated conditions @@ -140,7 +140,7 @@ condition <- c( annotation <- data.frame(file_name, condition) # Combine your long data frame with the annotation -evidence_annotated <- evidence_proteotypic %>% +evidence_annotated <- evidence_proteotypic %>% left_join(y = annotation, by = "file_name") ``` @@ -152,12 +152,12 @@ We will filter the data and use `tidyr`'s `pivot_longer()` to change the format ```{r MaxQuant_protein, eval=FALSE} # To read in your own data you can use read_protti() -protein_groups <- read_protti(filename = "yourpath/proteinGroups.txt") %>% - # adds new column with logicals indicating if protein is a 
potential contaminant, - # you can filter these out later on. You should also consider filtering out proteins - # that were "only identified by site" and reverse hits, as well as proteins with only +protein_groups <- read_protti(filename = "yourpath/proteinGroups.txt") %>% + # adds new column with logicals indicating if protein is a potential contaminant, + # you can filter these out later on. You should also consider filtering out proteins + # that were "only identified by site" and reverse hits, as well as proteins with only # one identified peptide - mutate(is_potential_contaminant = ifelse(potential_contaminant == "+", TRUE, FALSE)) + mutate(is_potential_contaminant = ifelse(potential_contaminant == "+", TRUE, FALSE)) # Change wide format to long format and create new columns called `r_file_name`and `intensity` protein_groups_long <- protein_groups %>% @@ -191,7 +191,7 @@ condition <- c( annotation <- data.frame(file_name, condition) # Combine your long data frame with the annotation -protein_groups_annotated <- protein_groups_long %>% +protein_groups_annotated <- protein_groups_long %>% left_join(y = annotation, by = "file_name") ``` @@ -216,17 +216,17 @@ skyline_data <- read_protti(filename = "yourpath/skyline.csv") skyline_data_int <- skyline_data %>% # create a column with precursor information - mutate(precursor = paste0(peptide_sequence, "_", charge)) %>% + mutate(precursor = paste0(peptide_sequence, "_", charge)) %>% group_by(replicate_name, precursor) %>% # making a new column containing the summed up intensities of all transitions of one precursor - mutate(sum_intensity = sum(area)) %>% + mutate(sum_intensity = sum(area)) %>% select(-c(product_mz, area)) %>% # removing the columns we don't need distinct() # removing duplicated rows from the data frame # Add annotation # make sure that the names are the same name as in your report -replicate_name <- c( - "sample_1", +replicate_name <- c( + "sample_1", "sample_2", "sample_3", "sample_1", @@ -246,9 +246,8 
@@ condition <- c( annotation <- data.frame(replicate_name, condition) # Combine your long data frame with the annotation -skyline_annotated <- skyline_data_int %>% +skyline_annotated <- skyline_data_int %>% left_join(y = annotation, by = "replicate_name") - ``` # Proteome Discoverer @@ -303,7 +302,7 @@ pd_pep_long <- pd_pep_filtered %>% values_to = "intensity" ) %>% # combine peptide sequence and modifications to make a precursor column - mutate(precursor = paste(sequence, modifications)) + mutate(precursor = paste(sequence, modifications)) # Make annotation data frame file_name <- c( # make sure that the names are the same name as in your report @@ -327,7 +326,7 @@ condition <- c( annotation <- data.frame(file_name, condition) # Combine your long data frame with the annotation -pd_pep_long_annotated <- pd_pep_long %>% +pd_pep_long_annotated <- pd_pep_long %>% left_join(y = annotation, by = "file_name") ``` @@ -396,7 +395,7 @@ condition <- c( annotation <- data.frame(file_name, condition) # Combine your long data frame with the annotation -pd_prot_long_annotated <- pd_prot_long %>% +pd_prot_long_annotated <- pd_prot_long %>% left_join(y = annotation, by = "file_name") ``` diff --git a/vignettes/protein_structure_workflow.Rmd b/vignettes/protein_structure_workflow.Rmd index b01047f1..302e7a46 100644 --- a/vignettes/protein_structure_workflow.Rmd +++ b/vignettes/protein_structure_workflow.Rmd @@ -26,7 +26,7 @@ knitr::opts_chunk$set( # Introduction ```{r CRAN_comment, message=FALSE, warning=FALSE, echo=FALSE} -if (build_vignette_on_cran == FALSE){ +if (build_vignette_on_cran == FALSE) { print("!!! 
IMPORTANT !!!") print("This Vignette has not been built completely on CRAN due to size limitations.") print("Please check the correct version here: ") @@ -104,12 +104,14 @@ First we need to annotate the data with information required for structural data uniprot_ids <- unique(ptsi_pgk$pg_protein_accessions) # Fetch UniProt information -uniprot_information <- fetch_uniprot(uniprot_ids = uniprot_ids, - columns = c("sequence", "xref_pdb")) +uniprot_information <- fetch_uniprot( + uniprot_ids = uniprot_ids, + columns = c("sequence", "xref_pdb") +) # Add UniProt information and find peptide positions -ptsi_pgk_annotated <- ptsi_pgk %>% - left_join(uniprot_information, by = c("pg_protein_accessions" = "accession")) %>% +ptsi_pgk_annotated <- ptsi_pgk %>% + left_join(uniprot_information, by = c("pg_protein_accessions" = "accession")) %>% find_peptide(protein_sequence = sequence, peptide_sequence = pep_stripped_sequence) ``` @@ -121,10 +123,10 @@ In the previous step we already retrieved information about the availability of ```{r extract_pdb_info, eval = test_protti, warning = FALSE} # Extract PDB IDs from UniProt information -ptsi_pgk_pdb_ids <- ptsi_pgk_annotated %>% - distinct(pg_protein_accessions, xref_pdb) %>% - mutate(pdb_id = str_split(xref_pdb, pattern = ";")) %>% - unnest(pdb_id) %>% +ptsi_pgk_pdb_ids <- ptsi_pgk_annotated %>% + distinct(pg_protein_accessions, xref_pdb) %>% + mutate(pdb_id = str_split(xref_pdb, pattern = ";")) %>% + unnest(pdb_id) %>% filter(pdb_id != "") # Fetch pdb information @@ -138,11 +140,13 @@ Generally, the molecular information about the structure is divided into "polyme You could for example only consider structures for your analysis that were determined by X-ray crystallography and that have a resolution of below 3 Å. In addition, you could select only the one structure that contains the longest sequence stretch for each of our proteins (the structure with the maximum `length`). 
This can be achieved in the following way: ```{r filter_structures, eval = test_protti, warning = FALSE} -filtered_structures <- ptsi_pgk_pdb_information %>% - filter(experimental_method == "X-ray", - resolution_combined <= 3) %>% - group_by(reference_database_accession) %>% - filter(length == max(length)) %>% +filtered_structures <- ptsi_pgk_pdb_information %>% + filter( + experimental_method == "X-ray", + resolution_combined <= 3 + ) %>% + group_by(reference_database_accession) %>% + filter(length == max(length)) %>% ungroup() ``` @@ -159,8 +163,10 @@ Similar to `fetch_pdb()` PDB identifiers are provided as input to `fetch_pdb_str pdb_ids <- unique(filtered_structures$pdb_ids) # "1ZMR", "2HWG" # Fetch atom level structural information -ptsi_pgk_structure_information <- fetch_pdb_structure(pdb_ids = pdb_ids, - return_data_frame = TRUE) +ptsi_pgk_structure_information <- fetch_pdb_structure( + pdb_ids = pdb_ids, + return_data_frame = TRUE +) ``` ## Fetching atomic structure prediction information from AlphaFold @@ -171,8 +177,10 @@ It is of course also possible to retrieve structure prediction information for p ```{r fetch_alphafold_prediction, eval = test_protti, warning = FALSE} # Fetch atom level structural prediction information from AlphaFold -ptsi_pgk_prediction_information <- fetch_alphafold_prediction(uniprot_ids = uniprot_ids, - return_data_frame = TRUE) +ptsi_pgk_prediction_information <- fetch_alphafold_prediction( + uniprot_ids = uniprot_ids, + return_data_frame = TRUE +) # Example for fetching all predictions for Methanocaldococcus jannaschii # mj_predictions <- fetch_alphafold_prediction(organism_name = "Methanocaldococcus jannaschii") @@ -186,75 +194,81 @@ With **protti** we provide the function `fetch_alphafold_aligned_error()` that a ```{r alphafold_domain_prediction, eval = test_protti, warning = FALSE} # Fetch aligned errors -aligned_error <- fetch_alphafold_aligned_error(uniprot_ids = uniprot_ids, - error_cutoff = 4) +aligned_error <- 
fetch_alphafold_aligned_error( + uniprot_ids = uniprot_ids, + error_cutoff = 4 +) # Predict protein domains with graph_resolution of 1 -af_domains_res_1 <- predict_alphafold_domain(pae_list = aligned_error, - return_data_frame = TRUE, - graph_resolution = 1) # Default +af_domains_res_1 <- predict_alphafold_domain( + pae_list = aligned_error, + return_data_frame = TRUE, + graph_resolution = 1 +) # Default # Predict protein domains with graph_resolution of 0.5 -af_domains_res_05 <- predict_alphafold_domain(pae_list = aligned_error, - return_data_frame = TRUE, - graph_resolution = 0.5) +af_domains_res_05 <- predict_alphafold_domain( + pae_list = aligned_error, + return_data_frame = TRUE, + graph_resolution = 0.5 +) ``` #### ptsI domain prediction with `graph_resolution = 1` ```{r show_model_P08839_res1, eval = test_protti, echo=FALSE, warning=FALSE} # This code won't be seen in the Vignette -if(!is.null(af_domains_res_1)){ -# Fetch structure file for model -protti:::try_query("https://alphafold.ebi.ac.uk/files/AF-P08839-F1-model_v3.pdb", - type = "text/tab-separated-values", - col_names = FALSE, - quote = "", - show_col_types = FALSE, - progress = FALSE -) %>% - readr::write_tsv( - file = paste0(tempdir(), "/AF-P08839-F1-model_v3.pdb"), - quote = "none", - escape = "none", +if (!is.null(af_domains_res_1)) { + # Fetch structure file for model + protti:::try_query("https://alphafold.ebi.ac.uk/files/AF-P08839-F1-model_v3.pdb", + type = "text/tab-separated-values", col_names = FALSE, + quote = "", + show_col_types = FALSE, progress = FALSE - ) - -# Load the r3dmol package -library(r3dmol) - -# Extract domain positions - -domain1 <- af_domains_res_1 %>% - filter(accession == "P08839" & domain == 1) %>% - pull(residue) - -domain2 <- af_domains_res_1 %>% - filter(accession == "P08839" & domain == 2) %>% - pull(residue) - -domain3 <- af_domains_res_1 %>% - filter(accession == "P08839" & domain == 3) %>% - pull(residue) - -# Create model -r3dmol() %>% - m_add_model(data = 
paste0(tempdir(), "/AF-P08839-F1-model_v3.pdb"), format = "pdb") %>% - m_set_style(style = m_style_cartoon()) %>% - m_add_style( - style = m_style_cartoon(color = "#8047d6"), - sel = m_sel(resi = domain1) - ) %>% - m_add_style( - style = m_style_cartoon(color = "#96d647"), - sel = m_sel(resi = domain2) - ) %>% - m_add_style( - style = m_style_cartoon(color = "#FF7276"), - sel = m_sel(resi = domain3) - ) %>% - m_zoom_to() + ) %>% + readr::write_tsv( + file = paste0(tempdir(), "/AF-P08839-F1-model_v3.pdb"), + quote = "none", + escape = "none", + col_names = FALSE, + progress = FALSE + ) + + # Load the r3dmol package + library(r3dmol) + + # Extract domain positions + + domain1 <- af_domains_res_1 %>% + filter(accession == "P08839" & domain == 1) %>% + pull(residue) + + domain2 <- af_domains_res_1 %>% + filter(accession == "P08839" & domain == 2) %>% + pull(residue) + + domain3 <- af_domains_res_1 %>% + filter(accession == "P08839" & domain == 3) %>% + pull(residue) + + # Create model + r3dmol() %>% + m_add_model(data = paste0(tempdir(), "/AF-P08839-F1-model_v3.pdb"), format = "pdb") %>% + m_set_style(style = m_style_cartoon()) %>% + m_add_style( + style = m_style_cartoon(color = "#8047d6"), + sel = m_sel(resi = domain1) + ) %>% + m_add_style( + style = m_style_cartoon(color = "#96d647"), + sel = m_sel(resi = domain2) + ) %>% + m_add_style( + style = m_style_cartoon(color = "#FF7276"), + sel = m_sel(resi = domain3) + ) %>% + m_zoom_to() } ``` @@ -262,38 +276,38 @@ r3dmol() %>% ```{r show_model_P08839_res05, eval = test_protti, echo=FALSE, warning=FALSE} # This code won't be seen in the Vignette -if(!is.null(af_domains_res_1)){ -# Extract domain positions - -domain1 <- af_domains_res_05 %>% - filter(accession == "P08839" & domain == 1) %>% - pull(residue) - -domain2 <- af_domains_res_05 %>% - filter(accession == "P08839" & domain == 2) %>% - pull(residue) - -domain3 <- af_domains_res_05 %>% - filter(accession == "P08839" & domain == 3) %>% - pull(residue) - -# Create 
model -r3dmol() %>% - m_add_model(data = paste0(tempdir(), "/AF-P08839-F1-model_v3.pdb"), format = "pdb") %>% - m_set_style(style = m_style_cartoon()) %>% - m_add_style( - style = m_style_cartoon(color = "#8047d6"), - sel = m_sel(resi = domain1) - ) %>% - m_add_style( - style = m_style_cartoon(color = "#96d647"), - sel = m_sel(resi = domain2) - ) %>% - m_add_style( - style = m_style_cartoon(color = "#FF7276"), - sel = m_sel(resi = domain3) - ) %>% - m_zoom_to() +if (!is.null(af_domains_res_1)) { + # Extract domain positions + + domain1 <- af_domains_res_05 %>% + filter(accession == "P08839" & domain == 1) %>% + pull(residue) + + domain2 <- af_domains_res_05 %>% + filter(accession == "P08839" & domain == 2) %>% + pull(residue) + + domain3 <- af_domains_res_05 %>% + filter(accession == "P08839" & domain == 3) %>% + pull(residue) + + # Create model + r3dmol() %>% + m_add_model(data = paste0(tempdir(), "/AF-P08839-F1-model_v3.pdb"), format = "pdb") %>% + m_set_style(style = m_style_cartoon()) %>% + m_add_style( + style = m_style_cartoon(color = "#8047d6"), + sel = m_sel(resi = domain1) + ) %>% + m_add_style( + style = m_style_cartoon(color = "#96d647"), + sel = m_sel(resi = domain2) + ) %>% + m_add_style( + style = m_style_cartoon(color = "#FF7276"), + sel = m_sel(resi = domain3) + ) %>% + m_zoom_to() } ``` @@ -309,13 +323,14 @@ In the following we are going to identify structure specific peptide start and e ```{r find_peptide, eval = test_protti, warning = FALSE} ptsi_pgk_peptide_structure_positions <- find_peptide_in_structure( - peptide_data = ptsi_pgk_annotated, - peptide = pep_stripped_sequence, - start = start, - end = end, - uniprot_id = pg_protein_accessions, - pdb_data = filtered_structures, - retain_columns = c(eg_precursor_id, diff, adj_pval)) + peptide_data = ptsi_pgk_annotated, + peptide = pep_stripped_sequence, + start = start, + end = end, + uniprot_id = pg_protein_accessions, + pdb_data = filtered_structures, + retain_columns = c(eg_precursor_id, diff, 
adj_pval) +) ``` _Note: The `peptide` argument in this function does not necessarily require the peptide sequence but can also take any other unique peptide identifier as input. This is possible because peptides are not matched to the structure sequence by their sequence but are identified based on their start and end positions in the protein._ @@ -334,10 +349,10 @@ _Note: The input to the `create_structure_contact_map()` function can be a data ```{r create_structure_contact_map, eval = test_protti, warning = FALSE, fig.width = 10, fig.height = 7, fig.align = "center"} # Filter data for significant peptides. -significant_peptides <- ptsi_pgk_peptide_structure_positions %>% +significant_peptides <- ptsi_pgk_peptide_structure_positions %>% filter(abs(diff) > 2, adj_pval <= 0.01) -# Create a structure contact maps +# Create a structure contact maps contact_map <- create_structure_contact_map( data = significant_peptides, id = pdb_ids, @@ -362,7 +377,7 @@ integer_breaks <- function(n = 5, ...) 
{ # Plot structure contact maps # 1ZMR contact_map[["1ZMR"]] %>% # Extract data frame from list - mutate(chain_combinations = paste0("chain_", label_asym_id_var1, "_vs_chain_", label_asym_id_var2)) %>% + mutate(chain_combinations = paste0("chain_", label_asym_id_var1, "_vs_chain_", label_asym_id_var2)) %>% ggplot(aes(x = label_seq_id_var1, y = label_seq_id_var2, fill = min_distance_residue)) + geom_tile() + scale_y_continuous(breaks = integer_breaks()) + @@ -370,10 +385,10 @@ contact_map[["1ZMR"]] %>% # Extract data frame from list facet_wrap(~chain_combinations, scale = "free") + labs(title = "Structure contact map 1ZMR") + theme_bw() - + # 2HWG contact_map[["2HWG"]] %>% # Extract data frame from list - mutate(chain_combinations = paste0("chain_", label_asym_id_var1, "_vs_chain_", label_asym_id_var2)) %>% + mutate(chain_combinations = paste0("chain_", label_asym_id_var1, "_vs_chain_", label_asym_id_var2)) %>% ggplot(aes(x = label_seq_id_var1, y = label_seq_id_var2, fill = min_distance_residue)) + geom_tile() + scale_y_continuous(breaks = integer_breaks()) + @@ -413,19 +428,20 @@ You provide information on colouring through the `map_value` argument. This shou Lastly, we can specify an export location to indicate a place where the structure file should be saved. In this example the location is specified as a temporary file directory using the `tempdir()` function. You should change that location to any more accessible place. If you leave the argument empty, structures will be automatically saved in your working directory. 
```{r peptide_mapping, eval = test_protti, warning = FALSE} -ptsi_pgk_peptide_structure_positions %>% - mutate(map_value = ifelse(eg_precursor_id %in% significant_peptides$eg_precursor_id, - 100, - 0)) %>% +ptsi_pgk_peptide_structure_positions %>% + mutate(map_value = ifelse(eg_precursor_id %in% significant_peptides$eg_precursor_id, + 100, + 0 + )) %>% map_peptides_on_structure( - uniprot_id = pg_protein_accessions, - pdb_id = pdb_ids, - chain = auth_asym_id, - auth_seq_id = auth_seq_id, - map_value = map_value, - file_format = ".pdb", - export_location = tempdir() # change to a location of your choice - ) + uniprot_id = pg_protein_accessions, + pdb_id = pdb_ids, + chain = auth_asym_id, + auth_seq_id = auth_seq_id, + map_value = map_value, + file_format = ".pdb", + export_location = tempdir() # change to a location of your choice + ) ``` The function does not return anything in the R environment. Structures are directly saved in the desired format. The files contain the name of the structure and separated by "_" the name of all UniProt IDs present in the structure file. 
@@ -445,11 +461,11 @@ If you run the code snippets on your own make sure you replace `paste0(tempdir() # install.packages("r3dmol") # Load the r3dmol package -library(r3dmol) +library(r3dmol) # Create structure r3dmol() %>% - m_add_model(data = paste0(tempdir(), "/1ZMR_P0A799.pdb"), format = "pdb") %>% + m_add_model(data = paste0(tempdir(), "/1ZMR_P0A799.pdb"), format = "pdb") %>% m_set_style(style = m_style_cartoon( colorfunc = " function(atom) { @@ -539,25 +555,26 @@ amino_acid_score <- calculate_aa_scores( # Find amino acid positions in the structure ptsi_pgk_amino_acid_structure_positions <- find_peptide_in_structure( - peptide_data = amino_acid_score, - peptide = residue, - start = residue, - end = residue, - uniprot_id = pg_protein_accessions, - pdb_data = filtered_structures, - retain_columns = c(amino_acid_score)) + peptide_data = amino_acid_score, + peptide = residue, + start = residue, + end = residue, + uniprot_id = pg_protein_accessions, + pdb_data = filtered_structures, + retain_columns = c(amino_acid_score) +) # Map the score on structure map_peptides_on_structure( peptide_data = ptsi_pgk_amino_acid_structure_positions, - uniprot_id = pg_protein_accessions, - pdb_id = pdb_ids, - chain = auth_asym_id, - auth_seq_id = auth_seq_id, - map_value = amino_acid_score, - file_format = ".pdb", - export_location = tempdir() - ) + uniprot_id = pg_protein_accessions, + pdb_id = pdb_ids, + chain = auth_asym_id, + auth_seq_id = auth_seq_id, + map_value = amino_acid_score, + file_format = ".pdb", + export_location = tempdir() +) ``` While the previous method of displaying only significantly changing peptides that have a minimum fold change is useful for the identification of regions that change the most, it is not resistant to false positive peptides. The score on the other hand will only be high if an amino acid is consistently part of significant and differentially abundant peptides. 
The problem of the score is that it is hard and visually even impossible to interpret what a certain score means. This is due to the fact that the same score can be caused by two completely different scenarios. On the one hand an amino acid might be associated with peptides that have a low fold change but are highly significant. On the other hand it can be associated with peptides that have a high fold change but are of low significance. Both scenarios could yield the same amino acid score. Therefore, the score should not be misinterpreted as describing the extent of the change but should rather be seen as a probability that a certain amino acid is really structurally affected. @@ -566,10 +583,13 @@ You can visualise the score as a colour gradient using the `r3dmol` package in R ```{r score_3d_structure_mapping, eval = test_protti, echo=TRUE, warning=FALSE} # create a color gradient with 101 colors -color_gradient <- paste0('"', - paste(colorRampPalette(c("white", "#90EE90", "#FF7276"))(101), - collapse = '", "'), - '"') +color_gradient <- paste0( + '"', + paste(colorRampPalette(c("white", "#90EE90", "#FF7276"))(101), + collapse = '", "' + ), + '"' +) # create structure r3dmol() %>% @@ -577,7 +597,7 @@ r3dmol() %>% m_set_style(style = m_style_cartoon( colorfunc = paste0(" function(atom) { - const color = [", color_gradient,"] + const color = [", color_gradient, "] return color[Math.round(atom.b)] }") )) %>% diff --git a/vignettes/quality_control_workflow.Rmd b/vignettes/quality_control_workflow.Rmd index 83ab1b11..e9ed87e4 100644 --- a/vignettes/quality_control_workflow.Rmd +++ b/vignettes/quality_control_workflow.Rmd @@ -67,7 +67,6 @@ Before we can start analysing our data, we need to load the **protti** package. library(protti) library(magrittr) library(dplyr) - ``` After having loaded the required packages we can create a synthetic dataset, that contains data similar to data obtained from a treatment experiment with e.g. a protein, metabolite or small-molecule. 
@@ -81,15 +80,16 @@ Please note that generally, quality control should be conducted on raw unfiltere ```{r create_synthetic_data, eval = test_protti} # by setting the seed we are making sure that the random object generation can be reproduced -set.seed(123) - -data <- create_synthetic_data(n_proteins = 100, - frac_change = 0.05, - n_replicates = 3, - n_conditions = 2, - method = "effect_random", - additional_metadata = TRUE) - +set.seed(123) + +data <- create_synthetic_data( + n_proteins = 100, + frac_change = 0.05, + n_replicates = 3, + n_conditions = 2, + method = "effect_random", + additional_metadata = TRUE +) ``` ## Quality control @@ -116,20 +116,24 @@ The "combined" group of CVs contains CVs across all samples and not only across ```{r qc_cvs, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} input <- data %>% # as the data is log2 transformed, we need to transform it back before calculating the CVs - mutate(raw_intensity = 2^peptide_intensity_missing) - -qc_cvs(data = input, - grouping = peptide, - condition = condition, - intensity = raw_intensity, - plot = FALSE) - -qc_cvs(data = input, - grouping = peptide, - condition = condition, - intensity = raw_intensity, - plot = TRUE, - plot_style = "violin") + mutate(raw_intensity = 2^peptide_intensity_missing) + +qc_cvs( + data = input, + grouping = peptide, + condition = condition, + intensity = raw_intensity, + plot = FALSE +) + +qc_cvs( + data = input, + grouping = peptide, + condition = condition, + intensity = raw_intensity, + plot = TRUE, + plot_style = "violin" +) ``` ### Number of identifications (IDs) @@ -139,20 +143,24 @@ The number of protein or peptide identifications should be similar for different For the analysis of the number of identifications of precursors, peptides or proteins we use the function `qc_ids()`. This function can return either a table or a plot. 
The output of this function - and also of a lot of other **protti** functions - can be plotted in an interactive plot that makes use of the R package `plotly`. You can plot an interactive version of the plot by setting `interactive = TRUE` within the function call. ```{r qc_ids, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} -qc_ids(data = input, - sample = sample, - grouping = protein, - intensity = peptide_intensity_missing, - condition = condition, - plot = FALSE) - -qc_ids(data = input, - sample = sample, - grouping = protein, - intensity = peptide_intensity_missing, - condition = condition, - title = "Protein identifications per sample", - plot = TRUE) +qc_ids( + data = input, + sample = sample, + grouping = protein, + intensity = peptide_intensity_missing, + condition = condition, + plot = FALSE +) + +qc_ids( + data = input, + sample = sample, + grouping = protein, + intensity = peptide_intensity_missing, + condition = condition, + title = "Protein identifications per sample", + plot = TRUE +) ``` ### Peptide types @@ -172,22 +180,26 @@ We are going to use the function `qc_peptide_type()` to evaluate the distributio In a typical LiP-MS experiment, there should be few non-tryptic peptides. The peptide type distribution for fully- and semi-, and non-tryptic peptides depends on your digestion conditions (i.e. temperature, duration, concentration). 
```{r qc_peptide_type, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} -qc_peptide_type(data = input, - sample = sample, - peptide = peptide, - pep_type = pep_type, - method = "intensity", - intensity = raw_intensity, - plot = TRUE, - interactive = FALSE) - -qc_peptide_type(data = input, - sample = sample, - peptide = peptide, - pep_type = pep_type, - method = "count", - plot = TRUE, - interactive = FALSE) +qc_peptide_type( + data = input, + sample = sample, + peptide = peptide, + pep_type = pep_type, + method = "intensity", + intensity = raw_intensity, + plot = TRUE, + interactive = FALSE +) + +qc_peptide_type( + data = input, + sample = sample, + peptide = peptide, + pep_type = pep_type, + method = "count", + plot = TRUE, + interactive = FALSE +) ``` ### Run intensities @@ -197,17 +209,20 @@ The function `qc_intensity_distribution()` plots all precursor, peptide or prote Rrun intensities can also be assessed by plotting the median run intensities as a line plot. This helps you quickly assess if there are any trends in your data. The function to use for this analysis is `qc_median_intensities()`. 
```{r qc_intensity_distribution_boxplot, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} +qc_intensity_distribution( + data = input, + sample = sample, + grouping = peptide, + intensity_log2 = peptide_intensity_missing, + plot_style = "boxplot" +) -qc_intensity_distribution(data = input, - sample = sample, - grouping = peptide, - intensity_log2 = peptide_intensity_missing, - plot_style = "boxplot") - -qc_median_intensities(data = input, - sample = sample, - grouping = peptide, - intensity = peptide_intensity_missing) +qc_median_intensities( + data = input, + sample = sample, + grouping = peptide, + intensity = peptide_intensity_missing +) ``` ### Charge states @@ -216,13 +231,15 @@ The charge state distibution of the detected peptides can be assessed with `qc_c For `method = "intensity"` the function requires the raw intensity values created previously as its input. ```{r qc_charge_states, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} -qc_charge_states(data = input, - sample = sample, - grouping = peptide, - charge_states = charge, - method = "intensity", - intensity = raw_intensity, - plot = TRUE) +qc_charge_states( + data = input, + sample = sample, + grouping = peptide, + charge_states = charge, + method = "intensity", + intensity = raw_intensity, + plot = TRUE +) ``` ### Missed cleavages @@ -234,13 +251,15 @@ The number of missed cleavages should generally be low in a proteomics or LiP-MS We are going to check the numbers of missed cleavages in our dataset by using the function `qc_missed_cleavages()`. It can assess missed cleavages based on the count of peptides with missed cleavages or the intensities of the corresponding peptides. For `method = "intensity"` the function uses the raw (not log2 transformed) intensity values. You can have the function either return a plot or a table. 
```{r qc_missed_cleavages, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} -qc_missed_cleavages(data = input, - sample = sample, - grouping = peptide, - missed_cleavages = n_missed_cleavage, - method = "intensity", - intensity = raw_intensity, - plot = TRUE) +qc_missed_cleavages( + data = input, + sample = sample, + grouping = peptide, + missed_cleavages = n_missed_cleavage, + method = "intensity", + intensity = raw_intensity, + plot = TRUE +) ``` ### Sequence coverage @@ -250,9 +269,11 @@ The following function gives you insight into the protein coverage (i.e. what pe To assess the protein coverage distribution, we are going to use the function `qc_protein_coverage()`. If you do not have a column containing the protein coverages in your data, you can use the function `calculate_sequence_coverage()` to obtain this information. ```{r qc_sequence_coverage, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} -qc_sequence_coverage(data = input, - protein_identifier = protein, - coverage = coverage) +qc_sequence_coverage( + data = input, + protein_identifier = protein, + coverage = coverage +) ``` ### Peak width @@ -261,11 +282,13 @@ In order to identify potential chromatographic issues that might have occurred d The peak widths should be similar for all the measured samples of the experiment. ```{r qc_peak_width, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} -qc_peak_width(data = input, - sample = sample, - intensity = peptide_intensity_missing, - retention_time = retention_time, - peak_width = peak_width) +qc_peak_width( + data = input, + sample = sample, + intensity = peptide_intensity_missing, + retention_time = retention_time, + peak_width = peak_width +) ``` ### Data completeness @@ -273,11 +296,13 @@ qc_peak_width(data = input, The function `qc_data_completeness()` checks how many of all detected precursors, peptides or proteins were identified in each sample. 
The function can return either a plot or a table. ```{r qc_data_completeness, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} -qc_data_completeness(data = input, - sample = sample, - grouping = peptide, - intensity = peptide_intensity_missing, - plot = TRUE) +qc_data_completeness( + data = input, + sample = sample, + grouping = peptide, + intensity = peptide_intensity_missing, + plot = TRUE +) ``` ### Log2 Intensity distribution @@ -285,10 +310,12 @@ qc_data_completeness(data = input, For different kinds of analyses (e.g. t-tests) it is important that your data intensity follows a normal distribution. To ensure that this is the case, we are going to use the function `qc_intensity_distriubution()`. The function returns a histogram plot when `plot_style = "histogram"` showing how the intensities are distributed. ```{r qc_intensity_distribution_histogram, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} -qc_intensity_distribution(data = input, - grouping = peptide, - intensity_log2 = peptide_intensity_missing, - plot_style = "histogram") +qc_intensity_distribution( + data = input, + grouping = peptide, + intensity_log2 = peptide_intensity_missing, + plot_style = "histogram" +) ``` ### Sample correlation @@ -296,12 +323,14 @@ qc_intensity_distribution(data = input, Another approach to quality control is to check the correlation of your samples. Ideally, replicates should cluster together and different treatment conditions should be separated. We are now going to check if this is the case for our data by using the function `qc_sample_correlation()`. The function will return a correlation heatmap with a comparison of all samples. 
```{r qc_sample_correlation, eval = test_protti, fig.width = 6, fig.height = 4, fig.align = "center"} - qc_sample_correlation(data = input, - sample = sample, - grouping = peptide, - intensity_log2 = peptide_intensity_missing, - condition = condition, - interactive = FALSE) +qc_sample_correlation( + data = input, + sample = sample, + grouping = peptide, + intensity_log2 = peptide_intensity_missing, + condition = condition, + interactive = FALSE +) ``` ### Principal component analysis (PCA) @@ -326,7 +355,7 @@ qc_pca( grouping = peptide, intensity = peptide_intensity_missing, condition = condition, - components = c("PC1", "PC2"), + components = c("PC1", "PC2"), plot_style = "pca" ) ``` From a89299fc015f63896231525691eb74a0a69dfcd5 Mon Sep 17 00:00:00 2001 From: Aaron Fehr Date: Mon, 21 Aug 2023 15:57:32 +0200 Subject: [PATCH 23/71] initial commit of correct_lip_for_abundance function --- NAMESPACE | 2 + R/correct_lip_for_abundance.R | 215 +++++++++++++++++++++++++++++++ man/correct_lip_for_abundance.Rd | 139 ++++++++++++++++++++ 3 files changed, 356 insertions(+) create mode 100644 R/correct_lip_for_abundance.R create mode 100644 man/correct_lip_for_abundance.Rd diff --git a/NAMESPACE b/NAMESPACE index d691851e..09754811 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(calculate_kegg_enrichment) export(calculate_protein_abundance) export(calculate_sequence_coverage) export(calculate_treatment_enrichment) +export(correct_lip_for_abundance) export(create_queue) export(create_structure_contact_map) export(create_synthetic_data) @@ -145,6 +146,7 @@ importFrom(rlang,enquo) importFrom(rlang,ensym) importFrom(rlang,expr) importFrom(rlang,new_formula) +importFrom(rlang,sym) importFrom(stats,median) importFrom(stats,na.omit) importFrom(stats,p.adjust) diff --git a/R/correct_lip_for_abundance.R b/R/correct_lip_for_abundance.R new file mode 100644 index 00000000..b5509e4d --- /dev/null +++ b/R/correct_lip_for_abundance.R @@ -0,0 +1,215 @@ +#' Protein abundance 
correction +#' +#' Performs the correction of LiP-peptides for changes in protein abundance and +#' calculates their significance using a t-test +#' +#' @param lip_data a data frame containing at least the input variables. Ideally, +#' the result from the \code{calculate_diff_abundance} function is used. +#' @param trp_data a data frame containing at least the input variables minus the grouping column. Ideally, +#' the result from the \code{calculate_diff_abundance} function is used. +#' @param protein_id a character column in the \code{lip_data} and \code{trp_data} data frames +#' that contains protein identifiers. +#' @param grouping a character column in the \code{lip_data} data frame that contains precursor or +#' peptide identifiers. +#' @param comparison a character column in the \code{lip_data} and \code{trp_data} data frames +#' that contains the comparisons between conditions. +#' @param diff a numeric column in the \code{lip_data} and \code{trp_data} data frames +#' that contains log2-fold changes for peptide or protein quantities. +#' @param n_obs a numeric column in the \code{lip_data} and \code{trp_data} data frames +#' containing the number of observations used to calculate fold changes. +#' @param std_error a numeric column in the \code{lip_data} and \code{trp_data} data frames +#' containing the standard error of fold changes. +#' @param p_adj_method a character value, specifies the p-value correction method. Possible +#' methods are c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"). Default +#' method is \code{"BH"}. +#' @param retain_columns a vector indicating if certain columns should be retained from the input +#' data frame. Default is not retaining additional columns \code{retain_columns = NULL}. Specific +#' columns can be retained by providing their names (not in quotations marks, just like other +#' column names, but in a vector). 
Please note that if you retain columns that have multiple +#' rows per grouped variable there will be duplicated rows in the output. +#' @param method a character value, specifies the method used to estimate the degrees of freedom. +#' Possible methods are c("satterthwaite", "no_df_approximation"). \code{satterthwaite} uses the Welch-Satterthwaite +#' equation to estimate the pooled degrees of freedom, as described in https://doi.org/10.1016/j.mcpro.2022.100477 and +#' implemented in the MSstatsLiP package. This approach respects the number of protein measurements for the degrees of freedom. +#' \code{no_df_approximation} just takes the number of peptides into account when calculating the degrees of freedom. +#' +#' @return a data frame containing corrected differential abundances (\code{adj_diff}), adjusted +#' standard errors (\code{adj_std_error}), degrees of freedom (\code{df}), p-values (\code{pval}) and +#' adjusted p-values (\code{adj_pval}). +#' +#' @author Aaron Fehr +#' @import dplyr +#' @importFrom rlang .data enquo sym as_name expr := !!
+#' @export +#' +#' @examples +#' +#' # Load libraries +#' +#' library(dplyr) +#' +#' # Load example data and simulate tryptic data by summing up precursors +#' +#' data = rapamycin_10uM +#' +#' data_trp = data %>% +#' dplyr::group_by(pg_protein_accessions, r_file_name) %>% +#' dplyr::mutate(pg_quantity = sum(fg_quantity)) %>% +#' dplyr::distinct(r_condition, +#' r_file_name, +#' pg_protein_accessions, +#' pg_quantity) +#' +#' +#' # Calculate differential abundances for LiP and Trp data +#' +#' +#' diff_lip = data %>% +#' mutate(fg_intensity_log2 = log2(fg_quantity)) %>% +#' assign_missingness(sample = r_file_name, +#' condition = r_condition, +#' intensity = fg_intensity_log2, +#' grouping = eg_precursor_id, +#' ref_condition = "control", +#' retain_columns = "pg_protein_accessions") %>% +#' calculate_diff_abundance(sample = r_file_name, +#' condition = r_condition, +#' grouping = eg_precursor_id, +#' intensity_log2 = fg_intensity_log2, +#' comparison = comparison, +#' method = "t-test", +#' retain_columns = "pg_protein_accessions") +#' +#' +#' diff_trp = data_trp %>% +#' mutate(pg_intensity_log2 = log2(pg_quantity)) %>% +#' assign_missingness(sample = r_file_name, +#' condition = r_condition, +#' intensity = pg_intensity_log2, +#' grouping = pg_protein_accessions, +#' ref_condition = "control") %>% +#' calculate_diff_abundance(sample = r_file_name, +#' condition = r_condition, +#' grouping = pg_protein_accessions, +#' intensity_log2 = pg_intensity_log2, +#' comparison = comparison, +#' method = "t-test") +#' +#' # Correct for abundance changes +#' +#' corrected = correct_lip_for_abundance( +#' +#' lip_data = diff_lip, +#' trp_data = diff_trp, +#' protein_id = pg_protein_accessions, +#' grouping = eg_precursor_id, +#' retain_columns = c("missingness"), +#' method = "satterthwaite") +#' +#' head(corrected, n = 10) + + +correct_lip_for_abundance = function( + lip_data, + trp_data, + protein_id, + grouping, + comparison = comparison, + diff = diff, + n_obs = n_obs, + 
std_error = std_error, + p_adj_method = "BH", + retain_columns = NULL, + method = c("satterthwaite","no_df_approximation")){ + + method = match.arg(method) + + + se_pep = rlang::sym(paste0(rlang::as_name(rlang::enquo(std_error)),"_pep")) + + se_prot = rlang::sym(paste0(rlang::as_name(rlang::enquo(std_error)),"_prot")) + + diff_pep = rlang::sym(paste0(rlang::as_name(rlang::enquo(diff)),"_pep")) + + diff_prot = rlang::sym(paste0(rlang::as_name(rlang::enquo(diff)),"_prot")) + + n_pep = rlang::sym(paste0(rlang::as_name(rlang::enquo(n_obs)),"_pep")) + + n_prot = rlang::sym(paste0(rlang::as_name(rlang::enquo(n_obs)),"_prot")) + + + temp_lip_data = lip_data %>% + dplyr::select(!!enquo(retain_columns), {{comparison}}, {{ protein_id }}, {{ grouping}}, {{ diff }}, {{ n_obs }}, {{ std_error }}) %>% + dplyr::distinct() + + temp_trp_data = trp_data %>% + dplyr::distinct({{comparison}}, {{ protein_id }}, {{ diff }}, {{ n_obs }}, {{ std_error }}) + + + test = temp_lip_data %>% + dplyr::distinct({{comparison}}, {{ protein_id }}, {{ grouping}}) + + if (nrow(test) != nrow(temp_lip_data)){ + message("Warning: Your data frame contains dublicated values due to retained columns. 
This will affect the multiple testing correction.") + } + + combined_data = dplyr::left_join(x = temp_lip_data, + y = temp_trp_data, + by = c(rlang::as_name(rlang::enquo(comparison)), rlang::as_name(rlang::enquo(protein_id))), + suffix = c("_pep", "_prot")) + + n_unmatched = combined_data %>% + dplyr::filter(is.na(!!diff_prot) == T) %>% + nrow() + + percent_unmatched = round(n_unmatched/nrow(combined_data)*100, 2) + + message(paste0("No protein data was available for ",n_unmatched," peptides (",percent_unmatched, "% of dataset).")) + + if (method == "satterthwaite"){ + + corrected_data = combined_data %>% + dplyr::mutate(adj_diff = !!diff_pep - !!diff_prot) %>% + dplyr::mutate(adj_std_error = sqrt( (!!se_pep)**2 + (!!se_prot)**2) ) %>% + dplyr::mutate(numer = ((!!se_pep)**2 + (!!se_prot)**2)**2) %>% + dplyr::mutate(denom = ( (!!se_pep)**4 / (!!n_pep-2) + (!!se_prot)**4/(!!n_prot-2))) %>% + dplyr::mutate(df = .data$numer/.data$denom) %>% + dplyr::mutate(tval = .data$adj_diff / .data$adj_std_error) %>% + dplyr::mutate(pval = 2*stats::pt(abs(.data$tval), .data$df, lower.tail = FALSE)) + + + adjusted_data = dplyr::left_join(x = corrected_data, + y = corrected_data %>% + dplyr::filter(is.na(.data$pval) == FALSE) %>% + dplyr::group_by({{ comparison }}) %>% + dplyr::mutate(adj_pval = p.adjust(.data$pval, method = {{ p_adj_method }})), + by = colnames(corrected_data)) %>% + dplyr::select(- .data$numer, - .data$denom, - .data$tval) + + return(adjusted_data) + + } + + if (method == "no_df_approximation"){ + + corrected_data = combined_data %>% + dplyr::mutate(adj_diff = !!diff_pep - !!diff_prot) %>% + dplyr::mutate(adj_std_error = sqrt( (!!se_pep)**2 + (!!se_prot)**2) ) %>% + dplyr::mutate(df = !!n_pep - 2) %>% + dplyr::mutate(tval = .data$adj_diff / .data$adj_std_error) %>% + dplyr::mutate(pval = 2*stats::pt(abs(.data$tval), .data$df, lower.tail = FALSE)) + + + adjusted_data = dplyr::left_join(x = corrected_data, + y = corrected_data %>% + dplyr::filter(is.na(.data$pval) == 
FALSE) %>% + dplyr::mutate(adj_pval = p.adjust(.data$pval, method = {{ p_adj_method }})), + by = colnames(corrected_data)) %>% + dplyr::select(- .data$tval) + + + return(adjusted_data) + + } + +} diff --git a/man/correct_lip_for_abundance.Rd b/man/correct_lip_for_abundance.Rd new file mode 100644 index 00000000..726d6b33 --- /dev/null +++ b/man/correct_lip_for_abundance.Rd @@ -0,0 +1,139 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correct_lip_for_abundance.R +\name{correct_lip_for_abundance} +\alias{correct_lip_for_abundance} +\title{Protein abundance correction} +\usage{ +correct_lip_for_abundance( + lip_data, + trp_data, + protein_id, + grouping, + comparison = comparison, + diff = diff, + n_obs = n_obs, + std_error = std_error, + p_adj_method = "BH", + retain_columns = NULL, + method = c("satterthwaite", "no_df_approximation") +) +} +\arguments{ +\item{lip_data}{a data frame containing at least the input variables. Ideally, +the result from the \code{calculate_diff_abundance} function is used.} + +\item{trp_data}{a data frame containing at least the input variables minus the grouping column. 
Ideally, +the result from the \code{calculate_diff_abundance} function is used.} + +\item{protein_id}{a character column in the \code{lip_data} and \code{trp_data} data frames +that contains protein identifiers.} + +\item{grouping}{a character column in the \code{lip_data} data frame that contains precursor or +peptide identifiers.} + +\item{comparison}{a character column in the \code{lip_data} and \code{trp_data} data frames +that contains the comparisons between conditions.} + +\item{diff}{a numeric column in the \code{lip_data} and \code{trp_data} data frames +that contains log2-fold changes for peptide or protein quantities.} + +\item{n_obs}{a numeric column in the \code{lip_data} and \code{trp_data} data frames +containing the number of observations used to calculate fold changes.} + +\item{std_error}{a numeric column in the \code{lip_data} and \code{trp_data} data frames +containing the standard error of fold changes.} + +\item{p_adj_method}{a character value, specifies the p-value correction method. Possible +methods are c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"). Default +method is \code{"BH"}.} + +\item{retain_columns}{a vector indicating if certain columns should be retained from the input +data frame. Default is not retaining additional columns \code{retain_columns = NULL}. Specific +columns can be retained by providing their names (not in quotations marks, just like other +column names, but in a vector). Please note that if you retain columns that have multiple +rows per grouped variable there will be duplicated rows in the output.} + +\item{method}{a character value, specifies the method used to estimate the degrees of freedom. +Possible methods are c("satterthwaite", "no_df_approximation"). \code{satterthwaite} uses the Welch-Satterthwaite +equation to estimate the pooled degrees of freedom, as described in https://doi.org/10.1016/j.mcpro.2022.100477 and +implemented in the MSstatsLiP package. 
This approach respects the number of protein measurements for the degrees of freedom. +\code{no_df_approximation} just takes the number of peptides into account when calculating the degrees of freedom.} +} +\value{ +a data frame containing corrected differential abundances (\code{adj_diff}, adjusted +standard errors (\code{adj_std_error}), degrees of freedom (\code{df}), pvalues (\code{pval}) and +adjusted p-values (\code{adj_pval}) +} +\description{ +Performs the correction of LiP-peptides for changes in protein abundance and +calculates their significance using a t-test +} +\examples{ + +# Load libraries + +library(dplyr) + +# Load example data and simulate tryptic data by summing up precursors + +data = rapamycin_10uM + +data_trp = data \%>\% +dplyr::group_by(pg_protein_accessions, r_file_name) \%>\% +dplyr::mutate(pg_quantity = sum(fg_quantity)) \%>\% +dplyr::distinct(r_condition, + r_file_name, + pg_protein_accessions, + pg_quantity) + + +# Calculate differential abundances for LiP and Trp data + + +diff_lip = data \%>\% + mutate(fg_intensity_log2 = log2(fg_quantity)) \%>\% + assign_missingness(sample = r_file_name, + condition = r_condition, + intensity = fg_intensity_log2, + grouping = eg_precursor_id, + ref_condition = "control", + retain_columns = "pg_protein_accessions") \%>\% + calculate_diff_abundance(sample = r_file_name, + condition = r_condition, + grouping = eg_precursor_id, + intensity_log2 = fg_intensity_log2, + comparison = comparison, + method = "t-test", + retain_columns = "pg_protein_accessions") + + +diff_trp = data_trp \%>\% + mutate(pg_intensity_log2 = log2(pg_quantity)) \%>\% + assign_missingness(sample = r_file_name, + condition = r_condition, + intensity = pg_intensity_log2, + grouping = pg_protein_accessions, + ref_condition = "control") \%>\% + calculate_diff_abundance(sample = r_file_name, + condition = r_condition, + grouping = pg_protein_accessions, + intensity_log2 = pg_intensity_log2, + comparison = comparison, + method = 
"t-test") + +# Correct for abundance changes + +corrected = correct_lip_for_abundance( + + lip_data = diff_lip, + trp_data = diff_trp, + protein_id = pg_protein_accessions, + grouping = eg_precursor_id, + retain_columns = c("missingness"), + method = "satterthwaite") + +head(corrected, n = 10) +} +\author{ +Aaron Fehr +} From 1e52f8dcea3c1d5e306a34558dee773a1f3cf57b Mon Sep 17 00:00:00 2001 From: jpquast Date: Tue, 29 Aug 2023 16:49:25 +0200 Subject: [PATCH 24/71] Fix bug in calculate_treatment_enrichment --- R/calculate_treatment_enrichment.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/calculate_treatment_enrichment.R b/R/calculate_treatment_enrichment.R index d0fde9cf..7e325a1a 100644 --- a/R/calculate_treatment_enrichment.R +++ b/R/calculate_treatment_enrichment.R @@ -177,6 +177,7 @@ calculate_treatment_enrichment <- function(data, } # Add p-value to group name for plot + # also group to correctly calculate the total number of proteins in the next step if (!missing(group)) { cont_table <- cont_table %>% dplyr::mutate(group_pval = paste0( @@ -188,7 +189,8 @@ calculate_treatment_enrichment <- function(data, round(.data$pval, digits = 2) ), ")" - )) + )) %>% + dplyr::group_by({{ group }}) } enrichment_plot <- cont_table %>% From 7be10b93036023f35ed237eb6d207351e8db57df Mon Sep 17 00:00:00 2001 From: jpquast Date: Tue, 29 Aug 2023 16:53:29 +0200 Subject: [PATCH 25/71] Fix bug in qc_sample_correlation The plot would always be printed even if it is saved to a variable. This was fixed now. 
This was raised in issue #207 --- R/qc_sample_correlation.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/qc_sample_correlation.R b/R/qc_sample_correlation.R index bab2be88..b23e0c04 100644 --- a/R/qc_sample_correlation.R +++ b/R/qc_sample_correlation.R @@ -191,7 +191,8 @@ qc_sample_correlation <- function(data, annotation = annotation, annotation_colors = annotation_colours, main = "Correlation based hierachical clustering of samples", - color = viridis_colours + color = viridis_colours, + silent = TRUE ) return(heatmap_static) } From 57160115f4f5a06efc5ae66226e5a66540c8bccf Mon Sep 17 00:00:00 2001 From: vivreb <102022465+vivreb@users.noreply.github.com> Date: Mon, 30 Oct 2023 12:31:35 +0100 Subject: [PATCH 26/71] #214 assign missingness variables correctly --- R/assign_missingness.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/assign_missingness.R b/R/assign_missingness.R index 5885cbff..3192a8c0 100644 --- a/R/assign_missingness.R +++ b/R/assign_missingness.R @@ -197,12 +197,12 @@ from the conditions and assigned their missingness. 
The created comparisons are: dplyr::mutate(missingness = dplyr::case_when( .data$n_detect_control == .data$n_replicates_control & .data$n_detect_treated == .data$n_replicates_treated ~ "complete", - .data$n_detect_control <= floor(n_replicates_control * 0.2) & + .data$n_detect_control <= floor(n_replicates_control * completeness_MNAR) & .data$n_detect_treated == .data$n_replicates_treated ~ "MNAR", .data$n_detect_control == .data$n_replicates_control & - .data$n_detect_treated <= floor(n_replicates_treated * 0.2) ~ "MNAR", - .data$n_detect_control >= max(floor(.data$n_replicates_control * 0.7), 1) & - .data$n_detect_treated >= max(floor(.data$n_replicates_control * 0.7), 1) ~ "MAR" + .data$n_detect_treated <= floor(n_replicates_treated * completeness_MNAR) ~ "MNAR", + .data$n_detect_control >= max(floor(.data$n_replicates_control * completeness_MAR), 1) & + .data$n_detect_treated >= max(floor(.data$n_replicates_control * completeness_MAR), 1) ~ "MAR" ))) %>% dplyr::select(-c(.data$n_detect_control, .data$n_detect_treated, .data$n_replicates_control, .data$n_replicates_treated)) %>% # Arrange by grouping but in a numeric order of the character vector. 
From ad3e0c6d7365508903a017dc70ba8462ae671682 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 30 Oct 2023 15:06:18 +0100 Subject: [PATCH 27/71] fix r version in github actions --- .github/workflows/test-coverage.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 0834d63b..5e9f40ba 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -18,9 +18,9 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 - - uses: r-lib/actions/setup-pandoc@v1 + - uses: r-lib/actions/setup-pandoc@v2 - name: Query dependencies run: | From 12150ea916ac528ab41101a195cbad59f925880f Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Fri, 3 Nov 2023 09:21:47 +0100 Subject: [PATCH 28/71] version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 82d72bd3..9ccd07de 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: protti Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools -Version: 0.6.0.9000 +Version: 0.6.0.9001 Authors@R: c(person(given = "Jan-Philipp", family = "Quast", From 86f0ee8e6add36571e0faeb1bd5f2d487f8192ff Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Fri, 3 Nov 2023 09:26:55 +0100 Subject: [PATCH 29/71] fix version in github action --- .github/workflows/test-coverage.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 0834d63b..cbb595b1 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -18,9 +18,9 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: 
r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 - - uses: r-lib/actions/setup-pandoc@v1 + - uses: r-lib/actions/setup-pandoc@v2 - name: Query dependencies run: | @@ -44,7 +44,7 @@ jobs: shell: Rscript {0} - name: Test coverage - env: + env: TEST_PROTTI: true BUILD_VIGNETTE: true run: covr::codecov() From 806d773fd6ee00c05d1488dec3deed88aa9e9fa4 Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 3 Nov 2023 14:58:54 +0100 Subject: [PATCH 30/71] Fix typos and examples, update NEWS --- NEWS.md | 9 +++++++-- R/calculate_go_enrichment.R | 7 ++++--- R/calculate_treatment_enrichment.R | 4 ++-- R/qc_proteome_coverage.R | 7 ++++++- man/calculate_go_enrichment.Rd | 7 ++++--- man/calculate_treatment_enrichment.Rd | 2 +- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index 43765ec2..7df9feea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,15 +6,20 @@ * `peptide_profile_plot()` received a new argument called `complete_sample`. If set to `TRUE`, each protein gets assigned all sample names that are found in the input data. This ensures that the plot always contains all samples on the x-axis even if there are no measured intensities for a specific sample. The default is `FALSE`, which is the original behaviour of the function. * `volcano_plot()` received the `colour` argument that allows the user to provide custom colours for points. * Increased the speed of `find_peptide()` and `assign_peptide_type()` by only computing on the smallest possible subset of data before joining back to the original data frame. +* `calculate_treatment_enrichment()` can now be applied on data frames with multiple different groups. The enrichment will be calculated for each group separately. If the data is plotted, each group is displayed in a separate facet. The group is provided to the new `group` argument. +* `qc_pca()`: If the condition argument is numeric a colour gradient is used instead. 
## Bug fixes * `volcano_plot()` now also works interactively if there are no significant hits. * `fetch_chebi()`: fixed an issue caused by `na_if()` that changed its behaviour after the recent `dplyr` update. -* `qc_proteome_coverage()`: fixed the label order of fractions of proteins detected and not detected in the proteome. -* `calculate_protein_abundance()` now correctly retains columns if `for_plot = TRUE`. Previously the columns to retain were not joined considering the precursor column, which lead to duplications of information where it did not belong. +* `qc_proteome_coverage()`: fixed the label order of fractions of proteins detected and not detected in the proteome. Fixes issue #194. +* `calculate_protein_abundance()` now correctly retains columns if `for_plot = TRUE`. Previously the columns to retain were not joined considering the precursor column, which led to duplications of information where it did not belong. Fixes issue #197. * `fetch_kegg()` now returns the pathway name correctly again. * `qc_intensity_distribution()`, `qc_median_intensities()`, `qc_charge_states()`, `qc_contaminants()`, `qc_missed_cleavages()`, `qc_peptide_type()`, `qc_ids()`: If the provided sample column is of type factor, the level order won't be overwritten anymore. +* `fit_drc_4p()`: If there are no correlations an empty data frame is returned to prevent errors in `parallel_fit_drc_4p()`. +* `calculate_sequence_coverage()` does not fail anymore if a protein only contains `NA` peptide sequences. +* `qc_sequence_coverage()` does not return a plot anymore if `plot = FALSE`. This fixes issue #207. # protti 0.6.0 diff --git a/R/calculate_go_enrichment.R b/R/calculate_go_enrichment.R index 9f2deb8b..554ab507 100644 --- a/R/calculate_go_enrichment.R +++ b/R/calculate_go_enrichment.R @@ -38,7 +38,7 @@ go_enrichment <- function(...) { #' @param group optional, character column in the \code{data} data frame that contains information by #' which the analysis should be grouped.
The analysis will be performed separately for each of the #' groups. This is most likely a column that labels separate comparisons of different conditions. -#' In protti the `asign_missingness()` function creates such a column automatically. +#' In protti the `assign_missingness()` function creates such a column automatically. #' @param y_axis_free a logical value that specifies if the y-axis of the plot should be "free" #' for each facet if a grouping variable is provided. Default is `TRUE`. If `FALSE` is selected #' it is easier to compare GO categories directly with each other. @@ -66,7 +66,8 @@ go_enrichment <- function(...) { #' to determine the number of GO terms in the plot. This information should be provided with the #' type first followed by the threshold separated by a space. Example are #' \code{plot_cutoff = "adj_pval top10"}, \code{plot_cutoff = "pval 0.05"} or -#' \code{plot_cutoff = "adj_pval 0.01"}. The threshold can be chosen freely. +#' \code{plot_cutoff = "adj_pval 0.01"}. The threshold can be chosen freely. The default value is +#' \code{"adj_pval top10"}. #' #' @return A bar plot displaying negative log10 adjusted p-values for the top 10 enriched or #' depleted gene ontology terms. Alternatively, plot cutoffs can be chosen individually with the @@ -98,7 +99,7 @@ go_enrichment <- function(...) { #' ) #' ) #' -#' if (!is(data, "character")) { +#' if (!is(uniprot_go_data, "character")) { #' data <- uniprot_go_data %>% #' mutate(significant = c( #' rep(TRUE, 1000), diff --git a/R/calculate_treatment_enrichment.R b/R/calculate_treatment_enrichment.R index 7e325a1a..10d546ae 100644 --- a/R/calculate_treatment_enrichment.R +++ b/R/calculate_treatment_enrichment.R @@ -37,7 +37,7 @@ treatment_enrichment <- function(...) { #' @param group optional, character column in the \code{data} data frame that contains information by #' which the analysis should be grouped. The analysis will be performed separately for each of the #' groups. 
This is most likely a column that labels separate comparisons of different conditions. -#' In protti the `asign_missingness()` function creates such a column automatically. +#' In protti the `assign_missingness()` function creates such a column automatically. #' @param treatment_name a character value that indicates the treatment name. It will be included #' in the plot title. #' @param plot a logical value indicating whether the result should be plotted or returned as a @@ -189,7 +189,7 @@ calculate_treatment_enrichment <- function(data, round(.data$pval, digits = 2) ), ")" - )) %>% + )) %>% dplyr::group_by({{ group }}) } diff --git a/R/qc_proteome_coverage.R b/R/qc_proteome_coverage.R index 3e5011c7..e9dcadd5 100644 --- a/R/qc_proteome_coverage.R +++ b/R/qc_proteome_coverage.R @@ -64,9 +64,14 @@ qc_proteome_coverage <- function(data, dplyr::summarize(proteins_detected = dplyr::n_distinct(!!ensym(protein_id)), .groups = "drop") %>% dplyr::mutate({{ sample }} := "Total") - proteome <- fetch_uniprot_proteome({{ organism_id }}, reviewed = reviewed) %>% + proteome <- fetch_uniprot_proteome(organism_id, reviewed = reviewed) %>% dplyr::summarize(proteins_proteome = dplyr::n_distinct(.data$accession), .groups = "drop") + if(is(proteome, "character")){ + # return NULL if UniProt information could not be fetched. + return(NULL) + } + proteome_coverage <- data %>% dplyr::group_by({{ sample }}) %>% dplyr::summarize(proteins_detected = dplyr::n_distinct(!!ensym(protein_id)), .groups = "drop") %>% diff --git a/man/calculate_go_enrichment.Rd b/man/calculate_go_enrichment.Rd index 5369dd1d..31f2f8f6 100644 --- a/man/calculate_go_enrichment.Rd +++ b/man/calculate_go_enrichment.Rd @@ -33,7 +33,7 @@ protein level information from this.} \item{group}{optional, character column in the \code{data} data frame that contains information by which the analysis should be grouped. The analysis will be performed separately for each of the groups. 
This is most likely a column that labels separate comparisons of different conditions. -In protti the \code{asign_missingness()} function creates such a column automatically.} +In protti the \code{assign_missingness()} function creates such a column automatically.} \item{y_axis_free}{a logical value that specifies if the y-axis of the plot should be "free" for each facet if a grouping variable is provided. Default is \code{TRUE}. If \code{FALSE} is selected @@ -69,7 +69,8 @@ significant proteins (p-value or adjusted p-value), or if a significance cutoff to determine the number of GO terms in the plot. This information should be provided with the type first followed by the threshold separated by a space. Example are \code{plot_cutoff = "adj_pval top10"}, \code{plot_cutoff = "pval 0.05"} or -\code{plot_cutoff = "adj_pval 0.01"}. The threshold can be chosen freely.} +\code{plot_cutoff = "adj_pval 0.01"}. The threshold can be chosen freely. The default value is +\code{"adj_pval top10"}.} } \value{ A bar plot displaying negative log10 adjusted p-values for the top 10 enriched or @@ -101,7 +102,7 @@ uniprot_go_data <- fetch_uniprot_proteome( ) ) -if (!is(data, "character")) { +if (!is(uniprot_go_data, "character")) { data <- uniprot_go_data \%>\% mutate(significant = c( rep(TRUE, 1000), diff --git a/man/calculate_treatment_enrichment.Rd b/man/calculate_treatment_enrichment.Rd index 6d8a1f10..08b8bf02 100644 --- a/man/calculate_treatment_enrichment.Rd +++ b/man/calculate_treatment_enrichment.Rd @@ -32,7 +32,7 @@ databases, e.g. UniProt.} \item{group}{optional, character column in the \code{data} data frame that contains information by which the analysis should be grouped. The analysis will be performed separately for each of the groups. This is most likely a column that labels separate comparisons of different conditions. 
-In protti the \code{asign_missingness()} function creates such a column automatically.} +In protti the \code{assign_missingness()} function creates such a column automatically.} \item{treatment_name}{a character value that indicates the treatment name. It will be included in the plot title.} From bc63fc1089461b722935cd18aa96e0a3aa6e172e Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 3 Nov 2023 15:20:54 +0100 Subject: [PATCH 31/71] Fix more tidyeval problems --- R/calculate_aa_scores.R | 2 +- R/calculate_diff_abundance.R | 7 ++++--- R/create_queue.R | 12 ++++++------ R/fetch_chebi.R | 3 ++- R/fetch_eco.R | 2 +- R/fetch_pdb.R | 38 ++++++++++++++++++------------------ R/fit_drc_4p.R | 2 +- 7 files changed, 34 insertions(+), 32 deletions(-) diff --git a/R/calculate_aa_scores.R b/R/calculate_aa_scores.R index 35fb9f46..5bdf151d 100644 --- a/R/calculate_aa_scores.R +++ b/R/calculate_aa_scores.R @@ -59,7 +59,7 @@ calculate_aa_scores <- function(data, dplyr::mutate(score = -log10({{ adj_pval }}) * abs({{ diff }})) %>% dplyr::rowwise() %>% dplyr::mutate(residue = list(seq({{ start_position }}, {{ end_position }}))) %>% - tidyr::unnest(.data$residue) %>% + tidyr::unnest("residue") %>% dplyr::group_by({{ protein }}, .data$residue) %>% dplyr::mutate(amino_acid_score = mean(.data$score)) %>% dplyr::distinct({{ protein }}, .data$residue, .data$amino_acid_score) diff --git a/R/calculate_diff_abundance.R b/R/calculate_diff_abundance.R index 57718363..1f3dd227 100644 --- a/R/calculate_diff_abundance.R +++ b/R/calculate_diff_abundance.R @@ -399,7 +399,7 @@ missingness type is assigned.\n The created comparisons are: \n", prefix = "\n", "control", "treated" )) %>% - tidyr::pivot_wider(names_from = {{ condition }}, values_from = c(.data$mean, .data$sd, .data$n)) %>% + tidyr::pivot_wider(names_from = {{ condition }}, values_from = c("mean", "sd", "n")) %>% dplyr::mutate(ttest_protti( mean1 = .data$mean_control, mean2 = .data$mean_treated, @@ -760,8 +760,9 @@ missingness type is 
assigned.\n The created comparisons are: \n", prefix = "\n", .f = ~ dplyr::mutate(.x, comparison = str_replace_all(.y, pattern = "`", replacement = "")) ) %>% purrr::map_dfr(~ dplyr::mutate(.x, adj_pval = p.adjust(.data$pval, method = p_adj_method))) %>% - dplyr::select(-.data$n_obs, -.data$n_approx) %>% - dplyr::rename({{ grouping }} := .data$name, std_error = .data$se) %>% + dplyr::select(-"n_obs", -"n_approx") %>% + dplyr::rename({{ grouping }} := "name", + std_error = "se") %>% dplyr::left_join(proDA_missingness, by = c(rlang::as_name(rlang::enquo(grouping)), "comparison")) message("DONE", appendLF = TRUE) diff --git a/R/create_queue.R b/R/create_queue.R index 48c005ae..cff6354f 100644 --- a/R/create_queue.R +++ b/R/create_queue.R @@ -239,10 +239,10 @@ create_queue <- ) )) %>% dplyr::select(-c( - .data$t1_null, - .data$t2_null, - .data$d1_null, - .data$d2_null + "t1_null", + "t2_null", + "d1_null", + "d2_null" )) } @@ -255,7 +255,7 @@ create_queue <- user, measurement_type, experiment_name, - .data$number + "number" ) } @@ -314,7 +314,7 @@ create_queue <- blank <- tidyr::unite(blank_names, "File Name", sep = "_") %>% dplyr::mutate(`Sample Type` = "QC") %>% - dplyr::select(.data$`Sample Type`, .data$`File Name`) %>% + dplyr::select("Sample Type", "File Name") %>% dplyr::mutate(`Sample ID` = 1) %>% dplyr::mutate(Path = data_path) %>% dplyr::mutate(`Instrument Method` = blank_method_path) %>% diff --git a/R/fetch_chebi.R b/R/fetch_chebi.R index fca34f3d..1f396140 100644 --- a/R/fetch_chebi.R +++ b/R/fetch_chebi.R @@ -207,7 +207,8 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { chebi_names_clean <- chebi_names %>% dplyr::distinct(.data$COMPOUND_ID, .data$NAME, .data$TYPE) %>% - dplyr::rename(ID = .data$COMPOUND_ID, TYPE_NAME = .data$TYPE) %>% + dplyr::rename(ID = "COMPOUND_ID", + TYPE_NAME = "TYPE") %>% dplyr::bind_rows(chebi_compounds_names_clean) chebi <- chebi_compounds_clean %>% diff --git a/R/fetch_eco.R b/R/fetch_eco.R index 
eaa3d985..a2e6f994 100644 --- a/R/fetch_eco.R +++ b/R/fetch_eco.R @@ -172,7 +172,7 @@ fetch_eco <- function(return_relation = FALSE, result <- query_result_unnest_3 %>% tidyr::unnest("synonyms") %>% dplyr::distinct(.data$id, .data$name, .data$type) %>% - dplyr::right_join(dplyr::select(query_result_unnest_3, c(-"synonyms")), by = "id") %>% + dplyr::right_join(dplyr::select(query_result_unnest_3, c(-"synonyms")), by = "id", relationship = "many-to-many") %>% dplyr::select( "id", "isObsolete", diff --git a/R/fetch_pdb.R b/R/fetch_pdb.R index 896971d8..2de5a6ac 100644 --- a/R/fetch_pdb.R +++ b/R/fetch_pdb.R @@ -346,7 +346,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { resolution_info <- query_result_clean %>% dplyr::select("pdb_ids", "resolution_combined") %>% - tidyr::unnest(.data$resolution_combined) + tidyr::unnest("resolution_combined") nmr_info <- query_result_clean %>% dplyr::select( @@ -355,9 +355,9 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { "entries.pdbx_nmr_exptl_sample_conditions", "entries.pdbx_nmr_refine" ) %>% - tidyr::unnest(.data$entries.pdbx_nmr_exptl) %>% - tidyr::unnest(.data$entries.pdbx_nmr_exptl_sample_conditions) %>% - tidyr::unnest(.data$entries.pdbx_nmr_refine) + tidyr::unnest("entries.pdbx_nmr_exptl") %>% + tidyr::unnest("entries.pdbx_nmr_exptl_sample_conditions") %>% + tidyr::unnest("entries.pdbx_nmr_refine") # make sure that the data is complete even if there is no NMR structure should_not_be_here <- colnames(nmr_info)[!colnames(nmr_info) %in% c( @@ -403,7 +403,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { rcsb_binding_affinity <- query_result_clean %>% dplyr::select("pdb_ids", "entries.rcsb_binding_affinity") %>% - tidyr::unnest(.data$entries.rcsb_binding_affinity) + tidyr::unnest("entries.rcsb_binding_affinity") # make sure that the data is complete even if there is no affinity information should_not_be_here <- 
colnames(rcsb_binding_affinity)[!colnames(rcsb_binding_affinity) %in% c( @@ -444,7 +444,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { polymer_entities <- query_result_clean %>% dplyr::select("pdb_ids", "entries.polymer_entities") %>% - tidyr::unnest(.data$entries.polymer_entities) %>% + tidyr::unnest("entries.polymer_entities") %>% dplyr::bind_cols( .$entity_poly, .$rcsb_polymer_entity_container_identifiers @@ -472,11 +472,11 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { if (nrow(polymer_entities_no_uniprots) > 0) { polymer_entities <- polymer_entities %>% - tidyr::unnest(c(.data$uniprots)) %>% + tidyr::unnest(c("uniprots")) %>% dplyr::bind_rows(polymer_entities_no_uniprots) } else { polymer_entities <- polymer_entities %>% - tidyr::unnest(c(.data$uniprots)) + tidyr::unnest(c("uniprots")) } if (show_progress == TRUE) { @@ -495,11 +495,11 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { if (nrow(polymer_entities_no_rcsb_polymer_entity_align) > 0) { polymer_entities <- polymer_entities %>% - tidyr::unnest(c(.data$rcsb_polymer_entity_align)) %>% + tidyr::unnest(c("rcsb_polymer_entity_align")) %>% dplyr::bind_rows(polymer_entities_no_rcsb_polymer_entity_align) } else { polymer_entities <- polymer_entities %>% - tidyr::unnest(c(.data$rcsb_polymer_entity_align)) + tidyr::unnest(c("rcsb_polymer_entity_align")) } # some proteins do not contain UniProt information therefore data needs to be extracted differently @@ -643,15 +643,15 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { dplyr::distinct() %>% dplyr::mutate(dplyr::across( .cols = c( - .data$atom_id, - .data$auth_seq_id, - .data$comp_id, - .data$ligand_asym_id, - .data$ligand_atom_id, - .data$ligand_comp_id, - .data$ligand_entity_id, - .data$ligand_is_bound, - .data$seq_id + "atom_id", + "auth_seq_id", + "comp_id", + "ligand_asym_id", + "ligand_atom_id", + "ligand_comp_id", + "ligand_entity_id", + 
"ligand_is_bound", + "seq_id" ), .fns = ~ ifelse(str_detect(.data$atom_id, pattern = "NA"), NA, .x) )) %>% diff --git a/R/fit_drc_4p.R b/R/fit_drc_4p.R index fcf75bfd..f05bd3e0 100644 --- a/R/fit_drc_4p.R +++ b/R/fit_drc_4p.R @@ -185,7 +185,7 @@ fit_drc_4p <- function(data, tidyr::drop_na("mean_ratio", "sd") %>% anova_protti({{ grouping }}, {{ dose }}, .data$mean_ratio, .data$sd, .data$n) %>% dplyr::distinct({{ grouping }}, .data$pval) %>% - tidyr::drop_na(.data$pval) %>% # remove NA pvalues before adjustment! + tidyr::drop_na("pval") %>% # remove NA pvalues before adjustment! dplyr::mutate(anova_adj_pval = stats::p.adjust(.data$pval, method = "BH")) %>% dplyr::rename(anova_pval = "pval") From 3bf98b2e4360eddf059fee55dbb1d7f945f5c97b Mon Sep 17 00:00:00 2001 From: jpquast Date: Fri, 3 Nov 2023 18:18:12 +0100 Subject: [PATCH 32/71] Fixed tests and tidyeval warnings --- R/create_structure_contact_map.R | 8 ++++---- R/extract_metal_binders.R | 14 +++++++------- R/fetch_chebi.R | 2 +- R/fetch_pdb.R | 2 +- R/find_peptide_in_structure.R | 2 +- R/map_peptides_on_structure.R | 10 +++++----- R/woods_plot.R | 2 +- tests/testthat/test-structure_functions.R | 8 ++++---- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/R/create_structure_contact_map.R b/R/create_structure_contact_map.R index 0f76afac..ca42a242 100644 --- a/R/create_structure_contact_map.R +++ b/R/create_structure_contact_map.R @@ -320,7 +320,7 @@ Please always provide a chain ID for your start and end positions."), stringr::str_extract(.data$X1, pattern = "\\d+"), NA ))) %>% - tidyr::fill(.data$pdb_model_number, .direction = "down") %>% + tidyr::fill("pdb_model_number", .direction = "down") %>% dplyr::mutate(pdb_model_number = ifelse(is.na(.data$pdb_model_number), 0, .data$pdb_model_number)) %>% dplyr::filter(!stringr::str_detect(.data$X1, pattern = "^MODEL")) %>% dplyr::mutate( @@ -362,7 +362,7 @@ Please always provide a chain ID for your start and end positions."), id = "my_structure" ) %>% 
dplyr::filter(.data$pdb_model_number %in% pdb_model_number_selection) %>% - dplyr::select(-c(.data$X1, .data$pdb_model_number)) %>% + dplyr::select(-c("X1", "pdb_model_number")) %>% dplyr::mutate(retain_pattern = stringr::str_replace_all( paste(.data$id, .data$auth_asym_id, .data$auth_seq_id, sep = "_"), pattern = "_NA", @@ -504,7 +504,7 @@ Please always provide a chain ID for your start and end positions."), .data$retain_pattern, pattern = paste(paste0(data_retain_pattern1, "(?=$|_)"), collapse = "|") )) %>% - dplyr::rename(id = .data$uniprot_id) + dplyr::rename(id = "uniprot_id") if (data2_missing) { predictions %>% @@ -599,7 +599,7 @@ Please always provide a chain ID for your start and end positions."), dplyr::mutate(distance = sqrt((.data$x.x - .data$x.y)^2 + (.data$y.x - .data$y.y)^2 + (.data$z.x - .data$z.y)^2)) %>% dplyr::select("var1", "var2", "distance") %>% dplyr::filter(.data$distance <= distance_cutoff) %>% - dplyr::left_join(current_structure1 %>% dplyr::select(-.data$id), by = c("var1" = "label_id")) %>% + dplyr::left_join(current_structure1 %>% dplyr::select(-"id"), by = c("var1" = "label_id")) %>% dplyr::left_join(current_structure2, by = c("var2" = "label_id"), suffix = c("_var1", "_var2")) } ) diff --git a/R/extract_metal_binders.R b/R/extract_metal_binders.R index bc78a5e5..dd8c1b7c 100644 --- a/R/extract_metal_binders.R +++ b/R/extract_metal_binders.R @@ -261,12 +261,12 @@ extract_metal_binders <- function(data_uniprot, dplyr::filter(stringr::str_detect(.data$formula, pattern = paste0(paste0(metal_list$symbol, collapse = "(?![:lower:])|"), "(?![:lower:])")) | .data$chebi_id %in% metal_chebi_uniprot$id) %>% # This recreates a version of the data frame provided by protti that contains all metal containing entries from ChEBI dplyr::mutate(extract_formula = stringr::str_extract_all(.data$formula, pattern = paste0(paste0(metal_list$symbol, collapse = "(?![:lower:])|"), "(?![:lower:])"))) %>% - tidyr::unnest(.data$extract_formula) %>% + 
tidyr::unnest("extract_formula") %>% dplyr::mutate(metal_atom_id = ifelse(is.na(.data$extract_formula), stats::setNames(metal_chebi_uniprot$metal_atom_id, as.character(metal_chebi_uniprot$id))[.data$chebi_id], stats::setNames(metal_list$chebi_id, metal_list$symbol)[.data$extract_formula] )) %>% - dplyr::select(-.data$extract_formula) %>% + dplyr::select(-"extract_formula") %>% dplyr::group_by(.data$chebi_id) %>% dplyr::mutate(metal_atom_id = paste0(.data$metal_atom_id, collapse = ",")) %>% dplyr::distinct() @@ -281,7 +281,7 @@ extract_metal_binders <- function(data_uniprot, find_all_subs( ids = c("ECO:0000352"), main_id = .data$main_id, - type = .data$relation, + type = "relation", accepted_types = "all" ) %>% unlist() @@ -290,7 +290,7 @@ extract_metal_binders <- function(data_uniprot, find_all_subs( ids = c("ECO:0000501"), main_id = .data$main_id, - type = .data$relation, + type = "relation", accepted_types = "all" ) %>% unlist() @@ -560,7 +560,7 @@ extract_metal_binders <- function(data_uniprot, catalytic_activity_uniprot <- data_uniprot %>% dplyr::distinct(.data$accession, .data$cc_catalytic_activity) %>% - tidyr::drop_na(.data$cc_catalytic_activity) %>% + tidyr::drop_na("cc_catalytic_activity") %>% dplyr::mutate(catalytic_activity_split = stringr::str_extract_all( .data$cc_catalytic_activity, pattern = "(?<=CATALYTIC ACTIVITY:).+?(?=CATALYTIC ACTIVITY|$)" @@ -709,13 +709,13 @@ extract_metal_binders <- function(data_uniprot, by = c("go_term" = "slims_from_id"), relationship = "many-to-many" ) %>% - dplyr::rename(metal_id_part = .data$metal_atom_id) %>% + dplyr::rename(metal_id_part = "metal_atom_id") %>% dplyr::mutate(eco_type = dplyr::case_when( .data$eco %in% manual_eco ~ "manual_assertion", .data$eco %in% automatic_eco ~ "automatic_assertion" )) %>% dplyr::mutate(eco_type = ifelse(is.na(.data$eco_type), "automatic_assertion", .data$eco_type)) %>% - tidyr::unite(.data$reference, .data$with_from, col = "evidence_source", na.rm = TRUE) %>% + 
tidyr::unite("reference", "with_from", col = "evidence_source", na.rm = TRUE) %>% dplyr::mutate(evidence_source = stringr::str_replace_all(.data$evidence_source, pattern = "\\|", replacement = ",")) %>% # Now combine data to have one row per accession and chebi_id # First combine evidence_source etc. diff --git a/R/fetch_chebi.R b/R/fetch_chebi.R index 1f396140..5b0bf205 100644 --- a/R/fetch_chebi.R +++ b/R/fetch_chebi.R @@ -200,7 +200,7 @@ fetch_chebi <- function(relation = FALSE, stars = c(3), timeout = 60) { chebi_compounds_names_clean <- chebi_compounds %>% dplyr::filter(.data$STAR %in% stars) %>% dplyr::distinct(.data$ID, .data$NAME) %>% - dplyr::mutate(dplyr::across(c(.data$NAME), ~ dplyr::na_if(.x, "null"))) %>% + dplyr::mutate(dplyr::across(c("NAME"), ~ dplyr::na_if(.x, "null"))) %>% dplyr::filter(!is.na(.data$NAME)) %>% dplyr::mutate(TYPE_NAME = "STANDARD") %>% dplyr::select("ID", "TYPE_NAME", "NAME") diff --git a/R/fetch_pdb.R b/R/fetch_pdb.R index 2de5a6ac..36662f35 100644 --- a/R/fetch_pdb.R +++ b/R/fetch_pdb.R @@ -897,7 +897,7 @@ fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) { combined <- polymer_entities %>% dplyr::full_join(nonpolymer_entities, by = c("pdb_ids", "auth_asym_ids"), relationship = "many-to-many") %>% - dplyr::left_join(rcsb_binding_affinity, by = "pdb_ids") %>% + dplyr::left_join(rcsb_binding_affinity, by = "pdb_ids", relationship = "many-to-many" ) %>% dplyr::left_join(additional_info, by = "pdb_ids") %>% dplyr::left_join(crystal_growth_info, by = "pdb_ids") %>% dplyr::left_join(nmr_info, by = "pdb_ids") %>% diff --git a/R/find_peptide_in_structure.R b/R/find_peptide_in_structure.R index d450450c..18a74b48 100644 --- a/R/find_peptide_in_structure.R +++ b/R/find_peptide_in_structure.R @@ -113,7 +113,7 @@ find_peptide_in_structure <- function(peptide_data, pdb_id_mapping <- uniprot_info %>% tidyr::drop_na() %>% dplyr::mutate(pdb_ids = strsplit(.data$xref_pdb, split = ";")) %>% - tidyr::unnest(.data$pdb_ids) + 
tidyr::unnest("pdb_ids") pdb_data <- fetch_pdb(pdb_ids = unique(pdb_id_mapping$pdb_ids)) } diff --git a/R/map_peptides_on_structure.R b/R/map_peptides_on_structure.R index 36e78112..ecb74610 100644 --- a/R/map_peptides_on_structure.R +++ b/R/map_peptides_on_structure.R @@ -495,7 +495,7 @@ map_peptides_on_structure <- function(peptide_data, replacement = {{ map_value }} )) %>% dplyr::mutate(X1 = ifelse(!is.na(.data$atoms_mod), .data$atoms_mod, .data$X1)) %>% - dplyr::select(.data$X1) %>% + dplyr::select("X1") %>% readr::write_tsv( file = paste0(export_location, .y, file_format), quote = "none", @@ -542,7 +542,7 @@ map_peptides_on_structure <- function(peptide_data, dplyr::mutate({{ map_value }} := ifelse(str_detect({{ map_value }}, pattern = "NA"), NA, {{ map_value }})) %>% dplyr::mutate(atoms_mod = `str_sub<-`(.data$atoms, 61, 66, value = {{ map_value }})) %>% dplyr::mutate(X1 = ifelse(!is.na(.data$atoms_mod), .data$atoms_mod, .data$X1)) %>% - dplyr::select(.data$X1) %>% + dplyr::select("X1") %>% readr::write_tsv( file = paste0(export_location, .y, file_format), quote = "none", @@ -592,7 +592,7 @@ map_peptides_on_structure <- function(peptide_data, list(NA) )) %>% dplyr::ungroup() %>% - tidyr::unnest(.data$residue_internal) %>% + tidyr::unnest("residue_internal") %>% dplyr::group_by({{ chain }}, .data$residue_internal) %>% dplyr::mutate({{ map_value }} := max({{ map_value }})) %>% dplyr::ungroup() %>% @@ -636,7 +636,7 @@ for the mapping. Make sure to provide a chain identifier if a mapping should be replacement = {{ map_value }} )) %>% dplyr::mutate(X1 = ifelse(!is.na(.data$atoms_mod), .data$atoms_mod, .data$X1)) %>% - dplyr::select(.data$X1) %>% + dplyr::select("X1") %>% readr::write_tsv( file = paste0(export_location, "modified_", file_name), quote = "none", @@ -664,7 +664,7 @@ for the mapping. 
Make sure to provide a chain identifier if a mapping should be dplyr::mutate({{ map_value }} := ifelse(str_detect({{ map_value }}, pattern = "NA"), NA, {{ map_value }})) %>% dplyr::mutate(atoms_mod = `str_sub<-`(.data$atoms, 61, 66, value = {{ map_value }})) %>% dplyr::mutate(X1 = ifelse(!is.na(.data$atoms_mod), .data$atoms_mod, .data$X1)) %>% - dplyr::select(.data$X1) %>% + dplyr::select("X1") %>% readr::write_tsv( file = paste0(export_location, "modified_", file_name), quote = "none", diff --git a/R/woods_plot.R b/R/woods_plot.R index b491fe8f..eae3fc5c 100644 --- a/R/woods_plot.R +++ b/R/woods_plot.R @@ -158,7 +158,7 @@ woods_plot <- function(data, fill = {{ colouring }} ), col = "black", - size = 0.7, + linewidth = 0.7, alpha = 0.8 ) + { diff --git a/tests/testthat/test-structure_functions.R b/tests/testthat/test-structure_functions.R index abdc9ccc..c5fbcaf3 100644 --- a/tests/testthat/test-structure_functions.R +++ b/tests/testthat/test-structure_functions.R @@ -19,7 +19,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { test_that("find_peptide_in_structure works", { expect_is(positions_structure, "data.frame") - expect_equal(nrow(positions_structure), 457) + expect_equal(nrow(positions_structure), 569) expect_equal(ncol(positions_structure), 17) }) @@ -69,7 +69,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { )) expect_false(file.exists(paste0(tempdir(), "/6UU2_P0A8T7.pdb"))) - expect_gt(file.info(paste0(tempdir(), "/2EL9_P60906.pdb"))$size, 1000000) + expect_gt(file.info(paste0(tempdir(), "/2EL9_P60906.pdb"))$size, 900000) expect_gt(file.info(paste0(tempdir(), "/P37648_AlphaFold.pdb"))$size, 300000) file_pdb_6UU2_P0A8T7 <- readr::read_tsv(paste0(tempdir(), "/2EL9_P60906.pdb"), col_names = FALSE, show_col_types = FALSE, progress = FALSE) %>% @@ -99,7 +99,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { export_location = tempdir() ) - expect_gt(file.info(paste0(tempdir(), "/modified_2EL9_P60906.cif"))$size, 1000000) + expect_gt(file.info(paste0(tempdir(), 
"/modified_2EL9_P60906.cif"))$size, 900000) # .pdb structure file provided map_peptides_on_structure( @@ -113,7 +113,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { export_location = tempdir() ) - expect_gt(file.info(paste0(tempdir(), "/modified_2EL9_P60906.pdb"))$size, 1000000) + expect_gt(file.info(paste0(tempdir(), "/modified_2EL9_P60906.pdb"))$size, 900000) }) test_that("create_structure_contact_map works", { From eef45b6d023c5285bf03795f084d25a7a7f89e7b Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 6 Nov 2023 16:30:26 +0100 Subject: [PATCH 33/71] version pinning for lme4 (fix for ubuntu) --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 82d72bd3..3b714d7d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,7 +42,8 @@ Imports: httr, methods, R.utils, - stats + stats, + lme4 (>= 1.1-35) RoxygenNote: 7.2.3 Suggests: testthat, From d417067bcd5a7477753ea7abadd315cebff8f6b3 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Tue, 7 Nov 2023 11:29:24 +0100 Subject: [PATCH 34/71] dont check donttest examples --- .github/workflows/R-CMD-check.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f31f68d6..2ada71d9 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,7 +4,7 @@ on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: all name: R-CMD-check @@ -29,6 +29,7 @@ jobs: R_KEEP_PKG_SOURCE: yes TEST_PROTTI: true BUILD_VIGNETTE: true + R_CHECK_DONTTEST_EXAMPLES: false steps: - uses: actions/checkout@v3 From 7adf2feb7f9617b7ec3d16a618d7e548d925b2b6 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Tue, 7 Nov 2023 11:36:27 +0100 Subject: [PATCH 35/71] trigger workflow - new empty 
line --- .github/workflows/R-CMD-check.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 2ada71d9..dbd7ee1f 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -50,3 +50,4 @@ jobs: - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true + From 72e0e7db03ead93b0d288004be17993ad0b3fa76 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Tue, 7 Nov 2023 11:45:35 +0100 Subject: [PATCH 36/71] fix syntax of all branches --- .github/workflows/R-CMD-check.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index dbd7ee1f..43bb1a3c 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,7 +4,8 @@ on: push: branches: [main, master] pull_request: - branches: all + branches: + - '*' name: R-CMD-check From 30bee1fb3e0eaa0f45cf13dc7126cc10e8565566 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Tue, 7 Nov 2023 15:05:57 +0100 Subject: [PATCH 37/71] more verbose error message --- R/qc_proteome_coverage.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R/qc_proteome_coverage.R b/R/qc_proteome_coverage.R index e9dcadd5..cde30477 100644 --- a/R/qc_proteome_coverage.R +++ b/R/qc_proteome_coverage.R @@ -64,14 +64,17 @@ qc_proteome_coverage <- function(data, dplyr::summarize(proteins_detected = dplyr::n_distinct(!!ensym(protein_id)), .groups = "drop") %>% dplyr::mutate({{ sample }} := "Total") - proteome <- fetch_uniprot_proteome(organism_id, reviewed = reviewed) %>% - dplyr::summarize(proteins_proteome = dplyr::n_distinct(.data$accession), .groups = "drop") + proteome <- fetch_uniprot_proteome(organism_id, reviewed = reviewed) if(is(proteome, "character")){ - # return NULL if UniProt information 
could not be fetched. + # UniProt information could not be fetched. + message("UniProt information could not be fetched") return(NULL) } + proteome <- proteome %>% + dplyr::summarize(proteins_proteome = dplyr::n_distinct(.data$accession), .groups = "drop") + proteome_coverage <- data %>% dplyr::group_by({{ sample }}) %>% dplyr::summarize(proteins_detected = dplyr::n_distinct(!!ensym(protein_id)), .groups = "drop") %>% From b224879000e3b419f1b3085a67337ca43bbebf5e Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Tue, 7 Nov 2023 15:06:08 +0100 Subject: [PATCH 38/71] increase timeout --- R/try_query.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/try_query.R b/R/try_query.R index 993d5b26..a56e0e30 100644 --- a/R/try_query.R +++ b/R/try_query.R @@ -25,7 +25,7 @@ #' #' @return A data frame that contains the table from the url. try_query <- - function(url, max_tries = 5, silent = TRUE, type = "text/tab-separated-values", timeout = 30, accept = NULL, ...) { + function(url, max_tries = 5, silent = TRUE, type = "text/tab-separated-values", timeout = 60, accept = NULL, ...) { # Check if there is an internet connection first if (!curl::has_internet()) { if (!silent) message("No internet connection.") From 1f88bcf7c31b842081b20e51916b48471d373a93 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Tue, 7 Nov 2023 15:57:01 +0100 Subject: [PATCH 39/71] fix documentation mismatch --- R/try_query.R | 2 +- man/try_query.Rd | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/try_query.R b/R/try_query.R index a56e0e30..27e1619e 100644 --- a/R/try_query.R +++ b/R/try_query.R @@ -14,7 +14,7 @@ #' all options that can be supplied to httr::content, these include e.g. #' "text/tab-separated-values", "application/json" and "txt/csv". Default is "text/tab-separated-values". #' Default is "tab-separated-values". 
-#' @param timeout a numeric value that specifies the maximum request time. Default is 30 seconds. +#' @param timeout a numeric value that specifies the maximum request time. Default is 60 seconds. #' @param accept a character value that specifies the type of data that should be sent by the API if #' it uses content negotiation. The default is NULL and it should only be set for APIs that use #' content negotiation. diff --git a/man/try_query.Rd b/man/try_query.Rd index 21b6852e..3f320514 100644 --- a/man/try_query.Rd +++ b/man/try_query.Rd @@ -9,7 +9,7 @@ try_query( max_tries = 5, silent = TRUE, type = "text/tab-separated-values", - timeout = 30, + timeout = 60, accept = NULL, ... ) @@ -29,7 +29,7 @@ all options that can be supplied to httr::content, these include e.g. "text/tab-separated-values", "application/json" and "txt/csv". Default is "text/tab-separated-values". Default is "tab-separated-values".} -\item{timeout}{a numeric value that specifies the maximum request time. Default is 30 seconds.} +\item{timeout}{a numeric value that specifies the maximum request time. Default is 60 seconds.} \item{accept}{a character value that specifies the type of data that should be sent by the API if it uses content negotiation. 
The default is NULL and it should only be set for APIs that use From 43a241c39cec33adb8b5af9245ec140ebcad7b14 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Thu, 9 Nov 2023 09:39:44 +0100 Subject: [PATCH 40/71] test if annotations are in x range --- tests/testthat/test-fetch_extract_and_enrichment_functions.R | 3 ++- tests/testthat/test-structure_functions.R | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-fetch_extract_and_enrichment_functions.R b/tests/testthat/test-fetch_extract_and_enrichment_functions.R index 7c45410a..646f2b84 100644 --- a/tests/testthat/test-fetch_extract_and_enrichment_functions.R +++ b/tests/testthat/test-fetch_extract_and_enrichment_functions.R @@ -453,7 +453,8 @@ if (Sys.getenv("TEST_PROTTI") == "true") { annotations <- fetch_quickgo(type = "annotations", id = uniprot_ids, ontology = "molecular_function") test_that("fetch_quickgo works", { expect_is(annotations, "data.frame") - expect_equal(nrow(annotations), 24) + expect_gte(nrow(annotations), 24) + expect_lte(nrow(annotations), 30) expect_equal(ncol(annotations), 15) terms <- fetch_quickgo(type = "terms") diff --git a/tests/testthat/test-structure_functions.R b/tests/testthat/test-structure_functions.R index c5fbcaf3..149ac32d 100644 --- a/tests/testthat/test-structure_functions.R +++ b/tests/testthat/test-structure_functions.R @@ -19,7 +19,9 @@ if (Sys.getenv("TEST_PROTTI") == "true") { test_that("find_peptide_in_structure works", { expect_is(positions_structure, "data.frame") - expect_equal(nrow(positions_structure), 569) + # test if position structure is in certain range as db can be updated + expect_gte(nrow(positions_structure), 569) + expect_lte(nrow(positions_structure), 600) expect_equal(ncol(positions_structure), 17) }) From 3b32999a591a5a63fab8e71d45e10f6e7bfb1fe7 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Thu, 9 Nov 2023 
11:03:53 +0100 Subject: [PATCH 41/71] increase testing range --- tests/testthat/test-fetch_extract_and_enrichment_functions.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-fetch_extract_and_enrichment_functions.R b/tests/testthat/test-fetch_extract_and_enrichment_functions.R index 646f2b84..e6a0805a 100644 --- a/tests/testthat/test-fetch_extract_and_enrichment_functions.R +++ b/tests/testthat/test-fetch_extract_and_enrichment_functions.R @@ -464,7 +464,8 @@ if (Sys.getenv("TEST_PROTTI") == "true") { slims <- fetch_quickgo(type = "slims", go_id_slims = c("GO:0046872", "GO:0051540")) expect_is(slims, "data.frame") - expect_equal(nrow(slims), 43) + expect_gte(nrow(slims), 38) + expect_lte(nrow(slims), 44) expect_equal(ncol(slims), 2) expect_warning(fetch_quickgo( From 60f43013e0cb3f9cd77225b55db77ba9b69e9613 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Thu, 9 Nov 2023 11:16:30 +0100 Subject: [PATCH 42/71] increase testing range --- .../testthat/test-fetch_extract_and_enrichment_functions.R | 6 ++++-- tests/testthat/test-structure_functions.R | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-fetch_extract_and_enrichment_functions.R b/tests/testthat/test-fetch_extract_and_enrichment_functions.R index 7c45410a..e6a0805a 100644 --- a/tests/testthat/test-fetch_extract_and_enrichment_functions.R +++ b/tests/testthat/test-fetch_extract_and_enrichment_functions.R @@ -453,7 +453,8 @@ if (Sys.getenv("TEST_PROTTI") == "true") { annotations <- fetch_quickgo(type = "annotations", id = uniprot_ids, ontology = "molecular_function") test_that("fetch_quickgo works", { expect_is(annotations, "data.frame") - expect_equal(nrow(annotations), 24) + expect_gte(nrow(annotations), 24) + expect_lte(nrow(annotations), 30) expect_equal(ncol(annotations), 15) terms <- fetch_quickgo(type = "terms") @@ -463,7 +464,8 @@ if (Sys.getenv("TEST_PROTTI") == "true") { 
slims <- fetch_quickgo(type = "slims", go_id_slims = c("GO:0046872", "GO:0051540")) expect_is(slims, "data.frame") - expect_equal(nrow(slims), 43) + expect_gte(nrow(slims), 38) + expect_lte(nrow(slims), 44) expect_equal(ncol(slims), 2) expect_warning(fetch_quickgo( diff --git a/tests/testthat/test-structure_functions.R b/tests/testthat/test-structure_functions.R index c5fbcaf3..dfe1498b 100644 --- a/tests/testthat/test-structure_functions.R +++ b/tests/testthat/test-structure_functions.R @@ -19,7 +19,8 @@ if (Sys.getenv("TEST_PROTTI") == "true") { test_that("find_peptide_in_structure works", { expect_is(positions_structure, "data.frame") - expect_equal(nrow(positions_structure), 569) + expect_gte(nrow(positions_structure), 569) + expect_lte(nrow(positions_structure), 600) expect_equal(ncol(positions_structure), 17) }) From 02d7479977a981ef46ec135706401fc5c0628a67 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:56:17 +0100 Subject: [PATCH 43/71] remove lme4 from dependencies and add it to github actions --- .github/workflows/R-CMD-check.yaml | 6 ++++-- DESCRIPTION | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 43bb1a3c..5a8737fd 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -30,11 +30,14 @@ jobs: R_KEEP_PKG_SOURCE: yes TEST_PROTTI: true BUILD_VIGNETTE: true - R_CHECK_DONTTEST_EXAMPLES: false steps: - uses: actions/checkout@v3 + - name: Install lme4 package + run: | + R -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always") + - uses: r-lib/actions/setup-pandoc@v2 - uses: r-lib/actions/setup-r@v2 @@ -51,4 +54,3 @@ jobs: - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true - diff --git a/DESCRIPTION b/DESCRIPTION index 39cd274c..9ccd07de 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,8 +42,7 @@ Imports: httr, 
methods, R.utils, - stats, - lme4 (>= 1.1-35) + stats RoxygenNote: 7.2.3 Suggests: testthat, From b295af731ac91da687ce9fe749aef559a0173d87 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:12:14 +0100 Subject: [PATCH 44/71] =?UTF-8?q?fix=20g=C3=A4nsef=C3=BCschen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 5a8737fd..cdb25b05 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -36,7 +36,7 @@ jobs: - name: Install lme4 package run: | - R -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always") + R -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' - uses: r-lib/actions/setup-pandoc@v2 From 9684ac2dd07e72a55852a53bb645a3054e4837b4 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:19:52 +0100 Subject: [PATCH 45/71] use rscript intead of r --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index cdb25b05..07889af5 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -36,7 +36,7 @@ jobs: - name: Install lme4 package run: | - R -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' + Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' - uses: r-lib/actions/setup-pandoc@v2 From 1ed95dbda03dd34e5ae58658ec27801a20b4dab4 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:24:38 +0100 Subject: [PATCH 46/71] install remotes package --- 
.github/workflows/R-CMD-check.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 07889af5..6e7afc36 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -34,6 +34,10 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Install remotes package + run: | + Rscript -e 'install.packages("remotes", repos="https://ftp.belnet.be/mirror/CRAN")' + - name: Install lme4 package run: | Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' From 5bc55c20ecb9535bce18722091a9b9420f7324a4 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:27:14 +0100 Subject: [PATCH 47/71] instal remotes fix --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 6e7afc36..92b40606 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -36,7 +36,7 @@ jobs: - name: Install remotes package run: | - Rscript -e 'install.packages("remotes", repos="https://ftp.belnet.be/mirror/CRAN")' + Rscript -e 'install.packages("remotes")' - name: Install lme4 package run: | From 578565412239ad3bd896b1bb9ddec2bff8de8b3a Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:29:27 +0100 Subject: [PATCH 48/71] path to lib --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 92b40606..097c0e0d 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -36,7 +36,7 @@ jobs: - name: Install remotes package run: | - Rscript -e 'install.packages("remotes")' + Rscript -e 'install.packages("remotes", 
lib="$HOME/R/library")' - name: Install lme4 package run: | From e88213f14f1e16e7d3966996e6786f42c02e1058 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 15:05:29 +0100 Subject: [PATCH 49/71] change order of workflow --- .github/workflows/R-CMD-check.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 097c0e0d..825b07a0 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -34,14 +34,6 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Install remotes package - run: | - Rscript -e 'install.packages("remotes", lib="$HOME/R/library")' - - - name: Install lme4 package - run: | - Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' - - uses: r-lib/actions/setup-pandoc@v2 - uses: r-lib/actions/setup-r@v2 @@ -50,6 +42,14 @@ jobs: http-user-agent: ${{ matrix.config.http-user-agent }} use-public-rspm: true + #- name: Install remotes package + # Rscript -e 'install.packages("remotes", lib="$HOME/R/library")' + # run: | + + - name: Install lme4 package + run: | + Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::rcmdcheck From 1e260e8e36fe94cb0c43a87dd78fdf19e3a6a755 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 15:26:26 +0100 Subject: [PATCH 50/71] install in new order --- .github/workflows/R-CMD-check.yaml | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 825b07a0..4c1011bd 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -42,19 +42,22 @@ jobs: http-user-agent: ${{ 
matrix.config.http-user-agent }} use-public-rspm: true - #- name: Install remotes package - # Rscript -e 'install.packages("remotes", lib="$HOME/R/library")' - # run: | - - - name: Install lme4 package - run: | - Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' + #run: | + ##- name: Install remotes package + # R_LIBS_USER="$HOME/R/library" + #Rscript -e 'install.packages("remotes", lib=Sys.getenv("R_LIBS_USER"))' + #mkdir -p "$R_LIBS_USER" +' - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::rcmdcheck needs: check + - name: Install lme4 package + run: | + Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always") + - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true From 6c5f9f4058046249befecc19041a9b349067aad7 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 15:29:58 +0100 Subject: [PATCH 51/71] syntax fix --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 4c1011bd..597600d0 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -47,7 +47,7 @@ jobs: # R_LIBS_USER="$HOME/R/library" #Rscript -e 'install.packages("remotes", lib=Sys.getenv("R_LIBS_USER"))' #mkdir -p "$R_LIBS_USER" -' + - uses: r-lib/actions/setup-r-dependencies@v2 with: From f614c374395b953bab5218b917356f8941e08246 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 15:33:20 +0100 Subject: [PATCH 52/71] fix 100 --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 597600d0..0d930129 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -56,7 
+56,7 @@ jobs: - name: Install lme4 package run: | - Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always") + Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' - uses: r-lib/actions/check-r-package@v2 with: From 54d2a4c7111c1cff274ba0faf3ce0f22cbb84e1f Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 16:07:44 +0100 Subject: [PATCH 53/71] install with in dependencies --- .github/workflows/R-CMD-check.yaml | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 0d930129..a139003f 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -42,21 +42,16 @@ jobs: http-user-agent: ${{ matrix.config.http-user-agent }} use-public-rspm: true - #run: | - ##- name: Install remotes package - # R_LIBS_USER="$HOME/R/library" - #Rscript -e 'install.packages("remotes", lib=Sys.getenv("R_LIBS_USER"))' - #mkdir -p "$R_LIBS_USER" - - - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::rcmdcheck + extra-packages: any::lme4 needs: check - - name: Install lme4 package - run: | - Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' + # run: | + # - name: Install remotes and lme4 package + # Rscript -e 'install.packages("remotes", lib=Sys.getenv("R_LIB_FOR_PAK"))' + # Rscript -e 'remotes::install_cran("lme4", dependencies = TRUE, upgrade = "always")' - uses: r-lib/actions/check-r-package@v2 with: From 894f7ff83b5acf1283e64ea0d7c3ac04f2346c33 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 13 Nov 2023 16:08:56 +0100 Subject: [PATCH 54/71] fix extra packages --- .github/workflows/R-CMD-check.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml 
b/.github/workflows/R-CMD-check.yaml index a139003f..1a74931e 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -44,8 +44,9 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::rcmdcheck - extra-packages: any::lme4 + extra-packages: + any::rcmdcheck + any::lme4 needs: check # run: | From 04cdb160e28fa075c694f37fe5c213b23086b83b Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 18 Dec 2023 15:50:51 +0100 Subject: [PATCH 55/71] fix deprecated code --- R/correct_lip_for_abundance.R | 134 +++++++++++++++++----------------- 1 file changed, 66 insertions(+), 68 deletions(-) diff --git a/R/correct_lip_for_abundance.R b/R/correct_lip_for_abundance.R index b5509e4d..413ff41f 100644 --- a/R/correct_lip_for_abundance.R +++ b/R/correct_lip_for_abundance.R @@ -1,17 +1,17 @@ #' Protein abundance correction #' -#' Performs the correction of LiP-peptides for changes in protein abundance and +#' Performs the correction of LiP-peptides for changes in protein abundance and #' calculates their significance using a t-test #' -#' @param lip_data a data frame containing at least the input variables. Ideally, +#' @param lip_data a data frame containing at least the input variables. Ideally, #' the result from the \code{calculate_diff_abundance} function is used. -#' @param trp_data a data frame containing at least the input variables minus the grouping column. Ideally, +#' @param trp_data a data frame containing at least the input variables minus the grouping column. Ideally, #' the result from the \code{calculate_diff_abundance} function is used. -#' @param protein_id a character column in the \code{lip_data} and \code{trp_data} data frames +#' @param protein_id a character column in the \code{lip_data} and \code{trp_data} data frames #' that contains protein identifiers. 
#' @param grouping a character column in the \code{lip_data} data frame that contains precursor or #' peptide identifiers. -#' @param comparison a character column in the \code{lip_data} and \code{trp_data} data frames +#' @param comparison a character column in the \code{lip_data} and \code{trp_data} data frames #' that contains the comparisons between conditions. #' @param diff a numeric column in the \code{lip_data} and \code{trp_data} data frames #' that contains log2-fold changes for peptide or protein quantities. @@ -28,45 +28,46 @@ #' column names, but in a vector). Please note that if you retain columns that have multiple #' rows per grouped variable there will be duplicated rows in the output. #' @param method a character value, specifies the method used to estimate the degrees of freedom. -#' Possible methods are c("satterthwaite", "no_df_approximation"). \code{satterthwaite} uses the Welch-Satterthwaite -#' equation to estimate the pooled degrees of freedom, as described in https://doi.org/10.1016/j.mcpro.2022.100477 and -#' implemented in the MSstatsLiP package. This approach respects the number of protein measurements for the degrees of freedom. +#' Possible methods are c("satterthwaite", "no_df_approximation"). \code{satterthwaite} uses the Welch-Satterthwaite +#' equation to estimate the pooled degrees of freedom, as described in https://doi.org/10.1016/j.mcpro.2022.100477 and +#' implemented in the MSstatsLiP package. This approach respects the number of protein measurements for the degrees of freedom. #' \code{no_df_approximation} just takes the number of peptides into account when calculating the degrees of freedom. 
-#' -#' @return a data frame containing corrected differential abundances (\code{adj_diff}, adjusted +#' +#' @return a data frame containing corrected differential abundances (\code{adj_diff}, adjusted #' standard errors (\code{adj_std_error}), degrees of freedom (\code{df}), pvalues (\code{pval}) and #' adjusted p-values (\code{adj_pval}) -#' +#' #' @author Aaron Fehr #' @import dplyr #' @importFrom rlang .data enquo sym as_name expr := !! #' @export #' #' @examples -#' +#' #' # Load libraries -#' +#' #' library(dplyr) -#' +#' #' # Load example data and simulate tryptic data by summing up precursors -#' +#' #' data = rapamycin_10uM #' -#' data_trp = data %>% +#' data_trp = data %>% #' dplyr::group_by(pg_protein_accessions, r_file_name) %>% #' dplyr::mutate(pg_quantity = sum(fg_quantity)) %>% #' dplyr::distinct(r_condition, #' r_file_name, #' pg_protein_accessions, -#' pg_quantity) -#' +#' pg_quantity) +#' #' #' # Calculate differential abundances for LiP and Trp data -#' -#' +#' +#' #' diff_lip = data %>% #' mutate(fg_intensity_log2 = log2(fg_quantity)) %>% -#' assign_missingness(sample = r_file_name, +#' assign_missingness( +#' sample = r_file_name, #' condition = r_condition, #' intensity = fg_intensity_log2, #' grouping = eg_precursor_id, @@ -94,18 +95,18 @@ #' intensity_log2 = pg_intensity_log2, #' comparison = comparison, #' method = "t-test") -#' +#' #' # Correct for abundance changes -#' +#' #' corrected = correct_lip_for_abundance( -#' +#' #' lip_data = diff_lip, #' trp_data = diff_trp, #' protein_id = pg_protein_accessions, #' grouping = eg_precursor_id, #' retain_columns = c("missingness"), #' method = "satterthwaite") -#' +#' #' head(corrected, n = 10) @@ -121,51 +122,46 @@ correct_lip_for_abundance = function( p_adj_method = "BH", retain_columns = NULL, method = c("satterthwaite","no_df_approximation")){ - + method = match.arg(method) - - + se_pep = rlang::sym(paste0(rlang::as_name(rlang::enquo(std_error)),"_pep")) - se_prot = 
rlang::sym(paste0(rlang::as_name(rlang::enquo(std_error)),"_prot")) - diff_pep = rlang::sym(paste0(rlang::as_name(rlang::enquo(diff)),"_pep")) - diff_prot = rlang::sym(paste0(rlang::as_name(rlang::enquo(diff)),"_prot")) - n_pep = rlang::sym(paste0(rlang::as_name(rlang::enquo(n_obs)),"_pep")) - n_prot = rlang::sym(paste0(rlang::as_name(rlang::enquo(n_obs)),"_prot")) - - + temp_lip_data = lip_data %>% dplyr::select(!!enquo(retain_columns), {{comparison}}, {{ protein_id }}, {{ grouping}}, {{ diff }}, {{ n_obs }}, {{ std_error }}) %>% dplyr::distinct() - + temp_trp_data = trp_data %>% dplyr::distinct({{comparison}}, {{ protein_id }}, {{ diff }}, {{ n_obs }}, {{ std_error }}) - - + + test = temp_lip_data %>% dplyr::distinct({{comparison}}, {{ protein_id }}, {{ grouping}}) - + if (nrow(test) != nrow(temp_lip_data)){ message("Warning: Your data frame contains dublicated values due to retained columns. This will affect the multiple testing correction.") } - - combined_data = dplyr::left_join(x = temp_lip_data, - y = temp_trp_data, - by = c(rlang::as_name(rlang::enquo(comparison)), rlang::as_name(rlang::enquo(protein_id))), - suffix = c("_pep", "_prot")) - + + combined_data = dplyr::left_join( + x = temp_lip_data, + y = temp_trp_data, + by = c(rlang::as_name(rlang::enquo(comparison)), rlang::as_name(rlang::enquo(protein_id))), + suffix = c("_pep", "_prot") + ) + n_unmatched = combined_data %>% dplyr::filter(is.na(!!diff_prot) == T) %>% nrow() - + percent_unmatched = round(n_unmatched/nrow(combined_data)*100, 2) - + message(paste0("No protein data was available for ",n_unmatched," peptides (",percent_unmatched, "% of dataset).")) - + if (method == "satterthwaite"){ corrected_data = combined_data %>% @@ -173,43 +169,45 @@ correct_lip_for_abundance = function( dplyr::mutate(adj_std_error = sqrt( (!!se_pep)**2 + (!!se_prot)**2) ) %>% dplyr::mutate(numer = ((!!se_pep)**2 + (!!se_prot)**2)**2) %>% dplyr::mutate(denom = ( (!!se_pep)**4 / (!!n_pep-2) + (!!se_prot)**4/(!!n_prot-2))) 
%>% - dplyr::mutate(df = .data$numer/.data$denom) %>% - dplyr::mutate(tval = .data$adj_diff / .data$adj_std_error) %>% - dplyr::mutate(pval = 2*stats::pt(abs(.data$tval), .data$df, lower.tail = FALSE)) - + dplyr::mutate(df = numer/denom) %>% + dplyr::mutate(tval = adj_diff / adj_std_error) %>% + dplyr::mutate(pval = 2*stats::pt(abs(tval), df, lower.tail = FALSE)) + adjusted_data = dplyr::left_join(x = corrected_data, y = corrected_data %>% - dplyr::filter(is.na(.data$pval) == FALSE) %>% + dplyr::filter(is.na(pval) == FALSE) %>% dplyr::group_by({{ comparison }}) %>% - dplyr::mutate(adj_pval = p.adjust(.data$pval, method = {{ p_adj_method }})), + dplyr::mutate(adj_pval = p.adjust(pval, method = {{ p_adj_method }})), by = colnames(corrected_data)) %>% - dplyr::select(- .data$numer, - .data$denom, - .data$tval) + dplyr::select(- numer, - denom, - tval) return(adjusted_data) - + } - + if (method == "no_df_approximation"){ corrected_data = combined_data %>% - dplyr::mutate(adj_diff = !!diff_pep - !!diff_prot) %>% - dplyr::mutate(adj_std_error = sqrt( (!!se_pep)**2 + (!!se_prot)**2) ) %>% - dplyr::mutate(df = !!n_pep - 2) %>% - dplyr::mutate(tval = .data$adj_diff / .data$adj_std_error) %>% - dplyr::mutate(pval = 2*stats::pt(abs(.data$tval), .data$df, lower.tail = FALSE)) + dplyr::mutate( + adj_diff = !!diff_pep - !!diff_prot, + adj_std_error = sqrt( (!!se_pep)**2 + (!!se_prot)**2), + df = !!n_pep - 2, + tval = adj_diff / adj_std_error, + pval = 2*stats::pt(abs(tval), df, lower.tail = FALSE) + ) adjusted_data = dplyr::left_join(x = corrected_data, y = corrected_data %>% - dplyr::filter(is.na(.data$pval) == FALSE) %>% - dplyr::mutate(adj_pval = p.adjust(.data$pval, method = {{ p_adj_method }})), + dplyr::filter(is.na(pval) == FALSE) %>% + dplyr::mutate(adj_pval = p.adjust(pval, method = {{ p_adj_method }})), by = colnames(corrected_data)) %>% - dplyr::select(- .data$tval) - - + dplyr::select(- tval) + + return(adjusted_data) } - + } From 
51ec9b0f35d618848afec9481b744c7dd8ab3622 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 18 Dec 2023 18:36:22 +0100 Subject: [PATCH 56/71] style file --- R/correct_lip_for_abundance.R | 207 ++++++++++++++++++---------------- 1 file changed, 107 insertions(+), 100 deletions(-) diff --git a/R/correct_lip_for_abundance.R b/R/correct_lip_for_abundance.R index 413ff41f..a78edb1c 100644 --- a/R/correct_lip_for_abundance.R +++ b/R/correct_lip_for_abundance.R @@ -50,67 +50,73 @@ #' #' # Load example data and simulate tryptic data by summing up precursors #' -#' data = rapamycin_10uM +#' data <- rapamycin_10uM #' -#' data_trp = data %>% -#' dplyr::group_by(pg_protein_accessions, r_file_name) %>% -#' dplyr::mutate(pg_quantity = sum(fg_quantity)) %>% -#' dplyr::distinct(r_condition, -#' r_file_name, -#' pg_protein_accessions, -#' pg_quantity) +#' data_trp <- data %>% +#' dplyr::group_by(pg_protein_accessions, r_file_name) %>% +#' dplyr::mutate(pg_quantity = sum(fg_quantity)) %>% +#' dplyr::distinct( +#' r_condition, +#' r_file_name, +#' pg_protein_accessions, +#' pg_quantity +#' ) #' #' #' # Calculate differential abundances for LiP and Trp data #' -#' -#' diff_lip = data %>% -#' mutate(fg_intensity_log2 = log2(fg_quantity)) %>% -#' assign_missingness( -#' sample = r_file_name, -#' condition = r_condition, -#' intensity = fg_intensity_log2, -#' grouping = eg_precursor_id, -#' ref_condition = "control", -#' retain_columns = "pg_protein_accessions") %>% -#' calculate_diff_abundance(sample = r_file_name, -#' condition = r_condition, -#' grouping = eg_precursor_id, -#' intensity_log2 = fg_intensity_log2, -#' comparison = comparison, -#' method = "t-test", -#' retain_columns = "pg_protein_accessions") -#' -#' -#' diff_trp = data_trp %>% -#' mutate(pg_intensity_log2 = log2(pg_quantity)) %>% -#' assign_missingness(sample = r_file_name, -#' condition = r_condition, -#' intensity = pg_intensity_log2, -#' grouping = 
pg_protein_accessions, -#' ref_condition = "control") %>% -#' calculate_diff_abundance(sample = r_file_name, -#' condition = r_condition, -#' grouping = pg_protein_accessions, -#' intensity_log2 = pg_intensity_log2, -#' comparison = comparison, -#' method = "t-test") +#' diff_lip <- data %>% +#' dplyr::mutate(fg_intensity_log2 = log2(fg_quantity)) %>% +#' assign_missingness( +#' sample = r_file_name, +#' condition = r_condition, +#' intensity = fg_intensity_log2, +#' grouping = eg_precursor_id, +#' ref_condition = "control", +#' retain_columns = "pg_protein_accessions" +#' ) %>% +#' calculate_diff_abundance( +#' sample = r_file_name, +#' condition = r_condition, +#' grouping = eg_precursor_id, +#' intensity_log2 = fg_intensity_log2, +#' comparison = comparison, +#' method = "t-test", +#' retain_columns = "pg_protein_accessions" +#' ) +#' +#' +#' diff_trp <- data_trp %>% +#' dplyr::mutate(pg_intensity_log2 = log2(pg_quantity)) %>% +#' assign_missingness( +#' sample = r_file_name, +#' condition = r_condition, +#' intensity = pg_intensity_log2, +#' grouping = pg_protein_accessions, +#' ref_condition = "control" +#' ) %>% +#' calculate_diff_abundance( +#' sample = r_file_name, +#' condition = r_condition, +#' grouping = pg_protein_accessions, +#' intensity_log2 = pg_intensity_log2, +#' comparison = comparison, +#' method = "t-test" +#' ) #' #' # Correct for abundance changes #' -#' corrected = correct_lip_for_abundance( -#' +#' corrected <- correct_lip_for_abundance( #' lip_data = diff_lip, #' trp_data = diff_trp, #' protein_id = pg_protein_accessions, #' grouping = eg_precursor_id, #' retain_columns = c("missingness"), -#' method = "satterthwaite") +#' method = "satterthwaite" +#' ) #' #' head(corrected, n = 10) - - -correct_lip_for_abundance = function( +correct_lip_for_abundance <- function( lip_data, trp_data, protein_id, @@ -121,93 +127,94 @@ correct_lip_for_abundance = function( std_error = std_error, p_adj_method = "BH", retain_columns = NULL, - method = 
c("satterthwaite","no_df_approximation")){ - - method = match.arg(method) - - se_pep = rlang::sym(paste0(rlang::as_name(rlang::enquo(std_error)),"_pep")) - se_prot = rlang::sym(paste0(rlang::as_name(rlang::enquo(std_error)),"_prot")) - diff_pep = rlang::sym(paste0(rlang::as_name(rlang::enquo(diff)),"_pep")) - diff_prot = rlang::sym(paste0(rlang::as_name(rlang::enquo(diff)),"_prot")) - n_pep = rlang::sym(paste0(rlang::as_name(rlang::enquo(n_obs)),"_pep")) - n_prot = rlang::sym(paste0(rlang::as_name(rlang::enquo(n_obs)),"_prot")) - - temp_lip_data = lip_data %>% - dplyr::select(!!enquo(retain_columns), {{comparison}}, {{ protein_id }}, {{ grouping}}, {{ diff }}, {{ n_obs }}, {{ std_error }}) %>% + method = c("satterthwaite", "no_df_approximation")) { + method <- match.arg(method) + + se_pep <- rlang::sym(paste0(rlang::as_name(rlang::enquo(std_error)), "_pep")) + se_prot <- rlang::sym(paste0(rlang::as_name(rlang::enquo(std_error)), "_prot")) + diff_pep <- rlang::sym(paste0(rlang::as_name(rlang::enquo(diff)), "_pep")) + diff_prot <- rlang::sym(paste0(rlang::as_name(rlang::enquo(diff)), "_prot")) + n_pep <- rlang::sym(paste0(rlang::as_name(rlang::enquo(n_obs)), "_pep")) + n_prot <- rlang::sym(paste0(rlang::as_name(rlang::enquo(n_obs)), "_prot")) + + temp_lip_data <- lip_data %>% + dplyr::select(!!enquo(retain_columns), {{ comparison }}, {{ protein_id }}, {{ grouping }}, {{ diff }}, {{ n_obs }}, {{ std_error }}) %>% dplyr::distinct() - temp_trp_data = trp_data %>% - dplyr::distinct({{comparison}}, {{ protein_id }}, {{ diff }}, {{ n_obs }}, {{ std_error }}) + temp_trp_data <- trp_data %>% + dplyr::distinct({{ comparison }}, {{ protein_id }}, {{ diff }}, {{ n_obs }}, {{ std_error }}) - test = temp_lip_data %>% - dplyr::distinct({{comparison}}, {{ protein_id }}, {{ grouping}}) + test <- temp_lip_data %>% + dplyr::distinct({{ comparison }}, {{ protein_id }}, {{ grouping }}) - if (nrow(test) != nrow(temp_lip_data)){ + if (nrow(test) != nrow(temp_lip_data)) { message("Warning: 
Your data frame contains dublicated values due to retained columns. This will affect the multiple testing correction.") - } + } - combined_data = dplyr::left_join( + combined_data <- dplyr::left_join( x = temp_lip_data, y = temp_trp_data, by = c(rlang::as_name(rlang::enquo(comparison)), rlang::as_name(rlang::enquo(protein_id))), suffix = c("_pep", "_prot") ) - n_unmatched = combined_data %>% + n_unmatched <- combined_data %>% dplyr::filter(is.na(!!diff_prot) == T) %>% nrow() - percent_unmatched = round(n_unmatched/nrow(combined_data)*100, 2) - - message(paste0("No protein data was available for ",n_unmatched," peptides (",percent_unmatched, "% of dataset).")) + percent_unmatched <- round(n_unmatched / nrow(combined_data) * 100, 2) - if (method == "satterthwaite"){ + if (n_unmatched != 0) { + message(paste0("No protein data was available for ", n_unmatched, " peptides (", percent_unmatched, "% of dataset).")) + } - corrected_data = combined_data %>% - dplyr::mutate(adj_diff = !!diff_pep - !!diff_prot) %>% - dplyr::mutate(adj_std_error = sqrt( (!!se_pep)**2 + (!!se_prot)**2) ) %>% - dplyr::mutate(numer = ((!!se_pep)**2 + (!!se_prot)**2)**2) %>% - dplyr::mutate(denom = ( (!!se_pep)**4 / (!!n_pep-2) + (!!se_prot)**4/(!!n_prot-2))) %>% - dplyr::mutate(df = numer/denom) %>% - dplyr::mutate(tval = adj_diff / adj_std_error) %>% - dplyr::mutate(pval = 2*stats::pt(abs(tval), df, lower.tail = FALSE)) + if (method == "satterthwaite") { + corrected_data <- combined_data %>% + dplyr::mutate(adj_diff = !!diff_pep - !!diff_prot) %>% + dplyr::mutate(adj_std_error = sqrt((!!se_pep)**2 + (!!se_prot)**2)) %>% + dplyr::mutate(numer = ((!!se_pep)**2 + (!!se_prot)**2)**2) %>% + dplyr::mutate(denom = ((!!se_pep)**4 / (!!n_pep - 2) + (!!se_prot)**4 / (!!n_prot - 2))) %>% + dplyr::mutate(df = numer / denom) %>% + dplyr::mutate(tval = adj_diff / adj_std_error) %>% + dplyr::mutate(pval = 2 * stats::pt(abs(tval), df, lower.tail = FALSE)) - adjusted_data = dplyr::left_join(x = corrected_data, - 
y = corrected_data %>% - dplyr::filter(is.na(pval) == FALSE) %>% - dplyr::group_by({{ comparison }}) %>% - dplyr::mutate(adj_pval = p.adjust(pval, method = {{ p_adj_method }})), - by = colnames(corrected_data)) %>% - dplyr::select(- numer, - denom, - tval) - return(adjusted_data) + adjusted_data <- dplyr::left_join( + x = corrected_data, + y = corrected_data %>% + dplyr::filter(is.na(pval) == FALSE) %>% + dplyr::group_by({{ comparison }}) %>% + dplyr::mutate(adj_pval = p.adjust(pval, method = {{ p_adj_method }})), + by = colnames(corrected_data) + ) %>% + dplyr::select(-numer, -denom, -tval) + return(adjusted_data) } - if (method == "no_df_approximation"){ - - corrected_data = combined_data %>% + if (method == "no_df_approximation") { + corrected_data <- combined_data %>% dplyr::mutate( adj_diff = !!diff_pep - !!diff_prot, - adj_std_error = sqrt( (!!se_pep)**2 + (!!se_prot)**2), + adj_std_error = sqrt((!!se_pep)**2 + (!!se_prot)**2), df = !!n_pep - 2, tval = adj_diff / adj_std_error, - pval = 2*stats::pt(abs(tval), df, lower.tail = FALSE) + pval = 2 * stats::pt(abs(tval), df, lower.tail = FALSE) ) - adjusted_data = dplyr::left_join(x = corrected_data, - y = corrected_data %>% - dplyr::filter(is.na(pval) == FALSE) %>% - dplyr::mutate(adj_pval = p.adjust(pval, method = {{ p_adj_method }})), - by = colnames(corrected_data)) %>% - dplyr::select(- tval) + adjusted_data <- dplyr::left_join( + x = corrected_data, + y = corrected_data %>% + dplyr::filter(is.na(pval) == FALSE) %>% + dplyr::mutate(adj_pval = p.adjust(pval, method = {{ p_adj_method }})), + by = colnames(corrected_data) + ) %>% + dplyr::select(-tval) return(adjusted_data) - } - } From 3fc1b2aa33d6db7cbbb60ec8904fafadb00dc4e5 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Mon, 18 Dec 2023 18:36:30 +0100 Subject: [PATCH 57/71] setup testing --- tests/testthat/test-workflow.R | 57 +++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 
deletion(-) diff --git a/tests/testthat/test-workflow.R b/tests/testthat/test-workflow.R index 76647263..d5e799e9 100644 --- a/tests/testthat/test-workflow.R +++ b/tests/testthat/test-workflow.R @@ -7,7 +7,7 @@ data <- create_synthetic_data( n_replicates = 3, n_conditions = 2, method = "effect_random", - additional_metadata = FALSE + additional_metadata = TRUE ) data_drc <- create_synthetic_data( @@ -330,6 +330,61 @@ test_that("calculate_diff_abundance works", { expect_equal(round(min(diff_proDA$adj_pval, na.rm = TRUE), digits = 5), 0.00125) } }) +# +# test_that("correct_lip_for_abundance works", { +# diff_lip = normalised_data %>% +# dplyr::mutate(fg_intensity_log2 = log2(fg_quantity)) %>% +# assign_missingness( +# sample = r_file_name, +# condition = r_condition, +# intensity = fg_intensity_log2, +# grouping = eg_precursor_id, +# ref_condition = "control", +# retain_columns = "pg_protein_accessions") %>% +# calculate_diff_abundance( +# sample = r_file_name, +# condition = r_condition, +# grouping = eg_precursor_id, +# intensity_log2 = fg_intensity_log2, +# comparison = comparison, +# method = "t-test", +# retain_columns = "pg_protein_accessions" +# ) +# +# diff_trp = normalised_data %>% +# dplyr::group_by(pg_protein_accessions, r_file_name) %>% +# dplyr::mutate(pg_quantity = sum(fg_quantity)) %>% +# dplyr::distinct( +# r_condition, +# r_file_name, +# pg_protein_accessions, +# pg_quantity +# ) %>% +# dplyr::mutate(pg_intensity_log2 = log2(pg_quantity)) %>% +# assign_missingness( +# sample = r_file_name, +# condition = r_condition, +# intensity = pg_intensity_log2, +# grouping = pg_protein_accessions, +# ref_condition = "control") %>% +# calculate_diff_abundance(sample = r_file_name, +# condition = r_condition, +# grouping = pg_protein_accessions, +# intensity_log2 = pg_intensity_log2, +# comparison = comparison, +# method = "t-test") +# +# corrected = correct_lip_for_abundance( +# lip_data = diff_lip, +# trp_data = diff_trp, +# protein_id = pg_protein_accessions, 
+# grouping = eg_precursor_id, +# retain_columns = c("missingness"), +# method = "satterthwaite" +# ) +# +# +# }) if (Sys.getenv("TEST_PROTTI") == "true") { test_that("deprecated diff_abundance works", { From 7f7c567e0e1a6e9520b5548f6f6fd146cb989467 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Tue, 19 Dec 2023 11:31:28 +0100 Subject: [PATCH 58/71] refactor --- R/correct_lip_for_abundance.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/correct_lip_for_abundance.R b/R/correct_lip_for_abundance.R index a78edb1c..dca28bbb 100644 --- a/R/correct_lip_for_abundance.R +++ b/R/correct_lip_for_abundance.R @@ -172,14 +172,15 @@ correct_lip_for_abundance <- function( if (method == "satterthwaite") { corrected_data <- combined_data %>% - dplyr::mutate(adj_diff = !!diff_pep - !!diff_prot) %>% - dplyr::mutate(adj_std_error = sqrt((!!se_pep)**2 + (!!se_prot)**2)) %>% - dplyr::mutate(numer = ((!!se_pep)**2 + (!!se_prot)**2)**2) %>% - dplyr::mutate(denom = ((!!se_pep)**4 / (!!n_pep - 2) + (!!se_prot)**4 / (!!n_prot - 2))) %>% - dplyr::mutate(df = numer / denom) %>% - dplyr::mutate(tval = adj_diff / adj_std_error) %>% - dplyr::mutate(pval = 2 * stats::pt(abs(tval), df, lower.tail = FALSE)) - + dplyr::mutate( + adj_diff = !!diff_pep - !!diff_prot, + adj_std_error = sqrt((!!se_pep)**2 + (!!se_prot)**2), + numer = ((!!se_pep)**2 + (!!se_prot)**2)**2, + denom = ((!!se_pep)**4 / (!!n_pep - 2) + (!!se_prot)**4 / (!!n_prot - 2)), + df = numer / denom, + tval = adj_diff / adj_std_error, + pval = 2 * stats::pt(abs(tval), df, lower.tail = FALSE) + ) adjusted_data <- dplyr::left_join( x = corrected_data, @@ -204,7 +205,6 @@ correct_lip_for_abundance <- function( pval = 2 * stats::pt(abs(tval), df, lower.tail = FALSE) ) - adjusted_data <- dplyr::left_join( x = corrected_data, y = corrected_data %>% From 4d849d13e9bf7df791a6a49a560d6ddacff42b83 Mon Sep 17 00:00:00 2001 From: Elena Krismer 
<70535771+elena-krismer@users.noreply.github.com> Date: Tue, 19 Dec 2023 11:31:48 +0100 Subject: [PATCH 59/71] add testing for correct lip for abudance --- tests/testthat/test-workflow.R | 129 +++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 55 deletions(-) diff --git a/tests/testthat/test-workflow.R b/tests/testthat/test-workflow.R index d5e799e9..28b34667 100644 --- a/tests/testthat/test-workflow.R +++ b/tests/testthat/test-workflow.R @@ -330,61 +330,80 @@ test_that("calculate_diff_abundance works", { expect_equal(round(min(diff_proDA$adj_pval, na.rm = TRUE), digits = 5), 0.00125) } }) -# -# test_that("correct_lip_for_abundance works", { -# diff_lip = normalised_data %>% -# dplyr::mutate(fg_intensity_log2 = log2(fg_quantity)) %>% -# assign_missingness( -# sample = r_file_name, -# condition = r_condition, -# intensity = fg_intensity_log2, -# grouping = eg_precursor_id, -# ref_condition = "control", -# retain_columns = "pg_protein_accessions") %>% -# calculate_diff_abundance( -# sample = r_file_name, -# condition = r_condition, -# grouping = eg_precursor_id, -# intensity_log2 = fg_intensity_log2, -# comparison = comparison, -# method = "t-test", -# retain_columns = "pg_protein_accessions" -# ) -# -# diff_trp = normalised_data %>% -# dplyr::group_by(pg_protein_accessions, r_file_name) %>% -# dplyr::mutate(pg_quantity = sum(fg_quantity)) %>% -# dplyr::distinct( -# r_condition, -# r_file_name, -# pg_protein_accessions, -# pg_quantity -# ) %>% -# dplyr::mutate(pg_intensity_log2 = log2(pg_quantity)) %>% -# assign_missingness( -# sample = r_file_name, -# condition = r_condition, -# intensity = pg_intensity_log2, -# grouping = pg_protein_accessions, -# ref_condition = "control") %>% -# calculate_diff_abundance(sample = r_file_name, -# condition = r_condition, -# grouping = pg_protein_accessions, -# intensity_log2 = pg_intensity_log2, -# comparison = comparison, -# method = "t-test") -# -# corrected = correct_lip_for_abundance( -# lip_data = diff_lip, -# 
trp_data = diff_trp, -# protein_id = pg_protein_accessions, -# grouping = eg_precursor_id, -# retain_columns = c("missingness"), -# method = "satterthwaite" -# ) -# -# -# }) + +test_that("correct_lip_for_abundance works", { + data <- rapamycin_10uM + diff_lip = data %>% + dplyr::mutate(fg_intensity_log2 = log2(fg_quantity)) %>% + assign_missingness( + sample = r_file_name, + condition = r_condition, + intensity = fg_intensity_log2, + grouping = eg_precursor_id, + ref_condition = "control", + retain_columns = "pg_protein_accessions") %>% + calculate_diff_abundance( + sample = r_file_name, + condition = r_condition, + grouping = eg_precursor_id, + intensity_log2 = fg_intensity_log2, + comparison = comparison, + method = "t-test", + retain_columns = "pg_protein_accessions" + ) + + diff_trp = data %>% + dplyr::group_by(pg_protein_accessions, r_file_name) %>% + dplyr::mutate(pg_quantity = sum(fg_quantity)) %>% + dplyr::distinct( + r_condition, + r_file_name, + pg_protein_accessions, + pg_quantity + ) %>% + dplyr::mutate(pg_intensity_log2 = log2(pg_quantity)) %>% + assign_missingness( + sample = r_file_name, + condition = r_condition, + intensity = pg_intensity_log2, + grouping = pg_protein_accessions, + ref_condition = "control") %>% + calculate_diff_abundance(sample = r_file_name, + condition = r_condition, + grouping = pg_protein_accessions, + intensity_log2 = pg_intensity_log2, + comparison = comparison, + method = "t-test") + + corrected_satterthwaite = correct_lip_for_abundance( + lip_data = diff_lip, + trp_data = diff_trp, + protein_id = pg_protein_accessions, + grouping = eg_precursor_id, + retain_columns = c("missingness"), + method = "satterthwaite" + ) + + corrected_no_df_approximation = correct_lip_for_abundance( + lip_data = diff_lip, + trp_data = diff_trp, + protein_id = pg_protein_accessions, + grouping = eg_precursor_id, + retain_columns = c("missingness"), + method = "no_df_approximation" + ) + + expect_is(corrected_satterthwaite, "data.frame") + 
expect_equal(corrected_satterthwaite$adj_diff[1], 2.474938, tolerance=1e-3) + expect_equal(corrected_satterthwaite$adj_std_error[1], 0.1531189, tolerance=1e-3) + expect_equal(corrected_satterthwaite$adj_pval[1], 1.561211e-05, tolerance=1e-3) + expect_equal(corrected_satterthwaite$df[1], 10.13124, tolerance=1e-3) + + expect_is(corrected_no_df_approximation, "data.frame") + expect_equal(corrected_no_df_approximation$adj_diff[1], 2.474938, tolerance=1e-3) + expect_equal(corrected_no_df_approximation$adj_std_error[1], 0.1531189, tolerance=1e-3) + expect_equal(corrected_no_df_approximation$df[1], 6) +}) if (Sys.getenv("TEST_PROTTI") == "true") { test_that("deprecated diff_abundance works", { From 5f690f97c699e6ca58a659b67f49eb7e82c7b255 Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Tue, 19 Dec 2023 11:37:43 +0100 Subject: [PATCH 60/71] add correct lip for abundance to news --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7df9feea..7a760fc9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +## New features +* `correct_lip_for_abundance()` was added. If corrects LiP-peptides for changes in protein abundance and calculates their significance using a t-test. Big thanks to @FehrAaron! 
+ # protti 0.6.0.9000 ## New features From e346f3c6970d09b4ef49010f8ceebf7b06b6075f Mon Sep 17 00:00:00 2001 From: Elena Krismer <70535771+elena-krismer@users.noreply.github.com> Date: Tue, 19 Dec 2023 14:14:54 +0100 Subject: [PATCH 61/71] no addtional metadata --- tests/testthat/test-workflow.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-workflow.R b/tests/testthat/test-workflow.R index 28b34667..cab24ad4 100644 --- a/tests/testthat/test-workflow.R +++ b/tests/testthat/test-workflow.R @@ -7,7 +7,7 @@ data <- create_synthetic_data( n_replicates = 3, n_conditions = 2, method = "effect_random", - additional_metadata = TRUE + additional_metadata = FALSE ) data_drc <- create_synthetic_data( From b8b1da42f06b31c1bdaa5321a952b15045203346 Mon Sep 17 00:00:00 2001 From: jpquast Date: Mon, 29 Jan 2024 14:51:29 +0100 Subject: [PATCH 62/71] Add MSstatsLiP to function description and NEWS --- DESCRIPTION | 3 +- NEWS.md | 4 +- R/correct_lip_for_abundance.R | 6 +- man/correct_lip_for_abundance.Rd | 101 +++++++++++++++++-------------- 4 files changed, 62 insertions(+), 52 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3b714d7d..82d72bd3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,8 +42,7 @@ Imports: httr, methods, R.utils, - stats, - lme4 (>= 1.1-35) + stats RoxygenNote: 7.2.3 Suggests: testthat, diff --git a/NEWS.md b/NEWS.md index 7a760fc9..ba0cfe3c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,8 @@ -## New features -* `correct_lip_for_abundance()` was added. If corrects LiP-peptides for changes in protein abundance and calculates their significance using a t-test. Big thanks to @FehrAaron! - # protti 0.6.0.9000 ## New features +* `correct_lip_for_abundance()` was added. It corrects LiP-peptides for changes in protein abundance and calculates their significance using a t-test. 
The function is based on the [MSstatsLiP](https://www.bioconductor.org/packages/release/bioc/html/MSstatsLiP.html) package developed by the Vitek Lab. Big thanks to @FehrAaron for implementing it! * `qc_cvs()` received a new argument called `max_cv` that specifies the maximum CV that should be included in the plot. * `peptide_profile_plot()` received a new argument called `complete_sample`. If set to `TRUE`, each protein gets assigned all sample names that are found in the input data. This ensures that the plot always contains all samples on the x-axis even if there are no measured intensities for a specific sample. The default is `FALSE`, which is the original behaviour of the function. * `volcano_plot()` received the `colour` argument that allows the user to provide custom colours for points. diff --git a/R/correct_lip_for_abundance.R b/R/correct_lip_for_abundance.R index dca28bbb..338084bc 100644 --- a/R/correct_lip_for_abundance.R +++ b/R/correct_lip_for_abundance.R @@ -1,7 +1,9 @@ -#' Protein abundance correction +#' Protein abundance correction for LiP-data #' #' Performs the correction of LiP-peptides for changes in protein abundance and -#' calculates their significance using a t-test +#' calculates their significance using a t-test. This function was implemented based +#' on the [MSstatsLiP](https://www.bioconductor.org/packages/release/bioc/html/MSstatsLiP.html) +#' package developed by the Vitek lab. #' #' @param lip_data a data frame containing at least the input variables. Ideally, #' the result from the \code{calculate_diff_abundance} function is used. 
diff --git a/man/correct_lip_for_abundance.Rd b/man/correct_lip_for_abundance.Rd index 726d6b33..d619e3cf 100644 --- a/man/correct_lip_for_abundance.Rd +++ b/man/correct_lip_for_abundance.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/correct_lip_for_abundance.R \name{correct_lip_for_abundance} \alias{correct_lip_for_abundance} -\title{Protein abundance correction} +\title{Protein abundance correction for LiP-data} \usage{ correct_lip_for_abundance( lip_data, @@ -66,7 +66,9 @@ adjusted p-values (\code{adj_pval}) } \description{ Performs the correction of LiP-peptides for changes in protein abundance and -calculates their significance using a t-test +calculates their significance using a t-test. This function was implemented based +on the \href{https://www.bioconductor.org/packages/release/bioc/html/MSstatsLiP.html}{MSstatsLiP} +package developed by the Vitek lab. } \examples{ @@ -76,61 +78,70 @@ library(dplyr) # Load example data and simulate tryptic data by summing up precursors -data = rapamycin_10uM +data <- rapamycin_10uM + +data_trp <- data \%>\% + dplyr::group_by(pg_protein_accessions, r_file_name) \%>\% + dplyr::mutate(pg_quantity = sum(fg_quantity)) \%>\% + dplyr::distinct( + r_condition, + r_file_name, + pg_protein_accessions, + pg_quantity + ) -data_trp = data \%>\% -dplyr::group_by(pg_protein_accessions, r_file_name) \%>\% -dplyr::mutate(pg_quantity = sum(fg_quantity)) \%>\% -dplyr::distinct(r_condition, - r_file_name, - pg_protein_accessions, - pg_quantity) - # Calculate differential abundances for LiP and Trp data +diff_lip <- data \%>\% + dplyr::mutate(fg_intensity_log2 = log2(fg_quantity)) \%>\% + assign_missingness( + sample = r_file_name, + condition = r_condition, + intensity = fg_intensity_log2, + grouping = eg_precursor_id, + ref_condition = "control", + retain_columns = "pg_protein_accessions" + ) \%>\% + calculate_diff_abundance( + sample = r_file_name, + condition = r_condition, + grouping = eg_precursor_id, + intensity_log2 = 
fg_intensity_log2, + comparison = comparison, + method = "t-test", + retain_columns = "pg_protein_accessions" + ) + + +diff_trp <- data_trp \%>\% + dplyr::mutate(pg_intensity_log2 = log2(pg_quantity)) \%>\% + assign_missingness( + sample = r_file_name, + condition = r_condition, + intensity = pg_intensity_log2, + grouping = pg_protein_accessions, + ref_condition = "control" + ) \%>\% + calculate_diff_abundance( + sample = r_file_name, + condition = r_condition, + grouping = pg_protein_accessions, + intensity_log2 = pg_intensity_log2, + comparison = comparison, + method = "t-test" + ) -diff_lip = data \%>\% - mutate(fg_intensity_log2 = log2(fg_quantity)) \%>\% - assign_missingness(sample = r_file_name, - condition = r_condition, - intensity = fg_intensity_log2, - grouping = eg_precursor_id, - ref_condition = "control", - retain_columns = "pg_protein_accessions") \%>\% - calculate_diff_abundance(sample = r_file_name, - condition = r_condition, - grouping = eg_precursor_id, - intensity_log2 = fg_intensity_log2, - comparison = comparison, - method = "t-test", - retain_columns = "pg_protein_accessions") - - -diff_trp = data_trp \%>\% - mutate(pg_intensity_log2 = log2(pg_quantity)) \%>\% - assign_missingness(sample = r_file_name, - condition = r_condition, - intensity = pg_intensity_log2, - grouping = pg_protein_accessions, - ref_condition = "control") \%>\% - calculate_diff_abundance(sample = r_file_name, - condition = r_condition, - grouping = pg_protein_accessions, - intensity_log2 = pg_intensity_log2, - comparison = comparison, - method = "t-test") - # Correct for abundance changes -corrected = correct_lip_for_abundance( - +corrected <- correct_lip_for_abundance( lip_data = diff_lip, trp_data = diff_trp, protein_id = pg_protein_accessions, grouping = eg_precursor_id, retain_columns = c("missingness"), - method = "satterthwaite") + method = "satterthwaite" +) head(corrected, n = 10) } From ff07d62d49511c5b0b7c333b51f7195ae0a68178 Mon Sep 17 00:00:00 2001 From: jpquast 
Date: Mon, 29 Jan 2024 15:44:03 +0100 Subject: [PATCH 63/71] remove lme4 from R-CMD-check --- .github/workflows/R-CMD-check.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 1a74931e..61037db3 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -46,7 +46,6 @@ jobs: with: extra-packages: any::rcmdcheck - any::lme4 needs: check # run: | From aa9780d1bbd6729d7a0e0fb8b25720822c2e0d83 Mon Sep 17 00:00:00 2001 From: jpquast Date: Tue, 30 Jan 2024 10:49:56 +0100 Subject: [PATCH 64/71] Bump version and fix formatting and link --- DESCRIPTION | 2 +- NEWS.md | 2 +- R/correct_lip_for_abundance.R | 22 +++++++++---------- .../data_analysis_dose_response_workflow.Rmd | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9ccd07de..ab6d5659 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: protti Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools -Version: 0.6.0.9001 +Version: 0.7.0 Authors@R: c(person(given = "Jan-Philipp", family = "Quast", diff --git a/NEWS.md b/NEWS.md index ba0cfe3c..cb6a2871 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# protti 0.6.0.9000 +# protti 0.7.0 ## New features diff --git a/R/correct_lip_for_abundance.R b/R/correct_lip_for_abundance.R index 338084bc..2b1130c2 100644 --- a/R/correct_lip_for_abundance.R +++ b/R/correct_lip_for_abundance.R @@ -179,20 +179,20 @@ correct_lip_for_abundance <- function( adj_std_error = sqrt((!!se_pep)**2 + (!!se_prot)**2), numer = ((!!se_pep)**2 + (!!se_prot)**2)**2, denom = ((!!se_pep)**4 / (!!n_pep - 2) + (!!se_prot)**4 / (!!n_prot - 2)), - df = numer / denom, - tval = adj_diff / adj_std_error, - pval = 2 * stats::pt(abs(tval), df, lower.tail = FALSE) + df = .data$numer / .data$denom, + tval = .data$adj_diff / .data$adj_std_error, + pval = 2 * stats::pt(abs(.data$tval), .data$df, lower.tail = FALSE) ) 
adjusted_data <- dplyr::left_join( x = corrected_data, y = corrected_data %>% - dplyr::filter(is.na(pval) == FALSE) %>% + dplyr::filter(is.na(.data$pval) == FALSE) %>% dplyr::group_by({{ comparison }}) %>% - dplyr::mutate(adj_pval = p.adjust(pval, method = {{ p_adj_method }})), + dplyr::mutate(adj_pval = p.adjust(.data$pval, method = {{ p_adj_method }})), by = colnames(corrected_data) ) %>% - dplyr::select(-numer, -denom, -tval) + dplyr::select(-c("numer", "denom", "tval")) return(adjusted_data) } @@ -203,18 +203,18 @@ correct_lip_for_abundance <- function( adj_diff = !!diff_pep - !!diff_prot, adj_std_error = sqrt((!!se_pep)**2 + (!!se_prot)**2), df = !!n_pep - 2, - tval = adj_diff / adj_std_error, - pval = 2 * stats::pt(abs(tval), df, lower.tail = FALSE) + tval = .data$adj_diff / .data$adj_std_error, + pval = 2 * stats::pt(abs(.data$tval), .data$df, lower.tail = FALSE) ) adjusted_data <- dplyr::left_join( x = corrected_data, y = corrected_data %>% - dplyr::filter(is.na(pval) == FALSE) %>% - dplyr::mutate(adj_pval = p.adjust(pval, method = {{ p_adj_method }})), + dplyr::filter(is.na(.data$pval) == FALSE) %>% + dplyr::mutate(adj_pval = p.adjust(.data$pval, method = {{ p_adj_method }})), by = colnames(corrected_data) ) %>% - dplyr::select(-tval) + dplyr::select(-"tval") return(adjusted_data) diff --git a/vignettes/data_analysis_dose_response_workflow.Rmd b/vignettes/data_analysis_dose_response_workflow.Rmd index be538156..977ffb01 100644 --- a/vignettes/data_analysis_dose_response_workflow.Rmd +++ b/vignettes/data_analysis_dose_response_workflow.Rmd @@ -252,7 +252,7 @@ You can test your data set for gene ontology (GO) term enrichment (`calculate_go If you know which proteins bind or interact with your specific treatment, you can provide your own list of true positive hits and check if these are enriched in your significant hits by using **protti**'s `calculate_treatment_enrichment()` function. 
For our LiP-MS experiment using rapamycin we are probing direct interaction with proteins in contrast to functional effects. The only protein that rapamycin is known to bind to is FKBP12 and testing the significance of enrichment for a single or even a few proteins is not appropriate. However, testing for enrichment is especially useful if your treatment affects many proteins, since it can help you to reduce the complexity of your result. -The [STRING](https://academic.oup.com/nar/article/47/D1/D607/5198476) database provides a good resource for the analysis of protein interaction networks. It is often very useful to check for interactions within your significant hits. For LiP-MS data this sometimes explains why proteins that do not directly interact with your treatment are still significantly affected. With `analyse_functional_network()`, **protti** provides a useful wrapper around some [`STRINGdb`](http://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html) package functions. +The [STRING](https://academic.oup.com/nar/article/47/D1/D607/5198476) database provides a good resource for the analysis of protein interaction networks. It is often very useful to check for interactions within your significant hits. For LiP-MS data this sometimes explains why proteins that do not directly interact with your treatment are still significantly affected. With `analyse_functional_network()`, **protti** provides a useful wrapper around some [`STRINGdb`](https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html) package functions. 
## Annotation of data From a26a520f802460aa60183dc94b4ab08bea31f64e Mon Sep 17 00:00:00 2001 From: jpquast Date: Tue, 30 Jan 2024 11:06:34 +0100 Subject: [PATCH 65/71] Add revdep checks --- cran-comments.md | 11 ++++------ revdep/.gitignore | 9 ++++++++ revdep/README.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++ revdep/cran.md | 7 +++++++ revdep/failures.md | 1 + revdep/problems.md | 1 + 6 files changed, 74 insertions(+), 7 deletions(-) create mode 100644 revdep/.gitignore create mode 100644 revdep/README.md create mode 100644 revdep/cran.md create mode 100644 revdep/failures.md create mode 100644 revdep/problems.md diff --git a/cran-comments.md b/cran-comments.md index d28fb8cc..ae237163 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,14 +1,11 @@ ## Submission -* We fixed a bug that caused an example to fail if there -was a connectivity issue to a database. This should now be resolved. -* The bug caused an error to appear when \donttest examples were tested. -* This release therefore fixes the problem reported by Prof. Brian Ripley. +* We have fixed a few bugs and added a new function. 
## Test environments -* macOS-latest (on GitHub actions), R 4.2.2 -* windows-latest (on GitHub actions), R 4.2.2 -* ubuntu-20.04 (on GitHub actions), R 4.2.2 +* macOS-latest (on GitHub actions), R 4.3.2 +* windows-latest (on GitHub actions), R 4.3.2 +* ubuntu-20.04 (on GitHub actions), R 4.3.2 * ubuntu-20.04 (on GitHub actions), r-devel * windows-ix86+x86_64 (win-builder), r-devel * fedora-clang-devel (R-hub), r-devel diff --git a/revdep/.gitignore b/revdep/.gitignore new file mode 100644 index 00000000..347547e3 --- /dev/null +++ b/revdep/.gitignore @@ -0,0 +1,9 @@ +checks +library +checks.noindex +library.noindex +data.sqlite +*.html +download +lib +cloud.noindex \ No newline at end of file diff --git a/revdep/README.md b/revdep/README.md new file mode 100644 index 00000000..4b4ea506 --- /dev/null +++ b/revdep/README.md @@ -0,0 +1,52 @@ +# Platform + +|field |value | +|:--------|:------------------------------------------------------------------------------------------| +|version |R version 4.3.1 (2023-06-16) | +|os |macOS Sonoma 14.2.1 | +|system |aarch64, darwin20 | +|ui |RStudio | +|language |(EN) | +|collate |en_US.UTF-8 | +|ctype |en_US.UTF-8 | +|tz |Europe/Zurich | +|date |2024-01-30 | +|rstudio |2023.06.1+524 Mountain Hydrangea (desktop) | +|pandoc |3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown) | + +# Dependencies + +|package |old |new |Δ | +|:-----------|:-----|:-------|:--| +|protti |0.6.0 |0.7.0 |* | +|bslib |NA |0.6.1 |* | +|crosstalk |NA |1.2.1 |* | +|data.table |NA |1.14.10 |* | +|dplyr |NA |1.1.4 |* | +|fontawesome |NA |0.5.2 |* | +|ggplot2 |NA |3.4.4 |* | +|ggrepel |NA |0.9.5 |* | +|gtable |NA |0.3.4 |* | +|htmltools |NA |0.5.7 |* | +|htmlwidgets |NA |1.6.4 |* | +|labeling |NA |0.4.3 |* | +|later |NA |1.3.2 |* | +|lubridate |NA |1.9.3 |* | +|plotly |NA |4.10.4 |* | +|R.oo |NA |1.26.0 |* | +|R.utils |NA |2.12.3 |* | +|Rcpp |NA |1.0.12 |* | +|readr |NA |2.1.5 |* | +|rmarkdown |NA |2.25 |* | +|sass |NA 
|0.4.8 |* | +|scales |NA |1.3.0 |* | +|snakecase |NA |0.11.1 |* | +|stringi |NA |1.8.3 |* | +|stringr |NA |1.5.1 |* | +|tidyr |NA |1.3.1 |* | +|timechange |NA |0.3.0 |* | +|tinytex |NA |0.49 |* | +|vroom |NA |1.6.5 |* | + +# Revdeps + diff --git a/revdep/cran.md b/revdep/cran.md new file mode 100644 index 00000000..782ef684 --- /dev/null +++ b/revdep/cran.md @@ -0,0 +1,7 @@ +## revdepcheck results + +We checked 1 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. + + * We saw 0 new problems + * We failed to check 0 packages + diff --git a/revdep/failures.md b/revdep/failures.md new file mode 100644 index 00000000..9a207363 --- /dev/null +++ b/revdep/failures.md @@ -0,0 +1 @@ +*Wow, no problems at all. :)* \ No newline at end of file diff --git a/revdep/problems.md b/revdep/problems.md new file mode 100644 index 00000000..9a207363 --- /dev/null +++ b/revdep/problems.md @@ -0,0 +1 @@ +*Wow, no problems at all. :)* \ No newline at end of file From b351e6e3c0a9099079ecfe4439f71c185f3cd90b Mon Sep 17 00:00:00 2001 From: jpquast Date: Sun, 11 Feb 2024 13:07:43 +0100 Subject: [PATCH 66/71] Fix bug in qc_data_completeness Fixed facetting bug when sample was of type factor. --- NEWS.md | 1 + R/qc_data_completeness.R | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index cb6a2871..ee851afb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -21,6 +21,7 @@ *`fit_drc_4p()`: If there are no correlations an empty data frame is returned to prevent errors in `parallel_fit_drc_4p()`. * `calculate_sequence_coverage()` does not fail anymore if a protein only contains `NA` peptide sequences. * `qc_sequence_coverage()` does not return a plot anymore if `plot = FALSE`. This fixes issue #207. +* `qc_data_completeness()` if sample was of type `factor` the function did not properly facet the data when the `digestion` argument was provided. 
Now we filter out all 0% completeness values that come from factor levels that are not present in subsetted data. # protti 0.6.0 diff --git a/R/qc_data_completeness.R b/R/qc_data_completeness.R index f9e66e75..9a197cde 100644 --- a/R/qc_data_completeness.R +++ b/R/qc_data_completeness.R @@ -78,7 +78,8 @@ qc_data_completeness <- function(data, dplyr::group_by({{ sample }}) %>% dplyr::summarise(completeness = sum(!is.na({{ intensity }})) / dplyr::n() * 100, .groups = "drop") %>% dplyr::mutate({{ digestion }} := .y) - ) + ) %>% + filter(.data$completeness > 0) } else { result <- data %>% dplyr::distinct({{ sample }}, {{ grouping }}, {{ intensity }}) %>% From 04a9fea94a8be7196a3c9c8c0567465c5398dcc8 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 15 Feb 2024 00:29:18 +0100 Subject: [PATCH 67/71] Fix CITATION Note --- inst/CITATION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/CITATION b/inst/CITATION index 87e48efb..af11cd24 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -1,6 +1,6 @@ citHeader("To cite protti in publications, please use:") -citEntry(entry = "article", +bibentry(entry = "article", textVersion = "Quast, J.P., Schuster, D., Picotti, P. (2022). protti: an R package for comprehensive data analysis of peptide- and protein-centric bottom-up proteomics data. 
Bioinformatics Advances, 2(1).", author = "Jan-Philipp Quast, Dina Schuster, Paola Picotti", title = "protti: an R package for comprehensive data analysis of peptide- and protein-centric bottom-up proteomics data", From 99bb828b84d5b41c360bd8cad44e936987956534 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 15 Feb 2024 00:34:32 +0100 Subject: [PATCH 68/71] Add revdep to .Rbuildignore --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore b/.Rbuildignore index 8469443c..5e586dcf 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -14,3 +14,4 @@ ^cran-comments\.md$ ^CRAN-RELEASE$ ^CRAN-SUBMISSION$ +^revdep$ \ No newline at end of file From d8a0ddfc49df3cda2011b6260b9cee6ce28dd361 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 15 Feb 2024 01:00:55 +0100 Subject: [PATCH 69/71] Fix Itemize Note --- R/assign_missingness.R | 19 ++++----- R/calculate_diff_abundance.R | 40 ++++++++--------- R/extract_metal_binders.R | 60 +++++++++++++------------- R/fetch_alphafold_aligned_error.R | 16 +++---- R/fetch_alphafold_prediction.R | 45 ++++++++++--------- R/fetch_metal_pdb.R | 34 +++++++-------- R/fetch_pdb.R | 38 ++++++++--------- R/fetch_pdb_structure.R | 64 ++++++++++++++-------------- R/find_peptide_in_structure.R | 46 ++++++++++---------- R/fit_drc_4p.R | 14 +++--- R/predict_alphafold_domain.R | 10 ++--- man/assign_missingness.Rd | 17 ++++---- man/calculate_diff_abundance.Rd | 17 ++++---- man/diff_abundance.Rd | 16 +++---- man/extract_metal_binders.Rd | 57 +++++++++++++------------ man/fetch_alphafold_aligned_error.Rd | 12 +++--- man/fetch_alphafold_prediction.Rd | 42 +++++++++--------- man/fetch_metal_pdb.Rd | 30 ++++++------- man/fetch_pdb.Rd | 34 +++++++-------- man/fetch_pdb_structure.Rd | 60 +++++++++++++------------- man/find_peptide_in_structure.Rd | 42 +++++++++--------- man/fit_drc_4p.Rd | 6 +-- man/predict_alphafold_domain.Rd | 6 +-- 23 files changed, 363 insertions(+), 362 deletions(-) diff --git a/R/assign_missingness.R 
b/R/assign_missingness.R index 327eec19..2945eb0d 100644 --- a/R/assign_missingness.R +++ b/R/assign_missingness.R @@ -30,18 +30,17 @@ #' @return A data frame that contains the reference condition paired with each treatment condition. #' The \code{comparison} column contains the comparison name for the specific treatment/reference #' pair. The \code{missingness} column reports the type of missingness. -#' \itemize{ -#' \item{"complete": }{No missing values for every replicate of this reference/treatment pair for -#' the specific grouping variable.} -#' \item{"MNAR": }{Missing not at random. All replicates of either the reference or treatment -#' condition have missing values for the specific grouping variable.} -#' \item{"MAR": }{Missing at random. At least n-1 replicates have missing values for the -#' reference/treatment pair for the specific grouping varible.} -#' \item{NA: }{The comparison is not complete enough to fall into any other category. It will not +#' * "complete": No missing values for every replicate of this reference/treatment pair for +#' the specific grouping variable. +#' * "MNAR": Missing not at random. All replicates of either the reference or treatment +#' condition have missing values for the specific grouping variable. +#' * "MAR": Missing at random. At least n-1 replicates have missing values for the +#' reference/treatment pair for the specific grouping variable. +#' * NA: The comparison is not complete enough to fall into any other category. It will not #' be imputed if imputation is performed. For statistical significance testing these comparisons #' are filtered out after the test and prior to p-value adjustment. This can be prevented by setting -#' `filter_NA_missingness = FALSE` in the `calculate_diff_abundance()` function.} -#' } +#' `filter_NA_missingness = FALSE` in the `calculate_diff_abundance()` function. 
+#' #' The type of missingness has an influence on the way values are imputeted if imputation is #' performed subsequently using the `impute()` function. How each type of missingness is #' specifically imputed can be found in the function description. The type of missingness diff --git a/R/calculate_diff_abundance.R b/R/calculate_diff_abundance.R index 1f3dd227..a329f222 100644 --- a/R/calculate_diff_abundance.R +++ b/R/calculate_diff_abundance.R @@ -7,19 +7,19 @@ #' and adjusted p-values (\code{adj_pval}) for each protein, peptide or precursor (depending on #' the \code{grouping} variable) and the associated treatment/reference pair. Depending on the #' method the data frame contains additional columns: -#' \itemize{ -#' \item{"t-test": }{The \code{std_error} column contains the standard error of the differential +#' +#' * "t-test": The \code{std_error} column contains the standard error of the differential #' abundances. \code{n_obs} contains the number of observations for the specific protein, peptide -#' or precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair.} -#' \item{"t-test_mean_sd": }{Columns labeled as control refer to the second condition of the +#' or precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair. +#' * "t-test_mean_sd": Columns labeled as control refer to the second condition of the #' comparison pairs. Treated refers to the first condition. \code{mean_control} and \code{mean_treated} #' columns contain the means for the reference and treatment condition, respectively. \code{sd_control} #' and \code{sd_treated} columns contain the standard deviations for the reference and treatment #' condition, respectively. \code{n_control} and \code{n_treated} columns contain the numbers of #' samples for the reference and treatment condition, respectively. The \code{std_error} column #' contains the standard error of the differential abundances. 
\code{t_statistic} contains the -#' t_statistic for the t-test.} -#' \item{"moderated_t-test": }{\code{CI_2.5} and \code{CI_97.5} contain the 2.5% and 97.5% +#' t_statistic for the t-test. +#' * "moderated_t-test": \code{CI_2.5} and \code{CI_97.5} contain the 2.5% and 97.5% #' confidence interval borders for differential abundances. \code{avg_abundance} contains average #' abundances for treatment/reference pairs (mean of the two group means). \code{t_statistic} #' contains the t_statistic for the t-test. \code{B} The B-statistic is the log-odds that the @@ -29,13 +29,13 @@ #' 4.48/(1+4.48)=0.82, i.e., the probability is about 82% that this group is differentially #' abundant. A B-statistic of zero corresponds to a 50-50 chance that the group is differentially #' abundant.\code{n_obs} contains the number of observations for the specific protein, peptide or -#' precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair.} -#' \item{"proDA": }{The \code{std_error} column contains the standard error of the differential +#' precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair. +#' * "proDA": The \code{std_error} column contains the standard error of the differential #' abundances. \code{avg_abundance} contains average abundances for treatment/reference pairs #' (mean of the two group means). \code{t_statistic} contains the t_statistic for the t-test. #' \code{n_obs} contains the number of observations for the specific protein, peptide or precursor -#' (depending on the \code{grouping} variable) and the associated treatment/reference pair.} -#' } +#' (depending on the \code{grouping} variable) and the associated treatment/reference pair. 
+#' #' @keywords internal #' @export diff_abundance <- @@ -117,19 +117,19 @@ diff_abundance <- #' and adjusted p-values (\code{adj_pval}) for each protein, peptide or precursor (depending on #' the \code{grouping} variable) and the associated treatment/reference pair. Depending on the #' method the data frame contains additional columns: -#' \itemize{ -#' \item{"t-test": }{The \code{std_error} column contains the standard error of the differential +#' +#' * "t-test": The \code{std_error} column contains the standard error of the differential #' abundances. \code{n_obs} contains the number of observations for the specific protein, peptide -#' or precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair.} -#' \item{"t-test_mean_sd": }{Columns labeled as control refer to the second condition of the +#' or precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair. +#' * "t-test_mean_sd": Columns labeled as control refer to the second condition of the #' comparison pairs. Treated refers to the first condition. \code{mean_control} and \code{mean_treated} #' columns contain the means for the reference and treatment condition, respectively. \code{sd_control} #' and \code{sd_treated} columns contain the standard deviations for the reference and treatment #' condition, respectively. \code{n_control} and \code{n_treated} columns contain the numbers of #' samples for the reference and treatment condition, respectively. The \code{std_error} column #' contains the standard error of the differential abundances. \code{t_statistic} contains the -#' t_statistic for the t-test.} -#' \item{"moderated_t-test": }{\code{CI_2.5} and \code{CI_97.5} contain the 2.5% and 97.5% +#' t_statistic for the t-test. +#' * "moderated_t-test": \code{CI_2.5} and \code{CI_97.5} contain the 2.5% and 97.5% #' confidence interval borders for differential abundances. 
\code{avg_abundance} contains average #' abundances for treatment/reference pairs (mean of the two group means). \code{t_statistic} #' contains the t_statistic for the t-test. \code{B} The B-statistic is the log-odds that the @@ -139,13 +139,13 @@ diff_abundance <- #' 4.48/(1+4.48)=0.82, i.e., the probability is about 82% that this group is differentially #' abundant. A B-statistic of zero corresponds to a 50-50 chance that the group is differentially #' abundant.\code{n_obs} contains the number of observations for the specific protein, peptide or -#' precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair.} -#' \item{"proDA": }{The \code{std_error} column contains the standard error of the differential +#' precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair. +#' * "proDA": The \code{std_error} column contains the standard error of the differential #' abundances. \code{avg_abundance} contains average abundances for treatment/reference pairs #' (mean of the two group means). \code{t_statistic} contains the t_statistic for the t-test. #' \code{n_obs} contains the number of observations for the specific protein, peptide or precursor -#' (depending on the \code{grouping} variable) and the associated treatment/reference pair.} -#' } +#' (depending on the \code{grouping} variable) and the associated treatment/reference pair. +#' #' For all methods execept \code{"proDA"}, the p-value adjustment is performed only on the #' proportion of data that contains a p-value that is not \code{NA}. For \code{"proDA"} the #' p-value adjustment is either performed on the complete dataset (\code{filter_NA_missingness = TRUE}) diff --git a/R/extract_metal_binders.R b/R/extract_metal_binders.R index dd8c1b7c..2665bab1 100644 --- a/R/extract_metal_binders.R +++ b/R/extract_metal_binders.R @@ -24,37 +24,37 @@ #' #' @return A data frame containing information on protein metal binding state. 
It contains the #' following columns: -#' \itemize{ -#' \item{\code{accession}: }{UniProt protein identifier.} -#' \item{\code{most_specific_id}: }{ChEBI ID that is most specific for the position after combining information from all sources. -#' Can be multiple IDs separated by "," if a position appears multiple times due to multiple fitting IDs.} -#' \item{\code{most_specific_id_name}: }{The name of the ID in the \code{most_specific_id} column. This information is based on -#' ChEBI.} -#' \item{\code{ligand_identifier}: }{A ligand identifier that is unique per ligand per protein. It consists of the ligand ID and -#' ligand name. The ligand ID counts the number of ligands of the same type per protein.} -#' \item{\code{ligand_position}: }{The amino acid position of the residue interacting with the ligand.} -#' \item{\code{binding_mode}: }{Contains information about the way the amino acid residue interacts with the ligand. If it is -#' "covalent" then the residue is not in contact with the metal directly but only the cofactor that binds the metal.} -#' \item{\code{metal_function}: }{Contains information about the function of the metal. E.g. "catalytic".} -#' \item{\code{metal_id_part}: }{Contains a ChEBI ID that identifiers the metal part of the ligand. This is always the metal atom.} -#' \item{\code{metal_id_part_name}: }{The name of the ID in the \code{metal_id_part} column. This information is based on -#' ChEBI.} -#' \item{\code{note}: }{Contains notes associated with information based on cofactors.} -#' \item{\code{chebi_id}: }{Contains the original ChEBI IDs the information is based on.} -#' \item{\code{source}: }{Contains the sources of the information. 
This can consist of "binding", "cofactor", "catalytic_activity" -#' and "go_term".} -#' \item{\code{eco}: }{If there is evidence the annotation is based on it is annotated with an ECO ID, which is split by source.} -#' \item{\code{eco_type}: }{The ECO identifier can fall into the "manual_assertion" group for manually curated annotations or the +#' +#' * \code{accession}: UniProt protein identifier. +#' * \code{most_specific_id}: ChEBI ID that is most specific for the position after combining information from all sources. +#' Can be multiple IDs separated by "," if a position appears multiple times due to multiple fitting IDs. +#' * \code{most_specific_id_name}: The name of the ID in the \code{most_specific_id} column. This information is based on +#' ChEBI. +#' * \code{ligand_identifier}: A ligand identifier that is unique per ligand per protein. It consists of the ligand ID and +#' ligand name. The ligand ID counts the number of ligands of the same type per protein. +#' * \code{ligand_position}: The amino acid position of the residue interacting with the ligand. +#' * \code{binding_mode}: Contains information about the way the amino acid residue interacts with the ligand. If it is +#' "covalent" then the residue is not in contact with the metal directly but only the cofactor that binds the metal. +#' * \code{metal_function}: Contains information about the function of the metal. E.g. "catalytic". +#' * \code{metal_id_part}: Contains a ChEBI ID that identifies the metal part of the ligand. This is always the metal atom. +#' * \code{metal_id_part_name}: The name of the ID in the \code{metal_id_part} column. This information is based on +#' ChEBI. +#' * \code{note}: Contains notes associated with information based on cofactors. +#' * \code{chebi_id}: Contains the original ChEBI IDs the information is based on. +#' * \code{source}: Contains the sources of the information. This can consist of "binding", "cofactor", "catalytic_activity" +#' and "go_term". 
+#' * \code{eco}: If there is evidence the annotation is based on it is annotated with an ECO ID, which is split by source. +#' * \code{eco_type}: The ECO identifier can fall into the "manual_assertion" group for manually curated annotations or the #' "automatic_assertion" group for automatically generated annotations. If there is no evidence it is annotated as -#' "automatic_assertion". The information is split by source.} -#' \item{\code{evidence_source}: }{The original sources (e.g. literature, PDB) of evidence annotations split by source.} -#' \item{\code{reaction}: }{Contains information about the chemical reaction catalysed by the protein that involves the metal. -#' Can contain the EC ID, Rhea ID, direction specific Rhea ID, direction of the reaction and evidence for the direction.} -#' \item{\code{go_term}: }{Contains gene ontology terms if there are any metal related ones associated with the annotation.} -#' \item{\code{go_name}: }{Contains gene ontology names if there are any metal related ones associated with the annotation.} -#' \item{\code{assigned_by}: }{Contains information about the source of the gene ontology term assignment.} -#' \item{\code{database}: }{Contains information about the source of the ChEBI annotation associated with gene ontology terms.} -#' } +#' "automatic_assertion". The information is split by source. +#' * \code{evidence_source}: The original sources (e.g. literature, PDB) of evidence annotations split by source. +#' * \code{reaction}: Contains information about the chemical reaction catalysed by the protein that involves the metal. +#' Can contain the EC ID, Rhea ID, direction specific Rhea ID, direction of the reaction and evidence for the direction. +#' * \code{go_term}: Contains gene ontology terms if there are any metal related ones associated with the annotation. +#' * \code{go_name}: Contains gene ontology names if there are any metal related ones associated with the annotation. 
+#' * \code{assigned_by}: Contains information about the source of the gene ontology term assignment. +#' * \code{database}: Contains information about the source of the ChEBI annotation associated with gene ontology terms. +#' #' For each protein identifier the data frame contains information on the bound ligand as well as on its position if it is known. #' Since information about metal ligands can come from multiple sources, additional information (e.g. evidence) is nested in the returned #' data frame. In order to unnest the relevant information the following steps have to be taken: It is diff --git a/R/fetch_alphafold_aligned_error.R b/R/fetch_alphafold_aligned_error.R index fa425d67..d370b77a 100644 --- a/R/fetch_alphafold_aligned_error.R +++ b/R/fetch_alphafold_aligned_error.R @@ -20,14 +20,14 @@ #' @return A list that contains aligned errors for AlphaFold predictions. If return_data_frame is #' TRUE, a data frame with this information is returned instead. The data frame contains the #' following columns: -#' \itemize{ -#' \item{scored_residue: }{The error for this position is calculated based on the alignment to the -#' aligned residue.} -#' \item{aligned_residue: }{The residue that is aligned for the calculation of the error of the scored -#' residue} -#' \item{error: }{The predicted aligned error computed by alpha fold.} -#' \item{accession: }{The UniProt protein identifier.} -#' } +#' +#' * scored_residue: The error for this position is calculated based on the alignment to the +#' aligned residue. +#' * aligned_residue: The residue that is aligned for the calculation of the error of the scored +#' residue +#' * error: The predicted aligned error computed by alpha fold. +#' * accession: The UniProt protein identifier. 
+#' #' #' @import dplyr #' @import progress diff --git a/R/fetch_alphafold_prediction.R b/R/fetch_alphafold_prediction.R index 6d0a5e49..7b435258 100644 --- a/R/fetch_alphafold_prediction.R +++ b/R/fetch_alphafold_prediction.R @@ -25,29 +25,28 @@ #' @return A list that contains atom level data for AlphaFold predictions. If return_data_frame is #' TRUE, a data frame with this information is returned instead. The data frame contains the #' following columns: -#' \itemize{ -#' \item{label_id: }{Uniquely identifies every atom in the prediction following the standardised -#' convention for mmCIF files.} -#' \item{type_symbol: }{The code used to identify the atom species representing this atom type. -#' This code is the element symbol.} -#' \item{label_atom_id: }{Uniquely identifies every atom for the given residue following the -#' standardised convention for mmCIF files.} -#' \item{label_comp_id: }{A chemical identifier for the residue. This is the three- letter code -#' for the amino acid.} -#' \item{label_asym_id: }{Chain identifier following the standardised convention for mmCIF files. -#' Since every prediction only contains one protein this is always "A".} -#' \item{label_seq_id: }{Uniquely and sequentially identifies residues for each protein. The -#' numbering corresponds to the UniProt amino acid positions.} -#' \item{x: }{The x coordinate of the atom.} -#' \item{y: }{The y coordinate of the atom.} -#' \item{z: }{The z coordinate of the atom.} -#' \item{prediction_score: }{Contains the prediction score for each residue.} -#' \item{auth_seq_id: }{Same as \code{label_seq_id}. But of type character.} -#' \item{auth_comp_id: }{Same as \code{label_comp_id}.} -#' \item{auth_asym_id: }{Same as \code{label_asym_id}.} -#' \item{uniprot_id: }{The UniProt identifier of the predicted protein.} -#' \item{score_quality: }{Score annotations.} -#' } +#' +#' * label_id: Uniquely identifies every atom in the prediction following the standardised +#' convention for mmCIF files. 
+#' * type_symbol: The code used to identify the atom species representing this atom type. +#' This code is the element symbol. +#' * label_atom_id: Uniquely identifies every atom for the given residue following the +#' standardised convention for mmCIF files. +#' * label_comp_id: A chemical identifier for the residue. This is the three- letter code +#' for the amino acid. +#' * label_asym_id: Chain identifier following the standardised convention for mmCIF files. +#' Since every prediction only contains one protein this is always "A". +#' * label_seq_id: Uniquely and sequentially identifies residues for each protein. The +#' numbering corresponds to the UniProt amino acid positions. +#' * x: The x coordinate of the atom. +#' * y: The y coordinate of the atom. +#' * z: The z coordinate of the atom. +#' * prediction_score: Contains the prediction score for each residue. +#' * auth_seq_id: Same as \code{label_seq_id}. But of type character. +#' * auth_comp_id: Same as \code{label_comp_id}. +#' * auth_asym_id: Same as \code{label_asym_id}. +#' * uniprot_id: The UniProt identifier of the predicted protein. +#' * score_quality: Score annotations. #' #' @import dplyr #' @import progress diff --git a/R/fetch_metal_pdb.R b/R/fetch_metal_pdb.R index 9a6edebc..bd4be211 100644 --- a/R/fetch_metal_pdb.R +++ b/R/fetch_metal_pdb.R @@ -42,32 +42,32 @@ #' #' @return A data frame that contains information about protein-metal binding sites. The data #' frame contains some columns that might not be self explanatory. -#' \itemize{ -#' \item{auth_id_metal: }{Unique structure atom identifier of the metal, which is provided by +#' +#' * auth_id_metal: Unique structure atom identifier of the metal, which is provided by #' the author of the structure in order to match the identification used in the publication -#' that describes the structure.} -#' \item{auth_seq_id_metal: }{Residue identifier of the metal, which is provided by the author of +#' that describes the structure. 
+#' * auth_seq_id_metal: Residue identifier of the metal, which is provided by the author of #' the structure in order to match the identification used in the publication that describes the -#' structure.} -#' \item{pattern: }{Metal pattern for each metal bound by the structure.} -#' \item{is_representative: }{A representative site is a site selected to represent a cluster of +#' structure. +#' * pattern: Metal pattern for each metal bound by the structure. +#' * is_representative: A representative site is a site selected to represent a cluster of #' equivalent sites. The selection is done by choosing the PDB structure with the best X-ray #' resolution among those containing the sites in the cluster. NMR structures are generally #' discarded in favor of X-ray structures, unless all the sites in the cluster are found in NMR -#' structures.} -#' \item{auth_asym_id_ligand: }{Chain identifier of the metal-coordinating ligand residues, which +#' structures. +#' * auth_asym_id_ligand: Chain identifier of the metal-coordinating ligand residues, which #' is provided by the author of the structure in order to match the identification used in the -#' publication that describes the structure.} -#' \item{auth_seq_id_ligand: }{Residue identifier of the metal-coordinating ligand residues, which +#' publication that describes the structure. +#' * auth_seq_id_ligand: Residue identifier of the metal-coordinating ligand residues, which #' is provided by the author of the structure in order to match the identification used in the -#' publication that describes the structure.} -#' \item{auth_id_ligand: }{Unique structure atom identifier of the metal-coordinating ligand r +#' publication that describes the structure. 
+#' * auth_id_ligand: Unique structure atom identifier of the metal-coordinating ligand r #' esidues, which is provided by the author of the structure in order to match the identification -#' used in the publication that describes the structure.} -#' \item{auth_atom_id_ligand: }{Unique residue specific atom identifier of the metal-coordinating +#' used in the publication that describes the structure. +#' * auth_atom_id_ligand: Unique residue specific atom identifier of the metal-coordinating #' ligand residues, which is provided by the author of the structure in order to match the -#' identification used in the publication that describes the structure.} -#' } +#' identification used in the publication that describes the structure. +#' #' @import dplyr #' @import progress #' @import purrr diff --git a/R/fetch_pdb.R b/R/fetch_pdb.R index 36662f35..005593c1 100644 --- a/R/fetch_pdb.R +++ b/R/fetch_pdb.R @@ -11,36 +11,36 @@ #' #' @return A data frame that contains structure metadata for the PDB IDs provided. The data frame #' contains some columns that might not be self explanatory. -#' \itemize{ -#' \item{auth_asym_id: }{Chain identifier provided by the author of the structure in order to -#' match the identification used in the publication that describes the structure.} -#' \item{label_asym_id: }{Chain identifier following the standardised convention for mmCIF files.} -#' \item{entity_beg_seq_id, ref_beg_seq_id, length, pdb_sequence: }{\code{entity_beg_seq_id} is a +#' +#' * auth_asym_id: Chain identifier provided by the author of the structure in order to +#' match the identification used in the publication that describes the structure. +#' * label_asym_id: Chain identifier following the standardised convention for mmCIF files. 
+#' * entity_beg_seq_id, ref_beg_seq_id, length, pdb_sequence: \code{entity_beg_seq_id} is a #' position in the structure sequence (\code{pdb_sequence}) that matches the position given in #' \code{ref_beg_seq_id}, which is a position within the protein sequence (not included in the #' data frame). \code{length} identifies the stretch of sequence for which positions match #' accordingly between structure and protein sequence. \code{entity_beg_seq_id} is a residue ID -#' based on the standardised convention for mmCIF files.} -#' \item{auth_seq_id: }{Residue identifier provided by the author of the structure in order to +#' based on the standardised convention for mmCIF files. +#' * auth_seq_id: Residue identifier provided by the author of the structure in order to #' match the identification used in the publication that describes the structure. This character #' vector has the same length as the \code{pdb_sequence} and each position is the identifier for #' the matching amino acid position in \code{pdb_sequence}. The contained values are not -#' necessarily numbers and the values do not have to be positive.} -#' \item{modified_monomer: }{Is composed of first the composition ID of the modification, followed +#' necessarily numbers and the values do not have to be positive. +#' * modified_monomer: Is composed of first the composition ID of the modification, followed #' by the \code{label_seq_id} position. In parenthesis are the parent monomer identifiers as -#' they appear in the sequence.} -#' \item{ligand_*: }{Any column starting with the \code{ligand_*} prefix contains information about +#' they appear in the sequence. +#' * ligand_*: Any column starting with the \code{ligand_*} prefix contains information about #' the position, identity and donors for ligand binding sites. If there are multiple entities of -#' ligands they are separated by "|". 
Specific donor level information is separated by ";".} -#' \item{secondar_structure: }{Contains information about helix and sheet secondary structure elements. -#' Individual regions are separated by ";".} -#' \item{unmodeled_structure: }{Contains information about unmodeled or partially modeled regions in -#' the model. Individual regions are separated by ";".} -#' \item{auth_seq_id_original: }{In some cases the sequence positions do not match the number of residues +#' ligands they are separated by "|". Specific donor level information is separated by ";". +#' * secondar_structure: Contains information about helix and sheet secondary structure elements. +#' Individual regions are separated by ";". +#' * unmodeled_structure: Contains information about unmodeled or partially modeled regions in +#' the model. Individual regions are separated by ";". +#' * auth_seq_id_original: In some cases the sequence positions do not match the number of residues #' in the sequence either because positions are missing or duplicated. This always coincides with modified #' residues, however does not always occur when there is a modified residue in the sequence. This column -#' contains the original \code{auth_seq_id} information that does not have these positions corrected.} -#' } +#' contains the original \code{auth_seq_id} information that does not have these positions corrected. +#' #' @import dplyr #' @import progress #' @import purrr diff --git a/R/fetch_pdb_structure.R b/R/fetch_pdb_structure.R index 5d527eae..caca4a3a 100644 --- a/R/fetch_pdb_structure.R +++ b/R/fetch_pdb_structure.R @@ -14,49 +14,49 @@ #' @return A list that contains atom data for each PDB structures provided. If return_data_frame is #' TRUE, a data frame with this information is returned instead. 
The data frame contains the #' following columns: -#' \itemize{ -#' \item{label_id: }{Uniquely identifies every atom in the structure following the standardised +#' +#' * label_id: Uniquely identifies every atom in the structure following the standardised #' convention for mmCIF files. Example value: "5", "C12", "Ca3g28", "Fe3+17", "H*251", "boron2a", -#' "C a phe 83 a 0", "Zn Zn 301 A 0"} -#' \item{type_symbol: }{The code used to identify the atom species representing this atom type. +#' "C a phe 83 a 0", "Zn Zn 301 A 0" +#' * type_symbol: The code used to identify the atom species representing this atom type. #' Normally this code is the element symbol. The code may be composed of any character except an #' underscore with the additional proviso that digits designate an oxidation state and must be -#' followed by a + or - character. Example values: "C", "Cu2+", "H(SDS)", "dummy", "FeNi".} -#' \item{label_atom_id: }{Uniquely identifies every atom for the given residue following the -#' standardised convention for mmCIF files. Example values: "CA", "HB1", "CB", "N"} -#' \item{label_comp_id: }{A chemical identifier for the residue. For protein polymer entities, +#' followed by a + or - character. Example values: "C", "Cu2+", "H(SDS)", "dummy", "FeNi". +#' * label_atom_id: Uniquely identifies every atom for the given residue following the +#' standardised convention for mmCIF files. Example values: "CA", "HB1", "CB", "N" +#' * label_comp_id: A chemical identifier for the residue. For protein polymer entities, #' this is the three- letter code for the amino acid. For nucleic acid polymer entities, this is -#' the one-letter code for the base. Example values: "ala", "val", "A", "C".} -#' \item{label_asym_id: }{Chain identifier following the standardised convention for mmCIF files. -#' Example values: "1", "A", "2B3".} -#' \item{entity_id: }{Records details about the molecular entities that are present in the +#' the one-letter code for the base. 
Example values: "ala", "val", "A", "C". +#' * label_asym_id: Chain identifier following the standardised convention for mmCIF files. +#' Example values: "1", "A", "2B3". +#' * entity_id: Records details about the molecular entities that are present in the #' crystallographic structure. Usually all different types of molecular entities such as polymer #' entities, non-polymer entities or water molecules are numbered once for each structure. Each #' type of non-polymer entity has its own number. Thus, the highest number in this column -#' represents the number of different molecule types in the structure.} -#' \item{label_seq_id: }{Uniquely and sequentially identifies residues for each \code{label_asym_id}. -#' This is always a number and the sequence of numbers always progresses in increasing numerical order.} -#' \item{x: }{The x coordinate of the atom.} -#' \item{y: }{The y coordinate of the atom.} -#' \item{z: }{The z coordinate of the atom.} -#' \item{site_occupancy: }{The fraction of the atom type present at this site.} -#' \item{b_iso_or_equivalent: }{Contains the B-factor or isotopic atomic displacement factor for -#' each atom.} -#' \item{formal_charge: }{The net integer charge assigned to this atom. This is the formal charge +#' represents the number of different molecule types in the structure. +#' * label_seq_id: Uniquely and sequentially identifies residues for each \code{label_asym_id}. +#' This is always a number and the sequence of numbers always progresses in increasing numerical order. +#' * x: The x coordinate of the atom. +#' * y: The y coordinate of the atom. +#' * z: The z coordinate of the atom. +#' * site_occupancy: The fraction of the atom type present at this site. +#' * b_iso_or_equivalent: Contains the B-factor or isotopic atomic displacement factor for +#' each atom. +#' * formal_charge: The net integer charge assigned to this atom. This is the formal charge #' assignment normally found in chemical diagrams. 
It is currently only assigned in a small subset -#' of structures.} -#' \item{auth_seq_id: }{An alternative residue identifier (\code{label_seq_id}) provided by the +#' of structures. +#' * auth_seq_id: An alternative residue identifier (\code{label_seq_id}) provided by the #' author of the structure in order to match the identification used in the publication that -#' describes the structure. This does not need to be numeric and is therefore of type character.} -#' \item{auth_comp_id: }{An alternative chemical identifier (\code{label_comp_id}) provided by the +#' describes the structure. This does not need to be numeric and is therefore of type character. +#' * auth_comp_id: An alternative chemical identifier (\code{label_comp_id}) provided by the #' author of the structure in order to match the identification used in the publication that -#' describes the structure.} -#' \item{auth_asym_id: }{An alternative chain identifier (\code{label_asym_id}) provided by the +#' describes the structure. +#' * auth_asym_id: An alternative chain identifier (\code{label_asym_id}) provided by the #' author of the structure in order to match the identification used in the publication that -#' describes the structure.} -#' \item{pdb_model_number: }{The PDB model number.} -#' \item{pdb_id: }{The protein database identifier for the structure.} -#' } +#' describes the structure. +#' * pdb_model_number: The PDB model number. +#' * pdb_id: The protein database identifier for the structure. +#' #' #' @import dplyr #' @import progress diff --git a/R/find_peptide_in_structure.R b/R/find_peptide_in_structure.R index 18a74b48..7b3af2c4 100644 --- a/R/find_peptide_in_structure.R +++ b/R/find_peptide_in_structure.R @@ -29,36 +29,36 @@ #' peptide is not found in any structure or no structure is associated with the protein, the data #' frame contains NAs values for the output columns. 
The data frame contains the following and #' additional columns: -#' \itemize{ -#' \item{auth_asym_id: }{Chain identifier provided by the author of the structure in order to -#' match the identification used in the publication that describes the structure.} -#' \item{label_asym_id: }{Chain identifier following the standardised convention for mmCIF files.} -#' \item{peptide_seq_in_pdb: }{The sequence of the peptide mapped to the structure. If the +#' +#' * auth_asym_id: Chain identifier provided by the author of the structure in order to +#' match the identification used in the publication that describes the structure. +#' * label_asym_id: Chain identifier following the standardised convention for mmCIF files. +#' * peptide_seq_in_pdb: The sequence of the peptide mapped to the structure. If the #' peptide only maps partially, then only the part of the sequence that maps on the structure is -#' returned.} -#' \item{fit_type: }{The fit type is either "partial" or "fully" and it indicates if the complete -#' peptide or only part of it was found in the structure.} -#' \item{label_seq_id_start: }{Contains the first residue position of the peptide in the structure -#' following the standardised convention for mmCIF files.} -#' \item{label_seq_id_end: }{Contains the last residue position of the peptide in the structure -#' following the standardised convention for mmCIF files.} -#' \item{auth_seq_id_start: }{Contains the first residue position of the peptide in the structure +#' returned. +#' * fit_type: The fit type is either "partial" or "fully" and it indicates if the complete +#' peptide or only part of it was found in the structure. +#' * label_seq_id_start: Contains the first residue position of the peptide in the structure +#' following the standardised convention for mmCIF files. +#' * label_seq_id_end: Contains the last residue position of the peptide in the structure +#' following the standardised convention for mmCIF files. 
+#' * auth_seq_id_start: Contains the first residue position of the peptide in the structure #' based on the alternative residue identifier provided by the author of the structure in order #' to match the identification used in the publication that describes the structure. This does -#' not need to be numeric and is therefore of type character.} -#' \item{auth_seq_id_end: }{Contains the last residue position of the peptide in the structure +#' not need to be numeric and is therefore of type character. +#' * auth_seq_id_end: Contains the last residue position of the peptide in the structure #' based on the alternative residue identifier provided by the author of the structure in order #' to match the identification used in the publication that describes the structure. This does -#' not need to be numeric and is therefore of type character.} -#' \item{auth_seq_id: }{Contains all positions (separated by ";") of the peptide in the structure +#' not need to be numeric and is therefore of type character. +#' * auth_seq_id: Contains all positions (separated by ";") of the peptide in the structure #' based on the alternative residue identifier provided by the author of the structure in order #' to match the identification used in the publication that describes the structure. This does -#' not need to be numeric and is therefore of type character.} -#' \item{n_peptides: }{The number of peptides from one protein that were searched for within the -#' current structure.} -#' \item{n_peptides_in_structure: }{The number of peptides from one protein that were found within -#' the current structure.} -#' } +#' not need to be numeric and is therefore of type character. +#' * n_peptides: The number of peptides from one protein that were searched for within the +#' current structure. +#' * n_peptides_in_structure: The number of peptides from one protein that were found within +#' the current structure. 
+#' #' @import dplyr #' @import tidyr #' @importFrom stringr str_sub str_split diff --git a/R/fit_drc_4p.R b/R/fit_drc_4p.R index f05bd3e0..88796a60 100644 --- a/R/fit_drc_4p.R +++ b/R/fit_drc_4p.R @@ -84,15 +84,15 @@ #' columns can be retained by providing their names (not in quotations marks, just like other #' column names, but in a vector). #' -#' @return If \code{include_models = FALSE} a data frame is returned that contains correlations +#' @return If `include_models = FALSE` a data frame is returned that contains correlations #' of predicted to measured values as a measure of the goodness of the curve fit, an associated #' p-value and the four parameters of the model for each group. Furthermore, input data for plots -#' is returned in the columns \code{plot_curve} (curve and confidence interval) and \code{plot_points} -#' (measured points). If \ code{include_models = TURE}, a list is returned that contains: -#' \itemize{ -#' \item{\code{fit_objects}: }{The fit objects of type \code{drc} for each group.} -#' \item{\code{correlations}: }{The correlation data frame described above} -#' } +#' is returned in the columns `plot_curve` (curve and confidence interval) and `plot_points` +#' (measured points). If `include_models = TRUE`, a list is returned that contains: +#' +#' * `fit_objects`: The fit objects of type `drc` for each group. +#' * `correlations`: The correlation data frame described above +#' #' @import dplyr #' @import tidyr #' @import progress diff --git a/R/predict_alphafold_domain.R b/R/predict_alphafold_domain.R index c7ae0154..94b58d40 100644 --- a/R/predict_alphafold_domain.R +++ b/R/predict_alphafold_domain.R @@ -29,11 +29,11 @@ #' @return A list of the provided proteins that contains domain assignments for each residue. If `return_data_frame` is #' `TRUE`, a data frame with this information is returned instead. 
The data frame contains the #' following columns: -#' \itemize{ -#' \item{residue: }{The protein residue number.} -#' \item{domain: }{A numeric value representing a distinct predicted domain in the protein.} -#' \item{accession: }{The UniProt protein identifier.} -#' } +#' +#' * residue: The protein residue number. +#' * domain: A numeric value representing a distinct predicted domain in the protein. +#' * accession: The UniProt protein identifier. +#' #' #' @import dplyr #' @import progress diff --git a/man/assign_missingness.Rd b/man/assign_missingness.Rd index a5f87d5b..5f4b8bc8 100644 --- a/man/assign_missingness.Rd +++ b/man/assign_missingness.Rd @@ -54,17 +54,18 @@ A data frame that contains the reference condition paired with each treatment co The \code{comparison} column contains the comparison name for the specific treatment/reference pair. The \code{missingness} column reports the type of missingness. \itemize{ -\item{"complete": }{No missing values for every replicate of this reference/treatment pair for -the specific grouping variable.} -\item{"MNAR": }{Missing not at random. All replicates of either the reference or treatment -condition have missing values for the specific grouping variable.} -\item{"MAR": }{Missing at random. At least n-1 replicates have missing values for the -reference/treatment pair for the specific grouping varible.} -\item{NA: }{The comparison is not complete enough to fall into any other category. It will not +\item "complete": No missing values for every replicate of this reference/treatment pair for +the specific grouping variable. +\item "MNAR": Missing not at random. All replicates of either the reference or treatment +condition have missing values for the specific grouping variable. +\item "MAR": Missing at random. At least n-1 replicates have missing values for the +reference/treatment pair for the specific grouping varible. +\item NA: The comparison is not complete enough to fall into any other category. 
It will not be imputed if imputation is performed. For statistical significance testing these comparisons are filtered out after the test and prior to p-value adjustment. This can be prevented by setting -\code{filter_NA_missingness = FALSE} in the \code{calculate_diff_abundance()} function.} +\code{filter_NA_missingness = FALSE} in the \code{calculate_diff_abundance()} function. } + The type of missingness has an influence on the way values are imputeted if imputation is performed subsequently using the \code{impute()} function. How each type of missingness is specifically imputed can be found in the function description. The type of missingness diff --git a/man/calculate_diff_abundance.Rd b/man/calculate_diff_abundance.Rd index a0948026..019d34e1 100644 --- a/man/calculate_diff_abundance.Rd +++ b/man/calculate_diff_abundance.Rd @@ -102,18 +102,18 @@ and adjusted p-values (\code{adj_pval}) for each protein, peptide or precursor ( the \code{grouping} variable) and the associated treatment/reference pair. Depending on the method the data frame contains additional columns: \itemize{ -\item{"t-test": }{The \code{std_error} column contains the standard error of the differential +\item "t-test": The \code{std_error} column contains the standard error of the differential abundances. \code{n_obs} contains the number of observations for the specific protein, peptide -or precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair.} -\item{"t-test_mean_sd": }{Columns labeled as control refer to the second condition of the +or precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair. +\item "t-test_mean_sd": Columns labeled as control refer to the second condition of the comparison pairs. Treated refers to the first condition. \code{mean_control} and \code{mean_treated} columns contain the means for the reference and treatment condition, respectively. 
\code{sd_control} and \code{sd_treated} columns contain the standard deviations for the reference and treatment condition, respectively. \code{n_control} and \code{n_treated} columns contain the numbers of samples for the reference and treatment condition, respectively. The \code{std_error} column contains the standard error of the differential abundances. \code{t_statistic} contains the -t_statistic for the t-test.} -\item{"moderated_t-test": }{\code{CI_2.5} and \code{CI_97.5} contain the 2.5\% and 97.5\% +t_statistic for the t-test. +\item "moderated_t-test": \code{CI_2.5} and \code{CI_97.5} contain the 2.5\% and 97.5\% confidence interval borders for differential abundances. \code{avg_abundance} contains average abundances for treatment/reference pairs (mean of the two group means). \code{t_statistic} contains the t_statistic for the t-test. \code{B} The B-statistic is the log-odds that the @@ -123,13 +123,14 @@ about four and a half to one. The probability that there is a differential abund 4.48/(1+4.48)=0.82, i.e., the probability is about 82\% that this group is differentially abundant. A B-statistic of zero corresponds to a 50-50 chance that the group is differentially abundant.\code{n_obs} contains the number of observations for the specific protein, peptide or -precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair.} -\item{"proDA": }{The \code{std_error} column contains the standard error of the differential +precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair. +\item "proDA": The \code{std_error} column contains the standard error of the differential abundances. \code{avg_abundance} contains average abundances for treatment/reference pairs (mean of the two group means). \code{t_statistic} contains the t_statistic for the t-test. 
\code{n_obs} contains the number of observations for the specific protein, peptide or precursor -(depending on the \code{grouping} variable) and the associated treatment/reference pair.} +(depending on the \code{grouping} variable) and the associated treatment/reference pair. } + For all methods execept \code{"proDA"}, the p-value adjustment is performed only on the proportion of data that contains a p-value that is not \code{NA}. For \code{"proDA"} the p-value adjustment is either performed on the complete dataset (\code{filter_NA_missingness = TRUE}) diff --git a/man/diff_abundance.Rd b/man/diff_abundance.Rd index 7438a8a6..f4aa9dbe 100644 --- a/man/diff_abundance.Rd +++ b/man/diff_abundance.Rd @@ -12,18 +12,18 @@ and adjusted p-values (\code{adj_pval}) for each protein, peptide or precursor ( the \code{grouping} variable) and the associated treatment/reference pair. Depending on the method the data frame contains additional columns: \itemize{ -\item{"t-test": }{The \code{std_error} column contains the standard error of the differential +\item "t-test": The \code{std_error} column contains the standard error of the differential abundances. \code{n_obs} contains the number of observations for the specific protein, peptide -or precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair.} -\item{"t-test_mean_sd": }{Columns labeled as control refer to the second condition of the +or precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair. +\item "t-test_mean_sd": Columns labeled as control refer to the second condition of the comparison pairs. Treated refers to the first condition. \code{mean_control} and \code{mean_treated} columns contain the means for the reference and treatment condition, respectively. \code{sd_control} and \code{sd_treated} columns contain the standard deviations for the reference and treatment condition, respectively. 
\code{n_control} and \code{n_treated} columns contain the numbers of samples for the reference and treatment condition, respectively. The \code{std_error} column contains the standard error of the differential abundances. \code{t_statistic} contains the -t_statistic for the t-test.} -\item{"moderated_t-test": }{\code{CI_2.5} and \code{CI_97.5} contain the 2.5\% and 97.5\% +t_statistic for the t-test. +\item "moderated_t-test": \code{CI_2.5} and \code{CI_97.5} contain the 2.5\% and 97.5\% confidence interval borders for differential abundances. \code{avg_abundance} contains average abundances for treatment/reference pairs (mean of the two group means). \code{t_statistic} contains the t_statistic for the t-test. \code{B} The B-statistic is the log-odds that the @@ -33,12 +33,12 @@ about four and a half to one. The probability that there is a differential abund 4.48/(1+4.48)=0.82, i.e., the probability is about 82\% that this group is differentially abundant. A B-statistic of zero corresponds to a 50-50 chance that the group is differentially abundant.\code{n_obs} contains the number of observations for the specific protein, peptide or -precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair.} -\item{"proDA": }{The \code{std_error} column contains the standard error of the differential +precursor (depending on the \code{grouping} variable) and the associated treatment/reference pair. +\item "proDA": The \code{std_error} column contains the standard error of the differential abundances. \code{avg_abundance} contains average abundances for treatment/reference pairs (mean of the two group means). \code{t_statistic} contains the t_statistic for the t-test. \code{n_obs} contains the number of observations for the specific protein, peptide or precursor -(depending on the \code{grouping} variable) and the associated treatment/reference pair.} +(depending on the \code{grouping} variable) and the associated treatment/reference pair. 
} } \description{ diff --git a/man/extract_metal_binders.Rd b/man/extract_metal_binders.Rd index 23e79995..3ff4f341 100644 --- a/man/extract_metal_binders.Rd +++ b/man/extract_metal_binders.Rd @@ -44,36 +44,37 @@ the data frame to save time.} A data frame containing information on protein metal binding state. It contains the following columns: \itemize{ -\item{\code{accession}: }{UniProt protein identifier.} -\item{\code{most_specific_id}: }{ChEBI ID that is most specific for the position after combining information from all sources. -Can be multiple IDs separated by "," if a position appears multiple times due to multiple fitting IDs.} -\item{\code{most_specific_id_name}: }{The name of the ID in the \code{most_specific_id} column. This information is based on -ChEBI.} -\item{\code{ligand_identifier}: }{A ligand identifier that is unique per ligand per protein. It consists of the ligand ID and -ligand name. The ligand ID counts the number of ligands of the same type per protein.} -\item{\code{ligand_position}: }{The amino acid position of the residue interacting with the ligand.} -\item{\code{binding_mode}: }{Contains information about the way the amino acid residue interacts with the ligand. If it is -"covalent" then the residue is not in contact with the metal directly but only the cofactor that binds the metal.} -\item{\code{metal_function}: }{Contains information about the function of the metal. E.g. "catalytic".} -\item{\code{metal_id_part}: }{Contains a ChEBI ID that identifiers the metal part of the ligand. This is always the metal atom.} -\item{\code{metal_id_part_name}: }{The name of the ID in the \code{metal_id_part} column. This information is based on -ChEBI.} -\item{\code{note}: }{Contains notes associated with information based on cofactors.} -\item{\code{chebi_id}: }{Contains the original ChEBI IDs the information is based on.} -\item{\code{source}: }{Contains the sources of the information. 
This can consist of "binding", "cofactor", "catalytic_activity" -and "go_term".} -\item{\code{eco}: }{If there is evidence the annotation is based on it is annotated with an ECO ID, which is split by source.} -\item{\code{eco_type}: }{The ECO identifier can fall into the "manual_assertion" group for manually curated annotations or the +\item \code{accession}: UniProt protein identifier. +\item \code{most_specific_id}: ChEBI ID that is most specific for the position after combining information from all sources. +Can be multiple IDs separated by "," if a position appears multiple times due to multiple fitting IDs. +\item \code{most_specific_id_name}: The name of the ID in the \code{most_specific_id} column. This information is based on +ChEBI. +\item \code{ligand_identifier}: A ligand identifier that is unique per ligand per protein. It consists of the ligand ID and +ligand name. The ligand ID counts the number of ligands of the same type per protein. +\item \code{ligand_position}: The amino acid position of the residue interacting with the ligand. +\item \code{binding_mode}: Contains information about the way the amino acid residue interacts with the ligand. If it is +"covalent" then the residue is not in contact with the metal directly but only the cofactor that binds the metal. +\item \code{metal_function}: Contains information about the function of the metal. E.g. "catalytic". +\item \code{metal_id_part}: Contains a ChEBI ID that identifies the metal part of the ligand. This is always the metal atom. +\item \code{metal_id_part_name}: The name of the ID in the \code{metal_id_part} column. This information is based on +ChEBI. +\item \code{note}: Contains notes associated with information based on cofactors. +\item \code{chebi_id}: Contains the original ChEBI IDs the information is based on. +\item \code{source}: Contains the sources of the information. This can consist of "binding", "cofactor", "catalytic_activity" +and "go_term".
+\item \code{eco}: If there is evidence the annotation is based on it is annotated with an ECO ID, which is split by source. +\item \code{eco_type}: The ECO identifier can fall into the "manual_assertion" group for manually curated annotations or the "automatic_assertion" group for automatically generated annotations. If there is no evidence it is annotated as -"automatic_assertion". The information is split by source.} -\item{\code{evidence_source}: }{The original sources (e.g. literature, PDB) of evidence annotations split by source.} -\item{\code{reaction}: }{Contains information about the chemical reaction catalysed by the protein that involves the metal. -Can contain the EC ID, Rhea ID, direction specific Rhea ID, direction of the reaction and evidence for the direction.} -\item{\code{go_term}: }{Contains gene ontology terms if there are any metal related ones associated with the annotation.} -\item{\code{go_name}: }{Contains gene ontology names if there are any metal related ones associated with the annotation.} -\item{\code{assigned_by}: }{Contains information about the source of the gene ontology term assignment.} -\item{\code{database}: }{Contains information about the source of the ChEBI annotation associated with gene ontology terms.} +"automatic_assertion". The information is split by source. +\item \code{evidence_source}: The original sources (e.g. literature, PDB) of evidence annotations split by source. +\item \code{reaction}: Contains information about the chemical reaction catalysed by the protein that involves the metal. +Can contain the EC ID, Rhea ID, direction specific Rhea ID, direction of the reaction and evidence for the direction. +\item \code{go_term}: Contains gene ontology terms if there are any metal related ones associated with the annotation. +\item \code{go_name}: Contains gene ontology names if there are any metal related ones associated with the annotation. 
+\item \code{assigned_by}: Contains information about the source of the gene ontology term assignment. +\item \code{database}: Contains information about the source of the ChEBI annotation associated with gene ontology terms. } + For each protein identifier the data frame contains information on the bound ligand as well as on its position if it is known. Since information about metal ligands can come from multiple sources, additional information (e.g. evidence) is nested in the returned data frame. In order to unnest the relevant information the following steps have to be taken: It is diff --git a/man/fetch_alphafold_aligned_error.Rd b/man/fetch_alphafold_aligned_error.Rd index ed18971e..31791692 100644 --- a/man/fetch_alphafold_aligned_error.Rd +++ b/man/fetch_alphafold_aligned_error.Rd @@ -34,12 +34,12 @@ A list that contains aligned errors for AlphaFold predictions. If return_data_fr TRUE, a data frame with this information is returned instead. The data frame contains the following columns: \itemize{ -\item{scored_residue: }{The error for this position is calculated based on the alignment to the -aligned residue.} -\item{aligned_residue: }{The residue that is aligned for the calculation of the error of the scored -residue} -\item{error: }{The predicted aligned error computed by alpha fold.} -\item{accession: }{The UniProt protein identifier.} +\item scored_residue: The error for this position is calculated based on the alignment to the +aligned residue. +\item aligned_residue: The residue that is aligned for the calculation of the error of the scored +residue. +\item error: The predicted aligned error computed by AlphaFold. +\item accession: The UniProt protein identifier.
} } \description{ diff --git a/man/fetch_alphafold_prediction.Rd b/man/fetch_alphafold_prediction.Rd index 3e693054..99b2141a 100644 --- a/man/fetch_alphafold_prediction.Rd +++ b/man/fetch_alphafold_prediction.Rd @@ -43,27 +43,27 @@ A list that contains atom level data for AlphaFold predictions. If return_data_f TRUE, a data frame with this information is returned instead. The data frame contains the following columns: \itemize{ -\item{label_id: }{Uniquely identifies every atom in the prediction following the standardised -convention for mmCIF files.} -\item{type_symbol: }{The code used to identify the atom species representing this atom type. -This code is the element symbol.} -\item{label_atom_id: }{Uniquely identifies every atom for the given residue following the -standardised convention for mmCIF files.} -\item{label_comp_id: }{A chemical identifier for the residue. This is the three- letter code -for the amino acid.} -\item{label_asym_id: }{Chain identifier following the standardised convention for mmCIF files. -Since every prediction only contains one protein this is always "A".} -\item{label_seq_id: }{Uniquely and sequentially identifies residues for each protein. The -numbering corresponds to the UniProt amino acid positions.} -\item{x: }{The x coordinate of the atom.} -\item{y: }{The y coordinate of the atom.} -\item{z: }{The z coordinate of the atom.} -\item{prediction_score: }{Contains the prediction score for each residue.} -\item{auth_seq_id: }{Same as \code{label_seq_id}. But of type character.} -\item{auth_comp_id: }{Same as \code{label_comp_id}.} -\item{auth_asym_id: }{Same as \code{label_asym_id}.} -\item{uniprot_id: }{The UniProt identifier of the predicted protein.} -\item{score_quality: }{Score annotations.} +\item label_id: Uniquely identifies every atom in the prediction following the standardised +convention for mmCIF files. +\item type_symbol: The code used to identify the atom species representing this atom type. 
+This code is the element symbol. +\item label_atom_id: Uniquely identifies every atom for the given residue following the +standardised convention for mmCIF files. +\item label_comp_id: A chemical identifier for the residue. This is the three-letter code +for the amino acid. +\item label_asym_id: Chain identifier following the standardised convention for mmCIF files. +Since every prediction only contains one protein this is always "A". +\item label_seq_id: Uniquely and sequentially identifies residues for each protein. The +numbering corresponds to the UniProt amino acid positions. +\item x: The x coordinate of the atom. +\item y: The y coordinate of the atom. +\item z: The z coordinate of the atom. +\item prediction_score: Contains the prediction score for each residue. +\item auth_seq_id: Same as \code{label_seq_id}, but of type character. +\item auth_comp_id: Same as \code{label_comp_id}. +\item auth_asym_id: Same as \code{label_asym_id}. +\item uniprot_id: The UniProt identifier of the predicted protein. +\item score_quality: Score annotations. } } \description{ diff --git a/man/fetch_metal_pdb.Rd b/man/fetch_metal_pdb.Rd index 7b111805..4d090bd8 100644 --- a/man/fetch_metal_pdb.Rd +++ b/man/fetch_metal_pdb.Rd @@ -75,30 +75,30 @@ nothing is supplied here, all possible columns will be retrieved.} A data frame that contains information about protein-metal binding sites. The data frame contains some columns that might not be self explanatory. \itemize{ -\item{auth_id_metal: }{Unique structure atom identifier of the metal, which is provided by +\item auth_id_metal: Unique structure atom identifier of the metal, which is provided by the author of the structure in order to match the identification used in the publication -that describes the structure.} -\item{auth_seq_id_metal: }{Residue identifier of the metal, which is provided by the author of +that describes the structure.
+\item auth_seq_id_metal: Residue identifier of the metal, which is provided by the author of the structure in order to match the identification used in the publication that describes the -structure.} -\item{pattern: }{Metal pattern for each metal bound by the structure.} -\item{is_representative: }{A representative site is a site selected to represent a cluster of +structure. +\item pattern: Metal pattern for each metal bound by the structure. +\item is_representative: A representative site is a site selected to represent a cluster of equivalent sites. The selection is done by choosing the PDB structure with the best X-ray resolution among those containing the sites in the cluster. NMR structures are generally discarded in favor of X-ray structures, unless all the sites in the cluster are found in NMR -structures.} -\item{auth_asym_id_ligand: }{Chain identifier of the metal-coordinating ligand residues, which +structures. +\item auth_asym_id_ligand: Chain identifier of the metal-coordinating ligand residues, which is provided by the author of the structure in order to match the identification used in the -publication that describes the structure.} -\item{auth_seq_id_ligand: }{Residue identifier of the metal-coordinating ligand residues, which +publication that describes the structure. +\item auth_seq_id_ligand: Residue identifier of the metal-coordinating ligand residues, which is provided by the author of the structure in order to match the identification used in the -publication that describes the structure.} -\item{auth_id_ligand: }{Unique structure atom identifier of the metal-coordinating ligand r +publication that describes the structure. 
+\item auth_id_ligand: Unique structure atom identifier of the metal-coordinating ligand r esidues, which is provided by the author of the structure in order to match the identification -used in the publication that describes the structure.} -\item{auth_atom_id_ligand: }{Unique residue specific atom identifier of the metal-coordinating +used in the publication that describes the structure. +\item auth_atom_id_ligand: Unique residue specific atom identifier of the metal-coordinating ligand residues, which is provided by the author of the structure in order to match the -identification used in the publication that describes the structure.} +identification used in the publication that describes the structure. } } \description{ diff --git a/man/fetch_pdb.Rd b/man/fetch_pdb.Rd index a2ffddf7..08b890e8 100644 --- a/man/fetch_pdb.Rd +++ b/man/fetch_pdb.Rd @@ -19,34 +19,34 @@ TRUE.} A data frame that contains structure metadata for the PDB IDs provided. The data frame contains some columns that might not be self explanatory. \itemize{ -\item{auth_asym_id: }{Chain identifier provided by the author of the structure in order to -match the identification used in the publication that describes the structure.} -\item{label_asym_id: }{Chain identifier following the standardised convention for mmCIF files.} -\item{entity_beg_seq_id, ref_beg_seq_id, length, pdb_sequence: }{\code{entity_beg_seq_id} is a +\item auth_asym_id: Chain identifier provided by the author of the structure in order to +match the identification used in the publication that describes the structure. +\item label_asym_id: Chain identifier following the standardised convention for mmCIF files. +\item entity_beg_seq_id, ref_beg_seq_id, length, pdb_sequence: \code{entity_beg_seq_id} is a position in the structure sequence (\code{pdb_sequence}) that matches the position given in \code{ref_beg_seq_id}, which is a position within the protein sequence (not included in the data frame). 
\code{length} identifies the stretch of sequence for which positions match accordingly between structure and protein sequence. \code{entity_beg_seq_id} is a residue ID -based on the standardised convention for mmCIF files.} -\item{auth_seq_id: }{Residue identifier provided by the author of the structure in order to +based on the standardised convention for mmCIF files. +\item auth_seq_id: Residue identifier provided by the author of the structure in order to match the identification used in the publication that describes the structure. This character vector has the same length as the \code{pdb_sequence} and each position is the identifier for the matching amino acid position in \code{pdb_sequence}. The contained values are not -necessarily numbers and the values do not have to be positive.} -\item{modified_monomer: }{Is composed of first the composition ID of the modification, followed +necessarily numbers and the values do not have to be positive. +\item modified_monomer: Is composed of first the composition ID of the modification, followed by the \code{label_seq_id} position. In parenthesis are the parent monomer identifiers as -they appear in the sequence.} -\item{ligand_*: }{Any column starting with the \code{ligand_*} prefix contains information about +they appear in the sequence. +\item ligand_*: Any column starting with the \code{ligand_*} prefix contains information about the position, identity and donors for ligand binding sites. If there are multiple entities of -ligands they are separated by "|". Specific donor level information is separated by ";".} -\item{secondar_structure: }{Contains information about helix and sheet secondary structure elements. -Individual regions are separated by ";".} -\item{unmodeled_structure: }{Contains information about unmodeled or partially modeled regions in -the model. 
Individual regions are separated by ";".} -\item{auth_seq_id_original: }{In some cases the sequence positions do not match the number of residues +ligands they are separated by "|". Specific donor level information is separated by ";". +\item secondar_structure: Contains information about helix and sheet secondary structure elements. +Individual regions are separated by ";". +\item unmodeled_structure: Contains information about unmodeled or partially modeled regions in +the model. Individual regions are separated by ";". +\item auth_seq_id_original: In some cases the sequence positions do not match the number of residues in the sequence either because positions are missing or duplicated. This always coincides with modified residues, however does not always occur when there is a modified residue in the sequence. This column -contains the original \code{auth_seq_id} information that does not have these positions corrected.} +contains the original \code{auth_seq_id} information that does not have these positions corrected. } } \description{ diff --git a/man/fetch_pdb_structure.Rd b/man/fetch_pdb_structure.Rd index 67be618e..fb29c3dc 100644 --- a/man/fetch_pdb_structure.Rd +++ b/man/fetch_pdb_structure.Rd @@ -21,47 +21,47 @@ A list that contains atom data for each PDB structures provided. If return_data_ TRUE, a data frame with this information is returned instead. The data frame contains the following columns: \itemize{ -\item{label_id: }{Uniquely identifies every atom in the structure following the standardised +\item label_id: Uniquely identifies every atom in the structure following the standardised convention for mmCIF files. Example value: "5", "C12", "Ca3g28", "Fe3+17", "H*251", "boron2a", -"C a phe 83 a 0", "Zn Zn 301 A 0"} -\item{type_symbol: }{The code used to identify the atom species representing this atom type. +"C a phe 83 a 0", "Zn Zn 301 A 0" +\item type_symbol: The code used to identify the atom species representing this atom type. 
Normally this code is the element symbol. The code may be composed of any character except an underscore with the additional proviso that digits designate an oxidation state and must be -followed by a + or - character. Example values: "C", "Cu2+", "H(SDS)", "dummy", "FeNi".} -\item{label_atom_id: }{Uniquely identifies every atom for the given residue following the -standardised convention for mmCIF files. Example values: "CA", "HB1", "CB", "N"} -\item{label_comp_id: }{A chemical identifier for the residue. For protein polymer entities, +followed by a + or - character. Example values: "C", "Cu2+", "H(SDS)", "dummy", "FeNi". +\item label_atom_id: Uniquely identifies every atom for the given residue following the +standardised convention for mmCIF files. Example values: "CA", "HB1", "CB", "N" +\item label_comp_id: A chemical identifier for the residue. For protein polymer entities, this is the three- letter code for the amino acid. For nucleic acid polymer entities, this is -the one-letter code for the base. Example values: "ala", "val", "A", "C".} -\item{label_asym_id: }{Chain identifier following the standardised convention for mmCIF files. -Example values: "1", "A", "2B3".} -\item{entity_id: }{Records details about the molecular entities that are present in the +the one-letter code for the base. Example values: "ala", "val", "A", "C". +\item label_asym_id: Chain identifier following the standardised convention for mmCIF files. +Example values: "1", "A", "2B3". +\item entity_id: Records details about the molecular entities that are present in the crystallographic structure. Usually all different types of molecular entities such as polymer entities, non-polymer entities or water molecules are numbered once for each structure. Each type of non-polymer entity has its own number. 
Thus, the highest number in this column -represents the number of different molecule types in the structure.} -\item{label_seq_id: }{Uniquely and sequentially identifies residues for each \code{label_asym_id}. -This is always a number and the sequence of numbers always progresses in increasing numerical order.} -\item{x: }{The x coordinate of the atom.} -\item{y: }{The y coordinate of the atom.} -\item{z: }{The z coordinate of the atom.} -\item{site_occupancy: }{The fraction of the atom type present at this site.} -\item{b_iso_or_equivalent: }{Contains the B-factor or isotopic atomic displacement factor for -each atom.} -\item{formal_charge: }{The net integer charge assigned to this atom. This is the formal charge +represents the number of different molecule types in the structure. +\item label_seq_id: Uniquely and sequentially identifies residues for each \code{label_asym_id}. +This is always a number and the sequence of numbers always progresses in increasing numerical order. +\item x: The x coordinate of the atom. +\item y: The y coordinate of the atom. +\item z: The z coordinate of the atom. +\item site_occupancy: The fraction of the atom type present at this site. +\item b_iso_or_equivalent: Contains the B-factor or isotopic atomic displacement factor for +each atom. +\item formal_charge: The net integer charge assigned to this atom. This is the formal charge assignment normally found in chemical diagrams. It is currently only assigned in a small subset -of structures.} -\item{auth_seq_id: }{An alternative residue identifier (\code{label_seq_id}) provided by the +of structures. +\item auth_seq_id: An alternative residue identifier (\code{label_seq_id}) provided by the author of the structure in order to match the identification used in the publication that -describes the structure. 
This does not need to be numeric and is therefore of type character.} -\item{auth_comp_id: }{An alternative chemical identifier (\code{label_comp_id}) provided by the +describes the structure. This does not need to be numeric and is therefore of type character. +\item auth_comp_id: An alternative chemical identifier (\code{label_comp_id}) provided by the author of the structure in order to match the identification used in the publication that -describes the structure.} -\item{auth_asym_id: }{An alternative chain identifier (\code{label_asym_id}) provided by the +describes the structure. +\item auth_asym_id: An alternative chain identifier (\code{label_asym_id}) provided by the author of the structure in order to match the identification used in the publication that -describes the structure.} -\item{pdb_model_number: }{The PDB model number.} -\item{pdb_id: }{The protein database identifier for the structure.} +describes the structure. +\item pdb_model_number: The PDB model number. +\item pdb_id: The protein database identifier for the structure. } } \description{ diff --git a/man/find_peptide_in_structure.Rd b/man/find_peptide_in_structure.Rd index b96ba805..556e5f93 100644 --- a/man/find_peptide_in_structure.Rd +++ b/man/find_peptide_in_structure.Rd @@ -45,34 +45,34 @@ peptide is not found in any structure or no structure is associated with the pro frame contains NAs values for the output columns. The data frame contains the following and additional columns: \itemize{ -\item{auth_asym_id: }{Chain identifier provided by the author of the structure in order to -match the identification used in the publication that describes the structure.} -\item{label_asym_id: }{Chain identifier following the standardised convention for mmCIF files.} -\item{peptide_seq_in_pdb: }{The sequence of the peptide mapped to the structure. 
If the +\item auth_asym_id: Chain identifier provided by the author of the structure in order to +match the identification used in the publication that describes the structure. +\item label_asym_id: Chain identifier following the standardised convention for mmCIF files. +\item peptide_seq_in_pdb: The sequence of the peptide mapped to the structure. If the peptide only maps partially, then only the part of the sequence that maps on the structure is -returned.} -\item{fit_type: }{The fit type is either "partial" or "fully" and it indicates if the complete -peptide or only part of it was found in the structure.} -\item{label_seq_id_start: }{Contains the first residue position of the peptide in the structure -following the standardised convention for mmCIF files.} -\item{label_seq_id_end: }{Contains the last residue position of the peptide in the structure -following the standardised convention for mmCIF files.} -\item{auth_seq_id_start: }{Contains the first residue position of the peptide in the structure +returned. +\item fit_type: The fit type is either "partial" or "fully" and it indicates if the complete +peptide or only part of it was found in the structure. +\item label_seq_id_start: Contains the first residue position of the peptide in the structure +following the standardised convention for mmCIF files. +\item label_seq_id_end: Contains the last residue position of the peptide in the structure +following the standardised convention for mmCIF files. +\item auth_seq_id_start: Contains the first residue position of the peptide in the structure based on the alternative residue identifier provided by the author of the structure in order to match the identification used in the publication that describes the structure. This does -not need to be numeric and is therefore of type character.} -\item{auth_seq_id_end: }{Contains the last residue position of the peptide in the structure +not need to be numeric and is therefore of type character. 
+\item auth_seq_id_end: Contains the last residue position of the peptide in the structure based on the alternative residue identifier provided by the author of the structure in order to match the identification used in the publication that describes the structure. This does -not need to be numeric and is therefore of type character.} -\item{auth_seq_id: }{Contains all positions (separated by ";") of the peptide in the structure +not need to be numeric and is therefore of type character. +\item auth_seq_id: Contains all positions (separated by ";") of the peptide in the structure based on the alternative residue identifier provided by the author of the structure in order to match the identification used in the publication that describes the structure. This does -not need to be numeric and is therefore of type character.} -\item{n_peptides: }{The number of peptides from one protein that were searched for within the -current structure.} -\item{n_peptides_in_structure: }{The number of peptides from one protein that were found within -the current structure.} +not need to be numeric and is therefore of type character. +\item n_peptides: The number of peptides from one protein that were searched for within the +current structure. +\item n_peptides_in_structure: The number of peptides from one protein that were found within +the current structure. } } \description{ diff --git a/man/fit_drc_4p.Rd b/man/fit_drc_4p.Rd index 3bffd464..0397d7a0 100644 --- a/man/fit_drc_4p.Rd +++ b/man/fit_drc_4p.Rd @@ -83,10 +83,10 @@ If \code{include_models = FALSE} a data frame is returned that contains correlat of predicted to measured values as a measure of the goodness of the curve fit, an associated p-value and the four parameters of the model for each group. Furthermore, input data for plots is returned in the columns \code{plot_curve} (curve and confidence interval) and \code{plot_points} -(measured points). 
If \ code{include_models = TURE}, a list is returned that contains: +(measured points). If \code{include_models = TRUE}, a list is returned that contains: \itemize{ -\item{\code{fit_objects}: }{The fit objects of type \code{drc} for each group.} -\item{\code{correlations}: }{The correlation data frame described above} +\item \code{fit_objects}: The fit objects of type \code{drc} for each group. +\item \code{correlations}: The correlation data frame described above } } \description{ diff --git a/man/predict_alphafold_domain.Rd b/man/predict_alphafold_domain.Rd index 0526bcfc..41634ca2 100644 --- a/man/predict_alphafold_domain.Rd +++ b/man/predict_alphafold_domain.Rd @@ -42,9 +42,9 @@ A list of the provided proteins that contains domain assignments for each residu \code{TRUE}, a data frame with this information is returned instead. The data frame contains the following columns: \itemize{ -\item{residue: }{The protein residue number.} -\item{domain: }{A numeric value representing a distinct predicted domain in the protein.} -\item{accession: }{The UniProt protein identifier.} +\item residue: The protein residue number. +\item domain: A numeric value representing a distinct predicted domain in the protein. +\item accession: The UniProt protein identifier. } } \description{ From 686f8559fb94fdd70316a5bf5153808713d00b34 Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 15 Feb 2024 01:14:51 +0100 Subject: [PATCH 70/71] Fix CITATION --- inst/CITATION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/CITATION b/inst/CITATION index af11cd24..45ea13ae 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -1,6 +1,6 @@ citHeader("To cite protti in publications, please use:") -bibentry(entry = "article", +bibentry(bibtype = "article", textVersion = "Quast, J.P., Schuster, D., Picotti, P. (2022). protti: an R package for comprehensive data analysis of peptide- and protein-centric bottom-up proteomics data.
Bioinformatics Advances, 2(1).", author = "Jan-Philipp Quast, Dina Schuster, Paola Picotti", title = "protti: an R package for comprehensive data analysis of peptide- and protein-centric bottom-up proteomics data", From aa882489b24d6c1af3b1412f614c2dc5646d031f Mon Sep 17 00:00:00 2001 From: jpquast Date: Thu, 15 Feb 2024 09:00:28 +0100 Subject: [PATCH 71/71] Fix QuickGO test --- tests/testthat/test-fetch_extract_and_enrichment_functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-fetch_extract_and_enrichment_functions.R b/tests/testthat/test-fetch_extract_and_enrichment_functions.R index e6a0805a..6f420b21 100644 --- a/tests/testthat/test-fetch_extract_and_enrichment_functions.R +++ b/tests/testthat/test-fetch_extract_and_enrichment_functions.R @@ -454,7 +454,7 @@ if (Sys.getenv("TEST_PROTTI") == "true") { test_that("fetch_quickgo works", { expect_is(annotations, "data.frame") expect_gte(nrow(annotations), 24) - expect_lte(nrow(annotations), 30) + expect_lte(nrow(annotations), 40) expect_equal(ncol(annotations), 15) terms <- fetch_quickgo(type = "terms")