Add fetch_interpro
jpquast committed Oct 1, 2024
1 parent d25c3f3 commit 17f2afe
Showing 7 changed files with 374 additions and 2 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -24,6 +24,7 @@ export(fetch_alphafold_prediction)
export(fetch_chebi)
export(fetch_eco)
export(fetch_go)
export(fetch_interpro)
export(fetch_kegg)
export(fetch_metal_pdb)
export(fetch_mobidb)
@@ -171,6 +172,7 @@ importFrom(stringr,str_sort)
importFrom(stringr,str_split)
importFrom(stringr,str_squish)
importFrom(stringr,str_sub)
importFrom(stringr,str_to_upper)
importFrom(tibble,as_tibble)
importFrom(tibble,column_to_rownames)
importFrom(tibble,rowid_to_column)
3 changes: 3 additions & 0 deletions NEWS.md
@@ -1,5 +1,8 @@
# protti 0.9.0.9000

## New features
* Added `fetch_interpro()`. The function fetches information from the InterPro database. There are two options: either domain-level information about the proteins of interest is retrieved, which also includes e.g. gene ontology terms of the domains as well as their positions within the protein, or residue-level information is retrieved, i.e. any protein annotations that focus on individual residues or short stretches, such as active sites and binding sites.

## Additional Changes

* `assign_peptide_type()` now takes a `start` argument containing the start position of a peptide. If a protein has no peptide starting at position `1` but does have a peptide starting at position `2`, that peptide is considered "tryptic" at the N-terminus. This is because the initial methionine is likely removed by processing in every copy of the protein, making position `2` the effective N-terminus.
292 changes: 292 additions & 0 deletions R/fetch_interpro.R
@@ -0,0 +1,292 @@
#' Fetch domain and residue information from InterPro
#'
#' Fetches either domain-level information, e.g. including gene ontology annotations, or
#' residue-level information from the InterPro database.
#'
#' @param uniprot_ids a character vector of UniProt accession numbers.
#' @param return_residue_info a logical value that specifies whether residue-level (`TRUE`) or
#' domain-level (`FALSE`) information should be returned by the function. The default is `FALSE`.
#' @param manual_query optional, a character value that is a custom query to the InterPro database.
#' The query is pasted after "https://www.ebi.ac.uk/interpro/api/" and before "&page_size=200".
#' If provided, it is used instead of `uniprot_ids` and the raw data of the query is returned as a list.
#' @param page_size a numeric value that specifies the number of entries retrieved per page of a
#' request. The function iterates through all pages either way, but this parameter allows you to
#' fine-tune the number of iterations and thus the number of requests sent to the database. Default is 200.
#' @param max_tries a numeric value that specifies the number of times the function tries to download
#' the data in case an error occurs. The default is 3.
#' @param timeout a numeric value that specifies the maximum request time per try. Default is 20 seconds.
#' @param show_progress a logical value that determines if a progress bar will be shown. Default
#' is TRUE.
#'
#' @return A data frame that contains either domain or residue level information for the provided
#' UniProt IDs.
#' @import dplyr
#' @import progress
#' @import purrr
#' @importFrom tidyr crossing
#' @importFrom stringr str_to_upper
#' @importFrom curl has_internet
#' @export
#'
#' @examples
#' \donttest{
#' uniprot_ids <- c("P36578", "O43324", "Q00796", "O32583")
#'
#' domain_info <- fetch_interpro(uniprot_ids = uniprot_ids)
#'
#' head(domain_info)
#'
#' residue_info <- fetch_interpro(uniprot_ids = uniprot_ids,
#' return_residue_info = TRUE)
#'
#' head(residue_info)
#' }
fetch_interpro <- function(uniprot_ids = NULL,
return_residue_info = FALSE,
manual_query = NULL,
page_size = 200,
max_tries = 3,
timeout = 20,
show_progress = TRUE) {

# Define the progress bar settings
progress_option_residue <- if (show_progress) {
list(
type = "iterator",
format = "Fetching Residue Positions {cli::pb_bar} {cli::pb_percent} ({cli::pb_current}/{cli::pb_total}) ETA: {cli::pb_eta}",
clear = TRUE
)
} else {
NULL
}
progress_option_domain <- if (show_progress) {
list(
type = "iterator",
format = "Fetching InterPro Domains {cli::pb_bar} {cli::pb_percent} ({cli::pb_current}/{cli::pb_total}) ETA: {cli::pb_eta}",
clear = TRUE
)
} else {
NULL
}

# check for internet connection
if (!curl::has_internet()) {
message("No internet connection.")
return(invisible(NULL))
}

# check if any information for retrieval is provided
if(missing(manual_query) & missing(uniprot_ids)){
stop('Please provide information about what to retrieve to either "uniprot_ids" or "manual_query".')
}

# check if uniprot_ids were provided
if(!missing(uniprot_ids)){
uniprot_ids <- uniprot_ids[!is.na(uniprot_ids)]
if (length(uniprot_ids) == 0) {
stop("Please provide valid UniProt IDs.")
}
query_url_part <- paste0("entry/interpro/protein/uniprot/", uniprot_ids, "?")
}

# check if manual_query was provided
# if it was provided it will always be used over protein IDs
if(!missing(manual_query)){

# remove NA values
query_url_part <- manual_query[!is.na(manual_query)]
if (length(query_url_part) == 0) {
stop("Please provide a valid manual query based on the InterPro documentation.\n
The query is pasted after the general API URL: https://www.ebi.ac.uk/interpro/api/ \n
A valid query is e.g. entry/interpro/protein/uniprot/O32583?")
}
}

# query data
if(!return_residue_info | !missing(manual_query)){
query_result <- purrr::map(
.x = query_url_part,
.f = ~ {

# Initialize the next page URL with the base query URL
next_url <- paste0("https://www.ebi.ac.uk/interpro/api/", .x, "&page_size=", page_size)
attempts <- 0
all_results <- list()

# progress bar variable
total_pages <- 1
current_page <- 1

while (!is.null(next_url)) {
result <- tryCatch({
# Call the protti function to query the API
query <- protti:::try_query(next_url,
type = "application/json",
max_tries = max_tries,
timeout = timeout)

# Store the results from the current page
all_results[[current_page]] <- query

# initialize progress bar
if (show_progress == TRUE & current_page == 1 & length(query_url_part) == 1 & !is.null(query$count)) {
pb <- progress::progress_bar$new(total = ceiling(query$count/page_size),
format = " Fetching Pages [:bar] :current/:total (:percent) :eta")
}

# tick progress bar
if (show_progress == TRUE & length(query_url_part) == 1 & !is.null(query$count) && !pb$finished) {
pb$tick()
}

# Get the next page URL from the "next" field
next_url <- query$`next`
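# when there is no further page, `next` is NULL and the surrounding while loop terminates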

# add to page counter
current_page <- current_page + 1

attempts <- 0 # Reset attempts counter on successful request

NULL
}, error = function(e) {
if (grepl("408", e$message)) {
Sys.sleep(2) # Sleep and retry for timeout errors (408)
NULL
} else {
attempts <<- attempts + 1
if (attempts < max_tries) {
Sys.sleep(2) # Sleep and retry up to "max_tries" times
NULL
} else {
stop(paste("Error retrieving data for URL:", next_url, " - ", e$message))
}
}
})
}
all_results
},
.progress = progress_option_domain
)
}

if(!missing(manual_query)){
# return raw data for manual queries
return(query_result)
}

# if residue level information should be returned, fetch it here
if(return_residue_info){

query_residue <- purrr::map_dfr(
.x = uniprot_ids,
.f = ~ {

# Build the residue-level query URL for this protein using the "?residues" modifier
next_url <- paste0("https://www.ebi.ac.uk/interpro/api/protein/uniprot/", .x, "?residues")

query <- protti:::try_query(next_url,
type = "application/json",
max_tries = max_tries,
timeout = timeout)

if(length(query) == 0){
result <- data.frame(accession = .x)
} else{

# map for each database accession
result <- purrr::map_dfr(.x = query,
.f = ~{
# map for each location
location <- purrr::map_dfr(.x = .x[["locations"]],
.f = ~{
# map for each fragment
fragment <- purrr::map_dfr(.x = .x[["fragments"]],
.f = ~{
# lowest level
data.frame(start = .x[["start"]],
end = .x[["end"]],
residues = .x[["residues"]])
})

fragment %>%
dplyr::mutate(fragment_description = .x[["description"]])
})
location %>%
dplyr::mutate(source_database = .x[["source_database"]],
source_accession = .x[["accession"]],
source_name = .x[["name"]])
}) %>%
mutate(accession = .x)
}

result

},
.progress = progress_option_residue
)

return(query_residue)
}

# start with unnesting and flattening the data
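# query_result is nested by query URL (one per UniProt ID) and then by page; extracting "results"
# and flattening twice leaves one list element per InterPro entry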
flattened_query_result <- purrr::map(.x = query_result,
.f = ~{
# page level
purrr::map(.x = .x,
.f = ~{
.x[["results"]]
})
}) %>%
purrr::flatten() %>%
purrr::flatten()

# unnest domain level data
domain_result <- purrr::map_dfr(.x = flattened_query_result,
.f = ~{

go_terms <- purrr::map_dfr(.x = .x[["metadata"]][["go_terms"]],
.f = ~{
data.frame(go_id = .x[["identifier"]],
go_name = .x[["name"]],
go_code = .x[["category"]][["code"]],
go_type = .x[["category"]][["name"]])
})

metadata <- data.frame(identifier = .x[["metadata"]][["accession"]],
identifier_name = .x[["metadata"]][["name"]],
identifier_source_database = .x[["metadata"]][["source_database"]],
identifier_type = .x[["metadata"]][["type"]])

if(nrow(go_terms) != 0){
metadata <- metadata %>%
bind_cols(go_terms)
}

# Protein level
protein_level <- purrr::map_dfr(.x = .x[["proteins"]],
.f = ~{
# Entry protein location level
entry_protein_locations_level <- purrr::map_dfr(
.x = .x[["entry_protein_locations"]],
.f = ~{
# Fragment Level
protein_info <- purrr::map_dfr(.x = .x[["fragments"]],
.f = ~{
data.frame(start = .x[["start"]],
end = .x[["end"]],
dc_status = .x[["dc-status"]],
representative = .x[["representative"]])
}) %>%
dplyr::mutate(model = .x[["model"]],
score = .x[["score"]])
}
) %>%
dplyr::mutate(accession = stringr::str_to_upper(.x[["accession"]]))
})

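# combine the entry metadata (plus GO terms, if any) with every protein location row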
tidyr::crossing(metadata, protein_level)

})

domain_result
}
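
Below is a minimal usage sketch (illustrative only, not part of the commit) of how the two output formats of `fetch_interpro()` could be explored. The column names follow the data frames constructed in the function body above; the UniProt IDs and the manual query string are taken from the examples and messages in this file.

library(protti)
library(dplyr)

uniprot_ids <- c("P36578", "O43324")

# domain-level output: one row per InterPro entry, protein and location fragment
domain_info <- fetch_interpro(uniprot_ids = uniprot_ids)
domain_info %>%
  count(accession, identifier_type)

# residue-level output: active sites, binding sites and similar short annotations
# (assumes at least one of the queried proteins has residue annotations)
residue_info <- fetch_interpro(uniprot_ids = uniprot_ids, return_residue_info = TRUE)
residue_info %>%
  count(accession, source_database)

# a manual query is passed through to the API and its raw data is returned as a list
raw_result <- fetch_interpro(manual_query = "entry/interpro/protein/uniprot/O32583?")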
2 changes: 1 addition & 1 deletion R/fetch_uniprot.R
@@ -9,7 +9,7 @@
#' @param batchsize a numeric value that specifies the number of proteins processed in a
#' single query. Default and max value is 200.
#' @param max_tries a numeric value that specifies the number of times the function tries to download
#' the data in case an error occurs.
#' the data in case an error occurs. The default is 10.
#' @param timeout a numeric value that specifies the maximum request time per try. Default is 20 seconds.
#' @param show_progress a logical value that determines if a progress bar will be shown. Default
#' is TRUE.
60 changes: 60 additions & 0 deletions man/fetch_interpro.Rd

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion man/fetch_uniprot.Rd

