Skip to content

Commit

Permalink
multiple GENCODE versions
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Jul 19, 2023
1 parent 41b3360 commit e8b9b4a
Show file tree
Hide file tree
Showing 8 changed files with 226 additions and 108 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Package: geneOncoX
Type: Package
Title: Human gene annotations for the oncology domain
Version: 0.7.5
Date: 2023-07-11
Date: 2023-07-19
Authors@R: person(given = "Sigve", family = "Nakken", role = c("aut", "cre"),
email = "sigven@ifi.uio.no",
comment = c(ORCID = "0000-0001-8468-2050"))
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Version 0.7.6

* Allowing the possibility to query and retrieve different versions of GENCODE transcripts (using Ensembl versions 110 down to 105, effectively querying GENCODE v44 down to GENCODE v39). This will only
affect grch38; for grch37, GENCODE v19 will always be used.

# Version 0.7.5

* Updated metadata table
Expand Down
35 changes: 33 additions & 2 deletions R/get_gencode.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
#' @param cache_dir Local directory for data download
#' @param force_download Logical indicating if local cache should be overwritten
#' (set to TRUE to re-download if file exists in cache)
#' @param ensembl_release version of Ensembl to use - this will
#' dictate the version of GENCODE used for grch38 (e.g. 110 implies
#' GENCODE v44, 109 implies v43, 108 implies v42 and so on)
#'
#' @returns
#' \strong{metadata} - A data frame with 3 rows and 6 columns:
Expand Down Expand Up @@ -93,11 +96,39 @@
#'

get_gencode <- function(cache_dir = NA,
                        force_download = FALSE,
                        ensembl_release = 110) {
  # Supported Ensembl releases (names) and the GENCODE release each one
  # corresponds to (values). Used both to validate `ensembl_release` and to
  # derive the GENCODE part of the dataset name.
  gencode_by_ensembl <- c(
    "110" = 44, "109" = 43, "108" = 42,
    "107" = 41, "106" = 40, "105" = 39
  )

  # Accept exactly one supported release. Matching on the character form
  # rejects NA and non-whole values (e.g. 109.5) as well as out-of-range
  # releases, so an unsupported value can never be turned into a dataset
  # name below.
  release_key <- as.character(ensembl_release)
  is_supported <-
    length(release_key) == 1 &&
    release_key %in% names(gencode_by_ensembl)

  if (!is_supported) {
    lgr::lgr$fatal(
      paste0(
        "ERROR: Ensembl release must be an integer between 105 and ",
        "110 - exiting"
      )
    )
    # Kept for backward compatibility: invalid input is reported through
    # the logger and signalled to the caller with a return value of 0.
    return(0)
  }

  gencode_release <- gencode_by_ensembl[[release_key]]

  # Dataset name has the form "gencode_<GENCODE release>_<Ensembl release>".
  get_gox_data(
    cache_dir = cache_dir,
    force_download = force_download,
    db = paste0(
      "gencode_", gencode_release,
      "_", ensembl_release
    )
  )
}
Binary file modified R/sysdata.rda
Binary file not shown.
282 changes: 180 additions & 102 deletions data-raw/data-raw.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,105 +48,7 @@ gene_alias$records <- get_gene_aliases_ncbi(
path_data_raw = file.path(here::here(), "data-raw")
)

gene_gencode <- list()
gene_gencode$records <- list()
gene_gencode$metadata <- metadata[["gencode"]]

gene_gencode$records[["grch38"]] <- gencode_get_transcripts(
build = "grch38",
gene_info = gene_info,
gencode_version = as.integer(metadata$gencode[1, ]$source_version),
ensembl_version = as.integer(metadata$gencode[2, ]$source_version),
uniprot_version = as.character(metadata$gencode[3, ]$source_version),
gene_alias = gene_alias
)
gene_gencode$records[["grch37"]] <- gencode_get_transcripts(
build = "grch37",
gene_info = gene_info,
gencode_version = as.integer(19),
ensembl_version = as.integer(metadata$gencode[2, ]$source_version),
uniprot_version = as.character(metadata$gencode[3, ]$source_version),
gene_alias = gene_alias
)

## "Rescue" some UniProt identifiers from
## grch38 - missing/not found for grch37

up_xref_grch38 <- gene_gencode[["records"]][["grch38"]] |>
dplyr::select(
entrezgene, uniprot_acc,
uniprot_id
) |>
dplyr::filter(!is.na(entrezgene) &
!is.na(uniprot_acc) &
!is.na(uniprot_id)) |>
# dplyr::select(-uniprot_acc) |>
dplyr::distinct()

up_xref <- list()
up_xref[["found"]] <-
gene_gencode[["records"]][["grch37"]] |>
dplyr::select(
entrezgene, uniprot_acc,
ensembl_transcript_id, uniprot_id
) |>
dplyr::filter(!is.na(entrezgene) &
!is.na(uniprot_acc) &
!is.na(uniprot_id)) |>
dplyr::distinct()

up_xref[["rescued_from_grch38"]] <-
gene_gencode[["records"]][["grch37"]] |>
dplyr::select(
entrezgene, uniprot_acc,
ensembl_transcript_id, uniprot_id
) |>
dplyr::filter(!is.na(entrezgene) &
!is.na(uniprot_acc) &
is.na(uniprot_id)) |>
dplyr::select(-c(uniprot_id)) |>
dplyr::left_join(
up_xref_grch38,
by = c("entrezgene", "uniprot_acc")
) |>
dplyr::filter(!is.na(uniprot_id)) |>
dplyr::distinct()

up_xref[["found"]] <- as.data.frame(
up_xref[["found"]] |>
dplyr::bind_rows(up_xref[["rescued_from_grch38"]]) |>
dplyr::distinct() |>
dplyr::group_by(
ensembl_transcript_id,
uniprot_acc, uniprot_id
) |>
dplyr::summarise(
entrezgene = paste(entrezgene, collapse = "&"),
.groups = "drop"
) |>
dplyr::filter(!stringr::str_detect(
entrezgene, "&"
)) |>
dplyr::group_by(
entrezgene
) |>
dplyr::mutate(
entrezgene = as.integer(entrezgene)
)
)


gene_gencode$records[["grch37"]] <-
gene_gencode$records[["grch37"]] |>
dplyr::select(-c(uniprot_id, uniprot_acc)) |>
dplyr::left_join(
up_xref$found,
by = c(
"entrezgene",
"ensembl_transcript_id"
)
) |>
dplyr::distinct()

gene_summary <- get_function_summary_ncbi(gene_df = gene_info)

Expand Down Expand Up @@ -325,9 +227,9 @@ rm(signaling_genes)
rm(cancermine_genes)
rm(tso500)
rm(gene_summary)
rm(gene_info)
#rm(gene_info)
rm(dbnsfp_annotations)
rm(metadata)
#rm(metadata)
rm(ncg)
rm(cgc_all)
rm(tcga_drivers)
Expand Down Expand Up @@ -360,11 +262,187 @@ db_id_ref <- data.frame()

db <- list()
db[["basic"]] <- gene_basic
db[["gencode"]] <- gene_gencode
#db[["gencode"]] <- gene_gencode
db[["alias"]] <- gene_alias
db[["predisposition"]] <- gene_predisposition
db[["panels"]] <- gene_panels



# Build, save and upload one GENCODE transcript dataset per Ensembl release:
# the current release listed in `metadata` plus the five preceding ones.
# Relies on objects created outside this section (`metadata`, `gene_info`,
# `gene_alias`, `gd_records`, `version_bump`, `db_id_ref`).

# Current GENCODE / Ensembl versions, looked up in the gencode metadata
# table by source abbreviation.
gencode_release_current <-
as.integer(metadata[['gencode']][
metadata$gencode$source_abbreviation == "gencode",]$source_version)
ensembl_release_current <-
as.integer(metadata[['gencode']][
metadata$gencode$source_abbreviation == "ensembl",]$source_version)
# Offset from the current release; the loop runs for offsets 0 through 5.
ensembl_iter <- 0
while(ensembl_iter <= 5){
# Both releases are stepped back by the same offset.
gencode_release <- gencode_release_current - ensembl_iter
ensembl_release <- ensembl_release_current - ensembl_iter
# The counter is advanced here, before the work below; the releases for
# this iteration have already been derived from the old value.
ensembl_iter <- ensembl_iter + 1
# Progress output: "<ensembl release> - <gencode release>".
cat(ensembl_release, " - ", gencode_release)
cat("\n")

# Fresh container per release: transcript records per build + metadata.
gene_gencode <- list()
gene_gencode$records <- list()
gene_gencode$metadata <- metadata[["gencode"]]

# Overwrite the versions in this copy of the metadata so that it reflects
# the release pair processed in this iteration.
gene_gencode$metadata[
gene_gencode$metadata$source_abbreviation == "gencode",
"source_version"] <- gencode_release
gene_gencode$metadata[
gene_gencode$metadata$source_abbreviation == "ensembl",
"source_version"] <- ensembl_release

# NOTE(review): the UniProt version is read from row 3 of the metadata
# table by position, unlike the abbreviation-based lookups above -
# confirm the row order is stable.
gene_gencode$records[["grch38"]] <- gencode_get_transcripts(
build = "grch38",
gene_info = gene_info,
gencode_version = gencode_release,
ensembl_version = ensembl_release,
uniprot_version = as.character(
metadata$gencode[3, ]$source_version),
gene_alias = gene_alias
)
# For grch37 the GENCODE version is fixed at 19; only the Ensembl version
# follows the loop.
gene_gencode$records[["grch37"]] <- gencode_get_transcripts(
build = "grch37",
gene_info = gene_info,
gencode_version = as.integer(19),
ensembl_version = ensembl_release,
uniprot_version = as.character(
metadata$gencode[3, ]$source_version),
gene_alias = gene_alias
)

## "Rescue" some UniProt identifiers from
## grch38 - missing/not found for grch37

# Complete (entrezgene, uniprot_acc, uniprot_id) triplets from grch38,
# used below as the lookup table for the rescue.
up_xref_grch38 <- gene_gencode[["records"]][["grch38"]] |>
dplyr::select(
entrezgene, uniprot_acc,
uniprot_id
) |>
dplyr::filter(!is.na(entrezgene) &
!is.na(uniprot_acc) &
!is.na(uniprot_id)) |>
# dplyr::select(-uniprot_acc) |>
dplyr::distinct()

up_xref <- list()
# grch37 transcripts that already have a complete UniProt cross-reference.
up_xref[["found"]] <-
gene_gencode[["records"]][["grch37"]] |>
dplyr::select(
entrezgene, uniprot_acc,
ensembl_transcript_id, uniprot_id
) |>
dplyr::filter(!is.na(entrezgene) &
!is.na(uniprot_acc) &
!is.na(uniprot_id)) |>
dplyr::distinct()

# grch37 transcripts with an accession but no uniprot_id: the id is filled
# in from the grch38 table by (entrezgene, uniprot_acc); rows that still
# have no id after the join are dropped.
up_xref[["rescued_from_grch38"]] <-
gene_gencode[["records"]][["grch37"]] |>
dplyr::select(
entrezgene, uniprot_acc,
ensembl_transcript_id, uniprot_id
) |>
dplyr::filter(!is.na(entrezgene) &
!is.na(uniprot_acc) &
is.na(uniprot_id)) |>
dplyr::select(-c(uniprot_id)) |>
dplyr::left_join(
up_xref_grch38,
by = c("entrezgene", "uniprot_acc")
) |>
dplyr::filter(!is.na(uniprot_id)) |>
dplyr::distinct()

# Merge found + rescued entries. Entrez gene ids are collapsed with "&" per
# (transcript, accession, id) combination, and any combination that maps to
# more than one gene (i.e. contains "&") is discarded before the id is
# converted back to integer.
up_xref[["found"]] <- as.data.frame(
up_xref[["found"]] |>
dplyr::bind_rows(up_xref[["rescued_from_grch38"]]) |>
dplyr::distinct() |>
dplyr::group_by(
ensembl_transcript_id,
uniprot_acc, uniprot_id
) |>
dplyr::summarise(
entrezgene = paste(entrezgene, collapse = "&"),
.groups = "drop"
) |>
dplyr::filter(!stringr::str_detect(
entrezgene, "&"
)) |>
dplyr::group_by(
entrezgene
) |>
dplyr::mutate(
entrezgene = as.integer(entrezgene)
)
)


# Replace the UniProt columns of the grch37 records with the merged
# cross-reference, joined on gene and transcript.
gene_gencode$records[["grch37"]] <-
gene_gencode$records[["grch37"]] |>
dplyr::select(-c(uniprot_id, uniprot_acc)) |>
dplyr::left_join(
up_xref$found,
by = c(
"entrezgene",
"ensembl_transcript_id"
)
) |>
dplyr::distinct()


# Dataset key of the form "gencode_<gencode release>_<ensembl release>".
elem <- paste0(
"gencode_", gencode_release, "_", ensembl_release)

# Write the dataset to the local staging directory ...
saveRDS(
gene_gencode,
file = paste0(
"data-raw/gd_local/gene_", elem, "_v",
version_bump, ".rds"
)
)

# ... and upload that same file to Google Drive, keeping the returned
# record under the dataset key (outer parentheses also print it).
(gd_records[[elem]] <- googledrive::drive_upload(
paste0(
"data-raw/gd_local/gene_",
elem, "_v", version_bump, ".rds"
),
paste0("geneOncoX/gene_", elem, "_v", version_bump, ".rds")
))

# One bookkeeping row per uploaded file: Drive id, file name, dataset name
# (file name with the "_v..." suffix and "gene_" prefix removed), upload
# date, package version and the md5 checksum from the Drive record.
google_rec_df <-
dplyr::select(
as.data.frame(gd_records[[elem]]), name, id
) |>
dplyr::rename(
gid = id,
filename = name
) |>
dplyr::mutate(
name = stringr::str_replace(
stringr::str_replace(filename, "_v\\S+$", ""),
"gene_", ""
),
date = as.character(Sys.Date()),
pVersion = version_bump
) |>
dplyr::mutate(
md5Checksum =
gd_records[[elem]]$drive_resource[[1]]$md5Checksum
)

# Append this release's row to the running file reference table.
db_id_ref <- db_id_ref |>
dplyr::bind_rows(google_rec_df)

}






# rm(gene_alias)
# rm(gene_basic)
# rm(gene_panels)
Expand All @@ -375,7 +453,7 @@ db[["panels"]] <- gene_panels

for (elem in c(
"basic", "predisposition",
"panels", "alias", "gencode")) {
"panels", "alias")) {
saveRDS(db[[elem]],
file = paste0(
"data-raw/gd_local/gene_", elem, "_v",
Expand Down
2 changes: 1 addition & 1 deletion data-raw/utils_driver_annotation.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ get_intogen_driver_genes <- function(gene_info = NULL) {
read.table(
file = file.path(
"data-raw", "intogen",
"Compendium_Cancer_Genes.tsv"
"Compendium_Cancer_Genes_IntOGen.tsv"
), sep = "\t",
header = TRUE, quote = NULL, stringsAsFactors = FALSE
) |>
Expand Down
Loading

0 comments on commit e8b9b4a

Please sign in to comment.