From 70ccf966bc817e6342f3d069463880bc31cdb997 Mon Sep 17 00:00:00 2001 From: alexvpickering Date: Mon, 16 Sep 2019 10:02:20 -0400 Subject: [PATCH] migrate salmon index functions to drugseqr.data --- NAMESPACE | 3 + R/build_kallisto_index.R | 29 +++++++- R/build_salmon_index.R | 138 ++++++++++++++++++++++++++++++++++++ man/build_gencode_index.Rd | 25 +++++++ man/build_kallisto_index.Rd | 2 +- man/build_salmon_index.Rd | 22 ++++++ man/get_pkg_version.Rd | 18 +++++ 7 files changed, 235 insertions(+), 2 deletions(-) create mode 100644 R/build_salmon_index.R create mode 100644 man/build_gencode_index.Rd create mode 100644 man/build_salmon_index.Rd create mode 100644 man/get_pkg_version.Rd diff --git a/NAMESPACE b/NAMESPACE index c7d2c4e..1dfcbc7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,7 @@ # Generated by roxygen2: do not edit by hand +export(build_gencode_index) export(build_kallisto_index) +export(build_salmon_index) export(dl_drug_es) +export(get_pkg_version) diff --git a/R/build_kallisto_index.R b/R/build_kallisto_index.R index e55072a..c61ed73 100644 --- a/R/build_kallisto_index.R +++ b/R/build_kallisto_index.R @@ -17,8 +17,9 @@ #' build_kallisto_index <- function(species = 'homo_sapiens', release = '94', kallisto_path = '/home/ubuntu/miniconda/bin/kallisto') { + kallisto_version <- get_pkg_version(kallisto_path) indices_dir <- system.file(package = 'drugseqr.data') - indices_dir <- file.path(indices_dir, 'indices/kallisto') + indices_dir <- file.path(indices_dir, paste0('indices/kallisto_', kallisto_version)) if (!dir.exists(indices_dir)) dir.create(indices_dir, recursive = TRUE) @@ -53,3 +54,29 @@ build_kallisto_index <- function(species = 'homo_sapiens', release = '94', kalli unlink(ensembl_fasta) setwd(work_dir) } + +#' Get version of salmon/kallisto from system command. +#' +#' @param type Either \code{'salmon'} or \code{'kallisto'}. +#' +#' @return Version of salmon/kallisto. +#' @export +#' @keywords internal +#' +#' @examples +get_pkg_version <- function(type) { + # possibly use older salmon with version appended to executable name + if (type == 'salmon') { + version <- system(paste(type, '--version'), intern = TRUE) + version <- gsub('^salmon ', '', version) + + } else if (type == 'kallisto') { + version <- system(paste(type, 'version'), intern = TRUE) + version <- gsub('^kallisto, version ', '', version) + + } else { + stop('type must be either salmon or kallisto') + } + + return(version) +} diff --git a/R/build_salmon_index.R b/R/build_salmon_index.R new file mode 100644 index 0000000..b6bbeca --- /dev/null +++ b/R/build_salmon_index.R @@ -0,0 +1,138 @@ +#' Download ensembl transcriptome and build index for salmon quantification +#' +#' This index is used for bulk RNA seq quantification. See \code{\link{build_gencode_index}} for single cell RNA-seq equivalent. +#' +#' @param species The species. Default is \code{homo_sapiens.} +#' @param release ensembl release. Default is \code{94} (latest in release for AnnotationHub - +#' needs to match with \code{\link{build_ensdb}}) and corresponds to Gencode release 29 for \code{\link{build_gencode_index}}. +#' +#' @return NULL +#' @export +#' +#' @examples +#' # build salmon index for humans +#' build_salmon_index() +#' +build_salmon_index <- function(species = 'homo_sapiens', release = '94') { + + salmon_version <- get_pkg_version('salmon') + if (salmon_version == '0.14.0') + stop('EnsemblDb indices not yet implemented for salmon 0.14.0') + + indices_dir <- system.file(package = 'drugseqr.data') + indices_dir <- file.path(indices_dir, paste0('indices/salmon_', salmon_version), 'ensdb') + + if (!dir.exists(indices_dir)) + dir.create(indices_dir, recursive = TRUE) + + # construct ensembl url for transcriptome + ensembl_species <- gsub(' ', '_', tolower(species)) + ensembl_release <- paste0('release-', release) + ensembl_url <- paste0('ftp://ftp.ensembl.org/pub/', ensembl_release, '/fasta/', ensembl_species, '/cdna/') + + # get list of all files + handle <- curl::new_handle(dirlistonly=TRUE) + con <- curl::curl(ensembl_url, "r", handle) + tbl <- utils::read.table(con, stringsAsFactors=FALSE) + close(con) + + # get transcripts cdna.all file + ensembl_all <- grep('cdna.all.fa.gz$', tbl[[1]], value = TRUE) + ensembl_url <- paste0(ensembl_url, ensembl_all) + + work_dir <- getwd() + setwd(indices_dir) + curl::curl_download(ensembl_url, ensembl_all) + + # build index + tryCatch(system2(command, args=c('index', + '-t', ensembl_all, + '-i', ensembl_species)), + error = function(err) {err$message <- 'Is salmon installed and on the PATH?'; stop(err)}) + + unlink(ensembl_all) + setwd(work_dir) +} + +#' Download ensembl transcriptome and build index for salmon quantification +#' +#' This index is used for single cell RNA-seq quantification. See \code{\link{build_ensdb_index}} for bulk RNA-seq equivalent. +#' +#' @param indices_dir directory to place indices in. +#' @param species The species. Default is \code{homo_sapiens.} +#' @param release gencode release. Default is \code{29} (matches ensembl release 94 for \code{\link{build_ensdb_index}})). +#' +#' @return NULL +#' @export +#' +#' @examples +#' # build salmon alevin index for humans +#' indices_dir <- 'data-raw/indices' +#' build_gencode_index(indices_dir) +#' +build_gencode_index <- function(indices_dir, species = 'human', release = '29', command = 'salmon') { + + salmon_version <- get_pkg_version(command) + salmon_old <- salmon_lt_0.14.0(salmon_version) + + # newer salmon uses decoys in index + if (salmon_old) + build_gencode_index_no_decoys(indices_dir, species, release) + else + build_gencode_index_decoys(indices_dir, species, release) +} + +build_gencode_index_decoys <- function(indices_dir, species, release) { + + indices_dir <- file.path(indices_dir, '0.14.0', 'gencode') + if (!dir.exists(indices_dir)) dir.create(indices_dir, recursive = TRUE) + + if (species != 'human' | release != '29') + stop('Only implemented for human gencode v29.') + + gencode_file <- 'human_GENCODEv29.tar.gz' + gencode_url <- paste0('https://s3.us-east-2.amazonaws.com/drugseqr/', gencode_file) + + work_dir <- getwd() + setwd(indices_dir) + + download.file(gencode_url, gencode_file) + utils::untar(gencode_file) + + # build index + tryCatch(system2('salmon', args=c('index', + '-t', 'human_GENCODEv29/gentrome.fa', + '--gencode', + '-d', 'human_GENCODEv29/decoys.txt', + '-i', species)), + error = function(err) {err$message <- 'Is salmon installed and on the PATH?'; stop(err)}) + + + unlink(c(gencode_file, 'human_GENCODEv29'), recursive = TRUE) + setwd(work_dir) +} + + +build_gencode_index_no_decoys <- function(indices_dir, species, release) { + + indices_dir <- file.path(indices_dir, '0.13.1', 'gencode') + if (!dir.exists(indices_dir)) dir.create(indices_dir, recursive = TRUE) + + # construct ensembl url for protein coding transcriptome + gencode_file <- paste0('gencode.v', release, '.pc_transcripts.fa.gz') + gencode_url <- paste0('ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_', species, '/release_', release, '/', gencode_file) + + work_dir <- getwd() + setwd(indices_dir) + curl::curl_download(gencode_url, gencode_file) + + # build index + tryCatch(system2('salmon', args=c('index', + '-t', gencode_file, + '--gencode', + '-i', species)), + error = function(err) {err$message <- 'Is salmon installed and on the PATH?'; stop(err)}) + + unlink(gencode_file) + setwd(work_dir) +} diff --git a/man/build_gencode_index.Rd b/man/build_gencode_index.Rd new file mode 100644 index 0000000..c072a58 --- /dev/null +++ b/man/build_gencode_index.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/build_salmon_index.R +\name{build_gencode_index} +\alias{build_gencode_index} +\title{Download ensembl transcriptome and build index for salmon quantification} +\usage{ +build_gencode_index(indices_dir, species = "human", release = "29", + command = "salmon") +} +\arguments{ +\item{indices_dir}{directory to place indices in.} + +\item{species}{The species. Default is \code{homo_sapiens.}} + +\item{release}{gencode release. Default is \code{29} (matches ensembl release 94 for \code{\link{build_ensdb_index}})).} +} +\description{ +This index is used for single cell RNA-seq quantification. See \code{\link{build_ensdb_index}} for bulk RNA-seq equivalent. +} +\examples{ +# build salmon alevin index for humans +indices_dir <- 'data-raw/indices' +build_gencode_index(indices_dir) + +} diff --git a/man/build_kallisto_index.Rd b/man/build_kallisto_index.Rd index fd20a9c..983a295 100644 --- a/man/build_kallisto_index.Rd +++ b/man/build_kallisto_index.Rd @@ -5,7 +5,7 @@ \title{Download ensembl transcriptome and build index for kalliso quantification} \usage{ build_kallisto_index(species = "homo_sapiens", release = "94", - kallisto_path = "/home/ubuntu/miniconda2/bin/kallisto") + kallisto_path = "/home/ubuntu/miniconda/bin/kallisto") } \arguments{ \item{species}{The species. Default is \code{homo_sapiens}.} diff --git a/man/build_salmon_index.Rd b/man/build_salmon_index.Rd new file mode 100644 index 0000000..9634f32 --- /dev/null +++ b/man/build_salmon_index.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/build_salmon_index.R +\name{build_salmon_index} +\alias{build_salmon_index} +\title{Download ensembl transcriptome and build index for salmon quantification} +\usage{ +build_salmon_index(species = "homo_sapiens", release = "94") +} +\arguments{ +\item{species}{The species. Default is \code{homo_sapiens.}} + +\item{release}{ensembl release. Default is \code{94} (latest in release for AnnotationHub - +needs to match with \code{\link{build_ensdb}}) and corresponds to Gencode release 29 for \code{\link{build_gencode_index}}.} +} +\description{ +This index is used for bulk RNA seq quantification. See \code{\link{build_gencode_index}} for single cell RNA-seq equivalent. +} +\examples{ +# build salmon index for humans +build_salmon_index() + +} diff --git a/man/get_pkg_version.Rd b/man/get_pkg_version.Rd new file mode 100644 index 0000000..d44044d --- /dev/null +++ b/man/get_pkg_version.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/build_kallisto_index.R +\name{get_pkg_version} +\alias{get_pkg_version} +\title{Get version of salmon/kallisto from system command.} +\usage{ +get_pkg_version(type) +} +\arguments{ +\item{type}{Either \code{'salmon'} or \code{'kallisto'}.} +} +\value{ +Version of salmon/kallisto. +} +\description{ +Get version of salmon/kallisto from system command. +} +\keyword{internal}