Skip to content

Commit

Permalink
migrate salmon index functions to drugseqr.data
Browse files Browse the repository at this point in the history
  • Loading branch information
alexvpickering committed Sep 16, 2019
1 parent 579dff4 commit 70ccf96
Show file tree
Hide file tree
Showing 7 changed files with 235 additions and 2 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(build_gencode_index)
export(build_kallisto_index)
export(build_salmon_index)
export(dl_drug_es)
export(get_pkg_version)
29 changes: 28 additions & 1 deletion R/build_kallisto_index.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@
#'
build_kallisto_index <- function(species = 'homo_sapiens', release = '94', kallisto_path = '/home/ubuntu/miniconda/bin/kallisto') {

kallisto_version <- get_pkg_version(kallisto_path)
indices_dir <- system.file(package = 'drugseqr.data')
indices_dir <- file.path(indices_dir, 'indices/kallisto')
indices_dir <- file.path(indices_dir, paste0('indices/kallisto_', kallisto_version))

if (!dir.exists(indices_dir))
dir.create(indices_dir, recursive = TRUE)
Expand Down Expand Up @@ -53,3 +54,29 @@ build_kallisto_index <- function(species = 'homo_sapiens', release = '94', kalli
unlink(ensembl_fasta)
setwd(work_dir)
}

#' Get version of salmon/kallisto from system command.
#'
#' @param type Either \code{'salmon'} or \code{'kallisto'}.
#'
#' @return Version of salmon/kallisto.
#' @export
#' @keywords internal
#'
#' @examples
get_pkg_version <- function(type) {
# possibly use older salmon with version appended to executable name
if (type == 'salmon') {
version <- system(paste(type, '--version'), intern = TRUE)
version <- gsub('^salmon ', '', version)

} else if (type == 'kallisto') {
version <- system(paste(type, 'version'), intern = TRUE)
version <- gsub('^kallisto, version ', '', version)

} else {
stop('type must be either salmon or kallisto')
}

return(version)
}
138 changes: 138 additions & 0 deletions R/build_salmon_index.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#' Download ensembl transcriptome and build index for salmon quantification
#'
#' This index is used for bulk RNA seq quantification. See \code{\link{build_gencode_index}} for single cell RNA-seq equivalent.
#'
#' @param species The species. Default is \code{homo_sapiens.}
#' @param release ensembl release. Default is \code{94} (latest in release for AnnotationHub -
#' needs to match with \code{\link{build_ensdb}}) and corresponds to Gencode release 29 for \code{\link{build_gencode_index}}.
#'
#' @return NULL
#' @export
#'
#' @examples
#' # build salmon index for humans
#' build_salmon_index()
#'
build_salmon_index <- function(species = 'homo_sapiens', release = '94') {

salmon_version <- get_pkg_version('salmon')
if (salmon_version == '0.14.0')
stop('EnsemblDb indices not yet implemented for salmon 0.14.0')

indices_dir <- system.file(package = 'drugseqr.data')
indices_dir <- file.path(indices_dir, paste0('indices/salmon_', salmon_version), 'ensdb')

if (!dir.exists(indices_dir))
dir.create(indices_dir, recursive = TRUE)

# construct ensembl url for transcriptome
ensembl_species <- gsub(' ', '_', tolower(species))
ensembl_release <- paste0('release-', release)
ensembl_url <- paste0('ftp://ftp.ensembl.org/pub/', ensembl_release, '/fasta/', ensembl_species, '/cdna/')

# get list of all files
handle <- curl::new_handle(dirlistonly=TRUE)
con <- curl::curl(ensembl_url, "r", handle)
tbl <- utils::read.table(con, stringsAsFactors=FALSE)
close(con)

# get transcripts cdna.all file
ensembl_all <- grep('cdna.all.fa.gz$', tbl[[1]], value = TRUE)
ensembl_url <- paste0(ensembl_url, ensembl_all)

work_dir <- getwd()
setwd(indices_dir)
curl::curl_download(ensembl_url, ensembl_all)

# build index
tryCatch(system2(command, args=c('index',
'-t', ensembl_all,
'-i', ensembl_species)),
error = function(err) {err$message <- 'Is salmon installed and on the PATH?'; stop(err)})

unlink(ensembl_all)
setwd(work_dir)
}

#' Download ensembl transcriptome and build index for salmon quantification
#'
#' This index is used for single cell RNA-seq quantification. See \code{\link{build_ensdb_index}} for bulk RNA-seq equivalent.
#'
#' @param indices_dir directory to place indices in.
#' @param species The species. Default is \code{homo_sapiens.}
#' @param release gencode release. Default is \code{29} (matches ensembl release 94 for \code{\link{build_ensdb_index}})).
#'
#' @return NULL
#' @export
#'
#' @examples
#' # build salmon alevin index for humans
#' indices_dir <- 'data-raw/indices'
#' build_gencode_index(indices_dir)
#'
build_gencode_index <- function(indices_dir, species = 'human', release = '29', command = 'salmon') {

salmon_version <- get_pkg_version(command)
salmon_old <- salmon_lt_0.14.0(salmon_version)

# newer salmon uses decoys in index
if (salmon_old)
build_gencode_index_no_decoys(indices_dir, species, release)
else
build_gencode_index_decoys(indices_dir, species, release)
}

build_gencode_index_decoys <- function(indices_dir, species, release) {

indices_dir <- file.path(indices_dir, '0.14.0', 'gencode')
if (!dir.exists(indices_dir)) dir.create(indices_dir, recursive = TRUE)

if (species != 'human' | release != '29')
stop('Only implemented for human gencode v29.')

gencode_file <- 'human_GENCODEv29.tar.gz'
gencode_url <- paste0('https://s3.us-east-2.amazonaws.com/drugseqr/', gencode_file)

work_dir <- getwd()
setwd(indices_dir)

download.file(gencode_url, gencode_file)
utils::untar(gencode_file)

# build index
tryCatch(system2('salmon', args=c('index',
'-t', 'human_GENCODEv29/gentrome.fa',
'--gencode',
'-d', 'human_GENCODEv29/decoys.txt',
'-i', species)),
error = function(err) {err$message <- 'Is salmon installed and on the PATH?'; stop(err)})


unlink(c(gencode_file, 'human_GENCODEv29'), recursive = TRUE)
setwd(work_dir)
}


build_gencode_index_no_decoys <- function(indices_dir, species, release) {

indices_dir <- file.path(indices_dir, '0.13.1', 'gencode')
if (!dir.exists(indices_dir)) dir.create(indices_dir, recursive = TRUE)

# construct ensembl url for protein coding transcriptome
gencode_file <- paste0('gencode.v', release, '.pc_transcripts.fa.gz')
gencode_url <- paste0('ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_', species, '/release_', release, '/', gencode_file)

work_dir <- getwd()
setwd(indices_dir)
curl::curl_download(gencode_url, gencode_file)

# build index
tryCatch(system2('salmon', args=c('index',
'-t', gencode_file,
'--gencode',
'-i', species)),
error = function(err) {err$message <- 'Is salmon installed and on the PATH?'; stop(err)})

unlink(gencode_file)
setwd(work_dir)
}
25 changes: 25 additions & 0 deletions man/build_gencode_index.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/build_kallisto_index.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions man/build_salmon_index.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions man/get_pkg_version.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 70ccf96

Please sign in to comment.