diff --git a/DESCRIPTION b/DESCRIPTION index 815b2d0..d6d2d0c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: gcap Title: Gene-level Circular Amplicon Prediction -Version: 1.1.4 +Version: 1.2.0 Authors@R: person("Shixiang", "Wang", , "w_shixiang@163.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9855-7357")) @@ -45,7 +45,7 @@ LinkingTo: Rcpp Remotes: git::https://bitbucket.org/sequenzatools/sequenza.git@master, - github::ShixiangWang/ascat/ASCAT@v3-for-gcap-v1, + github::VanLoo-lab/ascat/ASCAT, github::ShixiangWang/copynumber, github::ShixiangWang/facets, github::ShixiangWang/IDConverter @@ -55,4 +55,4 @@ Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE, roclets = c("collate", "namespace", "rd", "roxytest::testthat_roclet")) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 diff --git a/Dockerfile b/Dockerfile index 5970295..13f7477 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ RUN mamba install -y -c conda-forge -c bioconda r-base=4.3 python=3.10 r-remotes R -e 'BiocManager::install("jokergoo/GetoptLong", update = FALSE, force = TRUE)' &&\ R -e 'BiocManager::install("ShixiangWang/copynumber", update = FALSE, force = TRUE)' &&\ R -e 'BiocManager::install("ShixiangWang/facets", update = FALSE, force = TRUE)' &&\ - R -e 'BiocManager::install("ShixiangWang/ascat@v3-for-gcap-v1", subdir = "ASCAT", dependencies = TRUE, update = FALSE)' &&\ + R -e 'BiocManager::install("VanLoo-lab/ascat", subdir = "ASCAT", dependencies = TRUE, update = FALSE)' &&\ R -e 'BiocManager::install("ShixiangWang/gcap", dependencies = TRUE, update = FALSE)' &&\ cd /opt/conda/lib/R/library/facets/extcode/ &&\ g++ -std=c++11 -I/opt/conda/include snp-pileup.cpp -L/opt/conda/lib -lhts -Wl,-rpath=/opt/conda/lib -o snp-pileup &&\ diff --git a/NEWS.md b/NEWS.md index 12b02a4..7e1acbe 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# gcap 1.2.0 + +- Utilized the latest ASCAT version from [VanLoo-lab/ascat](https://github.com/VanLoo-lab/ascat) for default copy number calling. This help address issue mentioned at #43. +- Updated documentation. +- Added a whole workflow for debugging purpose. +- Enhanced path parsing like `~`. +- Fixed bug in `gcap-bam.R`. + # gcap 1.1.4 - Removed the XGBOOST version limits, instead, a warning is posted. diff --git a/R/ascat.R b/R/ascat.R index 601cd67..2bbee5c 100644 --- a/R/ascat.R +++ b/R/ascat.R @@ -11,9 +11,12 @@ #' Note, for multiple tumor-normal pairs, the first 5 arguments should #' be a vector with same length. #' +#' @param g1000allelesprefix Prefix path to the allele data (e.g. "G1000_alleles_chr"). +#' @param g1000lociprefix Prefix path to the loci data (e.g. "G1000_loci_chr"). #' @inheritParams ASCAT::ascat.prepareHTS #' @inheritParams ASCAT::ascat.aspcf -#' @inheritParams ASCAT::ascat.GCcorrect +#' @param GCcontentfile File containing the GC content around every SNP for increasing window sizes. +#' @param replictimingfile File containing replication timing at every SNP for various cell lines. #' @param tumourseqfile Full path to the tumour BAM file. #' @param normalseqfile Full path to the normal BAM file. #' @param jobname job name, typically an unique name for a tumor-normal pair. @@ -25,6 +28,7 @@ #' considered (optional, default=1:22). #' @param skip_finished_ASCAT if `TRUE`, skipped finished ASCAT calls #' to save time. +#' @param genome_build "hg38" or "hg19". #' #' @importFrom ASCAT ascat.prepareHTS ascat.loadData ascat.GCcorrect #' ascat.plotRawData ascat.aspcf ascat.plotSegmentedData ascat.predictGermlineGenotypes @@ -57,6 +61,7 @@ gcap.runASCAT <- function(tumourseqfile, min_base_qual = 20, min_map_qual = 35, penalty = 70, + genome_build = "hg38", skip_finished_ASCAT = FALSE) { stopifnot(all(gender %in% c("XX", "XY"))) if (!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) @@ -128,34 +133,70 @@ gcap.runASCAT <- function(tumourseqfile, if (!all(file.exists(tfile, nfile))) { lg$fatal("Not all bam files exist") } - ascat.prepareHTS( - tumourseqfile = tfile, - normalseqfile = nfile, - tumourname = tn, - normalname = nn, - allelecounter_exe = allelecounter_exe, - g1000allelesprefix = g1000allelesprefix, - g1000lociprefix = g1000lociprefix, - nthreads = nthreads, - minCounts = minCounts, - BED_file = BED_file, - probloci_file = probloci_file, - gender = gd, - chrom_names = chrom_names, - min_base_qual = min_base_qual, - min_map_qual = min_map_qual, - skip_allele_counting_tumour = FALSE, - skip_allele_counting_normal = skip_norm - ) - - ascat.bc <- ascat.loadData( - paste0(tn, "_tumourLogR.txt"), paste0(tn, "_tumourBAF.txt"), - if (skip_norm) NULL else paste0(tn, "_normalLogR.txt"), - if (skip_norm) NULL else paste0(tn, "_normalBAF.txt"), - chrs = chrom_names, - gender = gender - ) # New parameter genomeVersion in the latest version of ASCAT - ascat.bc <- ascat.GCcorrect(ascat.bc, GCcontentfile, replictimingfile) + + if (packageVersion("ASCAT") > "3.1.0") { + ascat.prepareHTS( + tumourseqfile = tfile, + normalseqfile = nfile, + tumourname = tn, + normalname = nn, + allelecounter_exe = allelecounter_exe, + alleles.prefix = g1000allelesprefix, + loci.prefix = g1000lociprefix, + nthreads = nthreads, + minCounts = minCounts, + BED_file = BED_file, + probloci_file = probloci_file, + gender = gd, + chrom_names = chrom_names, + min_base_qual = min_base_qual, + min_map_qual = min_map_qual, + skip_allele_counting_tumour = FALSE, + skip_allele_counting_normal = skip_norm + ) + + ascat.bc <- ascat.loadData( + paste0(tn, "_tumourLogR.txt"), paste0(tn, "_tumourBAF.txt"), + if (skip_norm) NULL else paste0(tn, "_normalLogR.txt"), + if (skip_norm) NULL else paste0(tn, "_normalBAF.txt"), + chrs = chrom_names, + gender = gender, + genomeVersion = genome_build + ) + + ascat.bc <- ASCAT::ascat.correctLogR(ascat.bc, GCcontentfile, replictimingfile) + + } else { + ascat.prepareHTS( + tumourseqfile = tfile, + normalseqfile = nfile, + tumourname = tn, + normalname = nn, + allelecounter_exe = allelecounter_exe, + g1000allelesprefix = g1000allelesprefix, + g1000lociprefix = g1000lociprefix, + nthreads = nthreads, + minCounts = minCounts, + BED_file = BED_file, + probloci_file = probloci_file, + gender = gd, + chrom_names = chrom_names, + min_base_qual = min_base_qual, + min_map_qual = min_map_qual, + skip_allele_counting_tumour = FALSE, + skip_allele_counting_normal = skip_norm + ) + + ascat.bc <- ascat.loadData( + paste0(tn, "_tumourLogR.txt"), paste0(tn, "_tumourBAF.txt"), + if (skip_norm) NULL else paste0(tn, "_normalLogR.txt"), + if (skip_norm) NULL else paste0(tn, "_normalBAF.txt"), + chrs = chrom_names, + gender = gender + ) + ascat.bc <- ascat.GCcorrect(ascat.bc, GCcontentfile, replictimingfile) + } + ascat.plotRawData(ascat.bc) if (skip_norm) { # https://github.com/VanLoo-lab/ascat/issues/73 @@ -177,6 +218,11 @@ gcap.runASCAT <- function(tumourseqfile, ascat.bc <- ascat.aspcf(ascat.bc, ascat.gg = gg, penalty = penalty) ascat.plotSegmentedData(ascat.bc) ascat.output <- ascat.runAscat(ascat.bc, gamma = 1L, pdfPlot = TRUE) + if (packageVersion("ASCAT") > "3.1.0") { + QC = ASCAT::ascat.metrics(ascat.bc, ascat.output) + ascat.output = c(ascat.output, list(QC = QC)) + } + saveRDS(ascat.output, file = paste0(id, ".ASCAT.rds")) lg$info("job {id} done") diff --git a/R/workflow.R b/R/workflow.R index 13d034e..c985dcd 100644 --- a/R/workflow.R +++ b/R/workflow.R @@ -139,6 +139,7 @@ gcap.workflow <- function(tumourseqfile, normalseqfile, }, min_base_qual = min_base_qual, min_map_qual = min_map_qual, + genome_build = genome_build, penalty = penalty, skip_finished_ASCAT ) diff --git a/README.md b/README.md index ef91b5a..847d501 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,20 @@ if you use **conda** or other approaches, please set the path when you use corre ### Install ASCAT (required) -Install **ASCAT** v3.0 (modified and adapted for GCAP workflow in HPC) in R console from GitHub with: +#### Latest ASCAT + +From v1.2, GCAP uses the latest version of ASCAT. Install **ASCAT** v3 in R console from GitHub with: + +```r +# install.packages("remotes") +remotes::install_github('VanLoo-lab/ascat/ASCAT') +``` + +We have provided generated reference files above, but sometimes you may want to generate the reference data for yourself, in such case, please refer to for generating the required allele-specific copy number data. + +#### A fixed version of ASCAT + +In our manuscript, we used a fixed version of ASCAT for the GCAP data pre-processing (modified and adapted for GCAP workflow in HPC). It does not fit the R version `>=4.3`. ```r # This is a forked version ASCAT @@ -51,10 +64,6 @@ remotes::install_github("ShixiangWang/ascat@v3-for-gcap-v1", subdir = "ASCAT") # See https://github.com/ShixiangWang/gcap/issues/27 ``` -> Here we used a fixed version of ASCAT for the GCAP data pre-processing, if you want to adopted the latest -> updates in processing your data, please refer to for generating the required -> allele-specific copy number data, and refer to [`?gcap.ASCNworkflow()`](https://shixiangwang.github.io/gcap/reference/gcap.ASCNworkflow.html) for downstream analysis. - ### Alternatives to ASCAT For the latest version of GCAP, [sequenza](https://shixiangwang.github.io/gcap/reference/gcap.workflow.seqz.html) or [facets](https://shixiangwang.github.io/gcap/reference/gcap.workflow.facets.html) are supported for preprocessing the bam data, please refer to the provided links for usage. diff --git a/man/gcap-package.Rd b/man/gcap-package.Rd index ebdc180..f0d6a2f 100644 --- a/man/gcap-package.Rd +++ b/man/gcap-package.Rd @@ -6,12 +6,13 @@ \alias{gcap-package} \title{gcap: Gene-level Circular Amplicon Prediction} \description{ -Provides data processing pipeline feeding paired bam files and XGBOOST model for predicting tumor circular amplicons (also known as ecDNA) in gene level. +Provides data processing pipeline feeding paired bam files (or allele-specific copy number data) and XGBOOST model for predicting tumor circular amplicons (also known as ecDNA) in gene level. } \seealso{ Useful links: \itemize{ \item \url{https://github.com/ShixiangWang/gcap} + \item \url{https://shixiangwang.github.io/gcap/} \item Report bugs at \url{https://github.com/ShixiangWang/gcap/issues} } diff --git a/man/gcap.ASCNworkflow.Rd b/man/gcap.ASCNworkflow.Rd index e77404f..82a9082 100644 --- a/man/gcap.ASCNworkflow.Rd +++ b/man/gcap.ASCNworkflow.Rd @@ -37,8 +37,7 @@ This info is only used in 'XGB56' model. If you don't use this model, you don't need to set it. }} -\item{genome_build}{genome build version, should be -one of 'hg38', 'hg19'.} +\item{genome_build}{"hg38" or "hg19".} \item{model}{model name ("XGB11", "XGB32", "XGB56") or a custom model from input. 'toy' can be used for test.} diff --git a/man/gcap.runASCAT.Rd b/man/gcap.runASCAT.Rd index 674a77f..c5ff219 100644 --- a/man/gcap.runASCAT.Rd +++ b/man/gcap.runASCAT.Rd @@ -27,6 +27,7 @@ gcap.runASCAT( min_base_qual = 20, min_map_qual = 35, penalty = 70, + genome_build = "hg38", skip_finished_ASCAT = FALSE ) } @@ -45,13 +46,13 @@ gcap.runASCAT( \item{allelecounter_exe}{Path to the allele counter executable.} -\item{g1000allelesprefix}{Prefix path to the 1000 Genomes alleles reference files.} +\item{g1000allelesprefix}{Prefix path to the allele data (e.g. "G1000_alleles_chr").} -\item{g1000lociprefix}{Prefix path to the 1000 Genomes SNP reference files.} +\item{g1000lociprefix}{Prefix path to the loci data (e.g. "G1000_loci_chr").} -\item{GCcontentfile}{File containing the GC content around every SNP for increasing window sizes} +\item{GCcontentfile}{File containing the GC content around every SNP for increasing window sizes.} -\item{replictimingfile}{File containing replication timing at every SNP for various cell lines (optional)} +\item{replictimingfile}{File containing replication timing at every SNP for various cell lines.} \item{nthreads}{The number of parallel processes for getting allele counts (optional, default=1).} @@ -74,6 +75,8 @@ chromosomes.} \item{penalty}{penalty of introducing an additional ASPCF breakpoint (expert parameter, don't adapt unless you know what you're doing)} +\item{genome_build}{"hg38" or "hg19".} + \item{skip_finished_ASCAT}{if \code{TRUE}, skipped finished ASCAT calls to save time.} } diff --git a/man/gcap.runASCATBuildflow.Rd b/man/gcap.runASCNBuildflow.Rd similarity index 96% rename from man/gcap.runASCATBuildflow.Rd rename to man/gcap.runASCNBuildflow.Rd index 5619c0a..4c31356 100644 --- a/man/gcap.runASCATBuildflow.Rd +++ b/man/gcap.runASCNBuildflow.Rd @@ -27,8 +27,7 @@ This info is only used in 'XGB56' model. If you don't use this model, you don't need to set it. }} -\item{genome_build}{genome build version, should be -one of 'hg38', 'hg19'.} +\item{genome_build}{"hg38" or "hg19".} \item{overlap}{the overlap percentage on gene.} } diff --git a/man/gcap.workflow.Rd b/man/gcap.workflow.Rd index dc459a4..21c3373 100644 --- a/man/gcap.workflow.Rd +++ b/man/gcap.workflow.Rd @@ -59,8 +59,7 @@ also could be \code{0} for 'XX' and \code{1} for 'XY'.} should be included in \code{extra_info}, the supported cancer type should be described with \href{https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations}{TCGA cancer type abbr.}.} -\item{genome_build}{genome build version, should be -one of 'hg38', 'hg19'.} +\item{genome_build}{"hg38" or "hg19".} \item{model}{model name ("XGB11", "XGB32", "XGB56") or a custom model from input. 'toy' can be used for test.} @@ -86,13 +85,13 @@ Default a unique file name is generated by UUID approach.} \item{allelecounter_exe}{Path to the allele counter executable.} -\item{g1000allelesprefix}{Prefix path to the 1000 Genomes alleles reference files.} +\item{g1000allelesprefix}{Prefix path to the allele data (e.g. "G1000_alleles_chr").} -\item{g1000lociprefix}{Prefix path to the 1000 Genomes SNP reference files.} +\item{g1000lociprefix}{Prefix path to the loci data (e.g. "G1000_loci_chr").} -\item{GCcontentfile}{File containing the GC content around every SNP for increasing window sizes} +\item{GCcontentfile}{File containing the GC content around every SNP for increasing window sizes.} -\item{replictimingfile}{File containing replication timing at every SNP for various cell lines (optional)} +\item{replictimingfile}{File containing replication timing at every SNP for various cell lines.} \item{nthreads}{The number of parallel processes for getting allele counts (optional, default=1).} diff --git a/tests/testthat/test-roxytest-testexamples-ASCNworkflow.R b/tests/testthat/test-roxytest-testexamples-ASCNworkflow.R index 52a8125..5f07e13 100644 --- a/tests/testthat/test-roxytest-testexamples-ASCNworkflow.R +++ b/tests/testthat/test-roxytest-testexamples-ASCNworkflow.R @@ -1,6 +1,6 @@ -# Generated by roxytest: Do not edit by hand! +# Generated by roxytest: do not edit by hand! -context("File R/ASCNworkflow.R: @testexamples") +# File R/ASCNworkflow.R: @testexamples test_that("Function gcap.ASCNworkflow() @ L85", { diff --git a/tests/testthat/test-roxytest-testexamples-auc.R b/tests/testthat/test-roxytest-testexamples-auc.R index 36ee087..8d35b6a 100644 --- a/tests/testthat/test-roxytest-testexamples-auc.R +++ b/tests/testthat/test-roxytest-testexamples-auc.R @@ -1,6 +1,6 @@ -# Generated by roxytest: Do not edit by hand! +# Generated by roxytest: do not edit by hand! -context("File R/auc.R: @testexamples") +# File R/auc.R: @testexamples test_that("Function get_auc() @ L23", { diff --git a/tests/testthat/test-roxytest-testexamples-prediction.R b/tests/testthat/test-roxytest-testexamples-prediction.R index 87b0cfe..6c939ca 100644 --- a/tests/testthat/test-roxytest-testexamples-prediction.R +++ b/tests/testthat/test-roxytest-testexamples-prediction.R @@ -1,6 +1,6 @@ -# Generated by roxytest: Do not edit by hand! +# Generated by roxytest: do not edit by hand! -context("File R/prediction.R: @testexamples") +# File R/prediction.R: @testexamples test_that("Function gcap.runPrediction() @ L18", { diff --git a/tests/testthat/test-roxytest-testexamples-scoring.R b/tests/testthat/test-roxytest-testexamples-scoring.R index 531d23f..635e3bc 100644 --- a/tests/testthat/test-roxytest-testexamples-scoring.R +++ b/tests/testthat/test-roxytest-testexamples-scoring.R @@ -1,6 +1,6 @@ -# Generated by roxytest: Do not edit by hand! +# Generated by roxytest: do not edit by hand! -context("File R/scoring.R: @testexamples") +# File R/scoring.R: @testexamples test_that("Function gcap.runScoring() @ L28", {