From fa24f71ded0c99432f429caeb5199ab868236e36 Mon Sep 17 00:00:00 2001 From: michaelgruenstaeudl Date: Sun, 7 Jan 2024 23:38:55 -0600 Subject: [PATCH] Update to version 1.0.6 --- CHANGELOG.md | 7 ++ DESCRIPTION | 4 +- R/IRoperations.R | 23 ++---- R/PACVr.R | 24 +++--- R/customizedRCircos.R | 72 +++++++--------- R/generatePlotData.R | 25 ++---- R/helpers.R | 154 +++++++++++------------------------ R/parseData.R | 4 +- R/quadripartiteOperations.R | 24 ++---- R/visualizeWithRCircos.R | 11 +-- README.md | 37 ++++++--- inst/extdata/PACVr_Rscript.R | 4 +- man/PACVr-package.Rd | 2 +- man/PACVr.complete.Rd | 7 +- vignettes/PACVr_Vignette.Rnw | 2 +- 15 files changed, 156 insertions(+), 244 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c52c6f9..8a00d10c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ CHANGELOG --------- +#### Version 1.0.6 (2024.01.07) +* Moved all functions that pertain to inverted repeats or quadripartite genome structure into a separate file and made their application optional + +#### Version 1.0.5 (2023.12.07) +* Minor bugfixes +* New version submitted to CRAN + #### Version 1.0.4 (2023.12.06) * Implemented a logger * Several bug fixes (e.g., error caused by function masking) diff --git a/DESCRIPTION b/DESCRIPTION index 8ab543a2..132a5ca0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: PACVr -Version: 1.0.5 -Date: 2023-12-07 +Version: 1.0.6 +Date: 2024-01-07 Title: Plastome Assembly Coverage Visualization Authors@R: c(person("Gregory", "Smith", role=c("ctb")), person("Nils", "Jenke", role=c("ctb")), diff --git a/R/IRoperations.R b/R/IRoperations.R index ab443ec9..2a67922c 100644 --- a/R/IRoperations.R +++ b/R/IRoperations.R @@ -1,7 +1,7 @@ -#!/usr/bin/R +#!/usr/bin/env RScript #contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl") #email="m_gruenstaeudl@fhsu.edu" -#version="2023.12.18.2100" +#version="2024.01.07.2200" checkIREquality <- function(gbkData, regions, dir, sample_name) { gbkSeq <- read.gbSeq(gbkData) @@ -13,18 +13,8 @@ checkIREquality <- function(gbkData, regions, dir, sample_name) { IR_diff_gaps <- c() if (repeatB[2] - repeatB[1] != repeatA[2] - repeatA[1]) { message("WARNING: Inverted repeats differ in sequence length") - message(paste( - "The IRb has a total lengths of: ", - repeatB[2] - repeatB[1], - " bp", - sep = "" - )) - message(paste( - "The IRa has a total lengths of: ", - repeatA[2] - repeatA[1], - " bp", - sep = "" - )) + message(paste("The IRb has a total lengths of: ", repeatB[2] - repeatB[1], " bp", sep = "")) + message(paste("The IRa has a total lengths of: ", repeatA[2] - repeatA[1], " bp", sep = "")) } if (gbkSeq[[1]][repeatB[1]:repeatB[2]] != Biostrings::reverseComplement(gbkSeq[[1]][repeatA[1]:repeatA[2]])) { IRa_seq <- Biostrings::DNAString(gbkSeq[[1]][repeatB[1]:repeatB[2]]) @@ -33,10 +23,7 @@ checkIREquality <- function(gbkData, regions, dir, sample_name) { IRb_seq <- split(IRb_seq, ceiling(seq_along(IRb_seq) / 10000)) for (i in 1:min(length(IRa_seq), length(IRb_seq))) { - subst_mat <- - Biostrings::nucleotideSubstitutionMatrix(match = 1, - mismatch = -3, - baseOnly = TRUE) + subst_mat <- Biostrings::nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE) globalAlign <- tryCatch({ Biostrings::pairwiseAlignment( IRa_seq[[i]], diff --git a/R/PACVr.R b/R/PACVr.R index 83e6cb1b..65533650 100644 --- a/R/PACVr.R +++ b/R/PACVr.R @@ -1,7 +1,7 @@ -#!/usr/bin/R +#!/usr/bin/env RScript #contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl") #email="m_gruenstaeudl@fhsu.edu" -#version="2023.11.23.1530" +#version="2024.01.07.2200" PACVr.parseName <- function (gbkData) { return(read.gbSampleName(gbkData)) @@ -123,7 +123,7 @@ PACVr.visualizeWithRCircos <- function(gbkData, #' package="PACVr") #' outFile <- paste(tempdir(), "/NC_045072__all_reads.pdf", sep="") #' PACVr.complete(gbkFile=gbkFile, bamFile=bamFile, windowSize=250, logScale=FALSE, -#' threshold=0.5, syntenyLineType=3, relative=TRUE, textSize=0.5, +#' threshold=0.5, syntenyLineType=1, relative=TRUE, textSize=0.5, #' regionsCheck=FALSE, verbose=FALSE, output=outFile #' } PACVr.complete <- function(gbkFile, @@ -131,7 +131,7 @@ PACVr.complete <- function(gbkFile, windowSize=250, logScale=FALSE, threshold=0.5, - syntenyLineType=3, + syntenyLineType=1, relative=TRUE, textSize=0.5, regionsCheck=FALSE, @@ -144,7 +144,7 @@ PACVr.complete <- function(gbkFile, ################################### if (regionsCheck) { - logger::log_info('Parsing different genome regions') + logger::log_info('Parsing the different genome regions') regions <- PACVr.parseRegions(gbkData, gbkDataDF) } else { @@ -152,11 +152,11 @@ PACVr.complete <- function(gbkFile, } ################################### - logger::log_info('Parsing different genes') + logger::log_info('Parsing the different genes') genes <- PACVr.parseGenes(gbkDataDF) ################################### - logger::log_info('Calculating sequencing coverage') + logger::log_info('Calculating the sequencing coverage') coverage <- PACVr.calcCoverage(bamFile, windowSize) @@ -164,7 +164,7 @@ PACVr.complete <- function(gbkFile, linkData <- NULL IRCheck <- regionsCheck && isSyntenyLineType(syntenyLineType) if (IRCheck) { - logger::log_info('Inferring IR regions and genes within IRs') + logger::log_info('Inferring the IR regions and the genes within the IRs') linkData <- PACVr.generateIRGeneData(genes, regions, syntenyLineType) @@ -172,7 +172,7 @@ PACVr.complete <- function(gbkFile, ################################### if (regionsCheck && verbose) { - logger::log_info('Generating statistical information on sequencing coverage') + logger::log_info('Generating statistical information on the sequencing coverage') PACVr.verboseInformation(gbkData, bamFile, genes, @@ -182,7 +182,7 @@ PACVr.complete <- function(gbkFile, ################################### if (!is.na(output)) { - logger::log_info('Generating visualization of sequencing coverage') + logger::log_info('Generating a visualization of the sequencing coverage') pdf(output, width=10, height=10) PACVr.visualizeWithRCircos( gbkData, @@ -198,7 +198,7 @@ PACVr.complete <- function(gbkFile, textSize ) dev.off() - logger::log_info('Saved visualization including coverage as `{output}`') + logger::log_info('Visualization (including coverage) saved as `{output}`') } else { logger::log_info('No coverage data inferred; generating empty visualization') PACVr.visualizeWithRCircos( @@ -215,7 +215,7 @@ PACVr.complete <- function(gbkFile, textSize ) dev.off() - logger::log_info('Saved visualization excluding coverage as `{output}`') + logger::log_info('Visualization (excluding coverage) saved as `{output}`') } ###################################################################### logger::log_success('Done.') diff --git a/R/customizedRCircos.R b/R/customizedRCircos.R index 91f6d0d8..5d9b237d 100644 --- a/R/customizedRCircos.R +++ b/R/customizedRCircos.R @@ -1,7 +1,7 @@ -#!/usr/bin/R +#!/usr/bin/env RScript #contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl") #email="m_gruenstaeudl@fhsu.edu" -#version="2023.12.07.1830" +#version="2024.01.07.2200" # The following R functions were taken from the R package RCircos and then modified. @@ -18,7 +18,7 @@ PACVr.Get.Start.End.Locations <- function(plot.data, plot.width) { for (aChr in seq_len(length(chromosomes))) { cyto.rows <- which(cyto.chroms == chromosomes[aChr]) chr.start <- min(RCircos.Cyto$StartPoint[cyto.rows]) - chr.end <- max(RCircos.Cyto$EndPoint[cyto.rows]) + chr.end <- max(RCircos.Cyto$EndPoint[cyto.rows]) data.rows <- which(dataChroms == chromosomes[aChr]) start.outliers <- which(locations[data.rows, 1] < chr.start) which(locations[data.rows, 1] < chr.start) @@ -45,19 +45,15 @@ PACVr.Histogram.Plot <- function(hist.data = NULL, warning("Genomic data missing in input.") stop() } - boundary <- - RCircos::RCircos.Get.Plot.Boundary(track.num, side, inside.pos, - outside.pos, FALSE) + boundary <- RCircos::RCircos.Get.Plot.Boundary(track.num, side, inside.pos, outside.pos, FALSE) outerPos <- boundary[1] - innerPos <- boundary[2] - if (is.null(genomic.columns) || - genomic.columns < 2 || genomic.columns > 3) { + innerPos <- boundary[2] + if (is.null(genomic.columns) || genomic.columns < 2 || genomic.columns > 3) { warning("Number of columns for genomic position incorrect.") stop() } if (is.null(data.col) || data.col <= genomic.columns) { - warning(paste("Number of input columns must be > ", genomic.columns, ".", sep = - "")) + warning(paste("Number of input columns must be > ", genomic.columns, ".", sep="")) stop() } RCircos.Pos <- RCircos::RCircos.Get.Plot.Positions() @@ -113,9 +109,7 @@ PACVr.Chromosome.Ideogram.Plot <- function(tick.interval = 0) { ##### tick length, text size, text orientation ##### PACVr.Ideogram.Tick.Plot <- - function(tick.num = 10, - track.for.ticks = 3, - add.text.size = 0) + function(tick.num=10, track.for.ticks=3, add.text.size=0) { RCircos.Pos <- RCircos::RCircos.Get.Plot.Positions() RCircos.Par <- RCircos::RCircos.Get.Plot.Parameters() @@ -124,7 +118,7 @@ PACVr.Ideogram.Tick.Plot <- (RCircos.Pos[1:(nrow(RCircos.Pos) / 2), 3] + 270) %% 360 RCircos.Pos[(nrow(RCircos.Pos) / 2 + 1):nrow(RCircos.Pos), 3] <- (RCircos.Pos[(nrow(RCircos.Pos) / 2 + 1):nrow(RCircos.Pos), 3] + 90) %% 360 - + endchr <- RCircos.Cyto$ChromEnd[length(RCircos.Cyto$ChromEnd)] tick.interval <- endchr / tick.num / 1000 @@ -134,8 +128,8 @@ PACVr.Ideogram.Tick.Plot <- # names use two tracks. There will be total of 6 tracks needed. # =================================================================== track.height <- RCircos.Par$track.height - tick.height <- track.height * track.for.ticks - ticks.span <- RCircos.Par$chr.ideo.pos + tick.height * 2 + tick.height <- track.height * track.for.ticks + ticks.span <- RCircos.Par$chr.ideo.pos + tick.height * 2 if (RCircos.Par$plot.radius < ticks.span) { @@ -156,17 +150,17 @@ PACVr.Ideogram.Tick.Plot <- mid.pos <- RCircos.Pos[, 1:2] * (start.pos + track.height / 4) the.interval <- tick.interval * 1000 short.tick <- round(the.interval / RCircos.Par$base.per.unit, digits = 0) - long.tick <- round(the.interval / RCircos.Par$base.per.unit, digits = 0) + long.tick <- round(the.interval / RCircos.Par$base.per.unit, digits = 0) #short.tick <- round(the.interval/RCircos.Par$base.per.unit, digits=0); - #long.tick <- short.tick*2; + #long.tick <- short.tick*2; - lab.pos <- RCircos.Pos[, 1:2] * (start.pos + tick.height / 2) + lab.pos <- RCircos.Pos[, 1:2] * (start.pos + tick.height / 2) chroms <- unique(RCircos.Cyto$Chromosome) for (aChr in seq_len(length(chroms))) { - the.chr <- RCircos.Cyto[RCircos.Cyto[, 1] == chroms[aChr],] + the.chr <- RCircos.Cyto[RCircos.Cyto[, 1] == chroms[aChr],] chr.start <- the.chr$StartPoint[1] - chr.end <- the.chr$EndPoint[nrow(the.chr)] + chr.end <- the.chr$EndPoint[nrow(the.chr)] total.ticks <- tick.num for (a.tick in seq_len(total.ticks)) @@ -178,8 +172,7 @@ PACVr.Ideogram.Tick.Plot <- c(innerPos[tick.pos, 2], outerPos[tick.pos, 2]), col = the.chr$ChrColor[1]) - lab.text <- - paste0(round((a.tick - 1) * tick.interval, 1), "kb") + lab.text <- paste0(round((a.tick - 1) * tick.interval, 1), "kb") graphics::text( lab.pos[tick.pos, 1] , @@ -205,7 +198,7 @@ PACVr.Ideogram.Tick.Plot <- # parameter, direct work with RCircosEnvironment is needed. # ======================================================= old.name.pos <- RCircos.Par$chr.name.pos - old.out.pos <- RCircos.Par$track.out.start + old.out.pos <- RCircos.Par$track.out.start old.distance <- old.out.pos - old.name.pos RCircos.Par$chr.name.pos <- ticks.span @@ -245,12 +238,9 @@ PACVr.Gene.Name.Plot <- function(gene.data = NULL, # Convert raw data to plot data. The raw data will be validated # first during the conversion # ============================================================= - boundary <- RCircos::RCircos.Get.Plot.Boundary(track.num, side, inside.pos, - outside.pos, FALSE) - gene.data <- RCircos::RCircos.Get.Single.Point.Positions(gene.data, - genomic.columns) - gene.data <- RCircos::RCircos.Get.Gene.Label.Locations(gene.data, genomic.columns, - is.sorted) + boundary <- RCircos::RCircos.Get.Plot.Boundary(track.num, side, inside.pos, outside.pos, FALSE) + gene.data <- RCircos::RCircos.Get.Single.Point.Positions(gene.data, genomic.columns) + gene.data <- RCircos::RCircos.Get.Gene.Label.Locations(gene.data, genomic.columns, is.sorted) # Label positions # ============================================================= @@ -261,7 +251,7 @@ PACVr.Gene.Name.Plot <- function(gene.data = NULL, textSide <- rep(4, nrow(gene.data)) textSide[thePoints <= rightSide] <- 2 } else { - labelPos <- boundary[2] - correction + labelPos <- boundary[2] - correction textSide <- rep(2, nrow(gene.data)) textSide[thePoints <= rightSide] <- 4 } @@ -296,8 +286,7 @@ PACVr.Gene.Connector.Plot <- function(genomic.data = NULL, if (is.null(genomic.data)) stop("Genomic data missing for RCircos.Gene.Connector.Plot().\n") - boundary <- RCircos::RCircos.Get.Plot.Boundary(track.num, side, inside.pos, - outside.pos, erase.area = FALSE) + boundary <- RCircos::RCircos.Get.Plot.Boundary(track.num, side, inside.pos, outside.pos, erase.area=FALSE) outerPos <- boundary[1] innerPos <- boundary[2] RCircos.Pos <- RCircos::RCircos.Get.Plot.Positions() @@ -388,20 +377,16 @@ PACVr.Line.Plot <- { if (is.null(line.data)) stop("Genomic data missing in RCircos.Line.Plot().\n") - boundary <- - RCircos::RCircos.Get.Plot.Boundary(track.num, side, inside.pos, - outside.pos, FALSE) + boundary <- RCircos::RCircos.Get.Plot.Boundary(track.num, side, inside.pos, outside.pos, FALSE) outerPos <- boundary[1] innerPos <- boundary[2] if (is.null(genomic.columns)) stop("Missing number of columns for genomic position.\n") if (is.null(data.col) || data.col <= genomic.columns) - stop("Line data column must be ", genomic.columns + - 1, " or bigger.\n") + stop("Line data column must be ", genomic.columns + 1, " or bigger.\n") RCircos.Pos <- RCircos::RCircos.Get.Plot.Positions() RCircos.Par <- RCircos::RCircos.Get.Plot.Parameters() - line.data <- RCircos::RCircos.Get.Single.Point.Positions(line.data, - genomic.columns) + line.data <- RCircos::RCircos.Get.Single.Point.Positions(line.data, genomic.columns) pointValues <- as.numeric(line.data[, data.col]) if (is.null(min.value) || is.null(max.value)) { min.value <- min(pointValues) @@ -415,8 +400,7 @@ PACVr.Line.Plot <- min.value, max.value, plot.type = "points", - outerPos - - innerPos) + outerPos - innerPos) pointHeight <- pointHeight + innerPos line.colors <- RCircos::RCircos.Get.Plot.Colors(line.data, RCircos.Par$line.color) #RCircos.Track.Outline(outerPos, innerPos, RCircos.Par$sub.tracks) @@ -490,7 +474,7 @@ PACVr.Reset.Plot.Parameters <- function (new.params = NULL) new.params$highlight.pos <- old.params$highlight.pos + differ new.name.pos <- old.params$chr.name.pos + differ if (new.params$chr.name.pos < new.name.pos) - new.params$chr.name.pos <- new.name.pos + new.params$chr.name.pos <- new.name.pos } diff --git a/R/generatePlotData.R b/R/generatePlotData.R index 28625b16..d21f3dab 100644 --- a/R/generatePlotData.R +++ b/R/generatePlotData.R @@ -1,7 +1,7 @@ -#!/usr/bin/R +#!/usr/bin/env RScript #contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl") #email="m_gruenstaeudl@fhsu.edu" -#version="2023.11.23.1530" +#version="2024.01.07.2200" CovCalc <- function(bamFile, windowSize = 250) { # Calculates coverage of a given bam file and stores data in data.frame format @@ -17,23 +17,19 @@ CovCalc <- function(bamFile, windowSize = 250) { stop() } cov <- GenomicAlignments::coverage(bamFile) - bins <- - GenomicRanges::tileGenome( - sum(IRanges::runLength(cov)), + bins <- GenomicRanges::tileGenome( + sum(IRanges::runLength(cov)), tilewidth = windowSize, cut.last.tile.in.chrom = TRUE ) cov <- GenomicRanges::binnedAverage(bins, cov, "coverage") - cov <- - as.data.frame(cov)[c("seqnames", "start", "end", "coverage")] - colnames(cov) <- - c("Chromosome", "chromStart", "chromEnd", "coverage") + cov <- as.data.frame(cov)[c("seqnames", "start", "end", "coverage")] + colnames(cov) <- c("Chromosome", "chromStart", "chromEnd", "coverage") cov$coverage <- ceiling(as.numeric(cov$coverage)) return(cov) } -GenerateHistogramData <- - function(region, coverage, windowSize, lastOne) { +GenerateHistogramData <- function(region, coverage, windowSize, lastOne) { # Function to generate line data for RCircos.Line.Plot # ARGS: # coverage: data.frame of coverage @@ -42,12 +38,9 @@ GenerateHistogramData <- # Error handling logger::log_info(' Generating histogram data for region `{region[4]}`') if (lastOne) { - coverage <- - coverage[(floor(region[1, 2] / windowSize) + 1):ceiling(region[1, 3] / windowSize),] + coverage <- coverage[(floor(region[1, 2] / windowSize) + 1):ceiling(region[1, 3] / windowSize),] } else { - coverage <- - coverage[(floor(region[1, 2] / windowSize) + 1):floor(region[1, 3] / windowSize) + - 1,] + coverage <- coverage[(floor(region[1, 2] / windowSize) + 1):floor(region[1, 3] / windowSize) + 1,] } coverage[, 4] <- mean(coverage[, 4]) return(coverage) diff --git a/R/helpers.R b/R/helpers.R index c629d220..ac7e7326 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -1,7 +1,7 @@ -#!/usr/bin/R +#!/usr/bin/env RScript #contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl") #email="m_gruenstaeudl@fhsu.edu" -#version="2023.12.07.1830" +#version="2024.01.07.2200" read.gb2DF <- function(gbkData) { fileDF <- data.frame() @@ -94,8 +94,7 @@ read.gbOther <- function(gbkDataDF) { read.gbLengths <- function(gbkData) { sampleLengths <- c() for (sample in gbkData) { - sampleLengths <- c(sampleLengths, - nchar(sample$ORIGIN)) + sampleLengths <- c(sampleLengths, nchar(sample$ORIGIN)) } return(sampleLengths) } @@ -104,8 +103,8 @@ read.gbSampleName <- function(gbkData) { sampleNames <- c() for (sample in gbkData) { sampleNames <- c(sampleNames, - c(sample_name = sample$ACCESSION, - genome_name = sample$VERSION)) + c(sample_name = sample$ACCESSION, + genome_name = sample$VERSION)) } return(sampleNames) } @@ -113,9 +112,7 @@ read.gbSampleName <- function(gbkData) { read.gbPlotTitle <- function(gbkData) { plotTitles <- c() for (sample in gbkData) { - plotTitles <- c(plotTitles, - paste(sample$DEFINITION, - sample$ACCESSION)) + plotTitles <- c(plotTitles, paste(sample$DEFINITION, sample$ACCESSION)) } return(plotTitles) } @@ -138,7 +135,7 @@ HistCol <- function(cov, threshold, relative, logScale) { threshold <- mean(cov[, 4]) * threshold } color <- rep("black", nrow(cov)) - ind <- as.numeric(cov[, 4]) <= threshold + ind <- as.numeric(cov[, 4]) <= threshold color <- replace(color, ind, "red") return(color) } @@ -152,20 +149,13 @@ boolToDeci <- function(boolList) { return(out) } -writeTables <- - function(regions, - bamFile, - genes, - dir, - sample_name) { - ir_regions <- - IRanges::IRanges( +writeTables <- function(regions, bamFile, genes, dir, sample_name) { + ir_regions <- IRanges::IRanges( start = regions$chromStart, end = regions$chromEnd, names = regions$Band ) - ir_genes <- - unlist(IRanges::slidingWindows( + ir_genes <- unlist(IRanges::slidingWindows( IRanges::reduce( IRanges::IRanges( start = genes$chromStart, @@ -176,8 +166,7 @@ writeTables <- width = 250L, step = 250L )) - ir_noncoding <- - unlist(IRanges::slidingWindows( + ir_noncoding <- unlist(IRanges::slidingWindows( IRanges::reduce(IRanges::gaps( ir_genes, start = min(regions$chromStart), @@ -186,75 +175,45 @@ writeTables <- width = 250L, step = 250L )) - overlaps_genes <- - IRanges::findOverlaps( + overlaps_genes <- IRanges::findOverlaps( query = ir_genes, subject = ir_regions, type = "any", select = "all" ) - - ir_genes <- - GenomicRanges::GRanges(seqnames = sample_name["genome_name"], ir_genes) - ir_genes <- - GenomicRanges::binnedAverage(ir_genes, - GenomicAlignments::coverage(bamFile), - "coverage") - ir_genes <- - as.data.frame(ir_genes)[c("seqnames", "start", "end", "coverage")] - colnames(ir_genes) <- - c("Chromosome", "chromStart", "chromEnd", "coverage") - ir_genes$coverage <- ceiling(as.numeric(ir_genes$coverage)) - ir_genes$Chromosome <- - sapply(overlaps_genes, function(x) - regions$Band[x]) - ir_genes$Chromosome <- - gsub("^c\\(\"|\"|\"|\"\\)$", - "", - as.character(ir_genes$Chromosome)) - - overlaps_noncoding <- - IRanges::findOverlaps( + overlaps_noncoding <- IRanges::findOverlaps( query = ir_noncoding, subject = ir_regions, type = "any", select = "all" ) - - ir_noncoding <- - GenomicRanges::GRanges(seqnames = sample_name["genome_name"], ir_noncoding) - ir_noncoding <- - GenomicRanges::binnedAverage(ir_noncoding, - GenomicAlignments::coverage(bamFile), - "coverage") - ir_noncoding <- - as.data.frame(ir_noncoding)[c("seqnames", "start", "end", "coverage")] - colnames(ir_noncoding) <- - c("Chromosome", "chromStart", "chromEnd", "coverage") - ir_noncoding$coverage <- - ceiling(as.numeric(ir_noncoding$coverage)) - ir_noncoding$Chromosome <- - sapply(overlaps_noncoding, function(x) - regions$Band[x]) - ir_noncoding$Chromosome <- - gsub("^c\\(\"|\"|\"|\"\\)$", - "", - as.character(ir_noncoding$Chromosome)) - - ir_regions <- - unlist(IRanges::slidingWindows(ir_regions, width = 250L, step = 250L)) - ir_regions <- - GenomicRanges::GRanges(seqnames = sample_name["genome_name"], ir_regions) - ir_regions <- - GenomicRanges::binnedAverage(ir_regions, - GenomicAlignments::coverage(bamFile), - "coverage") + + # Operations on genes + ir_genes <- GenomicRanges::GRanges(seqnames = sample_name["genome_name"], ir_genes) + ir_genes <- GenomicRanges::binnedAverage(ir_genes, GenomicAlignments::coverage(bamFile), "coverage") + ir_genes <- as.data.frame(ir_genes)[c("seqnames", "start", "end", "coverage")] + colnames(ir_genes) <- c("Chromosome", "chromStart", "chromEnd", "coverage") + ir_genes$coverage <- ceiling(as.numeric(ir_genes$coverage)) + ir_genes$Chromosome <- sapply(overlaps_genes, function(x) regions$Band[x]) + ir_genes$Chromosome <- gsub("^c\\(\"|\"|\"|\"\\)$", "", as.character(ir_genes$Chromosome)) + + # Operations on non-coding regions + ir_noncoding <- GenomicRanges::GRanges(seqnames = sample_name["genome_name"], ir_noncoding) + ir_noncoding <- GenomicRanges::binnedAverage(ir_noncoding, GenomicAlignments::coverage(bamFile), "coverage") + ir_noncoding <- as.data.frame(ir_noncoding)[c("seqnames", "start", "end", "coverage")] + colnames(ir_noncoding) <- c("Chromosome", "chromStart", "chromEnd", "coverage") + ir_noncoding$coverage <- ceiling(as.numeric(ir_noncoding$coverage)) + ir_noncoding$Chromosome <- sapply(overlaps_noncoding, function(x) regions$Band[x]) + ir_noncoding$Chromosome <- gsub("^c\\(\"|\"|\"|\"\\)$", "", as.character(ir_noncoding$Chromosome)) + + # Operations on IRs + ir_regions <- unlist(IRanges::slidingWindows(ir_regions, width = 250L, step = 250L)) + ir_regions <- GenomicRanges::GRanges(seqnames = sample_name["genome_name"], ir_regions) + ir_regions <- GenomicRanges::binnedAverage(ir_regions, GenomicAlignments::coverage(bamFile), "coverage") chr <- ir_regions@ranges@NAMES - ir_regions <- - as.data.frame(ir_regions, row.names = NULL)[c("seqnames", "start", "end", "coverage")] + ir_regions <- as.data.frame(ir_regions, row.names = NULL)[c("seqnames", "start", "end", "coverage")] ir_regions["seqnames"] <- chr - colnames(ir_regions) <- - c("Chromosome", "chromStart", "chromEnd", "coverage") + colnames(ir_regions) <- c("Chromosome", "chromStart", "chromEnd", "coverage") ir_regions$coverage <- ceiling(as.numeric(ir_regions$coverage)) cov_regions <- @@ -264,13 +223,9 @@ writeTables <- FUN = function(x) ceiling(mean(x) - sd(x)) ) - ir_regions$lowCoverage <- - with(ir_regions, ir_regions$coverage < cov_regions$coverage[match(Chromosome, cov_regions$Chromosome)]) - - ir_genes$lowCoverage <- - ir_genes$coverage < mean(ir_genes$coverage) - sd(ir_genes$coverage) - ir_noncoding$lowCoverage <- - ir_noncoding$coverage < mean(ir_noncoding$coverage) - sd(ir_noncoding$coverage) + ir_regions$lowCoverage <- with(ir_regions, ir_regions$coverage < cov_regions$coverage[match(Chromosome, cov_regions$Chromosome)]) + ir_genes$lowCoverage <- ir_genes$coverage < mean(ir_genes$coverage) - sd(ir_genes$coverage) + ir_noncoding$lowCoverage <- ir_noncoding$coverage < mean(ir_noncoding$coverage) - sd(ir_noncoding$coverage) ir_genes$lowCoverage[ir_genes$lowCoverage == TRUE] <- "*" ir_genes$lowCoverage[ir_genes$lowCoverage == FALSE] <- "" @@ -279,46 +234,29 @@ writeTables <- ir_noncoding$lowCoverage[ir_noncoding$lowCoverage == TRUE] <- "*" ir_noncoding$lowCoverage[ir_noncoding$lowCoverage == FALSE] <- "" + # Writing values to output table write.table( ir_genes, - paste( - dir, - .Platform$file.sep, - sample_name["sample_name"], - "_coverage.genes.bed", - sep = "" - ), + paste(dir, .Platform$file.sep, sample_name["sample_name"], "_coverage.genes.bed", sep = ""), row.names = FALSE, quote = FALSE, sep = "\t" ) write.table( ir_regions, - paste( - dir, - .Platform$file.sep, - sample_name["sample_name"], - "_coverage.regions.bed", - sep = "" - ), + paste(dir, .Platform$file.sep, sample_name["sample_name"], "_coverage.regions.bed", sep = ""), row.names = FALSE, quote = FALSE, sep = "\t" ) write.table( ir_noncoding, - paste( - dir, - .Platform$file.sep, - sample_name["sample_name"], - "_coverage.noncoding.bed", - sep = "" - ), + paste(dir, .Platform$file.sep, sample_name["sample_name"], "_coverage.noncoding.bed", sep = ""), row.names = FALSE, quote = FALSE, sep = "\t" ) - } + } validateColors <- function(colorsToValidate) { colorNames <- colors() diff --git a/R/parseData.R b/R/parseData.R index 6554e8c7..4b20be38 100644 --- a/R/parseData.R +++ b/R/parseData.R @@ -1,7 +1,7 @@ -#!/usr/bin/R +#!/usr/bin/env RScript #contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl") #email="m_gruenstaeudl@fhsu.edu" -#version="2023.11.23.1530" +#version="2024.01.07.2200" ExtractAllGenes <- function(gbkDataDF) { # Function to extract gene information from Genbank flatfile data diff --git a/R/quadripartiteOperations.R b/R/quadripartiteOperations.R index 0e4f1817..b3ab425f 100644 --- a/R/quadripartiteOperations.R +++ b/R/quadripartiteOperations.R @@ -1,7 +1,7 @@ -#!/usr/bin/R +#!/usr/bin/env RScript #contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl") #email="m_gruenstaeudl@fhsu.edu" -#version="2023.12.18.2100" +#version="2024.01.07.2200" FilterByKeywords <- function(allRegions, where) { # Function to filter list based on genomic keywords @@ -74,8 +74,7 @@ fillDataFrame <- function(gbkData, regions) { logger::log_info(' Annotating plastid genome with quadripartite regions') seqLength <- read.gbLengths(gbkData) if ((nrow(regions) == 0) || (regions[1, 2] == -1)) { - regions[1,] <- - c("", as.numeric(1), as.numeric(seqLength), "NA", "gpos100") + regions[1,] <- c("", as.numeric(1), as.numeric(seqLength), "NA", "gpos100") regions[, 2] <- as.numeric(regions[, 2]) regions[, 3] <- as.numeric(regions[, 3]) return(regions) @@ -90,8 +89,7 @@ fillDataFrame <- function(gbkData, regions) { if (start - 1 < seqLength) { regions[nrow(regions) + 1,] <- c("", start, seqLength, "NA", "gpos100") } - regions <- - regions[order(as.numeric(regions[, 2]), decreasing=FALSE),] + regions <- regions[order(as.numeric(regions[, 2]), decreasing=FALSE),] row.names(regions) <- 1:nrow(regions) regions[, 2] <- as.numeric(regions[, 2]) regions[, 3] <- as.numeric(regions[, 3]) @@ -122,8 +120,7 @@ fillDataFrame <- function(gbkData, regions) { } } else if (regionAvail == 11) { # only LSC, SSC and IRa - regions[which(regions[, 4] == "NA" & - regions[, 6] == regions[which(regions[, 4] == "IRa"), 6]), 4] <- "IRb" + regions[which(regions[, 4] == "NA" & regions[, 6] == regions[which(regions[, 4] == "IRa"), 6]), 4] <- "IRb" message("Annotation for IRb was automatically added") } else if (regionAvail == 13) { # only LSC, IRb and IRa @@ -132,8 +129,7 @@ fillDataFrame <- function(gbkData, regions) { message("Annotation for SSC was automatically added") } else if (regionAvail == 14) { # only LSC, IRb and SSC - regions[which(regions[, 4] == "NA" & - regions[, 6] == regions[which(regions[, 4] == "IRb"), 6]), 4] <- "IRa" + regions[which(regions[, 4] == "NA" & regions[, 6] == regions[which(regions[, 4] == "IRb"), 6]), 4] <- "IRa" message("Annotation for IRa was automatically added") } regions <- regions[-6] @@ -148,12 +144,8 @@ fillDataFrame <- function(gbkData, regions) { plotAverageLines <- function(regions, coverage, windowSize, positions) { averageLines <- c() for (i in 1:nrow(regions)) { - lineData <- - GenerateHistogramData(regions[i,], coverage, windowSize, - (i == nrow(regions))) - averageLines <- - c(averageLines, - paste(regions[i, 4], ": ", trunc(lineData[1, 4]), "X", sep = "")) + lineData <- GenerateHistogramData(regions[i,], coverage, windowSize, (i == nrow(regions))) + averageLines <- c(averageLines, paste(regions[i, 4], ": ", trunc(lineData[1, 4]), "X", sep = "")) PACVr.Line.Plot( line.data = lineData, data.col = 4, diff --git a/R/visualizeWithRCircos.R b/R/visualizeWithRCircos.R index 727e1829..734c66f0 100644 --- a/R/visualizeWithRCircos.R +++ b/R/visualizeWithRCircos.R @@ -1,7 +1,7 @@ -#!/usr/bin/R +#!/usr/bin/env RScript #contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl") #email="m_gruenstaeudl@fhsu.edu" -#version="2023.12.18.2100" +#version="2024.01.07.2200" #' Title @@ -214,12 +214,7 @@ getLegendParams <- function(relative, coverage, threshold, averageLines) { ), as.expression(bquote( "Coverage" <= .( - paste( - legendParams$vals[[3]], - "X (=", - legendParams$vals[[4]], - "% of avg. cov.)", - sep = "") + paste(legendParams$vals[[3]], "X (=", legendParams$vals[[4]], "% of avg. cov.)", sep = "") ))) ) diff --git a/README.md b/README.md index 613113bf..30453d53 100644 --- a/README.md +++ b/README.md @@ -10,15 +10,6 @@ install_github("michaelgruenstaeudl/PACVr") ``` Note: Detailed installation instructions can be found in the package vignette. -## PRE-FORMATTING INPUT -Due to the internal usage of R package [genbankr](https://bioconductor.org/packages/release/bioc/html/genbankr.html), any GenBank flatfile must conform to the following specifications: -- Flatfile must include a _source_ feature at start of feature table -- All _exon_ features (plus their qualifier lines) must be removed: `sed -i -e '/ exon/,+2d' input.gb` -- All redundant _complement_ specifications must be removed: `sed -i -z 's/),\s*complement(/,/g' input.gb` - - ## USAGE ``` # In R: @@ -26,11 +17,27 @@ library(PACVr) gbkFile <- system.file("extdata", "NC_045072/NC_045072.gb", package="PACVr") bamFile <- system.file("extdata", "NC_045072/NC_045072_PlastomeReadsOnly.sorted.bam", package="PACVr") + outFile <- paste(tempdir(), "/NC_045072_AssemblyCoverage_viz.pdf", sep="") #outFile <- "../Desktop/test.pdf" # on R-Studio for Windows +#outFile <- "~/test.pdf" # on R-Studio for Linux + +## ONLY COVERAGE VALUES, NO REGION INDICATORS ## PACVr.complete(gbkFile, bamFile, windowSize=250, - logScale=FALSE, threshold=0.5, syntenyLineType=3, - relative=TRUE, textSize=0.5, regionsCheck=FALSE, + logScale=FALSE, threshold=0.5, syntenyLineType=NA, + relative=TRUE, textSize=0.5, regionsCheck=FALSE, + verbose=FALSE, output=outFile) + +## COVERAGE VALUES PLUS REGION INDICATORS ## +PACVr.complete(gbkFile, bamFile, windowSize=250, + logScale=FALSE, threshold=0.5, syntenyLineType=NA, + relative=TRUE, textSize=0.5, regionsCheck=TRUE, + verbose=FALSE, output=outFile) + +## COVERAGE VALUES PLUS REGION INDICATORS PLUS IR SYNTENY LINES ## +PACVr.complete(gbkFile, bamFile, windowSize=250, + logScale=FALSE, threshold=0.5, syntenyLineType=1, + relative=TRUE, textSize=0.5, regionsCheck=TRUE, verbose=FALSE, output=outFile) ``` @@ -75,6 +82,14 @@ Using PACVr in your research? Please cite it! * Foo bar baz --> + ## CHANGELOG See [`CHANGELOG.md`](CHANGELOG.md) for a list of recent changes to the software. diff --git a/inst/extdata/PACVr_Rscript.R b/inst/extdata/PACVr_Rscript.R index 206a5699..ca424f14 100644 --- a/inst/extdata/PACVr_Rscript.R +++ b/inst/extdata/PACVr_Rscript.R @@ -1,7 +1,7 @@ -#!/usr/bin/R +#!/usr/bin/env RScript #contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl") #email="m_gruenstaeudl@fhsu.edu" -#version="2023.11.23.1530" +#version="2024.01.07.2200" library("optparse") diff --git a/man/PACVr-package.Rd b/man/PACVr-package.Rd index 3c20ef1b..791964f2 100644 --- a/man/PACVr-package.Rd +++ b/man/PACVr-package.Rd @@ -25,5 +25,5 @@ } \references{ - Gruenstaeudl, M. and Jenke, N. (2019). PACVr: Plastome Assembly Coverage Visualization in R. bioRxiv 697821; doi: https://doi.org/10.1101/697821 + Gruenstaeudl, M. and Jenke, N. (2020). PACVr: plastome assembly coverage visualization in R. BMC Bioinformatics 21: 207. doi: https://doi.org/10.1186/s12859-020-3475-0 } diff --git a/man/PACVr.complete.Rd b/man/PACVr.complete.Rd index c58c505b..19100adb 100644 --- a/man/PACVr.complete.Rd +++ b/man/PACVr.complete.Rd @@ -9,7 +9,7 @@ PACVr.complete(gbkFile, windowSize = 250, logScale = FALSE, threshold = 0.5, - syntenyLineType = 3, + syntenyLineType = 1, relative = TRUE, textSize = 0.5, regionsCheck = FALSE, @@ -52,7 +52,8 @@ bamFile <- system.file("extdata", package="PACVr") outFile <- paste(tempdir(), "/NC_045072__all_reads.pdf", sep="") PACVr.complete(gbkFile, bamFile, windowSize=250, - threshold=0.5, syntenyLineType=1, relative=TRUE, - textSize=0.5, verbose=FALSE, output=outFile) + logScale=FALSE, threshold=0.5, syntenyLineType=1, + relative=TRUE, textSize=0.5, regionsCheck=TRUE, + verbose=FALSE, output=outFile) } } diff --git a/vignettes/PACVr_Vignette.Rnw b/vignettes/PACVr_Vignette.Rnw index 9be595f9..25699775 100644 --- a/vignettes/PACVr_Vignette.Rnw +++ b/vignettes/PACVr_Vignette.Rnw @@ -70,7 +70,7 @@ outFile <- paste(tempdir(), "/NC_045072_AssemblyCoverage_viz.pdf", sep="") # Run PACVr PACVr.complete(gbkFile, bamFile, windowSize=250, - logScale=FALSE, threshold=0.5, syntenyLineType=3, + logScale=FALSE, threshold=0.5, syntenyLineType=1, relative=TRUE, textSize=0.5, regionsCheck=FALSE, verbose=FALSE, output=outFile) \end{BGVerbatim}