Skip to content

Commit

Permalink
Merge pull request #48 from alephnull7/master
Browse files Browse the repository at this point in the history
Refactoring of `coverage` to an R6 class & updates for outlier handling in `scripts_for_figures_tables`
  • Loading branch information
michaelgruenstaeudl authored May 31, 2024
2 parents 6dc35b4 + 6eda66d commit f18e404
Show file tree
Hide file tree
Showing 26 changed files with 347 additions and 171 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
CHANGELOG
---------

#### Version 1.1.3 (2024.05.31)
* Refactoring of `coverage` to an R6 class
* Updates to how outliers are handled for artifacts created by `scripts_for_figures_tables`

#### Version 1.1.2 (2024.05.25)
* Updates to summary tabular statistics files
* Inclusion of unpartitioned statistics for coding and noncoding regions summaries
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: PACVr
Version: 1.1.2
Date: 2024-05-25
Version: 1.1.3
Date: 2024-05-31
Title: Plastome Assembly Coverage Visualization
Authors@R: c(person("Gregory", "Smith", role=c("ctb")),
person("Nils", "Jenke", role=c("ctb")),
Expand Down
2 changes: 1 addition & 1 deletion R/AnalysisSpecs.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

AnalysisSpecs <- R6Class("AnalysisSpecs",
public = list(
Expand Down
165 changes: 165 additions & 0 deletions R/Coverage.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.31.0123"

Coverage <- R6Class("Coverage",
public = list(
# fields
analysisSpecs = NULL,
outputSpecs = NULL,
coverageRaw = NULL,
coveragePlot = NULL,
seqNames = NULL,
covData = NULL,
sumData = NULL,

# constructor
initialize = function(bamFile,
analysisSpecs = NULL,
outputSpecs = NULL) {
private$setAnalysisSpecs(analysisSpecs)
private$setOutputSpecs(outputSpecs)
private$setBaseCoverages(bamFile)
},

# public methods

# precondition: `outputSpecs` is set
makeStatsFolder = function() {
self$outputSpecs$makeStatsFolder()
},

printCovStats = function(gbkData) {
self$setSeqNames(gbkData)
if (length(self$seqNames) == 0) {
logger::log_error("Neither `ACCESSION` nor `VERSION` matches BAM sample name")
return()
}

logger::log_info('Generating statistical information on the sequencing coverage')
self$setCovData(gbkData)
self$writeCovTables(gbkData)
},

# precondition: `coverageRaw` is set
setSeqNames = function(gbkData) {
sampleName <- gbkData$sampleName
self$seqNames <- unname(sampleName[sampleName %in% names(self$coverageRaw)])
},

# precondition: `analysisSpecs`, `coverageRaw`, and `seqNames` are set
setCovData = function(gbkData) {
quadripRegions <- gbkData$quadripRegions
genes <- gbkData$genes
seqnames <- self$seqNames
analysisSpecs <- self$analysisSpecs
coverageRaw <- self$coverageRaw

covData <- getCovData(quadripRegions, genes)
covData <- filter_IR_genes(quadripRegions, coverageRaw, seqnames, covData, analysisSpecs)
covData <- filter_IR_noncoding(quadripRegions, coverageRaw, seqnames, covData, analysisSpecs)
covData <- filter_IR_regions(coverageRaw, seqnames, covData, analysisSpecs)
covData <- setLowCoverages(covData, analysisSpecs)
self$covData <- covData
},

# precondition: `covData` and `outputSpecs` are set
writeCovTables = function(gbkData) {
writeCovTables(self$covData,
gbkData$sampleName,
self$outputSpecs$statsFilePath)
},

printCovSumStats = function(gbkData) {
self$setSumData(gbkData)
self$writeSumTables(gbkData)
},

setSumData = function(gbkData) {
# Count of N nucleotides for source
self$setAmbigCounts(gbkData)

# Add count of mismatches between IRs
self$setIRMismatches(gbkData)

# Summarized coverage data, grouped by `quadripRegions`;
# add previous results to this
self$setCovSummary()
},

# precondition: `analysisSpecs` is set
setAmbigCounts = function(gbkData) {
self$sumData <- getAmbigCounts(gbkData,
self$analysisSpecs)
},

# precondition: `analysisSpecs` and `sumData` are set
setIRMismatches = function(gbkData) {
analysisSpecs <- self$analysisSpecs
if (!analysisSpecs$isSyntenyLine) {
return()
}

IR_mismatches <- checkIREquality(gbkData,
analysisSpecs)
self$sumData <- dplyr::full_join(self$sumData,
IR_mismatches,
analysisSpecs$regions_name)
},

# precondition: `analysisSpecs`, `covData`, and `sumData` are set
setCovSummary = function() {
covData <- self$covData
analysisSpecs <- self$analysisSpecs

if (!is.null(covData)) {
summary <- getCovSummaries(covData,
analysisSpecs)
summary$regions_summary <- dplyr::full_join(summary$regions_summary,
self$sumData,
analysisSpecs$regions_name)
} else {
summary <- list(
regions_summary = self$sumData
)
}
self$sumData <- summary
},

# precondition: `outputSpecs` and `sumData` are set
writeSumTables = function(gbkData) {
writeSumTables(self$sumData,
gbkData$sampleName,
self$outputSpecs$statsFilePath)
}
),

# private setters for constructor
private = list(
setAnalysisSpecs = function(analysisSpecs) {
if(is.null(analysisSpecs)) {
self$analysisSpecs <- AnalysisSpecs$new()
} else {
self$analysisSpecs <- analysisSpecs
}
},

setOutputSpecs = function(outputSpecs) {
if(is.null(outputSpecs)) {
self$outputSpecs <- OutputSpecs$new()
} else {
self$outputSpecs <- outputSpecs
}
},

# precondition: `analysisSpecs` is set
setBaseCoverages = function(bamFile) {
coverage <- PACVr.calcCoverage(bamFile,
self$analysisSpecs$windowSize,
self$outputSpecs$logScale)
self$coverageRaw <- coverage$raw
self$coveragePlot <- coverage$plot
}
)
)
2 changes: 1 addition & 1 deletion R/GBKData.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

GBKData <- R6Class("GBKData",
public = list(
Expand Down
2 changes: 1 addition & 1 deletion R/IROps.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

checkIREquality <- function(gbkData,
analysisSpecs) {
Expand Down
2 changes: 1 addition & 1 deletion R/OutputSpecs.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

OutputSpecs <- R6Class("OutputSpecs",
public = list(
Expand Down
14 changes: 6 additions & 8 deletions R/PACVr.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

PACVr.read.gb <- function(gbkFile) {
gbkRaw <- getGbkRaw(gbkFile)
Expand Down Expand Up @@ -114,21 +114,19 @@ PACVr.complete <- function(gbkFile,
gbkData$sampleName)

###################################
coverage <- PACVr.calcCoverage(bamFile,
analysisSpecs$windowSize,
outputSpecs$logScale)
coverage <- Coverage$new(bamFile,
analysisSpecs,
outputSpecs)

###################################
if (tabularCovStats) {
PACVr.compileCovStats(gbkData,
coverage$raw,
analysisSpecs,
outputSpecs)
coverage)
}

###################################
PACVr.vizWithRCircos(gbkData,
coverage$plot,
coverage$coveragePlot,
analysisSpecs,
outputSpecs)

Expand Down
2 changes: 1 addition & 1 deletion R/RCircosOps.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"


# The following R functions were taken from the R package RCircos and then modified.
Expand Down
50 changes: 6 additions & 44 deletions R/compileStats.R
Original file line number Diff line number Diff line change
@@ -1,52 +1,14 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

PACVr.compileCovStats <- function(gbkData,
coverageRaw,
analysisSpecs,
outputSpecs) {
outputSpecs$makeStatsFolder()

covData <- printCovStats(gbkData,
coverageRaw,
analysisSpecs,
outputSpecs)

# Count of N nucleotides for source
regionsSummary <- getAmbigCounts(gbkData,
analysisSpecs)

# Add count of mismatches between IRs
regions_name <- analysisSpecs$regions_name
if (analysisSpecs$isSyntenyLine) {
IR_mismatches <- checkIREquality(gbkData,
analysisSpecs)
regionsSummary <- dplyr::full_join(regionsSummary,
IR_mismatches,
regions_name)
}

# Summarized coverage data, grouped by `quadripRegions`;
# add previous results to this
if (!is.null(covData)) {
summary <- getCovSummaries(covData,
analysisSpecs)
summary$regions_summary <- dplyr::full_join(summary$regions_summary,
regionsSummary,
regions_name)
} else {
summary <- list(
regions_summary = regionsSummary
)
}

writeSumTables(summary,
gbkData$sampleName,
outputSpecs$statsFilePath)

logger::log_info('Coverage statistics saved to `{outputSpecs$statsFilePath}`')
coverage) {
coverage$makeStatsFolder()
coverage$printCovStats(gbkData)
coverage$printCovSumStats(gbkData)
logger::log_info('Coverage statistics saved to `{coverage$outputSpecs$statsFilePath}`')
}

getAmbigCounts <- function(gbkData, analysisSpecs) {
Expand Down
11 changes: 8 additions & 3 deletions R/coverageCalcOps.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

CovCalc <- function(coverageRaw,
windowSize = 250,
Expand Down Expand Up @@ -74,6 +74,12 @@ getCovData <- function(regions, genes) {
select = "all"
)

ir_regions <- unlist(IRanges::slidingWindows(
ir_regions,
width = 250L,
step = 250L
))

covData <- list(
ir_regions = ir_regions,
ir_genes = ir_genes,
Expand Down Expand Up @@ -113,8 +119,7 @@ filter_IR_noncoding <- function(regions, coverageRaw, seqnames, covData, analysi
}

filter_IR_regions <- function(coverageRaw, seqnames, covData, analysisSpecs) {
ir_regions <- unlist(IRanges::slidingWindows(covData$ir_regions, width = 250L, step = 250L))
ir_regions <- GenomicRanges::GRanges(seqnames = seqnames, ir_regions)
ir_regions <- GenomicRanges::GRanges(seqnames = seqnames, covData$ir_regions)
ir_regions <- GenomicRanges::binnedAverage(ir_regions, coverageRaw, "coverage")
chr <- ir_regions@ranges@NAMES
ir_regions <- as.data.frame(ir_regions, row.names = NULL)[c("seqnames", "start", "end", "coverage")]
Expand Down
2 changes: 1 addition & 1 deletion R/customRead.gb.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

read.gbWithHandling <- function(gbkRaw, count=0) {
gbkData <- tryCatch({
Expand Down
2 changes: 1 addition & 1 deletion R/helpers.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

HistCol <- function(cov, threshold, relative, logScale) {
# Function to generate color vector for histogram data
Expand Down
2 changes: 1 addition & 1 deletion R/parsingOps.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

PACVr.parseGenes <- function (gbkSeqFeatures) {
# Function to extract gene information from Genbank flatfile data
Expand Down
2 changes: 1 addition & 1 deletion R/quadripOps.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env RScript
#contributors=c("Gregory Smith", "Nils Jenke", "Michael Gruenstaeudl")
#email="m_gruenstaeudl@fhsu.edu"
#version="2024.05.25.0158"
#version="2024.05.31.0123"

FilterByKeywords <- function(allRegions, where) {
# Function to filter list based on genomic keywords
Expand Down
Loading

0 comments on commit f18e404

Please sign in to comment.