From 8a24706ddc9b6dd8376e0b54e58c6f0a215da554 Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Tue, 1 Oct 2024 15:11:03 +1000 Subject: [PATCH 1/4] add AD, DP to bcf mpileup, calculate AF for indels --- Rtools/malDrugR/filterBcfVcf.R | 242 ++++++++++++++++++--------------- modules/stvariant.nf | 3 +- 2 files changed, 136 insertions(+), 109 deletions(-) diff --git a/Rtools/malDrugR/filterBcfVcf.R b/Rtools/malDrugR/filterBcfVcf.R index 651636f..614cb2f 100755 --- a/Rtools/malDrugR/filterBcfVcf.R +++ b/Rtools/malDrugR/filterBcfVcf.R @@ -60,16 +60,20 @@ pfCurrRefs <- data.frame( version = c("52", "57", "52") ) ref <- filter(pfCurrRefs, strain == argv$refstrain) -if (ref$strain == "Supp"){ +if (ref$strain == "Supp") { ref$strain <- "3D7" ref$supp <- "_supplemented" - } else ref$supp <- "" +} else { + ref$supp <- "" +} # ---------- Read genomic features -------------------------------------------- file.gff <- file.path( refDir, - paste0("PlasmoDB-", ref$version, "_Pfalciparum", ref$strain, ref$supp, - ".gff") + paste0( + "PlasmoDB-", ref$version, "_Pfalciparum", ref$strain, ref$supp, + ".gff" + ) ) pf_features <- tryCatch( @@ -97,7 +101,7 @@ CDSnoVar <- pf_featuresNovar[annotation(pf_featuresNovar)$type == "CDS", ] file.chrinfo <- file.path( refDir, paste0( "PlasmoDB-", ref$version, "_Pfalciparum", ref$strain, - "_Genome", ref$supp, ".fasta.fai" + "_Genome", ref$supp, ".fasta.fai" ) ) chrinfo <- @@ -288,11 +292,11 @@ countalleles <- function(bamfile, vcf) { subjectHits() ] PosInSeq <- mapToAlignments(vcfgr, eventSeq) - refwidthDNA <- + refwidthDNA <- subseq(mcols(eventSeq)$seq, start = start(PosInSeq), width = width(vcfgr$REF) - ) + ) altwidthDNA <- tryCatch( subseq(mcols(eventSeq)$seq, start = start(PosInSeq), @@ -321,100 +325,100 @@ countalleles <- function(bamfile, vcf) { }) |> list_rbind() } -if (nrow(snpCDS) > 0){ - SNPalleleCounts <- map( - c( - paste0(samplesOI, "_nodup.bam"), - paste0(parentlist, "_nodup.bam") - ), - countalleles, - vcf = snpCDS - ) |> - list_rbind() - - if (nrow(SNPalleleCounts) > 1) { - SNPalleleCounts <- arrange(SNPalleleCounts, variant) +if (nrow(snpCDS) > 0) { + SNPalleleCounts <- map( + c( + paste0(samplesOI, "_nodup.bam"), + paste0(parentlist, "_nodup.bam") + ), + countalleles, + vcf = snpCDS + ) |> + list_rbind() + + if (nrow(SNPalleleCounts) > 1) { + SNPalleleCounts <- arrange(SNPalleleCounts, variant) + } + #### Annotate SNPs #### + #### Load genome and transcript db + transcriptdb <- file.path( + refDir, + paste0("PlasmoDB-", ref$version, "_Pfalciparum", ref$strain, "_txdb.sql") + ) + txdb <- tryCatch( + txdb <- loadDb(transcriptdb), + error = function(e) { + stop(paste(e, "Filepath:", transcriptdb)) } - #### Annotate SNPs #### - #### Load genome and transcript db - transcriptdb <- file.path( - refDir, - paste0("PlasmoDB-", ref$version, "_Pfalciparum", ref$strain, "_txdb.sql") + ) + if (argv$refstrain == "Supp") { + library("BSgenome.PfalciparumNF54iGP", + character.only = TRUE ) - txdb <- tryCatch( - txdb <- loadDb(transcriptdb), - error = function(e) { - stop(paste(e, "Filepath:", transcriptdb)) - } + pfg <- get("BSgenome.PfalciparumNF54iGP") + } else { + library(paste0("BSgenome.Pfalciparum", ref$strain, ".PlasmoDB.", ref$version), + character.only = TRUE ) - if (argv$refstrain == "Supp") { - library("BSgenome.PfalciparumNF54iGP", - character.only = TRUE - ) - pfg <- get("BSgenome.PfalciparumNF54iGP") - } else { - library(paste0("BSgenome.Pfalciparum", ref$strain, ".PlasmoDB.", ref$version), - character.only = TRUE - ) - pfg <- get(paste0("BSgenome.Pfalciparum", ref$strain, ".PlasmoDB.", ref$version)) - } - - #### AA prediction for SNPs in CDS - AApred <- predictCoding(query = snpCDS, subject = txdb, seqSource = pfg) - - #### SNPs in same codon - mcols(AApred)$warning <- NA - if (paste(AApred$CDSID, AApred$PROTEINLOC) |> unlist() |> n_distinct() < - length(AApred) - ) { - multihit <- which(mcols(AApred) |> - as.data.frame() |> - add_count(CDSID, PROTEINLOC) |> - dplyr::select(n) > 1) - mcols(AApred[multihit])$warning <- "multihit on codon" - } - - #### Remove synonymous #### - AApred <- AApred[which(mcols(AApred)$CONSEQUENCE != "synonymous" | - !is.na(mcols(AApred)$warning))] - nonsynSNP <- - cbind( - as.data.frame(AApred) |> dplyr::select(-ALT), - data.frame( - SNP = names(AApred), - ALT = as.character(unlist(AApred$ALT)), - AAchanges = with( - AApred, - paste0(REFAA, PROTEINLOC, VARAA) - ) - ) - ) |> dplyr::select(SNP, seqnames, - pos = start, strand, REF, ALT, QUAL, - AAchanges, REFCODON, VARCODON, warning + pfg <- get(paste0("BSgenome.Pfalciparum", ref$strain, ".PlasmoDB.", ref$version)) + } + + #### AA prediction for SNPs in CDS + AApred <- predictCoding(query = snpCDS, subject = txdb, seqSource = pfg) + + #### SNPs in same codon + mcols(AApred)$warning <- NA + if (paste(AApred$CDSID, AApred$PROTEINLOC) |> unlist() |> n_distinct() < + length(AApred) + ) { + multihit <- which(mcols(AApred) |> + as.data.frame() |> + add_count(CDSID, PROTEINLOC) |> + dplyr::select(n) > 1) + mcols(AApred[multihit])$warning <- "multihit on codon" + } + + #### Remove synonymous #### + AApred <- AApred[which(mcols(AApred)$CONSEQUENCE != "synonymous" | + !is.na(mcols(AApred)$warning))] + nonsynSNP <- + cbind( + as.data.frame(AApred) |> dplyr::select(-ALT), + data.frame( + SNP = names(AApred), + ALT = as.character(unlist(AApred$ALT)), + AAchanges = with( + AApred, + paste0(REFAA, PROTEINLOC, VARAA) ) - - if (all(is.na(nonsynSNP$warning))) { - nonsynSNP <- dplyr::select(nonsynSNP, !c(REFCODON, VARCODON, warning)) - } - SNPdetails <- left_join( - nonsynSNP, - SNPalleleCounts |> mutate( - AltFrac = paste0(AltCount, "/", TotCount), - TotCount = NULL, RefCount = NULL, AltCount = NULL - ) |> - pivot_wider( - names_from = c(SampleName), - values_from = c(AltFrac), - names_prefix = "AF_" - ), - by = join_by("SNP" == "variant") + ) + ) |> dplyr::select(SNP, seqnames, + pos = start, strand, REF, ALT, QUAL, + AAchanges, REFCODON, VARCODON, warning + ) + + if (all(is.na(nonsynSNP$warning))) { + nonsynSNP <- dplyr::select(nonsynSNP, !c(REFCODON, VARCODON, warning)) + } + SNPdetails <- left_join( + nonsynSNP, + SNPalleleCounts |> mutate( + AltFrac = paste0(AltCount, "/", TotCount), + TotCount = NULL, RefCount = NULL, AltCount = NULL ) |> - dplyr::select(!SNP) + pivot_wider( + names_from = c(SampleName), + values_from = c(AltFrac), + names_prefix = "AF_" + ), + by = join_by("SNP" == "variant") + ) |> + dplyr::select(!SNP) } else { - SNPdetails <- data.frame( - seqnames = factor(), Gene = character(), pos = integer() - ) - AApred <- GRanges() + SNPdetails <- data.frame( + seqnames = factor(), Gene = character(), pos = integer() + ) + AApred <- GRanges() } #### Get gene details for indels and non-synonymous SNPs #### @@ -428,6 +432,19 @@ if (nrow(indelGene) > 0) { select = "last" ) ] + #### Get Alt allele fractions for indels from added geno fields #### + indels.AF <- left_join( + as.data.frame(geno(indelGene)$AD) |> + rownames_to_column(var = "variant") |> + pivot_longer(cols = !"variant", names_to = "SampleName", values_to = "AD"), + as.data.frame(geno(indelGene)$AF) |> + rownames_to_column(var = "variant") |> + pivot_longer(cols = !"variant", names_to = "SampleName", values_to = "DP"), + ) |> mutate( + AltFrac = paste0(AD, "/", DP), + AD = NULL, DP = NULL + ) + #### Get gene details for indels #### indels.Feat.df <- data.frame( rowRanges(indelGene)[, c("REF", "ALT", "QUAL")], @@ -461,7 +478,7 @@ if (nrow(indelGene) > 0) { indels.Feat.df <- data.frame(Gene = character(), ALT = character()) } -## Report indels in gene regions +## Report indels in gene regions as 2 tables: gene details and allele fractions write_tsv( indels.Feat.df, file.path( @@ -473,6 +490,17 @@ write_tsv( ) ) +write_tsv( + indels.AF, + file.path( + varDir, + paste0( + argv$samplegroup, + "AFfiltIndels.tsv" + ) + ) +) + #### Get gene details for SNPs #### snpCDSAttr <- pf_featuresNovar[ findOverlaps(GRanges(AApred), GRanges(pf_featuresNovar), @@ -498,16 +526,16 @@ geneDetail <- map(baseIDEvents, function(ID) { ) }) |> list_rbind() -if (nrow(geneDetail) > 0){ - snps.Feat.df <- cbind(snps.Feat.df, geneDetail) |> - mutate( - Gene = case_when( - is.na(GeneName) ~ Description |> - str_replace_all(pattern = "\\+", replacement = " "), - TRUE ~ GeneName - ), - GeneName = NULL, Description = NULL, ID = NULL - ) +if (nrow(geneDetail) > 0) { + snps.Feat.df <- cbind(snps.Feat.df, geneDetail) |> + mutate( + Gene = case_when( + is.na(GeneName) ~ Description |> + str_replace_all(pattern = "\\+", replacement = " "), + TRUE ~ GeneName + ), + GeneName = NULL, Description = NULL, ID = NULL + ) } SNPdetails <- left_join( @@ -533,7 +561,7 @@ events.stats.df <- data.frame( write_tsv( events.stats.df, file.path( - varDir, paste0( + varDir, paste0( argv$samplegroup, critSamplesSom, "plusstats.Qcrit", argv$QUALcrit, ".tsv" ) diff --git a/modules/stvariant.nf b/modules/stvariant.nf index 4b2ef1d..2948ad9 100644 --- a/modules/stvariant.nf +++ b/modules/stvariant.nf @@ -1,5 +1,4 @@ - process Bcf{ label 'Bcftools' tag "${groupId}" @@ -17,7 +16,7 @@ process Bcf{ script: """ bcftools mpileup -Ou --max-depth 800 --threads ${task.cpus} \ - -f ${ref}.fasta \ + -a AD,DP -f ${ref}.fasta \ --bam-list ${bamlist} | \ bcftools call --ploidy 1 --threads ${task.cpus} -mv -Ov \ --output "${groupId}.vcf" - From 4aedbb6f43c0797ea724e19f451b220f7d41926b Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Thu, 10 Oct 2024 09:25:04 +1100 Subject: [PATCH 2/4] filterBcfVcf.R: make bcf AD/DP the default AF to report --- Rtools/malDrugR/filterBcfVcf.R | 102 +++++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/Rtools/malDrugR/filterBcfVcf.R b/Rtools/malDrugR/filterBcfVcf.R index 614cb2f..c7daa4b 100755 --- a/Rtools/malDrugR/filterBcfVcf.R +++ b/Rtools/malDrugR/filterBcfVcf.R @@ -285,6 +285,8 @@ countalleles <- function(bamfile, vcf) { use.names = TRUE ) readCounts <- map(names(vcf), function(Name) { + ## this did not give good enough results for indels - too many events not + ## contained in reads. vcfgr <- rowRanges(vcf)[Name] width1stAlt <- unlist(vcfgr$ALT)[1] |> width() eventSeq <- Reads[ @@ -292,7 +294,7 @@ countalleles <- function(bamfile, vcf) { subjectHits() ] PosInSeq <- mapToAlignments(vcfgr, eventSeq) - refwidthDNA <- + refDNA <- subseq(mcols(eventSeq)$seq, start = start(PosInSeq), width = width(vcfgr$REF) @@ -303,7 +305,7 @@ countalleles <- function(bamfile, vcf) { width = width1stAlt ), error = function(e) { - print(paste(vcfgr, "ALT does not fit within reads. Skipping")) + print(paste(vcfgr, "ALT not within reads. Skipping")) } ) countresult <- data.frame( @@ -325,20 +327,75 @@ countalleles <- function(bamfile, vcf) { }) |> list_rbind() } +#### Get Alt allele fractions for vcf records from added geno fields #### +#### Report AF for 1st ALT not in reference sample +altAF <- function(vcf) { + GTparent <- geno(vcf)$GT[, parentlist] |> + as.numeric() |> + pmax() + names(GTparent) <- names(geno(vcf)$GT[, parentlist]) + alldepths <- as.data.frame(geno(vcf)$AD) |> + cbind(GTparent) |> + rownames_to_column(var = "variant") |> + pivot_longer( + cols = !c("variant", "GTparent"), + names_to = "SampleName", + values_to = "AD" + ) + alldepths$NewAltDepth <- + # Plus 2 because GT is 0-based; GTparent+1 is index of highest parent GT + apply(alldepths, 1, function(row) unlist(row$AD)[row$GTparent + 2]) + + newAF <- left_join( + alldepths, + as.data.frame(geno(vcf)$DP) |> + rownames_to_column(var = "variant") |> + pivot_longer(cols = !"variant", names_to = "SampleName", values_to = "DP"), + ) |> mutate( + NewAltFrac = paste0(NewAltDepth, "/", DP) + ) + ## Add ALT DNA, clean up and pivot wider for report + alts <- as.data.frame(alt(vcf)) |> + group_by(group) |> + summarize(altDNA = list(value)) + alts$variant <- names(vcf) + alts$GTparent <- GTparent + alts$AF_DNA <- + apply(alts, 1, function(row) unlist(row$altDNA)[row$GTparent + 1]) + + return( + newAF |> + dplyr::select(-AD, -NewAltDepth, -DP) |> + pivot_wider(names_from = SampleName, values_from = NewAltFrac) |> + left_join(alts) |> + dplyr::select( + variant, GTparent, AF_DNA, all_of(c(parentlist, samplesOI)) + ) + ) +} + + if (nrow(snpCDS) > 0) { - SNPalleleCounts <- map( - c( - paste0(samplesOI, "_nodup.bam"), - paste0(parentlist, "_nodup.bam") - ), - countalleles, - vcf = snpCDS - ) |> - list_rbind() + if (("AD" %in% rownames(geno(header(snpCDS))) & + "DP" %in% rownames(geno(header(snpCDS))) + )) { #### SNP allele counts from vcf geno fields + SNPalleleCounts <- altAF(snpCDS) + } else { #### SNP allele counts from bam files + SNPalleleCounts <- map( + c( + paste0(samplesOI, "_nodup.bam"), + paste0(parentlist, "_nodup.bam") + ), + countalleles, + vcf = snpCDS + ) |> + list_rbind() + } if (nrow(SNPalleleCounts) > 1) { SNPalleleCounts <- arrange(SNPalleleCounts, variant) } + #### Annotate SNPs #### #### Load genome and transcript db transcriptdb <- file.path( @@ -432,18 +489,16 @@ if (nrow(indelGene) > 0) { select = "last" ) ] - #### Get Alt allele fractions for indels from added geno fields #### - indels.AF <- left_join( - as.data.frame(geno(indelGene)$AD) |> - rownames_to_column(var = "variant") |> - pivot_longer(cols = !"variant", names_to = "SampleName", values_to = "AD"), - as.data.frame(geno(indelGene)$AF) |> - rownames_to_column(var = "variant") |> - pivot_longer(cols = !"variant", names_to = "SampleName", values_to = "DP"), - ) |> mutate( - AltFrac = paste0(AD, "/", DP), - AD = NULL, DP = NULL - ) + #### Get Alt allele fractions for indels from geno fields if available #### + #### Report AF for 1st ALT not in reference sample. + #### Function defined above in SNP allele counts + if (("AD" %in% rownames(geno(header(indelGene))) & + "DP" %in% rownames(geno(header(snpCindelGeneDS))) + )) { #### SNP allele counts from vcf geno fields + indels.AF <- altAF(indelGene) + } else { #### no allele frequencies available + indels.AF <- data.frame() + } #### Get gene details for indels #### indels.Feat.df <- data.frame( @@ -476,6 +531,7 @@ if (nrow(indelGene) > 0) { ) } else { indels.Feat.df <- data.frame(Gene = character(), ALT = character()) + indels.AF <- data.frame() } ## Report indels in gene regions as 2 tables: gene details and allele fractions From 8882a50ca17178ad41394ce030c81866f13ae68a Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Thu, 10 Oct 2024 15:35:32 +1100 Subject: [PATCH 3/4] config/modules.config: increase initial GRIDSS mem allocation --- config/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/modules.config b/config/modules.config index 0c47014..2760781 100644 --- a/config/modules.config +++ b/config/modules.config @@ -57,7 +57,7 @@ process { withLabel: Gridss { cpus = 8 time = '12h' - memory = { 20.GB * task.attempt } + memory = { 40.GB * task.attempt } } withLabel: Rfilter { cpus = 8 From a068bf000cd1ab31e0e2c2dfaf1aa9f0083654dc Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Thu, 10 Oct 2024 15:39:37 +1100 Subject: [PATCH 4/4] filterBcfVcf.R: add AF_ to start of column names --- Rtools/malDrugR/filterBcfVcf.R | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/Rtools/malDrugR/filterBcfVcf.R b/Rtools/malDrugR/filterBcfVcf.R index c7daa4b..e2b5dce 100755 --- a/Rtools/malDrugR/filterBcfVcf.R +++ b/Rtools/malDrugR/filterBcfVcf.R @@ -294,7 +294,7 @@ countalleles <- function(bamfile, vcf) { subjectHits() ] PosInSeq <- mapToAlignments(vcfgr, eventSeq) - refDNA <- + refwidthDNA <- subseq(mcols(eventSeq)$seq, start = start(PosInSeq), width = width(vcfgr$REF) @@ -366,10 +366,12 @@ altAF <- function(vcf) { return( newAF |> dplyr::select(-AD, -NewAltDepth, -DP) |> - pivot_wider(names_from = SampleName, values_from = NewAltFrac) |> + pivot_wider(names_from = SampleName, values_from = NewAltFrac, + names_prefix = "AF_") |> left_join(alts) |> dplyr::select( - variant, GTparent, AF_DNA, all_of(c(parentlist, samplesOI)) + variant, GTparent, AF_DNA, + all_of(paste0("AF_", c(parentlist, samplesOI))) ) ) } @@ -390,6 +392,16 @@ if (nrow(snpCDS) > 0) { vcf = snpCDS ) |> list_rbind() + SNPalleleCounts |> mutate( + SNPalleleCounts, + AltFrac = paste0(AltCount, "/", TotCount), + TotCount = NULL, RefCount = NULL, AltCount = NULL + ) |> + pivot_wider( + names_from = c(SampleName), + values_from = c(AltFrac), + names_prefix = "AF_" + ) } if (nrow(SNPalleleCounts) > 1) { @@ -459,15 +471,7 @@ if (nrow(snpCDS) > 0) { } SNPdetails <- left_join( nonsynSNP, - SNPalleleCounts |> mutate( - AltFrac = paste0(AltCount, "/", TotCount), - TotCount = NULL, RefCount = NULL, AltCount = NULL - ) |> - pivot_wider( - names_from = c(SampleName), - values_from = c(AltFrac), - names_prefix = "AF_" - ), + SNPalleleCounts, by = join_by("SNP" == "variant") ) |> dplyr::select(!SNP) @@ -493,7 +497,7 @@ if (nrow(indelGene) > 0) { #### Report AF for 1st ALT not in reference sample. #### Function defined above in SNP allele counts if (("AD" %in% rownames(geno(header(indelGene))) & - "DP" %in% rownames(geno(header(snpCindelGeneDS))) + "DP" %in% rownames(geno(header(indelGene))) )) { #### SNP allele counts from vcf geno fields indels.AF <- altAF(indelGene) } else { #### no allele frequencies available