From 3ae970890bba04326ed5a1d02aff2f8bcca902ef Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Thu, 11 Apr 2024 17:26:51 +0100
Subject: [PATCH 01/18] fixed pipeline failure when only a single sample is
 provided

---
 scripts/gridsearch_results_filtering.R | 100 +++++++++++++++------
 scripts/qdnaseq_rel_to_abs.R           |   4 +-
 2 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R
index 56851f9..91dba31 100644
--- a/scripts/gridsearch_results_filtering.R
+++ b/scripts/gridsearch_results_filtering.R
@@ -32,7 +32,9 @@ collapse_rds <- function(rds.list){
       comb <- combine(comb,add)
     }
     rds.obj <- comb
-  }
+  } else {
+    rds.obj <- comb
+  }
   return(rds.obj)
 }
 # Combine and load rds objects
@@ -103,57 +105,109 @@ if(!dir.exists(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_d
   dir.create(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots"))
 }
 
-#relative_smoothed
-#Plot absolute CN fits for assessment
-foreach(i=unique(pruned_results$SAMPLE_ID)) %dopar% {
+if(length(unique(pruned_results$SAMPLE_ID)) == 1){
+  i <- unique(pruned_results$SAMPLE_ID)
   dat <- pruned_results %>%
     filter(SAMPLE_ID == i) %>%
     arrange(ploidy)
   #arrange(rank_clonality)
-  x <- relative_smoothed[, i]
+  x <- relative_smoothed
   cn <- assayDataElement(x,"copynumber")
   seg <- assayDataElement(x,"segmented")
   rel_ploidy <- mean(cn,na.rm=T)
   ll <- nrow(dat)
   png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"), w= 450*ll, h = 350)
-  par(mfrow = c(1,ll))
+  par(mfrow = c(1,ll))
   for(n in 1:nrow(dat)){
-
     ploidy <- dat[n,]$ploidy
     purity <- dat[n,]$purity
     cellploidy <- ploidy*purity+2*(1-purity)
     seqdepth <- rel_ploidy/cellploidy
-
     expTP53 <- round(dat[n,]$expected_TP53_AF, 2)
     TP53 <- dat[n,]$TP53freq
+
+    #convert to abs
-
+
     pData(x)$ploidy <- ploidy
     pData(x)$purity <- purity
-
+
     temp <- x
     abs_cn <- depthtocn(cn,purity,seqdepth)
     abs_seg <- depthtocn(seg,purity,seqdepth)
     assayDataElement(temp,"copynumber") <- abs_cn
     assayDataElement(temp,"segmented") <- abs_seg
-
+
     #tmp_abs <- convert_rd_to_cn(x)
-    # plot
+    # plot
     if(ploidy>5){
       yrange=15
-    }else
-    {
+    } else {
       yrange=10
     }
-    plot(temp,doCalls=FALSE,showSD=TRUE,logTransform=FALSE,ylim=c(0,yrange),ylab="Absolute tumour CN",
-         main=paste(i, " eTP53=",round(expTP53,2),
-                    " AF=", round(TP53,2),
-                    " p=",round(purity,2),
-                    " pl=",round(ploidy,2),
-                    sep=""),cex.main=0.8)
-    abline(h=1:9,col = "blue")
-
+    plot(temp,doCalls=FALSE,showSD=TRUE,logTransform=FALSE,ylim=c(0,yrange),ylab="Absolute tumour CN",
+         main=paste(i, " eTP53=",round(expTP53,2),
+                    " AF=", round(TP53,2),
+                    " p=",round(purity,2),
+                    " pl=",round(ploidy,2),
+                    sep=""),cex.main=0.8)
+    abline(h=1:9,col = "blue")
+
   }
   dev.off()
+} else {
+  #relative_smoothed
+  #Plot absolute CN fits for assessment
+  foreach(i=unique(pruned_results$SAMPLE_ID)) %dopar% {
+    dat <- pruned_results %>%
+      filter(SAMPLE_ID == i) %>%
+      arrange(ploidy)
+    #arrange(rank_clonality)
+    x <- relative_smoothed[, i]
+    cn <- assayDataElement(x,"copynumber")
+    seg <- assayDataElement(x,"segmented")
+    rel_ploidy <- mean(cn,na.rm=T)
+    ll <- nrow(dat)
+    png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"), w= 450*ll, h = 350)
+    par(mfrow = c(1,ll))
+    for(n in 1:nrow(dat)){
+
+      ploidy <- dat[n,]$ploidy
+      purity <- dat[n,]$purity
+      cellploidy <- ploidy*purity+2*(1-purity)
+      seqdepth <- rel_ploidy/cellploidy
+
+      expTP53 <- round(dat[n,]$expected_TP53_AF, 2)
+      TP53 <- dat[n,]$TP53freq
+
+      #convert to abs
+
+      pData(x)$ploidy <- ploidy
+      pData(x)$purity <- purity
+
+      temp <- x
+      abs_cn <- depthtocn(cn,purity,seqdepth)
+      abs_seg <- depthtocn(seg,purity,seqdepth)
+      assayDataElement(temp,"copynumber") <- abs_cn
+      assayDataElement(temp,"segmented") <- abs_seg
+
+      #tmp_abs <- convert_rd_to_cn(x)
+      # plot
+      if(ploidy>5){
+        yrange=15
+      } else {
+        yrange=10
+      }
+      plot(temp,doCalls=FALSE,showSD=TRUE,logTransform=FALSE,ylim=c(0,yrange),ylab="Absolute tumour CN",
+           main=paste(i, " eTP53=",round(expTP53,2),
+                      " AF=", round(TP53,2),
+                      " p=",round(purity,2),
+                      " pl=",round(ploidy,2),
+                      sep=""),cex.main=0.8)
+      abline(h=1:9,col = "blue")
+
+    }
+    dev.off()
+  }
 }
+
+#END
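The two collapse_rds() fixes in this patch guard the same failure: the original function only assigned rds.obj inside the if(length(rds.list) > 1) branch, so a run with a single input RDS died with an "object not found" error at return(). A stripped-down sketch of the scoping bug, using a toy list rather than the pipeline's QDNAseq objects:

collapse_rds_old <- function(rds.list){
  comb <- rds.list[[1]]
  if(length(rds.list) > 1){
    # combine() the remaining objects onto comb here
    rds.obj <- comb
  }
  return(rds.obj)
}
collapse_rds_old(list("sample_1"))
# Error in collapse_rds_old(list("sample_1")): object 'rds.obj' not found

The added else branch simply assigns rds.obj <- comb in the single-object case, which is all the fix requires; the same pattern is applied to the plotting code below, where relative_smoothed is used whole instead of being subset by sample.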
diff --git a/scripts/qdnaseq_rel_to_abs.R b/scripts/qdnaseq_rel_to_abs.R
index cf36cc3..af596f5 100644
--- a/scripts/qdnaseq_rel_to_abs.R
+++ b/scripts/qdnaseq_rel_to_abs.R
@@ -17,8 +17,8 @@ cores <- as.numeric(snakemake@threads)
 registerDoMC(cores)
 
 qc.data <- qc.data[qc.data$use == "TRUE",]
-
 rds.filename <- snakemake@input[["rds"]]
+
 rds.list <- lapply(rds.filename,FUN=function(x){readRDS(x)})
 
 collapse_rds <- function(rds.list){
@@ -29,6 +29,8 @@ collapse_rds <- function(rds.list){
     comb <- combine(comb,add)
   }
   rds.obj <- comb
+  } else {
+    rds.obj <- comb
   }
   return(rds.obj)
 }

From c9dfb4591c561644a50a0b9a140fd730aabf9966 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Thu, 11 Apr 2024 18:44:53 +0100
Subject: [PATCH 02/18] added dev utility to report segment counts

---
 dev_tools/report_seg_counts.R | 49 +++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 dev_tools/report_seg_counts.R

diff --git a/dev_tools/report_seg_counts.R b/dev_tools/report_seg_counts.R
new file mode 100644
index 0000000..386e3bf
--- /dev/null
+++ b/dev_tools/report_seg_counts.R
@@ -0,0 +1,49 @@
+args <- commandArgs(trailingOnly=T)
+library(yaml)
+
+cat("report segments - use 'report_seg_counts.R all' for individal seg counts\n")
+
+config <- read_yaml(file="config/config.yaml")
+
+projectBin <- paste0(config$project_name,"_",config$bin,"kb")
+outputLoc <- paste0(config$out_dir,"sWGS_fitting/",projectBin,"/")
+
+pre <- "absolute_PRE_down_sampling/"
+post <- "absolute_POST_down_sampling/abs_cn_rds/"
+
+preFile <- paste0(outputLoc,pre,projectBin,"_relSmoothedCN.rds")
+postFile <- paste0(outputLoc,post,projectBin,"_ds_absCopyNumber.rds")
+
+verbose <- FALSE
+if(length(args) > 0){
+  if(args[1] == "all"){
+    verbose <- TRUE
+  }
+}
+
+if(file.exists(preFile)){
+  suppressMessages(library(QDNAseqmod))
+  suppressMessages(library(Biobase))
+  preS <- readRDS(preFile)
+  preS <- preS[featureData(preS)$use]
+  preSegs <- apply(assayDataElement(preS,"segmented"),MARGIN=2,function(x) length(rle(x)$lengths))
+  cat("\nPre-downsampled segments\n")
+  if(verbose){
+    print(preSegs)
+  } else {
+    print(summary(preSegs))
+  }
+  if(file.exists(postFile)){
+    postS <- readRDS(postFile)
+    postS <- postS[featureData(postS)$use]
+    postSegs <- apply(assayDataElement(postS,"segmented"),MARGIN=2,function(x) length(rle(x)$lengths))
+    cat("\nPost-downsampled segments\n")
+    if(verbose){
+      print(postSegs)
+    } else {
+      print(summary(postSegs))
+    }
+  }
+} else {
+  cat("no pre or post downsampled files found\n")
+}
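The dev utility above counts segments per sample with run-length encoding: consecutive bins that share a segmented value belong to the same segment, so the number of runs equals the number of segments. A minimal illustration of the same reduction on a toy vector (values invented):

segvals <- c(2, 2, 2, 3, 3, 1, 1, 1, 1)  # three segments across nine bins
length(rle(segvals)$lengths)
# [1] 3

Applied column-wise over assayDataElement(obj, "segmented"), as the apply(..., MARGIN=2, ...) call does, this yields one segment count per sample, which is what the pre/post downsampling summaries report.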
From 08891089c8d99dde1462ae98050a844350f07a5c Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Wed, 17 Apr 2024 16:50:49 +0100
Subject: [PATCH 03/18] implemented setting seed in config and updated scripts
 to use new seed variable to fix segmentation - set to TRUE by default

---
 config/config.yaml            |  9 +++++++--
 dev_tools/report_seg_counts.R |  2 +-
 rules/downsampled_rel_rds.smk |  4 +++-
 rules/rel_rds.smk             |  4 +++-
 schemas/config.schema.yaml    |  9 ++++++++-
 scripts/qdnaseq_mod.R         | 13 +++++++++++--
 scripts/qdnaseq_mod_ds.R      | 17 ++++++++++++++---
 update_configs.py             |  2 ++
 8 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 91f0217..9caa5ca 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -3,17 +3,22 @@
 samplesheet: "sample_sheet.tsv"
 
 # Output location
-out_dir: "/mnt/scratcha/fmlab/smith10/britroc/"
+out_dir: "/mnt/scratcha/fmlab/smith10/swgs_test/"
 
 # Bin sizes
 # By default any in [1,5,15,30,50,100,500,1000]
 bins:
 - 30
 
-project_name: "britroc"
+project_name: "nm_test"
 
 # Pipeline parameters
 af_cutoff: 0.15
 
+# Set seed for CBS - TRUE or FALSE
+# default TRUE
+use_seed: "TRUE"
+seed_val: "9999"
+
 # Not implemented
 #custom_bin: false
 #custom_bin_folder: "/custom_bin_data/"
diff --git a/dev_tools/report_seg_counts.R b/dev_tools/report_seg_counts.R
index 386e3bf..74051e4 100644
--- a/dev_tools/report_seg_counts.R
+++ b/dev_tools/report_seg_counts.R
@@ -1,7 +1,7 @@
 args <- commandArgs(trailingOnly=T)
 library(yaml)
 
-cat("report segments - use 'report_seg_counts.R all' for individal seg counts\n")
+cat("report segments - use 'report_seg_counts.R all' for individual seg counts\n")
 
 config <- read_yaml(file="config/config.yaml")
 
diff --git a/rules/downsampled_rel_rds.smk b/rules/downsampled_rel_rds.smk
index e6d4254..8218978 100644
--- a/rules/downsampled_rel_rds.smk
+++ b/rules/downsampled_rel_rds.smk
@@ -7,6 +7,8 @@ rule ds_relRDS:
     params:
         outdir=OUT_DIR,
         project="{project}",
-        bin="{bin}"
+        bin="{bin}",
+        use_seed=config["use_seed"],
+        seed_val=config["seed_val"]
     script:
         "../scripts/qdnaseq_mod_ds.R"
diff --git a/rules/rel_rds.smk b/rules/rel_rds.smk
index 812f01f..3e5d406 100644
--- a/rules/rel_rds.smk
+++ b/rules/rel_rds.smk
@@ -7,7 +7,9 @@ rule relRDS:
         bin="{bin}",
         outdir=OUT_DIR,
         project="{project}",
-        meta=config["samplesheet"]
+        meta=config["samplesheet"],
+        use_seed=config["use_seed"],
+        seed_val=config["seed_val"]
     script:
         "../scripts/qdnaseq_mod.R"
diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml
index 7179698..8ce5948 100644
--- a/schemas/config.schema.yaml
+++ b/schemas/config.schema.yaml
@@ -18,6 +18,12 @@ properties:
     type: number
     min: 0
     max: 1.0
+  use_seed:
+    type: string
+    enum: ["TRUE","FALSE"]
+  seed_val:
+    type: string
+
 
 # entries that have to be in the config file for successful validation
 required:
@@ -25,4 +31,5 @@ required:
   - out_dir
   - bins
   - project_name
-
+  - use_seed
+  - seed_val
diff --git a/scripts/qdnaseq_mod.R b/scripts/qdnaseq_mod.R
index 15e0da7..f472d01 100644
--- a/scripts/qdnaseq_mod.R
+++ b/scripts/qdnaseq_mod.R
@@ -7,6 +7,8 @@ ncores <- 1
 output_dir <- snakemake@params[["outdir"]]
 project <- snakemake@params[["project"]]
 metafile <- snakemake@params[["meta"]]
+use_seed <- snakemake@params[["use_seed"]]
+seed_val <- snakemake@params[["seed_val"]]
 metadata <- read.table(file = metafile,header=T,sep="\t")
 bam_list <- snakemake@input[["bams"]]
 outname <- snakemake@output[[1]]
@@ -42,8 +44,15 @@ assayDataElement(copyNumbers[[1]],"copynumber") <- assayDataElement(copyNumbers[
 
 # smooth outliers (Data is now ready to be analyzed with a downstream package of choice (exportBins))
 copyNumbersSmooth <- mclapply(X=copyNumbers, FUN=smoothOutlierBins, mc.cores=1)
 
+# Implement seeding to prevent variable segments on repeated runs
+if(use_seed){
+  seed <- as.character(seed_val)
+} else {
+  seed <- NULL
+}
+
 # perform segmentation on bins and save it
-copyNumbersSegmented <- mclapply(X=copyNumbersSmooth, FUN=segmentBins, transformFun="sqrt", mc.cores=ncores)
+copyNumbersSegmented <- mclapply(X=copyNumbersSmooth, FUN=segmentBins, transformFun="sqrt", mc.cores=ncores,seeds=seed)
 
 # For each
 smooth_samples <- function(obj){
@@ -64,7 +73,7 @@ smooth_samples <- function(obj){
     segnum<-as.numeric(lapply(segments,function(x){length(x$lengths)}))
     while(sum(segnum>maxseg)&sdadjust<5)
     {
-      currsamp<-segmentBins(currsamp, transformFun="sqrt",undo.SD=sdadjust)
+      currsamp<-segmentBins(currsamp, transformFun="sqrt",undo.SD=sdadjust,seeds=seed)
       segments <- assayDataElement(currsamp, "segmented")[condition, , drop=FALSE]
       segments<-apply(segments,2,rle)
       segnum<-as.numeric(lapply(segments,function(x){length(x$lengths)}))
diff --git a/scripts/qdnaseq_mod_ds.R b/scripts/qdnaseq_mod_ds.R
index d67517a..eb56c86 100644
--- a/scripts/qdnaseq_mod_ds.R
+++ b/scripts/qdnaseq_mod_ds.R
@@ -10,6 +10,9 @@ metafile <- snakemake@input[["meta"]]
 metadata <- read.table(file = metafile,header=T,sep="\t")
 bam_list <- snakemake@input[["bam"]]
 outname <- snakemake@output[[1]]
+use_seed <- snakemake@params[["use_seed"]]
+seed_val <- snakemake@params[["seed_val"]]
+
 
 suppressMessages(library(parallel))
 suppressMessages(library(tidyverse))
@@ -42,8 +45,16 @@ assayDataElement(copyNumbers[[1]],"copynumber") <- assayDataElement(copyNumbers[
 
 # smooth outliers (Data is now ready to be analyzed with a downstream package of choice (exportBins))
 copyNumbersSmooth <- mclapply(X=copyNumbers, FUN=smoothOutlierBins, mc.cores=1)
-
-# perform segmentation on bins and save it
-copyNumbersSegmented <- mclapply(X=copyNumbersSmooth, FUN=segmentBins, transformFun="sqrt", mc.cores=ncores)
+
+# Implement seeding to prevent variable segments on repeated runs
+if(use_seed){
+  seed <- as.character(seed_val)
+} else {
+  seed <- NULL
+}
+
+# perform segmentation on bins and save it
+copyNumbersSegmented <- mclapply(X=copyNumbersSmooth, FUN=segmentBins, transformFun="sqrt", mc.cores=ncores,seeds=seed)
 
 changeSampleName <- function(CNsObj)
 {
@@ -75,7 +86,7 @@ smooth_samples <- function(obj){
     segnum<-as.numeric(lapply(segments,function(x){length(x$lengths)}))
     while(sum(segnum>maxseg)&sdadjust<5)
     {
-      currsamp<-segmentBins(currsamp, transformFun="sqrt",undo.SD=sdadjust)
+      currsamp<-segmentBins(currsamp, transformFun="sqrt",undo.SD=sdadjust,seeds=seed)
       segments <- assayDataElement(currsamp, "segmented")[condition, , drop=FALSE]
       segments<-apply(segments,2,rle)
       segnum<-as.numeric(lapply(segments,function(x){length(x$lengths)}))
diff --git a/update_configs.py b/update_configs.py
index 2aab6b0..a77b78e 100755
--- a/update_configs.py
+++ b/update_configs.py
@@ -43,6 +43,8 @@ def get_input(x,y):
     config['out_dir'] = DoubleQuotedScalarString(get_input(config['out_dir'],'out_dir'))
     config['project_name'] = DoubleQuotedScalarString(get_input(config['project_name'],'project_name'))
     config['af_cutoff'] = float(get_input(config['af_cutoff'],'af_cutoff'))
+    config['use_seed'] = DoubleQuotedScalarString(get_input(config['use_seed'],'use_seed'))
+    config['seed_val'] = DoubleQuotedScalarString(get_input(config['seed_val'],'seed_val'))
     # Save updated yaml
     with open('config/config.yaml', 'w') as fp:
         yaml.dump(config, fp)
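The seed handling added across both QDNAseq scripts reduces to one small gate: translate the quoted config strings into either a seed value or NULL, then pass that through to segmentation. A sketch of the pattern, assuming (as the diff indicates) that the modified QDNAseqmod segmentBins() accepts a seeds= argument — stock QDNAseq does not:

use_seed <- "TRUE"   # snakemake@params[["use_seed"]], quoted string from config.yaml
seed_val <- "9999"   # snakemake@params[["seed_val"]]
seed <- if (use_seed == "TRUE") as.character(seed_val) else NULL
# seeds=seed  -> identical CBS segments on repeated runs
# seeds=NULL  -> segmentation is unseeded and repeat runs may differ

Keeping both values as quoted strings in config.yaml sidesteps YAML boolean coercion, at the cost of the string handling shown above.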
From 55f0cdd85ccb9ae31cdafb6880e409b2d7841d93 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Wed, 17 Apr 2024 18:29:16 +0100
Subject: [PATCH 04/18] removed *.csv file ext on tab-separated files and
 replaced with *.tsv

---
 rules/filter_gridsearch.smk                   | 2 +-
 rules/gridsearch.smk                          | 2 +-
 scripts/gridsearch_results_filtering.R        | 2 +-
 scripts/ploidy_purity_search_standard_error.R | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/rules/filter_gridsearch.smk b/rules/filter_gridsearch.smk
index 78df00d..4b0784e 100644
--- a/rules/filter_gridsearch.smk
+++ b/rules/filter_gridsearch.smk
@@ -1,6 +1,6 @@
 rule gridsearch_filter:
     input:
-        cl=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/clonality_results/{{project}}_{sample}_clonality.csv",sample=SAMPLES),
+        cl=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/clonality_results/{{project}}_{sample}_clonality.tsv",sample=SAMPLES),
         rds=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/relative_cn_rds/{{project}}_{sample}_{{bin}}kb_relSmoothedCN.rds",sample=SAMPLES)
     output:
         OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_fit_QC_predownsample.tsv"
diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk
index 086d852..7f345b0 100644
--- a/rules/gridsearch.smk
+++ b/rules/gridsearch.smk
@@ -2,7 +2,7 @@ rule gridsearch_fitting:
     input:
         expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/relative_cn_rds/{{project}}_{{sample}}_{{bin}}kb_relSmoothedCN.rds")
     output:
-        csv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.csv",
+        tsv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.tsv",
         pdf=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.pdf"
     params:
         bin="{bin}",
diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R
index 91dba31..4518750 100644
--- a/scripts/gridsearch_results_filtering.R
+++ b/scripts/gridsearch_results_filtering.R
@@ -44,7 +44,7 @@ saveRDS(relative_smoothed,paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/abs
 filelist <- snakemake@input[["cl"]]
 clonality <- do.call(rbind,
   lapply(filelist,FUN = function(x){
-    n <- gsub(pattern="_clonality.csv",rep="",x=x)
+    n <- gsub(pattern="_clonality.tsv",rep="",x=x)
     prefix <- paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/clonality_results/",project,"_")
     n <- gsub(pattern=prefix,rep="",x=n)
     tab <- read.table(x,sep="\t",skip=1)
diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R
index 95f108b..2198c8e 100644
--- a/scripts/ploidy_purity_search_standard_error.R
+++ b/scripts/ploidy_purity_search_standard_error.R
@@ -189,4 +189,4 @@
 print(ggplot(res,aes(x=ploidy,y=purity,fill=clonality))+geom_tile()+ theme_bw())
 dev.off()
 
-write.table(res,snakemake@output[["csv"]],sep="\t",quote=F,row.names=FALSE)
+write.table(res,snakemake@output[["tsv"]],sep="\t",quote=F,row.names=FALSE)

From f50b5d7bae6f68d404f520ef322d3c4bf100989 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Mon, 22 Apr 2024 18:54:58 +0100
Subject: [PATCH 05/18] implemented power filter config option - allows the
 user to disable the CN fit power filtering

---
 config/config.yaml                            |  4 +++
 rules/filter_gridsearch.smk                   |  3 ++-
 schemas/config.schema.yaml                    |  4 +++
 scripts/downsampleBams.R                      |  1 +
 scripts/gridsearch_results_filtering.R        | 26 ++++++++++++++++---
 scripts/ploidy_purity_search_standard_error.R |  4 +--
 update_configs.py                             |  1 +
 7 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 9caa5ca..16aa999 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -19,6 +19,10 @@ af_cutoff: 0.15
 use_seed: "TRUE"
 seed_val: "9999"
 
+# filter underpowered solutions - TRUE or FALSE
+# Default TRUE
+filter_underpowered: "FALSE"
+
 # Not implemented
 #custom_bin: false
 #custom_bin_folder: "/custom_bin_data/"
diff --git a/rules/filter_gridsearch.smk b/rules/filter_gridsearch.smk
index 4b0784e..a4add21 100644
--- a/rules/filter_gridsearch.smk
+++ b/rules/filter_gridsearch.smk
@@ -9,7 +9,8 @@ rule gridsearch_filter:
         meta=config["samplesheet"],
         project="{project}",
         outdir=OUT_DIR,
-        af_cutoff=config["af_cutoff"]
+        af_cutoff=config["af_cutoff"],
+        filter_underpowered=config["filter_underpowered"]
     threads: THREADS
     script:
         "../scripts/gridsearch_results_filtering.R"
diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml
index 8ce5948..d28f44a 100644
--- a/schemas/config.schema.yaml
+++ b/schemas/config.schema.yaml
@@ -23,6 +23,9 @@ properties:
     enum: ["TRUE","FALSE"]
   seed_val:
     type: string
+  filter_underpowered:
+    type: string
+    enum: ["TRUE","FALSE"]
 
 
 # entries that have to be in the config file for successful validation
 required:
@@ -33,3 +36,4 @@ required:
   - use_seed
   - seed_val
+  - filter_underpowered
diff --git a/scripts/downsampleBams.R b/scripts/downsampleBams.R
index e9d3084..f3c008d 100644
--- a/scripts/downsampleBams.R
+++ b/scripts/downsampleBams.R
@@ -26,6 +26,7 @@ perc <- fit.qc.filt %>%
   filter(SAMPLE_ID == sample_name) %>%
   .$ratio
 
+# Only downsample when the read ratio is at most 0.96 (i.e. not already close to, or above, the available reads)
 if( perc <= 0.96){
   cmd.downsample <- paste("samtools view -s ", perc," -b ",bam_in," > ",outname)
   cmd.index <- paste0("samtools index ",outname)
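The downsampling command built in downsampleBams.R is driven by a single ratio: reads required by the accepted fit over reads actually present in the BAM. A compact sketch of the construction with invented numbers (samtools view -s takes the subsampling fraction; in.bam/out.bam are placeholders):

downsample_depth <- 9.5e6            # reads required by the chosen ploidy/purity fit
total_reads <- 3.1e7                 # reads present in the input BAM
perc <- round(downsample_depth / total_reads, digits = 3)  # 0.306
if (perc <= 0.96) {
  cmd <- paste0("samtools view -s ", perc, " -b in.bam > out.bam")
  # system(cmd); system("samtools index out.bam")
}

Ratios above 0.96 are treated as "already at or below the required depth", so no subsampling command is issued for those samples.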
diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R
index 4518750..bf1fd61 100644
--- a/scripts/gridsearch_results_filtering.R
+++ b/scripts/gridsearch_results_filtering.R
@@ -14,10 +14,18 @@ metadata <- read.table(file = metafile,header=T,sep="\t")
 bin <- as.numeric(snakemake@params[["bin"]])
 out_dir <- snakemake@params[["outdir"]]
 project <- snakemake@params[["project"]]
+filter_underpowered <- snakemake@params[["filter_underpowered"]]
 af_cutoff <- as.numeric(snakemake@params[["af_cutoff"]])
 cores <- as.numeric(snakemake@threads)
 registerDoMC(cores)
 
+# Filter for powered fits only
+if(filter_underpowered == "TRUE"){
+  filter_underpowered <- TRUE
+} else {
+  filter_underpowered <- FALSE
+}
+
 # read in relative CN data
 # collapse rds files function
 rds.filename <- snakemake@input[["rds"]]
@@ -63,20 +71,30 @@ depthtocn<-function(x,purity,seqdepth) #converts readdepth to copy number given
 }
 
 #Top 10 when ranking by clonality and TP53
+print(filter_underpowered)
+print(clonality)
+
+filtered_results <- clonality
+
+if(filter_underpowered){
+  filtered_results <- filtered_results %>%
+    filter(powered == 1)
+}
 
-filtered_results <- clonality %>%
+filtered_results <- filtered_results %>% # filter underpowered fits when config variable is TRUE
   select(SAMPLE_ID, PATIENT_ID, everything()) %>%
-  filter(powered ==1) %>%
   group_by(SAMPLE_ID, ploidy) %>%
   mutate(rank_clonality = min_rank(clonality)) %>% #rank clonality within a unique ploidy state
-  filter(rank_clonality ==1) %>% #select ploidy with the lowest clonality within a unique ploidy state
+  filter(rank_clonality == 1) %>% #select ploidy with the lowest clonality within a unique ploidy state
   group_by(SAMPLE_ID) %>%
   top_n(-10, wt = clonality) %>% # select top 10 ploidy states with the lowest clonality values
   mutate(rank_clonality = min_rank(clonality)) %>% # rank by clonality within a sample across ploidies in top 10
-  # retain samples without TP53 mutations and where expected and observed TP53freq <=0.15
+  filter(expected_TP53_AF > 0) %>% # keep fits where TP53 is positive
+  # retain samples without TP53 mutations and where expected and observed TP53freq <=0.15
   filter(is.na(TP53freq) | near(expected_TP53_AF,TP53freq, tol = af_cutoff )) %>%
   arrange(PATIENT_ID, SAMPLE_ID )
 
+print(filtered_results)
 
 #Further limit the results by selecting the ploidy states with the lowest clonality values where multiple similar solutions are present.
 #Threshold of 0.3 used to select different states
diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R
index 2198c8e..9dbf99e 100644
--- a/scripts/ploidy_purity_search_standard_error.R
+++ b/scripts/ploidy_purity_search_standard_error.R
@@ -107,7 +107,7 @@ cntodepth<-function(cn,purity,seqdepth) #converts copy number to read depth give
 {
   seqdepth*((1-purity)*2+purity*cn)
 }
-## TP53 target bin
+## TP53 target bin - hg19@30kb
 target <- c("17:7565097-7590863")
 get_gene_seg <- function(target=NULL,abs_data=NULL){
   to_use <- fData(abs_data)$use
@@ -137,7 +137,7 @@ gene_bin_seg <- get_gene_seg(target = target,abs_data = rds.obj[[1]])
 
 #estimate absolute copy number fits for all samples in parallel
 ploidies<-seq(1.6,8,0.1)
-purities<-seq(0.05,1,0.01)
+purities<-seq(0.15,1,0.01)
 clonality<-c()
 #ind<-which(colnames(rds.obj)==sample)
 relcn<-rds.obj[[1]]
diff --git a/update_configs.py b/update_configs.py
index a77b78e..1038f98 100755
--- a/update_configs.py
+++ b/update_configs.py
@@ -45,6 +45,7 @@ def get_input(x,y):
     config['use_seed'] = DoubleQuotedScalarString(get_input(config['use_seed'],'use_seed'))
     config['seed_val'] = DoubleQuotedScalarString(get_input(config['seed_val'],'seed_val'))
+    config['filter_underpowered'] = DoubleQuotedScalarString(get_input(config['filter_underpowered'],'filter_underpowered'))
     # Save updated yaml
     with open('config/config.yaml', 'w') as fp:
         yaml.dump(config, fp)

From bd0162cbd41aff574ac532bfb89f38e2578eab19 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Tue, 23 Apr 2024 18:00:01 +0100
Subject: [PATCH 06/18] export segment table as final output and remove
 redundant functions

---
 scripts/ploidy_purity_search_standard_error.R | 68 -------------------
 scripts/qdnaseq_rel_to_abs.R                  | 39 +++++++++++
 2 files changed, 39 insertions(+), 68 deletions(-)

diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R
index 9dbf99e..3d27fd3 100644
--- a/scripts/ploidy_purity_search_standard_error.R
+++ b/scripts/ploidy_purity_search_standard_error.R
@@ -34,79 +34,11 @@ nbins_ref_genome <- sum(fData(rds.obj[[1]])$use)
 nbins<-nrow(bins)
 
 #define helper functions
-getSegTable<-function(x) #returns a table containing copy number segments
-{
-  dat<-x
-  sn<-assayDataElement(dat,"segmented")
-  fd <- fData(dat)
-  fd$use -> use
-  fdfiltfull<-fd[use,]
-  sn<-sn[use,]
-  segTable<-c()
-  for(c in unique(fdfiltfull$chromosome))
-  {
-    snfilt<-sn[fdfiltfull$chromosome==c]
-    fdfilt<-fdfiltfull[fdfiltfull$chromosome==c,]
-    sn.rle<-rle(snfilt)
-    starts <- cumsum(c(1, sn.rle$lengths[-length(sn.rle$lengths)]))
-    ends <- cumsum(sn.rle$lengths)
-    lapply(1:length(sn.rle$lengths), function(s) {
-      from <- fdfilt$start[starts[s]]
-      to <- fdfilt$end[ends[s]]
-      segValue <- sn.rle$value[s]
-      c(fdfilt$chromosome[starts[s]], from, to, segValue)
-    }) -> segtmp
-    segTableRaw <- data.frame(matrix(unlist(segtmp), ncol=4, byrow=T),stringsAsFactors=F)
-    segTable<-rbind(segTable,segTableRaw)
-  }
-  colnames(segTable) <- c("chromosome", "start", "end", "segVal")
-  segTable
-}
-
-getPloidy<-function(abs_profiles) #returns the ploidy of a sample from segTab or QDNAseq object
-{
-  out<-c()
-  samps<-getSampNames(abs_profiles)
-  for(i in samps)
-  {
-    if(class(abs_profiles)=="QDNAseqCopyNumbers")
-    {
-      segTab<-getSegTable(abs_profiles[,which(colnames(abs_profiles)==i)])
-    }
-    else
-    {
-      segTab<-abs_profiles[[i]]
-      colnames(segTab)[4]<-"segVal"
-    }
-    segLen<-(as.numeric(segTab$end)-as.numeric(segTab$start))
-    ploidy<-sum((segLen/sum(segLen))*as.numeric(segTab$segVal))
-    out<-c(out,ploidy)
-  }
-  data.frame(out,stringsAsFactors = F)
-}
-
-getSampNames<-function(abs_profiles) # convenience function for getting sample names from QDNAseq or segTab list
-{
-  if(class(abs_profiles)=="QDNAseqCopyNumbers")
-  {
-    samps<-colnames(abs_profiles)
-  }
-  else
-  {
-    samps<-names(abs_profiles)
-  }
-  samps
-}
-
 depthtocn<-function(x,purity,seqdepth) #converts readdepth to copy number given purity and single copy depth
 {
   (x/seqdepth-2*(1-purity))/purity
 }
 
-cntodepth<-function(cn,purity,seqdepth) #converts copy number to read depth given purity and single copy depth
-{
-  seqdepth*((1-purity)*2+purity*cn)
-}
 ## TP53 target bin - hg19@30kb
 target <- c("17:7565097-7590863")
 get_gene_seg <- function(target=NULL,abs_data=NULL){
   to_use <- fData(abs_data)$use
diff --git a/scripts/qdnaseq_rel_to_abs.R b/scripts/qdnaseq_rel_to_abs.R
index af596f5..68c74bc 100644
--- a/scripts/qdnaseq_rel_to_abs.R
+++ b/scripts/qdnaseq_rel_to_abs.R
@@ -151,5 +151,44 @@ res <- data.frame(res,stringsAsFactors = F)
 # Save rds
 saveRDS(abs_profiles,file=paste0(output_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_POST_down_sampling/abs_cn_rds/",project,"_",bin,"kb_ds_absCopyNumber.rds"))
 
+# save segTable
+getSegTable<-function(x){
+  if(inherits(x,what = "QDNAseqCopyNumbers",which = F)){
+    sn<-Biobase::assayDataElement(x,"segmented")
+    fd <- Biobase::fData(x)
+    fd$use -> use
+    fdfiltfull<-fd[use,]
+    sn<-sn[use,]
+    segTable<-c()
+    for(s in colnames(sn)){
+      for(c in unique(fdfiltfull$chromosome))
+      {
+        snfilt<-sn[fdfiltfull$chromosome==c,colnames(sn) == s]
+        fdfilt<-fdfiltfull[fdfiltfull$chromosome==c,]
+        sn.rle<-rle(snfilt)
+        starts <- cumsum(c(1, sn.rle$lengths[-length(sn.rle$lengths)]))
+        ends <- cumsum(sn.rle$lengths)
+        lapply(1:length(sn.rle$lengths), function(s) {
+          from <- fdfilt$start[starts[s]]
+          to <- fdfilt$end[ends[s]]
+          segValue <- sn.rle$value[s]
+          c(fdfilt$chromosome[starts[s]], from, to, segValue)
+        }) -> segtmp
+        segTableRaw <- data.frame(matrix(unlist(segtmp), ncol=4, byrow=T),sample = rep(s,times=nrow(matrix(unlist(segtmp), ncol=4, byrow=T))),stringsAsFactors=F)
+        segTable<-rbind(segTable,segTableRaw)
+      }
+    }
+    colnames(segTable) <- c("chromosome", "start", "end", "segVal","sample")
+    return(segTable)
+  } else {
+    # NON QDNASEQ BINNED DATA FUNCTION
+    stop("segtable error")
+  }
+}
+
+write.table(getSegTable(abs_profiles),
+            paste0(output_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_POST_down_sampling/abs_cn_rds/",project,"_",bin,"kb_ds_absCopyNumber_segTable.tsv"),
+            sep = "\t",quote=F,row.names=FALSE)
+
 #write table of fits
 write.table(res,paste0(output_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_POST_down_sampling/abs_cn_rds/",project,"_",bin,"kb_ds_abs_fits.tsv"),sep = "\t",quote=F,row.names=FALSE)
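The getSegTable() rebuilt here collapses per-bin segmented values back into genomic segments, again via rle(): within each chromosome, a run of identical segmented values spans from the start of its first bin to the end of its last. The core reduction, shown for a single chromosome with hypothetical 30kb bins:

binsize <- 30000
starts_bp <- seq(1, by = binsize, length.out = 6)   # bin start positions
segvals   <- c(2.1, 2.1, 2.1, 3.4, 3.4, 3.4)        # per-bin segmented values
r <- rle(segvals)
first <- cumsum(c(1, head(r$lengths, -1)))  # index of first bin in each run
last  <- cumsum(r$lengths)                  # index of last bin in each run
data.frame(start  = starts_bp[first],
           end    = starts_bp[last] + binsize - 1,
           segVal = r$values)
#   start    end segVal
# 1     1  90000    2.1
# 2 90001 180000    3.4

The exported version additionally loops over samples to add a sample column, producing the long-format segTable.tsv written as a final output.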
From 62c8db0ed404313068685fca5ac1b113bf906c8b Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Tue, 23 Apr 2024 18:15:16 +0100
Subject: [PATCH 07/18] default config setting

---
 config/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/config.yaml b/config/config.yaml
index 16aa999..ff7bd97 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -21,7 +21,7 @@ seed_val: "9999"
 
 # filter underpowered solutions - TRUE or FALSE
 # Default TRUE
-filter_underpowered: "FALSE"
+filter_underpowered: "TRUE"
 
 # Not implemented
 #custom_bin: false

From 811d17cb7e716f6920eb2e7af2546ef31c22c39c Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Tue, 23 Apr 2024 19:26:11 +0100
Subject: [PATCH 08/18] Addition of pl/pu config values and schema entries.
 Fixed schema entry filters. Added python shebang to update_configs.py.
 Additional config value error catches. Added default_config.yaml as
 reference file.

---
 config/config.yaml         | 20 +++++++++++++++---
 config/default_config.yaml | 42 ++++++++++++++++++++++++++++++++++++++
 rules/common.smk           | 15 +++++++++++++-
 schemas/config.schema.yaml | 29 +++++++++++++++++++++++---
 update_configs.py          | 14 ++++++++++++-
 5 files changed, 112 insertions(+), 8 deletions(-)
 create mode 100644 config/default_config.yaml

diff --git a/config/config.yaml b/config/config.yaml
index ff7bd97..3f4a358 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,15 +1,19 @@
 ---
 # Sample sheet
-samplesheet: "sample_sheet.tsv"
+samplesheet: sample_sheet.tsv
 
 # Output location
-out_dir: "/mnt/scratcha/fmlab/smith10/swgs_test/"
+out_dir: results/
 
 # Bin sizes
 # By default any in [1,5,15,30,50,100,500,1000]
+# Add new line for additional bin sizes
 bins:
 - 30
+#- 100
+
+# Set project dir name
-project_name: "nm_test"
+project_name: nm_test
 
 # Pipeline parameters
 af_cutoff: 0.15
@@ -23,6 +27,16 @@ seed_val: "9999"
 # Default TRUE
 filter_underpowered: "TRUE"
 
+# ploidy range
+# Default min = 1.6 | max = 8
+ploidy_min: 1.6
+ploidy_max: 8.0
+
+# purity range (1 >= max > min >= 0)
+# Default min = 0.15 | max = 1.00
+purity_min: 0.15
+purity_max: 1.0
+
 # Not implemented
 #custom_bin: false
 #custom_bin_folder: "/custom_bin_data/"
diff --git a/config/default_config.yaml b/config/default_config.yaml
new file mode 100644
index 0000000..3f4a358
--- /dev/null
+++ b/config/default_config.yaml
@@ -0,0 +1,42 @@
+---
+# Sample sheet
+samplesheet: sample_sheet.tsv
+
+# Output location
+out_dir: results/
+
+# Bin sizes
+# By default any in [1,5,15,30,50,100,500,1000]
+# Add new line for additional bin sizes
+bins:
+- 30
+#- 100
+
+# Set project dir name
+project_name: nm_test
+
+# Pipeline parameters
+af_cutoff: 0.15
+
+# Set seed for CBS - TRUE or FALSE
+# default TRUE
+use_seed: "TRUE"
+seed_val: "9999"
+
+# filter underpowered solutions - TRUE or FALSE
+# Default TRUE
+filter_underpowered: "TRUE"
+
+# ploidy range
+# Default min = 1.6 | max = 8
+ploidy_min: 1.6
+ploidy_max: 8.0
+
+# purity range (1 >= max > min >= 0)
+# Default min = 0.15 | max = 1.00
+purity_min: 0.15
+purity_max: 1.0
+
+# Not implemented
+#custom_bin: false
+#custom_bin_folder: "/custom_bin_data/"
diff --git a/rules/common.smk b/rules/common.smk
index c783063..9a81abd 100644
--- a/rules/common.smk
+++ b/rules/common.smk
@@ -67,4 +67,17 @@ BIN_VALS = config["bins"]
 BIN_DEF = [1,5,15,30,50,100,500,1000]
 
 if not set(BIN_VALS).issubset(BIN_DEF):
-    sys.exit("Some bin values are not available")
+    sys.exit("Config error - Some specified bin values are not available")
+
+##### CHECK MAX > MIN #####
+PLMIN=config["ploidy_min"]
+PLMAX=config["ploidy_max"]
+PUMIN=config["purity_min"]
+PUMAX=config["purity_max"]
+
+if PLMIN > PLMAX:
sys.exit("Config error - Minimum ploidy exceeds or is equal to maximum ploidy") + +if PUMIN > PUMAX: + sys.exit("Config error - Minimum purity exceeds or is equal to maximum purity") + diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml index d28f44a..986d038 100644 --- a/schemas/config.schema.yaml +++ b/schemas/config.schema.yaml @@ -12,12 +12,15 @@ properties: type: string bins: type: array + items: + type: number + uniqueItems: true project_name: type: string af_cutoff: type: number - min: 0 - max: 1.0 + minimum: 0 + maximum: 1.0 use_seed: type: string enum: ["TRUE","FALSE"] @@ -26,7 +29,22 @@ properties: filter_underpowered: type: string enum: ["TRUE","FALSE"] - + ploidy_min: + type: number + minimum: 1 + maximum: 20 + ploidy_max: + type: number + minimum: 1 + maximum: 20 + purity_min: + type: number + minimum: 0 + maximum: 1.0 + purity_max: + type: number + minimum: 0 + maximum: 1.0 # entries that have to be in the config file for successful validation required: @@ -37,3 +55,8 @@ required: - use_seed - seed_val - filter_underpowered + - ploidy_min + - ploidy_max + - purity_min + - purity_max + diff --git a/update_configs.py b/update_configs.py index 1038f98..f27ea43 100755 --- a/update_configs.py +++ b/update_configs.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + import argparse import re import ruamel.yaml @@ -27,7 +29,7 @@ def get_input(x,y): type=str, \ required=True, \ nargs=1, \ - choices=['config','slurm','pbs','lsf','local']) + choices=['config','filters','slurm','pbs','lsf','local']) parser.add_argument("--advanced", help="Modify advanced cluster configuration", action="store_true") args = parser.parse_args() @@ -42,10 +44,20 @@ def get_input(x,y): config['bins'] = list(get_input(config['bins'],'bins')) config['out_dir'] = DoubleQuotedScalarString(get_input(config['out_dir'],'out_dir')) config['project_name'] = DoubleQuotedScalarString(get_input(config['project_name'],'project_name')) +elif args.config[0] == "filters": + # Config file updates + print('Update config.yaml values (Return no value to keep the current value)') + file_name = 'config/config.yaml' + config, ind, bsi = ruamel.yaml.util.load_yaml_guess_indent(open(file_name)) + # Update config values config['af_cutoff'] = float(get_input(config['af_cutoff'],'af_cutoff')) config['use_seed'] = DoubleQuotedScalarString(get_input(config['use_seed'],'use_seed')) config['seed_val'] = DoubleQuotedScalarString(get_input(config['seed_val'],'seed_val')) config['filter_underpowered'] = DoubleQuotedScalarString(get_input(config['filter_underpowered'],'filter_underpowered')) + config['ploidy_min'] = float(get_input(config['ploidy_min'],'ploidy_min')) + config['ploidy_max'] = float(get_input(config['ploidy_max'],'ploidy_max')) + config['purity_min'] = float(get_input(config['purity_min'],'purity_min')) + config['purity_max'] = float(get_input(config['purity_max'],'purity_max')) # Save updated yaml with open('config/config.yaml', 'w') as fp: yaml.dump(config, fp) From 3daa7e33ba67acc74ccef9467a64865ea8efa54c Mon Sep 17 00:00:00 2001 From: Phil9S Date: Wed, 24 Apr 2024 17:20:02 +0100 Subject: [PATCH 09/18] enabled specfication of ploidy/purity range --- rules/gridsearch.smk | 6 +++++- scripts/ploidy_purity_search_standard_error.R | 11 +++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk index 7f345b0..a5aec18 100644 --- a/rules/gridsearch.smk +++ b/rules/gridsearch.smk @@ -7,6 +7,10 @@ rule gridsearch_fitting: params: bin="{bin}", outdir=OUT_DIR, - 
project="{project}" + project="{project}", + ploidy_min=config["ploidy_min"], + ploidy_max=config["ploidy_max"], + purity_min=config["purity_min"], + purity_max=config["purity_max"] script: "../scripts/ploidy_purity_search_standard_error.R" diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R index 3d27fd3..06991cb 100644 --- a/scripts/ploidy_purity_search_standard_error.R +++ b/scripts/ploidy_purity_search_standard_error.R @@ -10,6 +10,13 @@ out_dir <- snakemake@params[["outdir"]] project <- snakemake@params[["project"]] cores <- as.numeric(snakemake@threads) +# gridsearch params +pl_min <- snakemake@params[["ploidy_min"]] # default 1.6 +pl_max <- snakemake@params[["ploidy_max"]] # default 8 +pu_min <- snakemake@params[["purity_min"]] # default 0.15 +pu_max <- snakemake@params[["purity_max"]] # deafult 1 +#print(c(pl_min,pl_max,pu_min,pu_max)) + #load libraries suppressPackageStartupMessages(library(QDNAseqmod)) suppressPackageStartupMessages(library(Biobase)) @@ -68,8 +75,8 @@ get_gene_seg <- function(target=NULL,abs_data=NULL){ gene_bin_seg <- get_gene_seg(target = target,abs_data = rds.obj[[1]]) #estimate absolute copy number fits for all samples in parallel -ploidies<-seq(1.6,8,0.1) -purities<-seq(0.15,1,0.01) +ploidies<-seq(pl_min,pl_max,0.1) +purities<-seq(pu_min,pu_max,0.01) clonality<-c() #ind<-which(colnames(rds.obj)==sample) relcn<-rds.obj[[1]] From 4fa433ca02b63ebb02af01318a76d43ffcc629b2 Mon Sep 17 00:00:00 2001 From: Phil9S Date: Mon, 29 Apr 2024 16:03:25 +0100 Subject: [PATCH 10/18] updated sample_sheet_example.tsv and sample sheet schema defintion to support use of ploidy/purity columns --- sample_sheet_example.tsv | 20 ++++++++++---------- schemas/samples.schema.yaml | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/sample_sheet_example.tsv b/sample_sheet_example.tsv index 6dfe89a..cba55a0 100644 --- a/sample_sheet_example.tsv +++ b/sample_sheet_example.tsv @@ -1,10 +1,10 @@ -PATIENT_ID SAMPLE_ID TP53freq smooth file -PATIENT-1 SAMPLE_3 NA FALSE /data/SAMPLE_3.bam -PATIENT-2 SAMPLE_5 0.97604930362117 FALSE /data/SAMPLE_5.bam -PATIENT-2 SAMPLE_6 0.948429942418426 FALSE /data/SAMPLE_6.bam -PATIENT-3 SAMPLE_10 0.312743806009489 FALSE /data/SAMPLE_10.bam -PATIENT-3 SAMPLE_11 0.313365853658537 FALSE /data/SAMPLE_11.bam -PATIENT-3 SAMPLE_12 0.170947565543071 FALSE /data/SAMPLE_12.bam -PATIENT-3 SAMPLE_13 0.15861669829222 FALSE /data/SAMPLE_13.bam -PATIENT-3 SAMPLE_7 0.326712851405623 FALSE /data/SAMPLE_7.bam -PATIENT-3 SAMPLE_8 0.361060215053763 FALSE /data/SAMPLE_8.bam +PATIENT_ID SAMPLE_ID TP53freq smooth file ploidy purity +PATIENT-1 SAMPLE_3 NA FALSE /data/SAMPLE_3.bam NA NA +PATIENT-2 SAMPLE_5 0.97 FALSE /data/SAMPLE_5.bam NA NA +PATIENT-2 SAMPLE_6 0.94 FALSE /data/SAMPLE_6.bam NA NA +PATIENT-3 SAMPLE_10 0.31 TRUE /data/SAMPLE_10.bam NA NA +PATIENT-3 SAMPLE_11 NA FALSE /data/SAMPLE_11.bam NA NA +PATIENT-3 SAMPLE_12 0.17 FALSE /data/SAMPLE_12.bam NA NA +PATIENT-3 SAMPLE_13 0.15 FALSE /data/SAMPLE_13.bam NA NA +PATIENT-3 SAMPLE_7 0.32 TRUE /data/SAMPLE_7.bam NA NA +PATIENT-3 SAMPLE_8 0.36 FALSE /data/SAMPLE_8.bam NA NA diff --git a/schemas/samples.schema.yaml b/schemas/samples.schema.yaml index 4bf2173..8864e88 100644 --- a/schemas/samples.schema.yaml +++ b/schemas/samples.schema.yaml @@ -21,6 +21,22 @@ properties: smooth: description: "True or fale boolean as to apply segment smoothing or not" type: boolean + ploidy: + description: "sample ploidy" + anyOf: + - type: number + minimum: 1 
+        maximum: 20
+      - type: string
+        enum: ["NA"]
+  purity:
+    description: "sample purity"
+    anyOf:
+      - type: number
+        minimum: 0
+        maximum: 1.0
+      - type: string
+        enum: ["NA"]
 
 required:
   - SAMPLE_ID

From b38ccbbf4670f7bc0d32b2589ded148b023e90e3 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Mon, 29 Apr 2024 20:18:44 +0100
Subject: [PATCH 11/18] added functionality to specify precomputed ploidy and
 purity for a given sample. Can be provided together, or either ploidy or
 purity independently

---
 rules/gridsearch.smk                          |  1 +
 sample_sheet_example.tsv                      |  2 +-
 schemas/samples.schema.yaml                   |  8 +--
 scripts/gridsearch_results_filtering.R        | 11 ++--
 scripts/ploidy_purity_search_standard_error.R | 51 ++++++++++++++-----
 5 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk
index a5aec18..e391430 100644
--- a/rules/gridsearch.smk
+++ b/rules/gridsearch.smk
@@ -8,6 +8,7 @@ rule gridsearch_fitting:
         bin="{bin}",
         outdir=OUT_DIR,
         project="{project}",
+        meta=config["samplesheet"],
         ploidy_min=config["ploidy_min"],
         ploidy_max=config["ploidy_max"],
         purity_min=config["purity_min"],
diff --git a/sample_sheet_example.tsv b/sample_sheet_example.tsv
index cba55a0..3a690fb 100644
--- a/sample_sheet_example.tsv
+++ b/sample_sheet_example.tsv
@@ -1,4 +1,4 @@
-PATIENT_ID SAMPLE_ID TP53freq smooth file ploidy purity
+PATIENT_ID SAMPLE_ID TP53freq smooth file precPloidy precPurity
 PATIENT-1 SAMPLE_3 NA FALSE /data/SAMPLE_3.bam NA NA
 PATIENT-2 SAMPLE_5 0.97 FALSE /data/SAMPLE_5.bam NA NA
 PATIENT-2 SAMPLE_6 0.94 FALSE /data/SAMPLE_6.bam NA NA
diff --git a/schemas/samples.schema.yaml b/schemas/samples.schema.yaml
index 8864e88..b2c7120 100644
--- a/schemas/samples.schema.yaml
+++ b/schemas/samples.schema.yaml
@@ -21,16 +21,16 @@ properties:
   smooth:
     description: "True or false boolean as to apply segment smoothing or not"
     type: boolean
-  ploidy:
-    description: "sample ploidy"
+  precPloidy:
+    description: "pre-computed sample ploidy"
     anyOf:
       - type: number
         minimum: 1
         maximum: 20
       - type: string
         enum: ["NA"]
-  purity:
-    description: "sample purity"
+  precPurity:
+    description: "pre-computed sample purity"
     anyOf:
       - type: number
         minimum: 0
         maximum: 1.0
       - type: string
         enum: ["NA"]
diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R
index bf1fd61..422952b 100644
--- a/scripts/gridsearch_results_filtering.R
+++ b/scripts/gridsearch_results_filtering.R
@@ -7,8 +7,6 @@ suppressWarnings(library(foreach))
 ## Added by PS
 args = commandArgs(trailingOnly=TRUE)
 
-print(snakemake)
-
 metafile <- snakemake@params[["meta"]]
 metadata <- read.table(file = metafile,header=T,sep="\t")
 bin <- as.numeric(snakemake@params[["bin"]])
@@ -31,7 +29,6 @@ if(filter_underpowered == "TRUE"){
 rds.filename <- snakemake@input[["rds"]]
 rds.list <- lapply(rds.filename,FUN=function(x){readRDS(x)})
 
-
 collapse_rds <- function(rds.list){
   comb <- rds.list[[1]][[1]]
   if(length(rds.list) > 1){
@@ -63,7 +60,8 @@ colnames(clonality) <- c("SAMPLE_ID","ploidy","purity","clonality","downsample_d
 clonality <- left_join(clonality,metadata,by="SAMPLE_ID") %>%
   select(SAMPLE_ID,PATIENT_ID,ploidy,purity,clonality,downsample_depth,powered,TP53cn,expected_TP53_AF,TP53freq,smooth)
-
+
+print(clonality)
 ## Added by PS
 depthtocn<-function(x,purity,seqdepth) #converts readdepth to copy number given purity and single copy depth
 {
   (x/seqdepth-2*(1-purity))/purity
 }
 
 #Top 10 when ranking by clonality and TP53
-print(filter_underpowered)
-print(clonality)
-
 filtered_results <- clonality
 
 if(filter_underpowered){
   filtered_results <- filtered_results %>%
     filter(powered == 1)
 }
 
 filtered_results <- filtered_results %>% # filter underpowered fits when config variable is TRUE
   select(SAMPLE_ID, PATIENT_ID, everything()) %>%
   group_by(SAMPLE_ID, ploidy) %>%
   mutate(rank_clonality = min_rank(clonality)) %>% #rank clonality within a unique ploidy state
   filter(rank_clonality == 1) %>% #select ploidy with the lowest clonality within a unique ploidy state
   group_by(SAMPLE_ID) %>%
   top_n(-10, wt = clonality) %>% # select top 10 ploidy states with the lowest clonality values
   mutate(rank_clonality = min_rank(clonality)) %>% # rank by clonality within a sample across ploidies in top 10
-  filter(expected_TP53_AF > 0) %>% # keep fits where TP53 is positive
+  #filter(expected_TP53_AF > 0) %>% # keep fits where TP53 is positive
   # retain samples without TP53 mutations and where expected and observed TP53freq <=0.15
   filter(is.na(TP53freq) | near(expected_TP53_AF,TP53freq, tol = af_cutoff )) %>%
   arrange(PATIENT_ID, SAMPLE_ID )
 
-print(filtered_results)
 #Further limit the results by selecting the ploidy states with the lowest clonality values where multiple similar solutions are present.
 #Threshold of 0.3 used to select different states
diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R
index 06991cb..997969f 100644
--- a/scripts/ploidy_purity_search_standard_error.R
+++ b/scripts/ploidy_purity_search_standard_error.R
@@ -10,6 +10,9 @@ out_dir <- snakemake@params[["outdir"]]
 project <- snakemake@params[["project"]]
 cores <- as.numeric(snakemake@threads)
 
+metafile <- snakemake@params[["meta"]]
+metadata <- read.table(file = metafile,header=T,sep="\t")
+
 # gridsearch params
 pl_min <- snakemake@params[["ploidy_min"]] # default 1.6
 pl_max <- snakemake@params[["ploidy_max"]] # default 8
@@ -74,9 +77,30 @@ get_gene_seg <- function(target=NULL,abs_data=NULL){
 #Get target anchor gene segments
 gene_bin_seg <- get_gene_seg(target = target,abs_data = rds.obj[[1]])
 
+#adjust for precomputed pl/pu
+metaSample <- snakemake@wildcards[["sample"]]
+metaflt <- metadata[metadata$SAMPLE_ID == metaSample,]
+
+# set gridsearch limits based on availability of
+# precomputed pl/pu values
+if(!is.null(metaflt$precPloidy)){
+  if(!is.na(metaflt$precPloidy)){
+    pl_min <- metaflt$precPloidy
+    pl_max <- metaflt$precPloidy
+  }
+}
+
+if(!is.null(metaflt$precPurity)){
+  if(!is.na(metaflt$precPurity)){
+    pu_min <- metaflt$precPurity
+    pu_max <- metaflt$precPurity
+  }
+}
+
 #estimate absolute copy number fits for all samples in parallel
 ploidies<-seq(pl_min,pl_max,0.1)
 purities<-seq(pu_min,pu_max,0.01)
+
 clonality<-c()
 #ind<-which(colnames(rds.obj)==sample)
 relcn<-rds.obj[[1]]
@@ -88,14 +112,11 @@
 copynumber<-assayDataElement(relcn,"copynumber")
 rel_ploidy<-mean(copynumber,na.rm=T)
 num_reads<-sum(copynumber,na.rm=T)
 sample <- sampleNames(relcn)
-print(sample)
-res<-foreach(i=1:length(ploidies),.combine=rbind) %do%
-{
+#print(sample)
+res<-foreach(i=1:length(ploidies),.combine=rbind) %do% {
   ploidy<-ploidies[i]
-  #print(ploidy)
-  rowres<-foreach(j=1:length(purities),.combine=rbind)%do%
-  {
+  rowres<-foreach(j=1:length(purities),.combine=rbind) %do% {
     purity<-purities[j]
     downsample_depth<-(((2*(1-purity)+purity*ploidy)/(ploidy*purity))/purity)*15*(2*(1-purity)+purity*ploidy)*nbins_ref_genome*(1/0.91)
     cellploidy<-ploidy*purity+2*(1-purity)
     seqdepth<-rel_ploidy/cellploidy
     TP53cn<-round(depthtocn(median(seg[gene_bin_seg]),purity,seqdepth),1) # to 1 decimal place
     expected_TP53_AF<-TP53cn*purity/(TP53cn*purity+2*(1-purity))
     clonality<-mean(diffs)
-    c(ploidy,purity,clonality,downsample_depth,downsample_depth < rds.pdata$total.reads[row.names(rds.pdata)==sample],TP53cn,expected_TP53_AF)
+    r <- c(ploidy,purity,clonality,downsample_depth,downsample_depth < rds.pdata$total.reads[row.names(rds.pdata)==sample],TP53cn,expected_TP53_AF)
+    r <- as.data.frame(t(r))
+    return(r)
   }
-  rowres
+  return(as.data.frame(rowres))
 }
 
 colnames(res)<-c("ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF")
-rownames(res)<-1:nrow(res)
-res<-data.frame(res,stringsAsFactors = F)
-res<-data.frame(apply(res,2,as.numeric,stringsAsFactors=F))
-res<-res[order(res$clonality,decreasing=FALSE),]
+if(nrow(res) > 1){
+  rownames(res)<-1:nrow(res)
+  res<-data.frame(res,stringsAsFactors = F)
+  res<-data.frame(apply(res,2,as.numeric,stringsAsFactors=F))
+  res<-res[order(res$clonality,decreasing=FALSE),]
+} else {
+  rownames(res) <- 1
+}
 
 #output plot of clonality error landscape
 pdf(snakemake@output[["pdf"]])
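Two small formulas carry the fit diagnostics reported for every grid point above: depthtocn() inverts relative depth d to copy number via CN = (d/s - 2(1 - p))/p for purity p and single-copy depth s, and the expected TP53 allele fraction assumes every tumour copy carries the mutation. A worked check with illustrative values:

purity <- 0.5
TP53cn <- 1   # one remaining TP53 copy per tumour cell
expected_TP53_AF <- TP53cn * purity / (TP53cn * purity + 2 * (1 - purity))
expected_TP53_AF
# [1] 0.3333333

So a clonal TP53 mutation in a 50% pure tumour with single-copy TP53 should sit near AF 0.33; downstream filtering keeps fits whose observed TP53freq lies within af_cutoff of this expectation. Note also that when precPloidy and precPurity are both supplied, seq(pl, pl, 0.1) and seq(pu, pu, 0.01) each collapse to a single value, which is why the single-row res case above needs its own handling.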
colnames(res)<-c("ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF") -rownames(res)<-1:nrow(res) -res<-data.frame(res,stringsAsFactors = F) -res<-data.frame(apply(res,2,as.numeric,stringsAsFactors=F)) -res<-res[order(res$clonality,decreasing=FALSE),] +if(nrow(res) > 1){ + rownames(res)<-1:nrow(res) + res<-data.frame(res,stringsAsFactors = F) + res<-data.frame(apply(res,2,as.numeric,stringsAsFactors=F)) + res<-res[order(res$clonality,decreasing=FALSE),] +} else { + rownames(res) <- 1 +} #output plot of clonality error landscape pdf(snakemake@output[["pdf"]]) From e79e5f12c991d96198ad28cba33a42ba5ca9d085 Mon Sep 17 00:00:00 2001 From: Phil9S Date: Tue, 30 Apr 2024 19:51:38 +0100 Subject: [PATCH 12/18] added support for precomputed ploidy-purity and skipping stage_1 gridsearch fitting --- rules/downsample.smk | 1 + scripts/downsampleBams.R | 19 ++++-- scripts/processPrecomputed.R | 110 +++++++++++++++++++++++++++++++++++ scripts/qdnaseq_rel_to_abs.R | 5 ++ stage_1 | 2 + stage_2 | 9 +++ 6 files changed, 141 insertions(+), 5 deletions(-) create mode 100755 scripts/processPrecomputed.R diff --git a/rules/downsample.smk b/rules/downsample.smk index 8241350..2b8aa6d 100644 --- a/rules/downsample.smk +++ b/rules/downsample.smk @@ -7,6 +7,7 @@ rule downsample: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/downsampled_bams/{sample}.bam" params: outdir=OUT_DIR, + prplpu=prplpu, bin="{bin}", project="{project}", sample="{sample}" diff --git a/scripts/downsampleBams.R b/scripts/downsampleBams.R index f3c008d..7803d08 100644 --- a/scripts/downsampleBams.R +++ b/scripts/downsampleBams.R @@ -10,20 +10,29 @@ bin <- as.numeric(snakemake@params[["bin"]]) project <- snakemake@params[["project"]] outname <- snakemake@output[[1]] sample_name <- snakemake@params[["sample"]] +prplpu <- snakemake@params[["prplpu"]] -fit.qc <- read.table(file = meta,header = T,sep = "\t",na.strings = "") - -relative_smoothed <- readRDS(rds) -read.data <- phenoData(relative_smoothed)@data +#print(prplpu) +fit.qc <- read.table(file = meta,header = T,sep = "\t",na.strings = "") fit.qc.filt <- fit.qc %>% + filter(SAMPLE_ID == sample_name) %>% filter(use == TRUE) +if(prplpu == "TRUE"){ + cmd.totalreads <- paste0("samtools view -c -F 260 ",bam_in) + tot.reads <- as.numeric(system(cmd.totalreads,intern=TRUE)) + read.data <- data.frame(name=fit.qc.filt$SAMPLE_ID,total.reads=tot.reads) + print(read.data) +} else { + relative_smoothed <- readRDS(rds) + read.data <- phenoData(relative_smoothed)@data +} + fit.qc.filt$total.reads <- read.data$total.reads[match(x = fit.qc.filt$SAMPLE_ID,read.data$name)] fit.qc.filt$ratio <- round(fit.qc.filt$downsample_depth / fit.qc.filt$total.reads,digits = 3) perc <- fit.qc.filt %>% - filter(SAMPLE_ID == sample_name) %>% .$ratio # If read ratio is greater than 0.96 (i.e close to original or higher than available reads) diff --git a/scripts/processPrecomputed.R b/scripts/processPrecomputed.R new file mode 100755 index 0000000..b0b831e --- /dev/null +++ b/scripts/processPrecomputed.R @@ -0,0 +1,110 @@ +args <- commandArgs(trailingOnly=TRUE) +suppressPackageStartupMessages(library(yaml)) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(tidyr)) +suppressPackageStartupMessages(library(QDNAseqmod)) +suppressPackageStartupMessages(library(Biobase)) + +cat("[processPrecomputed] Generating file to skip stage_1\n") +cat("[processPrecomputed] Reading config and samplesheet files...\n") +config <- 
read_yaml(file="config/config.yaml") + +samplesheet <- read.table(config$samplesheet,header=TRUE,sep="\t") +projectBin <- paste0(config$project_name,"_",config$bin,"kb") +outputLoc <- paste0(config$out_dir,"sWGS_fitting/",projectBin,"/") + + +if(any(is.na(samplesheet$precPloidy)) | any(is.na(samplesheet$precPurity))){ + stop("some samples do not have precomputed ploidy and purity values - check the samplesheet") +} + +cat("[processPrecomputed] Loading genome bin data...\n") +bin <- config$bin +bin_size <- bin*1000 +bins<-getBinAnnotations(binSize = bin) +# actual number of bins varies in stage_1 due to filtering so total usable bins is adjusted to attempt to fix this +nbins_ref_genome <- round(sum(bins@data$use) * 0.932) #0.932 ratio between actual usable bins and total usable bins in bin annotation data + +pre <- "absolute_PRE_down_sampling/" +preFile <- paste0(outputLoc,pre,config$project_name,"_fit_QC_predownsample.tsv") + + +if(!dir.exists(outputLoc)){ + dir.create(outputLoc,recursive=TRUE) +} + +cat("[processPrecomputed] Verifying BAM files...\n") +# check bams +bam <- samplesheet$file +outname <- paste0(outputLoc,"bam.ok") +log_vector <- c() +check_vector <- c() +for(i in bam){ + cmd <- paste0("samtools quickcheck -q ",i) + if(system(cmd) == 0){ + log_vector <- append(log_vector, paste0("BAM valid - ",i)) + check_vector <- append(check_vector,FALSE) + } else { + log_vector <- append(log_vector,paste0("BAM invalid or missing - ",i)) + check_vector <- append(check_vector,TRUE) + } +} + +if(any(check_vector)){ + outname <- gsub(pattern = "ok",replacement = "invalid",outname) + writeLines(text = as.character(log_vector),con = outname) +} else { + writeLines(text = as.character(log_vector),con = outname) +} + +cat("[processPrecomputed] Creating BAM file symlinks...\n") +# generate symlinks for bams +symoutLoc <- paste0(outputLoc,"bams/") +if(!dir.exists(symoutLoc)){ + dir.create(symoutLoc,recursive=TRUE) +} +for(i in 1:nrow(samplesheet)){ + symcmd <- paste0("ln -s ",samplesheet$file[i]," ",symoutLoc,samplesheet$SAMPLE_ID[i],".bam") + system(symcmd) +} + +cat("[processPrecomputed] Generating spoof Pre-downsampled RDS files...\n") +# spoof RDS files from stage_1 +rdsoutLoc <- paste0(outputLoc,pre,"relative_cn_rds/") +if(!dir.exists(rdsoutLoc)){ + dir.create(rdsoutLoc,recursive=TRUE) +} + +for(i in 1:nrow(samplesheet)){ + writeLines(text = as.character(samplesheet$SAMPLE_ID[i]), + paste0(rdsoutLoc,config$project,"_",samplesheet$SAMPLE_ID[i],"_",bin,"kb_relSmoothedCN.rds")) +} +writeLines(text = as.character(samplesheet$SAMPLE_ID), + paste0(outputLoc,pre,projectBin,"_relSmoothedCN.rds")) + +cat("[processPrecomputed] Generating stage_2 QC file input...\n") +# generate qc file +preFileCols <- c("clonality","powered","TP53cn","expected_TP53_AF","TP53freq","rank_clonality","pl_diff","pu_diff","new","new_state") +samplesheet <- samplesheet %>% + dplyr::select(SAMPLE_ID,PATIENT_ID,precPloidy,precPurity,TP53freq,smooth) %>% + rename("ploidy"="precPloidy","purity"="precPurity") %>% + mutate(!!!setNames(rep(NA, length(preFileCols)), preFileCols)) %>% + mutate(downsample_depth = (((2*(1-purity)+purity*ploidy)/(ploidy*purity))/purity)*15*(2*(1-purity)+purity*ploidy)*nbins_ref_genome*(1/0.91)) %>% + mutate(use = TRUE) %>% + mutate(notes = "using preprocessed ploidy purity values") %>% + select(SAMPLE_ID,PATIENT_ID,ploidy,purity,clonality,downsample_depth,powered,TP53cn,expected_TP53_AF,TP53freq,smooth,rank_clonality,pl_diff,pu_diff,new,new_state,use,notes) + +foutputLoc <- paste0(outputLoc,pre) 
+if(!dir.exists(foutputLoc)){
+  dir.create(foutputLoc,recursive=TRUE)
+}
+
+cat("[processPrecomputed] Marking run as precomputed...\n")
+write.table(samplesheet,preFile,sep="\t",col.names=T,row.names=F,quote=F)
+write.table(samplesheet[,c("SAMPLE_ID","ploidy","purity")],paste0(foutputLoc,"precomp.ok"),
+            sep="\t",col.names=T,row.names=F,quote=F)
+writeLines(text = c("PRECOMPUTED"),paste0(foutputLoc,"GENERATED_USING_PRECOMPUTED_PL_PU_RDS_FILES_ARE_EMPTY"))
+
+cat("[processPrecomputed] Processing precomputed values complete\n")
+cat("[processPrecomputed] stage_2 can now run without completing stage_1\n")
+#END
diff --git a/scripts/qdnaseq_rel_to_abs.R b/scripts/qdnaseq_rel_to_abs.R
index 68c74bc..41b37f5 100644
--- a/scripts/qdnaseq_rel_to_abs.R
+++ b/scripts/qdnaseq_rel_to_abs.R
@@ -159,6 +159,11 @@ getSegTable<-function(x){
     fd$use -> use
     fdfiltfull<-fd[use,]
     sn<-sn[use,]
+    if(is.null(ncol(sn))){
+      sampleNa <- Biobase::sampleNames(x)
+      sn <- as.data.frame(sn)
+      colnames(sn) <- sampleNa
+    }
     segTable<-c()
     for(s in colnames(sn)){
       for(c in unique(fdfiltfull$chromosome))
diff --git a/stage_1 b/stage_1
index 12de7e0..4f4982e 100644
--- a/stage_1
+++ b/stage_1
@@ -8,6 +8,8 @@ def get_bam(wildcards):
 
 if any(samplesheet.SAMPLE_ID.duplicated()):
     sys.exit("Sample sheet contains duplicated sample ids")
+if all(samplesheet.precPloidy.notna()) and all(samplesheet.precPurity.notna()):
+    sys.exit("all ploidy and purity values are precomputed - Run `Rscript scripts/processPrecomputed.R` and skip to stage_2")
 
 SAMPLES = list(samplesheet['SAMPLE_ID'].unique())
 FILE_LIST = list(samplesheet['file'].unique())
diff --git a/stage_2 b/stage_2
index e297655..c5acb38 100644
--- a/stage_2
+++ b/stage_2
@@ -23,6 +23,15 @@ def get_ds_bams(P,B):
 # Set new sample list based on filtered QC from stage 1
 PROJECT=config["project_name"]
 BIN=config["bins"]
+
+if any(samplesheet.SAMPLE_ID.duplicated()):
+    sys.exit("Sample sheet contains duplicated sample ids")
+
+
+prplpu = "FALSE"
+if all(samplesheet.precPloidy.notna()) and all(samplesheet.precPurity.notna()):
+    prplpu = "TRUE"
+
 SAMPLE_LISTS=get_ds_bams(PROJECT,BIN)
 
 rule all:
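processPrecomputed.R reuses the pipeline's required-depth formula so that stage_2 can be powered without a stage_1 gridsearch: for purity p and ploidy l, the average cell ploidy is 2(1-p) + p*l, targeted at 15 reads per copy per usable bin with a 1/0.91 usable-bin correction. A sketch using the example sheet's SAMPLE_3 values; the bin count here is illustrative, since the script derives it from getBinAnnotations():

pl <- 3.2; pu <- 0.76                 # precPloidy / precPurity from the samplesheet
nbins <- 85000                        # illustrative usable 30kb bin count
cellploidy <- 2 * (1 - pu) + pu * pl  # average copies per cell
downsample_depth <- ((cellploidy / (pl * pu)) / pu) * 15 * cellploidy * nbins * (1 / 0.91)
downsample_depth                      # reads required for a powered fit at this ploidy/purity

Low-purity, high-ploidy samples therefore demand sharply more reads, which is why the QC file's powered flag compares this value against the reads actually present.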
+filter_homozygous: "TRUE" +# Threshold basepairs lost +# Default 10000000 / 10Mbase +homozygous_prop: 10000000 +# Absolute CN homozygous loss threshold +# Default 0.4 +homozygous_threshold: 0.4 + # Not implemented #custom_bin: false #custom_bin_folder: "/custom_bin_data/" diff --git a/rules/filter_gridsearch.smk b/rules/filter_gridsearch.smk index a4add21..62cd5a2 100644 --- a/rules/filter_gridsearch.smk +++ b/rules/filter_gridsearch.smk @@ -10,7 +10,9 @@ rule gridsearch_filter: project="{project}", outdir=OUT_DIR, af_cutoff=config["af_cutoff"], - filter_underpowered=config["filter_underpowered"] + filter_underpowered=config["filter_underpowered"], + filter_homozygous=config["filter_homozygous"], + homozygous_prop=config["homozygous_prop"] threads: THREADS script: "../scripts/gridsearch_results_filtering.R" diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk index e391430..626de36 100644 --- a/rules/gridsearch.smk +++ b/rules/gridsearch.smk @@ -12,6 +12,7 @@ rule gridsearch_fitting: ploidy_min=config["ploidy_min"], ploidy_max=config["ploidy_max"], purity_min=config["purity_min"], - purity_max=config["purity_max"] + purity_max=config["purity_max"], + homozygous_threshold=config["homozygous_threshold"] script: "../scripts/ploidy_purity_search_standard_error.R" diff --git a/sample_sheet_example.tsv b/sample_sheet_example.tsv index 3a690fb..88ca928 100644 --- a/sample_sheet_example.tsv +++ b/sample_sheet_example.tsv @@ -1,7 +1,7 @@ PATIENT_ID SAMPLE_ID TP53freq smooth file precPloidy precPurity -PATIENT-1 SAMPLE_3 NA FALSE /data/SAMPLE_3.bam NA NA -PATIENT-2 SAMPLE_5 0.97 FALSE /data/SAMPLE_5.bam NA NA -PATIENT-2 SAMPLE_6 0.94 FALSE /data/SAMPLE_6.bam NA NA +PATIENT-1 SAMPLE_3 NA FALSE /data/SAMPLE_3.bam 3.2 0.76 +PATIENT-2 SAMPLE_5 0.97 FALSE /data/SAMPLE_5.bam 2.4 NA +PATIENT-2 SAMPLE_6 0.94 FALSE /data/SAMPLE_6.bam NA 0.55 PATIENT-3 SAMPLE_10 0.31 TRUE /data/SAMPLE_10.bam NA NA PATIENT-3 SAMPLE_11 NA FALSE /data/SAMPLE_11.bam NA NA PATIENT-3 SAMPLE_12 0.17 FALSE /data/SAMPLE_12.bam NA NA diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml index 986d038..0708d55 100644 --- a/schemas/config.schema.yaml +++ b/schemas/config.schema.yaml @@ -45,6 +45,16 @@ properties: type: number minimum: 0 maximum: 1.0 + filter_homozygous: + type: string + enum: ["TRUE","FALSE"] + homozygous_prop: + type: number + minimum: 0 + homozygous_threshold: + type: number + minimum: 0 + maximum: 0.99 # entries that have to be in the config file for successful validation required: @@ -59,4 +69,7 @@ required: - ploidy_max - purity_min - purity_max + - filter_homozygous + - homozygous_prop + - homozygous_threshold diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R index 422952b..4a2c078 100644 --- a/scripts/gridsearch_results_filtering.R +++ b/scripts/gridsearch_results_filtering.R @@ -12,11 +12,24 @@ metadata <- read.table(file = metafile,header=T,sep="\t") bin <- as.numeric(snakemake@params[["bin"]]) out_dir <- snakemake@params[["outdir"]] project <- snakemake@params[["project"]] -filter_underpowered <- snakemake@params[["filter_underpowered"]] -af_cutoff <- as.numeric(snakemake@params[["af_cutoff"]]) + +#threads cores <- as.numeric(snakemake@threads) registerDoMC(cores) +# Filter parameters +filter_underpowered <- snakemake@params[["filter_underpowered"]] +af_cutoff <- as.numeric(snakemake@params[["af_cutoff"]]) +filter_homozygous=snakemake@params[["filter_homozygous"]] +homozygous_prop=as.numeric(snakemake@params[["homozygous_prop"]]) + +# filter 
homozygous loss +if(filter_homozygous == "TRUE"){ + filter_homozygous <- TRUE +} else { + filter_homozygous <- FALSE +} + # Filter for powered fits only if(filter_underpowered == "TRUE"){ filter_underpowered <- TRUE @@ -56,13 +69,11 @@ clonality <- do.call(rbind, tab <- cbind(rep(n,times=nrow(tab)),tab) return(tab) })) -colnames(clonality) <- c("SAMPLE_ID","ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF") +colnames(clonality) <- c("SAMPLE_ID","ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF","homozygousLoss") clonality <- left_join(clonality,metadata,by="SAMPLE_ID") %>% - select(SAMPLE_ID,PATIENT_ID,ploidy,purity,clonality,downsample_depth,powered,TP53cn,expected_TP53_AF,TP53freq,smooth) + select(SAMPLE_ID,PATIENT_ID,ploidy,purity,clonality,downsample_depth,powered,TP53cn,expected_TP53_AF,TP53freq,smooth,homozygousLoss) -print(clonality) -## Added by PS depthtocn<-function(x,purity,seqdepth) #converts readdepth to copy number given purity and single copy depth { (x/seqdepth-2*(1-purity))/purity @@ -71,11 +82,20 @@ depthtocn<-function(x,purity,seqdepth) #converts readdepth to copy number given #Top 10 when ranking by clonality and TP53 filtered_results <- clonality +## Filter underpowered if(filter_underpowered){ filtered_results <- filtered_results %>% filter(powered == 1) } + +## filter homozygous loss +if(filter_homozygous){ + filtered_results <- filtered_results %>% + filter(homozygousLoss <= homozygous_prop) +} + +# standard filtering filtered_results <- filtered_results %>% # filter underpowered fits when config variable is TRUE select(SAMPLE_ID, PATIENT_ID, everything()) %>% group_by(SAMPLE_ID, ploidy) %>% @@ -84,12 +104,10 @@ filtered_results <- filtered_results %>% # filter underpowered fits when config group_by(SAMPLE_ID) %>% top_n(-10, wt = clonality) %>% # select top 10 ploidy states with the lowest clonality values mutate(rank_clonality = min_rank(clonality)) %>% # rank by clonality within a sample across ploidies in top 10 - #filter(expected_TP53_AF > 0) %>% # keep fits where TP53 is positive # retain samples without TP53 mutations and where expected and observed TP53freq <=0.15 filter(is.na(TP53freq) | near(expected_TP53_AF,TP53freq, tol = af_cutoff )) %>% - arrange(PATIENT_ID, SAMPLE_ID ) + arrange(PATIENT_ID, SAMPLE_ID) -print(filtered_results) #Further limit the results by selecting the ploidy states with the lowest clonality values where multiple similar solutions are present. 
#Threshold of 0.3 used to select different states diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R index 997969f..1967b9a 100644 --- a/scripts/ploidy_purity_search_standard_error.R +++ b/scripts/ploidy_purity_search_standard_error.R @@ -20,6 +20,9 @@ pu_min <- snakemake@params[["purity_min"]] # default 0.15 pu_max <- snakemake@params[["purity_max"]] # deafult 1 #print(c(pl_min,pl_max,pu_min,pu_max)) +# homozygous threshold +hmz_thrsh <- snakemake@params[["homozygous_threshold"]] + #load libraries suppressPackageStartupMessages(library(QDNAseqmod)) suppressPackageStartupMessages(library(Biobase)) @@ -131,14 +134,15 @@ res<-foreach(i=1:length(ploidies),.combine=rbind) %do% { TP53cn<-round(depthtocn(median(seg[gene_bin_seg]),purity,seqdepth),1) # to 1 decimal place expected_TP53_AF<-TP53cn*purity/(TP53cn*purity+2*(1-purity)) clonality<-mean(diffs) - r <- c(ploidy,purity,clonality,downsample_depth,downsample_depth < rds.pdata$total.reads[row.names(rds.pdata)==sample],TP53cn,expected_TP53_AF) + hmzyg <- sum(abs_cn <= hmz_thrsh) * bin_size + r <- c(ploidy,purity,clonality,downsample_depth,downsample_depth < rds.pdata$total.reads[row.names(rds.pdata)==sample],TP53cn,expected_TP53_AF,hmzyg) r <- as.data.frame(t(r)) return(r) } return(as.data.frame(rowres)) } -colnames(res)<-c("ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF") +colnames(res)<-c("ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF","homozygousLoss") if(nrow(res) > 1){ rownames(res)<-1:nrow(res) res<-data.frame(res,stringsAsFactors = F) From d3fe4087309c302f3e759628f3bf5921aa1afe67 Mon Sep 17 00:00:00 2001 From: Phil9S Date: Fri, 3 May 2024 12:34:51 +0100 Subject: [PATCH 14/18] error catch for empty stage_1 qc file if filtering criteria too strict --- stage_2 | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/stage_2 b/stage_2 index c5acb38..9b56f0f 100644 --- a/stage_2 +++ b/stage_2 @@ -8,14 +8,17 @@ def get_ds_bams(P,B): fl = OUT_DIR+"sWGS_fitting/"+P+"_"+str(b)+"kb/absolute_PRE_down_sampling/"+P+"_fit_QC_predownsample.tsv" if os.path.isfile(fl): f = pd.read_table(fl).set_index(["SAMPLE_ID"], drop=False) - if f["use"].dtype == bool and all(~f.use.isnull()): - ftb = f[(f['use'] == True)] - if all(~ftb.SAMPLE_ID.duplicated()): - d[str(b)] = list(ftb['SAMPLE_ID']) + if not f.empty: + if f["use"].dtype == bool and all(~f.use.isnull()): + ftb = f[(f['use'] == True)] + if all(~ftb.SAMPLE_ID.duplicated()): + d[str(b)] = list(ftb['SAMPLE_ID']) + else: + sys.exit("QC file contains duplicated sample ids marked 'TRUE'") else: - sys.exit("QC file contains duplicated sample ids marked 'TRUE'") + sys.exit("QC file 'use' column contains non-boolean values") else: - sys.exit("QC file 'use' column contains non-boolean values") + sys.exit("QC file is empty. 
Filtering criteria may need adjusting") else: sys.exit("QC file from stage 1 is missing") return(d) From 47b7eb649ca796f95a1c51ca3ec66b625d138aad Mon Sep 17 00:00:00 2001 From: Phil9S Date: Fri, 3 May 2024 12:56:09 +0100 Subject: [PATCH 15/18] fix - disambiguate combine function to avoid confusion with deprecated dplyr function --- scripts/gridsearch_results_filtering.R | 2 +- scripts/qdnaseq_rel_to_abs.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R index 4a2c078..1681f2d 100644 --- a/scripts/gridsearch_results_filtering.R +++ b/scripts/gridsearch_results_filtering.R @@ -47,7 +47,7 @@ collapse_rds <- function(rds.list){ if(length(rds.list) > 1){ for(i in 2:length(rds.list)){ add <- rds.list[[i]][[1]] - comb <- combine(comb,add) + comb <- Biobase::combine(comb,add) } rds.obj <- comb } else { diff --git a/scripts/qdnaseq_rel_to_abs.R b/scripts/qdnaseq_rel_to_abs.R index 41b37f5..be6de70 100644 --- a/scripts/qdnaseq_rel_to_abs.R +++ b/scripts/qdnaseq_rel_to_abs.R @@ -26,7 +26,7 @@ collapse_rds <- function(rds.list){ if(length(rds.list) > 1){ for(i in 2:length(rds.list)){ add <- rds.list[[i]][[1]] - comb <- combine(comb,add) + comb <- Biobase::combine(comb,add) } rds.obj <- comb } else { From cc1f0aabbcbd45d54e3d8dc1de8f59659785d741 Mon Sep 17 00:00:00 2001 From: Phil9S Date: Mon, 6 May 2024 02:40:50 +0100 Subject: [PATCH 16/18] added support for containerised pipeline using docker image. Fixed X11/Cairo error by setting cairo specifically in png plotting device. --- README.md | 6 ++++++ config/config.yaml | 2 ++ rules/bam_check.smk | 2 ++ rules/common.smk | 3 +++ rules/downsample.smk | 2 ++ rules/downsampled_rel_rds.smk | 2 ++ rules/filter_gridsearch.smk | 2 ++ rules/gridsearch.smk | 2 ++ rules/rel_rds.smk | 2 ++ rules/rel_to_abs.smk | 2 ++ schemas/config.schema.yaml | 2 ++ scripts/gridsearch_results_filtering.R | 6 ++++-- scripts/qdnaseq_rel_to_abs.R | 3 ++- 13 files changed, 33 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0ba2279..d8b4de2 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,12 @@ or conda activate swgs-abscn ``` +### Step 3a - Singularity-based implementation + +``` +--use-singularity --singularity-args "--bind /mnt/scratcha/fmlab/smith10/britroc/" +``` + ### Step 4 Preparing the input files #### Sample sheet diff --git a/config/config.yaml b/config/config.yaml index 8177195..b531300 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -47,6 +47,8 @@ homozygous_prop: 10000000 # Default 0.4 homozygous_threshold: 0.4 +# container url for swgs-absolutecn +image_base_url: "docker://phil9s/" # Not implemented #custom_bin: false #custom_bin_folder: "/custom_bin_data/" diff --git a/rules/bam_check.smk b/rules/bam_check.smk index 449639e..310bef5 100644 --- a/rules/bam_check.smk +++ b/rules/bam_check.smk @@ -3,6 +3,8 @@ rule check_bam: bam=FILE_LIST output: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/bam.ok" + singularity: + image_base_url+"swgs-absolutecn:latest" threads: 1 script: "../scripts/bam_check.R" diff --git a/rules/common.smk b/rules/common.smk index 9a81abd..f3e89c9 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -61,6 +61,9 @@ OUT_DIR=os.path.join(OUT_DIR,"") samplesheet = pd.read_table(config["samplesheet"],dtype={'PATIENT_ID': str,'SAMPLE_ID':str,'TP53freq':float}).set_index(["SAMPLE_ID"], drop=False) validate(samplesheet, schema="../schemas/samples.schema.yaml") +# set container uri +image_base_url = config["image_base_url"] + 
#### Check bin values #### BIN_VALS = config["bins"] diff --git a/rules/downsample.smk b/rules/downsample.smk index 2b8aa6d..dd18eb3 100644 --- a/rules/downsample.smk +++ b/rules/downsample.smk @@ -5,6 +5,8 @@ rule downsample: rds=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_{bin}kb_relSmoothedCN.rds" output: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/downsampled_bams/{sample}.bam" + singularity: + image_base_url+"swgs-absolutecn:latest" params: outdir=OUT_DIR, prplpu=prplpu, diff --git a/rules/downsampled_rel_rds.smk b/rules/downsampled_rel_rds.smk index 8218978..c728781 100644 --- a/rules/downsampled_rel_rds.smk +++ b/rules/downsampled_rel_rds.smk @@ -4,6 +4,8 @@ rule ds_relRDS: meta=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_fit_QC_predownsample.tsv" output: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/relative_cn_rds/{project}_{sample}_{bin}kb_relSmoothedCN.rds" + singularity: + image_base_url+"swgs-absolutecn:latest" params: outdir=OUT_DIR, project="{project}", diff --git a/rules/filter_gridsearch.smk b/rules/filter_gridsearch.smk index 62cd5a2..7ac9d25 100644 --- a/rules/filter_gridsearch.smk +++ b/rules/filter_gridsearch.smk @@ -4,6 +4,8 @@ rule gridsearch_filter: rds=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/relative_cn_rds/{{project}}_{sample}_{{bin}}kb_relSmoothedCN.rds",sample=SAMPLES) output: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_fit_QC_predownsample.tsv" + singularity: + image_base_url+"swgs-absolutecn:latest" params: bin="{bin}", meta=config["samplesheet"], diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk index 626de36..3a64b4d 100644 --- a/rules/gridsearch.smk +++ b/rules/gridsearch.smk @@ -4,6 +4,8 @@ rule gridsearch_fitting: output: tsv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.tsv", pdf=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.pdf" + singularity: + image_base_url+"swgs-absolutecn:latest" params: bin="{bin}", outdir=OUT_DIR, diff --git a/rules/rel_rds.smk b/rules/rel_rds.smk index 3e5d406..bdb6a08 100644 --- a/rules/rel_rds.smk +++ b/rules/rel_rds.smk @@ -3,6 +3,8 @@ rule relRDS: bams=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/bams/{{sample}}.bam") output: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/relative_cn_rds/{project}_{sample}_{bin}kb_relSmoothedCN.rds" + singularity: + image_base_url+"swgs-absolutecn:latest" params: bin="{bin}", outdir=OUT_DIR, diff --git a/rules/rel_to_abs.smk b/rules/rel_to_abs.smk index 21db431..baf9bb0 100644 --- a/rules/rel_to_abs.smk +++ b/rules/rel_to_abs.smk @@ -5,6 +5,8 @@ rule rel_to_abs: output: tsv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/abs_cn_rds/{project}_{bin}kb_ds_abs_fits.tsv", rds=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/abs_cn_rds/{project}_{bin}kb_ds_absCopyNumber.rds" + singularity: + image_base_url+"swgs-absolutecn:latest" params: outdir=OUT_DIR, project="{project}", diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml index 0708d55..c6d074f 100644 --- a/schemas/config.schema.yaml +++ b/schemas/config.schema.yaml @@ -55,6 +55,8 @@ properties: type: number minimum: 0 maximum: 0.99 + image_base_url: + type: string # entries that have to be in the config file for successful validation 
 required:
diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R
index 1681f2d..c81dfa1 100644
--- a/scripts/gridsearch_results_filtering.R
+++ b/scripts/gridsearch_results_filtering.R
@@ -147,7 +147,8 @@ if(length(unique(pruned_results$SAMPLE_ID)) == 1){
   seg <- assayDataElement(x,"segmented")
   rel_ploidy <- mean(cn,na.rm=T)
   ll <- nrow(dat)
-  png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"), w= 450*ll, h = 350)
+  png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"),
+      type="cairo",w= 450*ll, h = 350)
   par(mfrow = c(1,ll))
   for(n in 1:nrow(dat)){
     ploidy <- dat[n,]$ploidy
@@ -198,7 +199,8 @@ if(length(unique(pruned_results$SAMPLE_ID)) == 1){
     seg <- assayDataElement(x,"segmented")
     rel_ploidy <- mean(cn,na.rm=T)
     ll <- nrow(dat)
-    png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"), w= 450*ll, h = 350)
+    png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"),
+        type = "cairo", w= 450*ll, h = 350)
     par(mfrow = c(1,ll))
     for(n in 1:nrow(dat)){
 
diff --git a/scripts/qdnaseq_rel_to_abs.R b/scripts/qdnaseq_rel_to_abs.R
index be6de70..5cbdf3b 100644
--- a/scripts/qdnaseq_rel_to_abs.R
+++ b/scripts/qdnaseq_rel_to_abs.R
@@ -132,7 +132,8 @@ for(sample in pData(rds.rel)$name){
     yrange=10
   }
   # Plot abs fit
-  png(paste0(output_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_POST_down_sampling/abs_cn_rds/plots/",sample,".png"), w= 8, h = 6, unit="in", res = 250)
+  png(paste0(output_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_POST_down_sampling/abs_cn_rds/plots/",sample,".png"),
+      type="cairo",w= 8, h = 6, unit="in", res = 250)
   par(mfrow = c(1,1))
   plot(relcn,doCalls=FALSE,showSD=TRUE,logTransform=FALSE,ylim=c(0,yrange),ylab="Absolute tumour CN",
        main=paste(sample, " eTP53=",round(expected_TP53_AF,2),

From 05a0bcc1706c0fd6df08c9ccab71c81fd03fd3f5 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Thu, 16 May 2024 14:51:46 +0100
Subject: [PATCH 17/18] added hmzy filtering parameters to update_configs.py

---
 config/config.yaml | 2 +-
 update_configs.py  | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/config/config.yaml b/config/config.yaml
index b531300..7454146 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -48,7 +48,7 @@ homozygous_prop: 10000000
 homozygous_threshold: 0.4
 
 # container url for swgs-absolutecn
-image_base_url: "docker://phil9s/"
+image_base_url: docker://phil9s/
 # Not implemented
 #custom_bin: false
 #custom_bin_folder: "/custom_bin_data/"
diff --git a/update_configs.py b/update_configs.py
index f27ea43..7663dee 100755
--- a/update_configs.py
+++ b/update_configs.py
@@ -58,6 +58,9 @@ def get_input(x,y):
 config['ploidy_max'] = float(get_input(config['ploidy_max'],'ploidy_max'))
 config['purity_min'] = float(get_input(config['purity_min'],'purity_min'))
 config['purity_max'] = float(get_input(config['purity_max'],'purity_max'))
+config['filter_homozygous'] = DoubleQuotedScalarString(get_input(config['filter_homozygous'],'filter_homozygous'))
+config['homozygous_prop'] = int(get_input(config['homozygous_prop'],'homozygous_prop'))
+config['homozygous_threshold'] = float(get_input(config['homozygous_threshold'],'homozygous_threshold'))
 # Save updated yaml
 with open('config/config.yaml', 'w') as fp:
     yaml.dump(config, fp)

From 16f9da12a18fdd4432bc3ce3ee04df872b630d60 Mon Sep 17 00:00:00 2001
From: Philip Smith
Date: Thu, 16 May 2024 17:17:00 +0100
Subject: [PATCH 18/18] Update README.md

---
 README.md | 221 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 152 insertions(+), 69 deletions(-)

diff --git a/README.md b/README.md
index d8b4de2..ab592f7 100644
--- a/README.md
+++ b/README.md
@@ -4,15 +4,15 @@
 
 ## Description
 
-*Summary*
+### *Summary*
 
 Generate absolute copy number profiles from shallow whole genome sequencing data using a read depth normalised and allele frequency-anchored approach.
 
-*Detailed description*
+### *Detailed description*
 
 This pipeline implements a method for generating absolute copy number profiles from shallow whole genome sequencing data utilising a multi-stage fitting process to generate a set of read depth-normalised absolute copy number profiles. Copy number fits are generated in read count space using a modified implementation of [QDNAseq](https://github.com/ccagc/QDNAseq) on full read depth shallow whole genome BAM files. After which, a grid search of ploidy and purity values is performed to generate a matrix of potential absolute copy number profiles. The fitting of absolute copy number profiles includes an error function metric and a variant allele fraction anchoring process which assist in the selection of an optimal copy number fit.
 
-The error function, `clonality`, which measures segment residual from integer state across the genome and acts similarly to a function such as RMSE, in which a lower value is suggestive of better confirmation to integer copy number states, and therefore a better absolute copy number profile. variant allele fraction achoring utilises the near-ubiquitous presence of somatic homozygous _TP53_ variants in high grade serous ovarian cancer as an anchoring point for a copy number fit. Experimentally validated _TP53_ allele fractions at homozygous sites are compared to an expected homozygous _TP53_ variant allele fraction, as determined by the copy number value of the segment overlapping the _TP53_ locus. The smaller absolute differences between the experimental and expected _TP53_ variant allele fractions is suggestive of a better fit. Minimisation of the `clonality` error function, expected-to-experimental TP53 distance, and manual review of fits allows for the selection of an optimal absolute copy number profile for a given sample.
+The error function, `clonality`, measures segment residual from integer state across the genome and is computed as the mean absolute error (MAE); a lower value is suggestive of better conformance to integer copy number states, and therefore a better absolute copy number profile. Variant allele fraction anchoring utilises the near-ubiquitous presence of somatic homozygous _TP53_ variants in high grade serous ovarian cancer as an anchoring point for a copy number fit. Experimentally validated _TP53_ allele fractions at homozygous sites are compared to an expected homozygous _TP53_ variant allele fraction, as determined by the copy number value of the segment overlapping the _TP53_ locus. A smaller absolute difference between the experimental and expected _TP53_ variant allele fractions is suggestive of a better fit. Minimisation of the `clonality` error function, the expected-to-experimental TP53 distance, and manual review of fits allows for the selection of an optimal absolute copy number profile for a given sample.
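
As a minimal sketch of the two fit metrics described above (the `expected_TP53_AF` formula follows `scripts/ploidy_purity_search_standard_error.R`; reading `clonality` as the MAE to the nearest integer state, and all toy input values, are illustrative assumptions):

```
# Toy per-segment absolute copy number values (illustrative only)
abs_seg <- c(1.94, 2.10, 3.02, 1.87, 4.05)
# clonality as mean absolute error from the nearest integer state (assumed reading)
clonality <- mean(abs(abs_seg - round(abs_seg)))

# Expected homozygous TP53 allele fraction for a candidate fit, given the
# absolute CN at the TP53 locus and the tumour purity (toy values)
TP53cn <- 1.2
purity <- 0.6
expected_TP53_AF <- TP53cn * purity / (TP53cn * purity + 2 * (1 - purity))
```

A lower `clonality` and a smaller gap between `expected_TP53_AF` and the observed TP53 allele fraction both favour a candidate fit.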
 The selected fits are subject to a calculation of power to detect copy number alterations, as described [here](https://gmacintyre.shinyapps.io/sWGS_power/), in which the read depth of a given sample is assessed against its selected ploidy-purity combination. This process both determines if 1) a sample has a sufficient number of reads to support the selected ploidy-purity combination and 2) an optimal target number of reads to perform sample-specific read down sampling. This process normalises the read depth between samples in a ploidy and purity dependent manner so that read variance across segments is consistent, while excluding samples which are not supported by a sufficient number of reads.
@@ -22,29 +22,27 @@ Samples passing all filtering criteria then undergo read downsampling to the spe
 * [Pipeline setup](#pipeline-setup)
   + [Step 1 Clone the repo](#step-1-clone-the-repo)
-  + [Step 2 Install conda](#step-2-install-conda)
-  + [Step 3 Installing environment](#step-3-installing-environment)
-  + [Step 4 Preparing the input files](#step-4-preparing-the-input-files)
-    - [Sample sheet](#sample-sheet)
+  + [Step 2 Installing environment](#step-2-installing-environment)
+  + [Step 3 Preparing the input files](#step-3-preparing-the-input-files)
+    - [sample sheet](#sample-sheet)
     - [config.yaml](#configyaml)
     - [profile config.yaml](#profile-configyaml)
     - [workflow management](#workflow-management)
-    - [Updating the pipeline configuration](#updating-the-pipeline-configuration)
+    - [updating the pipeline configuration](#updating-the-pipeline-configuration)
 * [Running the pipeline](#running-the-pipeline)
-  + [workflow management](#workflow-management)
-  + [Step 5 Stage 1](#step-5-stage-1)
-  + [Step 6 QC1](#step-6-qc1)
+  + [Step 4 Stage 1](#step-4-stage-1)
+  + [Step 5 QC1](#step-5-qc1)
     - [Smoothing](#smoothing)
     - [Fit selection](#fit-selection)
-  + [Step 7 Stage 2](#step-7-stage-2)
-  + [Step 8 QC2](#step-8-qc2)
+  + [Step 6 Stage 2](#step-6-stage-2)
+  + [Downsampling only](#downsampling-only)
+* [Output files](#output-files)
 
 ## Compatibility
 
 ### Reference genome
 
-This pipeline is currently only compatible with BAM files aligned with `hg19` or `GRCh37` genomes.
-There are no checks in place for using the correct genome and results will be likely be extremely poor or incorrect if done using an unsupported reference genome.
+This pipeline is currently only compatible with BAM files aligned with `hg19` or `GRCh37` genomes. There are no checks in place for using files aligned to an unsupported reference genome.
 
 ## Pipeline setup
 
@@ -57,26 +55,24 @@ git clone https://github.com/Phil9S/swgs-absolutecn.git
 cd swgs-absolutecn/
 ```
 
-### Step 2 Install conda
+### Step 2 Installing environment
 
-This pipeline utilises micromamba or conda installation environments to manage the software packages and versions. Please make sure either mamba or conda are installed and available on your system.
-Our recommendation is to use micromamba or, ideally, the installed conda version should utilise the libmamba solver library as the required environment contains a large number of packages.
+This pipeline can utilise either a conda environment or a containerised docker/singularity implementation to manage the software packages and dependencies.
 
-See installing [micromamba](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html) or [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) for more information.
-
-### Step 3 Installing environment
-
-From within the repository directory, run the `install_env.sh` script to generate a conda environment and install custom packages:
+- For the conda implementation please make sure either mamba or conda are installed and available on your system. Our recommendation is to use micromamba or, ideally, the installed conda version should utilise the libmamba solver library as the required environment contains a large number of packages. See installing [micromamba](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html) or [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html).
+- For the docker/singularity implementation please make sure a compatible version of both snakemake and singularity are available on your system. See installing [singularity](https://docs.sylabs.io/guides/latest/user-guide/quick_start.html) and [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html).
 
+#### Conda environment
+With conda or micromamba installed, from within the repository directory, run the `install_env.sh` script to generate a conda environment and install custom packages:
 ```
 ./install_env.sh mamba
 ```
 or
 ```
-install_env.sh conda $HOME/miniconda/
+./install_env.sh conda $HOME/miniconda/
 ```
 
-If you used a previously installed conda build please use the conda or miniconda installation directory when running this section instead of '$HOME/miniconda/' to correctly initialise the conda environment.
+If you are using an existing conda installation, please use the conda installation directory relevant to your existing installation when running this section instead of `$HOME/miniconda/` to correctly initialise the conda environment.
 
 The newly installed environment can be activated using the following:
 
@@ -88,119 +84,206 @@ or
 conda activate swgs-abscn
 ```
 
-### Step 3a - Singularity-based implementation
+#### Singularity-based implementation
 
+If singularity and snakemake are already available, the pipeline can be run from a container as follows:
+
+Set the input and output directories (these should match those specified in the config.yaml - see step 3):
 ```
---use-singularity --singularity-args "--bind /mnt/scratcha/fmlab/smith10/britroc/"
+INPUTDIR="/path/to/inputfiles/"
+OUTPUTDIR="/path/to/output/"
 ```
+and then set the singularity args variable
+```
+SIGBIND="--use-singularity --singularity-args \"--bind ${INPUTDIR},${OUTPUTDIR}\""
+```
+The containerised environment can be found on Docker Hub at [phil9s/swgs-absolutecn](https://hub.docker.com/repository/docker/phil9s/swgs-absolutecn/)
 
-### Step 4 Preparing the input files
-
+### Step 3 Preparing the input files
 #### Sample sheet
 
-The workflow requires a single input file `sample_sheet.tsv` which is a tab-separated document detailing sample names, patient names, smoothing booleans, TP53 allele frequencies, and BAM file locations. Make sure that the BAM file paths are absolute and that sample and patient names are sensible (i.e. do not contain abstract characters or white space). The `TPfreq` column should only contain a float (range 0.00-1.00) or `NA`. The `smooth` column is boolean and should only contain either `TRUE` or `FALSE`.
+The workflow requires a single input file `sample_sheet.tsv` which is a tab-separated document detailing sample names, patient names, smoothing booleans, TP53 allele frequencies, and BAM file locations. Make sure that the BAM file paths are absolute and that sample and patient names are sensible (i.e.
do not contain abstract characters or white space). The `TPfreq` column should only contain a float (range 0.00-1.00) or `NA`. The `smooth` column is boolean and should only contain either `TRUE` or `FALSE`. Optionally, users may provide precomputed or orthogonally generated sample ploidy and purity estimates using the `precPloidy` and `precPurity` fields, but these are not required. Both `precPloidy` and `precPurity` can be provided separately or together, and the fields do not need to be set consistently across samples, meaning one sample may have a ploidy/purity estimate while others do not; the pipeline will only perform grid searches across the missing values (e.g. in the example table below, a full gridsearch will occur for SAM1, no search is performed for SAM2, only a ploidy search for SAM3, and only a purity search for SAM4).
 
 The table below demonstrates the basic schema required and the workflow will validate this file prior to running.
 
-|PATIENT_ID|SAMPLE_ID|TP53freq|smooth|file |
-|----------|---------|--------|------|-------------|
-|PAT1 |SAM1 |0.45 |TRUE |/data/SAM1.bam|
-|PAT1 |SAM2 |0.55 |FALSE |/data/SAM2.bam|
+|PATIENT_ID|SAMPLE_ID|TP53freq|smooth|file |precPloidy|precPurity|
+|----------|---------|--------|------|--------------|----------|----------|
+|PAT1 |SAM1 |0.45 |TRUE |/data/SAM1.bam|NA |NA |
+|PAT1 |SAM2 |0.55 |FALSE |/data/SAM2.bam|2.4 |0.75 |
+|PAT2 |SAM3 |NA |FALSE |/data/SAM3.bam|NA |0.43 |
+|PAT2 |SAM4 |NA |TRUE |/data/SAM4.bam|3.2 |NA |
 
 An example sample_sheet.tsv is included in this repository.
 
 #### config.yaml
 
-The config.yaml (`config/config.yaml`) contains the necessary information for the pipeline you wish to run. This includes the location of the samplesheet.tsv, bin size, project name, and output directory.
+The config.yaml (`config/config.yaml`) contains the necessary information for the pipeline you wish to run. This includes the location of the samplesheet.tsv, bin size, project name, output directory, and filtering parameters.
+
+#### Filtering parameters
 
-#### profile config.yaml
+Various filters for acceptable ploidy/purity combinations for fitting absolute copy number can be modified or disabled depending on user requirements.
 
-The profile configs (cluster_config.yaml & config.yaml) (`profile/*/*`) contains the necessary information to configure the job submission parameters passed to a given workload manager (or lack thereof). This includes the number of concurrently sumbitted jobs, account/project name, partition/queue name, and default job resources (though these are low and should work on almost any cluster).
+|variable |default |function |type |values |
+|--------------------|--------|-------------------------------------------------------------------------------------------------------------------------|-----------|--------------|
+|af_cutoff |0.15 |Maximum difference between expected and observed _TP53_ allele fraction |float |0.0-1.0 |
+|use_seed |"TRUE" |Set whether to use `seed_val` to ensure CBS segmentation returns identical results |string bool|"TRUE","FALSE"|
+|seed_val |"9999" |Seed value used by `use_seed` |string |any string |
+|filter_underpowered |"TRUE" |Set whether to filter ploidy/purity combinations without sufficient available read depth to support the given profile |string bool|"TRUE","FALSE"|
+|ploidy_min |1.6 |Minimum ploidy value for gridsearch - must be less than `ploidy_max`. Ignored for a sample if `precPloidy` is provided |float |1.0-20.0 |
+|ploidy_max |8.0 |Maximum ploidy value for gridsearch - must be greater than `ploidy_min`.
Ignored for a sample if `precPloidy` is provided|float |1.0-20.0 |
+|purity_min |0.15 |Minimum purity value for gridsearch - must be less than `purity_max`. Ignored for a sample if `precPurity` is provided |float |0.0-1.0 |
+|purity_max |1.0 |Maximum purity value for gridsearch - must be greater than `purity_min`. Ignored for a sample if `precPurity` is provided|float |0.0-1.0 |
+|filter_homozygous |"TRUE" |Set whether to filter ploidy/purity combinations with an extent of homozygous loss greater than `homozygous_prop` |string bool|"TRUE","FALSE"|
+|homozygous_prop |10000000|Extent of homozygous loss (in basepairs) above which a ploidy/purity combination is filtered where `filter_homozygous` is "TRUE" |integer |minimum=0 |
+|homozygous_threshold|0.4 |Copy number value at or below which a segment is counted as homozygous loss towards `homozygous_prop` |float |0.0-0.99 |
+
+For most users, the default parameters should work well, but in certain instances these values should be modified.
+
+For example, if users are not generating a sufficient number of high-quality absolute copy number profiles, setting `filter_underpowered` to "FALSE" will show a larger range of fits which, although statistically underpowered, could be correct for a given sample.
+
+Another use case would be samples where the general purity range is known, for example cell line or LCM data, where the expected purity is known to be ~1.0. As such, setting `purity_min` to 0.95 would restrict ploidy/purity combinations to enforce this expectation.
 
 #### Workflow management
+##### profile config.yaml
 
-This pipeline was primarily developed using the [SLURM](https://slurm.schedmd.com/documentation.html) work load manager for job submission by snakemake. For individuals running on non-workload managed clusters, or utilising other workload managers, profiles are provided to allow for job submission with minimal configuration. Currently supported profiles are `local`, `slurm`, and `pbs`. These can be edited via the script described in the next section.
+The profile configs (cluster_config.yaml & config.yaml) (`profile/*/*`) contain the necessary information to configure the job submission parameters passed to a given workload manager (or lack thereof). This includes the number of concurrently submitted jobs, account/project name, partition/queue name, and default job resources (though these are low and should work on almost any cluster). This pipeline was primarily developed using the [SLURM](https://slurm.schedmd.com/documentation.html) workload manager for job submission by snakemake. For individuals running on non-workload managed clusters, or utilising other workload managers, profiles are provided to allow for job submission with minimal configuration. Currently supported profiles are `local`, `slurm`, and `pbs`. These can be edited via the script described in the next section.
 
 #### Updating the pipeline configuration
 
 Environment-specific and pipeline-specific parameters need to be set for each run of this pipeline. While it is possible to manually edit the YAML files, a script has been provided to update the most frequently altered parameters programmatically. The script will iteratively list the parameter and its current value, asking for a user submitted new value should it be needed. If the value is already acceptable or does not need to be changed then an empty value (enter return without typing) will keep the current setting.
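
Returning to the homozygous-loss parameters in the filtering table above, a minimal sketch of how they interact, mirroring the logic added in `ploidy_purity_search_standard_error.R` and `gridsearch_results_filtering.R` (the `bin_size` and per-bin copy number values here are illustrative assumptions):

```
# Assumed bin width in basepairs and toy per-bin absolute CN values
bin_size <- 30000
abs_cn <- c(0.1, 0.3, 2.1, 1.9, 0.2, 3.8)

homozygous_threshold <- 0.4   # CN at or below this counts as homozygous loss
homozygous_prop <- 10000000   # tolerated basepairs of homozygous loss

# Basepairs of genome called as homozygous loss under this candidate fit
homozygousLoss <- sum(abs_cn <= homozygous_threshold) * bin_size
# Fits exceeding the tolerated extent are discarded when filter_homozygous is "TRUE"
keep_fit <- homozygousLoss <= homozygous_prop
```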
-With the `swgs-abscn` conda environment active, run one of the following code for the profile you wish to update (typically both the pipeline configuration and one cluster configuration):
+Run one of the following commands for the profile you wish to update (typically both the pipeline configuration and one cluster configuration):
 
 ```
+# conda - with swgs-abscn env
 # Pipeline configuration
-python update_configs.py -c config
-# Slurm configuration
-python update_configs.py -c slurm
-# PBS-torque configuration
-python update_configs.py -c pbs
-# Local/non-managed configuration
-python update_configs.py -c local
+./update_configs.py -c config
+# Pipeline filters
+./update_configs.py -c filters
+# workflow configuration
+./update_configs.py -c {slurm,pbs,local}
+```
+Singularity users can run the following (whilst within the repository directory):
+```
+SIGCMD="singularity exec --bind "$(pwd -P)" docker://phil9s/swgs-absolutecn:latest $(pwd -P)"
+# Pipeline configuration
+$SIGCMD/update_configs.py -c config
+# Pipeline filters
+$SIGCMD/update_configs.py -c filters
+# workflow configuration
+$SIGCMD/update_configs.py -c {slurm,pbs,local}
+
 ```
 
 ## Running the pipeline
 
-### Step 5 Stage 1
+### Step 4 Stage 1
 
 Once the pipeline and cluster parameters have been set and the samplesheet is prepared, the first stage of the pipeline is ready to run.
-
-With the `swgs-abscn` conda environment active, run the following:
+Be sure to update the profile path to match your cluster/server configuration. To confirm the pipeline is configured correctly, run the first stage in `dry-run` mode.
 
 ```
-snakemake -n --profile profile/*/ --snakefile stage_1
+# conda - with swgs-abscn env
+snakemake -n --profile profile/slurm/ --snakefile stage_1
+```
+```
+# singularity
+snakemake -n --profile profile/slurm/ --snakefile stage_1 ${SIGBIND}
 ```
-
 If the previous step ran without error then run the following:
-
 ```
-snakemake --profile profile/*/ --snakefile stage_1
+# conda - with swgs-abscn env
+snakemake --profile profile/slurm/ --snakefile stage_1
+```
+```
+# singularity
+snakemake --profile profile/slurm/ --snakefile stage_1 ${SIGBIND}
 ```
 
-Where * is replaced with the profile matching your cluster/server configuration.
-
-### Step 6 QC1
+### Step 5 QC1
 
-At the conclusion of stage 1, files and fits will be generated for all samples present in the `samplesheet.tsv` provided. This will include grid search-generated fits, copy number profile plots, and a `QDNAseq` RDS file containing the copy number fit data. This data is not immediately usable in downstream processes and fits must undergo quality control and fit selection, as samples may generate more than one viable copy number profile.
+At the conclusion of stage 1, files and fits will be generated for all samples present in the `samplesheet.tsv` provided. This will include grid search-generated fits, copy number profile plots, and a `QDNAseq` RDS file containing the copy number fit data. This data is not immediately usable in downstream processes and fits must undergo quality control and fit selection, as samples may generate more than one viable copy number profile.
 
 #### Smoothing
 
-Prior to fit selection, a subset of samples will likely require smoothing of segments in order to be viable. Read the guide provided here to select and update which samples require smoothing [here](resources/smoothing_guide.md).
Once the `samplesheet.tsv` has been updated with the new `smooth` values, rerun stage 1 using the following:
+Prior to fit selection, a subset of samples may require smoothing of segments in order to be viable. Read the guide provided [here](resources/smoothing_guide.md) to select and update which samples require smoothing. Once the `samplesheet.tsv` has been updated with the new `smooth` values, rerun stage 1. Be sure to update the profile path to match your cluster/server configuration.
 
 ```
-snakemake --profile profile/*/ -F all --snakefile stage_1
+# conda - with swgs-abscn env
+snakemake --profile profile/slurm/ -F all --snakefile stage_1
+```
+```
+# singularity
+snakemake --profile profile/slurm/ -F all --snakefile stage_1 ${SIGBIND}
 ```
-
-Where * is replaced with the profile matching your cluster/server configuration.
 
 #### Fit selection
 
 After running stage 1 with the appropriate smoothing values, copy number fits will have been generated for each sample. In most cases, multiple viable fits will have been selected for each sample and a semi-qualitative quality control process needs to be applied to select the best fitting solution or exclude a sample should no fit be good enough.
 
-Follow the guide on fit selection [here](resources/quality_control_guide.md) to perform quality control and update the `{project}__fit_QC_predownsample.tsv` file. Once this step has been performed and a single acceptable fit has been selected for each sample, proceed to step 7.
+Follow the guide on fit selection [here](resources/quality_control_guide.md) to perform quality control and update the `{project}_fit_QC_predownsample.tsv` file. Once this step has been performed and a single acceptable fit has been selected for each sample, proceed to stage 2.
 
-### Step 7 Stage 2
+### Step 6 Stage 2
 
 Provided quality control and fit selection was performed correctly, stage 2 of the pipeline can be performed which refits all copy number profiles using downsampled BAM files and the selected fits from stage 1.
+Be sure to update the profile path to match your cluster/server configuration. As before, confirm the pipeline is configured correctly by running in `dry-run` mode.
 
 ```
-snakemake -n --profile profile/*/ --snakefile stage_2
+# conda - with swgs-abscn env
+snakemake -n --profile profile/slurm/ --snakefile stage_2
+```
+```
+# singularity
+snakemake -n --profile profile/slurm/ --snakefile stage_2 ${SIGBIND}
 ```
-
 and if the previous step ran without error then run the following:
+```
+# conda - with swgs-abscn env
+snakemake --profile profile/slurm/ --snakefile stage_2
+```
+```
+# singularity
+snakemake --profile profile/slurm/ --snakefile stage_2 ${SIGBIND}
+```
+### Downsampling only
+
+In some instances, where ploidy and purity are provided from alternative sources, users may only want to perform the read depth normalisation step and not perform a gridsearch. In these cases, if all samples in the `sample_sheet.tsv` contain valid `precPloidy` and `precPurity` values, the `processPrecomputed.R` script can generate all the required output files to start stage 2 without needing to perform any gridsearch or select a ploidy/purity combination, skipping directly to read depth normalisation (via read downsampling) using the provided `precPloidy` and `precPurity` combination.
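
As a minimal sketch, the gate for this downsampling-only route mirrors the precomputed-value check added to `stage_1`, here rewritten in R (the samplesheet path is an assumption):

```
# R equivalent of the stage_1 check that every sample carries precomputed values
samplesheet <- read.table("sample_sheet.tsv", header = TRUE, sep = "\t")
if (all(!is.na(samplesheet$precPloidy)) && all(!is.na(samplesheet$precPurity))) {
  message("all ploidy and purity values are precomputed - skip to stage_2")
}
```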
+
+First, check all samples have valid `precPloidy` and `precPurity` values by running the following:
+```
+# conda - with swgs-abscn env
+snakemake -n --profile profile/slurm/ --snakefile stage_1
+```
+```
+# singularity
+snakemake -n --profile profile/slurm/ --snakefile stage_1 ${SIGBIND}
+```
+This should return an error message stating `all ploidy and purity values are precomputed - Run Rscript scripts/processPrecomputed.R and skip to stage_2`. If this did not occur and stage 1 performed a dry-run, then not all samples have `precPloidy` and `precPurity` values specified.
+
+Provided that the previous step validated and returned the specified message, the following can be run to generate the required output files and skip stage 1:
+
+```
+# conda - with swgs-abscn env
+Rscript scripts/processPrecomputed.R
+```
 ```
-snakemake --profile profile/*/ --snakefile stage_2
+# singularity
+SIGCMD="singularity exec --bind "$(pwd -P)" docker://phil9s/swgs-absolutecn:latest"
+$SIGCMD Rscript scripts/processPrecomputed.R
 ```
 
-Where * is replaced with the profile matching your cluster/server configuration.
+At this point, stage 2 can be performed as previously described.
 
-### Step 8 QC2
+## Output files
 
-To confirm the quality of newly generated downsampled absolute copy number profiles generated by stage 2, an evalution of outputted fits should be performed as described previously in step 6 [here](resources/quality_control_guide.md). The output `{}` should be updated accordingly, with poor fits being excluded.
+The final output generated by stage 2 is three files located in the specified output directory `{output_dir}/sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/abs_cn_rds/`:
+- `*_ds_abs_fits.tsv` - Tab-separated file containing absolute copy number profile metadata and fitting information
+- `*_ds_absCopyNumber.rds` - QDNAseqmod class object containing binned copy number data in `rds` format
+- `*_ds_absCopyNumber_segTable.tsv` - A multisample segment table consisting of copy number segment coordinates and associated segment values as unrounded absolute copy number
 
 ## Authors