From 3ae970890bba04326ed5a1d02aff2f8bcca902ef Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Thu, 11 Apr 2024 17:26:51 +0100
Subject: [PATCH 01/18] fixed pipeline failure when only a single sample is
 provided

---
 scripts/gridsearch_results_filtering.R | 100 +++++++++++++++------
 scripts/qdnaseq_rel_to_abs.R           |   4 +-
 2 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R
index 56851f9..91dba31 100644
--- a/scripts/gridsearch_results_filtering.R
+++ b/scripts/gridsearch_results_filtering.R
@@ -32,7 +32,9 @@ collapse_rds <- function(rds.list){
       comb <- combine(comb,add)
     }
     rds.obj <- comb
-  }
+  } else {
+    rds.obj <- comb
+  }
   return(rds.obj)
 }
 # Combine and load rds objects
@@ -103,57 +105,109 @@ if(!dir.exists(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_d
   dir.create(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots"))
 }
 
-#relative_smoothed
-#Plot absolute CN fits for assessment
-foreach(i=unique(pruned_results$SAMPLE_ID)) %dopar% {
+if(length(unique(pruned_results$SAMPLE_ID)) == 1){
+  i <- unique(pruned_results$SAMPLE_ID)
   dat <- pruned_results %>%
     filter(SAMPLE_ID == i) %>%
     arrange(ploidy)
   #arrange(rank_clonality)
-  x <- relative_smoothed[, i]
+  x <- relative_smoothed
   cn <- assayDataElement(x,"copynumber")
   seg <- assayDataElement(x,"segmented")
   rel_ploidy <- mean(cn,na.rm=T)
   ll <- nrow(dat)
   png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"), w= 450*ll, h = 350)
-  par(mfrow = c(1,ll))
+  par(mfrow = c(1,ll))
   for(n in 1:nrow(dat)){
-
     ploidy <- dat[n,]$ploidy
     purity <- dat[n,]$purity
     cellploidy <- ploidy*purity+2*(1-purity)
     seqdepth <- rel_ploidy/cellploidy
-
     expTP53 <- round(dat[n,]$expected_TP53_AF, 2)
     TP53 <- dat[n,]$TP53freq
+
+    #convert to abs
-
+
     pData(x)$ploidy <- ploidy
     pData(x)$purity <- purity
-
+
     temp <- x
     abs_cn <- depthtocn(cn,purity,seqdepth)
     abs_seg <- depthtocn(seg,purity,seqdepth)
     assayDataElement(temp,"copynumber") <- abs_cn
     assayDataElement(temp,"segmented") <- abs_seg
-
+
     #tmp_abs <- convert_rd_to_cn(x)
-    # plot
+    # plot
     if(ploidy>5){
       yrange=15
-    }else
-    {
+    } else {
       yrange=10
     }
-    plot(temp,doCalls=FALSE,showSD=TRUE,logTransform=FALSE,ylim=c(0,yrange),ylab="Absolute tumour CN",
-         main=paste(i, " eTP53=",round(expTP53,2),
-                    " AF=", round(TP53,2),
-                    " p=",round(purity,2),
-                    " pl=",round(ploidy,2),
-                    sep=""),cex.main=0.8)
-    abline(h=1:9,col = "blue")
-
+    plot(temp,doCalls=FALSE,showSD=TRUE,logTransform=FALSE,ylim=c(0,yrange),ylab="Absolute tumour CN",
+         main=paste(i, " eTP53=",round(expTP53,2),
+                    " AF=", round(TP53,2),
+                    " p=",round(purity,2),
+                    " pl=",round(ploidy,2),
+                    sep=""),cex.main=0.8)
+    abline(h=1:9,col = "blue")
+
   }
   dev.off()
+} else {
+  #relative_smoothed
+  #Plot absolute CN fits for assessment
+  foreach(i=unique(pruned_results$SAMPLE_ID)) %dopar% {
+    dat <- pruned_results %>%
+      filter(SAMPLE_ID == i) %>%
+      arrange(ploidy)
+    #arrange(rank_clonality)
+    x <- relative_smoothed[, i]
+    cn <- assayDataElement(x,"copynumber")
+    seg <- assayDataElement(x,"segmented")
+    rel_ploidy <- mean(cn,na.rm=T)
+    ll <- nrow(dat)
+    png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"), w= 450*ll, h = 350)
+    par(mfrow = c(1,ll))
+    for(n in 1:nrow(dat)){
+
+      ploidy <- dat[n,]$ploidy
+      purity <- dat[n,]$purity
+      cellploidy <- ploidy*purity+2*(1-purity)
+      seqdepth <- rel_ploidy/cellploidy
+
+      expTP53 <- round(dat[n,]$expected_TP53_AF, 2)
+      TP53 <- dat[n,]$TP53freq
+
+      #convert to abs
+
+      pData(x)$ploidy <- ploidy
+      pData(x)$purity <- purity
+
+      temp <- x
+      abs_cn <- depthtocn(cn,purity,seqdepth)
+      abs_seg <- depthtocn(seg,purity,seqdepth)
+      assayDataElement(temp,"copynumber") <- abs_cn
+      assayDataElement(temp,"segmented") <- abs_seg
+
+      #tmp_abs <- convert_rd_to_cn(x)
+      # plot
+      if(ploidy>5){
+        yrange=15
+      } else {
+        yrange=10
+      }
+      plot(temp,doCalls=FALSE,showSD=TRUE,logTransform=FALSE,ylim=c(0,yrange),ylab="Absolute tumour CN",
+           main=paste(i, " eTP53=",round(expTP53,2),
+                      " AF=", round(TP53,2),
+                      " p=",round(purity,2),
+                      " pl=",round(ploidy,2),
+                      sep=""),cex.main=0.8)
+      abline(h=1:9,col = "blue")
+
+    }
+    dev.off()
+  }
 }
+
+#END
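The two collapse_rds() fixes in this patch guard the same failure: the original function only assigned rds.obj inside the if(length(rds.list) > 1) branch, so a run with a single input RDS died with an "object not found" error at return(). A stripped-down sketch of the scoping bug, using a toy list rather than the pipeline's QDNAseq objects:

collapse_rds_old <- function(rds.list){
  comb <- rds.list[[1]]
  if(length(rds.list) > 1){
    # combine() the remaining objects onto comb here
    rds.obj <- comb
  }
  return(rds.obj)
}
collapse_rds_old(list("sample_1"))
# Error in collapse_rds_old(list("sample_1")): object 'rds.obj' not found

The added else branch simply assigns rds.obj <- comb in the single-object case, which is all the fix requires; the same pattern is applied to the plotting code below, where relative_smoothed is used whole instead of being subset by sample.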
diff --git a/scripts/qdnaseq_rel_to_abs.R b/scripts/qdnaseq_rel_to_abs.R
index cf36cc3..af596f5 100644
--- a/scripts/qdnaseq_rel_to_abs.R
+++ b/scripts/qdnaseq_rel_to_abs.R
@@ -17,8 +17,8 @@ cores <- as.numeric(snakemake@threads)
 registerDoMC(cores)
 
 qc.data <- qc.data[qc.data$use == "TRUE",]
-
 rds.filename <- snakemake@input[["rds"]]
+
 rds.list <- lapply(rds.filename,FUN=function(x){readRDS(x)})
 
 collapse_rds <- function(rds.list){
@@ -29,6 +29,8 @@ collapse_rds <- function(rds.list){
     comb <- combine(comb,add)
   }
   rds.obj <- comb
+  } else {
+    rds.obj <- comb
   }
   return(rds.obj)
 }

From c9dfb4591c561644a50a0b9a140fd730aabf9966 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Thu, 11 Apr 2024 18:44:53 +0100
Subject: [PATCH 02/18] added dev utility to report segment counts

---
 dev_tools/report_seg_counts.R | 49 +++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 dev_tools/report_seg_counts.R

diff --git a/dev_tools/report_seg_counts.R b/dev_tools/report_seg_counts.R
new file mode 100644
index 0000000..386e3bf
--- /dev/null
+++ b/dev_tools/report_seg_counts.R
@@ -0,0 +1,49 @@
+args <- commandArgs(trailingOnly=T)
+library(yaml)
+
+cat("report segments - use 'report_seg_counts.R all' for individal seg counts\n")
+
+config <- read_yaml(file="config/config.yaml")
+
+projectBin <- paste0(config$project_name,"_",config$bin,"kb")
+outputLoc <- paste0(config$out_dir,"sWGS_fitting/",projectBin,"/")
+
+pre <- "absolute_PRE_down_sampling/"
+post <- "absolute_POST_down_sampling/abs_cn_rds/"
+
+preFile <- paste0(outputLoc,pre,projectBin,"_relSmoothedCN.rds")
+postFile <- paste0(outputLoc,post,projectBin,"_ds_absCopyNumber.rds")
+
+verbose <- FALSE
+if(length(args) > 0){
+  if(args[1] == "all"){
+    verbose <- TRUE
+  }
+}
+
+if(file.exists(preFile)){
+  suppressMessages(library(QDNAseqmod))
+  suppressMessages(library(Biobase))
+  preS <- readRDS(preFile)
+  preS <- preS[featureData(preS)$use]
+  preSegs <- apply(assayDataElement(preS,"segmented"),MARGIN=2,function(x) length(rle(x)$lengths))
+  cat("\nPre-downsampled segments\n")
+  if(verbose){
+    print(preSegs)
+  } else {
+    print(summary(preSegs))
+  }
+  if(file.exists(postFile)){
+    postS <- readRDS(postFile)
+    postS <- postS[featureData(postS)$use]
+    postSegs <- apply(assayDataElement(postS,"segmented"),MARGIN=2,function(x) length(rle(x)$lengths))
+    cat("\nPost-downsampled segments\n")
+    if(verbose){
+      print(postSegs)
+    } else {
+      print(summary(postSegs))
+    }
+  }
+} else {
+  cat("no pre or post downsampled files found\n")
+}
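The dev utility above counts segments per sample with run-length encoding: consecutive bins that share a segmented value belong to the same segment, so the number of runs equals the number of segments. A minimal illustration of the same reduction on a toy vector (values invented):

segvals <- c(2, 2, 2, 3, 3, 1, 1, 1, 1)  # three segments across nine bins
length(rle(segvals)$lengths)
# [1] 3

Applied column-wise over assayDataElement(obj, "segmented"), as the apply(..., MARGIN=2, ...) call does, this yields one segment count per sample, which is what the pre/post downsampling summaries report.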
From 08891089c8d99dde1462ae98050a844350f07a5c Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Wed, 17 Apr 2024 16:50:49 +0100
Subject: [PATCH 03/18] implemented setting seed in config and updated scripts
 to use new seed variable to fix segmentation - set to TRUE by default

---
 config/config.yaml            |  9 +++++++--
 dev_tools/report_seg_counts.R |  2 +-
 rules/downsampled_rel_rds.smk |  4 +++-
 rules/rel_rds.smk             |  4 +++-
 schemas/config.schema.yaml    |  9 ++++++++-
 scripts/qdnaseq_mod.R         | 13 +++++++++++--
 scripts/qdnaseq_mod_ds.R      | 17 ++++++++++++++---
 update_configs.py             |  2 ++
 8 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 91f0217..9caa5ca 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -3,17 +3,22 @@
 samplesheet: "sample_sheet.tsv"
 
 # Output location
-out_dir: "/mnt/scratcha/fmlab/smith10/britroc/"
+out_dir: "/mnt/scratcha/fmlab/smith10/swgs_test/"
 
 # Bin sizes
 # By default any in [1,5,15,30,50,100,500,1000]
 bins:
 - 30
 
-project_name: "britroc"
+project_name: "nm_test"
 
 # Pipeline parameters
 af_cutoff: 0.15
 
+# Set seed for CBS - TRUE or FALSE
+# default TRUE
+use_seed: "TRUE"
+seed_val: "9999"
+
 # Not implemented
 #custom_bin: false
 #custom_bin_folder: "/custom_bin_data/"
diff --git a/dev_tools/report_seg_counts.R b/dev_tools/report_seg_counts.R
index 386e3bf..74051e4 100644
--- a/dev_tools/report_seg_counts.R
+++ b/dev_tools/report_seg_counts.R
@@ -1,7 +1,7 @@
 args <- commandArgs(trailingOnly=T)
 library(yaml)
 
-cat("report segments - use 'report_seg_counts.R all' for individal seg counts\n")
+cat("report segments - use 'report_seg_counts.R all' for individual seg counts\n")
 
 config <- read_yaml(file="config/config.yaml")
 
diff --git a/rules/downsampled_rel_rds.smk b/rules/downsampled_rel_rds.smk
index e6d4254..8218978 100644
--- a/rules/downsampled_rel_rds.smk
+++ b/rules/downsampled_rel_rds.smk
@@ -7,6 +7,8 @@ rule ds_relRDS:
     params:
         outdir=OUT_DIR,
         project="{project}",
-        bin="{bin}"
+        bin="{bin}",
+        use_seed=config["use_seed"],
+        seed_val=config["seed_val"]
     script:
         "../scripts/qdnaseq_mod_ds.R"
diff --git a/rules/rel_rds.smk b/rules/rel_rds.smk
index 812f01f..3e5d406 100644
--- a/rules/rel_rds.smk
+++ b/rules/rel_rds.smk
@@ -7,7 +7,9 @@ rule relRDS:
         bin="{bin}",
         outdir=OUT_DIR,
         project="{project}",
-        meta=config["samplesheet"]
+        meta=config["samplesheet"],
+        use_seed=config["use_seed"],
+        seed_val=config["seed_val"]
     script:
         "../scripts/qdnaseq_mod.R"
diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml
index 7179698..8ce5948 100644
--- a/schemas/config.schema.yaml
+++ b/schemas/config.schema.yaml
@@ -18,6 +18,12 @@ properties:
     type: number
     min: 0
     max: 1.0
+  use_seed:
+    type: string
+    enum: ["TRUE","FALSE"]
+  seed_val:
+    type: string
+
 
 # entries that have to be in the config file for successful validation
 required:
@@ -25,4 +31,5 @@ required:
   - out_dir
   - bins
   - project_name
-
+  - use_seed
+  - seed_val
diff --git a/scripts/qdnaseq_mod.R b/scripts/qdnaseq_mod.R
index 15e0da7..f472d01 100644
--- a/scripts/qdnaseq_mod.R
+++ b/scripts/qdnaseq_mod.R
@@ -7,6 +7,8 @@ ncores <- 1
 output_dir <- snakemake@params[["outdir"]]
 project <- snakemake@params[["project"]]
 metafile <- snakemake@params[["meta"]]
+use_seed <- snakemake@params[["use_seed"]]
+seed_val <- snakemake@params[["seed_val"]]
 metadata <- read.table(file = metafile,header=T,sep="\t")
 bam_list <- snakemake@input[["bams"]]
 outname <- snakemake@output[[1]]
@@ -42,8 +44,15 @@ assayDataElement(copyNumbers[[1]],"copynumber") <- assayDataElement(copyNumbers[
 
 # smooth outliers (Data is now ready to be analyzed with a downstream package of choice (exportBins))
 copyNumbersSmooth <- mclapply(X=copyNumbers, FUN=smoothOutlierBins, mc.cores=1)
 
+# Implement seeding to prevent variable segments on repeated runs
+if(use_seed){
+  seed <- as.character(seed_val)
+} else {
+  seed <- NULL
+}
+
 # perform segmentation on bins and save it
-copyNumbersSegmented <- mclapply(X=copyNumbersSmooth, FUN=segmentBins, transformFun="sqrt", mc.cores=ncores)
+copyNumbersSegmented <- mclapply(X=copyNumbersSmooth, FUN=segmentBins, transformFun="sqrt", mc.cores=ncores,seeds=seed)
 
 # For each
 smooth_samples <- function(obj){
@@ -64,7 +73,7 @@ smooth_samples <- function(obj){
     segnum<-as.numeric(lapply(segments,function(x){length(x$lengths)}))
     while(sum(segnum>maxseg)&sdadjust<5)
     {
-      currsamp<-segmentBins(currsamp, transformFun="sqrt",undo.SD=sdadjust)
+      currsamp<-segmentBins(currsamp, transformFun="sqrt",undo.SD=sdadjust,seeds=seed)
       segments <- assayDataElement(currsamp, "segmented")[condition, , drop=FALSE]
       segments<-apply(segments,2,rle)
       segnum<-as.numeric(lapply(segments,function(x){length(x$lengths)}))
diff --git a/scripts/qdnaseq_mod_ds.R b/scripts/qdnaseq_mod_ds.R
index d67517a..eb56c86 100644
--- a/scripts/qdnaseq_mod_ds.R
+++ b/scripts/qdnaseq_mod_ds.R
@@ -10,6 +10,9 @@ metafile <- snakemake@input[["meta"]]
 metadata <- read.table(file = metafile,header=T,sep="\t")
 bam_list <- snakemake@input[["bam"]]
 outname <- snakemake@output[[1]]
+use_seed <- snakemake@params[["use_seed"]]
+seed_val <- snakemake@params[["seed_val"]]
+
 
 suppressMessages(library(parallel))
 suppressMessages(library(tidyverse))
@@ -42,8 +45,16 @@ assayDataElement(copyNumbers[[1]],"copynumber") <- assayDataElement(copyNumbers[
 
 # smooth outliers (Data is now ready to be analyzed with a downstream package of choice (exportBins))
 copyNumbersSmooth <- mclapply(X=copyNumbers, FUN=smoothOutlierBins, mc.cores=1)
-
-# perform segmentation on bins and save it
-copyNumbersSegmented <- mclapply(X=copyNumbersSmooth, FUN=segmentBins, transformFun="sqrt", mc.cores=ncores)
+
+# Implement seeding to prevent variable segments on repeated runs
+if(use_seed){
+  seed <- as.character(seed_val)
+} else {
+  seed <- NULL
+}
+
+# perform segmentation on bins and save it
+copyNumbersSegmented <- mclapply(X=copyNumbersSmooth, FUN=segmentBins, transformFun="sqrt", mc.cores=ncores,seeds=seed)
 
 changeSampleName <- function(CNsObj)
 {
@@ -75,7 +86,7 @@ smooth_samples <- function(obj){
     segnum<-as.numeric(lapply(segments,function(x){length(x$lengths)}))
     while(sum(segnum>maxseg)&sdadjust<5)
     {
-      currsamp<-segmentBins(currsamp, transformFun="sqrt",undo.SD=sdadjust)
+      currsamp<-segmentBins(currsamp, transformFun="sqrt",undo.SD=sdadjust,seeds=seed)
       segments <- assayDataElement(currsamp, "segmented")[condition, , drop=FALSE]
       segments<-apply(segments,2,rle)
       segnum<-as.numeric(lapply(segments,function(x){length(x$lengths)}))
diff --git a/update_configs.py b/update_configs.py
index 2aab6b0..a77b78e 100755
--- a/update_configs.py
+++ b/update_configs.py
@@ -43,6 +43,8 @@ def get_input(x,y):
     config['out_dir'] = DoubleQuotedScalarString(get_input(config['out_dir'],'out_dir'))
     config['project_name'] = DoubleQuotedScalarString(get_input(config['project_name'],'project_name'))
     config['af_cutoff'] = float(get_input(config['af_cutoff'],'af_cutoff'))
+    config['use_seed'] = DoubleQuotedScalarString(get_input(config['use_seed'],'use_seed'))
+    config['seed_val'] = DoubleQuotedScalarString(get_input(config['seed_val'],'seed_val'))
     # Save updated yaml
     with open('config/config.yaml', 'w') as fp:
         yaml.dump(config, fp)
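The seed handling added across both QDNAseq scripts reduces to one small gate: translate the quoted config strings into either a seed value or NULL, then pass that through to segmentation. A sketch of the pattern, assuming (as the diff indicates) that the modified QDNAseqmod segmentBins() accepts a seeds= argument — stock QDNAseq does not:

use_seed <- "TRUE"   # snakemake@params[["use_seed"]], quoted string from config.yaml
seed_val <- "9999"   # snakemake@params[["seed_val"]]
seed <- if (use_seed == "TRUE") as.character(seed_val) else NULL
# seeds=seed  -> identical CBS segments on repeated runs
# seeds=NULL  -> segmentation is unseeded and repeat runs may differ

Keeping both values as quoted strings in config.yaml sidesteps YAML boolean coercion, at the cost of the string handling shown above.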
From 55f0cdd85ccb9ae31cdafb6880e409b2d7841d93 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Wed, 17 Apr 2024 18:29:16 +0100
Subject: [PATCH 04/18] removed *.csv file ext on tab-separated files and
 replaced with *.tsv

---
 rules/filter_gridsearch.smk                   | 2 +-
 rules/gridsearch.smk                          | 2 +-
 scripts/gridsearch_results_filtering.R        | 2 +-
 scripts/ploidy_purity_search_standard_error.R | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/rules/filter_gridsearch.smk b/rules/filter_gridsearch.smk
index 78df00d..4b0784e 100644
--- a/rules/filter_gridsearch.smk
+++ b/rules/filter_gridsearch.smk
@@ -1,6 +1,6 @@
 rule gridsearch_filter:
     input:
-        cl=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/clonality_results/{{project}}_{sample}_clonality.csv",sample=SAMPLES),
+        cl=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/clonality_results/{{project}}_{sample}_clonality.tsv",sample=SAMPLES),
         rds=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/relative_cn_rds/{{project}}_{sample}_{{bin}}kb_relSmoothedCN.rds",sample=SAMPLES)
     output:
         OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_fit_QC_predownsample.tsv"
diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk
index 086d852..7f345b0 100644
--- a/rules/gridsearch.smk
+++ b/rules/gridsearch.smk
@@ -2,7 +2,7 @@ rule gridsearch_fitting:
     input:
         expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/relative_cn_rds/{{project}}_{{sample}}_{{bin}}kb_relSmoothedCN.rds")
     output:
-        csv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.csv",
+        tsv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.tsv",
         pdf=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.pdf"
     params:
         bin="{bin}",
diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R
index 91dba31..4518750 100644
--- a/scripts/gridsearch_results_filtering.R
+++ b/scripts/gridsearch_results_filtering.R
@@ -44,7 +44,7 @@ saveRDS(relative_smoothed,paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/abs
 filelist <- snakemake@input[["cl"]]
 clonality <- do.call(rbind,
   lapply(filelist,FUN = function(x){
-    n <- gsub(pattern="_clonality.csv",rep="",x=x)
+    n <- gsub(pattern="_clonality.tsv",rep="",x=x)
     prefix <- paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/clonality_results/",project,"_")
     n <- gsub(pattern=prefix,rep="",x=n)
     tab <- read.table(x,sep="\t",skip=1)
diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R
index 95f108b..2198c8e 100644
--- a/scripts/ploidy_purity_search_standard_error.R
+++ b/scripts/ploidy_purity_search_standard_error.R
@@ -189,4 +189,4 @@
 print(ggplot(res,aes(x=ploidy,y=purity,fill=clonality))+geom_tile()+ theme_bw())
 dev.off()
 
-write.table(res,snakemake@output[["csv"]],sep="\t",quote=F,row.names=FALSE)
+write.table(res,snakemake@output[["tsv"]],sep="\t",quote=F,row.names=FALSE)

From f50b5d7bae6f68d404f520ef322d3c4bf100989 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Mon, 22 Apr 2024 18:54:58 +0100
Subject: [PATCH 05/18] implemented power filter config option - allows the
 user to disable the CN fit power filtering

---
 config/config.yaml                            |  4 +++
 rules/filter_gridsearch.smk                   |  3 ++-
 schemas/config.schema.yaml                    |  4 +++
 scripts/downsampleBams.R                      |  1 +
 scripts/gridsearch_results_filtering.R        | 26 ++++++++++++++++---
 scripts/ploidy_purity_search_standard_error.R |  4 +--
 update_configs.py                             |  1 +
 7 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 9caa5ca..16aa999 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -19,6 +19,10 @@ af_cutoff: 0.15
 use_seed: "TRUE"
 seed_val: "9999"
 
+# filter underpowered solutions - TRUE or FALSE
+# Default TRUE
+filter_underpowered: "FALSE"
+
 # Not implemented
 #custom_bin: false
 #custom_bin_folder: "/custom_bin_data/"
diff --git a/rules/filter_gridsearch.smk b/rules/filter_gridsearch.smk
index 4b0784e..a4add21 100644
--- a/rules/filter_gridsearch.smk
+++ b/rules/filter_gridsearch.smk
@@ -9,7 +9,8 @@ rule gridsearch_filter:
         meta=config["samplesheet"],
         project="{project}",
         outdir=OUT_DIR,
-        af_cutoff=config["af_cutoff"]
+        af_cutoff=config["af_cutoff"],
+        filter_underpowered=config["filter_underpowered"]
     threads: THREADS
     script:
         "../scripts/gridsearch_results_filtering.R"
diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml
index 8ce5948..d28f44a 100644
--- a/schemas/config.schema.yaml
+++ b/schemas/config.schema.yaml
@@ -23,6 +23,9 @@ properties:
     enum: ["TRUE","FALSE"]
   seed_val:
     type: string
+  filter_underpowered:
+    type: string
+    enum: ["TRUE","FALSE"]
 
 
 # entries that have to be in the config file for successful validation
 required:
@@ -33,3 +36,4 @@ required:
   - use_seed
   - seed_val
+  - filter_underpowered
diff --git a/scripts/downsampleBams.R b/scripts/downsampleBams.R
index e9d3084..f3c008d 100644
--- a/scripts/downsampleBams.R
+++ b/scripts/downsampleBams.R
@@ -26,6 +26,7 @@ perc <- fit.qc.filt %>%
   filter(SAMPLE_ID == sample_name) %>%
   .$ratio
 
+# Only downsample when the read ratio is at most 0.96 (i.e. not already close to, or above, the available reads)
 if( perc <= 0.96){
   cmd.downsample <- paste("samtools view -s ", perc," -b ",bam_in," > ",outname)
   cmd.index <- paste0("samtools index ",outname)
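The downsampling command built in downsampleBams.R is driven by a single ratio: reads required by the accepted fit over reads actually present in the BAM. A compact sketch of the construction with invented numbers (samtools view -s takes the subsampling fraction; in.bam/out.bam are placeholders):

downsample_depth <- 9.5e6            # reads required by the chosen ploidy/purity fit
total_reads <- 3.1e7                 # reads present in the input BAM
perc <- round(downsample_depth / total_reads, digits = 3)  # 0.306
if (perc <= 0.96) {
  cmd <- paste0("samtools view -s ", perc, " -b in.bam > out.bam")
  # system(cmd); system("samtools index out.bam")
}

Ratios above 0.96 are treated as "already at or below the required depth", so no subsampling command is issued for those samples.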
diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R
index 4518750..bf1fd61 100644
--- a/scripts/gridsearch_results_filtering.R
+++ b/scripts/gridsearch_results_filtering.R
@@ -14,10 +14,18 @@ metadata <- read.table(file = metafile,header=T,sep="\t")
 bin <- as.numeric(snakemake@params[["bin"]])
 out_dir <- snakemake@params[["outdir"]]
 project <- snakemake@params[["project"]]
+filter_underpowered <- snakemake@params[["filter_underpowered"]]
 af_cutoff <- as.numeric(snakemake@params[["af_cutoff"]])
 cores <- as.numeric(snakemake@threads)
 registerDoMC(cores)
 
+# Filter for powered fits only
+if(filter_underpowered == "TRUE"){
+  filter_underpowered <- TRUE
+} else {
+  filter_underpowered <- FALSE
+}
+
 # read in relative CN data
 # collapse rds files function
 rds.filename <- snakemake@input[["rds"]]
@@ -63,20 +71,30 @@ depthtocn<-function(x,purity,seqdepth) #converts readdepth to copy number given
 }
 
 #Top 10 when ranking by clonality and TP53
+print(filter_underpowered)
+print(clonality)
+
+filtered_results <- clonality
+
+if(filter_underpowered){
+  filtered_results <- filtered_results %>%
+    filter(powered == 1)
+}
 
-filtered_results <- clonality %>%
+filtered_results <- filtered_results %>% # filter underpowered fits when config variable is TRUE
   select(SAMPLE_ID, PATIENT_ID, everything()) %>%
-  filter(powered ==1) %>%
   group_by(SAMPLE_ID, ploidy) %>%
   mutate(rank_clonality = min_rank(clonality)) %>% #rank clonality within a unique ploidy state
-  filter(rank_clonality ==1) %>% #select ploidy with the lowest clonality within a unique ploidy state
+  filter(rank_clonality == 1) %>% #select ploidy with the lowest clonality within a unique ploidy state
   group_by(SAMPLE_ID) %>%
   top_n(-10, wt = clonality) %>% # select top 10 ploidy states with the lowest clonality values
   mutate(rank_clonality = min_rank(clonality)) %>% # rank by clonality within a sample across ploidies in top 10
-  # retain samples without TP53 mutations and where expected and observed TP53freq <=0.15
+  filter(expected_TP53_AF > 0) %>% # keep fits where TP53 is positive
+  # retain samples without TP53 mutations and where expected and observed TP53freq <=0.15
   filter(is.na(TP53freq) | near(expected_TP53_AF,TP53freq, tol = af_cutoff )) %>%
   arrange(PATIENT_ID, SAMPLE_ID )
 
+print(filtered_results)
 
 #Further limit the results by selecting the ploidy states with the lowest clonality values where multiple similar solutions are present.
 #Threshold of 0.3 used to select different states
diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R
index 2198c8e..9dbf99e 100644
--- a/scripts/ploidy_purity_search_standard_error.R
+++ b/scripts/ploidy_purity_search_standard_error.R
@@ -107,7 +107,7 @@ cntodepth<-function(cn,purity,seqdepth) #converts copy number to read depth give
 {
   seqdepth*((1-purity)*2+purity*cn)
 }
-## TP53 target bin
+## TP53 target bin - hg19@30kb
 target <- c("17:7565097-7590863")
 get_gene_seg <- function(target=NULL,abs_data=NULL){
   to_use <- fData(abs_data)$use
@@ -137,7 +137,7 @@ gene_bin_seg <- get_gene_seg(target = target,abs_data = rds.obj[[1]])
 
 #estimate absolute copy number fits for all samples in parallel
 ploidies<-seq(1.6,8,0.1)
-purities<-seq(0.05,1,0.01)
+purities<-seq(0.15,1,0.01)
 clonality<-c()
 #ind<-which(colnames(rds.obj)==sample)
 relcn<-rds.obj[[1]]
diff --git a/update_configs.py b/update_configs.py
index a77b78e..1038f98 100755
--- a/update_configs.py
+++ b/update_configs.py
@@ -45,6 +45,7 @@ def get_input(x,y):
     config['use_seed'] = DoubleQuotedScalarString(get_input(config['use_seed'],'use_seed'))
     config['seed_val'] = DoubleQuotedScalarString(get_input(config['seed_val'],'seed_val'))
+    config['filter_underpowered'] = DoubleQuotedScalarString(get_input(config['filter_underpowered'],'filter_underpowered'))
     # Save updated yaml
     with open('config/config.yaml', 'w') as fp:
         yaml.dump(config, fp)

From bd0162cbd41aff574ac532bfb89f38e2578eab19 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Tue, 23 Apr 2024 18:00:01 +0100
Subject: [PATCH 06/18] export segment table as final output and remove
 redundant functions

---
 scripts/ploidy_purity_search_standard_error.R | 68 -------------------
 scripts/qdnaseq_rel_to_abs.R                  | 39 +++++++++++
 2 files changed, 39 insertions(+), 68 deletions(-)

diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R
index 9dbf99e..3d27fd3 100644
--- a/scripts/ploidy_purity_search_standard_error.R
+++ b/scripts/ploidy_purity_search_standard_error.R
@@ -34,79 +34,11 @@ nbins_ref_genome <- sum(fData(rds.obj[[1]])$use)
 nbins<-nrow(bins)
 
 #define helper functions
-getSegTable<-function(x) #returns a table containing copy number segments
-{
-  dat<-x
-  sn<-assayDataElement(dat,"segmented")
-  fd <- fData(dat)
-  fd$use -> use
-  fdfiltfull<-fd[use,]
-  sn<-sn[use,]
-  segTable<-c()
-  for(c in unique(fdfiltfull$chromosome))
-  {
-    snfilt<-sn[fdfiltfull$chromosome==c]
-    fdfilt<-fdfiltfull[fdfiltfull$chromosome==c,]
-    sn.rle<-rle(snfilt)
-    starts <- cumsum(c(1, sn.rle$lengths[-length(sn.rle$lengths)]))
-    ends <- cumsum(sn.rle$lengths)
-    lapply(1:length(sn.rle$lengths), function(s) {
-      from <- fdfilt$start[starts[s]]
-      to <- fdfilt$end[ends[s]]
-      segValue <- sn.rle$value[s]
-      c(fdfilt$chromosome[starts[s]], from, to, segValue)
-    }) -> segtmp
-    segTableRaw <- data.frame(matrix(unlist(segtmp), ncol=4, byrow=T),stringsAsFactors=F)
-    segTable<-rbind(segTable,segTableRaw)
-  }
-  colnames(segTable) <- c("chromosome", "start", "end", "segVal")
-  segTable
-}
-
-getPloidy<-function(abs_profiles) #returns the ploidy of a sample from segTab or QDNAseq object
-{
-  out<-c()
-  samps<-getSampNames(abs_profiles)
-  for(i in samps)
-  {
-    if(class(abs_profiles)=="QDNAseqCopyNumbers")
-    {
-      segTab<-getSegTable(abs_profiles[,which(colnames(abs_profiles)==i)])
-    }
-    else
-    {
-      segTab<-abs_profiles[[i]]
-      colnames(segTab)[4]<-"segVal"
-    }
-    segLen<-(as.numeric(segTab$end)-as.numeric(segTab$start))
-    ploidy<-sum((segLen/sum(segLen))*as.numeric(segTab$segVal))
-    out<-c(out,ploidy)
-  }
-  data.frame(out,stringsAsFactors = F)
-}
-
-getSampNames<-function(abs_profiles) # convenience function for getting sample names from QDNAseq or segTab list
-{
-  if(class(abs_profiles)=="QDNAseqCopyNumbers")
-  {
-    samps<-colnames(abs_profiles)
-  }
-  else
-  {
-    samps<-names(abs_profiles)
-  }
-  samps
-}
-
 depthtocn<-function(x,purity,seqdepth) #converts readdepth to copy number given purity and single copy depth
 {
   (x/seqdepth-2*(1-purity))/purity
 }
 
-cntodepth<-function(cn,purity,seqdepth) #converts copy number to read depth given purity and single copy depth
-{
-  seqdepth*((1-purity)*2+purity*cn)
-}
 ## TP53 target bin - hg19@30kb
 target <- c("17:7565097-7590863")
 get_gene_seg <- function(target=NULL,abs_data=NULL){
   to_use <- fData(abs_data)$use
diff --git a/scripts/qdnaseq_rel_to_abs.R b/scripts/qdnaseq_rel_to_abs.R
index af596f5..68c74bc 100644
--- a/scripts/qdnaseq_rel_to_abs.R
+++ b/scripts/qdnaseq_rel_to_abs.R
@@ -151,5 +151,44 @@ res <- data.frame(res,stringsAsFactors = F)
 # Save rds
 saveRDS(abs_profiles,file=paste0(output_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_POST_down_sampling/abs_cn_rds/",project,"_",bin,"kb_ds_absCopyNumber.rds"))
 
+# save segTable
+getSegTable<-function(x){
+  if(inherits(x,what = "QDNAseqCopyNumbers",which = F)){
+    sn<-Biobase::assayDataElement(x,"segmented")
+    fd <- Biobase::fData(x)
+    fd$use -> use
+    fdfiltfull<-fd[use,]
+    sn<-sn[use,]
+    segTable<-c()
+    for(s in colnames(sn)){
+      for(c in unique(fdfiltfull$chromosome))
+      {
+        snfilt<-sn[fdfiltfull$chromosome==c,colnames(sn) == s]
+        fdfilt<-fdfiltfull[fdfiltfull$chromosome==c,]
+        sn.rle<-rle(snfilt)
+        starts <- cumsum(c(1, sn.rle$lengths[-length(sn.rle$lengths)]))
+        ends <- cumsum(sn.rle$lengths)
+        lapply(1:length(sn.rle$lengths), function(s) {
+          from <- fdfilt$start[starts[s]]
+          to <- fdfilt$end[ends[s]]
+          segValue <- sn.rle$value[s]
+          c(fdfilt$chromosome[starts[s]], from, to, segValue)
+        }) -> segtmp
+        segTableRaw <- data.frame(matrix(unlist(segtmp), ncol=4, byrow=T),sample = rep(s,times=nrow(matrix(unlist(segtmp), ncol=4, byrow=T))),stringsAsFactors=F)
+        segTable<-rbind(segTable,segTableRaw)
+      }
+    }
+    colnames(segTable) <- c("chromosome", "start", "end", "segVal","sample")
+    return(segTable)
+  } else {
+    # NON QDNASEQ BINNED DATA FUNCTION
+    stop("segtable error")
+  }
+}
+
+write.table(getSegTable(abs_profiles),
+            paste0(output_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_POST_down_sampling/abs_cn_rds/",project,"_",bin,"kb_ds_absCopyNumber_segTable.tsv"),
+            sep = "\t",quote=F,row.names=FALSE)
+
 #write table of fits
 write.table(res,paste0(output_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_POST_down_sampling/abs_cn_rds/",project,"_",bin,"kb_ds_abs_fits.tsv"),sep = "\t",quote=F,row.names=FALSE)
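The getSegTable() rebuilt here collapses per-bin segmented values back into genomic segments, again via rle(): within each chromosome, a run of identical segmented values spans from the start of its first bin to the end of its last. The core reduction, shown for a single chromosome with hypothetical 30kb bins:

binsize <- 30000
starts_bp <- seq(1, by = binsize, length.out = 6)   # bin start positions
segvals   <- c(2.1, 2.1, 2.1, 3.4, 3.4, 3.4)        # per-bin segmented values
r <- rle(segvals)
first <- cumsum(c(1, head(r$lengths, -1)))  # index of first bin in each run
last  <- cumsum(r$lengths)                  # index of last bin in each run
data.frame(start  = starts_bp[first],
           end    = starts_bp[last] + binsize - 1,
           segVal = r$values)
#   start    end segVal
# 1     1  90000    2.1
# 2 90001 180000    3.4

The exported version additionally loops over samples to add a sample column, producing the long-format segTable.tsv written as a final output.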
From 62c8db0ed404313068685fca5ac1b113bf906c8b Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Tue, 23 Apr 2024 18:15:16 +0100
Subject: [PATCH 07/18] default config setting

---
 config/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/config.yaml b/config/config.yaml
index 16aa999..ff7bd97 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -21,7 +21,7 @@ seed_val: "9999"
 
 # filter underpowered solutions - TRUE or FALSE
 # Default TRUE
-filter_underpowered: "FALSE"
+filter_underpowered: "TRUE"
 
 # Not implemented
 #custom_bin: false

From 811d17cb7e716f6920eb2e7af2546ef31c22c39c Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Tue, 23 Apr 2024 19:26:11 +0100
Subject: [PATCH 08/18] Addition of pl/pu config values and schema entries.
 Fixed schema entry filters. Added python shebang to update_configs.py.
 Additional config value error catches. Added default_config.yaml as
 reference file.

---
 config/config.yaml         | 20 +++++++++++++++---
 config/default_config.yaml | 42 ++++++++++++++++++++++++++++++++++++++
 rules/common.smk           | 15 +++++++++++++-
 schemas/config.schema.yaml | 29 +++++++++++++++++++++++---
 update_configs.py          | 14 ++++++++++++-
 5 files changed, 112 insertions(+), 8 deletions(-)
 create mode 100644 config/default_config.yaml

diff --git a/config/config.yaml b/config/config.yaml
index ff7bd97..3f4a358 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,15 +1,19 @@
 ---
 # Sample sheet
-samplesheet: "sample_sheet.tsv"
+samplesheet: sample_sheet.tsv
 
 # Output location
-out_dir: "/mnt/scratcha/fmlab/smith10/swgs_test/"
+out_dir: results/
 
 # Bin sizes
 # By default any in [1,5,15,30,50,100,500,1000]
+# Add new line for additional bin sizes
 bins:
 - 30
+#- 100
+
+# Set project dir name
-project_name: "nm_test"
+project_name: nm_test
 
 # Pipeline parameters
 af_cutoff: 0.15
@@ -23,6 +27,16 @@ seed_val: "9999"
 # Default TRUE
 filter_underpowered: "TRUE"
 
+# ploidy range
+# Default min = 1.6 | max = 8
+ploidy_min: 1.6
+ploidy_max: 8.0
+
+# purity range (1 >= max > min >= 0)
+# Default min = 0.15 | max = 1.00
+purity_min: 0.15
+purity_max: 1.0
+
 # Not implemented
 #custom_bin: false
 #custom_bin_folder: "/custom_bin_data/"
diff --git a/config/default_config.yaml b/config/default_config.yaml
new file mode 100644
index 0000000..3f4a358
--- /dev/null
+++ b/config/default_config.yaml
@@ -0,0 +1,42 @@
+---
+# Sample sheet
+samplesheet: sample_sheet.tsv
+
+# Output location
+out_dir: results/
+
+# Bin sizes
+# By default any in [1,5,15,30,50,100,500,1000]
+# Add new line for additional bin sizes
+bins:
+- 30
+#- 100
+
+# Set project dir name
+project_name: nm_test
+
+# Pipeline parameters
+af_cutoff: 0.15
+
+# Set seed for CBS - TRUE or FALSE
+# default TRUE
+use_seed: "TRUE"
+seed_val: "9999"
+
+# filter underpowered solutions - TRUE or FALSE
+# Default TRUE
+filter_underpowered: "TRUE"
+
+# ploidy range
+# Default min = 1.6 | max = 8
+ploidy_min: 1.6
+ploidy_max: 8.0
+
+# purity range (1 >= max > min >= 0)
+# Default min = 0.15 | max = 1.00
+purity_min: 0.15
+purity_max: 1.0
+
+# Not implemented
+#custom_bin: false
+#custom_bin_folder: "/custom_bin_data/"
diff --git a/rules/common.smk b/rules/common.smk
index c783063..9a81abd 100644
--- a/rules/common.smk
+++ b/rules/common.smk
@@ -67,4 +67,17 @@ BIN_VALS = config["bins"]
 BIN_DEF = [1,5,15,30,50,100,500,1000]
 
 if not set(BIN_VALS).issubset(BIN_DEF):
-    sys.exit("Some bin values are not available")
+    sys.exit("Config error - Some specified bin values are not available")
+
+##### CHECK MAX > MIN #####
+PLMIN=config["ploidy_min"]
+PLMAX=config["ploidy_max"]
+PUMIN=config["purity_min"]
+PUMAX=config["purity_max"]
+
+if PLMIN > PLMAX:
sys.exit("Config error - Minimum ploidy exceeds or is equal to maximum ploidy") + +if PUMIN > PUMAX: + sys.exit("Config error - Minimum purity exceeds or is equal to maximum purity") + diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml index d28f44a..986d038 100644 --- a/schemas/config.schema.yaml +++ b/schemas/config.schema.yaml @@ -12,12 +12,15 @@ properties: type: string bins: type: array + items: + type: number + uniqueItems: true project_name: type: string af_cutoff: type: number - min: 0 - max: 1.0 + minimum: 0 + maximum: 1.0 use_seed: type: string enum: ["TRUE","FALSE"] @@ -26,7 +29,22 @@ properties: filter_underpowered: type: string enum: ["TRUE","FALSE"] - + ploidy_min: + type: number + minimum: 1 + maximum: 20 + ploidy_max: + type: number + minimum: 1 + maximum: 20 + purity_min: + type: number + minimum: 0 + maximum: 1.0 + purity_max: + type: number + minimum: 0 + maximum: 1.0 # entries that have to be in the config file for successful validation required: @@ -37,3 +55,8 @@ required: - use_seed - seed_val - filter_underpowered + - ploidy_min + - ploidy_max + - purity_min + - purity_max + diff --git a/update_configs.py b/update_configs.py index 1038f98..f27ea43 100755 --- a/update_configs.py +++ b/update_configs.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + import argparse import re import ruamel.yaml @@ -27,7 +29,7 @@ def get_input(x,y): type=str, \ required=True, \ nargs=1, \ - choices=['config','slurm','pbs','lsf','local']) + choices=['config','filters','slurm','pbs','lsf','local']) parser.add_argument("--advanced", help="Modify advanced cluster configuration", action="store_true") args = parser.parse_args() @@ -42,10 +44,20 @@ def get_input(x,y): config['bins'] = list(get_input(config['bins'],'bins')) config['out_dir'] = DoubleQuotedScalarString(get_input(config['out_dir'],'out_dir')) config['project_name'] = DoubleQuotedScalarString(get_input(config['project_name'],'project_name')) +elif args.config[0] == "filters": + # Config file updates + print('Update config.yaml values (Return no value to keep the current value)') + file_name = 'config/config.yaml' + config, ind, bsi = ruamel.yaml.util.load_yaml_guess_indent(open(file_name)) + # Update config values config['af_cutoff'] = float(get_input(config['af_cutoff'],'af_cutoff')) config['use_seed'] = DoubleQuotedScalarString(get_input(config['use_seed'],'use_seed')) config['seed_val'] = DoubleQuotedScalarString(get_input(config['seed_val'],'seed_val')) config['filter_underpowered'] = DoubleQuotedScalarString(get_input(config['filter_underpowered'],'filter_underpowered')) + config['ploidy_min'] = float(get_input(config['ploidy_min'],'ploidy_min')) + config['ploidy_max'] = float(get_input(config['ploidy_max'],'ploidy_max')) + config['purity_min'] = float(get_input(config['purity_min'],'purity_min')) + config['purity_max'] = float(get_input(config['purity_max'],'purity_max')) # Save updated yaml with open('config/config.yaml', 'w') as fp: yaml.dump(config, fp) From 3daa7e33ba67acc74ccef9467a64865ea8efa54c Mon Sep 17 00:00:00 2001 From: Phil9S Date: Wed, 24 Apr 2024 17:20:02 +0100 Subject: [PATCH 09/18] enabled specfication of ploidy/purity range --- rules/gridsearch.smk | 6 +++++- scripts/ploidy_purity_search_standard_error.R | 11 +++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk index 7f345b0..a5aec18 100644 --- a/rules/gridsearch.smk +++ b/rules/gridsearch.smk @@ -7,6 +7,10 @@ rule gridsearch_fitting: params: bin="{bin}", outdir=OUT_DIR, - 
project="{project}" + project="{project}", + ploidy_min=config["ploidy_min"], + ploidy_max=config["ploidy_max"], + purity_min=config["purity_min"], + purity_max=config["purity_max"] script: "../scripts/ploidy_purity_search_standard_error.R" diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R index 3d27fd3..06991cb 100644 --- a/scripts/ploidy_purity_search_standard_error.R +++ b/scripts/ploidy_purity_search_standard_error.R @@ -10,6 +10,13 @@ out_dir <- snakemake@params[["outdir"]] project <- snakemake@params[["project"]] cores <- as.numeric(snakemake@threads) +# gridsearch params +pl_min <- snakemake@params[["ploidy_min"]] # default 1.6 +pl_max <- snakemake@params[["ploidy_max"]] # default 8 +pu_min <- snakemake@params[["purity_min"]] # default 0.15 +pu_max <- snakemake@params[["purity_max"]] # deafult 1 +#print(c(pl_min,pl_max,pu_min,pu_max)) + #load libraries suppressPackageStartupMessages(library(QDNAseqmod)) suppressPackageStartupMessages(library(Biobase)) @@ -68,8 +75,8 @@ get_gene_seg <- function(target=NULL,abs_data=NULL){ gene_bin_seg <- get_gene_seg(target = target,abs_data = rds.obj[[1]]) #estimate absolute copy number fits for all samples in parallel -ploidies<-seq(1.6,8,0.1) -purities<-seq(0.15,1,0.01) +ploidies<-seq(pl_min,pl_max,0.1) +purities<-seq(pu_min,pu_max,0.01) clonality<-c() #ind<-which(colnames(rds.obj)==sample) relcn<-rds.obj[[1]] From 4fa433ca02b63ebb02af01318a76d43ffcc629b2 Mon Sep 17 00:00:00 2001 From: Phil9S Date: Mon, 29 Apr 2024 16:03:25 +0100 Subject: [PATCH 10/18] updated sample_sheet_example.tsv and sample sheet schema defintion to support use of ploidy/purity columns --- sample_sheet_example.tsv | 20 ++++++++++---------- schemas/samples.schema.yaml | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/sample_sheet_example.tsv b/sample_sheet_example.tsv index 6dfe89a..cba55a0 100644 --- a/sample_sheet_example.tsv +++ b/sample_sheet_example.tsv @@ -1,10 +1,10 @@ -PATIENT_ID SAMPLE_ID TP53freq smooth file -PATIENT-1 SAMPLE_3 NA FALSE /data/SAMPLE_3.bam -PATIENT-2 SAMPLE_5 0.97604930362117 FALSE /data/SAMPLE_5.bam -PATIENT-2 SAMPLE_6 0.948429942418426 FALSE /data/SAMPLE_6.bam -PATIENT-3 SAMPLE_10 0.312743806009489 FALSE /data/SAMPLE_10.bam -PATIENT-3 SAMPLE_11 0.313365853658537 FALSE /data/SAMPLE_11.bam -PATIENT-3 SAMPLE_12 0.170947565543071 FALSE /data/SAMPLE_12.bam -PATIENT-3 SAMPLE_13 0.15861669829222 FALSE /data/SAMPLE_13.bam -PATIENT-3 SAMPLE_7 0.326712851405623 FALSE /data/SAMPLE_7.bam -PATIENT-3 SAMPLE_8 0.361060215053763 FALSE /data/SAMPLE_8.bam +PATIENT_ID SAMPLE_ID TP53freq smooth file ploidy purity +PATIENT-1 SAMPLE_3 NA FALSE /data/SAMPLE_3.bam NA NA +PATIENT-2 SAMPLE_5 0.97 FALSE /data/SAMPLE_5.bam NA NA +PATIENT-2 SAMPLE_6 0.94 FALSE /data/SAMPLE_6.bam NA NA +PATIENT-3 SAMPLE_10 0.31 TRUE /data/SAMPLE_10.bam NA NA +PATIENT-3 SAMPLE_11 NA FALSE /data/SAMPLE_11.bam NA NA +PATIENT-3 SAMPLE_12 0.17 FALSE /data/SAMPLE_12.bam NA NA +PATIENT-3 SAMPLE_13 0.15 FALSE /data/SAMPLE_13.bam NA NA +PATIENT-3 SAMPLE_7 0.32 TRUE /data/SAMPLE_7.bam NA NA +PATIENT-3 SAMPLE_8 0.36 FALSE /data/SAMPLE_8.bam NA NA diff --git a/schemas/samples.schema.yaml b/schemas/samples.schema.yaml index 4bf2173..8864e88 100644 --- a/schemas/samples.schema.yaml +++ b/schemas/samples.schema.yaml @@ -21,6 +21,22 @@ properties: smooth: description: "True or fale boolean as to apply segment smoothing or not" type: boolean + ploidy: + description: "sample ploidy" + anyOf: + - type: number + minimum: 1 
+        maximum: 20
+      - type: string
+        enum: ["NA"]
+  purity:
+    description: "sample purity"
+    anyOf:
+      - type: number
+        minimum: 0
+        maximum: 1.0
+      - type: string
+        enum: ["NA"]
 
 required:
   - SAMPLE_ID

From b38ccbbf4670f7bc0d32b2589ded148b023e90e3 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Mon, 29 Apr 2024 20:18:44 +0100
Subject: [PATCH 11/18] added functionality to specify precomputed ploidy and
 purity for a given sample. Can be provided together, or either ploidy or
 purity independently

---
 rules/gridsearch.smk                          |  1 +
 sample_sheet_example.tsv                      |  2 +-
 schemas/samples.schema.yaml                   |  8 +--
 scripts/gridsearch_results_filtering.R        | 11 ++--
 scripts/ploidy_purity_search_standard_error.R | 51 ++++++++++++++-----
 5 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk
index a5aec18..e391430 100644
--- a/rules/gridsearch.smk
+++ b/rules/gridsearch.smk
@@ -8,6 +8,7 @@ rule gridsearch_fitting:
         bin="{bin}",
         outdir=OUT_DIR,
         project="{project}",
+        meta=config["samplesheet"],
         ploidy_min=config["ploidy_min"],
         ploidy_max=config["ploidy_max"],
         purity_min=config["purity_min"],
diff --git a/sample_sheet_example.tsv b/sample_sheet_example.tsv
index cba55a0..3a690fb 100644
--- a/sample_sheet_example.tsv
+++ b/sample_sheet_example.tsv
@@ -1,4 +1,4 @@
-PATIENT_ID SAMPLE_ID TP53freq smooth file ploidy purity
+PATIENT_ID SAMPLE_ID TP53freq smooth file precPloidy precPurity
 PATIENT-1 SAMPLE_3 NA FALSE /data/SAMPLE_3.bam NA NA
 PATIENT-2 SAMPLE_5 0.97 FALSE /data/SAMPLE_5.bam NA NA
 PATIENT-2 SAMPLE_6 0.94 FALSE /data/SAMPLE_6.bam NA NA
diff --git a/schemas/samples.schema.yaml b/schemas/samples.schema.yaml
index 8864e88..b2c7120 100644
--- a/schemas/samples.schema.yaml
+++ b/schemas/samples.schema.yaml
@@ -21,16 +21,16 @@ properties:
   smooth:
     description: "True or false boolean as to apply segment smoothing or not"
     type: boolean
-  ploidy:
-    description: "sample ploidy"
+  precPloidy:
+    description: "pre-computed sample ploidy"
     anyOf:
       - type: number
         minimum: 1
         maximum: 20
       - type: string
         enum: ["NA"]
-  purity:
-    description: "sample purity"
+  precPurity:
+    description: "pre-computed sample purity"
     anyOf:
       - type: number
         minimum: 0
         maximum: 1.0
       - type: string
         enum: ["NA"]
diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R
index bf1fd61..422952b 100644
--- a/scripts/gridsearch_results_filtering.R
+++ b/scripts/gridsearch_results_filtering.R
@@ -7,8 +7,6 @@ suppressWarnings(library(foreach))
 ## Added by PS
 args = commandArgs(trailingOnly=TRUE)
 
-print(snakemake)
-
 metafile <- snakemake@params[["meta"]]
 metadata <- read.table(file = metafile,header=T,sep="\t")
 bin <- as.numeric(snakemake@params[["bin"]])
@@ -31,7 +29,6 @@ if(filter_underpowered == "TRUE"){
 rds.filename <- snakemake@input[["rds"]]
 rds.list <- lapply(rds.filename,FUN=function(x){readRDS(x)})
 
-
 collapse_rds <- function(rds.list){
   comb <- rds.list[[1]][[1]]
   if(length(rds.list) > 1){
@@ -63,7 +60,8 @@ colnames(clonality) <- c("SAMPLE_ID","ploidy","purity","clonality","downsample_d
 clonality <- left_join(clonality,metadata,by="SAMPLE_ID") %>%
   select(SAMPLE_ID,PATIENT_ID,ploidy,purity,clonality,downsample_depth,powered,TP53cn,expected_TP53_AF,TP53freq,smooth)
-
+
+print(clonality)
 ## Added by PS
 depthtocn<-function(x,purity,seqdepth) #converts readdepth to copy number given purity and single copy depth
 {
   (x/seqdepth-2*(1-purity))/purity
 }
 
 #Top 10 when ranking by clonality and TP53
-print(filter_underpowered)
-print(clonality)
-
 filtered_results <- clonality
 
 if(filter_underpowered){
   filtered_results <- filtered_results %>%
     filter(powered == 1)
 }
 
 filtered_results <- filtered_results %>% # filter underpowered fits when config variable is TRUE
   select(SAMPLE_ID, PATIENT_ID, everything()) %>%
   group_by(SAMPLE_ID, ploidy) %>%
   mutate(rank_clonality = min_rank(clonality)) %>% #rank clonality within a unique ploidy state
   filter(rank_clonality == 1) %>% #select ploidy with the lowest clonality within a unique ploidy state
   group_by(SAMPLE_ID) %>%
   top_n(-10, wt = clonality) %>% # select top 10 ploidy states with the lowest clonality values
   mutate(rank_clonality = min_rank(clonality)) %>% # rank by clonality within a sample across ploidies in top 10
-  filter(expected_TP53_AF > 0) %>% # keep fits where TP53 is positive
+  #filter(expected_TP53_AF > 0) %>% # keep fits where TP53 is positive
   # retain samples without TP53 mutations and where expected and observed TP53freq <=0.15
   filter(is.na(TP53freq) | near(expected_TP53_AF,TP53freq, tol = af_cutoff )) %>%
   arrange(PATIENT_ID, SAMPLE_ID )
 
-print(filtered_results)
 #Further limit the results by selecting the ploidy states with the lowest clonality values where multiple similar solutions are present.
 #Threshold of 0.3 used to select different states
diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R
index 06991cb..997969f 100644
--- a/scripts/ploidy_purity_search_standard_error.R
+++ b/scripts/ploidy_purity_search_standard_error.R
@@ -10,6 +10,9 @@ out_dir <- snakemake@params[["outdir"]]
 project <- snakemake@params[["project"]]
 cores <- as.numeric(snakemake@threads)
 
+metafile <- snakemake@params[["meta"]]
+metadata <- read.table(file = metafile,header=T,sep="\t")
+
 # gridsearch params
 pl_min <- snakemake@params[["ploidy_min"]] # default 1.6
 pl_max <- snakemake@params[["ploidy_max"]] # default 8
@@ -74,9 +77,30 @@ get_gene_seg <- function(target=NULL,abs_data=NULL){
 #Get target anchor gene segments
 gene_bin_seg <- get_gene_seg(target = target,abs_data = rds.obj[[1]])
 
+#adjust for precomputed pl/pu
+metaSample <- snakemake@wildcards[["sample"]]
+metaflt <- metadata[metadata$SAMPLE_ID == metaSample,]
+
+# set gridsearch limits based on availability of
+# precomputed pl/pu values
+if(!is.null(metaflt$precPloidy)){
+  if(!is.na(metaflt$precPloidy)){
+    pl_min <- metaflt$precPloidy
+    pl_max <- metaflt$precPloidy
+  }
+}
+
+if(!is.null(metaflt$precPurity)){
+  if(!is.na(metaflt$precPurity)){
+    pu_min <- metaflt$precPurity
+    pu_max <- metaflt$precPurity
+  }
+}
+
 #estimate absolute copy number fits for all samples in parallel
 ploidies<-seq(pl_min,pl_max,0.1)
 purities<-seq(pu_min,pu_max,0.01)
+
 clonality<-c()
 #ind<-which(colnames(rds.obj)==sample)
 relcn<-rds.obj[[1]]
@@ -88,14 +112,11 @@
 copynumber<-assayDataElement(relcn,"copynumber")
 rel_ploidy<-mean(copynumber,na.rm=T)
 num_reads<-sum(copynumber,na.rm=T)
 sample <- sampleNames(relcn)
-print(sample)
-res<-foreach(i=1:length(ploidies),.combine=rbind) %do%
-{
+#print(sample)
+res<-foreach(i=1:length(ploidies),.combine=rbind) %do% {
   ploidy<-ploidies[i]
-  #print(ploidy)
-  rowres<-foreach(j=1:length(purities),.combine=rbind)%do%
-  {
+  rowres<-foreach(j=1:length(purities),.combine=rbind) %do% {
     purity<-purities[j]
     downsample_depth<-(((2*(1-purity)+purity*ploidy)/(ploidy*purity))/purity)*15*(2*(1-purity)+purity*ploidy)*nbins_ref_genome*(1/0.91)
     cellploidy<-ploidy*purity+2*(1-purity)
     seqdepth<-rel_ploidy/cellploidy
     TP53cn<-round(depthtocn(median(seg[gene_bin_seg]),purity,seqdepth),1) # to 1 decimal place
     expected_TP53_AF<-TP53cn*purity/(TP53cn*purity+2*(1-purity))
     clonality<-mean(diffs)
-    c(ploidy,purity,clonality,downsample_depth,downsample_depth < rds.pdata$total.reads[row.names(rds.pdata)==sample],TP53cn,expected_TP53_AF)
+    r <- c(ploidy,purity,clonality,downsample_depth,downsample_depth < rds.pdata$total.reads[row.names(rds.pdata)==sample],TP53cn,expected_TP53_AF)
+    r <- as.data.frame(t(r))
+    return(r)
   }
-  rowres
+  return(as.data.frame(rowres))
 }
 
 colnames(res)<-c("ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF")
-rownames(res)<-1:nrow(res)
-res<-data.frame(res,stringsAsFactors = F)
-res<-data.frame(apply(res,2,as.numeric,stringsAsFactors=F))
-res<-res[order(res$clonality,decreasing=FALSE),]
+if(nrow(res) > 1){
+  rownames(res)<-1:nrow(res)
+  res<-data.frame(res,stringsAsFactors = F)
+  res<-data.frame(apply(res,2,as.numeric,stringsAsFactors=F))
+  res<-res[order(res$clonality,decreasing=FALSE),]
+} else {
+  rownames(res) <- 1
+}
 
 #output plot of clonality error landscape
 pdf(snakemake@output[["pdf"]])
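Two small formulas carry the fit diagnostics reported for every grid point above: depthtocn() inverts relative depth d to copy number via CN = (d/s - 2(1 - p))/p for purity p and single-copy depth s, and the expected TP53 allele fraction assumes every tumour copy carries the mutation. A worked check with illustrative values:

purity <- 0.5
TP53cn <- 1   # one remaining TP53 copy per tumour cell
expected_TP53_AF <- TP53cn * purity / (TP53cn * purity + 2 * (1 - purity))
expected_TP53_AF
# [1] 0.3333333

So a clonal TP53 mutation in a 50% pure tumour with single-copy TP53 should sit near AF 0.33; downstream filtering keeps fits whose observed TP53freq lies within af_cutoff of this expectation. Note also that when precPloidy and precPurity are both supplied, seq(pl, pl, 0.1) and seq(pu, pu, 0.01) each collapse to a single value, which is why the single-row res case above needs its own handling.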
colnames(res)<-c("ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF") -rownames(res)<-1:nrow(res) -res<-data.frame(res,stringsAsFactors = F) -res<-data.frame(apply(res,2,as.numeric,stringsAsFactors=F)) -res<-res[order(res$clonality,decreasing=FALSE),] +if(nrow(res) > 1){ + rownames(res)<-1:nrow(res) + res<-data.frame(res,stringsAsFactors = F) + res<-data.frame(apply(res,2,as.numeric,stringsAsFactors=F)) + res<-res[order(res$clonality,decreasing=FALSE),] +} else { + rownames(res) <- 1 +} #output plot of clonality error landscape pdf(snakemake@output[["pdf"]]) From e79e5f12c991d96198ad28cba33a42ba5ca9d085 Mon Sep 17 00:00:00 2001 From: Phil9S Date: Tue, 30 Apr 2024 19:51:38 +0100 Subject: [PATCH 12/18] added support for precomputed ploidy-purity and skipping stage_1 gridsearch fitting --- rules/downsample.smk | 1 + scripts/downsampleBams.R | 19 ++++-- scripts/processPrecomputed.R | 110 +++++++++++++++++++++++++++++++++++ scripts/qdnaseq_rel_to_abs.R | 5 ++ stage_1 | 2 + stage_2 | 9 +++ 6 files changed, 141 insertions(+), 5 deletions(-) create mode 100755 scripts/processPrecomputed.R diff --git a/rules/downsample.smk b/rules/downsample.smk index 8241350..2b8aa6d 100644 --- a/rules/downsample.smk +++ b/rules/downsample.smk @@ -7,6 +7,7 @@ rule downsample: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/downsampled_bams/{sample}.bam" params: outdir=OUT_DIR, + prplpu=prplpu, bin="{bin}", project="{project}", sample="{sample}" diff --git a/scripts/downsampleBams.R b/scripts/downsampleBams.R index f3c008d..7803d08 100644 --- a/scripts/downsampleBams.R +++ b/scripts/downsampleBams.R @@ -10,20 +10,29 @@ bin <- as.numeric(snakemake@params[["bin"]]) project <- snakemake@params[["project"]] outname <- snakemake@output[[1]] sample_name <- snakemake@params[["sample"]] +prplpu <- snakemake@params[["prplpu"]] -fit.qc <- read.table(file = meta,header = T,sep = "\t",na.strings = "") - -relative_smoothed <- readRDS(rds) -read.data <- phenoData(relative_smoothed)@data +#print(prplpu) +fit.qc <- read.table(file = meta,header = T,sep = "\t",na.strings = "") fit.qc.filt <- fit.qc %>% + filter(SAMPLE_ID == sample_name) %>% filter(use == TRUE) +if(prplpu == "TRUE"){ + cmd.totalreads <- paste0("samtools view -c -F 260 ",bam_in) + tot.reads <- as.numeric(system(cmd.totalreads,intern=TRUE)) + read.data <- data.frame(name=fit.qc.filt$SAMPLE_ID,total.reads=tot.reads) + print(read.data) +} else { + relative_smoothed <- readRDS(rds) + read.data <- phenoData(relative_smoothed)@data +} + fit.qc.filt$total.reads <- read.data$total.reads[match(x = fit.qc.filt$SAMPLE_ID,read.data$name)] fit.qc.filt$ratio <- round(fit.qc.filt$downsample_depth / fit.qc.filt$total.reads,digits = 3) perc <- fit.qc.filt %>% - filter(SAMPLE_ID == sample_name) %>% .$ratio # If read ratio is greater than 0.96 (i.e close to original or higher than available reads) diff --git a/scripts/processPrecomputed.R b/scripts/processPrecomputed.R new file mode 100755 index 0000000..b0b831e --- /dev/null +++ b/scripts/processPrecomputed.R @@ -0,0 +1,110 @@ +args <- commandArgs(trailingOnly=TRUE) +suppressPackageStartupMessages(library(yaml)) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(tidyr)) +suppressPackageStartupMessages(library(QDNAseqmod)) +suppressPackageStartupMessages(library(Biobase)) + +cat("[processPrecomputed] Generating file to skip stage_1\n") +cat("[processPrecomputed] Reading config and samplesheet files...\n") +config <- 
read_yaml(file="config/config.yaml") + +samplesheet <- read.table(config$samplesheet,header=TRUE,sep="\t") +projectBin <- paste0(config$project_name,"_",config$bin,"kb") +outputLoc <- paste0(config$out_dir,"sWGS_fitting/",projectBin,"/") + + +if(any(is.na(samplesheet$precPloidy)) | any(is.na(samplesheet$precPurity))){ + stop("some samples do not have precomputed ploidy and purity values - check the samplesheet") +} + +cat("[processPrecomputed] Loading genome bin data...\n") +bin <- config$bin +bin_size <- bin*1000 +bins<-getBinAnnotations(binSize = bin) +# actual number of bins varies in stage_1 due to filtering so total usable bins is adjusted to attempt to fix this +nbins_ref_genome <- round(sum(bins@data$use) * 0.932) #0.932 ratio between actual usable bins and total usable bins in bin annotation data + +pre <- "absolute_PRE_down_sampling/" +preFile <- paste0(outputLoc,pre,config$project_name,"_fit_QC_predownsample.tsv") + + +if(!dir.exists(outputLoc)){ + dir.create(outputLoc,recursive=TRUE) +} + +cat("[processPrecomputed] Verifying BAM files...\n") +# check bams +bam <- samplesheet$file +outname <- paste0(outputLoc,"bam.ok") +log_vector <- c() +check_vector <- c() +for(i in bam){ + cmd <- paste0("samtools quickcheck -q ",i) + if(system(cmd) == 0){ + log_vector <- append(log_vector, paste0("BAM valid - ",i)) + check_vector <- append(check_vector,FALSE) + } else { + log_vector <- append(log_vector,paste0("BAM invalid or missing - ",i)) + check_vector <- append(check_vector,TRUE) + } +} + +if(any(check_vector)){ + outname <- gsub(pattern = "ok",replacement = "invalid",outname) + writeLines(text = as.character(log_vector),con = outname) +} else { + writeLines(text = as.character(log_vector),con = outname) +} + +cat("[processPrecomputed] Creating BAM file symlinks...\n") +# generate symlinks for bams +symoutLoc <- paste0(outputLoc,"bams/") +if(!dir.exists(symoutLoc)){ + dir.create(symoutLoc,recursive=TRUE) +} +for(i in 1:nrow(samplesheet)){ + symcmd <- paste0("ln -s ",samplesheet$file[i]," ",symoutLoc,samplesheet$SAMPLE_ID[i],".bam") + system(symcmd) +} + +cat("[processPrecomputed] Generating spoof Pre-downsampled RDS files...\n") +# spoof RDS files from stage_1 +rdsoutLoc <- paste0(outputLoc,pre,"relative_cn_rds/") +if(!dir.exists(rdsoutLoc)){ + dir.create(rdsoutLoc,recursive=TRUE) +} + +for(i in 1:nrow(samplesheet)){ + writeLines(text = as.character(samplesheet$SAMPLE_ID[i]), + paste0(rdsoutLoc,config$project,"_",samplesheet$SAMPLE_ID[i],"_",bin,"kb_relSmoothedCN.rds")) +} +writeLines(text = as.character(samplesheet$SAMPLE_ID), + paste0(outputLoc,pre,projectBin,"_relSmoothedCN.rds")) + +cat("[processPrecomputed] Generating stage_2 QC file input...\n") +# generate qc file +preFileCols <- c("clonality","powered","TP53cn","expected_TP53_AF","TP53freq","rank_clonality","pl_diff","pu_diff","new","new_state") +samplesheet <- samplesheet %>% + dplyr::select(SAMPLE_ID,PATIENT_ID,precPloidy,precPurity,TP53freq,smooth) %>% + rename("ploidy"="precPloidy","purity"="precPurity") %>% + mutate(!!!setNames(rep(NA, length(preFileCols)), preFileCols)) %>% + mutate(downsample_depth = (((2*(1-purity)+purity*ploidy)/(ploidy*purity))/purity)*15*(2*(1-purity)+purity*ploidy)*nbins_ref_genome*(1/0.91)) %>% + mutate(use = TRUE) %>% + mutate(notes = "using preprocessed ploidy purity values") %>% + select(SAMPLE_ID,PATIENT_ID,ploidy,purity,clonality,downsample_depth,powered,TP53cn,expected_TP53_AF,TP53freq,smooth,rank_clonality,pl_diff,pu_diff,new,new_state,use,notes) + +foutputLoc <- paste0(outputLoc,pre) 
+if(!dir.exists(foutputLoc)){
+  dir.create(foutputLoc,recursive=TRUE)
+}
+
+cat("[processPrecomputed] Marking run as precomputed...\n")
+write.table(samplesheet,preFile,sep="\t",col.names=T,row.names=F,quote=F)
+write.table(samplesheet[,c("SAMPLE_ID","ploidy","purity")],paste0(foutputLoc,"precomp.ok"),
+            sep="\t",col.names=T,row.names=F,quote=F)
+writeLines(text = c("PRECOMPUTED"),paste0(foutputLoc,"GENERATED_USING_PRECOMPUTED_PL_PU_RDS_FILES_ARE_EMPTY"))
+
+cat("[processPrecomputed] Processing precomputed values complete\n")
+cat("[processPrecomputed] stage_2 can now run without completing stage_1\n")
+#END
diff --git a/scripts/qdnaseq_rel_to_abs.R b/scripts/qdnaseq_rel_to_abs.R
index 68c74bc..41b37f5 100644
--- a/scripts/qdnaseq_rel_to_abs.R
+++ b/scripts/qdnaseq_rel_to_abs.R
@@ -159,6 +159,11 @@ getSegTable<-function(x){
     fd$use -> use
     fdfiltfull<-fd[use,]
     sn<-sn[use,]
+    if(is.null(ncol(sn))){
+      sampleNa <- Biobase::sampleNames(x)
+      sn <- as.data.frame(sn)
+      colnames(sn) <- sampleNa
+    }
     segTable<-c()
     for(s in colnames(sn)){
       for(c in unique(fdfiltfull$chromosome))
diff --git a/stage_1 b/stage_1
index 12de7e0..4f4982e 100644
--- a/stage_1
+++ b/stage_1
@@ -8,6 +8,8 @@ def get_bam(wildcards):
 
 if any(samplesheet.SAMPLE_ID.duplicated()):
     sys.exit("Sample sheet contains duplicated sample ids")
+if all(samplesheet.precPloidy.notna()) and all(samplesheet.precPurity.notna()):
+    sys.exit("all ploidy and purity values are precomputed - Run `Rscript scripts/processPrecomputed.R` and skip to stage_2")
 
 SAMPLES = list(samplesheet['SAMPLE_ID'].unique())
 FILE_LIST = list(samplesheet['file'].unique())
diff --git a/stage_2 b/stage_2
index e297655..c5acb38 100644
--- a/stage_2
+++ b/stage_2
@@ -23,6 +23,15 @@ def get_ds_bams(P,B):
 # Set new sample list based on filtered QC from stage 1
 PROJECT=config["project_name"]
 BIN=config["bins"]
+
+if any(samplesheet.SAMPLE_ID.duplicated()):
+    sys.exit("Sample sheet contains duplicated sample ids")
+
+
+prplpu = "FALSE"
+if all(samplesheet.precPloidy.notna()) and all(samplesheet.precPurity.notna()):
+    prplpu = "TRUE"
+
 SAMPLE_LISTS=get_ds_bams(PROJECT,BIN)
 
 rule all:
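processPrecomputed.R reuses the pipeline's required-depth formula so that stage_2 can be powered without a stage_1 gridsearch: for purity p and ploidy l, the average cell ploidy is 2(1-p) + p*l, targeted at 15 reads per copy per usable bin with a 1/0.91 usable-bin correction. A sketch using the example sheet's SAMPLE_3 values; the bin count here is illustrative, since the script derives it from getBinAnnotations():

pl <- 3.2; pu <- 0.76                 # precPloidy / precPurity from the samplesheet
nbins <- 85000                        # illustrative usable 30kb bin count
cellploidy <- 2 * (1 - pu) + pu * pl  # average copies per cell
downsample_depth <- ((cellploidy / (pl * pu)) / pu) * 15 * cellploidy * nbins * (1 / 0.91)
downsample_depth                      # reads required for a powered fit at this ploidy/purity

Low-purity, high-ploidy samples therefore demand sharply more reads, which is why the QC file's powered flag compares this value against the reads actually present.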
+filter_homozygous: "TRUE" +# Threshold basepairs lost +# Default 10000000 / 10Mbase +homozygous_prop: 10000000 +# Absolute CN homozygous loss threshold +# Default 0.4 +homozygous_threshold: 0.4 + # Not implemented #custom_bin: false #custom_bin_folder: "/custom_bin_data/" diff --git a/rules/filter_gridsearch.smk b/rules/filter_gridsearch.smk index a4add21..62cd5a2 100644 --- a/rules/filter_gridsearch.smk +++ b/rules/filter_gridsearch.smk @@ -10,7 +10,9 @@ rule gridsearch_filter: project="{project}", outdir=OUT_DIR, af_cutoff=config["af_cutoff"], - filter_underpowered=config["filter_underpowered"] + filter_underpowered=config["filter_underpowered"], + filter_homozygous=config["filter_homozygous"], + homozygous_prop=config["homozygous_prop"] threads: THREADS script: "../scripts/gridsearch_results_filtering.R" diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk index e391430..626de36 100644 --- a/rules/gridsearch.smk +++ b/rules/gridsearch.smk @@ -12,6 +12,7 @@ rule gridsearch_fitting: ploidy_min=config["ploidy_min"], ploidy_max=config["ploidy_max"], purity_min=config["purity_min"], - purity_max=config["purity_max"] + purity_max=config["purity_max"], + homozygous_threshold=config["homozygous_threshold"] script: "../scripts/ploidy_purity_search_standard_error.R" diff --git a/sample_sheet_example.tsv b/sample_sheet_example.tsv index 3a690fb..88ca928 100644 --- a/sample_sheet_example.tsv +++ b/sample_sheet_example.tsv @@ -1,7 +1,7 @@ PATIENT_ID SAMPLE_ID TP53freq smooth file precPloidy precPurity -PATIENT-1 SAMPLE_3 NA FALSE /data/SAMPLE_3.bam NA NA -PATIENT-2 SAMPLE_5 0.97 FALSE /data/SAMPLE_5.bam NA NA -PATIENT-2 SAMPLE_6 0.94 FALSE /data/SAMPLE_6.bam NA NA +PATIENT-1 SAMPLE_3 NA FALSE /data/SAMPLE_3.bam 3.2 0.76 +PATIENT-2 SAMPLE_5 0.97 FALSE /data/SAMPLE_5.bam 2.4 NA +PATIENT-2 SAMPLE_6 0.94 FALSE /data/SAMPLE_6.bam NA 0.55 PATIENT-3 SAMPLE_10 0.31 TRUE /data/SAMPLE_10.bam NA NA PATIENT-3 SAMPLE_11 NA FALSE /data/SAMPLE_11.bam NA NA PATIENT-3 SAMPLE_12 0.17 FALSE /data/SAMPLE_12.bam NA NA diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml index 986d038..0708d55 100644 --- a/schemas/config.schema.yaml +++ b/schemas/config.schema.yaml @@ -45,6 +45,16 @@ properties: type: number minimum: 0 maximum: 1.0 + filter_homozygous: + type: string + enum: ["TRUE","FALSE"] + homozygous_prop: + type: number + minimum: 0 + homozygous_threshold: + type: number + minimum: 0 + maximum: 0.99 # entries that have to be in the config file for successful validation required: @@ -59,4 +69,7 @@ required: - ploidy_max - purity_min - purity_max + - filter_homozygous + - homozygous_prop + - homozygous_threshold diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R index 422952b..4a2c078 100644 --- a/scripts/gridsearch_results_filtering.R +++ b/scripts/gridsearch_results_filtering.R @@ -12,11 +12,24 @@ metadata <- read.table(file = metafile,header=T,sep="\t") bin <- as.numeric(snakemake@params[["bin"]]) out_dir <- snakemake@params[["outdir"]] project <- snakemake@params[["project"]] -filter_underpowered <- snakemake@params[["filter_underpowered"]] -af_cutoff <- as.numeric(snakemake@params[["af_cutoff"]]) + +#threads cores <- as.numeric(snakemake@threads) registerDoMC(cores) +# Filter parameters +filter_underpowered <- snakemake@params[["filter_underpowered"]] +af_cutoff <- as.numeric(snakemake@params[["af_cutoff"]]) +filter_homozygous=snakemake@params[["filter_homozygous"]] +homozygous_prop=as.numeric(snakemake@params[["homozygous_prop"]]) + +# filter 
homozygous loss +if(filter_homozygous == "TRUE"){ + filter_homozygous <- TRUE +} else { + filter_homozygous <- FALSE +} + # Filter for powered fits only if(filter_underpowered == "TRUE"){ filter_underpowered <- TRUE @@ -56,13 +69,11 @@ clonality <- do.call(rbind, tab <- cbind(rep(n,times=nrow(tab)),tab) return(tab) })) -colnames(clonality) <- c("SAMPLE_ID","ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF") +colnames(clonality) <- c("SAMPLE_ID","ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF","homozygousLoss") clonality <- left_join(clonality,metadata,by="SAMPLE_ID") %>% - select(SAMPLE_ID,PATIENT_ID,ploidy,purity,clonality,downsample_depth,powered,TP53cn,expected_TP53_AF,TP53freq,smooth) + select(SAMPLE_ID,PATIENT_ID,ploidy,purity,clonality,downsample_depth,powered,TP53cn,expected_TP53_AF,TP53freq,smooth,homozygousLoss) -print(clonality) -## Added by PS depthtocn<-function(x,purity,seqdepth) #converts readdepth to copy number given purity and single copy depth { (x/seqdepth-2*(1-purity))/purity @@ -71,11 +82,20 @@ depthtocn<-function(x,purity,seqdepth) #converts readdepth to copy number given #Top 10 when ranking by clonality and TP53 filtered_results <- clonality +## Filter underpowered if(filter_underpowered){ filtered_results <- filtered_results %>% filter(powered == 1) } + +## filter homozygous loss +if(filter_homozygous){ + filtered_results <- filtered_results %>% + filter(homozygousLoss <= homozygous_prop) +} + +# standard filtering filtered_results <- filtered_results %>% # filter underpowered fits when config variable is TRUE select(SAMPLE_ID, PATIENT_ID, everything()) %>% group_by(SAMPLE_ID, ploidy) %>% @@ -84,12 +104,10 @@ filtered_results <- filtered_results %>% # filter underpowered fits when config group_by(SAMPLE_ID) %>% top_n(-10, wt = clonality) %>% # select top 10 ploidy states with the lowest clonality values mutate(rank_clonality = min_rank(clonality)) %>% # rank by clonality within a sample across ploidies in top 10 - #filter(expected_TP53_AF > 0) %>% # keep fits where TP53 is positive # retain samples without TP53 mutations and where expected and observed TP53freq <=0.15 filter(is.na(TP53freq) | near(expected_TP53_AF,TP53freq, tol = af_cutoff )) %>% - arrange(PATIENT_ID, SAMPLE_ID ) + arrange(PATIENT_ID, SAMPLE_ID) -print(filtered_results) #Further limit the results by selecting the ploidy states with the lowest clonality values where multiple similar solutions are present. 
#Threshold of 0.3 used to select different states diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R index 997969f..1967b9a 100644 --- a/scripts/ploidy_purity_search_standard_error.R +++ b/scripts/ploidy_purity_search_standard_error.R @@ -20,6 +20,9 @@ pu_min <- snakemake@params[["purity_min"]] # default 0.15 pu_max <- snakemake@params[["purity_max"]] # deafult 1 #print(c(pl_min,pl_max,pu_min,pu_max)) +# homozygous threshold +hmz_thrsh <- snakemake@params[["homozygous_threshold"]] + #load libraries suppressPackageStartupMessages(library(QDNAseqmod)) suppressPackageStartupMessages(library(Biobase)) @@ -131,14 +134,15 @@ res<-foreach(i=1:length(ploidies),.combine=rbind) %do% { TP53cn<-round(depthtocn(median(seg[gene_bin_seg]),purity,seqdepth),1) # to 1 decimal place expected_TP53_AF<-TP53cn*purity/(TP53cn*purity+2*(1-purity)) clonality<-mean(diffs) - r <- c(ploidy,purity,clonality,downsample_depth,downsample_depth < rds.pdata$total.reads[row.names(rds.pdata)==sample],TP53cn,expected_TP53_AF) + hmzyg <- sum(abs_cn <= hmz_thrsh) * bin_size + r <- c(ploidy,purity,clonality,downsample_depth,downsample_depth < rds.pdata$total.reads[row.names(rds.pdata)==sample],TP53cn,expected_TP53_AF,hmzyg) r <- as.data.frame(t(r)) return(r) } return(as.data.frame(rowres)) } -colnames(res)<-c("ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF") +colnames(res)<-c("ploidy","purity","clonality","downsample_depth","powered","TP53cn","expected_TP53_AF","homozygousLoss") if(nrow(res) > 1){ rownames(res)<-1:nrow(res) res<-data.frame(res,stringsAsFactors = F) From d3fe4087309c302f3e759628f3bf5921aa1afe67 Mon Sep 17 00:00:00 2001 From: Phil9S Date: Fri, 3 May 2024 12:34:51 +0100 Subject: [PATCH 14/18] error catch for empty stage_1 qc file if filtering criteria too strict --- stage_2 | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/stage_2 b/stage_2 index c5acb38..9b56f0f 100644 --- a/stage_2 +++ b/stage_2 @@ -8,14 +8,17 @@ def get_ds_bams(P,B): fl = OUT_DIR+"sWGS_fitting/"+P+"_"+str(b)+"kb/absolute_PRE_down_sampling/"+P+"_fit_QC_predownsample.tsv" if os.path.isfile(fl): f = pd.read_table(fl).set_index(["SAMPLE_ID"], drop=False) - if f["use"].dtype == bool and all(~f.use.isnull()): - ftb = f[(f['use'] == True)] - if all(~ftb.SAMPLE_ID.duplicated()): - d[str(b)] = list(ftb['SAMPLE_ID']) + if not f.empty: + if f["use"].dtype == bool and all(~f.use.isnull()): + ftb = f[(f['use'] == True)] + if all(~ftb.SAMPLE_ID.duplicated()): + d[str(b)] = list(ftb['SAMPLE_ID']) + else: + sys.exit("QC file contains duplicated sample ids marked 'TRUE'") else: - sys.exit("QC file contains duplicated sample ids marked 'TRUE'") + sys.exit("QC file 'use' column contains non-boolean values") else: - sys.exit("QC file 'use' column contains non-boolean values") + sys.exit("QC file is empty. 
Filtering criteria may need adjusting") else: sys.exit("QC file from stage 1 is missing") return(d) From 47b7eb649ca796f95a1c51ca3ec66b625d138aad Mon Sep 17 00:00:00 2001 From: Phil9S Date: Fri, 3 May 2024 12:56:09 +0100 Subject: [PATCH 15/18] fix - disambiguate combine function to avoid confusion with deprecated dplyr function --- scripts/gridsearch_results_filtering.R | 2 +- scripts/qdnaseq_rel_to_abs.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R index 4a2c078..1681f2d 100644 --- a/scripts/gridsearch_results_filtering.R +++ b/scripts/gridsearch_results_filtering.R @@ -47,7 +47,7 @@ collapse_rds <- function(rds.list){ if(length(rds.list) > 1){ for(i in 2:length(rds.list)){ add <- rds.list[[i]][[1]] - comb <- combine(comb,add) + comb <- Biobase::combine(comb,add) } rds.obj <- comb } else { diff --git a/scripts/qdnaseq_rel_to_abs.R b/scripts/qdnaseq_rel_to_abs.R index 41b37f5..be6de70 100644 --- a/scripts/qdnaseq_rel_to_abs.R +++ b/scripts/qdnaseq_rel_to_abs.R @@ -26,7 +26,7 @@ collapse_rds <- function(rds.list){ if(length(rds.list) > 1){ for(i in 2:length(rds.list)){ add <- rds.list[[i]][[1]] - comb <- combine(comb,add) + comb <- Biobase::combine(comb,add) } rds.obj <- comb } else { From cc1f0aabbcbd45d54e3d8dc1de8f59659785d741 Mon Sep 17 00:00:00 2001 From: Phil9S Date: Mon, 6 May 2024 02:40:50 +0100 Subject: [PATCH 16/18] added support for containerised pipeline using docker image. Fixed X11/Cairo error by setting cairo specifically in png plotting device. --- README.md | 6 ++++++ config/config.yaml | 2 ++ rules/bam_check.smk | 2 ++ rules/common.smk | 3 +++ rules/downsample.smk | 2 ++ rules/downsampled_rel_rds.smk | 2 ++ rules/filter_gridsearch.smk | 2 ++ rules/gridsearch.smk | 2 ++ rules/rel_rds.smk | 2 ++ rules/rel_to_abs.smk | 2 ++ schemas/config.schema.yaml | 2 ++ scripts/gridsearch_results_filtering.R | 6 ++++-- scripts/qdnaseq_rel_to_abs.R | 3 ++- 13 files changed, 33 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0ba2279..d8b4de2 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,12 @@ or conda activate swgs-abscn ``` +### Step 3a - Singularity-based implementation + +``` +--use-singularity --singularity-args "--bind /mnt/scratcha/fmlab/smith10/britroc/" +``` + ### Step 4 Preparing the input files #### Sample sheet diff --git a/config/config.yaml b/config/config.yaml index 8177195..b531300 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -47,6 +47,8 @@ homozygous_prop: 10000000 # Default 0.4 homozygous_threshold: 0.4 +# container url for swgs-absolutecn +image_base_url: "docker://phil9s/" # Not implemented #custom_bin: false #custom_bin_folder: "/custom_bin_data/" diff --git a/rules/bam_check.smk b/rules/bam_check.smk index 449639e..310bef5 100644 --- a/rules/bam_check.smk +++ b/rules/bam_check.smk @@ -3,6 +3,8 @@ rule check_bam: bam=FILE_LIST output: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/bam.ok" + singularity: + image_base_url+"swgs-absolutecn:latest" threads: 1 script: "../scripts/bam_check.R" diff --git a/rules/common.smk b/rules/common.smk index 9a81abd..f3e89c9 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -61,6 +61,9 @@ OUT_DIR=os.path.join(OUT_DIR,"") samplesheet = pd.read_table(config["samplesheet"],dtype={'PATIENT_ID': str,'SAMPLE_ID':str,'TP53freq':float}).set_index(["SAMPLE_ID"], drop=False) validate(samplesheet, schema="../schemas/samples.schema.yaml") +# set container uri +image_base_url = config["image_base_url"] + 
#### Check bin values #### BIN_VALS = config["bins"] diff --git a/rules/downsample.smk b/rules/downsample.smk index 2b8aa6d..dd18eb3 100644 --- a/rules/downsample.smk +++ b/rules/downsample.smk @@ -5,6 +5,8 @@ rule downsample: rds=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_{bin}kb_relSmoothedCN.rds" output: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/downsampled_bams/{sample}.bam" + singularity: + image_base_url+"swgs-absolutecn:latest" params: outdir=OUT_DIR, prplpu=prplpu, diff --git a/rules/downsampled_rel_rds.smk b/rules/downsampled_rel_rds.smk index 8218978..c728781 100644 --- a/rules/downsampled_rel_rds.smk +++ b/rules/downsampled_rel_rds.smk @@ -4,6 +4,8 @@ rule ds_relRDS: meta=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_fit_QC_predownsample.tsv" output: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/relative_cn_rds/{project}_{sample}_{bin}kb_relSmoothedCN.rds" + singularity: + image_base_url+"swgs-absolutecn:latest" params: outdir=OUT_DIR, project="{project}", diff --git a/rules/filter_gridsearch.smk b/rules/filter_gridsearch.smk index 62cd5a2..7ac9d25 100644 --- a/rules/filter_gridsearch.smk +++ b/rules/filter_gridsearch.smk @@ -4,6 +4,8 @@ rule gridsearch_filter: rds=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/relative_cn_rds/{{project}}_{sample}_{{bin}}kb_relSmoothedCN.rds",sample=SAMPLES) output: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_fit_QC_predownsample.tsv" + singularity: + image_base_url+"swgs-absolutecn:latest" params: bin="{bin}", meta=config["samplesheet"], diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk index 626de36..3a64b4d 100644 --- a/rules/gridsearch.smk +++ b/rules/gridsearch.smk @@ -4,6 +4,8 @@ rule gridsearch_fitting: output: tsv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.tsv", pdf=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.pdf" + singularity: + image_base_url+"swgs-absolutecn:latest" params: bin="{bin}", outdir=OUT_DIR, diff --git a/rules/rel_rds.smk b/rules/rel_rds.smk index 3e5d406..bdb6a08 100644 --- a/rules/rel_rds.smk +++ b/rules/rel_rds.smk @@ -3,6 +3,8 @@ rule relRDS: bams=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/bams/{{sample}}.bam") output: OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/relative_cn_rds/{project}_{sample}_{bin}kb_relSmoothedCN.rds" + singularity: + image_base_url+"swgs-absolutecn:latest" params: bin="{bin}", outdir=OUT_DIR, diff --git a/rules/rel_to_abs.smk b/rules/rel_to_abs.smk index 21db431..baf9bb0 100644 --- a/rules/rel_to_abs.smk +++ b/rules/rel_to_abs.smk @@ -5,6 +5,8 @@ rule rel_to_abs: output: tsv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/abs_cn_rds/{project}_{bin}kb_ds_abs_fits.tsv", rds=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/abs_cn_rds/{project}_{bin}kb_ds_absCopyNumber.rds" + singularity: + image_base_url+"swgs-absolutecn:latest" params: outdir=OUT_DIR, project="{project}", diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml index 0708d55..c6d074f 100644 --- a/schemas/config.schema.yaml +++ b/schemas/config.schema.yaml @@ -55,6 +55,8 @@ properties: type: number minimum: 0 maximum: 0.99 + image_base_url: + type: string # entries that have to be in the config file for successful validation 
 required:
diff --git a/scripts/gridsearch_results_filtering.R b/scripts/gridsearch_results_filtering.R
index 1681f2d..c81dfa1 100644
--- a/scripts/gridsearch_results_filtering.R
+++ b/scripts/gridsearch_results_filtering.R
@@ -147,7 +147,8 @@ if(length(unique(pruned_results$SAMPLE_ID)) == 1){
   seg <- assayDataElement(x,"segmented")
   rel_ploidy <- mean(cn,na.rm=T)
   ll <- nrow(dat)
-  png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"), w= 450*ll, h = 350)
+  png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"),
+      type="cairo",w= 450*ll, h = 350)
   par(mfrow = c(1,ll))
   for(n in 1:nrow(dat)){
     ploidy <- dat[n,]$ploidy
@@ -198,7 +199,8 @@ if(length(unique(pruned_results$SAMPLE_ID)) == 1){
     seg <- assayDataElement(x,"segmented")
     rel_ploidy <- mean(cn,na.rm=T)
     ll <- nrow(dat)
-    png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"), w= 450*ll, h = 350)
+    png(paste0(out_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_PRE_down_sampling/plots/", i, ".png"),
+        type = "cairo", w= 450*ll, h = 350)
     par(mfrow = c(1,ll))
     for(n in 1:nrow(dat)){
 
diff --git a/scripts/qdnaseq_rel_to_abs.R b/scripts/qdnaseq_rel_to_abs.R
index be6de70..5cbdf3b 100644
--- a/scripts/qdnaseq_rel_to_abs.R
+++ b/scripts/qdnaseq_rel_to_abs.R
@@ -132,7 +132,8 @@ for(sample in pData(rds.rel)$name){
     yrange=10
   }
   # Plot abs fit
-  png(paste0(output_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_POST_down_sampling/abs_cn_rds/plots/",sample,".png"), w= 8, h = 6, unit="in", res = 250)
+  png(paste0(output_dir,"sWGS_fitting/",project,"_",bin,"kb/absolute_POST_down_sampling/abs_cn_rds/plots/",sample,".png"),
+      type="cairo",w= 8, h = 6, unit="in", res = 250)
   par(mfrow = c(1,1))
   plot(relcn,doCalls=FALSE,showSD=TRUE,logTransform=FALSE,ylim=c(0,yrange),ylab="Absolute tumour CN",
        main=paste(sample, " eTP53=",round(expected_TP53_AF,2),

From 05a0bcc1706c0fd6df08c9ccab71c81fd03fd3f5 Mon Sep 17 00:00:00 2001
From: Phil9S
Date: Thu, 16 May 2024 14:51:46 +0100
Subject: [PATCH 17/18] added hmzy filtering parameters to update_configs.py

---
 config/config.yaml | 2 +-
 update_configs.py  | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/config/config.yaml b/config/config.yaml
index b531300..7454146 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -48,7 +48,7 @@ homozygous_prop: 10000000
 homozygous_threshold: 0.4
 
 # container url for swgs-absolutecn
-image_base_url: "docker://phil9s/"
+image_base_url: docker://phil9s/
 # Not implemented
 #custom_bin: false
 #custom_bin_folder: "/custom_bin_data/"
diff --git a/update_configs.py b/update_configs.py
index f27ea43..7663dee 100755
--- a/update_configs.py
+++ b/update_configs.py
@@ -58,6 +58,9 @@ def get_input(x,y):
 config['ploidy_max'] = float(get_input(config['ploidy_max'],'ploidy_max'))
 config['purity_min'] = float(get_input(config['purity_min'],'purity_min'))
 config['purity_max'] = float(get_input(config['purity_max'],'purity_max'))
+config['filter_homozygous'] = DoubleQuotedScalarString(get_input(config['filter_homozygous'],'filter_homozygous'))
+config['homozygous_prop'] = int(get_input(config['homozygous_prop'],'homozygous_prop'))
+config['homozygous_threshold'] = float(get_input(config['homozygous_threshold'],'homozygous_threshold'))
 # Save updated yaml
 with open('config/config.yaml', 'w') as fp:
     yaml.dump(config, fp)

From 16f9da12a18fdd4432bc3ce3ee04df872b630d60 Mon Sep 17 00:00:00 2001
From: Philip Smith
Date: Thu, 16 May 2024 17:17:00 +0100
Subject: [PATCH 18/18] Update README.md

---
 README.md | 221 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 152 insertions(+), 69 deletions(-)

diff --git a/README.md b/README.md
index d8b4de2..ab592f7 100644
--- a/README.md
+++ b/README.md
@@ -4,15 +4,15 @@
 
 ## Description
 
-*Summary*
+### *Summary*
 
 Generate absolute copy number profiles from shallow whole genome sequencing data using a read depth normalised and allele frequency-anchored approach.
 
-*Detailed description*
+### *Detailed description*
 
 This pipeline implements a method for generating absolute copy number profiles from shallow whole genome sequencing data utilising a multi-stage fitting process to generate a set of read depth-normalised absolute copy number profiles. Copy number fits are generated in read count space using a modified implementation of [QDNAseq](https://github.com/ccagc/QDNAseq) on full read depth shallow whole genome BAM files. After which, a grid search of ploidy and purity values is performed to generate a matrix of potential absolute copy number profiles. The fitting of absolute copy number profiles includes an error function metric and a variant allele fraction anchoring process which assist in the selection of an optimal copy number fit.
 
-The error function, `clonality`, which measures segment residual from integer state across the genome and acts similarly to a function such as RMSE, in which a lower value is suggestive of better confirmation to integer copy number states, and therefore a better absolute copy number profile. variant allele fraction achoring utilises the near-ubiquitous presence of somatic homozygous _TP53_ variants in high grade serous ovarian cancer as an anchoring point for a copy number fit. Experimentally validated _TP53_ allele fractions at homozygous sites are compared to an expected homozygous _TP53_ variant allele fraction, as determined by the copy number value of the segment overlapping the _TP53_ locus. The smaller absolute differences between the experimental and expected _TP53_ variant allele fractions is suggestive of a better fit. Minimisation of the `clonality` error function, expected-to-experimental TP53 distance, and manual review of fits allows for the selection of an optimal absolute copy number profile for a given sample.
+The error function, `clonality`, measures segment residual from integer state across the genome and is computed as the mean absolute error (MAE); a lower value is suggestive of better conformance to integer copy number states, and therefore a better absolute copy number profile. Variant allele fraction anchoring utilises the near-ubiquitous presence of somatic homozygous _TP53_ variants in high grade serous ovarian cancer as an anchoring point for a copy number fit. Experimentally validated _TP53_ allele fractions at homozygous sites are compared to an expected homozygous _TP53_ variant allele fraction, as determined by the copy number value of the segment overlapping the _TP53_ locus. A smaller absolute difference between the experimental and expected _TP53_ variant allele fractions is suggestive of a better fit. Minimisation of the `clonality` error function, the expected-to-experimental TP53 distance, and manual review of fits allows for the selection of an optimal absolute copy number profile for a given sample.
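
As a minimal sketch of the two fit metrics described above (the `expected_TP53_AF` formula follows `scripts/ploidy_purity_search_standard_error.R`; reading `clonality` as the MAE to the nearest integer state, and all toy input values, are illustrative assumptions):

```
# Toy per-segment absolute copy number values (illustrative only)
abs_seg <- c(1.94, 2.10, 3.02, 1.87, 4.05)
# clonality as mean absolute error from the nearest integer state (assumed reading)
clonality <- mean(abs(abs_seg - round(abs_seg)))

# Expected homozygous TP53 allele fraction for a candidate fit, given the
# absolute CN at the TP53 locus and the tumour purity (toy values)
TP53cn <- 1.2
purity <- 0.6
expected_TP53_AF <- TP53cn * purity / (TP53cn * purity + 2 * (1 - purity))
```

A lower `clonality` and a smaller gap between `expected_TP53_AF` and the observed TP53 allele fraction both favour a candidate fit.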
 The selected fits are subject to a calculation of power to detect copy number alterations, as described [here](https://gmacintyre.shinyapps.io/sWGS_power/), in which the read depth of a given sample is assessed against its selected ploidy-purity combination. This process both determines if 1) a sample has a sufficient number of reads to support the selected ploidy-purity combination and 2) an optimal target number of reads to perform sample-specific read down sampling. This process normalises the read depth between samples in a ploidy and purity dependent manner so that read variance across segments is consistent, while excluding samples which are not supported by a sufficient number of reads.
@@ -22,29 +22,27 @@ Samples passing all filtering criteria then undergo read downsampling to the spe
 * [Pipeline setup](#pipeline-setup)
   + [Step 1 Clone the repo](#step-1-clone-the-repo)
-  + [Step 2 Install conda](#step-2-install-conda)
-  + [Step 3 Installing environment](#step-3-installing-environment)
-  + [Step 4 Preparing the input files](#step-4-preparing-the-input-files)
-    - [Sample sheet](#sample-sheet)
+  + [Step 2 Installing environment](#step-2-installing-environment)
+  + [Step 3 Preparing the input files](#step-3-preparing-the-input-files)
+    - [sample sheet](#sample-sheet)
     - [config.yaml](#configyaml)
     - [profile config.yaml](#profile-configyaml)
     - [workflow management](#workflow-management)
-    - [Updating the pipeline configuration](#updating-the-pipeline-configuration)
+    - [updating the pipeline configuration](#updating-the-pipeline-configuration)
 * [Running the pipeline](#running-the-pipeline)
-  + [workflow management](#workflow-management)
-  + [Step 5 Stage 1](#step-5-stage-1)
-  + [Step 6 QC1](#step-6-qc1)
+  + [Step 4 Stage 1](#step-4-stage-1)
+  + [Step 5 QC1](#step-5-qc1)
     - [Smoothing](#smoothing)
     - [Fit selection](#fit-selection)
-  + [Step 7 Stage 2](#step-7-stage-2)
-  + [Step 8 QC2](#step-8-qc2)
+  + [Step 6 Stage 2](#step-6-stage-2)
+  + [Downsampling only](#downsampling-only)
+* [Output files](#output-files)
 
 ## Compatibility
 
 ### Reference genome
 
-This pipeline is currently only compatible with BAM files aligned with `hg19` or `GRCh37` genomes.
-There are no checks in place for using the correct genome and results will be likely be extremely poor or incorrect if done using an unsupported reference genome.
+This pipeline is currently only compatible with BAM files aligned with `hg19` or `GRCh37` genomes. There are no checks in place for using files aligned to an unsupported reference genome.
 
 ## Pipeline setup
 
@@ -57,26 +55,24 @@ git clone https://github.com/Phil9S/swgs-absolutecn.git
 cd swgs-absolutecn/
 ```
 
-### Step 2 Install conda
+### Step 2 Installing environment
 
-This pipeline utilises micromamba or conda installation environments to manage the software packages and versions. Please make sure either mamba or conda are installed and available on your system.
-Our recommendation is to use micromamba or, ideally, the installed conda version should utilise the libmamba solver library as the required environment contains a large number of packages.
+This pipeline can utilise either a conda environment or a containerised docker/singularity implementation to manage the software packages and dependencies.
 
-See installing [micromamba](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html) or [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) for more information.
-
-### Step 3 Installing environment
-
-From within the repository directory, run the `install_env.sh` script to generate a conda environment and install custom packages:
+- For the conda implementation please make sure either mamba or conda are installed and available on your system. Our recommendation is to use micromamba or, ideally, the installed conda version should utilise the libmamba solver library as the required environment contains a large number of packages. See installing [micromamba](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html) or [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html).
+- For the docker/singularity implementation please make sure a compatible version of both snakemake and singularity are available on your system. See installing [singularity](https://docs.sylabs.io/guides/latest/user-guide/quick_start.html) and [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html).
 
+#### Conda environment
+With conda or micromamba installed, from within the repository directory, run the `install_env.sh` script to generate a conda environment and install custom packages:
 ```
 ./install_env.sh mamba
 ```
 or
 ```
-install_env.sh conda $HOME/miniconda/
+./install_env.sh conda $HOME/miniconda/
 ```
 
-If you used a previously installed conda build please use the conda or miniconda installation directory when running this section instead of '$HOME/miniconda/' to correctly initialise the conda environment.
+If you are using an existing conda installation, please use the conda installation directory relevant to your existing installation when running this section instead of `$HOME/miniconda/` to correctly initialise the conda environment.
 
 The newly installed environment can be activated using the following:
 
@@ -88,119 +84,206 @@ or
 conda activate swgs-abscn
 ```
 
-### Step 3a - Singularity-based implementation
+#### Singularity-based implementation
 
+If singularity and snakemake are already available, the pipeline can be run from a container as follows:
+
+Set the input and output directories (these should match those specified in the config.yaml - see step 3):
 ```
---use-singularity --singularity-args "--bind /mnt/scratcha/fmlab/smith10/britroc/"
+INPUTDIR="/path/to/inputfiles/"
+OUTPUTDIR="/path/to/output/"
 ```
+and then set the singularity args variable
+```
+SIGBIND="--use-singularity --singularity-args \"--bind ${INPUTDIR},${OUTPUTDIR}\""
+```
+The containerised environment can be found on Docker Hub at [phil9s/swgs-absolutecn](https://hub.docker.com/repository/docker/phil9s/swgs-absolutecn/)
 
-### Step 4 Preparing the input files
-
+### Step 3 Preparing the input files
 #### Sample sheet
 
-The workflow requires a single input file `sample_sheet.tsv` which is a tab-separated document detailing sample names, patient names, smoothing booleans, TP53 allele frequencies, and BAM file locations. Make sure that the BAM file paths are absolute and that sample and patient names are sensible (i.e. do not contain abstract characters or white space). The `TPfreq` column should only contain a float (range 0.00-1.00) or `NA`. The `smooth` column is boolean and should only contain either `TRUE` or `FALSE`.
+The workflow requires a single input file `sample_sheet.tsv` which is a tab-separated document detailing sample names, patient names, smoothing booleans, TP53 allele frequencies, and BAM file locations. Make sure that the BAM file paths are absolute and that sample and patient names are sensible (i.e.
do not contain abstract characters or white space). The `TPfreq` column should only contain a float (range 0.00-1.00) or `NA`. The `smooth` column is boolean and should only contain either `TRUE` or `FALSE`. Optionally, users may provide precomputed or orthogonally generated sample ploidy and purity estimates using the `precPloidy` and `precPurity` fields, but these are not required. Both `precPloidy` and `precPurity` can be provided separately or together, and the fields do not need to be set consistently across samples, meaning one sample may have a ploidy/purity estimate while others do not; the pipeline will only perform grid searches across the missing values (e.g. in the example table below, a full gridsearch will occur for SAM1, no search is performed for SAM2, only a ploidy search for SAM3, and only a purity search for SAM4).
 
 The table below demonstrates the basic schema required and the workflow will validate this file prior to running.
 
-|PATIENT_ID|SAMPLE_ID|TP53freq|smooth|file |
-|----------|---------|--------|------|-------------|
-|PAT1 |SAM1 |0.45 |TRUE |/data/SAM1.bam|
-|PAT1 |SAM2 |0.55 |FALSE |/data/SAM2.bam|
+|PATIENT_ID|SAMPLE_ID|TP53freq|smooth|file |precPloidy|precPurity|
+|----------|---------|--------|------|--------------|----------|----------|
+|PAT1 |SAM1 |0.45 |TRUE |/data/SAM1.bam|NA |NA |
+|PAT1 |SAM2 |0.55 |FALSE |/data/SAM2.bam|2.4 |0.75 |
+|PAT2 |SAM3 |NA |FALSE |/data/SAM3.bam|NA |0.43 |
+|PAT2 |SAM4 |NA |TRUE |/data/SAM4.bam|3.2 |NA |
 
 An example sample_sheet.tsv is included in this repository.
 
 #### config.yaml
 
-The config.yaml (`config/config.yaml`) contains the necessary information for the pipeline you wish to run. This includes the location of the samplesheet.tsv, bin size, project name, and output directory.
+The config.yaml (`config/config.yaml`) contains the necessary information for the pipeline you wish to run. This includes the location of the samplesheet.tsv, bin size, project name, output directory, and filtering parameters.
+
+#### Filtering parameters
 
-#### profile config.yaml
+Various filters for acceptable ploidy/purity combinations for fitting absolute copy number can be modified or disabled depending on user requirements.
 
-The profile configs (cluster_config.yaml & config.yaml) (`profile/*/*`) contains the necessary information to configure the job submission parameters passed to a given workload manager (or lack thereof). This includes the number of concurrently sumbitted jobs, account/project name, partition/queue name, and default job resources (though these are low and should work on almost any cluster).
+|variable |default |function |type |values |
+|--------------------|--------|-------------------------------------------------------------------------------------------------------------------------|-----------|--------------|
+|af_cutoff |0.15 |Maximum difference between expected and observed _TP53_ allele fraction |float |0.0-1.0 |
+|use_seed |"TRUE" |Set whether to use `seed_val` to ensure CBS segmentation returns identical results |string bool|"TRUE","FALSE"|
+|seed_val |"9999" |Seed value used by `use_seed` |string |any string |
+|filter_underpowered |"TRUE" |Set whether to filter ploidy/purity combinations without sufficient available read depth to support the given profile |string bool|"TRUE","FALSE"|
+|ploidy_min |1.6 |Minimum ploidy value for gridsearch - must be less than `ploidy_max`. Ignored for a sample if `precPloidy` is provided |float |1.0-20.0 |
+|ploidy_max |8.0 |Maximum ploidy value for gridsearch - must be greater than `ploidy_min`.
Ignored for a sample if `precPloidy` is provided|float |1.0-20.0 |
+|purity_min |0.15 |Minimum purity value for gridsearch - must be less than `purity_max`. Ignored for a sample if `precPurity` is provided |float |0.0-1.0 |
+|purity_max |1.0 |Maximum purity value for gridsearch - must be greater than `purity_min`. Ignored for a sample if `precPurity` is provided|float |0.0-1.0 |
+|filter_homozygous |"TRUE" |Set whether to filter ploidy/purity combinations with an extent of homozygous loss greater than `homozygous_prop` |string bool|"TRUE","FALSE"|
+|homozygous_prop |10000000|Extent of homozygous loss (in basepairs) above which a ploidy/purity combination is filtered where `filter_homozygous` is "TRUE" |integer |minimum=0 |
+|homozygous_threshold|0.4 |Copy number value at or below which a segment is counted as homozygous loss towards `homozygous_prop` |float |0.0-0.99 |
+
+For most users, the default parameters should work well, but in certain instances these values should be modified.
+
+For example, if users are not generating a sufficient number of high-quality absolute copy number profiles, setting `filter_underpowered` to "FALSE" will show a larger range of fits which, although statistically underpowered, could be correct for a given sample.
+
+Another use case would be samples where the general purity range is known, for example cell line or LCM data, where the expected purity is known to be ~1.0. As such, setting `purity_min` to 0.95 would restrict ploidy/purity combinations to enforce this expectation.
 
 #### Workflow management
+##### profile config.yaml
 
-This pipeline was primarily developed using the [SLURM](https://slurm.schedmd.com/documentation.html) work load manager for job submission by snakemake. For individuals running on non-workload managed clusters, or utilising other workload managers, profiles are provided to allow for job submission with minimal configuration. Currently supported profiles are `local`, `slurm`, and `pbs`. These can be edited via the script described in the next section.
+The profile configs (cluster_config.yaml & config.yaml) (`profile/*/*`) contain the necessary information to configure the job submission parameters passed to a given workload manager (or lack thereof). This includes the number of concurrently submitted jobs, account/project name, partition/queue name, and default job resources (though these are low and should work on almost any cluster). This pipeline was primarily developed using the [SLURM](https://slurm.schedmd.com/documentation.html) workload manager for job submission by snakemake. For individuals running on non-workload managed clusters, or utilising other workload managers, profiles are provided to allow for job submission with minimal configuration. Currently supported profiles are `local`, `slurm`, and `pbs`. These can be edited via the script described in the next section.
 
 #### Updating the pipeline configuration
 
 Environment-specific and pipeline-specific parameters need to be set for each run of this pipeline. While it is possible to manually edit the YAML files, a script has been provided to update the most frequently altered parameters programmatically. The script will iteratively list the parameter and its current value, asking for a user submitted new value should it be needed. If the value is already acceptable or does not need to be changed then an empty value (enter return without typing) will keep the current setting.
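
Returning to the homozygous-loss parameters in the filtering table above, a minimal sketch of how they interact, mirroring the logic added in `ploidy_purity_search_standard_error.R` and `gridsearch_results_filtering.R` (the `bin_size` and per-bin copy number values here are illustrative assumptions):

```
# Assumed bin width in basepairs and toy per-bin absolute CN values
bin_size <- 30000
abs_cn <- c(0.1, 0.3, 2.1, 1.9, 0.2, 3.8)

homozygous_threshold <- 0.4   # CN at or below this counts as homozygous loss
homozygous_prop <- 10000000   # tolerated basepairs of homozygous loss

# Basepairs of genome called as homozygous loss under this candidate fit
homozygousLoss <- sum(abs_cn <= homozygous_threshold) * bin_size
# Fits exceeding the tolerated extent are discarded when filter_homozygous is "TRUE"
keep_fit <- homozygousLoss <= homozygous_prop
```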
-With the `swgs-abscn` conda environment active, run one of the following code for the profile you wish to update (typically both the pipeline configuration and one cluster configuration):
+Run one of the following commands for the profile you wish to update (typically both the pipeline configuration and one cluster configuration):
 
 ```
+# conda - with swgs-abscn env
 # Pipeline configuration
-python update_configs.py -c config
-# Slurm configuration
-python update_configs.py -c slurm
-# PBS-torque configuration
-python update_configs.py -c pbs
-# Local/non-managed configuration
-python update_configs.py -c local
+./update_configs.py -c config
+# Pipeline filters
+./update_configs.py -c filters
+# workflow configuration
+./update_configs.py -c {slurm,pbs,local}
+```
+Singularity users can run the following (whilst within the repository directory):
+```
+SIGCMD="singularity exec --bind "$(pwd -P)" docker://phil9s/swgs-absolutecn:latest $(pwd -P)"
+# Pipeline configuration
+$SIGCMD/update_configs.py -c config
+# Pipeline filters
+$SIGCMD/update_configs.py -c filters
+# workflow configuration
+$SIGCMD/update_configs.py -c {slurm,pbs,local}
+
 ```
 
 ## Running the pipeline
 
-### Step 5 Stage 1
+### Step 4 Stage 1
 
 Once the pipeline and cluster parameters have been set and the samplesheet is prepared, the first stage of the pipeline is ready to run.
-
-With the `swgs-abscn` conda environment active, run the following:
+Be sure to update the profile path to match your cluster/server configuration. To confirm the pipeline is configured correctly, run the first stage in `dry-run` mode.
 
 ```
-snakemake -n --profile profile/*/ --snakefile stage_1
+# conda - with swgs-abscn env
+snakemake -n --profile profile/slurm/ --snakefile stage_1
+```
+```
+# singularity
+snakemake -n --profile profile/slurm/ --snakefile stage_1 ${SIGBIND}
 ```
-
 If the previous step ran without error then run the following:
-
 ```
-snakemake --profile profile/*/ --snakefile stage_1
+# conda - with swgs-abscn env
+snakemake --profile profile/slurm/ --snakefile stage_1
+```
+```
+# singularity
+snakemake --profile profile/slurm/ --snakefile stage_1 ${SIGBIND}
 ```
 
-Where * is replaced with the profile matching your cluster/server configuration.
-
-### Step 6 QC1
+### Step 5 QC1
 
-At the conclusion of stage 1, files and fits will be generated for all samples present in the `samplesheet.tsv` provided. This will include grid search-generated fits, copy number profile plots, and a `QDNAseq` RDS file containing the copy number fit data. This data is not immediately usable in downstream processes and fits must undergo quality control and fit selection, as samples may generate more than one viable copy number profile.
+At the conclusion of stage 1, files and fits will be generated for all samples present in the `samplesheet.tsv` provided. This will include grid search-generated fits, copy number profile plots, and a `QDNAseq` RDS file containing the copy number fit data. This data is not immediately usable in downstream processes and fits must undergo quality control and fit selection, as samples may generate more than one viable copy number profile.
 
 #### Smoothing
 
-Prior to fit selection, a subset of samples will likely require smoothing of segments in order to be viable. Read the guide provided here to select and update which samples require smoothing [here](resources/smoothing_guide.md).
Once the `samplesheet.tsv` has been updated with the new `smooth` values, rerun stage 1 using the following:
+Prior to fit selection, a subset of samples may require smoothing of segments in order to be viable. Read the guide provided [here](resources/smoothing_guide.md) to select and update which samples require smoothing. Once the `samplesheet.tsv` has been updated with the new `smooth` values, rerun stage 1. Be sure to update the profile path to match your cluster/server configuration.
 
 ```
-snakemake --profile profile/*/ -F all --snakefile stage_1
+# conda - with swgs-abscn env
+snakemake --profile profile/slurm/ -F all --snakefile stage_1
+```
+```
+# singularity
+snakemake --profile profile/slurm/ -F all --snakefile stage_1 ${SIGBIND}
 ```
-
-Where * is replaced with the profile matching your cluster/server configuration.
 
 #### Fit selection
 
 After running stage 1 with the appropriate smoothing values, copy number fits will have been generated for each sample. In most cases, multiple viable fits will have been selected for each sample and a semi-qualitative quality control process needs to be applied to select the best fitting solution or exclude a sample should no fit be good enough.
 
-Follow the guide on fit selection [here](resources/quality_control_guide.md) to perform quality control and update the `{project}__fit_QC_predownsample.tsv` file. Once this step has been performed and a single acceptable fit has been selected for each sample, proceed to step 7.
+Follow the guide on fit selection [here](resources/quality_control_guide.md) to perform quality control and update the `{project}_fit_QC_predownsample.tsv` file. Once this step has been performed and a single acceptable fit has been selected for each sample, proceed to stage 2.
 
-### Step 7 Stage 2
+### Step 6 Stage 2
 
 Provided quality control and fit selection was performed correctly, stage 2 of the pipeline can be performed which refits all copy number profiles using downsampled BAM files and the selected fits from stage 1.
+Be sure to update the profile path to match your cluster/server configuration. As before, confirm the pipeline is configured correctly by running in `dry-run` mode.
 
 ```
-snakemake -n --profile profile/*/ --snakefile stage_2
+# conda - with swgs-abscn env
+snakemake -n --profile profile/slurm/ --snakefile stage_2
+```
+```
+# singularity
+snakemake -n --profile profile/slurm/ --snakefile stage_2 ${SIGBIND}
 ```
-
 and if the previous step ran without error then run the following:
+```
+# conda - with swgs-abscn env
+snakemake --profile profile/slurm/ --snakefile stage_2
+```
+```
+# singularity
+snakemake --profile profile/slurm/ --snakefile stage_2 ${SIGBIND}
+```
+### Downsampling only
+
+In some instances, where ploidy and purity are provided from alternative sources, users may only want to perform the read depth normalisation step and not perform a gridsearch. In these cases, if all samples in the `sample_sheet.tsv` contain valid `precPloidy` and `precPurity` values, the `processPrecomputed.R` script can generate all the required output files to start stage 2 without needing to perform any gridsearch or select a ploidy/purity combination, skipping directly to read depth normalisation (via read downsampling) using the provided `precPloidy` and `precPurity` combination.
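
As a minimal sketch, the gate for this downsampling-only route mirrors the precomputed-value check added to `stage_1`, here rewritten in R (the samplesheet path is an assumption):

```
# R equivalent of the stage_1 check that every sample carries precomputed values
samplesheet <- read.table("sample_sheet.tsv", header = TRUE, sep = "\t")
if (all(!is.na(samplesheet$precPloidy)) && all(!is.na(samplesheet$precPurity))) {
  message("all ploidy and purity values are precomputed - skip to stage_2")
}
```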
+
+First, check all samples have valid `precPloidy` and `precPurity` values by running the following:
+```
+# conda - with swgs-abscn env
+snakemake -n --profile profile/slurm/ --snakefile stage_1
+```
+```
+# singularity
+snakemake -n --profile profile/slurm/ --snakefile stage_1 ${SIGBIND}
+```
+This should return an error message stating `all ploidy and purity values are precomputed - Run Rscript scripts/processPrecomputed.R and skip to stage_2`. If this did not occur and stage 1 performed a dry-run, then not all samples have `precPloidy` and `precPurity` values specified.
+
+Provided that the previous step validated and returned the specified message, the following can be run to generate the required output files and skip stage 1:
+
+```
+# conda - with swgs-abscn env
+Rscript scripts/processPrecomputed.R
+```
 ```
-snakemake --profile profile/*/ --snakefile stage_2
+# singularity
+SIGCMD="singularity exec --bind "$(pwd -P)" docker://phil9s/swgs-absolutecn:latest"
+$SIGCMD Rscript scripts/processPrecomputed.R
 ```
 
-Where * is replaced with the profile matching your cluster/server configuration.
+At this point, stage 2 can be performed as previously described.
 
-### Step 8 QC2
+## Output files
 
-To confirm the quality of newly generated downsampled absolute copy number profiles generated by stage 2, an evalution of outputted fits should be performed as described previously in step 6 [here](resources/quality_control_guide.md). The output `{}` should be updated accordingly, with poor fits being excluded.
+The final output generated by stage 2 is three files located in the specified output directory `{output_dir}/sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/abs_cn_rds/`:
+- `*_ds_abs_fits.tsv` - Tab-separated file containing absolute copy number profile metadata and fitting information
+- `*_ds_absCopyNumber.rds` - QDNAseqmod class object containing binned copy number data in `rds` format
+- `*_ds_absCopyNumber_segTable.tsv` - A multisample segment table consisting of copy number segment coordinates and associated segment values as unrounded absolute copy number
 
 ## Authors