Merge pull request #26 from Phil9S/dev

PR - v1.2.0
Phil9S · May 16, 2024 · f492f43 · f492f43
2 parents b2f4ec5 + 16f9da1
commit f492f43
Show file tree

Hide file tree

Showing 25 changed files with 842 additions and 232 deletions.
diff --git a/README.md b/README.md
diff --git a/config/config.yaml b/config/config.yaml
@@ -1,19 +1,54 @@
 ---
 # Sample sheet
-samplesheet: "sample_sheet.tsv"
+samplesheet: sample_sheet.tsv
 
 # Output location
-out_dir: "/mnt/scratcha/fmlab/smith10/britroc/"
+out_dir: results/
 
 # Bin sizes
 # By default any in [1,5,15,30,50,100,500,1000]
+# Add new line for additional bin sizes
 bins:
 - 30
-project_name: "britroc"
+#- 100 
+
+# Set project dir name
+project_name: nm_test
 
 # Pipeline parameters
 af_cutoff: 0.15
 
+# Set seed for CBS - TRUE or FALSE
+# default TRUE
+use_seed: "TRUE"
+seed_val: "9999"
+
+# filter underpowered solutions - TRUE or FALSE
+# Default TRUE
+filter_underpowered: "TRUE"
+
+# ploidy range
+# Default min = 1.6 | max = 8
+ploidy_min: 1.6
+ploidy_max: 8.0
+
+# purity range (1 >= max > min >= 0)
+# Default min = 0.15 | max = 1.00
+purity_min: 0.15
+purity_max: 1.0
+
+# Homozygous loss filter - TRUE or FALSE
+# Default "TRUE"
+filter_homozygous: "TRUE"
+# Threshold basepairs lost
+# Default 10000000 / 10Mbase
+homozygous_prop: 10000000
+# Absolute CN homozygous loss threshold
+# Default 0.4
+homozygous_threshold: 0.4
+
+# container url for swgs-absolutecn
+image_base_url: docker://phil9s/
 # Not implemented
 #custom_bin: false
 #custom_bin_folder: "/custom_bin_data/"
diff --git a/config/default_config.yaml b/config/default_config.yaml
@@ -0,0 +1,52 @@
+---
+# Sample sheet
+samplesheet: sample_sheet.tsv
+
+# Output location
+out_dir: results/
+
+# Bin sizes
+# By default any in [1,5,15,30,50,100,500,1000]
+# Add new line for additional bin sizes
+bins:
+- 30
+#- 100 
+
+# Set project dir name
+project_name: nm_test
+
+# Pipeline parameters
+af_cutoff: 0.15
+
+# Set seed for CBS - TRUE or FALSE
+# default TRUE
+use_seed: "TRUE"
+seed_val: "9999"
+
+# filter underpowered solutions - TRUE or FALSE
+# Default TRUE
+filter_underpowered: "TRUE"
+
+# ploidy range
+# Default min = 1.6 | max = 8
+ploidy_min: 1.6
+ploidy_max: 8.0
+
+# purity range (1 >= max > min >= 0)
+# Default min = 0.15 | max = 1.00
+purity_min: 0.15
+purity_max: 1.0
+
+# Homozygous loss filter - TRUE or FALSE
+# Default "TRUE"
+filter_homozygous: "TRUE"
+# Threshold basepairs lost
+# Default 10000000 / 10Mbase
+homozygous_prop: 10000000
+# Absolute CN homozygous loss threshold
+# Default 0.4
+homozygous_threshold: 0.4
+
+# Not implemented
+#custom_bin: false
+#custom_bin_folder: "/custom_bin_data/"
diff --git a/dev_tools/report_seg_counts.R b/dev_tools/report_seg_counts.R
@@ -0,0 +1,49 @@
+# Report how many copy-number segments each sample has before and after
+# downsampling. Usage: Rscript dev_tools/report_seg_counts.R [all]
+args <- commandArgs(trailingOnly = TRUE)
+library(yaml)
+
+cat("report segments - use 'report_seg_counts.R all' for individual seg counts\n")
+
+config <- read_yaml(file = "config/config.yaml")
+
+# Per-sample counts are printed only when the first argument is "all";
+# otherwise a summary() of the counts is shown.
+verbose <- length(args) > 0 && args[1] == "all"
+
+# Read an RDS object, subset it by its featureData `use` column, count the runs
+# of equal "segmented" values in each column, and print them under `title`.
+report_segs <- function(file, title) {
+  obj <- readRDS(file)
+  obj <- obj[featureData(obj)$use]
+  segs <- apply(assayDataElement(obj, "segmented"), MARGIN = 2,
+                function(x) length(rle(x)$lengths))
+  cat(title)
+  if (verbose) {
+    print(segs)
+  } else {
+    print(summary(segs))
+  }
+}
+
+# The config key is `bins` and may list several sizes, so report each one
+# instead of relying on `$` partial matching and a single-element vector.
+for (bin in config$bins) {
+  projectBin <- paste0(config$project_name, "_", bin, "kb")
+  outputLoc <- paste0(config$out_dir, "sWGS_fitting/", projectBin, "/")
+  preFile <- paste0(outputLoc, "absolute_PRE_down_sampling/",
+                    projectBin, "_relSmoothedCN.rds")
+  postFile <- paste0(outputLoc, "absolute_POST_down_sampling/abs_cn_rds/",
+                     projectBin, "_ds_absCopyNumber.rds")
+  if (length(config$bins) > 1) cat("\n", projectBin, "\n", sep = "")
+  if (!file.exists(preFile)) {
+    cat("no pre or post downsampled files found\n")
+    next
+  }
+  suppressMessages(library(QDNAseqmod))
+  suppressMessages(library(Biobase))
+  report_segs(preFile, "\nPre-downsampled segments\n")
+  if (file.exists(postFile)) {
+    report_segs(postFile, "\nPost-downsampled segments\n")
+  }
+}
diff --git a/rules/bam_check.smk b/rules/bam_check.smk
@@ -3,6 +3,8 @@ rule check_bam:
        bam=FILE_LIST
     output:
         OUT_DIR+"sWGS_fitting/{project}_{bin}kb/bam.ok"
+    singularity:
+        image_base_url+"swgs-absolutecn:latest"
     threads: 1
     script:
         "../scripts/bam_check.R"
diff --git a/rules/common.smk b/rules/common.smk
@@ -61,10 +61,26 @@ OUT_DIR=os.path.join(OUT_DIR,"")
 samplesheet = pd.read_table(config["samplesheet"],dtype={'PATIENT_ID': str,'SAMPLE_ID':str,'TP53freq':float}).set_index(["SAMPLE_ID"], drop=False)
 validate(samplesheet, schema="../schemas/samples.schema.yaml")
 
+# set container uri
+image_base_url = config["image_base_url"]
+
 #### Check bin values ####
 
 BIN_VALS = config["bins"]
 BIN_DEF = [1,5,15,30,50,100,500,1000]
 
 if not set(BIN_VALS).issubset(BIN_DEF):
-    sys.exit("Some bin values are not available")
+    sys.exit("Config error - Some specified bin values are not available")
+
+##### CHECK MAX > MIN #####
+PLMIN=config["ploidy_min"]
+PLMAX=config["ploidy_max"]
+PUMIN=config["purity_min"]
+PUMAX=config["purity_max"]
+
+if PLMIN > PLMAX:
+    sys.exit("Config error - Minimum ploidy exceeds or is equal to maximum ploidy")
+
+if PUMIN > PUMAX:
+    sys.exit("Config error - Minimum purity exceeds or is equal to maximum purity")
+
diff --git a/rules/downsample.smk b/rules/downsample.smk
@@ -5,8 +5,11 @@ rule downsample:
         rds=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_{bin}kb_relSmoothedCN.rds"
     output:
         OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/downsampled_bams/{sample}.bam"
+    singularity:
+        image_base_url+"swgs-absolutecn:latest"
     params:
         outdir=OUT_DIR,
+        prplpu=prplpu,
         bin="{bin}",
         project="{project}",
         sample="{sample}"

diff --git a/rules/downsampled_rel_rds.smk b/rules/downsampled_rel_rds.smk
@@ -4,9 +4,13 @@ rule ds_relRDS:
         meta=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_fit_QC_predownsample.tsv"
     output:
         OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/relative_cn_rds/{project}_{sample}_{bin}kb_relSmoothedCN.rds"
+    singularity:
+        image_base_url+"swgs-absolutecn:latest"
     params:
         outdir=OUT_DIR,
         project="{project}",
-        bin="{bin}"
+        bin="{bin}",
+        use_seed=config["use_seed"],
+        seed_val=config["seed_val"]
     script:
         "../scripts/qdnaseq_mod_ds.R"
diff --git a/rules/filter_gridsearch.smk b/rules/filter_gridsearch.smk
@@ -1,15 +1,20 @@
 rule gridsearch_filter:
     input:
-        cl=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/clonality_results/{{project}}_{sample}_clonality.csv",sample=SAMPLES),
+        cl=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/clonality_results/{{project}}_{sample}_clonality.tsv",sample=SAMPLES),
         rds=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/relative_cn_rds/{{project}}_{sample}_{{bin}}kb_relSmoothedCN.rds",sample=SAMPLES)
     output:
         OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/{project}_fit_QC_predownsample.tsv"
+    singularity:
+        image_base_url+"swgs-absolutecn:latest"
     params:
         bin="{bin}",
         meta=config["samplesheet"],
         project="{project}",
         outdir=OUT_DIR,
-        af_cutoff=config["af_cutoff"]
+        af_cutoff=config["af_cutoff"],
+        filter_underpowered=config["filter_underpowered"],
+        filter_homozygous=config["filter_homozygous"],
+        homozygous_prop=config["homozygous_prop"]
     threads: THREADS 
     script: 
         "../scripts/gridsearch_results_filtering.R"

diff --git a/rules/gridsearch.smk b/rules/gridsearch.smk
@@ -2,11 +2,19 @@ rule gridsearch_fitting:
     input:
         expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/absolute_PRE_down_sampling/relative_cn_rds/{{project}}_{{sample}}_{{bin}}kb_relSmoothedCN.rds")
     output:
-        csv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.csv",
+        tsv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.tsv",
         pdf=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/clonality_results/{project}_{sample}_clonality.pdf"
+    singularity:
+        image_base_url+"swgs-absolutecn:latest"
     params:
         bin="{bin}",
         outdir=OUT_DIR,
-        project="{project}"
+        project="{project}",
+        meta=config["samplesheet"],
+        ploidy_min=config["ploidy_min"],
+        ploidy_max=config["ploidy_max"],
+        purity_min=config["purity_min"],
+        purity_max=config["purity_max"],
+        homozygous_threshold=config["homozygous_threshold"]
     script:
         "../scripts/ploidy_purity_search_standard_error.R"
diff --git a/rules/rel_rds.smk b/rules/rel_rds.smk
@@ -3,11 +3,15 @@ rule relRDS:
         bams=expand(OUT_DIR+"sWGS_fitting/{{project}}_{{bin}}kb/bams/{{sample}}.bam")
     output:
         OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_PRE_down_sampling/relative_cn_rds/{project}_{sample}_{bin}kb_relSmoothedCN.rds"
+    singularity:
+        image_base_url+"swgs-absolutecn:latest"
     params:
         bin="{bin}",
         outdir=OUT_DIR,
         project="{project}",
-        meta=config["samplesheet"]
+        meta=config["samplesheet"],
+        use_seed=config["use_seed"],
+        seed_val=config["seed_val"]
     script:
         "../scripts/qdnaseq_mod.R"
 
diff --git a/rules/rel_to_abs.smk b/rules/rel_to_abs.smk
@@ -5,6 +5,8 @@ rule rel_to_abs:
     output:
         tsv=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/abs_cn_rds/{project}_{bin}kb_ds_abs_fits.tsv",
         rds=OUT_DIR+"sWGS_fitting/{project}_{bin}kb/absolute_POST_down_sampling/abs_cn_rds/{project}_{bin}kb_ds_absCopyNumber.rds"
+    singularity:
+        image_base_url+"swgs-absolutecn:latest"
     params:
         outdir=OUT_DIR,
         project="{project}",

diff --git a/sample_sheet_example.tsv b/sample_sheet_example.tsv
@@ -1,10 +1,10 @@
-PATIENT_ID	SAMPLE_ID	TP53freq	smooth	file
-PATIENT-1	SAMPLE_3	NA	FALSE	/data/SAMPLE_3.bam
-PATIENT-2	SAMPLE_5	0.97604930362117	FALSE	/data/SAMPLE_5.bam
-PATIENT-2	SAMPLE_6	0.948429942418426	FALSE	/data/SAMPLE_6.bam
-PATIENT-3	SAMPLE_10	0.312743806009489	FALSE	/data/SAMPLE_10.bam
-PATIENT-3	SAMPLE_11	0.313365853658537	FALSE	/data/SAMPLE_11.bam
-PATIENT-3	SAMPLE_12	0.170947565543071	FALSE	/data/SAMPLE_12.bam
-PATIENT-3	SAMPLE_13	0.15861669829222	FALSE	/data/SAMPLE_13.bam
-PATIENT-3	SAMPLE_7	0.326712851405623	FALSE	/data/SAMPLE_7.bam
-PATIENT-3	SAMPLE_8	0.361060215053763	FALSE	/data/SAMPLE_8.bam
+PATIENT_ID	SAMPLE_ID	TP53freq	smooth	file	precPloidy	precPurity
+PATIENT-1	SAMPLE_3	NA	FALSE	/data/SAMPLE_3.bam	3.2	0.76
+PATIENT-2	SAMPLE_5	0.97	FALSE	/data/SAMPLE_5.bam	2.4	NA
+PATIENT-2	SAMPLE_6	0.94	FALSE	/data/SAMPLE_6.bam	NA	0.55
+PATIENT-3	SAMPLE_10	0.31	TRUE	/data/SAMPLE_10.bam	NA	NA
+PATIENT-3	SAMPLE_11	NA	FALSE	/data/SAMPLE_11.bam	NA	NA
+PATIENT-3	SAMPLE_12	0.17	FALSE	/data/SAMPLE_12.bam	NA	NA
+PATIENT-3	SAMPLE_13	0.15	FALSE	/data/SAMPLE_13.bam	NA	NA
+PATIENT-3	SAMPLE_7	0.32	TRUE	/data/SAMPLE_7.bam	NA	NA
+PATIENT-3	SAMPLE_8	0.36	FALSE	/data/SAMPLE_8.bam	NA	NA
diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml
@@ -12,17 +12,66 @@ properties:
     type: string
   bins:
     type: array
+    items:
+      type: number
+    uniqueItems: true
   project_name:
     type: string
   af_cutoff:
     type: number
-    min: 0
-    max: 1.0
+    minimum: 0
+    maximum: 1.0
+  use_seed:
+    type: string
+    enum: ["TRUE","FALSE"]
+  seed_val:
+    type: string
+  filter_underpowered:
+    type: string
+    enum: ["TRUE","FALSE"]
+  ploidy_min:
+    type: number
+    minimum: 1
+    maximum: 20
+  ploidy_max:
+    type: number
+    minimum: 1
+    maximum: 20
+  purity_min:
+    type: number
+    minimum: 0
+    maximum: 1.0
+  purity_max:
+    type: number
+    minimum: 0
+    maximum: 1.0
+  filter_homozygous:
+    type: string
+    enum: ["TRUE","FALSE"]
+  homozygous_prop:
+    type: number
+    minimum: 0
+  homozygous_threshold:
+    type: number
+    minimum: 0
+    maximum: 0.99
+  image_base_url:
+    type: string
 
 # entries that have to be in the config file for successful validation
 required:
   - samplesheet
   - out_dir
   - bins
   - project_name
+  - use_seed
+  - seed_val
+  - filter_underpowered
+  - ploidy_min
+  - ploidy_max
+  - purity_min
+  - purity_max
+  - filter_homozygous
+  - homozygous_prop
+  - homozygous_threshold