Merging QC branch

LiaOb21 · Mar 28, 2024 · e67853a · e67853a
2 parents d0e1ffe + 2774a83
commit e67853a
Show file tree

Hide file tree

Showing 21 changed files with 228 additions and 67 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -99,3 +99,20 @@ yahs:
 #  o: "hifiasm_p_purged_yahs" # output prefix use of this needs evaluation
   optional_params: 
     "-e": "" # you can specify the restriction enzyme(s) used by the Hi-C experiment
+
+
+# Customisable parameters for quast
+quast:
+  t: 4 # number of threads
+  mem_mb: 1000 # memory in MB
+  optional_params: 
+    "--fragmented": ""
+    "--large": ""
+#    "-r": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"
+#    "-g": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.101.gff3"
+
+# Customisable parameters for busco
+busco:
+  t: 4 # number of threads
+  mem_mb: 1000 # memory in MB
+  lineage: "resources/saccharomycetes_odb10" # lineage to be used for busco analysis
diff --git a/config/config_test.yaml b/config/config_test.yaml
@@ -79,7 +79,7 @@ fcsgx:
   path_to_gx_db: "resources/gx_test_db/test-only"
 
 # Set this to False if you want to skip purge_dups steps:
-include_purge_dups: True
+include_purge_dups: False
 
 # Customisable parameters for purge_dups
 purge_dups:
@@ -97,3 +97,19 @@ yahs:
 #  o: "hifiasm_p_purged_yahs" # output prefix #use of this needs evaluation
   optional_params: 
     "-e": "GATC" # you can specify the restriction enzyme(s) used by the Hi-C experiment
+
+# Customisable parameters for quast
+quast:
+  t: 4 # number of threads
+  mem_mb: 1000 # memory in MB
+  optional_params: 
+    "--fragmented": ""
+    "--large": ""
+#    "-r": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"
+#    "-g": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.101.gff3"
+
+# Customisable parameters for busco
+busco:
+  t: 4 # number of threads
+  mem_mb: 1000 # memory in MB
+  lineage: "resources/saccharomycetes_odb10" # lineage to be used for busco analysis
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -8,11 +8,12 @@ configfile: "config/config.yaml"
 
 # set {samples} global variable - used to read Hi-C files
 samples_in_pattern = "{sample}_1.fastq.gz"
-samples = glob_wildcards(config["hic_path"] + samples_in_pattern).sample
+sample, = glob_wildcards(config["hic_path"] + samples_in_pattern).sample
 
 # set {diploid_mode} global variable - used to run hifiasm_het rather than hifiasm
 diploid_mode=config["hifiasm"]["optional_params"]["--h1"] and config["hifiasm"]["optional_params"]["--h2"]
 
+
 # Include the common rule, where the inputs for rule_all are defined
 include: "rules/common.smk"
 # Include the hifi_prep rule
@@ -47,7 +48,10 @@ include: "rules/two_read_bam_combiner.smk"
 include: "rules/picard.smk"
 # Include the yahs rule
 include: "rules/yahs.smk"
-
+# Include the quast rule
+include: "rules/quast.smk"
+# Include the busco rule
+include: "rules/busco.smk"
 
 # Define the rule all, which is the default rule that Snakemake runs when no specific rule or target is specified
 # input files are defined in the common.smk file

diff --git a/workflow/envs/busco.yaml b/workflow/envs/busco.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+
+dependencies:
+  - busco=5.7.0
diff --git a/workflow/envs/quast.yaml b/workflow/envs/quast.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+
+dependencies:
+  - quast=5.2.0
diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk
@@ -0,0 +1,28 @@
+# rule busco for assembly QC
+
+rule busco:
+    input:
+        unpack(get_assemblies_QC)
+    output:
+        dir = directory("results/busco"),
+    params:
+        lineage=config["busco"]["lineage"],
+        labels = list(get_assemblies_QC().keys())
+    threads: config["busco"]["t"]
+    log:
+        "logs/busco.log"
+    resources:
+        mem_mb=config['busco']['mem_mb'],  # access memory from config
+    conda:
+        "../envs/busco.yaml"
+    shell:
+        """
+        mkdir -p {output}
+        labels=({params.labels})
+        count=0
+        for v in {input} ; do
+            k=${{labels[$count]}}
+            busco -i $v -o {output}/$k -m genome -l {params.lineage} -f -c {threads} >> {log} 2>&1
+            count=$(( $count + 1 ))
+        done
+        """
diff --git a/workflow/rules/bwa_index.smk b/workflow/rules/bwa_index.smk
@@ -3,7 +3,6 @@
 # For more info: https://github.com/ArimaGenomics/mapping_pipeline
 # colora doesn't contain the codes to handle technical and biological replicates of hic reads, refer to the original pipeline for that
 
-
 rule bwa_index:
     input:
         get_bwa_index_inputs

diff --git a/workflow/rules/bwa_mem.smk b/workflow/rules/bwa_mem.smk
@@ -8,14 +8,14 @@
 rule bwa_mem:
     input:
         REF="results/bwa_index_{hap}/asm.fa",
-        forward_hic="results/fastp/{sample}_trim_1.fastq.gz",
-        reverse_hic="results/fastp/{sample}_trim_2.fastq.gz",
+        forward_hic="results/fastp/hic_trim_1.fastq.gz",
+        reverse_hic="results/fastp/hic_trim_2.fastq.gz",
     output:
-        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_1.bam",
-        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_2.bam",
+        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_1.bam",
+        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_2.bam",
     threads: config["arima"]["CPU"]
     log:
-        "logs/bwa_mem_{hap}_{sample}.log",
+        "logs/bwa_mem_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'],  # access memory from config    
     conda:

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -4,14 +4,14 @@
 import os
 
 wildcard_constraints:
-    hap = "primary|hap1|hap2"
+    hap = "primary|hap1|hap2",
 
 def get_all_inputs(wc=None):
     if diploid_mode: 
-        hap= ["hap1", "hap2"]
+        hap = ["hap1", "hap2"]
     else:
-        hap = ["primary"]
-    if not samples:
+        hap = ["primary"]      
+    if not sample:
         # Show the user a meaningful error message
         logger.error(f"No files matching {samples_in_pattern} were found in"
                      f" {config.get('hic_path')}. Please check your config file.")
@@ -24,6 +24,8 @@ def get_all_inputs(wc=None):
         "results/nanoplot/NanoPlot-report.html",  # nanoplot report
         "results/kmc/out.hist",  # output of kmc rule
         "results/genomescope",  # output directory of genomescope
+        "results/quast", # output directory of quast
+        "results/busco", # output directory of busco
     ]
 
     # Get the OATK outputs
@@ -37,13 +39,15 @@ def get_all_inputs(wc=None):
     inputs.extend(oatk_outputs)
 
     inputs.extend(expand(
-            "results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa", sample=samples, hap=hap
+            "results/yahs_{hap}/asm_yahs_scaffolds_final.fa", hap=hap
         ))  # final scaffolded assembly
 
     # If not in diploid mode, add asm.alternate.fa as an input
     if not diploid_mode and config["include_purge_dups"] == True:
         inputs.append("results/purge_dups_alt/asm.alternate_purged.fa")
 
+#    inputs.extend(expand("results/busco/{tool}_{hap}_busco", tool=tool, hap=hap))
+
     return inputs
 
 
@@ -82,3 +86,17 @@ def get_bwa_index_inputs(wildcards):
         return f"results/hifiasm/asm.{hap}.fa"
 
 
+def get_assemblies_QC(wildcards=None):
+    if diploid_mode: 
+        haps = ["hap1", "hap2"]
+    else:
+        haps = ["primary"]
+    results = {}
+    for hap in haps:
+        results[f"asm_{hap}"] = f"results/assemblies/asm_{hap}.fa"
+        results[f"yahs_{hap}"] = f"results/assemblies/yahs_{hap}.fa"
+        if config["include_purge_dups"]:
+            results[f"purged_{hap}"] = f"results/assemblies/purged_{hap}.fa"  
+        if config["include_fcsgx"]:
+            results[f"fcsgx_{hap}"] = f"results/assemblies/fcsgx_{hap}.fa"
+    return results
diff --git a/workflow/rules/fastp.smk b/workflow/rules/fastp.smk
@@ -5,28 +5,20 @@ import glob
 
 rule fastp:
     input:
-        forward_in=expand(
-            "{hic_path}{sample}_1.fastq.gz",
-            hic_path=config["hic_path"],
-            sample=glob_wildcards(config["hic_path"] + "{sample}_1.fastq.gz").sample,
-        ),
-        reverse_in=expand(
-            "{hic_path}{sample}_2.fastq.gz",
-            hic_path=config["hic_path"],
-            sample=glob_wildcards(config["hic_path"] + "{sample}_2.fastq.gz").sample,
-        ),
+        forward_in=f"{config["hic_path"]}{sample}_1.fastq.gz",
+        reverse_in=f"{config["hic_path"]}{sample}_2.fastq.gz",
     output:
-        forward_out="results/fastp/{sample}_trim_1.fastq.gz",
-        reverse_out="results/fastp/{sample}_trim_2.fastq.gz",
-        json="results/fastp/{sample}_report_fastp.HiC.json",
-        html="results/fastp/{sample}_report_fastp.HiC.html",
+        forward_out="results/fastp/hic_trim_1.fastq.gz",
+        reverse_out="results/fastp/hic_trim_2.fastq.gz",
+        json="results/fastp/hic_report_fastp.HiC.json",
+        html="results/fastp/hic_report_fastp.HiC.html",
     params:
         optional_params=" ".join(
             f"{k} {v}" for k, v in config["fastp"]["optional_params"].items() if v
         ),
     threads: config["fastp"]["t"]
     log:
-        "logs/{sample}_fastp.log",
+        "logs/fastp.log",
     resources:
         mem_mb=config['fastp']['mem_mb'],  # access memory from config
     conda:

diff --git a/workflow/rules/filter_five_end.smk b/workflow/rules/filter_five_end.smk
@@ -6,13 +6,13 @@
 
 rule fiter_five_end:
     input:
-        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_1.bam",
-        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_2.bam",
+        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_1.bam",
+        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_2.bam",
     output:
-        bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_1.bam",
-        bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_2.bam",
+        bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_1.bam",
+        bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_2.bam",
     log:
-        "logs/fiter_five_end_{hap}_{sample}.log",
+        "logs/fiter_five_end_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'],  # access memory from config    
     conda:

diff --git a/workflow/rules/get_assemblies.smk b/workflow/rules/get_assemblies.smk
@@ -0,0 +1,15 @@
+include: "common.smk"
+
+rule get_assemblies:
+    output:
+        directory("results/assemblies")
+    log:
+        "logs/get_assemblies.log",
+    resources:
+        mem_mb=config['arima']['mem_mb'],  # access memory from config
+    conda:
+        "../envs/basic.yaml"
+    shell:
+        """
+        mkdir -p {output}
+        """
diff --git a/workflow/rules/hifiasm.smk b/workflow/rules/hifiasm.smk
@@ -4,12 +4,13 @@
 
 rule hifiasm:
     input:
-        "results/reads/hifi/hifi.fastq.gz",
+        reads = "results/reads/hifi/hifi.fastq.gz",
     output:
         gfa="results/hifiasm/asm.primary.gfa",
         gfa_alt="results/hifiasm/asm.alternate.gfa",
         fasta="results/hifiasm/asm.primary.fa",
         fasta_alt="results/hifiasm/asm.alternate.fa",
+        link_primary = "results/assemblies/asm_primary.fa",
     threads: config["hifiasm"]["t"]
     log:
         "logs/hifiasm.log",
@@ -23,22 +24,28 @@ rule hifiasm:
         ),
     shell:
         """
-        hifiasm {input} -t {threads} -o results/hifiasm/asm --primary {params.optional_params} >> {log} 2>&1
+        hifiasm {input.reads} -t {threads} -o results/hifiasm/asm --primary {params.optional_params} >> {log} 2>&1
         mv results/hifiasm/asm.p_ctg.gfa {output.gfa}
         mv results/hifiasm/asm.a_ctg.gfa {output.gfa_alt}       
         awk -f scripts/gfa_to_fasta.awk < {output.gfa} > {output.fasta}
         awk -f scripts/gfa_to_fasta.awk < {output.gfa_alt} > {output.fasta_alt}
+
+        # all the assemblies produced by the workflow will be symlinked to results/assemblies
+
+        ln -srn {output.fasta} {output.link_primary}
         """
 
 
 rule hifiasm_het:
     input:
-        "results/reads/hifi/hifi.fastq.gz",
+        reads = "results/reads/hifi/hifi.fastq.gz",
     output:
-        gfa="results/hifiasm/asm.hap1.gfa",
-        gfa_alt="results/hifiasm/asm.hap2.gfa",
-        fasta="results/hifiasm/asm.hap1.fa",
-        fasta_alt="results/hifiasm/asm.hap2.fa",
+        gfa_hap1="results/hifiasm/asm.hap1.gfa",
+        gfa_hap2="results/hifiasm/asm.hap2.gfa",
+        fasta_hap1="results/hifiasm/asm.hap1.fa",
+        fasta_hap2="results/hifiasm/asm.hap2.fa",
+        link_hap1 = "results/assemblies/asm_hap1.fa",
+        link_hap2 = "results/assemblies/asm_hap2.fa"
     threads: config["hifiasm"]["t"]
     log:
         "logs/hifiasm.log",
@@ -52,9 +59,14 @@ rule hifiasm_het:
         ),
     shell:
         """
-        hifiasm {input} -t {threads} -o results/hifiasm/asm {params.optional_params} >> {log} 2>&1
-        mv results/hifiasm/asm.hic.hap1.p_ctg.gfa {output.gfa}
-        mv results/hifiasm/asm.hic.hap2.p_ctg.gfa {output.gfa_alt}
-        awk -f scripts/gfa_to_fasta.awk < {output.gfa} > {output.fasta}
-        awk -f scripts/gfa_to_fasta.awk < {output.gfa_alt} > {output.fasta_alt}
+        hifiasm {input.reads} -t {threads} -o results/hifiasm/asm {params.optional_params} >> {log} 2>&1
+        mv results/hifiasm/asm.hic.hap1.p_ctg.gfa {output.gfa_hap1}
+        mv results/hifiasm/asm.hic.hap2.p_ctg.gfa {output.gfa_hap2}
+        awk -f scripts/gfa_to_fasta.awk < {output.gfa_hap1} > {output.fasta_hap1}
+        awk -f scripts/gfa_to_fasta.awk < {output.gfa_hap2} > {output.fasta_hap2}
+
+        # all the assemblies produced by the workflow will be symlinked to results/assemblies
+
+        ln -srn {output.fasta_hap1} {output.link_hap1}
+        ln -srn {output.fasta_hap2} {output.link_hap2}       
         """
diff --git a/workflow/rules/ncbi_fcsgx.smk b/workflow/rules/ncbi_fcsgx.smk
@@ -3,13 +3,14 @@
 
 rule fcsgx:
     input:
-        fasta="results/hifiasm/asm.{hap}.fa",
+        fasta = "results/hifiasm/asm.{hap}.fa",
     params:
         ncbi_tax_id=config["fcsgx"]["ncbi_tax_id"],
         path_to_gx_db=config["fcsgx"]["path_to_gx_db"], 
     output:
         clean_fasta="results/ncbi_fcsgx_{hap}/asm_clean.fa",
-        contaminants="results/ncbi_fcsgx_{hap}/contaminants.fa"
+        contaminants="results/ncbi_fcsgx_{hap}/contaminants.fa",
+        link="results/assemblies/fcsgx_{hap}.fa"
     threads: config["hifiasm"]["t"]
     log:
         "logs/fcsgx_{hap}.log",
@@ -21,4 +22,5 @@ rule fcsgx:
         """
         run_gx.py --fasta {input.fasta} --out-dir results/ncbi_fcsgx_{wildcards.hap}/out --gx-db {params.path_to_gx_db} --tax-id {params.ncbi_tax_id} --debug >> {log} 2>&1
         cat {input.fasta} | gx clean-genome --action-report results/ncbi_fcsgx_{wildcards.hap}/out/asm.{wildcards.hap}.{params.ncbi_tax_id}.fcs_gx_report.txt --output {output.clean_fasta} --contam-fasta-out {output.contaminants} >> {log} 2>&1
+        ln -srn {output.clean_fasta} {output.link}
         """
diff --git a/workflow/rules/oatk.smk b/workflow/rules/oatk.smk
@@ -1,5 +1,5 @@
 # This rule runs oatk to extract organelles reads from the hifi reads.
-
+# get_oatk_outputs is used to dynamically decide the outputs of oatk
 
 rule oatk:
     input: