From 1f0de0ae07d437d3d0c49fd50470aa8f5c70586b Mon Sep 17 00:00:00 2001
From: LiaOb21 <liaobinu@gmail.com>
Date: Tue, 26 Mar 2024 16:38:06 +0000
Subject: [PATCH 1/5] added quast and busco

---
 config/config_test.yaml | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/config/config_test.yaml b/config/config_test.yaml
index c4f7b1c..402bfd5 100644
--- a/config/config_test.yaml
+++ b/config/config_test.yaml
@@ -79,7 +79,7 @@ fcsgx:
   path_to_gx_db: "resources/gx_test_db/test-only"
 
 # Set this to False if you want to skip purge_dups steps:
-include_purge_dups: True
+include_purge_dups: False
 
 # Customisable parameters for purge_dups
 purge_dups:
@@ -97,3 +97,19 @@ yahs:
 #  o: "hifiasm_p_purged_yahs" # output prefix #use of this needs evaluation
   optional_params: 
     "-e": "GATC" # you can specify the restriction enzyme(s) used by the Hi-C experiment
+
+# Customisable parameters for quast
+quast:
+  t: 4 # number of threads
+  mem_mb: 1000 # memory in MB
+  optional_params: 
+    "--fragmented": ""
+    "--large": ""
+#    "-r": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"
+#    "-g": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.101.gff3"
+
+# Customisable parameters for busco
+busco:
+  t: 4 # number of threads
+  mem_mb: 1000 # memory in MB
+  lineage: "resources/saccharomycetes_odb10" # lineage to be used for busco analysis

From f3ece2095cfd6e1e4b4df95fb762fb048c28a95c Mon Sep 17 00:00:00 2001
From: LiaOb21 <liaobinu@gmail.com>
Date: Tue, 26 Mar 2024 16:40:12 +0000
Subject: [PATCH 2/5] adding quast and busco

---
 workflow/Snakefile                | 14 ++++++++---
 workflow/envs/busco.yaml          |  6 +++++
 workflow/envs/quast.yaml          |  6 +++++
 workflow/rules/busco.smk          | 22 +++++++++++++++++
 workflow/rules/bwa_index.smk      |  4 ---
 workflow/rules/common.smk         | 21 +++++++++++++---
 workflow/rules/get_assemblies.smk | 15 +++++++++++
 workflow/rules/hifiasm.smk        | 41 ++++++++++++++++++++++---------
 workflow/rules/ncbi_fcsgx.smk     |  7 ++++--
 workflow/rules/oatk.smk           |  4 ---
 workflow/rules/purge_dups.smk     | 13 +++++-----
 workflow/rules/purge_dups2.smk    |  4 +++
 workflow/rules/quast.smk          | 23 +++++++++++++++++
 workflow/rules/yahs.smk           |  8 +++---
 workflow/rules/yahs_completed.smk | 15 +++++++++++
 15 files changed, 162 insertions(+), 41 deletions(-)
 create mode 100644 workflow/envs/busco.yaml
 create mode 100644 workflow/envs/quast.yaml
 create mode 100644 workflow/rules/busco.smk
 create mode 100644 workflow/rules/get_assemblies.smk
 create mode 100644 workflow/rules/quast.smk
 create mode 100644 workflow/rules/yahs_completed.smk

diff --git a/workflow/Snakefile b/workflow/Snakefile
index c7b3767..3883ed7 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -13,6 +13,11 @@ samples = glob_wildcards(config["hic_path"] + samples_in_pattern).sample
 # set {diploid_mode} global variable - used to run hifiasm_het rather than hifiasm
 diploid_mode=config["hifiasm"]["optional_params"]["--h1"] and config["hifiasm"]["optional_params"]["--h2"]
 
+# set {file} in the assemblies directory as global variable
+assembly_files = glob_wildcards("results/assemblies/{file}.fa")[0]
+
+# Include the common rule, where the inputs for rule_all are defined
+include: "rules/common.smk"
 # Include the hifi_prep rule
 include: "rules/hifi_prep.smk"
 # Include the nanoplot
@@ -45,9 +50,12 @@ include: "rules/two_read_bam_combiner.smk"
 include: "rules/picard.smk"
 # Include the yahs rule
 include: "rules/yahs.smk"
-# Include the common rule, where the inputs for rule_all are defined
-include: "rules/common.smk"
-
+# include yahs checkpoint - to start the QC after yahs is done
+include: "rules/yahs_completed.smk"
+# Include the quast rule
+include: "rules/quast.smk"
+# Include the busco rule
+include: "rules/busco.smk"
 
 # Define the rule all, which is the default rule that Snakemake runs when no specific rule or target is specified
 # input files are defined in the common.smk file
diff --git a/workflow/envs/busco.yaml b/workflow/envs/busco.yaml
new file mode 100644
index 0000000..1bea8c4
--- /dev/null
+++ b/workflow/envs/busco.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+
+dependencies:
+  - busco=5.7.0
\ No newline at end of file
diff --git a/workflow/envs/quast.yaml b/workflow/envs/quast.yaml
new file mode 100644
index 0000000..dda2a1f
--- /dev/null
+++ b/workflow/envs/quast.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+
+dependencies:
+  - quast=5.2.0
\ No newline at end of file
diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk
new file mode 100644
index 0000000..4226640
--- /dev/null
+++ b/workflow/rules/busco.smk
@@ -0,0 +1,22 @@
+# rule busco for assembly QC
+
+rule busco:
+    input:
+        completion = "results/yahs_all_done.txt",
+        assemblies = "results/assemblies/{file}.fa"
+    output:
+        dir = directory("results/busco/{file}.fa_busco"),
+    params:
+        lineage=config["busco"]["lineage"],
+    threads: config["busco"]["t"]
+    log:
+        "logs/busco{file}.log"
+    resources:
+        mem_mb=config['busco']['mem_mb'],  # access memory from config
+    conda:
+        "../envs/busco.yaml"
+    shell:
+        """
+        # run busco on each assembly
+		busco -i {input.assemblies} -o {output.dir} -m genome -l {params.lineage} -f -c {threads}
+        """
\ No newline at end of file
diff --git a/workflow/rules/bwa_index.smk b/workflow/rules/bwa_index.smk
index d264fee..2e03418 100644
--- a/workflow/rules/bwa_index.smk
+++ b/workflow/rules/bwa_index.smk
@@ -3,10 +3,6 @@
 # For more info: https://github.com/ArimaGenomics/mapping_pipeline
 # colora doesn't contain the codes to handle technical and biological replicates of hic reads, refer to the original pipeline for that
 
-
-# include common.smk to use wildcards
-include: "common.smk"
-
 rule bwa_index:
     input:
         get_bwa_index_inputs
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index f0c09c4..f78457f 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -8,9 +8,9 @@ wildcard_constraints:
 
 def get_all_inputs(wc=None):
     if diploid_mode: 
-        hap= ["hap1", "hap2"]
+        hap = ["hap1", "hap2"]
     else:
-        hap = ["primary"]
+        hap = ["primary"]    
     if not samples:
         # Show the user a meaningful error message
         logger.error(f"No files matching {samples_in_pattern} were found in"
@@ -24,6 +24,7 @@ def get_all_inputs(wc=None):
         "results/nanoplot/NanoPlot-report.html",  # nanoplot report
         "results/kmc/out.hist",  # output of kmc rule
         "results/genomescope",  # output directory of genomescope
+        "results/quast/report.html"
     ]
 
     # Get the OATK outputs
@@ -44,6 +45,11 @@ def get_all_inputs(wc=None):
     if not diploid_mode:
         inputs.append("results/purge_dups_alt/asm.alternate_purged.fa")
 
+    # Extend list of inputs with busco directories
+    inputs.extend(expand(
+        "results/busco/{file}.fa_busco", file=assembly_files
+    ))
+
     return inputs
 
 
@@ -75,10 +81,17 @@ def get_purge_dups_inputs():
 def get_bwa_index_inputs(wildcards):
     hap = wildcards.hap
     if config["include_purge_dups"] == True:
-        return "results/purge_dups/asm.{hap}_purged.fa"
+        return f"results/purge_dups/asm.{hap}_purged.fa"
     elif config["include_fcsgx"] == True:
         return f"results/ncbi_fcsgx_{hap}/asm_clean.fa"
     else:
         return f"results/hifiasm/asm.{hap}.fa"
 
-
+def get_yahs_output():
+    if diploid_mode:
+        return [
+            expand("results/yahs_hap1/asm_yahs_{sample}_scaffolds_final.fa", sample=samples),
+            expand("results/yahs_hap2/asm_yahs_{sample}_scaffolds_final.fa", sample=samples)
+        ]
+    else:
+        return expand("results/yahs_primary/asm_yahs_{sample}_scaffolds_final.fa", sample=samples)
diff --git a/workflow/rules/get_assemblies.smk b/workflow/rules/get_assemblies.smk
new file mode 100644
index 0000000..58eb087
--- /dev/null
+++ b/workflow/rules/get_assemblies.smk
@@ -0,0 +1,15 @@
+include: "common.smk"
+
+rule get_assemblies:
+    output:
+        directory("results/assemblies")
+    log:
+        "logs/get_assemblies.log",
+    resources:
+        mem_mb=config['arima']['mem_mb'],  # access memory from config
+    conda:
+        "../envs/basic.yaml"
+    shell:
+        """
+        mkdir -p {output}
+        """
\ No newline at end of file
diff --git a/workflow/rules/hifiasm.smk b/workflow/rules/hifiasm.smk
index d838c06..967f718 100644
--- a/workflow/rules/hifiasm.smk
+++ b/workflow/rules/hifiasm.smk
@@ -4,12 +4,16 @@
 
 rule hifiasm:
     input:
-        "results/reads/hifi/hifi.fastq.gz",
+        reads = "results/reads/hifi/hifi.fastq.gz",
+        asm_dir = "results/assemblies"
     output:
         gfa="results/hifiasm/asm.primary.gfa",
         gfa_alt="results/hifiasm/asm.alternate.gfa",
         fasta="results/hifiasm/asm.primary.fa",
         fasta_alt="results/hifiasm/asm.alternate.fa",
+        asm_dir = directory("results/assemblies"),
+        link_primary = "results/assemblies/asm.primary.fa",
+        link_alternate = "results/assemblies/asm.alternate.fa"
     threads: config["hifiasm"]["t"]
     log:
         "logs/hifiasm.log",
@@ -23,22 +27,30 @@ rule hifiasm:
         ),
     shell:
         """
-        hifiasm {input} -t {threads} -o results/hifiasm/asm --primary {params.optional_params} >> {log} 2>&1
+        hifiasm {input.reads} -t {threads} -o results/hifiasm/asm --primary {params.optional_params} >> {log} 2>&1
         mv results/hifiasm/asm.p_ctg.gfa {output.gfa}
         mv results/hifiasm/asm.a_ctg.gfa {output.gfa_alt}       
         awk -f scripts/gfa_to_fasta.awk < {output.gfa} > {output.fasta}
         awk -f scripts/gfa_to_fasta.awk < {output.gfa_alt} > {output.fasta_alt}
+
+        # all the assemblies produced by the workflow will be symlinked to results/assemblies
+
+        ln -srn {output.fasta} {output.link_primary}
+        ln -srn {output.fasta_alt} {output.link_alternate}
         """
 
 
 rule hifiasm_het:
     input:
-        "results/reads/hifi/hifi.fastq.gz",
+        reads = "results/reads/hifi/hifi.fastq.gz",
     output:
-        gfa="results/hifiasm/asm.hap1.gfa",
-        gfa_alt="results/hifiasm/asm.hap2.gfa",
-        fasta="results/hifiasm/asm.hap1.fa",
-        fasta_alt="results/hifiasm/asm.hap2.fa",
+        gfa_hap1="results/hifiasm/asm.hap1.gfa",
+        gfa_hap2="results/hifiasm/asm.hap2.gfa",
+        fasta_hap1="results/hifiasm/asm.hap1.fa",
+        fasta_hap2="results/hifiasm/asm.hap2.fa",
+        asm_dir = directory("results/assemblies"),
+        link_hap1 = "results/assemblies/asm.hap1.fa",
+        link_hap2 = "results/assemblies/asm.hap2.fa"
     threads: config["hifiasm"]["t"]
     log:
         "logs/hifiasm.log",
@@ -52,9 +64,14 @@ rule hifiasm_het:
         ),
     shell:
         """
-        hifiasm {input} -t {threads} -o results/hifiasm/asm {params.optional_params} >> {log} 2>&1
-        mv results/hifiasm/asm.hic.hap1.p_ctg.gfa {output.gfa}
-        mv results/hifiasm/asm.hic.hap2.p_ctg.gfa {output.gfa_alt}
-        awk -f scripts/gfa_to_fasta.awk < {output.gfa} > {output.fasta}
-        awk -f scripts/gfa_to_fasta.awk < {output.gfa_alt} > {output.fasta_alt}
+        hifiasm {input.reads} -t {threads} -o results/hifiasm/asm {params.optional_params} >> {log} 2>&1
+        mv results/hifiasm/asm.hic.hap1.p_ctg.gfa {output.gfa_hap1}
+        mv results/hifiasm/asm.hic.hap2.p_ctg.gfa {output.gfa_hap2}
+        awk -f scripts/gfa_to_fasta.awk < {output.gfa_hap1} > {output.fasta_hap1}
+        awk -f scripts/gfa_to_fasta.awk < {output.gfa_hap2} > {output.fasta_hap2}
+
+        # all the assemblies produced by the workflow will be symlinked to results/assemblies
+
+        ln -srn {output.fasta_hap1} {output.link_hap1}
+        ln -srn {output.fasta_hap2} {output.link_hap2}       
         """
diff --git a/workflow/rules/ncbi_fcsgx.smk b/workflow/rules/ncbi_fcsgx.smk
index 6d9e203..2f95656 100644
--- a/workflow/rules/ncbi_fcsgx.smk
+++ b/workflow/rules/ncbi_fcsgx.smk
@@ -3,13 +3,15 @@
 
 rule fcsgx:
     input:
-        fasta="results/hifiasm/asm.{hap}.fa",
+        fasta = "results/hifiasm/asm.{hap}.fa",
+        asm_dir ="results/assemblies"
     params:
         ncbi_tax_id=config["fcsgx"]["ncbi_tax_id"],
         path_to_gx_db=config["fcsgx"]["path_to_gx_db"], 
     output:
         clean_fasta="results/ncbi_fcsgx_{hap}/asm_clean.fa",
-        contaminants="results/ncbi_fcsgx_{hap}/contaminants.fa"
+        contaminants="results/ncbi_fcsgx_{hap}/contaminants.fa",
+        link="results/assemblies/fcsgx_{hap}.fa"
     threads: config["hifiasm"]["t"]
     log:
         "logs/fcsgx_{hap}.log",
@@ -21,4 +23,5 @@ rule fcsgx:
         """
         run_gx.py --fasta {input.fasta} --out-dir results/ncbi_fcsgx_{wildcards.hap}/out --gx-db {params.path_to_gx_db} --tax-id {params.ncbi_tax_id} --debug >> {log} 2>&1
         cat {input.fasta} | gx clean-genome --action-report results/ncbi_fcsgx_{wildcards.hap}/out/asm.{wildcards.hap}.{params.ncbi_tax_id}.fcs_gx_report.txt --output {output.clean_fasta} --contam-fasta-out {output.contaminants} >> {log} 2>&1
+        ln -srn {output.clean_fasta} {output.link}
         """
diff --git a/workflow/rules/oatk.smk b/workflow/rules/oatk.smk
index 843eb89..5be3626 100644
--- a/workflow/rules/oatk.smk
+++ b/workflow/rules/oatk.smk
@@ -1,9 +1,5 @@
 # This rule runs oatk to extract organelles reads from the hifi reads.
-
-# include common.smk to use get_oatk_outputs function
 # get_oatk_outputs is used to dynamically decide the outputs of oatk
-include: "common.smk"
-
 
 rule oatk:
     input:
diff --git a/workflow/rules/purge_dups.smk b/workflow/rules/purge_dups.smk
index bb40a8a..5eb8c0e 100644
--- a/workflow/rules/purge_dups.smk
+++ b/workflow/rules/purge_dups.smk
@@ -1,14 +1,10 @@
 # This rule uses purge_dups purge haplotigs and overlaps from the primary assembly produced by hifiasm
 
-
-# include common.smk to use get_purge_dups_inputs
-include: "common.smk"
-
-
 rule purge_dups:
     input:
-        reads="results/reads/hifi/hifi.fastq.gz",
-        fasta=get_purge_dups_inputs(),
+        reads = "results/reads/hifi/hifi.fastq.gz",
+        fasta = get_purge_dups_inputs(),
+        asm_dir = "results/assemblies"
     output:
         paf="results/purge_dups/hifi_vs_primary_contigs.paf.gz",
         calcuts_log="results/purge_dups/calcuts.log",
@@ -23,6 +19,7 @@ rule purge_dups:
         hap_fa="results/purge_dups/hap.fa",
         purged_fasta="results/purge_dups/asm.primary_purged.fa",
         hist_plot="results/purge_dups/hist.out.png",
+        link="results/assemblies/purged_primary.fa"
     threads: config["minimap2"]["t"]
     log:
         "logs/purge_dups.log",
@@ -55,4 +52,6 @@ rule purge_dups:
         mv hap.fa {output.hap_fa}
         mv purged.fa {output.purged_fasta}
         mv hist.out.png {output.hist_plot}
+
+        ln -srn {output.purged_fasta} {output.link}
         """
diff --git a/workflow/rules/purge_dups2.smk b/workflow/rules/purge_dups2.smk
index f8461d9..bc420c4 100644
--- a/workflow/rules/purge_dups2.smk
+++ b/workflow/rules/purge_dups2.smk
@@ -6,6 +6,7 @@ rule purge_dups_alt:
         reads="results/reads/hifi/hifi.fastq.gz",
         fasta="results/hifiasm/asm.alternate.fa",
         hap_fa_in="results/purge_dups/hap.fa",
+        asm_dir = "results/assemblies"
     output:
         merged_fasta="results/purge_dups_alt/merged.fa",
         paf="results/purge_dups_alt/hifi_vs_alternate_contigs.paf.gz",
@@ -21,6 +22,7 @@ rule purge_dups_alt:
         hap_fa="results/purge_dups_alt/hap.fa",
         purged_fasta="results/purge_dups_alt/asm.alternate_purged.fa",
         hist_plot="results/purge_dups_alt/hist.out.png",
+        link="results/assemblies/purged_alternate.fa"
     threads: config["minimap2"]["t"]
     log:
         "logs/purge_dups_alt.log",
@@ -55,4 +57,6 @@ rule purge_dups_alt:
         mv hap.fa {output.hap_fa}
         mv purged.fa {output.purged_fasta}
         mv hist.out.png {output.hist_plot}
+
+        ln -srn {output.purged_fasta} {output.link}
         """
diff --git a/workflow/rules/quast.smk b/workflow/rules/quast.smk
new file mode 100644
index 0000000..f80942d
--- /dev/null
+++ b/workflow/rules/quast.smk
@@ -0,0 +1,23 @@
+# rule quast for assembly QC
+
+rule quast:
+    input:
+        completion = "results/yahs_all_done.txt",
+        assemblies = expand("results/assemblies/{file}.fa", file=assembly_files)
+    output:
+        "results/quast/report.html",
+    threads: config["quast"]["t"]
+    log:
+        "logs/quast.log",
+    resources:
+        mem_mb=config['quast']['mem_mb'],  # access memory from config
+    conda:
+        "../envs/quast.yaml"
+    params:
+        optional_params=" ".join(
+            f"{k} {v}" for k, v in config["quast"]["optional_params"].items() if v
+        ),
+    shell:
+        """ 
+        quast {input.assemblies} -o results/quast  -t {threads} {params.optional_params}
+        """
\ No newline at end of file
diff --git a/workflow/rules/yahs.smk b/workflow/rules/yahs.smk
index bca1607..2b45e85 100644
--- a/workflow/rules/yahs.smk
+++ b/workflow/rules/yahs.smk
@@ -2,17 +2,14 @@
 
 import glob
 
-
-# include common.smk to use get_bwa_index_inputs
-include: "common.smk"
-
-
 rule yahs:
     input:
         REF="results/bwa_index_{hap}/asm.fa",
         bam="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam",
+        asm_dir = "results/assemblies"
     output:
         scaffolds="results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa",
+        link="results/assemblies/yahs_{hap}_{sample}.fa",
     log:
         "logs/yahs_{hap}_{sample}.log",
     resources:
@@ -26,4 +23,5 @@ rule yahs:
     shell:
         """ 
         yahs {input.REF} {input.bam} -o results/yahs_{wildcards.hap}/asm_yahs_{wildcards.sample} {params.optional_params} >> {log} 2>&1
+        ln -srn {output.scaffolds} {output.link}
         """
diff --git a/workflow/rules/yahs_completed.smk b/workflow/rules/yahs_completed.smk
new file mode 100644
index 0000000..56c503b
--- /dev/null
+++ b/workflow/rules/yahs_completed.smk
@@ -0,0 +1,15 @@
+# add a checkpoint to run the QC after all the assemblies have been symlinked to results/assemblies
+
+rule all_yahs_completed:
+    input:
+        get_yahs_output()
+    output:
+        "results/yahs_all_done.txt"
+    log:
+        "logs/all_yahs_completed.log",
+    conda:
+        "../envs/basic.yaml"
+    shell:
+        """
+        ls {input} > {output}
+        """
\ No newline at end of file

From 3bcaf2bad3bb3cf3c6ac90df1db0ab8a0bc1985d Mon Sep 17 00:00:00 2001
From: LiaOb21 <liaobinu@gmail.com>
Date: Wed, 27 Mar 2024 11:58:08 +0000
Subject: [PATCH 3/5] adding quast and busco

---
 config/config_test.yaml           |  6 +++---
 workflow/Snakefile                |  8 ++++----
 workflow/rules/busco.smk          |  7 ++++---
 workflow/rules/common.smk         | 27 +++++++++++++--------------
 workflow/rules/hifiasm.smk        |  3 ---
 workflow/rules/ncbi_fcsgx.smk     |  1 -
 workflow/rules/purge_dups.smk     |  4 ----
 workflow/rules/purge_dups2.smk    |  4 ----
 workflow/rules/quast.smk          |  2 +-
 workflow/rules/yahs.smk           |  1 -
 workflow/rules/yahs_completed.smk |  8 ++++----
 11 files changed, 29 insertions(+), 42 deletions(-)

diff --git a/config/config_test.yaml b/config/config_test.yaml
index 402bfd5..55e8ce6 100644
--- a/config/config_test.yaml
+++ b/config/config_test.yaml
@@ -66,11 +66,11 @@ hifiasm:
     "-f": "0" # used for small datasets
     "-l": "" # purge level. 0: no purging; 1: light; 2/3: aggressive [0 for trio; 3 for unzip]
     "--ul": "" # use this if you have also ont data you want to integrate in your assembly
-    "--h1": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_1 data if you want to produce a phased assembly
-    "--h2": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_2 data if you want to produce a phased assembly
+#    "--h1": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_1 data if you want to produce a phased assembly
+#    "--h2": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_2 data if you want to produce a phased assembly
 
 #Set this to False if you want to skip the fcsgx step:
-include_fcsgx: True #inlcude this rule only if you have preiously downloaded the database (recommended to run fcsgx only on a HPC. It requires around 500 GB of space on your disk and a large RAM)
+include_fcsgx: False #inlcude this rule only if you have preiously downloaded the database (recommended to run fcsgx only on a HPC. It requires around 500 GB of space on your disk and a large RAM)
 
 # Customisable parameters for fcsgx
 fcsgx:
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 3883ed7..ce40eb9 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -14,7 +14,7 @@ samples = glob_wildcards(config["hic_path"] + samples_in_pattern).sample
 diploid_mode=config["hifiasm"]["optional_params"]["--h1"] and config["hifiasm"]["optional_params"]["--h2"]
 
 # set {file} in the assemblies directory as global variable
-assembly_files = glob_wildcards("results/assemblies/{file}.fa")[0]
+# assembly_files = glob_wildcards("results/assemblies/{file}.fa")
 
 # Include the common rule, where the inputs for rule_all are defined
 include: "rules/common.smk"
@@ -51,11 +51,11 @@ include: "rules/picard.smk"
 # Include the yahs rule
 include: "rules/yahs.smk"
 # include yahs checkpoint - to start the QC after yahs is done
-include: "rules/yahs_completed.smk"
+#include: "rules/yahs_completed.smk"
 # Include the quast rule
-include: "rules/quast.smk"
+#include: "rules/quast.smk"
 # Include the busco rule
-include: "rules/busco.smk"
+#include: "rules/busco.smk"
 
 # Define the rule all, which is the default rule that Snakemake runs when no specific rule or target is specified
 # input files are defined in the common.smk file
diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk
index 4226640..82b85da 100644
--- a/workflow/rules/busco.smk
+++ b/workflow/rules/busco.smk
@@ -2,7 +2,7 @@
 
 rule busco:
     input:
-        completion = "results/yahs_all_done.txt",
+        completion = checkpoints.yahs_completed.get().completion_marker,
         assemblies = "results/assemblies/{file}.fa"
     output:
         dir = directory("results/busco/{file}.fa_busco"),
@@ -10,7 +10,7 @@ rule busco:
         lineage=config["busco"]["lineage"],
     threads: config["busco"]["t"]
     log:
-        "logs/busco{file}.log"
+        "logs/busco_{file}.log"
     resources:
         mem_mb=config['busco']['mem_mb'],  # access memory from config
     conda:
@@ -18,5 +18,6 @@ rule busco:
     shell:
         """
         # run busco on each assembly
-		busco -i {input.assemblies} -o {output.dir} -m genome -l {params.lineage} -f -c {threads}
+        #assembly_name=$(basename {input.assemblies})
+		busco -i {input.assemblies} -o {output.dir}/{wildcards.file}_busco -m genome -l {params.lineage} -f -c {threads}
         """
\ No newline at end of file
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index f78457f..0953511 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -24,7 +24,7 @@ def get_all_inputs(wc=None):
         "results/nanoplot/NanoPlot-report.html",  # nanoplot report
         "results/kmc/out.hist",  # output of kmc rule
         "results/genomescope",  # output directory of genomescope
-        "results/quast/report.html"
+#        "results/quast/report.html"
     ]
 
     # Get the OATK outputs
@@ -45,10 +45,7 @@ def get_all_inputs(wc=None):
     if not diploid_mode:
         inputs.append("results/purge_dups_alt/asm.alternate_purged.fa")
 
-    # Extend list of inputs with busco directories
-    inputs.extend(expand(
-        "results/busco/{file}.fa_busco", file=assembly_files
-    ))
+#    inputs.extend(expand("results/busco/{file}.fa_busco", file=assembly_files))
 
     return inputs
 
@@ -81,17 +78,19 @@ def get_purge_dups_inputs():
 def get_bwa_index_inputs(wildcards):
     hap = wildcards.hap
     if config["include_purge_dups"] == True:
-        return f"results/purge_dups/asm.{hap}_purged.fa"
+        return "results/purge_dups/asm.{hap}_purged.fa"
     elif config["include_fcsgx"] == True:
         return f"results/ncbi_fcsgx_{hap}/asm_clean.fa"
     else:
         return f"results/hifiasm/asm.{hap}.fa"
 
-def get_yahs_output():
-    if diploid_mode:
-        return [
-            expand("results/yahs_hap1/asm_yahs_{sample}_scaffolds_final.fa", sample=samples),
-            expand("results/yahs_hap2/asm_yahs_{sample}_scaffolds_final.fa", sample=samples)
-        ]
-    else:
-        return expand("results/yahs_primary/asm_yahs_{sample}_scaffolds_final.fa", sample=samples)
+#def get_yahs_output():
+#    if diploid_mode:
+#        return [
+#            expand("results/yahs_hap1/asm_yahs_{sample}_scaffolds_final.fa", sample=samples),
+#            expand("results/yahs_hap2/asm_yahs_{sample}_scaffolds_final.fa", sample=samples)
+#        ]
+#    else:
+#        return expand("results/yahs_primary/asm_yahs_{sample}_scaffolds_final.fa", sample=samples)
+
+
diff --git a/workflow/rules/hifiasm.smk b/workflow/rules/hifiasm.smk
index 967f718..68372b1 100644
--- a/workflow/rules/hifiasm.smk
+++ b/workflow/rules/hifiasm.smk
@@ -5,13 +5,11 @@
 rule hifiasm:
     input:
         reads = "results/reads/hifi/hifi.fastq.gz",
-        asm_dir = "results/assemblies"
     output:
         gfa="results/hifiasm/asm.primary.gfa",
         gfa_alt="results/hifiasm/asm.alternate.gfa",
         fasta="results/hifiasm/asm.primary.fa",
         fasta_alt="results/hifiasm/asm.alternate.fa",
-        asm_dir = directory("results/assemblies"),
         link_primary = "results/assemblies/asm.primary.fa",
         link_alternate = "results/assemblies/asm.alternate.fa"
     threads: config["hifiasm"]["t"]
@@ -48,7 +46,6 @@ rule hifiasm_het:
         gfa_hap2="results/hifiasm/asm.hap2.gfa",
         fasta_hap1="results/hifiasm/asm.hap1.fa",
         fasta_hap2="results/hifiasm/asm.hap2.fa",
-        asm_dir = directory("results/assemblies"),
         link_hap1 = "results/assemblies/asm.hap1.fa",
         link_hap2 = "results/assemblies/asm.hap2.fa"
     threads: config["hifiasm"]["t"]
diff --git a/workflow/rules/ncbi_fcsgx.smk b/workflow/rules/ncbi_fcsgx.smk
index 2f95656..14b5478 100644
--- a/workflow/rules/ncbi_fcsgx.smk
+++ b/workflow/rules/ncbi_fcsgx.smk
@@ -4,7 +4,6 @@
 rule fcsgx:
     input:
         fasta = "results/hifiasm/asm.{hap}.fa",
-        asm_dir ="results/assemblies"
     params:
         ncbi_tax_id=config["fcsgx"]["ncbi_tax_id"],
         path_to_gx_db=config["fcsgx"]["path_to_gx_db"], 
diff --git a/workflow/rules/purge_dups.smk b/workflow/rules/purge_dups.smk
index 5eb8c0e..9afd2d5 100644
--- a/workflow/rules/purge_dups.smk
+++ b/workflow/rules/purge_dups.smk
@@ -4,7 +4,6 @@ rule purge_dups:
     input:
         reads = "results/reads/hifi/hifi.fastq.gz",
         fasta = get_purge_dups_inputs(),
-        asm_dir = "results/assemblies"
     output:
         paf="results/purge_dups/hifi_vs_primary_contigs.paf.gz",
         calcuts_log="results/purge_dups/calcuts.log",
@@ -19,7 +18,6 @@ rule purge_dups:
         hap_fa="results/purge_dups/hap.fa",
         purged_fasta="results/purge_dups/asm.primary_purged.fa",
         hist_plot="results/purge_dups/hist.out.png",
-        link="results/assemblies/purged_primary.fa"
     threads: config["minimap2"]["t"]
     log:
         "logs/purge_dups.log",
@@ -52,6 +50,4 @@ rule purge_dups:
         mv hap.fa {output.hap_fa}
         mv purged.fa {output.purged_fasta}
         mv hist.out.png {output.hist_plot}
-
-        ln -srn {output.purged_fasta} {output.link}
         """
diff --git a/workflow/rules/purge_dups2.smk b/workflow/rules/purge_dups2.smk
index bc420c4..f8461d9 100644
--- a/workflow/rules/purge_dups2.smk
+++ b/workflow/rules/purge_dups2.smk
@@ -6,7 +6,6 @@ rule purge_dups_alt:
         reads="results/reads/hifi/hifi.fastq.gz",
         fasta="results/hifiasm/asm.alternate.fa",
         hap_fa_in="results/purge_dups/hap.fa",
-        asm_dir = "results/assemblies"
     output:
         merged_fasta="results/purge_dups_alt/merged.fa",
         paf="results/purge_dups_alt/hifi_vs_alternate_contigs.paf.gz",
@@ -22,7 +21,6 @@ rule purge_dups_alt:
         hap_fa="results/purge_dups_alt/hap.fa",
         purged_fasta="results/purge_dups_alt/asm.alternate_purged.fa",
         hist_plot="results/purge_dups_alt/hist.out.png",
-        link="results/assemblies/purged_alternate.fa"
     threads: config["minimap2"]["t"]
     log:
         "logs/purge_dups_alt.log",
@@ -57,6 +55,4 @@ rule purge_dups_alt:
         mv hap.fa {output.hap_fa}
         mv purged.fa {output.purged_fasta}
         mv hist.out.png {output.hist_plot}
-
-        ln -srn {output.purged_fasta} {output.link}
         """
diff --git a/workflow/rules/quast.smk b/workflow/rules/quast.smk
index f80942d..1f4adc3 100644
--- a/workflow/rules/quast.smk
+++ b/workflow/rules/quast.smk
@@ -2,7 +2,7 @@
 
 rule quast:
     input:
-        completion = "results/yahs_all_done.txt",
+        completion = checkpoints.yahs_completed.get().completion_marker,
         assemblies = expand("results/assemblies/{file}.fa", file=assembly_files)
     output:
         "results/quast/report.html",
diff --git a/workflow/rules/yahs.smk b/workflow/rules/yahs.smk
index 2b45e85..f73a801 100644
--- a/workflow/rules/yahs.smk
+++ b/workflow/rules/yahs.smk
@@ -6,7 +6,6 @@ rule yahs:
     input:
         REF="results/bwa_index_{hap}/asm.fa",
         bam="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam",
-        asm_dir = "results/assemblies"
     output:
         scaffolds="results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa",
         link="results/assemblies/yahs_{hap}_{sample}.fa",
diff --git a/workflow/rules/yahs_completed.smk b/workflow/rules/yahs_completed.smk
index 56c503b..c3611e3 100644
--- a/workflow/rules/yahs_completed.smk
+++ b/workflow/rules/yahs_completed.smk
@@ -1,15 +1,15 @@
 # add a checkpoint to run the QC after all the assemblies have been symlinked to results/assemblies
 
-rule all_yahs_completed:
+checkpoint symlinks:
     input:
-        get_yahs_output()
+        expand("results/assemblies/yahs_{hap}_{sample}.fa", sample=samples, hap=hap)
     output:
-        "results/yahs_all_done.txt"
+        completion_marker = "results/yahs_all_done.txt"
     log:
         "logs/all_yahs_completed.log",
     conda:
         "../envs/basic.yaml"
     shell:
         """
-        ls {input} > {output}
+        ls {input} > {output.completion_marker}
         """
\ No newline at end of file

From 532d0a49986194705dab3c55c4edbb821a74736f Mon Sep 17 00:00:00 2001
From: LiaOb21 <liaobinu@gmail.com>
Date: Thu, 28 Mar 2024 18:30:23 +0000
Subject: [PATCH 4/5] updating config format

---
 config/config.yaml      | 17 +++++++++++++++++
 config/config_test.yaml |  6 +++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 3061040..3e8a1cb 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -99,3 +99,20 @@ yahs:
 #  o: "hifiasm_p_purged_yahs" # output prefix use of this needs evaluation
   optional_params: 
     "-e": "" # you can specify the restriction enzyme(s) used by the Hi-C experiment
+
+
+# Customisable parameters for quast
+quast:
+  t: 4 # number of threads
+  mem_mb: 1000 # memory in MB
+  optional_params: 
+    "--fragmented": ""
+    "--large": ""
+#    "-r": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"
+#    "-g": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.101.gff3"
+
+# Customisable parameters for busco
+busco:
+  t: 4 # number of threads
+  mem_mb: 1000 # memory in MB
+  lineage: "resources/saccharomycetes_odb10" # lineage to be used for busco analysis
diff --git a/config/config_test.yaml b/config/config_test.yaml
index 55e8ce6..402bfd5 100644
--- a/config/config_test.yaml
+++ b/config/config_test.yaml
@@ -66,11 +66,11 @@ hifiasm:
     "-f": "0" # used for small datasets
     "-l": "" # purge level. 0: no purging; 1: light; 2/3: aggressive [0 for trio; 3 for unzip]
     "--ul": "" # use this if you have also ont data you want to integrate in your assembly
-#    "--h1": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_1 data if you want to produce a phased assembly
-#    "--h2": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_2 data if you want to produce a phased assembly
+    "--h1": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_1 data if you want to produce a phased assembly
+    "--h2": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_2 data if you want to produce a phased assembly
 
 #Set this to False if you want to skip the fcsgx step:
-include_fcsgx: False #inlcude this rule only if you have preiously downloaded the database (recommended to run fcsgx only on a HPC. It requires around 500 GB of space on your disk and a large RAM)
+include_fcsgx: True #inlcude this rule only if you have preiously downloaded the database (recommended to run fcsgx only on a HPC. It requires around 500 GB of space on your disk and a large RAM)
 
 # Customisable parameters for fcsgx
 fcsgx:

From 2774a835e66ed97697373a4abc2f9e964c9da51a Mon Sep 17 00:00:00 2001
From: LiaOb21 <liaobinu@gmail.com>
Date: Thu, 28 Mar 2024 18:31:12 +0000
Subject: [PATCH 5/5] adding busco and quast

---
 workflow/Snakefile                       | 10 ++-----
 workflow/rules/busco.smk                 | 19 +++++++-----
 workflow/rules/bwa_mem.smk               | 10 +++----
 workflow/rules/common.smk                | 38 ++++++++++++++----------
 workflow/rules/fastp.smk                 | 22 +++++---------
 workflow/rules/filter_five_end.smk       | 10 +++----
 workflow/rules/hifiasm.smk               |  8 ++---
 workflow/rules/picard.smk                | 19 +++++++-----
 workflow/rules/purge_dups.smk            |  3 ++
 workflow/rules/quast.smk                 | 12 ++++----
 workflow/rules/two_read_bam_combiner.smk |  8 ++---
 workflow/rules/yahs.smk                  | 10 +++----
 workflow/rules/yahs_completed.smk        |  8 ++---
 13 files changed, 90 insertions(+), 87 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index ce40eb9..29f5f12 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -8,13 +8,11 @@ configfile: "config/config.yaml"
 
 # set {samples} global variable - used to read Hi-C files
 samples_in_pattern = "{sample}_1.fastq.gz"
-samples = glob_wildcards(config["hic_path"] + samples_in_pattern).sample
+sample, = glob_wildcards(config["hic_path"] + samples_in_pattern).sample
 
 # set {diploid_mode} global variable - used to run hifiasm_het rather than hifiasm
 diploid_mode=config["hifiasm"]["optional_params"]["--h1"] and config["hifiasm"]["optional_params"]["--h2"]
 
-# set {file} in the assemblies directory as global variable
-# assembly_files = glob_wildcards("results/assemblies/{file}.fa")
 
 # Include the common rule, where the inputs for rule_all are defined
 include: "rules/common.smk"
@@ -50,12 +48,10 @@ include: "rules/two_read_bam_combiner.smk"
 include: "rules/picard.smk"
 # Include the yahs rule
 include: "rules/yahs.smk"
-# include yahs checkpoint - to start the QC after yahs is done
-#include: "rules/yahs_completed.smk"
 # Include the quast rule
-#include: "rules/quast.smk"
+include: "rules/quast.smk"
 # Include the busco rule
-#include: "rules/busco.smk"
+include: "rules/busco.smk"
 
 # Define the rule all, which is the default rule that Snakemake runs when no specific rule or target is specified
 # input files are defined in the common.smk file
diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk
index 82b85da..ee83669 100644
--- a/workflow/rules/busco.smk
+++ b/workflow/rules/busco.smk
@@ -2,22 +2,27 @@
 
 rule busco:
     input:
-        completion = checkpoints.yahs_completed.get().completion_marker,
-        assemblies = "results/assemblies/{file}.fa"
+        unpack(get_assemblies_QC)
     output:
-        dir = directory("results/busco/{file}.fa_busco"),
+        dir = directory("results/busco"),
     params:
         lineage=config["busco"]["lineage"],
+        labels = list(get_assemblies_QC().keys())
     threads: config["busco"]["t"]
     log:
-        "logs/busco_{file}.log"
+        "logs/busco.log"
     resources:
         mem_mb=config['busco']['mem_mb'],  # access memory from config
     conda:
         "../envs/busco.yaml"
     shell:
         """
-        # run busco on each assembly
-        #assembly_name=$(basename {input.assemblies})
-		busco -i {input.assemblies} -o {output.dir}/{wildcards.file}_busco -m genome -l {params.lineage} -f -c {threads}
+        mkdir -p {output}
+        labels=({params.labels})
+        count=0
+        for v in {input} ; do
+            k=${{labels[$count]}}
+            busco -i $v -o {output}/$k -m genome -l {params.lineage} -f -c {threads} >> {log} 2>&1
+            count=$(( $count + 1 ))
+        done
         """
\ No newline at end of file
diff --git a/workflow/rules/bwa_mem.smk b/workflow/rules/bwa_mem.smk
index 87d45e0..ae02479 100644
--- a/workflow/rules/bwa_mem.smk
+++ b/workflow/rules/bwa_mem.smk
@@ -8,14 +8,14 @@
 rule bwa_mem:
     input:
         REF="results/bwa_index_{hap}/asm.fa",
-        forward_hic="results/fastp/{sample}_trim_1.fastq.gz",
-        reverse_hic="results/fastp/{sample}_trim_2.fastq.gz",
+        forward_hic="results/fastp/hic_trim_1.fastq.gz",
+        reverse_hic="results/fastp/hic_trim_2.fastq.gz",
     output:
-        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_1.bam",
-        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_2.bam",
+        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_1.bam",
+        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_2.bam",
     threads: config["arima"]["CPU"]
     log:
-        "logs/bwa_mem_{hap}_{sample}.log",
+        "logs/bwa_mem_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'],  # access memory from config    
     conda:
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 0953511..462de5d 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -4,14 +4,14 @@
 import os
 
 wildcard_constraints:
-    hap = "primary|hap1|hap2"
+    hap = "primary|hap1|hap2",
 
 def get_all_inputs(wc=None):
     if diploid_mode: 
         hap = ["hap1", "hap2"]
     else:
-        hap = ["primary"]    
-    if not samples:
+        hap = ["primary"]      
+    if not sample:
         # Show the user a meaningful error message
         logger.error(f"No files matching {samples_in_pattern} were found in"
                      f" {config.get('hic_path')}. Please check your config file.")
@@ -24,7 +24,8 @@ def get_all_inputs(wc=None):
         "results/nanoplot/NanoPlot-report.html",  # nanoplot report
         "results/kmc/out.hist",  # output of kmc rule
         "results/genomescope",  # output directory of genomescope
-#        "results/quast/report.html"
+        "results/quast", # output directory of quast
+        "results/busco", # output directory of busco
     ]
 
     # Get the OATK outputs
@@ -38,14 +39,14 @@ def get_all_inputs(wc=None):
     inputs.extend(oatk_outputs)
 
     inputs.extend(expand(
-            "results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa", sample=samples, hap=hap
+            "results/yahs_{hap}/asm_yahs_scaffolds_final.fa", hap=hap
         ))  # final scaffolded assembly
 
     # If not in diploid mode, add asm.alternate.fa as an input
-    if not diploid_mode:
+    if not diploid_mode and config["include_purge_dups"] == True:
         inputs.append("results/purge_dups_alt/asm.alternate_purged.fa")
 
-#    inputs.extend(expand("results/busco/{file}.fa_busco", file=assembly_files))
+#    inputs.extend(expand("results/busco/{tool}_{hap}_busco", tool=tool, hap=hap))
 
     return inputs
 
@@ -84,13 +85,18 @@ def get_bwa_index_inputs(wildcards):
     else:
         return f"results/hifiasm/asm.{hap}.fa"
 
-#def get_yahs_output():
-#    if diploid_mode:
-#        return [
-#            expand("results/yahs_hap1/asm_yahs_{sample}_scaffolds_final.fa", sample=samples),
-#            expand("results/yahs_hap2/asm_yahs_{sample}_scaffolds_final.fa", sample=samples)
-#        ]
-#    else:
-#        return expand("results/yahs_primary/asm_yahs_{sample}_scaffolds_final.fa", sample=samples)
-
 
+def get_assemblies_QC(wildcards=None):
+    if diploid_mode: 
+        haps = ["hap1", "hap2"]
+    else:
+        haps = ["primary"]
+    results = {}
+    for hap in haps:
+        results[f"asm_{hap}"] = f"results/assemblies/asm_{hap}.fa"
+        results[f"yahs_{hap}"] = f"results/assemblies/yahs_{hap}.fa"
+        if config["include_purge_dups"]:
+            results[f"purged_{hap}"] = f"results/assemblies/purged_{hap}.fa"  
+        if config["include_fcsgx"]:
+            results[f"fcsgx_{hap}"] = f"results/assemblies/fcsgx_{hap}.fa"
+    return results
\ No newline at end of file
diff --git a/workflow/rules/fastp.smk b/workflow/rules/fastp.smk
index 05949f6..c4be9ee 100644
--- a/workflow/rules/fastp.smk
+++ b/workflow/rules/fastp.smk
@@ -5,28 +5,20 @@ import glob
 
 rule fastp:
     input:
-        forward_in=expand(
-            "{hic_path}{sample}_1.fastq.gz",
-            hic_path=config["hic_path"],
-            sample=glob_wildcards(config["hic_path"] + "{sample}_1.fastq.gz").sample,
-        ),
-        reverse_in=expand(
-            "{hic_path}{sample}_2.fastq.gz",
-            hic_path=config["hic_path"],
-            sample=glob_wildcards(config["hic_path"] + "{sample}_2.fastq.gz").sample,
-        ),
+        forward_in=f"{config["hic_path"]}{sample}_1.fastq.gz",
+        reverse_in=f"{config["hic_path"]}{sample}_2.fastq.gz",
     output:
-        forward_out="results/fastp/{sample}_trim_1.fastq.gz",
-        reverse_out="results/fastp/{sample}_trim_2.fastq.gz",
-        json="results/fastp/{sample}_report_fastp.HiC.json",
-        html="results/fastp/{sample}_report_fastp.HiC.html",
+        forward_out="results/fastp/hic_trim_1.fastq.gz",
+        reverse_out="results/fastp/hic_trim_2.fastq.gz",
+        json="results/fastp/hic_report_fastp.HiC.json",
+        html="results/fastp/hic_report_fastp.HiC.html",
     params:
         optional_params=" ".join(
             f"{k} {v}" for k, v in config["fastp"]["optional_params"].items() if v
         ),
     threads: config["fastp"]["t"]
     log:
-        "logs/{sample}_fastp.log",
+        "logs/fastp.log",
     resources:
         mem_mb=config['fastp']['mem_mb'],  # access memory from config
     conda:
diff --git a/workflow/rules/filter_five_end.smk b/workflow/rules/filter_five_end.smk
index 7a893bd..514a78d 100644
--- a/workflow/rules/filter_five_end.smk
+++ b/workflow/rules/filter_five_end.smk
@@ -6,13 +6,13 @@
 
 rule fiter_five_end:
     input:
-        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_1.bam",
-        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_2.bam",
+        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_1.bam",
+        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_2.bam",
     output:
-        bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_1.bam",
-        bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_2.bam",
+        bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_1.bam",
+        bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_2.bam",
     log:
-        "logs/fiter_five_end_{hap}_{sample}.log",
+        "logs/fiter_five_end_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'],  # access memory from config    
     conda:
diff --git a/workflow/rules/hifiasm.smk b/workflow/rules/hifiasm.smk
index 68372b1..1ce0147 100644
--- a/workflow/rules/hifiasm.smk
+++ b/workflow/rules/hifiasm.smk
@@ -10,8 +10,7 @@ rule hifiasm:
         gfa_alt="results/hifiasm/asm.alternate.gfa",
         fasta="results/hifiasm/asm.primary.fa",
         fasta_alt="results/hifiasm/asm.alternate.fa",
-        link_primary = "results/assemblies/asm.primary.fa",
-        link_alternate = "results/assemblies/asm.alternate.fa"
+        link_primary = "results/assemblies/asm_primary.fa",
     threads: config["hifiasm"]["t"]
     log:
         "logs/hifiasm.log",
@@ -34,7 +33,6 @@ rule hifiasm:
         # all the assemblies produced by the workflow will be symlinked to results/assemblies
 
         ln -srn {output.fasta} {output.link_primary}
-        ln -srn {output.fasta_alt} {output.link_alternate}
         """
 
 
@@ -46,8 +44,8 @@ rule hifiasm_het:
         gfa_hap2="results/hifiasm/asm.hap2.gfa",
         fasta_hap1="results/hifiasm/asm.hap1.fa",
         fasta_hap2="results/hifiasm/asm.hap2.fa",
-        link_hap1 = "results/assemblies/asm.hap1.fa",
-        link_hap2 = "results/assemblies/asm.hap2.fa"
+        link_hap1 = "results/assemblies/asm_hap1.fa",
+        link_hap2 = "results/assemblies/asm_hap2.fa"
     threads: config["hifiasm"]["t"]
     log:
         "logs/hifiasm.log",
diff --git a/workflow/rules/picard.smk b/workflow/rules/picard.smk
index abb54d0..2004b4b 100644
--- a/workflow/rules/picard.smk
+++ b/workflow/rules/picard.smk
@@ -2,18 +2,21 @@
 # This pipeline is currently built for processing a single sample with one read1 and read2 FASTQ file.
 # For more info: https://github.com/ArimaGenomics/mapping_pipeline
 # colora doesn't contain the codes to handle technical and biological replicates of hic reads, refer to the original pipeline for that
+# The final files will be in REP_DIR to be consistent with the original pipeline but they will be marked with "final" and not with "rep1"
 
 
 rule picard:
     input:
-        tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/{sample}.bam",
+        tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/hic_vs_contigs_filt_paired.bam",
     output:
-        bam_paired="results/arima_mapping_pipeline_{hap}/PAIR_DIR/{sample}.bam",
-        mark_dup="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam",
-        metrics="results/arima_mapping_pipeline_{hap}/REP_DIR/metrics.{sample}_rep1.txt",
-        stats="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam.stats",
+        bam_add_read_group="results/arima_mapping_pipeline_{hap}/PAIR_DIR/paired_add_read_group.bam",
+        mark_dup="results/arima_mapping_pipeline_{hap}/REP_DIR/paired_mark_dups_final.bam",
+        metrics="results/arima_mapping_pipeline_{hap}/REP_DIR/metrics.final.txt",
+        stats="results/arima_mapping_pipeline_{hap}/REP_DIR/final.bam.stats",
+    params:
+        hic_sample_name=sample
     log:
-        "logs/picard_{hap}_{sample}.log",
+        "logs/picard_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'],  # access memory from config
     conda:
@@ -21,8 +24,8 @@ rule picard:
     shell:
         """
         PICARD=${{CONDA_PREFIX}}/share/picard-*/picard.jar
-        java -Xmx4G -Djava.io.tmpdir=temp/ -jar ${{PICARD}} AddOrReplaceReadGroups INPUT={input.tmp_bam} OUTPUT={output.bam_paired} ID={wildcards.sample} LB={wildcards.sample} SM={wildcards.sample} PL=ILLUMINA PU=none 
-        java -Xmx30G -XX:-UseGCOverheadLimit -Djava.io.tmpdir=temp/ -jar ${{PICARD}} MarkDuplicates INPUT={output.bam_paired} OUTPUT={output.mark_dup} METRICS_FILE={output.metrics} TMP_DIR=results/arima_mapping_pipeline_{wildcards.hap}/TMP_DIR ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=TRUE
+        java -Xmx4G -Djava.io.tmpdir=temp/ -jar ${{PICARD}} AddOrReplaceReadGroups INPUT={input.tmp_bam} OUTPUT={output.bam_add_read_group} ID={params.hic_sample_name} LB={params.hic_sample_name} SM={params.hic_sample_name} PL=ILLUMINA PU=none 
+        java -Xmx30G -XX:-UseGCOverheadLimit -Djava.io.tmpdir=temp/ -jar ${{PICARD}} MarkDuplicates INPUT={output.bam_add_read_group} OUTPUT={output.mark_dup} METRICS_FILE={output.metrics} TMP_DIR=results/arima_mapping_pipeline_{wildcards.hap}/TMP_DIR ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=TRUE
         samtools index {output.mark_dup}
         perl scripts/get_stats.pl {output.mark_dup} > {output.stats}
         """
diff --git a/workflow/rules/purge_dups.smk b/workflow/rules/purge_dups.smk
index 9afd2d5..df6ce50 100644
--- a/workflow/rules/purge_dups.smk
+++ b/workflow/rules/purge_dups.smk
@@ -18,6 +18,7 @@ rule purge_dups:
         hap_fa="results/purge_dups/hap.fa",
         purged_fasta="results/purge_dups/asm.primary_purged.fa",
         hist_plot="results/purge_dups/hist.out.png",
+        link = "results/assemblies/purged_primary.fa",
     threads: config["minimap2"]["t"]
     log:
         "logs/purge_dups.log",
@@ -50,4 +51,6 @@ rule purge_dups:
         mv hap.fa {output.hap_fa}
         mv purged.fa {output.purged_fasta}
         mv hist.out.png {output.hist_plot}
+
+        ln -srn {output.purged_fasta} {output.link}
         """
diff --git a/workflow/rules/quast.smk b/workflow/rules/quast.smk
index 1f4adc3..9d71b5d 100644
--- a/workflow/rules/quast.smk
+++ b/workflow/rules/quast.smk
@@ -2,10 +2,9 @@
 
 rule quast:
     input:
-        completion = checkpoints.yahs_completed.get().completion_marker,
-        assemblies = expand("results/assemblies/{file}.fa", file=assembly_files)
+        unpack(get_assemblies_QC)
     output:
-        "results/quast/report.html",
+        directory("results/quast"),
     threads: config["quast"]["t"]
     log:
         "logs/quast.log",
@@ -17,7 +16,8 @@ rule quast:
         optional_params=" ".join(
             f"{k} {v}" for k, v in config["quast"]["optional_params"].items() if v
         ),
+        labels = ",".join(get_assemblies_QC().keys())
     shell:
-        """ 
-        quast {input.assemblies} -o results/quast  -t {threads} {params.optional_params}
-        """
\ No newline at end of file
+        """
+        quast {input} -l {params.labels} -o {output}  -t {threads} {params.optional_params} >> {log} 2>&1
+        """
diff --git a/workflow/rules/two_read_bam_combiner.smk b/workflow/rules/two_read_bam_combiner.smk
index 518536c..99f85e4 100644
--- a/workflow/rules/two_read_bam_combiner.smk
+++ b/workflow/rules/two_read_bam_combiner.smk
@@ -7,16 +7,16 @@
 
 rule two_read_bam_combiner:
     input:
-        bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_1.bam",
-        bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_2.bam",
+        bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_1.bam",
+        bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_2.bam",
         REF="results/bwa_index_{hap}/asm.fa",
     output:
-        tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/{sample}.bam",
+        tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/hic_vs_contigs_filt_paired.bam",
     threads: config["arima"]["CPU"]
     params:
         MAPQ_FILTER=config["arima"]["MAPQ_FILTER"],
     log:
-        "logs/two_read_bam_combiner_{hap}_{sample}.log",
+        "logs/two_read_bam_combiner_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'],  # access memory from config
     conda:
diff --git a/workflow/rules/yahs.smk b/workflow/rules/yahs.smk
index f73a801..db057e6 100644
--- a/workflow/rules/yahs.smk
+++ b/workflow/rules/yahs.smk
@@ -5,12 +5,12 @@ import glob
 rule yahs:
     input:
         REF="results/bwa_index_{hap}/asm.fa",
-        bam="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam",
+        bam="results/arima_mapping_pipeline_{hap}/REP_DIR/paired_mark_dups_final.bam",
     output:
-        scaffolds="results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa",
-        link="results/assemblies/yahs_{hap}_{sample}.fa",
+        scaffolds="results/yahs_{hap}/asm_yahs_scaffolds_final.fa",
+        link="results/assemblies/yahs_{hap}.fa"
     log:
-        "logs/yahs_{hap}_{sample}.log",
+        "logs/yahs_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'],  # access memory from config
     conda:
@@ -21,6 +21,6 @@ rule yahs:
         ),
     shell:
         """ 
-        yahs {input.REF} {input.bam} -o results/yahs_{wildcards.hap}/asm_yahs_{wildcards.sample} {params.optional_params} >> {log} 2>&1
+        yahs {input.REF} {input.bam} -o results/yahs_{wildcards.hap}/asm_yahs {params.optional_params} >> {log} 2>&1
         ln -srn {output.scaffolds} {output.link}
         """
diff --git a/workflow/rules/yahs_completed.smk b/workflow/rules/yahs_completed.smk
index c3611e3..0d61b69 100644
--- a/workflow/rules/yahs_completed.smk
+++ b/workflow/rules/yahs_completed.smk
@@ -1,12 +1,12 @@
 # add a checkpoint to run the QC after all the assemblies have been symlinked to results/assemblies
 
-checkpoint symlinks:
+checkpoint yahs_done:
     input:
-        expand("results/assemblies/yahs_{hap}_{sample}.fa", sample=samples, hap=hap)
+        get_yahs_output()
     output:
-        completion_marker = "results/yahs_all_done.txt"
+        completion_marker = "results/yahs_completed.txt"
     log:
-        "logs/all_yahs_completed.log",
+        "logs/yahs_completed.log",
     conda:
         "../envs/basic.yaml"
     shell: