From 1f0de0ae07d437d3d0c49fd50470aa8f5c70586b Mon Sep 17 00:00:00 2001 From: LiaOb21 Date: Tue, 26 Mar 2024 16:38:06 +0000 Subject: [PATCH 1/5] added quast and busco --- config/config_test.yaml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/config/config_test.yaml b/config/config_test.yaml index c4f7b1c..402bfd5 100644 --- a/config/config_test.yaml +++ b/config/config_test.yaml @@ -79,7 +79,7 @@ fcsgx: path_to_gx_db: "resources/gx_test_db/test-only" # Set this to False if you want to skip purge_dups steps: -include_purge_dups: True +include_purge_dups: False # Customisable parameters for purge_dups purge_dups: @@ -97,3 +97,19 @@ yahs: # o: "hifiasm_p_purged_yahs" # output prefix #use of this needs evaluation optional_params: "-e": "GATC" # you can specify the restriction enzyme(s) used by the Hi-C experiment + +# Customisable parameters for quast +quast: + t: 4 # number of threads + mem_mb: 1000 # memory in MB + optional_params: + "--fragmented": "" + "--large": "" +# "-r": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa" +# "-g": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.101.gff3" + +# Customisable parameters for busco +busco: + t: 4 # number of threads + mem_mb: 1000 # memory in MB + lineage: "resources/saccharomycetes_odb10" # lineage to be used for busco analysis From f3ece2095cfd6e1e4b4df95fb762fb048c28a95c Mon Sep 17 00:00:00 2001 From: LiaOb21 Date: Tue, 26 Mar 2024 16:40:12 +0000 Subject: [PATCH 2/5] adding quast and busco --- workflow/Snakefile | 14 ++++++++--- workflow/envs/busco.yaml | 6 +++++ workflow/envs/quast.yaml | 6 +++++ workflow/rules/busco.smk | 22 +++++++++++++++++ workflow/rules/bwa_index.smk | 4 --- workflow/rules/common.smk | 21 +++++++++++++--- workflow/rules/get_assemblies.smk | 15 +++++++++++ workflow/rules/hifiasm.smk | 41 ++++++++++++++++++++++--------- workflow/rules/ncbi_fcsgx.smk | 7 ++++-- workflow/rules/oatk.smk | 4 --- workflow/rules/purge_dups.smk | 13 +++++----- workflow/rules/purge_dups2.smk | 4 +++ workflow/rules/quast.smk | 23 +++++++++++++++++ workflow/rules/yahs.smk | 8 +++--- workflow/rules/yahs_completed.smk | 15 +++++++++++ 15 files changed, 162 insertions(+), 41 deletions(-) create mode 100644 workflow/envs/busco.yaml create mode 100644 workflow/envs/quast.yaml create mode 100644 workflow/rules/busco.smk create mode 100644 workflow/rules/get_assemblies.smk create mode 100644 workflow/rules/quast.smk create mode 100644 workflow/rules/yahs_completed.smk diff --git a/workflow/Snakefile b/workflow/Snakefile index c7b3767..3883ed7 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -13,6 +13,11 @@ samples = glob_wildcards(config["hic_path"] + samples_in_pattern).sample # set {diploid_mode} global variable - used to run hifiasm_het rather than hifiasm diploid_mode=config["hifiasm"]["optional_params"]["--h1"] and config["hifiasm"]["optional_params"]["--h2"] +# set {file} in the assemblies directory as global variable +assembly_files = glob_wildcards("results/assemblies/{file}.fa")[0] + +# Include the common rule, where the inputs for rule_all are defined +include: "rules/common.smk" # Include the hifi_prep rule include: "rules/hifi_prep.smk" # Include the nanoplot @@ -45,9 +50,12 @@ include: "rules/two_read_bam_combiner.smk" include: "rules/picard.smk" # Include the yahs rule include: "rules/yahs.smk" -# Include the common rule, where the inputs for rule_all are defined -include: "rules/common.smk" - +# include yahs checkpoint - to start the QC after yahs is done +include: "rules/yahs_completed.smk" +# Include the quast rule +include: "rules/quast.smk" +# Include the busco rule +include: "rules/busco.smk" # Define the rule all, which is the default rule that Snakemake runs when no specific rule or target is specified # input files are defined in the common.smk file diff --git a/workflow/envs/busco.yaml b/workflow/envs/busco.yaml new file mode 100644 index 0000000..1bea8c4 --- /dev/null +++ b/workflow/envs/busco.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - busco=5.7.0 \ No newline at end of file diff --git a/workflow/envs/quast.yaml b/workflow/envs/quast.yaml new file mode 100644 index 0000000..dda2a1f --- /dev/null +++ b/workflow/envs/quast.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - quast=5.2.0 \ No newline at end of file diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk new file mode 100644 index 0000000..4226640 --- /dev/null +++ b/workflow/rules/busco.smk @@ -0,0 +1,22 @@ +# rule busco for assembly QC + +rule busco: + input: + completion = "results/yahs_all_done.txt", + assemblies = "results/assemblies/{file}.fa" + output: + dir = directory("results/busco/{file}.fa_busco"), + params: + lineage=config["busco"]["lineage"], + threads: config["busco"]["t"] + log: + "logs/busco{file}.log" + resources: + mem_mb=config['busco']['mem_mb'], # access memory from config + conda: + "../envs/busco.yaml" + shell: + """ + # run busco on each assembly + busco -i {input.assemblies} -o {output.dir} -m genome -l {params.lineage} -f -c {threads} + """ \ No newline at end of file diff --git a/workflow/rules/bwa_index.smk b/workflow/rules/bwa_index.smk index d264fee..2e03418 100644 --- a/workflow/rules/bwa_index.smk +++ b/workflow/rules/bwa_index.smk @@ -3,10 +3,6 @@ # For more info: https://github.com/ArimaGenomics/mapping_pipeline # colora doesn't contain the codes to handle technical and biological replicates of hic reads, refer to the original pipeline for that - -# include common.smk to use wildcards -include: "common.smk" - rule bwa_index: input: get_bwa_index_inputs diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index f0c09c4..f78457f 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -8,9 +8,9 @@ wildcard_constraints: def get_all_inputs(wc=None): if diploid_mode: - hap= ["hap1", "hap2"] + hap = ["hap1", "hap2"] else: - hap = ["primary"] + hap = ["primary"] if not samples: # Show the user a meaningful error message logger.error(f"No files matching {samples_in_pattern} were found in" @@ -24,6 +24,7 @@ def get_all_inputs(wc=None): "results/nanoplot/NanoPlot-report.html", # nanoplot report "results/kmc/out.hist", # output of kmc rule "results/genomescope", # output directory of genomescope + "results/quast/report.html" ] # Get the OATK outputs @@ -44,6 +45,11 @@ def get_all_inputs(wc=None): if not diploid_mode: inputs.append("results/purge_dups_alt/asm.alternate_purged.fa") + # Extend list of inputs with busco directories + inputs.extend(expand( + "results/busco/{file}.fa_busco", file=assembly_files + )) + return inputs @@ -75,10 +81,17 @@ def get_purge_dups_inputs(): def get_bwa_index_inputs(wildcards): hap = wildcards.hap if config["include_purge_dups"] == True: - return "results/purge_dups/asm.{hap}_purged.fa" + return f"results/purge_dups/asm.{hap}_purged.fa" elif config["include_fcsgx"] == True: return f"results/ncbi_fcsgx_{hap}/asm_clean.fa" else: return f"results/hifiasm/asm.{hap}.fa" - +def get_yahs_output(): + if diploid_mode: + return [ + expand("results/yahs_hap1/asm_yahs_{sample}_scaffolds_final.fa", sample=samples), + expand("results/yahs_hap2/asm_yahs_{sample}_scaffolds_final.fa", sample=samples) + ] + else: + return expand("results/yahs_primary/asm_yahs_{sample}_scaffolds_final.fa", sample=samples) diff --git a/workflow/rules/get_assemblies.smk b/workflow/rules/get_assemblies.smk new file mode 100644 index 0000000..58eb087 --- /dev/null +++ b/workflow/rules/get_assemblies.smk @@ -0,0 +1,15 @@ +include: "common.smk" + +rule get_assemblies: + output: + directory("results/assemblies") + log: + "logs/get_assemblies.log", + resources: + mem_mb=config['arima']['mem_mb'], # access memory from config + conda: + "../envs/basic.yaml" + shell: + """ + mkdir -p {output} + """ \ No newline at end of file diff --git a/workflow/rules/hifiasm.smk b/workflow/rules/hifiasm.smk index d838c06..967f718 100644 --- a/workflow/rules/hifiasm.smk +++ b/workflow/rules/hifiasm.smk @@ -4,12 +4,16 @@ rule hifiasm: input: - "results/reads/hifi/hifi.fastq.gz", + reads = "results/reads/hifi/hifi.fastq.gz", + asm_dir = "results/assemblies" output: gfa="results/hifiasm/asm.primary.gfa", gfa_alt="results/hifiasm/asm.alternate.gfa", fasta="results/hifiasm/asm.primary.fa", fasta_alt="results/hifiasm/asm.alternate.fa", + asm_dir = directory("results/assemblies"), + link_primary = "results/assemblies/asm.primary.fa", + link_alternate = "results/assemblies/asm.alternate.fa" threads: config["hifiasm"]["t"] log: "logs/hifiasm.log", @@ -23,22 +27,30 @@ rule hifiasm: ), shell: """ - hifiasm {input} -t {threads} -o results/hifiasm/asm --primary {params.optional_params} >> {log} 2>&1 + hifiasm {input.reads} -t {threads} -o results/hifiasm/asm --primary {params.optional_params} >> {log} 2>&1 mv results/hifiasm/asm.p_ctg.gfa {output.gfa} mv results/hifiasm/asm.a_ctg.gfa {output.gfa_alt} awk -f scripts/gfa_to_fasta.awk < {output.gfa} > {output.fasta} awk -f scripts/gfa_to_fasta.awk < {output.gfa_alt} > {output.fasta_alt} + + # all the assemblies produced by the workflow will be symlinked to results/assemblies + + ln -srn {output.fasta} {output.link_primary} + ln -srn {output.fasta_alt} {output.link_alternate} """ rule hifiasm_het: input: - "results/reads/hifi/hifi.fastq.gz", + reads = "results/reads/hifi/hifi.fastq.gz", output: - gfa="results/hifiasm/asm.hap1.gfa", - gfa_alt="results/hifiasm/asm.hap2.gfa", - fasta="results/hifiasm/asm.hap1.fa", - fasta_alt="results/hifiasm/asm.hap2.fa", + gfa_hap1="results/hifiasm/asm.hap1.gfa", + gfa_hap2="results/hifiasm/asm.hap2.gfa", + fasta_hap1="results/hifiasm/asm.hap1.fa", + fasta_hap2="results/hifiasm/asm.hap2.fa", + asm_dir = directory("results/assemblies"), + link_hap1 = "results/assemblies/asm.hap1.fa", + link_hap2 = "results/assemblies/asm.hap2.fa" threads: config["hifiasm"]["t"] log: "logs/hifiasm.log", @@ -52,9 +64,14 @@ rule hifiasm_het: ), shell: """ - hifiasm {input} -t {threads} -o results/hifiasm/asm {params.optional_params} >> {log} 2>&1 - mv results/hifiasm/asm.hic.hap1.p_ctg.gfa {output.gfa} - mv results/hifiasm/asm.hic.hap2.p_ctg.gfa {output.gfa_alt} - awk -f scripts/gfa_to_fasta.awk < {output.gfa} > {output.fasta} - awk -f scripts/gfa_to_fasta.awk < {output.gfa_alt} > {output.fasta_alt} + hifiasm {input.reads} -t {threads} -o results/hifiasm/asm {params.optional_params} >> {log} 2>&1 + mv results/hifiasm/asm.hic.hap1.p_ctg.gfa {output.gfa_hap1} + mv results/hifiasm/asm.hic.hap2.p_ctg.gfa {output.gfa_hap2} + awk -f scripts/gfa_to_fasta.awk < {output.gfa_hap1} > {output.fasta_hap1} + awk -f scripts/gfa_to_fasta.awk < {output.gfa_hap2} > {output.fasta_hap2} + + # all the assemblies produced by the workflow will be symlinked to results/assemblies + + ln -srn {output.fasta_hap1} {output.link_hap1} + ln -srn {output.fasta_hap2} {output.link_hap2} """ diff --git a/workflow/rules/ncbi_fcsgx.smk b/workflow/rules/ncbi_fcsgx.smk index 6d9e203..2f95656 100644 --- a/workflow/rules/ncbi_fcsgx.smk +++ b/workflow/rules/ncbi_fcsgx.smk @@ -3,13 +3,15 @@ rule fcsgx: input: - fasta="results/hifiasm/asm.{hap}.fa", + fasta = "results/hifiasm/asm.{hap}.fa", + asm_dir ="results/assemblies" params: ncbi_tax_id=config["fcsgx"]["ncbi_tax_id"], path_to_gx_db=config["fcsgx"]["path_to_gx_db"], output: clean_fasta="results/ncbi_fcsgx_{hap}/asm_clean.fa", - contaminants="results/ncbi_fcsgx_{hap}/contaminants.fa" + contaminants="results/ncbi_fcsgx_{hap}/contaminants.fa", + link="results/assemblies/fcsgx_{hap}.fa" threads: config["hifiasm"]["t"] log: "logs/fcsgx_{hap}.log", @@ -21,4 +23,5 @@ rule fcsgx: """ run_gx.py --fasta {input.fasta} --out-dir results/ncbi_fcsgx_{wildcards.hap}/out --gx-db {params.path_to_gx_db} --tax-id {params.ncbi_tax_id} --debug >> {log} 2>&1 cat {input.fasta} | gx clean-genome --action-report results/ncbi_fcsgx_{wildcards.hap}/out/asm.{wildcards.hap}.{params.ncbi_tax_id}.fcs_gx_report.txt --output {output.clean_fasta} --contam-fasta-out {output.contaminants} >> {log} 2>&1 + ln -srn {output.clean_fasta} {output.link} """ diff --git a/workflow/rules/oatk.smk b/workflow/rules/oatk.smk index 843eb89..5be3626 100644 --- a/workflow/rules/oatk.smk +++ b/workflow/rules/oatk.smk @@ -1,9 +1,5 @@ # This rule runs oatk to extract organelles reads from the hifi reads. - -# include common.smk to use get_oatk_outputs function # get_oatk_outputs is used to dynamically decide the outputs of oatk -include: "common.smk" - rule oatk: input: diff --git a/workflow/rules/purge_dups.smk b/workflow/rules/purge_dups.smk index bb40a8a..5eb8c0e 100644 --- a/workflow/rules/purge_dups.smk +++ b/workflow/rules/purge_dups.smk @@ -1,14 +1,10 @@ # This rule uses purge_dups purge haplotigs and overlaps from the primary assembly produced by hifiasm - -# include common.smk to use get_purge_dups_inputs -include: "common.smk" - - rule purge_dups: input: - reads="results/reads/hifi/hifi.fastq.gz", - fasta=get_purge_dups_inputs(), + reads = "results/reads/hifi/hifi.fastq.gz", + fasta = get_purge_dups_inputs(), + asm_dir = "results/assemblies" output: paf="results/purge_dups/hifi_vs_primary_contigs.paf.gz", calcuts_log="results/purge_dups/calcuts.log", @@ -23,6 +19,7 @@ rule purge_dups: hap_fa="results/purge_dups/hap.fa", purged_fasta="results/purge_dups/asm.primary_purged.fa", hist_plot="results/purge_dups/hist.out.png", + link="results/assemblies/purged_primary.fa" threads: config["minimap2"]["t"] log: "logs/purge_dups.log", @@ -55,4 +52,6 @@ rule purge_dups: mv hap.fa {output.hap_fa} mv purged.fa {output.purged_fasta} mv hist.out.png {output.hist_plot} + + ln -srn {output.purged_fasta} {output.link} """ diff --git a/workflow/rules/purge_dups2.smk b/workflow/rules/purge_dups2.smk index f8461d9..bc420c4 100644 --- a/workflow/rules/purge_dups2.smk +++ b/workflow/rules/purge_dups2.smk @@ -6,6 +6,7 @@ rule purge_dups_alt: reads="results/reads/hifi/hifi.fastq.gz", fasta="results/hifiasm/asm.alternate.fa", hap_fa_in="results/purge_dups/hap.fa", + asm_dir = "results/assemblies" output: merged_fasta="results/purge_dups_alt/merged.fa", paf="results/purge_dups_alt/hifi_vs_alternate_contigs.paf.gz", @@ -21,6 +22,7 @@ rule purge_dups_alt: hap_fa="results/purge_dups_alt/hap.fa", purged_fasta="results/purge_dups_alt/asm.alternate_purged.fa", hist_plot="results/purge_dups_alt/hist.out.png", + link="results/assemblies/purged_alternate.fa" threads: config["minimap2"]["t"] log: "logs/purge_dups_alt.log", @@ -55,4 +57,6 @@ rule purge_dups_alt: mv hap.fa {output.hap_fa} mv purged.fa {output.purged_fasta} mv hist.out.png {output.hist_plot} + + ln -srn {output.purged_fasta} {output.link} """ diff --git a/workflow/rules/quast.smk b/workflow/rules/quast.smk new file mode 100644 index 0000000..f80942d --- /dev/null +++ b/workflow/rules/quast.smk @@ -0,0 +1,23 @@ +# rule quast for assembly QC + +rule quast: + input: + completion = "results/yahs_all_done.txt", + assemblies = expand("results/assemblies/{file}.fa", file=assembly_files) + output: + "results/quast/report.html", + threads: config["quast"]["t"] + log: + "logs/quast.log", + resources: + mem_mb=config['quast']['mem_mb'], # access memory from config + conda: + "../envs/quast.yaml" + params: + optional_params=" ".join( + f"{k} {v}" for k, v in config["quast"]["optional_params"].items() if v + ), + shell: + """ + quast {input.assemblies} -o results/quast -t {threads} {params.optional_params} + """ \ No newline at end of file diff --git a/workflow/rules/yahs.smk b/workflow/rules/yahs.smk index bca1607..2b45e85 100644 --- a/workflow/rules/yahs.smk +++ b/workflow/rules/yahs.smk @@ -2,17 +2,14 @@ import glob - -# include common.smk to use get_bwa_index_inputs -include: "common.smk" - - rule yahs: input: REF="results/bwa_index_{hap}/asm.fa", bam="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam", + asm_dir = "results/assemblies" output: scaffolds="results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa", + link="results/assemblies/yahs_{hap}_{sample}.fa", log: "logs/yahs_{hap}_{sample}.log", resources: @@ -26,4 +23,5 @@ rule yahs: shell: """ yahs {input.REF} {input.bam} -o results/yahs_{wildcards.hap}/asm_yahs_{wildcards.sample} {params.optional_params} >> {log} 2>&1 + ln -srn {output.scaffolds} {output.link} """ diff --git a/workflow/rules/yahs_completed.smk b/workflow/rules/yahs_completed.smk new file mode 100644 index 0000000..56c503b --- /dev/null +++ b/workflow/rules/yahs_completed.smk @@ -0,0 +1,15 @@ +# add a checkpoint to run the QC after all the assemblies have been symlinked to results/assemblies + +rule all_yahs_completed: + input: + get_yahs_output() + output: + "results/yahs_all_done.txt" + log: + "logs/all_yahs_completed.log", + conda: + "../envs/basic.yaml" + shell: + """ + ls {input} > {output} + """ \ No newline at end of file From 3bcaf2bad3bb3cf3c6ac90df1db0ab8a0bc1985d Mon Sep 17 00:00:00 2001 From: LiaOb21 Date: Wed, 27 Mar 2024 11:58:08 +0000 Subject: [PATCH 3/5] adding quast and busco --- config/config_test.yaml | 6 +++--- workflow/Snakefile | 8 ++++---- workflow/rules/busco.smk | 7 ++++--- workflow/rules/common.smk | 27 +++++++++++++-------------- workflow/rules/hifiasm.smk | 3 --- workflow/rules/ncbi_fcsgx.smk | 1 - workflow/rules/purge_dups.smk | 4 ---- workflow/rules/purge_dups2.smk | 4 ---- workflow/rules/quast.smk | 2 +- workflow/rules/yahs.smk | 1 - workflow/rules/yahs_completed.smk | 8 ++++---- 11 files changed, 29 insertions(+), 42 deletions(-) diff --git a/config/config_test.yaml b/config/config_test.yaml index 402bfd5..55e8ce6 100644 --- a/config/config_test.yaml +++ b/config/config_test.yaml @@ -66,11 +66,11 @@ hifiasm: "-f": "0" # used for small datasets "-l": "" # purge level. 0: no purging; 1: light; 2/3: aggressive [0 for trio; 3 for unzip] "--ul": "" # use this if you have also ont data you want to integrate in your assembly - "--h1": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_1 data if you want to produce a phased assembly - "--h2": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_2 data if you want to produce a phased assembly +# "--h1": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_1 data if you want to produce a phased assembly +# "--h2": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_2 data if you want to produce a phased assembly #Set this to False if you want to skip the fcsgx step: -include_fcsgx: True #inlcude this rule only if you have preiously downloaded the database (recommended to run fcsgx only on a HPC. It requires around 500 GB of space on your disk and a large RAM) +include_fcsgx: False #inlcude this rule only if you have preiously downloaded the database (recommended to run fcsgx only on a HPC. It requires around 500 GB of space on your disk and a large RAM) # Customisable parameters for fcsgx fcsgx: diff --git a/workflow/Snakefile b/workflow/Snakefile index 3883ed7..ce40eb9 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -14,7 +14,7 @@ samples = glob_wildcards(config["hic_path"] + samples_in_pattern).sample diploid_mode=config["hifiasm"]["optional_params"]["--h1"] and config["hifiasm"]["optional_params"]["--h2"] # set {file} in the assemblies directory as global variable -assembly_files = glob_wildcards("results/assemblies/{file}.fa")[0] +# assembly_files = glob_wildcards("results/assemblies/{file}.fa") # Include the common rule, where the inputs for rule_all are defined include: "rules/common.smk" @@ -51,11 +51,11 @@ include: "rules/picard.smk" # Include the yahs rule include: "rules/yahs.smk" # include yahs checkpoint - to start the QC after yahs is done -include: "rules/yahs_completed.smk" +#include: "rules/yahs_completed.smk" # Include the quast rule -include: "rules/quast.smk" +#include: "rules/quast.smk" # Include the busco rule -include: "rules/busco.smk" +#include: "rules/busco.smk" # Define the rule all, which is the default rule that Snakemake runs when no specific rule or target is specified # input files are defined in the common.smk file diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk index 4226640..82b85da 100644 --- a/workflow/rules/busco.smk +++ b/workflow/rules/busco.smk @@ -2,7 +2,7 @@ rule busco: input: - completion = "results/yahs_all_done.txt", + completion = checkpoints.yahs_completed.get().completion_marker, assemblies = "results/assemblies/{file}.fa" output: dir = directory("results/busco/{file}.fa_busco"), @@ -10,7 +10,7 @@ rule busco: lineage=config["busco"]["lineage"], threads: config["busco"]["t"] log: - "logs/busco{file}.log" + "logs/busco_{file}.log" resources: mem_mb=config['busco']['mem_mb'], # access memory from config conda: @@ -18,5 +18,6 @@ rule busco: shell: """ # run busco on each assembly - busco -i {input.assemblies} -o {output.dir} -m genome -l {params.lineage} -f -c {threads} + #assembly_name=$(basename {input.assemblies}) + busco -i {input.assemblies} -o {output.dir}/{wildcards.file}_busco -m genome -l {params.lineage} -f -c {threads} """ \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index f78457f..0953511 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -24,7 +24,7 @@ def get_all_inputs(wc=None): "results/nanoplot/NanoPlot-report.html", # nanoplot report "results/kmc/out.hist", # output of kmc rule "results/genomescope", # output directory of genomescope - "results/quast/report.html" +# "results/quast/report.html" ] # Get the OATK outputs @@ -45,10 +45,7 @@ def get_all_inputs(wc=None): if not diploid_mode: inputs.append("results/purge_dups_alt/asm.alternate_purged.fa") - # Extend list of inputs with busco directories - inputs.extend(expand( - "results/busco/{file}.fa_busco", file=assembly_files - )) +# inputs.extend(expand("results/busco/{file}.fa_busco", file=assembly_files)) return inputs @@ -81,17 +78,19 @@ def get_purge_dups_inputs(): def get_bwa_index_inputs(wildcards): hap = wildcards.hap if config["include_purge_dups"] == True: - return f"results/purge_dups/asm.{hap}_purged.fa" + return "results/purge_dups/asm.{hap}_purged.fa" elif config["include_fcsgx"] == True: return f"results/ncbi_fcsgx_{hap}/asm_clean.fa" else: return f"results/hifiasm/asm.{hap}.fa" -def get_yahs_output(): - if diploid_mode: - return [ - expand("results/yahs_hap1/asm_yahs_{sample}_scaffolds_final.fa", sample=samples), - expand("results/yahs_hap2/asm_yahs_{sample}_scaffolds_final.fa", sample=samples) - ] - else: - return expand("results/yahs_primary/asm_yahs_{sample}_scaffolds_final.fa", sample=samples) +#def get_yahs_output(): +# if diploid_mode: +# return [ +# expand("results/yahs_hap1/asm_yahs_{sample}_scaffolds_final.fa", sample=samples), +# expand("results/yahs_hap2/asm_yahs_{sample}_scaffolds_final.fa", sample=samples) +# ] +# else: +# return expand("results/yahs_primary/asm_yahs_{sample}_scaffolds_final.fa", sample=samples) + + diff --git a/workflow/rules/hifiasm.smk b/workflow/rules/hifiasm.smk index 967f718..68372b1 100644 --- a/workflow/rules/hifiasm.smk +++ b/workflow/rules/hifiasm.smk @@ -5,13 +5,11 @@ rule hifiasm: input: reads = "results/reads/hifi/hifi.fastq.gz", - asm_dir = "results/assemblies" output: gfa="results/hifiasm/asm.primary.gfa", gfa_alt="results/hifiasm/asm.alternate.gfa", fasta="results/hifiasm/asm.primary.fa", fasta_alt="results/hifiasm/asm.alternate.fa", - asm_dir = directory("results/assemblies"), link_primary = "results/assemblies/asm.primary.fa", link_alternate = "results/assemblies/asm.alternate.fa" threads: config["hifiasm"]["t"] @@ -48,7 +46,6 @@ rule hifiasm_het: gfa_hap2="results/hifiasm/asm.hap2.gfa", fasta_hap1="results/hifiasm/asm.hap1.fa", fasta_hap2="results/hifiasm/asm.hap2.fa", - asm_dir = directory("results/assemblies"), link_hap1 = "results/assemblies/asm.hap1.fa", link_hap2 = "results/assemblies/asm.hap2.fa" threads: config["hifiasm"]["t"] diff --git a/workflow/rules/ncbi_fcsgx.smk b/workflow/rules/ncbi_fcsgx.smk index 2f95656..14b5478 100644 --- a/workflow/rules/ncbi_fcsgx.smk +++ b/workflow/rules/ncbi_fcsgx.smk @@ -4,7 +4,6 @@ rule fcsgx: input: fasta = "results/hifiasm/asm.{hap}.fa", - asm_dir ="results/assemblies" params: ncbi_tax_id=config["fcsgx"]["ncbi_tax_id"], path_to_gx_db=config["fcsgx"]["path_to_gx_db"], diff --git a/workflow/rules/purge_dups.smk b/workflow/rules/purge_dups.smk index 5eb8c0e..9afd2d5 100644 --- a/workflow/rules/purge_dups.smk +++ b/workflow/rules/purge_dups.smk @@ -4,7 +4,6 @@ rule purge_dups: input: reads = "results/reads/hifi/hifi.fastq.gz", fasta = get_purge_dups_inputs(), - asm_dir = "results/assemblies" output: paf="results/purge_dups/hifi_vs_primary_contigs.paf.gz", calcuts_log="results/purge_dups/calcuts.log", @@ -19,7 +18,6 @@ rule purge_dups: hap_fa="results/purge_dups/hap.fa", purged_fasta="results/purge_dups/asm.primary_purged.fa", hist_plot="results/purge_dups/hist.out.png", - link="results/assemblies/purged_primary.fa" threads: config["minimap2"]["t"] log: "logs/purge_dups.log", @@ -52,6 +50,4 @@ rule purge_dups: mv hap.fa {output.hap_fa} mv purged.fa {output.purged_fasta} mv hist.out.png {output.hist_plot} - - ln -srn {output.purged_fasta} {output.link} """ diff --git a/workflow/rules/purge_dups2.smk b/workflow/rules/purge_dups2.smk index bc420c4..f8461d9 100644 --- a/workflow/rules/purge_dups2.smk +++ b/workflow/rules/purge_dups2.smk @@ -6,7 +6,6 @@ rule purge_dups_alt: reads="results/reads/hifi/hifi.fastq.gz", fasta="results/hifiasm/asm.alternate.fa", hap_fa_in="results/purge_dups/hap.fa", - asm_dir = "results/assemblies" output: merged_fasta="results/purge_dups_alt/merged.fa", paf="results/purge_dups_alt/hifi_vs_alternate_contigs.paf.gz", @@ -22,7 +21,6 @@ rule purge_dups_alt: hap_fa="results/purge_dups_alt/hap.fa", purged_fasta="results/purge_dups_alt/asm.alternate_purged.fa", hist_plot="results/purge_dups_alt/hist.out.png", - link="results/assemblies/purged_alternate.fa" threads: config["minimap2"]["t"] log: "logs/purge_dups_alt.log", @@ -57,6 +55,4 @@ rule purge_dups_alt: mv hap.fa {output.hap_fa} mv purged.fa {output.purged_fasta} mv hist.out.png {output.hist_plot} - - ln -srn {output.purged_fasta} {output.link} """ diff --git a/workflow/rules/quast.smk b/workflow/rules/quast.smk index f80942d..1f4adc3 100644 --- a/workflow/rules/quast.smk +++ b/workflow/rules/quast.smk @@ -2,7 +2,7 @@ rule quast: input: - completion = "results/yahs_all_done.txt", + completion = checkpoints.yahs_completed.get().completion_marker, assemblies = expand("results/assemblies/{file}.fa", file=assembly_files) output: "results/quast/report.html", diff --git a/workflow/rules/yahs.smk b/workflow/rules/yahs.smk index 2b45e85..f73a801 100644 --- a/workflow/rules/yahs.smk +++ b/workflow/rules/yahs.smk @@ -6,7 +6,6 @@ rule yahs: input: REF="results/bwa_index_{hap}/asm.fa", bam="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam", - asm_dir = "results/assemblies" output: scaffolds="results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa", link="results/assemblies/yahs_{hap}_{sample}.fa", diff --git a/workflow/rules/yahs_completed.smk b/workflow/rules/yahs_completed.smk index 56c503b..c3611e3 100644 --- a/workflow/rules/yahs_completed.smk +++ b/workflow/rules/yahs_completed.smk @@ -1,15 +1,15 @@ # add a checkpoint to run the QC after all the assemblies have been symlinked to results/assemblies -rule all_yahs_completed: +checkpoint symlinks: input: - get_yahs_output() + expand("results/assemblies/yahs_{hap}_{sample}.fa", sample=samples, hap=hap) output: - "results/yahs_all_done.txt" + completion_marker = "results/yahs_all_done.txt" log: "logs/all_yahs_completed.log", conda: "../envs/basic.yaml" shell: """ - ls {input} > {output} + ls {input} > {output.completion_marker} """ \ No newline at end of file From 532d0a49986194705dab3c55c4edbb821a74736f Mon Sep 17 00:00:00 2001 From: LiaOb21 Date: Thu, 28 Mar 2024 18:30:23 +0000 Subject: [PATCH 4/5] updating config format --- config/config.yaml | 17 +++++++++++++++++ config/config_test.yaml | 6 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 3061040..3e8a1cb 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -99,3 +99,20 @@ yahs: # o: "hifiasm_p_purged_yahs" # output prefix use of this needs evaluation optional_params: "-e": "" # you can specify the restriction enzyme(s) used by the Hi-C experiment + + +# Customisable parameters for quast +quast: + t: 4 # number of threads + mem_mb: 1000 # memory in MB + optional_params: + "--fragmented": "" + "--large": "" +# "-r": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa" +# "-g": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.101.gff3" + +# Customisable parameters for busco +busco: + t: 4 # number of threads + mem_mb: 1000 # memory in MB + lineage: "resources/saccharomycetes_odb10" # lineage to be used for busco analysis diff --git a/config/config_test.yaml b/config/config_test.yaml index 55e8ce6..402bfd5 100644 --- a/config/config_test.yaml +++ b/config/config_test.yaml @@ -66,11 +66,11 @@ hifiasm: "-f": "0" # used for small datasets "-l": "" # purge level. 0: no purging; 1: light; 2/3: aggressive [0 for trio; 3 for unzip] "--ul": "" # use this if you have also ont data you want to integrate in your assembly -# "--h1": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_1 data if you want to produce a phased assembly -# "--h2": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_2 data if you want to produce a phased assembly + "--h1": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_1 data if you want to produce a phased assembly + "--h2": "resources/raw_hic/sorted_hic_test_YBP2_SRR27548905_PRJNA1013711_1.fastq.gz" # here you must set the path to your raw Hi-C_2 data if you want to produce a phased assembly #Set this to False if you want to skip the fcsgx step: -include_fcsgx: False #inlcude this rule only if you have preiously downloaded the database (recommended to run fcsgx only on a HPC. It requires around 500 GB of space on your disk and a large RAM) +include_fcsgx: True #inlcude this rule only if you have preiously downloaded the database (recommended to run fcsgx only on a HPC. It requires around 500 GB of space on your disk and a large RAM) # Customisable parameters for fcsgx fcsgx: From 2774a835e66ed97697373a4abc2f9e964c9da51a Mon Sep 17 00:00:00 2001 From: LiaOb21 Date: Thu, 28 Mar 2024 18:31:12 +0000 Subject: [PATCH 5/5] adding busco and quast --- workflow/Snakefile | 10 ++----- workflow/rules/busco.smk | 19 +++++++----- workflow/rules/bwa_mem.smk | 10 +++---- workflow/rules/common.smk | 38 ++++++++++++++---------- workflow/rules/fastp.smk | 22 +++++--------- workflow/rules/filter_five_end.smk | 10 +++---- workflow/rules/hifiasm.smk | 8 ++--- workflow/rules/picard.smk | 19 +++++++----- workflow/rules/purge_dups.smk | 3 ++ workflow/rules/quast.smk | 12 ++++---- workflow/rules/two_read_bam_combiner.smk | 8 ++--- workflow/rules/yahs.smk | 10 +++---- workflow/rules/yahs_completed.smk | 8 ++--- 13 files changed, 90 insertions(+), 87 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index ce40eb9..29f5f12 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -8,13 +8,11 @@ configfile: "config/config.yaml" # set {samples} global variable - used to read Hi-C files samples_in_pattern = "{sample}_1.fastq.gz" -samples = glob_wildcards(config["hic_path"] + samples_in_pattern).sample +sample, = glob_wildcards(config["hic_path"] + samples_in_pattern).sample # set {diploid_mode} global variable - used to run hifiasm_het rather than hifiasm diploid_mode=config["hifiasm"]["optional_params"]["--h1"] and config["hifiasm"]["optional_params"]["--h2"] -# set {file} in the assemblies directory as global variable -# assembly_files = glob_wildcards("results/assemblies/{file}.fa") # Include the common rule, where the inputs for rule_all are defined include: "rules/common.smk" @@ -50,12 +48,10 @@ include: "rules/two_read_bam_combiner.smk" include: "rules/picard.smk" # Include the yahs rule include: "rules/yahs.smk" -# include yahs checkpoint - to start the QC after yahs is done -#include: "rules/yahs_completed.smk" # Include the quast rule -#include: "rules/quast.smk" +include: "rules/quast.smk" # Include the busco rule -#include: "rules/busco.smk" +include: "rules/busco.smk" # Define the rule all, which is the default rule that Snakemake runs when no specific rule or target is specified # input files are defined in the common.smk file diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk index 82b85da..ee83669 100644 --- a/workflow/rules/busco.smk +++ b/workflow/rules/busco.smk @@ -2,22 +2,27 @@ rule busco: input: - completion = checkpoints.yahs_completed.get().completion_marker, - assemblies = "results/assemblies/{file}.fa" + unpack(get_assemblies_QC) output: - dir = directory("results/busco/{file}.fa_busco"), + dir = directory("results/busco"), params: lineage=config["busco"]["lineage"], + labels = list(get_assemblies_QC().keys()) threads: config["busco"]["t"] log: - "logs/busco_{file}.log" + "logs/busco.log" resources: mem_mb=config['busco']['mem_mb'], # access memory from config conda: "../envs/busco.yaml" shell: """ - # run busco on each assembly - #assembly_name=$(basename {input.assemblies}) - busco -i {input.assemblies} -o {output.dir}/{wildcards.file}_busco -m genome -l {params.lineage} -f -c {threads} + mkdir -p {output} + labels=({params.labels}) + count=0 + for v in {input} ; do + k=${{labels[$count]}} + busco -i $v -o {output}/$k -m genome -l {params.lineage} -f -c {threads} >> {log} 2>&1 + count=$(( $count + 1 )) + done """ \ No newline at end of file diff --git a/workflow/rules/bwa_mem.smk b/workflow/rules/bwa_mem.smk index 87d45e0..ae02479 100644 --- a/workflow/rules/bwa_mem.smk +++ b/workflow/rules/bwa_mem.smk @@ -8,14 +8,14 @@ rule bwa_mem: input: REF="results/bwa_index_{hap}/asm.fa", - forward_hic="results/fastp/{sample}_trim_1.fastq.gz", - reverse_hic="results/fastp/{sample}_trim_2.fastq.gz", + forward_hic="results/fastp/hic_trim_1.fastq.gz", + reverse_hic="results/fastp/hic_trim_2.fastq.gz", output: - bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_1.bam", - bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_2.bam", + bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_1.bam", + bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_2.bam", threads: config["arima"]["CPU"] log: - "logs/bwa_mem_{hap}_{sample}.log", + "logs/bwa_mem_{hap}.log", resources: mem_mb=config['arima']['mem_mb'], # access memory from config conda: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 0953511..462de5d 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -4,14 +4,14 @@ import os wildcard_constraints: - hap = "primary|hap1|hap2" + hap = "primary|hap1|hap2", def get_all_inputs(wc=None): if diploid_mode: hap = ["hap1", "hap2"] else: - hap = ["primary"] - if not samples: + hap = ["primary"] + if not sample: # Show the user a meaningful error message logger.error(f"No files matching {samples_in_pattern} were found in" f" {config.get('hic_path')}. Please check your config file.") @@ -24,7 +24,8 @@ def get_all_inputs(wc=None): "results/nanoplot/NanoPlot-report.html", # nanoplot report "results/kmc/out.hist", # output of kmc rule "results/genomescope", # output directory of genomescope -# "results/quast/report.html" + "results/quast", # output directory of quast + "results/busco", # output directory of busco ] # Get the OATK outputs @@ -38,14 +39,14 @@ def get_all_inputs(wc=None): inputs.extend(oatk_outputs) inputs.extend(expand( - "results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa", sample=samples, hap=hap + "results/yahs_{hap}/asm_yahs_scaffolds_final.fa", hap=hap )) # final scaffolded assembly # If not in diploid mode, add asm.alternate.fa as an input - if not diploid_mode: + if not diploid_mode and config["include_purge_dups"] == True: inputs.append("results/purge_dups_alt/asm.alternate_purged.fa") -# inputs.extend(expand("results/busco/{file}.fa_busco", file=assembly_files)) +# inputs.extend(expand("results/busco/{tool}_{hap}_busco", tool=tool, hap=hap)) return inputs @@ -84,13 +85,18 @@ def get_bwa_index_inputs(wildcards): else: return f"results/hifiasm/asm.{hap}.fa" -#def get_yahs_output(): -# if diploid_mode: -# return [ -# expand("results/yahs_hap1/asm_yahs_{sample}_scaffolds_final.fa", sample=samples), -# expand("results/yahs_hap2/asm_yahs_{sample}_scaffolds_final.fa", sample=samples) -# ] -# else: -# return expand("results/yahs_primary/asm_yahs_{sample}_scaffolds_final.fa", sample=samples) - +def get_assemblies_QC(wildcards=None): + if diploid_mode: + haps = ["hap1", "hap2"] + else: + haps = ["primary"] + results = {} + for hap in haps: + results[f"asm_{hap}"] = f"results/assemblies/asm_{hap}.fa" + results[f"yahs_{hap}"] = f"results/assemblies/yahs_{hap}.fa" + if config["include_purge_dups"]: + results[f"purged_{hap}"] = f"results/assemblies/purged_{hap}.fa" + if config["include_fcsgx"]: + results[f"fcsgx_{hap}"] = f"results/assemblies/fcsgx_{hap}.fa" + return results \ No newline at end of file diff --git a/workflow/rules/fastp.smk b/workflow/rules/fastp.smk index 05949f6..c4be9ee 100644 --- a/workflow/rules/fastp.smk +++ b/workflow/rules/fastp.smk @@ -5,28 +5,20 @@ import glob rule fastp: input: - forward_in=expand( - "{hic_path}{sample}_1.fastq.gz", - hic_path=config["hic_path"], - sample=glob_wildcards(config["hic_path"] + "{sample}_1.fastq.gz").sample, - ), - reverse_in=expand( - "{hic_path}{sample}_2.fastq.gz", - hic_path=config["hic_path"], - sample=glob_wildcards(config["hic_path"] + "{sample}_2.fastq.gz").sample, - ), + forward_in=f"{config["hic_path"]}{sample}_1.fastq.gz", + reverse_in=f"{config["hic_path"]}{sample}_2.fastq.gz", output: - forward_out="results/fastp/{sample}_trim_1.fastq.gz", - reverse_out="results/fastp/{sample}_trim_2.fastq.gz", - json="results/fastp/{sample}_report_fastp.HiC.json", - html="results/fastp/{sample}_report_fastp.HiC.html", + forward_out="results/fastp/hic_trim_1.fastq.gz", + reverse_out="results/fastp/hic_trim_2.fastq.gz", + json="results/fastp/hic_report_fastp.HiC.json", + html="results/fastp/hic_report_fastp.HiC.html", params: optional_params=" ".join( f"{k} {v}" for k, v in config["fastp"]["optional_params"].items() if v ), threads: config["fastp"]["t"] log: - "logs/{sample}_fastp.log", + "logs/fastp.log", resources: mem_mb=config['fastp']['mem_mb'], # access memory from config conda: diff --git a/workflow/rules/filter_five_end.smk b/workflow/rules/filter_five_end.smk index 7a893bd..514a78d 100644 --- a/workflow/rules/filter_five_end.smk +++ b/workflow/rules/filter_five_end.smk @@ -6,13 +6,13 @@ rule fiter_five_end: input: - bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_1.bam", - bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_2.bam", + bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_1.bam", + bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_2.bam", output: - bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_1.bam", - bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_2.bam", + bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_1.bam", + bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_2.bam", log: - "logs/fiter_five_end_{hap}_{sample}.log", + "logs/fiter_five_end_{hap}.log", resources: mem_mb=config['arima']['mem_mb'], # access memory from config conda: diff --git a/workflow/rules/hifiasm.smk b/workflow/rules/hifiasm.smk index 68372b1..1ce0147 100644 --- a/workflow/rules/hifiasm.smk +++ b/workflow/rules/hifiasm.smk @@ -10,8 +10,7 @@ rule hifiasm: gfa_alt="results/hifiasm/asm.alternate.gfa", fasta="results/hifiasm/asm.primary.fa", fasta_alt="results/hifiasm/asm.alternate.fa", - link_primary = "results/assemblies/asm.primary.fa", - link_alternate = "results/assemblies/asm.alternate.fa" + link_primary = "results/assemblies/asm_primary.fa", threads: config["hifiasm"]["t"] log: "logs/hifiasm.log", @@ -34,7 +33,6 @@ rule hifiasm: # all the assemblies produced by the workflow will be symlinked to results/assemblies ln -srn {output.fasta} {output.link_primary} - ln -srn {output.fasta_alt} {output.link_alternate} """ @@ -46,8 +44,8 @@ rule hifiasm_het: gfa_hap2="results/hifiasm/asm.hap2.gfa", fasta_hap1="results/hifiasm/asm.hap1.fa", fasta_hap2="results/hifiasm/asm.hap2.fa", - link_hap1 = "results/assemblies/asm.hap1.fa", - link_hap2 = "results/assemblies/asm.hap2.fa" + link_hap1 = "results/assemblies/asm_hap1.fa", + link_hap2 = "results/assemblies/asm_hap2.fa" threads: config["hifiasm"]["t"] log: "logs/hifiasm.log", diff --git a/workflow/rules/picard.smk b/workflow/rules/picard.smk index abb54d0..2004b4b 100644 --- a/workflow/rules/picard.smk +++ b/workflow/rules/picard.smk @@ -2,18 +2,21 @@ # This pipeline is currently built for processing a single sample with one read1 and read2 FASTQ file. # For more info: https://github.com/ArimaGenomics/mapping_pipeline # colora doesn't contain the codes to handle technical and biological replicates of hic reads, refer to the original pipeline for that +# The final files will be in REP_DIR to be consistent with the original pipeline but they will be marked with "final" and not with "rep1" rule picard: input: - tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/{sample}.bam", + tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/hic_vs_contigs_filt_paired.bam", output: - bam_paired="results/arima_mapping_pipeline_{hap}/PAIR_DIR/{sample}.bam", - mark_dup="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam", - metrics="results/arima_mapping_pipeline_{hap}/REP_DIR/metrics.{sample}_rep1.txt", - stats="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam.stats", + bam_add_read_group="results/arima_mapping_pipeline_{hap}/PAIR_DIR/paired_add_read_group.bam", + mark_dup="results/arima_mapping_pipeline_{hap}/REP_DIR/paired_mark_dups_final.bam", + metrics="results/arima_mapping_pipeline_{hap}/REP_DIR/metrics.final.txt", + stats="results/arima_mapping_pipeline_{hap}/REP_DIR/final.bam.stats", + params: + hic_sample_name=sample log: - "logs/picard_{hap}_{sample}.log", + "logs/picard_{hap}.log", resources: mem_mb=config['arima']['mem_mb'], # access memory from config conda: @@ -21,8 +24,8 @@ rule picard: shell: """ PICARD=${{CONDA_PREFIX}}/share/picard-*/picard.jar - java -Xmx4G -Djava.io.tmpdir=temp/ -jar ${{PICARD}} AddOrReplaceReadGroups INPUT={input.tmp_bam} OUTPUT={output.bam_paired} ID={wildcards.sample} LB={wildcards.sample} SM={wildcards.sample} PL=ILLUMINA PU=none - java -Xmx30G -XX:-UseGCOverheadLimit -Djava.io.tmpdir=temp/ -jar ${{PICARD}} MarkDuplicates INPUT={output.bam_paired} OUTPUT={output.mark_dup} METRICS_FILE={output.metrics} TMP_DIR=results/arima_mapping_pipeline_{wildcards.hap}/TMP_DIR ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=TRUE + java -Xmx4G -Djava.io.tmpdir=temp/ -jar ${{PICARD}} AddOrReplaceReadGroups INPUT={input.tmp_bam} OUTPUT={output.bam_add_read_group} ID={params.hic_sample_name} LB={params.hic_sample_name} SM={params.hic_sample_name} PL=ILLUMINA PU=none + java -Xmx30G -XX:-UseGCOverheadLimit -Djava.io.tmpdir=temp/ -jar ${{PICARD}} MarkDuplicates INPUT={output.bam_add_read_group} OUTPUT={output.mark_dup} METRICS_FILE={output.metrics} TMP_DIR=results/arima_mapping_pipeline_{wildcards.hap}/TMP_DIR ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=TRUE samtools index {output.mark_dup} perl scripts/get_stats.pl {output.mark_dup} > {output.stats} """ diff --git a/workflow/rules/purge_dups.smk b/workflow/rules/purge_dups.smk index 9afd2d5..df6ce50 100644 --- a/workflow/rules/purge_dups.smk +++ b/workflow/rules/purge_dups.smk @@ -18,6 +18,7 @@ rule purge_dups: hap_fa="results/purge_dups/hap.fa", purged_fasta="results/purge_dups/asm.primary_purged.fa", hist_plot="results/purge_dups/hist.out.png", + link = "results/assemblies/purged_primary.fa", threads: config["minimap2"]["t"] log: "logs/purge_dups.log", @@ -50,4 +51,6 @@ rule purge_dups: mv hap.fa {output.hap_fa} mv purged.fa {output.purged_fasta} mv hist.out.png {output.hist_plot} + + ln -srn {output.purged_fasta} {output.link} """ diff --git a/workflow/rules/quast.smk b/workflow/rules/quast.smk index 1f4adc3..9d71b5d 100644 --- a/workflow/rules/quast.smk +++ b/workflow/rules/quast.smk @@ -2,10 +2,9 @@ rule quast: input: - completion = checkpoints.yahs_completed.get().completion_marker, - assemblies = expand("results/assemblies/{file}.fa", file=assembly_files) + unpack(get_assemblies_QC) output: - "results/quast/report.html", + directory("results/quast"), threads: config["quast"]["t"] log: "logs/quast.log", @@ -17,7 +16,8 @@ rule quast: optional_params=" ".join( f"{k} {v}" for k, v in config["quast"]["optional_params"].items() if v ), + labels = ",".join(get_assemblies_QC().keys()) shell: - """ - quast {input.assemblies} -o results/quast -t {threads} {params.optional_params} - """ \ No newline at end of file + """ + quast {input} -l {params.labels} -o {output} -t {threads} {params.optional_params} >> {log} 2>&1 + """ diff --git a/workflow/rules/two_read_bam_combiner.smk b/workflow/rules/two_read_bam_combiner.smk index 518536c..99f85e4 100644 --- a/workflow/rules/two_read_bam_combiner.smk +++ b/workflow/rules/two_read_bam_combiner.smk @@ -7,16 +7,16 @@ rule two_read_bam_combiner: input: - bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_1.bam", - bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_2.bam", + bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_1.bam", + bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_2.bam", REF="results/bwa_index_{hap}/asm.fa", output: - tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/{sample}.bam", + tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/hic_vs_contigs_filt_paired.bam", threads: config["arima"]["CPU"] params: MAPQ_FILTER=config["arima"]["MAPQ_FILTER"], log: - "logs/two_read_bam_combiner_{hap}_{sample}.log", + "logs/two_read_bam_combiner_{hap}.log", resources: mem_mb=config['arima']['mem_mb'], # access memory from config conda: diff --git a/workflow/rules/yahs.smk b/workflow/rules/yahs.smk index f73a801..db057e6 100644 --- a/workflow/rules/yahs.smk +++ b/workflow/rules/yahs.smk @@ -5,12 +5,12 @@ import glob rule yahs: input: REF="results/bwa_index_{hap}/asm.fa", - bam="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam", + bam="results/arima_mapping_pipeline_{hap}/REP_DIR/paired_mark_dups_final.bam", output: - scaffolds="results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa", - link="results/assemblies/yahs_{hap}_{sample}.fa", + scaffolds="results/yahs_{hap}/asm_yahs_scaffolds_final.fa", + link="results/assemblies/yahs_{hap}.fa" log: - "logs/yahs_{hap}_{sample}.log", + "logs/yahs_{hap}.log", resources: mem_mb=config['arima']['mem_mb'], # access memory from config conda: @@ -21,6 +21,6 @@ rule yahs: ), shell: """ - yahs {input.REF} {input.bam} -o results/yahs_{wildcards.hap}/asm_yahs_{wildcards.sample} {params.optional_params} >> {log} 2>&1 + yahs {input.REF} {input.bam} -o results/yahs_{wildcards.hap}/asm_yahs {params.optional_params} >> {log} 2>&1 ln -srn {output.scaffolds} {output.link} """ diff --git a/workflow/rules/yahs_completed.smk b/workflow/rules/yahs_completed.smk index c3611e3..0d61b69 100644 --- a/workflow/rules/yahs_completed.smk +++ b/workflow/rules/yahs_completed.smk @@ -1,12 +1,12 @@ # add a checkpoint to run the QC after all the assemblies have been symlinked to results/assemblies -checkpoint symlinks: +checkpoint yahs_done: input: - expand("results/assemblies/yahs_{hap}_{sample}.fa", sample=samples, hap=hap) + get_yahs_output() output: - completion_marker = "results/yahs_all_done.txt" + completion_marker = "results/yahs_completed.txt" log: - "logs/all_yahs_completed.log", + "logs/yahs_completed.log", conda: "../envs/basic.yaml" shell: