Commit 2774a83

adding busco and quast

LiaOb21 committed Mar 28, 2024
1 parent 532d0a4 commit 2774a83
Showing 13 changed files with 90 additions and 87 deletions.
10 changes: 3 additions & 7 deletions workflow/Snakefile

@@ -8,13 +8,11 @@ configfile: "config/config.yaml"
 
 # set {samples} global variable - used to read Hi-C files
 samples_in_pattern = "{sample}_1.fastq.gz"
-samples = glob_wildcards(config["hic_path"] + samples_in_pattern).sample
+sample, = glob_wildcards(config["hic_path"] + samples_in_pattern).sample
 
 # set {diploid_mode} global variable - used to run hifiasm_het rather than hifiasm
 diploid_mode=config["hifiasm"]["optional_params"]["--h1"] and config["hifiasm"]["optional_params"]["--h2"]
 
-# set {file} in the assemblies directory as global variable
-# assembly_files = glob_wildcards("results/assemblies/{file}.fa")
 
 # Include the common rule, where the inputs for rule_all are defined
 include: "rules/common.smk"
@@ -50,12 +48,10 @@ include: "rules/two_read_bam_combiner.smk"
 include: "rules/picard.smk"
 # Include the yahs rule
 include: "rules/yahs.smk"
-# include yahs checkpoint - to start the QC after yahs is done
-#include: "rules/yahs_completed.smk"
 # Include the quast rule
-#include: "rules/quast.smk"
+include: "rules/quast.smk"
 # Include the busco rule
-#include: "rules/busco.smk"
+include: "rules/busco.smk"
 
 # Define the rule all, which is the default rule that Snakemake runs when no specific rule or target is specified
 # input files are defined in the common.smk file
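
Aside on the `sample, =` change above: `glob_wildcards` returns a namedtuple of lists, one entry per matching file, so the trailing-comma unpacking asserts that exactly one Hi-C sample is present. A minimal sketch, runnable outside the workflow (the directory and file name here are hypothetical):

    from snakemake.io import glob_wildcards

    # With exactly one match, e.g. hic/reads_1.fastq.gz, sample == "reads".
    # With zero or several matches the unpacking raises ValueError instead of
    # silently carrying a list, surfacing the single-sample assumption early.
    sample, = glob_wildcards("hic/" + "{sample}_1.fastq.gz").sample
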
19 changes: 12 additions & 7 deletions workflow/rules/busco.smk

@@ -2,22 +2,27 @@
 
 rule busco:
     input:
-        completion = checkpoints.yahs_completed.get().completion_marker,
-        assemblies = "results/assemblies/{file}.fa"
+        unpack(get_assemblies_QC)
     output:
-        dir = directory("results/busco/{file}.fa_busco"),
+        dir = directory("results/busco"),
     params:
         lineage=config["busco"]["lineage"],
+        labels = list(get_assemblies_QC().keys())
     threads: config["busco"]["t"]
     log:
-        "logs/busco_{file}.log"
+        "logs/busco.log"
     resources:
         mem_mb=config['busco']['mem_mb'], # access memory from config
     conda:
         "../envs/busco.yaml"
     shell:
         """
-        # run busco on each assembly
-        #assembly_name=$(basename {input.assemblies})
-        busco -i {input.assemblies} -o {output.dir}/{wildcards.file}_busco -m genome -l {params.lineage} -f -c {threads}
+        mkdir -p {output}
+        labels=({params.labels})
+        count=0
+        for v in {input} ; do
+            k=${{labels[$count]}}
+            busco -i $v -o {output}/$k -m genome -l {params.lineage} -f -c {threads} >> {log} 2>&1
+            count=$(( $count + 1 ))
+        done
         """
10 changes: 5 additions & 5 deletions workflow/rules/bwa_mem.smk

@@ -8,14 +8,14 @@
 rule bwa_mem:
     input:
         REF="results/bwa_index_{hap}/asm.fa",
-        forward_hic="results/fastp/{sample}_trim_1.fastq.gz",
-        reverse_hic="results/fastp/{sample}_trim_2.fastq.gz",
+        forward_hic="results/fastp/hic_trim_1.fastq.gz",
+        reverse_hic="results/fastp/hic_trim_2.fastq.gz",
     output:
-        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_1.bam",
-        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_2.bam",
+        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_1.bam",
+        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_2.bam",
     threads: config["arima"]["CPU"]
     log:
-        "logs/bwa_mem_{hap}_{sample}.log",
+        "logs/bwa_mem_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'], # access memory from config
     conda:
38 changes: 22 additions & 16 deletions workflow/rules/common.smk

@@ -4,14 +4,14 @@
 import os
 
 wildcard_constraints:
-    hap = "primary|hap1|hap2"
+    hap = "primary|hap1|hap2",
 
 def get_all_inputs(wc=None):
     if diploid_mode:
         hap = ["hap1", "hap2"]
     else:
-        hap = ["primary"]
-    if not samples:
+        hap = ["primary"]
+    if not sample:
         # Show the user a meaningful error message
         logger.error(f"No files matching {samples_in_pattern} were found in"
                      f" {config.get('hic_path')}. Please check your config file.")
@@ -24,7 +24,8 @@ def get_all_inputs(wc=None):
         "results/nanoplot/NanoPlot-report.html", # nanoplot report
         "results/kmc/out.hist", # output of kmc rule
         "results/genomescope", # output directory of genomescope
-        # "results/quast/report.html"
+        "results/quast", # output directory of quast
+        "results/busco", # output directory of busco
     ]
 
     # Get the OATK outputs
@@ -38,14 +39,14 @@ def get_all_inputs(wc=None):
     inputs.extend(oatk_outputs)
 
     inputs.extend(expand(
-        "results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa", sample=samples, hap=hap
+        "results/yahs_{hap}/asm_yahs_scaffolds_final.fa", hap=hap
     )) # final scaffolded assembly
 
     # If not in diploid mode, add asm.alternate.fa as an input
-    if not diploid_mode:
+    if not diploid_mode and config["include_purge_dups"] == True:
         inputs.append("results/purge_dups_alt/asm.alternate_purged.fa")
 
-    # inputs.extend(expand("results/busco/{file}.fa_busco", file=assembly_files))
+    # inputs.extend(expand("results/busco/{tool}_{hap}_busco", tool=tool, hap=hap))
 
     return inputs
 
@@ -84,13 +85,18 @@ def get_bwa_index_inputs(wildcards):
     else:
         return f"results/hifiasm/asm.{hap}.fa"
 
-#def get_yahs_output():
-#    if diploid_mode:
-#        return [
-#            expand("results/yahs_hap1/asm_yahs_{sample}_scaffolds_final.fa", sample=samples),
-#            expand("results/yahs_hap2/asm_yahs_{sample}_scaffolds_final.fa", sample=samples)
-#        ]
-#    else:
-#        return expand("results/yahs_primary/asm_yahs_{sample}_scaffolds_final.fa", sample=samples)
-
 
+def get_assemblies_QC(wildcards=None):
+    if diploid_mode:
+        haps = ["hap1", "hap2"]
+    else:
+        haps = ["primary"]
+    results = {}
+    for hap in haps:
+        results[f"asm_{hap}"] = f"results/assemblies/asm_{hap}.fa"
+        results[f"yahs_{hap}"] = f"results/assemblies/yahs_{hap}.fa"
+        if config["include_purge_dups"]:
+            results[f"purged_{hap}"] = f"results/assemblies/purged_{hap}.fa"
+        if config["include_fcsgx"]:
+            results[f"fcsgx_{hap}"] = f"results/assemblies/fcsgx_{hap}.fa"
+    return results
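
For reference, a sketch of what `get_assemblies_QC()` evaluates to in haploid mode with `include_purge_dups` on and `include_fcsgx` off (hypothetical config values):

    config = {"include_purge_dups": True, "include_fcsgx": False}
    diploid_mode = False
    assert get_assemblies_QC() == {
        "asm_primary": "results/assemblies/asm_primary.fa",
        "yahs_primary": "results/assemblies/yahs_primary.fa",
        "purged_primary": "results/assemblies/purged_primary.fa",
    }
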
22 changes: 7 additions & 15 deletions workflow/rules/fastp.smk

@@ -5,28 +5,20 @@ import glob
 
 rule fastp:
     input:
-        forward_in=expand(
-            "{hic_path}{sample}_1.fastq.gz",
-            hic_path=config["hic_path"],
-            sample=glob_wildcards(config["hic_path"] + "{sample}_1.fastq.gz").sample,
-        ),
-        reverse_in=expand(
-            "{hic_path}{sample}_2.fastq.gz",
-            hic_path=config["hic_path"],
-            sample=glob_wildcards(config["hic_path"] + "{sample}_2.fastq.gz").sample,
-        ),
forward_in=f"{config["hic_path"]}{sample}_1.fastq.gz",
reverse_in=f"{config["hic_path"]}{sample}_2.fastq.gz",
     output:
-        forward_out="results/fastp/{sample}_trim_1.fastq.gz",
-        reverse_out="results/fastp/{sample}_trim_2.fastq.gz",
-        json="results/fastp/{sample}_report_fastp.HiC.json",
-        html="results/fastp/{sample}_report_fastp.HiC.html",
+        forward_out="results/fastp/hic_trim_1.fastq.gz",
+        reverse_out="results/fastp/hic_trim_2.fastq.gz",
+        json="results/fastp/hic_report_fastp.HiC.json",
+        html="results/fastp/hic_report_fastp.HiC.html",
     params:
         optional_params=" ".join(
             f"{k} {v}" for k, v in config["fastp"]["optional_params"].items() if v
         ),
     threads: config["fastp"]["t"]
     log:
-        "logs/{sample}_fastp.log",
+        "logs/fastp.log",
     resources:
         mem_mb=config['fastp']['mem_mb'], # access memory from config
     conda:
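
The untouched `optional_params` join is worth a gloss: only config keys with truthy values become flags, so an empty string in the YAML drops the flag entirely. A self-contained sketch with hypothetical values:

    config = {"fastp": {"optional_params": {"--cut_front": "", "--cut_window_size": "5"}}}
    optional_params = " ".join(
        f"{k} {v}" for k, v in config["fastp"]["optional_params"].items() if v
    )
    assert optional_params == "--cut_window_size 5"  # "--cut_front" was filtered out
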
10 changes: 5 additions & 5 deletions workflow/rules/filter_five_end.smk

@@ -6,13 +6,13 @@
 
 rule fiter_five_end:
     input:
-        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_1.bam",
-        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_2.bam",
+        bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_1.bam",
+        bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_2.bam",
     output:
-        bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_1.bam",
-        bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_2.bam",
+        bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_1.bam",
+        bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_2.bam",
     log:
-        "logs/fiter_five_end_{hap}_{sample}.log",
+        "logs/fiter_five_end_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'], # access memory from config
     conda:
8 changes: 3 additions & 5 deletions workflow/rules/hifiasm.smk

@@ -10,8 +10,7 @@ rule hifiasm:
         gfa_alt="results/hifiasm/asm.alternate.gfa",
         fasta="results/hifiasm/asm.primary.fa",
         fasta_alt="results/hifiasm/asm.alternate.fa",
-        link_primary = "results/assemblies/asm.primary.fa",
-        link_alternate = "results/assemblies/asm.alternate.fa"
+        link_primary = "results/assemblies/asm_primary.fa",
     threads: config["hifiasm"]["t"]
     log:
         "logs/hifiasm.log",
@@ -34,7 +33,6 @@ rule hifiasm:
         # all the assemblies produced by the workflow will be symlinked to results/assemblies
         ln -srn {output.fasta} {output.link_primary}
-        ln -srn {output.fasta_alt} {output.link_alternate}
         """
 
 
@@ -46,8 +44,8 @@ rule hifiasm_het:
         gfa_hap2="results/hifiasm/asm.hap2.gfa",
         fasta_hap1="results/hifiasm/asm.hap1.fa",
         fasta_hap2="results/hifiasm/asm.hap2.fa",
-        link_hap1 = "results/assemblies/asm.hap1.fa",
-        link_hap2 = "results/assemblies/asm.hap2.fa"
+        link_hap1 = "results/assemblies/asm_hap1.fa",
+        link_hap2 = "results/assemblies/asm_hap2.fa"
     threads: config["hifiasm"]["t"]
     log:
         "logs/hifiasm.log",
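
On the symlink convention used throughout: `ln -srn` creates a relative symbolic link and replaces an existing link rather than descending into it. Roughly equivalent Python, using the paths from the rule above:

    import os

    target = "results/hifiasm/asm.primary.fa"
    link = "results/assemblies/asm_primary.fa"
    rel = os.path.relpath(target, start=os.path.dirname(link))  # -r: store a relative path
    if os.path.islink(link):
        os.remove(link)  # -n: do not follow an existing link at the destination
    os.symlink(rel, link)  # -s: symbolic rather than hard link
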
19 changes: 11 additions & 8 deletions workflow/rules/picard.smk

@@ -2,27 +2,30 @@
 # This pipeline is currently built for processing a single sample with one read1 and read2 FASTQ file.
 # For more info: https://github.com/ArimaGenomics/mapping_pipeline
 # colora doesn't contain the codes to handle technical and biological replicates of hic reads, refer to the original pipeline for that
+# The final files will be in REP_DIR to be consistent with the original pipeline but they will be marked with "final" and not with "rep1"
 
 
 rule picard:
     input:
-        tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/{sample}.bam",
+        tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/hic_vs_contigs_filt_paired.bam",
     output:
-        bam_paired="results/arima_mapping_pipeline_{hap}/PAIR_DIR/{sample}.bam",
-        mark_dup="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam",
-        metrics="results/arima_mapping_pipeline_{hap}/REP_DIR/metrics.{sample}_rep1.txt",
-        stats="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam.stats",
+        bam_add_read_group="results/arima_mapping_pipeline_{hap}/PAIR_DIR/paired_add_read_group.bam",
+        mark_dup="results/arima_mapping_pipeline_{hap}/REP_DIR/paired_mark_dups_final.bam",
+        metrics="results/arima_mapping_pipeline_{hap}/REP_DIR/metrics.final.txt",
+        stats="results/arima_mapping_pipeline_{hap}/REP_DIR/final.bam.stats",
+    params:
+        hic_sample_name=sample
     log:
-        "logs/picard_{hap}_{sample}.log",
+        "logs/picard_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'], # access memory from config
     conda:
         "../envs/arima_mapping_pipeline.yaml"
     shell:
         """
         PICARD=${{CONDA_PREFIX}}/share/picard-*/picard.jar
-        java -Xmx4G -Djava.io.tmpdir=temp/ -jar ${{PICARD}} AddOrReplaceReadGroups INPUT={input.tmp_bam} OUTPUT={output.bam_paired} ID={wildcards.sample} LB={wildcards.sample} SM={wildcards.sample} PL=ILLUMINA PU=none
-        java -Xmx30G -XX:-UseGCOverheadLimit -Djava.io.tmpdir=temp/ -jar ${{PICARD}} MarkDuplicates INPUT={output.bam_paired} OUTPUT={output.mark_dup} METRICS_FILE={output.metrics} TMP_DIR=results/arima_mapping_pipeline_{wildcards.hap}/TMP_DIR ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=TRUE
+        java -Xmx4G -Djava.io.tmpdir=temp/ -jar ${{PICARD}} AddOrReplaceReadGroups INPUT={input.tmp_bam} OUTPUT={output.bam_add_read_group} ID={params.hic_sample_name} LB={params.hic_sample_name} SM={params.hic_sample_name} PL=ILLUMINA PU=none
+        java -Xmx30G -XX:-UseGCOverheadLimit -Djava.io.tmpdir=temp/ -jar ${{PICARD}} MarkDuplicates INPUT={output.bam_add_read_group} OUTPUT={output.mark_dup} METRICS_FILE={output.metrics} TMP_DIR=results/arima_mapping_pipeline_{wildcards.hap}/TMP_DIR ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=TRUE
         samtools index {output.mark_dup}
         perl scripts/get_stats.pl {output.mark_dup} > {output.stats}
         """
3 changes: 3 additions & 0 deletions workflow/rules/purge_dups.smk

@@ -18,6 +18,7 @@ rule purge_dups:
         hap_fa="results/purge_dups/hap.fa",
         purged_fasta="results/purge_dups/asm.primary_purged.fa",
         hist_plot="results/purge_dups/hist.out.png",
+        link = "results/assemblies/purged_primary.fa",
     threads: config["minimap2"]["t"]
     log:
         "logs/purge_dups.log",
@@ -50,4 +51,6 @@ rule purge_dups:
         mv hap.fa {output.hap_fa}
         mv purged.fa {output.purged_fasta}
         mv hist.out.png {output.hist_plot}
+        ln -srn {output.purged_fasta} {output.link}
         """
12 changes: 6 additions & 6 deletions workflow/rules/quast.smk

@@ -2,10 +2,9 @@
 
 rule quast:
     input:
-        completion = checkpoints.yahs_completed.get().completion_marker,
-        assemblies = expand("results/assemblies/{file}.fa", file=assembly_files)
+        unpack(get_assemblies_QC)
     output:
-        "results/quast/report.html",
+        directory("results/quast"),
     threads: config["quast"]["t"]
     log:
         "logs/quast.log",
@@ -17,7 +16,8 @@ rule quast:
         optional_params=" ".join(
             f"{k} {v}" for k, v in config["quast"]["optional_params"].items() if v
         ),
+        labels = ",".join(get_assemblies_QC().keys())
     shell:
-        """
-        quast {input.assemblies} -o results/quast -t {threads} {params.optional_params}
-        """
+        """
+        quast {input} -l {params.labels} -o {output} -t {threads} {params.optional_params} >> {log} 2>&1
+        """
8 changes: 4 additions & 4 deletions workflow/rules/two_read_bam_combiner.smk

@@ -7,16 +7,16 @@
 
 rule two_read_bam_combiner:
     input:
-        bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_1.bam",
-        bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_2.bam",
+        bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_1.bam",
+        bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_2.bam",
         REF="results/bwa_index_{hap}/asm.fa",
     output:
-        tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/{sample}.bam",
+        tmp_bam="results/arima_mapping_pipeline_{hap}/TMP_DIR/hic_vs_contigs_filt_paired.bam",
     threads: config["arima"]["CPU"]
     params:
         MAPQ_FILTER=config["arima"]["MAPQ_FILTER"],
     log:
-        "logs/two_read_bam_combiner_{hap}_{sample}.log",
+        "logs/two_read_bam_combiner_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'], # access memory from config
     conda:
10 changes: 5 additions & 5 deletions workflow/rules/yahs.smk

@@ -5,12 +5,12 @@ import glob
 
 rule yahs:
     input:
         REF="results/bwa_index_{hap}/asm.fa",
-        bam="results/arima_mapping_pipeline_{hap}/REP_DIR/{sample}_rep1.bam",
+        bam="results/arima_mapping_pipeline_{hap}/REP_DIR/paired_mark_dups_final.bam",
     output:
-        scaffolds="results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa",
-        link="results/assemblies/yahs_{hap}_{sample}.fa",
+        scaffolds="results/yahs_{hap}/asm_yahs_scaffolds_final.fa",
+        link="results/assemblies/yahs_{hap}.fa"
     log:
-        "logs/yahs_{hap}_{sample}.log",
+        "logs/yahs_{hap}.log",
     resources:
         mem_mb=config['arima']['mem_mb'], # access memory from config
     conda:
@@ -21,6 +21,6 @@ rule yahs:
         ),
     shell:
         """
-        yahs {input.REF} {input.bam} -o results/yahs_{wildcards.hap}/asm_yahs_{wildcards.sample} {params.optional_params} >> {log} 2>&1
+        yahs {input.REF} {input.bam} -o results/yahs_{wildcards.hap}/asm_yahs {params.optional_params} >> {log} 2>&1
         ln -srn {output.scaffolds} {output.link}
         """
8 changes: 4 additions & 4 deletions workflow/rules/yahs_completed.smk

@@ -1,12 +1,12 @@
 # add a checkpoint to run the QC after all the assemblies have been symlinked to results/assemblies
 
-checkpoint symlinks:
+checkpoint yahs_done:
     input:
-        expand("results/assemblies/yahs_{hap}_{sample}.fa", sample=samples, hap=hap)
+        get_yahs_output()
     output:
-        completion_marker = "results/yahs_all_done.txt"
+        completion_marker = "results/yahs_completed.txt"
     log:
-        "logs/all_yahs_completed.log",
+        "logs/yahs_completed.log",
     conda:
         "../envs/basic.yaml"
     shell:
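
For context, the checkpoint pattern this file implements (its include is commented out in the Snakefile above): downstream rules would request the marker through the `checkpoints` object, so the DAG is re-evaluated only after YaHS finishes. A generic sketch:

    def qc_input(wildcards):
        # Blocks DAG evaluation until the checkpoint has run, then returns the marker.
        return checkpoints.yahs_done.get().output.completion_marker
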
