Skip to content

Commit

Permalink
Merging QC branch
Browse files Browse the repository at this point in the history
  • Loading branch information
LiaOb21 committed Mar 28, 2024
2 parents d0e1ffe + 2774a83 commit e67853a
Show file tree
Hide file tree
Showing 21 changed files with 228 additions and 67 deletions.
17 changes: 17 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,20 @@ yahs:
# o: "hifiasm_p_purged_yahs" # output prefix use of this needs evaluation
optional_params:
"-e": "" # you can specify the restriction enzyme(s) used by the Hi-C experiment


# Customisable parameters for quast
quast:
t: 4 # number of threads
mem_mb: 1000 # memory in MB
optional_params:
"--fragmented": ""
"--large": ""
# "-r": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"
# "-g": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.101.gff3"

# Customisable parameters for busco
busco:
t: 4 # number of threads
mem_mb: 1000 # memory in MB
lineage: "resources/saccharomycetes_odb10" # lineage to be used for busco analysis
18 changes: 17 additions & 1 deletion config/config_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ fcsgx:
path_to_gx_db: "resources/gx_test_db/test-only"

# Set this to False if you want to skip purge_dups steps:
include_purge_dups: True
include_purge_dups: False

# Customisable parameters for purge_dups
purge_dups:
Expand All @@ -97,3 +97,19 @@ yahs:
# o: "hifiasm_p_purged_yahs" # output prefix #use of this needs evaluation
optional_params:
"-e": "GATC" # you can specify the restriction enzyme(s) used by the Hi-C experiment

# Customisable parameters for quast
quast:
t: 4 # number of threads
mem_mb: 1000 # memory in MB
optional_params:
"--fragmented": ""
"--large": ""
# "-r": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"
# "-g": "resources/reference_genomes/yeast/Saccharomyces_cerevisiae.R64-1-1.101.gff3"

# Customisable parameters for busco
busco:
t: 4 # number of threads
mem_mb: 1000 # memory in MB
lineage: "resources/saccharomycetes_odb10" # lineage to be used for busco analysis
8 changes: 6 additions & 2 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ configfile: "config/config.yaml"

# set {samples} global variable - used to read Hi-C files
samples_in_pattern = "{sample}_1.fastq.gz"
samples = glob_wildcards(config["hic_path"] + samples_in_pattern).sample
sample, = glob_wildcards(config["hic_path"] + samples_in_pattern).sample

# set {diploid_mode} global variable - used to run hifiasm_het rather than hifiasm
diploid_mode=config["hifiasm"]["optional_params"]["--h1"] and config["hifiasm"]["optional_params"]["--h2"]


# Include the common rule, where the inputs for rule_all are defined
include: "rules/common.smk"
# Include the hifi_prep rule
Expand Down Expand Up @@ -47,7 +48,10 @@ include: "rules/two_read_bam_combiner.smk"
include: "rules/picard.smk"
# Include the yahs rule
include: "rules/yahs.smk"

# Include the quast rule
include: "rules/quast.smk"
# Include the busco rule
include: "rules/busco.smk"

# Define the rule all, which is the default rule that Snakemake runs when no specific rule or target is specified
# input files are defined in the common.smk file
Expand Down
6 changes: 6 additions & 0 deletions workflow/envs/busco.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda

dependencies:
- busco=5.7.0
6 changes: 6 additions & 0 deletions workflow/envs/quast.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda

dependencies:
- quast=5.2.0
28 changes: 28 additions & 0 deletions workflow/rules/busco.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# rule busco for assembly QC

rule busco:
input:
unpack(get_assemblies_QC)
output:
dir = directory("results/busco"),
params:
lineage=config["busco"]["lineage"],
labels = list(get_assemblies_QC().keys())
threads: config["busco"]["t"]
log:
"logs/busco.log"
resources:
mem_mb=config['busco']['mem_mb'], # access memory from config
conda:
"../envs/busco.yaml"
shell:
"""
mkdir -p {output}
labels=({params.labels})
count=0
for v in {input} ; do
k=${{labels[$count]}}
busco -i $v -o {output}/$k -m genome -l {params.lineage} -f -c {threads} >> {log} 2>&1
count=$(( $count + 1 ))
done
"""
1 change: 0 additions & 1 deletion workflow/rules/bwa_index.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# For more info: https://github.com/ArimaGenomics/mapping_pipeline
# colora doesn't contain the codes to handle technical and biological replicates of hic reads, refer to the original pipeline for that


rule bwa_index:
input:
get_bwa_index_inputs
Expand Down
10 changes: 5 additions & 5 deletions workflow/rules/bwa_mem.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
rule bwa_mem:
input:
REF="results/bwa_index_{hap}/asm.fa",
forward_hic="results/fastp/{sample}_trim_1.fastq.gz",
reverse_hic="results/fastp/{sample}_trim_2.fastq.gz",
forward_hic="results/fastp/hic_trim_1.fastq.gz",
reverse_hic="results/fastp/hic_trim_2.fastq.gz",
output:
bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_1.bam",
bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_2.bam",
bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_1.bam",
bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_2.bam",
threads: config["arima"]["CPU"]
log:
"logs/bwa_mem_{hap}_{sample}.log",
"logs/bwa_mem_{hap}.log",
resources:
mem_mb=config['arima']['mem_mb'], # access memory from config
conda:
Expand Down
28 changes: 23 additions & 5 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import os

wildcard_constraints:
hap = "primary|hap1|hap2"
hap = "primary|hap1|hap2",

def get_all_inputs(wc=None):
if diploid_mode:
hap= ["hap1", "hap2"]
hap = ["hap1", "hap2"]
else:
hap = ["primary"]
if not samples:
hap = ["primary"]
if not sample:
# Show the user a meaningful error message
logger.error(f"No files matching {samples_in_pattern} were found in"
f" {config.get('hic_path')}. Please check your config file.")
Expand All @@ -24,6 +24,8 @@ def get_all_inputs(wc=None):
"results/nanoplot/NanoPlot-report.html", # nanoplot report
"results/kmc/out.hist", # output of kmc rule
"results/genomescope", # output directory of genomescope
"results/quast", # output directory of quast
"results/busco", # output directory of busco
]

# Get the OATK outputs
Expand All @@ -37,13 +39,15 @@ def get_all_inputs(wc=None):
inputs.extend(oatk_outputs)

inputs.extend(expand(
"results/yahs_{hap}/asm_yahs_{sample}_scaffolds_final.fa", sample=samples, hap=hap
"results/yahs_{hap}/asm_yahs_scaffolds_final.fa", hap=hap
)) # final scaffolded assembly

# If not in diploid mode, add asm.alternate.fa as an input
if not diploid_mode and config["include_purge_dups"] == True:
inputs.append("results/purge_dups_alt/asm.alternate_purged.fa")

# inputs.extend(expand("results/busco/{tool}_{hap}_busco", tool=tool, hap=hap))

return inputs


Expand Down Expand Up @@ -82,3 +86,17 @@ def get_bwa_index_inputs(wildcards):
return f"results/hifiasm/asm.{hap}.fa"


def get_assemblies_QC(wildcards=None):
if diploid_mode:
haps = ["hap1", "hap2"]
else:
haps = ["primary"]
results = {}
for hap in haps:
results[f"asm_{hap}"] = f"results/assemblies/asm_{hap}.fa"
results[f"yahs_{hap}"] = f"results/assemblies/yahs_{hap}.fa"
if config["include_purge_dups"]:
results[f"purged_{hap}"] = f"results/assemblies/purged_{hap}.fa"
if config["include_fcsgx"]:
results[f"fcsgx_{hap}"] = f"results/assemblies/fcsgx_{hap}.fa"
return results
22 changes: 7 additions & 15 deletions workflow/rules/fastp.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,20 @@ import glob

rule fastp:
input:
forward_in=expand(
"{hic_path}{sample}_1.fastq.gz",
hic_path=config["hic_path"],
sample=glob_wildcards(config["hic_path"] + "{sample}_1.fastq.gz").sample,
),
reverse_in=expand(
"{hic_path}{sample}_2.fastq.gz",
hic_path=config["hic_path"],
sample=glob_wildcards(config["hic_path"] + "{sample}_2.fastq.gz").sample,
),
forward_in=f"{config["hic_path"]}{sample}_1.fastq.gz",
reverse_in=f"{config["hic_path"]}{sample}_2.fastq.gz",
output:
forward_out="results/fastp/{sample}_trim_1.fastq.gz",
reverse_out="results/fastp/{sample}_trim_2.fastq.gz",
json="results/fastp/{sample}_report_fastp.HiC.json",
html="results/fastp/{sample}_report_fastp.HiC.html",
forward_out="results/fastp/hic_trim_1.fastq.gz",
reverse_out="results/fastp/hic_trim_2.fastq.gz",
json="results/fastp/hic_report_fastp.HiC.json",
html="results/fastp/hic_report_fastp.HiC.html",
params:
optional_params=" ".join(
f"{k} {v}" for k, v in config["fastp"]["optional_params"].items() if v
),
threads: config["fastp"]["t"]
log:
"logs/{sample}_fastp.log",
"logs/fastp.log",
resources:
mem_mb=config['fastp']['mem_mb'], # access memory from config
conda:
Expand Down
10 changes: 5 additions & 5 deletions workflow/rules/filter_five_end.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@

rule fiter_five_end:
input:
bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_1.bam",
bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/{sample}_2.bam",
bam1="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_1.bam",
bam2="results/arima_mapping_pipeline_{hap}/RAW_DIR/hic_vs_contigs_2.bam",
output:
bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_1.bam",
bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/{sample}_2.bam",
bam1_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_1.bam",
bam2_filt="results/arima_mapping_pipeline_{hap}/FILT_DIR/hic_vs_contigs_filt_2.bam",
log:
"logs/fiter_five_end_{hap}_{sample}.log",
"logs/fiter_five_end_{hap}.log",
resources:
mem_mb=config['arima']['mem_mb'], # access memory from config
conda:
Expand Down
15 changes: 15 additions & 0 deletions workflow/rules/get_assemblies.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
include: "common.smk"

rule get_assemblies:
output:
directory("results/assemblies")
log:
"logs/get_assemblies.log",
resources:
mem_mb=config['arima']['mem_mb'], # access memory from config
conda:
"../envs/basic.yaml"
shell:
"""
mkdir -p {output}
"""
36 changes: 24 additions & 12 deletions workflow/rules/hifiasm.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@

rule hifiasm:
input:
"results/reads/hifi/hifi.fastq.gz",
reads = "results/reads/hifi/hifi.fastq.gz",
output:
gfa="results/hifiasm/asm.primary.gfa",
gfa_alt="results/hifiasm/asm.alternate.gfa",
fasta="results/hifiasm/asm.primary.fa",
fasta_alt="results/hifiasm/asm.alternate.fa",
link_primary = "results/assemblies/asm_primary.fa",
threads: config["hifiasm"]["t"]
log:
"logs/hifiasm.log",
Expand All @@ -23,22 +24,28 @@ rule hifiasm:
),
shell:
"""
hifiasm {input} -t {threads} -o results/hifiasm/asm --primary {params.optional_params} >> {log} 2>&1
hifiasm {input.reads} -t {threads} -o results/hifiasm/asm --primary {params.optional_params} >> {log} 2>&1
mv results/hifiasm/asm.p_ctg.gfa {output.gfa}
mv results/hifiasm/asm.a_ctg.gfa {output.gfa_alt}
awk -f scripts/gfa_to_fasta.awk < {output.gfa} > {output.fasta}
awk -f scripts/gfa_to_fasta.awk < {output.gfa_alt} > {output.fasta_alt}
# all the assemblies produced by the workflow will be symlinked to results/assemblies
ln -srn {output.fasta} {output.link_primary}
"""


rule hifiasm_het:
input:
"results/reads/hifi/hifi.fastq.gz",
reads = "results/reads/hifi/hifi.fastq.gz",
output:
gfa="results/hifiasm/asm.hap1.gfa",
gfa_alt="results/hifiasm/asm.hap2.gfa",
fasta="results/hifiasm/asm.hap1.fa",
fasta_alt="results/hifiasm/asm.hap2.fa",
gfa_hap1="results/hifiasm/asm.hap1.gfa",
gfa_hap2="results/hifiasm/asm.hap2.gfa",
fasta_hap1="results/hifiasm/asm.hap1.fa",
fasta_hap2="results/hifiasm/asm.hap2.fa",
link_hap1 = "results/assemblies/asm_hap1.fa",
link_hap2 = "results/assemblies/asm_hap2.fa"
threads: config["hifiasm"]["t"]
log:
"logs/hifiasm.log",
Expand All @@ -52,9 +59,14 @@ rule hifiasm_het:
),
shell:
"""
hifiasm {input} -t {threads} -o results/hifiasm/asm {params.optional_params} >> {log} 2>&1
mv results/hifiasm/asm.hic.hap1.p_ctg.gfa {output.gfa}
mv results/hifiasm/asm.hic.hap2.p_ctg.gfa {output.gfa_alt}
awk -f scripts/gfa_to_fasta.awk < {output.gfa} > {output.fasta}
awk -f scripts/gfa_to_fasta.awk < {output.gfa_alt} > {output.fasta_alt}
hifiasm {input.reads} -t {threads} -o results/hifiasm/asm {params.optional_params} >> {log} 2>&1
mv results/hifiasm/asm.hic.hap1.p_ctg.gfa {output.gfa_hap1}
mv results/hifiasm/asm.hic.hap2.p_ctg.gfa {output.gfa_hap2}
awk -f scripts/gfa_to_fasta.awk < {output.gfa_hap1} > {output.fasta_hap1}
awk -f scripts/gfa_to_fasta.awk < {output.gfa_hap2} > {output.fasta_hap2}
# all the assemblies produced by the workflow will be symlinked to results/assemblies
ln -srn {output.fasta_hap1} {output.link_hap1}
ln -srn {output.fasta_hap2} {output.link_hap2}
"""
6 changes: 4 additions & 2 deletions workflow/rules/ncbi_fcsgx.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@

rule fcsgx:
input:
fasta="results/hifiasm/asm.{hap}.fa",
fasta = "results/hifiasm/asm.{hap}.fa",
params:
ncbi_tax_id=config["fcsgx"]["ncbi_tax_id"],
path_to_gx_db=config["fcsgx"]["path_to_gx_db"],
output:
clean_fasta="results/ncbi_fcsgx_{hap}/asm_clean.fa",
contaminants="results/ncbi_fcsgx_{hap}/contaminants.fa"
contaminants="results/ncbi_fcsgx_{hap}/contaminants.fa",
link="results/assemblies/fcsgx_{hap}.fa"
threads: config["hifiasm"]["t"]
log:
"logs/fcsgx_{hap}.log",
Expand All @@ -21,4 +22,5 @@ rule fcsgx:
"""
run_gx.py --fasta {input.fasta} --out-dir results/ncbi_fcsgx_{wildcards.hap}/out --gx-db {params.path_to_gx_db} --tax-id {params.ncbi_tax_id} --debug >> {log} 2>&1
cat {input.fasta} | gx clean-genome --action-report results/ncbi_fcsgx_{wildcards.hap}/out/asm.{wildcards.hap}.{params.ncbi_tax_id}.fcs_gx_report.txt --output {output.clean_fasta} --contam-fasta-out {output.contaminants} >> {log} 2>&1
ln -srn {output.clean_fasta} {output.link}
"""
2 changes: 1 addition & 1 deletion workflow/rules/oatk.smk
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# This rule runs oatk to extract organelles reads from the hifi reads.

# get_oatk_outputs is used to dynamically decide the outputs of oatk

rule oatk:
input:
Expand Down
Loading

0 comments on commit e67853a

Please sign in to comment.