Skip to content

Commit

Permalink
recent refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
riasc committed Dec 17, 2023
1 parent b22466b commit af5f3f0
Show file tree
Hide file tree
Showing 20 changed files with 1,566 additions and 393 deletions.
4 changes: 3 additions & 1 deletion workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,20 @@ from snakemake.utils import min_version
min_version("6.4.1")

#### setup #######
configfile: "config/config.yaml"
#configfile: "config/config.yaml"

rule all:
input:
expand("results/{sample}/priorization/neoantigens.tsv", sample=config['data']['name'])

#### load rules ####
include: "rules/prelim.smk"
include: "rules/ref.smk"
include: "rules/common.smk"
include: "rules/hlatyping.smk"
include: "rules/preproc.smk"
include: "rules/align.smk"
include: "rules/quantification.smk"
include: "rules/genefusion.smk"
include: "rules/altsplicing.smk"
include: "rules/exitron.smk"
Expand Down
38 changes: 29 additions & 9 deletions workflow/rules/align.smk
Original file line number Diff line number Diff line change
Expand Up @@ -114,25 +114,45 @@ if config['data']['rnaseq_filetype'] == '.bam':
"v1.32.1/bio/samtools/merge"

# post-processingn and realignment
rule postproc:
rule rnaseq_postproc_fixmate:
input:
"results/{sample}/rnaseq/align/{group}_aligned_STAR.bam"
output:
"results/{sample}/rnaseq/align/{group}_final_STAR.bam"
"results/{sample}/rnaseq/align/{group}_fixmate_STAR.bam"
conda:
"../envs/samtools.yml"
log:
"logs/{sample}/postproc/rnaseq_{group}.log"
threads: config['threads']
"logs/{sample}/postproc/rnaseq_{group}_fixmate.log"
threads: 4
params:
mapq="--min-MQ config['mapq']"
shell:
"""
samtools view -bh -F 4 {params.mapq} {input} -o - \
| samtools sort -n -@ {threads} -m1g -O bam - -o - \
| samtools fixmate -pcmu -O bam -@ {threads} - - \
| samtools sort -@ {threads} -m1g -O bam - -o - \
| samtools markdup -r -@ {threads} - {output} > {log} 2>&1
samtools view -h -F 4 {params.mapq} {input} -o - \
| samtools sort -n -@4 -m4g -O SAM - -o - \
| samtools fixmate -pcmu -O bam -@ {threads} - {output} > {log} 2>&1
"""

# sort and markdup needed to be separated (ensure no core dump for whatever reason)
rule rnaseq_postproc_markdup:
input:
bam="results/{sample}/rnaseq/align/{group}_fixmate_STAR.bam",
tmp="tmp/"
output:
"results/{sample}/rnaseq/align/{group}_final_STAR.bam"
conda:
"../envs/samtools.yml"
log:
"logs/{sample}/postproc/rnaseq_{group}_markdup.log"
threads: 4
resources:
mem_mb_per_cpu=4000
shell:
"""
samtools sort -@4 -m4G -O BAM -T tmp/ {input.bam} \
-o tmp/sorted.bam > {log} 2>&1
samtools markdup -r -@4 tmp/sorted.bam {output} > {log} 2>&1
rm tmp/sorted.bam
"""

rule postproc_bam_index:
Expand Down
8 changes: 3 additions & 5 deletions workflow/rules/annotation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,16 @@ rule download_vep_cache:

rule index_variants:
input:
"results/{sample}/variants/{vartype}.vcf"
"results/{sample}/variants/{vartype}.vcf.gz"
output:
bgzip="results/{sample}/variants/{vartype}.vcf.gz",
idx="results/{sample}/variants/{vartype}.vcf.gz.tbi"
"results/{sample}/variants/{vartype}.vcf.gz.tbi"
log:
"logs/indexvcf/{sample}_{vartype}.log"
conda:
"../envs/samtools.yml"
shell:
"""
bgzip {input}
tabix {input}.gz
tabix {input}
"""

rule annotate_variants:
Expand Down
68 changes: 49 additions & 19 deletions workflow/rules/exitron.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ rule download_genome:
curl -L https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gtf.gz \
| gzip -d > resources/refs/gencode.v37.annotation.gtf
"""


rule prepare_cds:
output:
Expand All @@ -24,10 +23,16 @@ rule prepare_cds:
"../envs/basic.yml"
shell:
"""
cat resources/refs/gencode.v37.annotation.gtf \
cat resources/refs/genome.gtf \
| awk 'OFS="\\t" {{if ($3=="CDS") {{print $1,$4-1,$5,$10,$16,$7}}}}' \
| tr -d '";' > {output}
"""

#"""
#cat resources/refs/gencode.v37.annotation.gtf \
#| awk 'OFS="\\t" {{if ($3=="CDS") {{print $1,$4-1,$5,$10,$16,$7}}}}' \
#| tr -d '";' > {output}
#"""

rule prepare_scanexitron_config:
output:
Expand All @@ -39,17 +44,24 @@ rule prepare_scanexitron_config:
shell:
"""
python3 workflow/scripts/prep_scanexitron_config.py \
resources/refs/hg38.fa \
resources/refs/gencode.v37.annotation.gtf \
resources/refs/genome.fasta \
resources/refs/genome.gtf \
resources/refs/CDS.bed \
{output} > {log}
"""

#python3 workflow/scripts/prep_scanexitron_config.py \
#resources/refs/hg38.fa \
#resources/refs/gencode.v37.annotation.gtf \
#resources/refs/CDS.bed \
#{output} > {log}

rule scanexitron:
input:
bam = "results/{sample}/rnaseq/align/{group}_final_STAR.bam",
fasta = "resources/refs/hg38.fa",
gtf = "resources/refs/gencode.v37.annotation.gtf",
idx = "results/{sample}/rnaseq/align/{group}_final_STAR.bam.bai",
fasta = "resources/refs/genome.fasta",
gtf = "resources/refs/genome.gtf",
cds = "resources/refs/CDS.bed",
config="resources/scanexitron_config.ini"
output:
Expand Down Expand Up @@ -84,46 +96,64 @@ rule exitron_to_vcf:
input:
"results/{sample}/rnaseq/exitron/{group}.exitron"
output:
"results/{sample}/rnaseq/exitron/{group}_exitron.vcf"
"results/{sample}/rnaseq/exitron/{group}_exitrons.vcf"
log:
"logs/exitron2vcf_{sample}_{group}.log"
conda:
"../envs/scanexitron.yml"
shell:
"""
python workflow/scripts/exitron2vcf2.py \
python workflow/scripts/exitron2vcf.py \
{input} \
{output} \
resources/refs/hg38.fa > {log}
"""

rule combine_exitrons:
rule exitron_augment:
input:
get_exitrons,
"results/{sample}/rnaseq/exitron/{group}_exitrons.vcf"
output:
"results/{sample}/rnaseq/exitron/exitrons_combined.vcf"
"results/{sample}/rnaseq/exitron/{group}_exitrons_augmented.vcf"
message:
"Combining exitrons on sample:{wildcards.sample}"
"Augmenting exitrons on sample:{wildcards.sample} of group:{wildcards.group}"
log:
"logs/{sample}/scanexitron/combine_groups.log"
"logs/exitron_augment_{sample}_{group}.log"
conda:
"../envs/manipulate_vcf.yml"
shell:
"""
python workflow/scripts/combine_vcf.py '{input}' exitron {output} > {log} 2>&1
python workflow/scripts/add_infos_to_vcf.py {input} exitron {output} > {log} 2>&1
"""

rule sort_exitrons:
rule sort_exitron:
input:
"results/{sample}/rnaseq/exitron/exitrons_combined.vcf",
"results/{sample}/rnaseq/exitron/{group}_exitrons_augmented.vcf"
output:
"results/{sample}/variants/exitrons.vcf"
"results/{sample}/rnaseq/exitron/{group}_exitrons.vcf.gz"
message:
"Sorting and compressing exitrons on sample:{wildcards.sample} of group:{wildcards.group}"
log:
"logs/exitron_sort_{sample}_{group}.log"
conda:
"../envs/samtools.yml"
shell:
"""
bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1
"""

rule combine_exitrons:
input:
get_exitrons
output:
"results/{sample}/variants/exitrons.vcf.gz"
message:
"Combining exitrons on sample:{wildcards.sample}"
log:
"logs/{sample}/scanexitron/sort_exitrons.log"
"logs/{sample}/exitrons/combine_exitrons.log"
conda:
"../envs/samtools.yml"
shell:
"""
bcftools sort {input} -o {output}
bcftools concat --naive -O z {input} -o - | bcftools sort -O z -o {output} > {log} 2>&1
"""

96 changes: 47 additions & 49 deletions workflow/rules/genefusion.smk
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ rule arriba:
default_blacklist=False, # optional
default_known_fusions=True, # optional
sv_file="", # optional
extra=f"""-G 'gene_name=gene_name gene_id=gene_id transcript_id=transcript_id feature_exon=exon feature_CDS=CDS' \
extra=f"""-G 'gene_name=gene_name|gene_id gene_id=gene_id transcript_id=transcript_id feature_exon=exon feature_CDS=CDS' \
-E {config['genefusion']['maxevalue']} \
-S {config['genefusion']['suppreads']} \
-L {config['genefusion']['maxidentity']} \
Expand All @@ -30,54 +30,52 @@ rule arriba:
wrapper:
"v2.6.0/bio/arriba"

#-V {config['genefusion']['maxmismatch']} \
#rule fusions_to_vcf:
#input:
#"results/{sample}/rnaseq/genefusion/{group}_fusions.tsv",
#output:
#"results/{sample}/rnaseq/genefusion/{group}_fusions.vcf"
#log:
#"logs/{sample}/genefusion/fusion_to_vcf_{group}.log"
#conda:
#"../envs/basic.yml"
#shell:
#"""
#workflow/scripts/convert_fusions_to_vcf.sh \
#resources/refs/genome.fasta \
#{input} \
#{output} > {log} 2>&1
#"""

rule fusions_to_vcf:
input:
"results/{sample}/rnaseq/genefusion/{group}_fusions.tsv",
output:
"results/{sample}/rnaseq/genefusion/{group}_fusions.vcf"
log:
"logs/{sample}/genefusion/fusion_to_vcf_{group}.log"
conda:
"../envs/basic.yml"
shell:
"""
workflow/scripts/convert_fusions_to_vcf.sh \
resources/refs/genome.fasta \
{input} \
{output} > {log} 2>&1
"""

rule combine_fusions:
input:
get_fusions,
output:
"results/{sample}/rnaseq/genefusion/fusions.vcf"
message:
"Combining gene fusion events on sample:{wildcards.sample}"
log:
"logs/{sample}/combine/fusions.log"
conda:
"../envs/manipulate_vcf.yml"
shell:
"""
python workflow/scripts/combine_vcf.py '{input}' fusion {output} > {log} 2>&1
#rule combine_fusions:
#input:
#get_fusions,
#output:
#"results/{sample}/rnaseq/genefusion/fusions.vcf"
#message:
#"Combining gene fusion events on sample:{wildcards.sample}"
#log:
#"logs/{sample}/combine/fusions.log"
#conda:
#"../envs/manipulate_vcf.yml"
#shell:
#"""
#python workflow/scripts/combine_vcf.py '{input}' fusion {output} > {log} 2>&1

"""
#"""

rule sort_fusions:
input:
"results/{sample}/rnaseq/genefusion/fusions.vcf"
output:
"results/{sample}/variants/fusions.vcf"
message:
"Sorting gene fusion events on sample:{wildcards.sample}"
log:
"logs/{sample}/genefusion/sort_fusions.log"
conda:
"../envs/samtools.yml"
shell:
"""
bcftools sort {input} -o {output} > {log} 2>&1
"""
#rule sort_fusions:
#input:
#"results/{sample}/rnaseq/genefusion/fusions.vcf"
#output:
#"results/{sample}/variants/fusions.vcf"
#message:
#"Sorting gene fusion events on sample:{wildcards.sample}"
#log:
#"logs/{sample}/genefusion/sort_fusions.log"
#conda:
#"../envs/samtools.yml"
#shell:
#"""
#bcftools sort {input} -o {output} > {log} 2>&1
#"""
9 changes: 5 additions & 4 deletions workflow/rules/germline.smk
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ rule detect_variants_htc_first_round:
input:
# single or list of bam files
bam="results/{sample}/{seqtype}/align/{group}_final_BWA.bam",
idx="results/{sample}/{seqtype}/align/{group}_final_BWA.bam.bai",
ref="resources/refs/genome.fasta",
#known="resources/vqsr/dbSNP_b150.vcf.gz" # optional
output:
Expand All @@ -48,7 +49,7 @@ rule detect_variants_htc_first_round:
params:
extra="",
java_opts="",
threads: config['threads']
threads: 4
resources:
mem_mb=1024,
wrapper:
Expand Down Expand Up @@ -82,7 +83,7 @@ rule recalibrate_variants_first_round:
},
annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
extra=""
threads: config['threads']
threads: 4
resources:
mem_mb=1024,
wrapper:
Expand Down Expand Up @@ -183,7 +184,7 @@ rule detect_variants_htc_final_round:
params:
extra="", # optional
java_opts="", # optional
threads: config['threads']
threads: 4
resources:
mem_mb=1024,
wrapper:
Expand Down Expand Up @@ -217,7 +218,7 @@ rule recalibrate_variants_final_round:
},
annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
extra=""
threads: config['threads']
threads: 1
resources:
mem_mb=1024,
wrapper:
Expand Down
Loading

0 comments on commit af5f3f0

Please sign in to comment.