diff --git a/workflow/Snakefile b/workflow/Snakefile
index 88dbcb8..66291c9 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -4,18 +4,20 @@ from snakemake.utils import min_version
 min_version("6.4.1")

 #### setup #######
-configfile: "config/config.yaml"
+#configfile: "config/config.yaml"

 rule all:
     input:
         expand("results/{sample}/priorization/neoantigens.tsv", sample=config['data']['name'])

 #### load rules ####
+include: "rules/prelim.smk"
 include: "rules/ref.smk"
 include: "rules/common.smk"
 include: "rules/hlatyping.smk"
 include: "rules/preproc.smk"
 include: "rules/align.smk"
+include: "rules/quantification.smk"
 include: "rules/genefusion.smk"
 include: "rules/altsplicing.smk"
 include: "rules/exitron.smk"
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
index 1363e16..188a33f 100644
--- a/workflow/rules/align.smk
+++ b/workflow/rules/align.smk
@@ -114,25 +114,45 @@ if config['data']['rnaseq_filetype'] == '.bam':
             "v1.32.1/bio/samtools/merge"

 # post-processing and realignment
-rule postproc:
+rule rnaseq_postproc_fixmate:
     input:
         "results/{sample}/rnaseq/align/{group}_aligned_STAR.bam"
     output:
-        "results/{sample}/rnaseq/align/{group}_final_STAR.bam"
+        "results/{sample}/rnaseq/align/{group}_fixmate_STAR.bam"
     conda:
         "../envs/samtools.yml"
     log:
-        "logs/{sample}/postproc/rnaseq_{group}.log"
-    threads: config['threads']
+        "logs/{sample}/postproc/rnaseq_{group}_fixmate.log"
+    threads: 4
     params:
         mapq="--min-MQ config['mapq']"
     shell:
         """
-        samtools view -bh -F 4 {params.mapq} {input} -o - \
-        | samtools sort -n -@ {threads} -m1g -O bam - -o - \
-        | samtools fixmate -pcmu -O bam -@ {threads} - - \
-        | samtools sort -@ {threads} -m1g -O bam - -o - \
-        | samtools markdup -r -@ {threads} - {output} > {log} 2>&1
+        samtools view -h -F 4 {params.mapq} {input} -o - \
+        | samtools sort -n -@4 -m4g -O SAM - -o - \
+        | samtools fixmate -pcmu -O bam -@ {threads} - {output} > {log} 2>&1
+        """
+
+# sort and markdup are kept in a separate rule; running them in the same
+# pipe as fixmate caused intermittent core dumps
+rule rnaseq_postproc_markdup:
+    input:
+        bam="results/{sample}/rnaseq/align/{group}_fixmate_STAR.bam",
+        tmp="tmp/"
+    output:
+        "results/{sample}/rnaseq/align/{group}_final_STAR.bam"
+    conda:
+        "../envs/samtools.yml"
+    log:
+        "logs/{sample}/postproc/rnaseq_{group}_markdup.log"
+    threads: 4
+    resources:
+        mem_mb_per_cpu=4000
+    shell:
+        """
+        samtools sort -@4 -m4G -O BAM -T tmp/ {input.bam} \
+            -o tmp/{wildcards.sample}_{wildcards.group}_sorted.bam > {log} 2>&1
+        samtools markdup -r -@4 tmp/{wildcards.sample}_{wildcards.group}_sorted.bam {output} >> {log} 2>&1
+        rm tmp/{wildcards.sample}_{wildcards.group}_sorted.bam
         """

 rule postproc_bam_index:
diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index d2d42ad..de23fec 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -31,18 +31,16 @@ rule download_vep_cache:

 rule index_variants:
     input:
-        "results/{sample}/variants/{vartype}.vcf"
+        "results/{sample}/variants/{vartype}.vcf.gz"
     output:
-        bgzip="results/{sample}/variants/{vartype}.vcf.gz",
-        idx="results/{sample}/variants/{vartype}.vcf.gz.tbi"
+        "results/{sample}/variants/{vartype}.vcf.gz.tbi"
     log:
         "logs/indexvcf/{sample}_{vartype}.log"
     conda:
         "../envs/samtools.yml"
     shell:
         """
-        bgzip {input}
-        tabix {input}.gz
+        tabix {input}
         """

 rule annotate_variants:
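# NOTE (editor's illustration, not part of the patch): index_variants above now
# expects its {vartype}.vcf.gz input to already exist, so every rule that
# produces a variant set must emit a bgzip-compressed VCF itself. A minimal
# sketch of such a producer rule, with hypothetical rule name and paths,
# mirroring the bcftools pattern used by the exitron rules below:
#
#   rule compress_variants:
#       input:
#           "results/{sample}/variants/{vartype}.vcf"
#       output:
#           "results/{sample}/variants/{vartype}.vcf.gz"
#       conda:
#           "../envs/samtools.yml"
#       shell:
#           "bcftools view -O z -o {output} {input}"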
diff --git a/workflow/rules/exitron.smk b/workflow/rules/exitron.smk
index 169aed1..4f17ba8 100644
--- a/workflow/rules/exitron.smk
+++ b/workflow/rules/exitron.smk
@@ -13,7 +13,6 @@ rule download_genome:
         curl -L https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gtf.gz \
         | gzip -d > resources/refs/gencode.v37.annotation.gtf
         """
-
 rule prepare_cds:
     output:
@@ -24,10 +23,16 @@
         "../envs/basic.yml"
     shell:
         """
-        cat resources/refs/gencode.v37.annotation.gtf \
+        cat resources/refs/genome.gtf \
         | awk 'OFS="\\t" {{if ($3=="CDS") {{print $1,$4-1,$5,$10,$16,$7}}}}' \
         | tr -d '";' > {output}
         """

 rule prepare_scanexitron_config:
     output:
@@ -39,17 +44,24 @@
     shell:
         """
         python3 workflow/scripts/prep_scanexitron_config.py \
-        resources/refs/hg38.fa \
-        resources/refs/gencode.v37.annotation.gtf \
+        resources/refs/genome.fasta \
+        resources/refs/genome.gtf \
         resources/refs/CDS.bed \
         {output} > {log}
         """

 rule scanexitron:
     input:
         bam = "results/{sample}/rnaseq/align/{group}_final_STAR.bam",
-        fasta = "resources/refs/hg38.fa",
-        gtf = "resources/refs/gencode.v37.annotation.gtf",
+        idx = "results/{sample}/rnaseq/align/{group}_final_STAR.bam.bai",
+        fasta = "resources/refs/genome.fasta",
+        gtf = "resources/refs/genome.gtf",
         cds = "resources/refs/CDS.bed",
         config="resources/scanexitron_config.ini"
     output:
@@ -84,46 +96,64 @@ rule exitron_to_vcf:
     input:
         "results/{sample}/rnaseq/exitron/{group}.exitron"
     output:
-        "results/{sample}/rnaseq/exitron/{group}_exitron.vcf"
+        "results/{sample}/rnaseq/exitron/{group}_exitrons.vcf"
     log:
         "logs/exitron2vcf_{sample}_{group}.log"
     conda:
         "../envs/scanexitron.yml"
     shell:
         """
-        python workflow/scripts/exitron2vcf2.py \
+        python workflow/scripts/exitron2vcf.py \
         {input} \
         {output} \
-        resources/refs/hg38.fa > {log}
+        resources/refs/genome.fasta > {log}
         """

-rule combine_exitrons:
+rule exitron_augment:
     input:
-        get_exitrons,
+        "results/{sample}/rnaseq/exitron/{group}_exitrons.vcf"
     output:
-        "results/{sample}/rnaseq/exitron/exitrons_combined.vcf"
+        "results/{sample}/rnaseq/exitron/{group}_exitrons_augmented.vcf"
     message:
-        "Combining exitrons on sample:{wildcards.sample}"
+        "Augmenting exitrons on sample:{wildcards.sample} of group:{wildcards.group}"
     log:
-        "logs/{sample}/scanexitron/combine_groups.log"
+        "logs/exitron_augment_{sample}_{group}.log"
     conda:
        "../envs/manipulate_vcf.yml"
     shell:
         """
-        python workflow/scripts/combine_vcf.py '{input}' exitron {output} > {log} 2>&1
+        python workflow/scripts/add_infos_to_vcf.py {input} exitron {output} > {log} 2>&1
         """

-rule sort_exitrons:
+rule sort_exitron:
     input:
-        "results/{sample}/rnaseq/exitron/exitrons_combined.vcf",
+        "results/{sample}/rnaseq/exitron/{group}_exitrons_augmented.vcf"
     output:
-        "results/{sample}/variants/exitrons.vcf"
+        "results/{sample}/rnaseq/exitron/{group}_exitrons.vcf.gz"
+    message:
+        "Sorting and compressing exitrons on sample:{wildcards.sample} of group:{wildcards.group}"
+    log:
+        "logs/exitron_sort_{sample}_{group}.log"
+    conda:
+        "../envs/samtools.yml"
+    shell:
+        """
+        bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1
+        """
+
+rule combine_exitrons:
+    input:
+        get_exitrons
+    output:
+        "results/{sample}/variants/exitrons.vcf.gz"
+    message:
+        "Combining exitrons on sample:{wildcards.sample}"
     log:
-        "logs/{sample}/scanexitron/sort_exitrons.log"
+        "logs/{sample}/exitrons/combine_exitrons.log"
     conda:
         "../envs/samtools.yml"
     shell:
         """
-        bcftools sort {input} -o {output}
+        bcftools concat --naive -O z {input} -o - |
bcftools sort -O z -o {output} > {log} 2>&1 """ diff --git a/workflow/rules/genefusion.smk b/workflow/rules/genefusion.smk index e0d4cdd..401fb38 100644 --- a/workflow/rules/genefusion.smk +++ b/workflow/rules/genefusion.smk @@ -14,7 +14,7 @@ rule arriba: default_blacklist=False, # optional default_known_fusions=True, # optional sv_file="", # optional - extra=f"""-G 'gene_name=gene_name gene_id=gene_id transcript_id=transcript_id feature_exon=exon feature_CDS=CDS' \ + extra=f"""-G 'gene_name=gene_name|gene_id gene_id=gene_id transcript_id=transcript_id feature_exon=exon feature_CDS=CDS' \ -E {config['genefusion']['maxevalue']} \ -S {config['genefusion']['suppreads']} \ -L {config['genefusion']['maxidentity']} \ @@ -30,54 +30,52 @@ rule arriba: wrapper: "v2.6.0/bio/arriba" - #-V {config['genefusion']['maxmismatch']} \ +#rule fusions_to_vcf: + #input: + #"results/{sample}/rnaseq/genefusion/{group}_fusions.tsv", + #output: + #"results/{sample}/rnaseq/genefusion/{group}_fusions.vcf" + #log: + #"logs/{sample}/genefusion/fusion_to_vcf_{group}.log" + #conda: + #"../envs/basic.yml" + #shell: + #""" + #workflow/scripts/convert_fusions_to_vcf.sh \ + #resources/refs/genome.fasta \ + #{input} \ + #{output} > {log} 2>&1 + #""" -rule fusions_to_vcf: - input: - "results/{sample}/rnaseq/genefusion/{group}_fusions.tsv", - output: - "results/{sample}/rnaseq/genefusion/{group}_fusions.vcf" - log: - "logs/{sample}/genefusion/fusion_to_vcf_{group}.log" - conda: - "../envs/basic.yml" - shell: - """ - workflow/scripts/convert_fusions_to_vcf.sh \ - resources/refs/genome.fasta \ - {input} \ - {output} > {log} 2>&1 - """ - -rule combine_fusions: - input: - get_fusions, - output: - "results/{sample}/rnaseq/genefusion/fusions.vcf" - message: - "Combining gene fusion events on sample:{wildcards.sample}" - log: - "logs/{sample}/combine/fusions.log" - conda: - "../envs/manipulate_vcf.yml" - shell: - """ - python workflow/scripts/combine_vcf.py '{input}' fusion {output} > {log} 2>&1 +#rule combine_fusions: + #input: + #get_fusions, + #output: + #"results/{sample}/rnaseq/genefusion/fusions.vcf" + #message: + #"Combining gene fusion events on sample:{wildcards.sample}" + #log: + #"logs/{sample}/combine/fusions.log" + #conda: + #"../envs/manipulate_vcf.yml" + #shell: + #""" + #python workflow/scripts/combine_vcf.py '{input}' fusion {output} > {log} 2>&1 - """ + #""" -rule sort_fusions: - input: - "results/{sample}/rnaseq/genefusion/fusions.vcf" - output: - "results/{sample}/variants/fusions.vcf" - message: - "Sorting gene fusion events on sample:{wildcards.sample}" - log: - "logs/{sample}/genefusion/sort_fusions.log" - conda: - "../envs/samtools.yml" - shell: - """ - bcftools sort {input} -o {output} > {log} 2>&1 - """ +#rule sort_fusions: + #input: + #"results/{sample}/rnaseq/genefusion/fusions.vcf" + #output: + #"results/{sample}/variants/fusions.vcf" + #message: + #"Sorting gene fusion events on sample:{wildcards.sample}" + #log: + #"logs/{sample}/genefusion/sort_fusions.log" + #conda: + #"../envs/samtools.yml" + #shell: + #""" + #bcftools sort {input} -o {output} > {log} 2>&1 + #""" diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk index 402e3dc..d7c4786 100644 --- a/workflow/rules/germline.smk +++ b/workflow/rules/germline.smk @@ -37,6 +37,7 @@ rule detect_variants_htc_first_round: input: # single or list of bam files bam="results/{sample}/{seqtype}/align/{group}_final_BWA.bam", + idx="results/{sample}/{seqtype}/align/{group}_final_BWA.bam.bai", ref="resources/refs/genome.fasta", 
#known="resources/vqsr/dbSNP_b150.vcf.gz" # optional output: @@ -48,7 +49,7 @@ rule detect_variants_htc_first_round: params: extra="", java_opts="", - threads: config['threads'] + threads: 4 resources: mem_mb=1024, wrapper: @@ -82,7 +83,7 @@ rule recalibrate_variants_first_round: }, annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], extra="" - threads: config['threads'] + threads: 4 resources: mem_mb=1024, wrapper: @@ -183,7 +184,7 @@ rule detect_variants_htc_final_round: params: extra="", # optional java_opts="", # optional - threads: config['threads'] + threads: 4 resources: mem_mb=1024, wrapper: @@ -217,7 +218,7 @@ rule recalibrate_variants_final_round: }, annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], extra="" - threads: config['threads'] + threads: 1 resources: mem_mb=1024, wrapper: diff --git a/workflow/rules/hlatyping.smk b/workflow/rules/hlatyping.smk index 32765cb..41fa1a3 100644 --- a/workflow/rules/hlatyping.smk +++ b/workflow/rules/hlatyping.smk @@ -1,8 +1,8 @@ ####### CLASS I HLA GENOTYPING ########### rule get_hla_panel: output: - dna="resources/hla/hla_ref_dna.fasta", - rna="resources/hla/hla_ref_rna.fasta" + dna="resources/hla/hla_ref_DNA.fa", + rna="resources/hla/hla_ref_RNA.fa" message: "Downloading HLA reference panels" conda: @@ -17,163 +17,104 @@ rule get_hla_panel: rule index_hla_panel: input: - dna="resources/hla/hla_ref_dna.fasta", - rna="resources/hla/hla_ref_rna.fasta" + fa="resources/hla/hla_ref_{type}.fa", output: - dna=multiext("resources/hla/yara/idx/dna", + panel=multiext("resources/hla/yara_index/{type}", ".lf.drp", ".lf.drs", ".lf.drv", ".lf.pst", ".rid.concat", ".rid.limits", ".sa.ind", ".sa.len", ".sa.val", ".txt.concat", ".txt.limits", ".txt.size"), - rna=multiext("resources/hla/yara/idx/rna", - ".lf.drp", ".lf.drs", ".lf.drv", - ".lf.pst", ".rid.concat", ".rid.limits", - ".sa.ind", ".sa.len", ".sa.val", - ".txt.concat", ".txt.limits", ".txt.size") log: - "logs/yara_indexer.log" + "logs/yara_indexer_{type}.log" conda: "../envs/yara.yml" shell: - """ - yara_indexer -o resources/hla/yara/idx/dna {input.dna} > {log} - yara_indexer -o resources/hla/yara/idx/rna {input.rna} >> {log} - """ + """ + yara_indexer -o resources/hla/yara_index/{wildcards.type} {input.fa} > {log} + """ -# map reads against reference DNA -if config['data']['dnaseq'] is not None: - if config['hlatyping']['mode'] == 'BOTH' or config['hlatyping']['mode'] == 'DNA': - if config['data']['dnaseq_readtype'] == 'SE': - rule get_hla_filtering_input_single_DNA: - input: - get_hla_flt_dna_se, - dna=multiext("resources/hla/yara/idx/dna", - ".lf.drp", ".lf.drs", ".lf.drv", - ".lf.pst", ".rid.concat", ".rid.limits", - ".sa.ind", ".sa.len", ".sa.val", - ".txt.concat", ".txt.limits", ".txt.size"), - output: - "results/{sample}/hla/{group}_flt_DNA.bam" - message: - "Mapping HLA reads against reference" - log: - "logs/{sample}/{group}_hla_reads_filtering_DNA" - conda: - "../envs/yara.yml" - shell: - """ - samtools fastq {input[0]} > results/{wildcards.sample}/hla/{wildcards.group}_DNA.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/dna \ - results/{wildcards.sample}/hla/{wildcards.group}_DNA.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output} - samtools index {output} - """ +######### get input reads ######## +rule get_reads_hlatyping_SE: + input: + get_input_hlatyping_SE + output: + "results/{sample}/hla/reads/{group}_{type}_SE.fq" + message: + "Retrieve reads for HLA genotyping by converting the alignments 
({wildcards.type}) of group:{wildcards.group} from sample:{wildcards.sample} to reads in FASTQ format" + log: + "logs/{sample}/hla/get_hla_input_reads_{group}_{type}.log" + conda: + "../envs/samtools.yml" + shell: + """ + samtools fastq {input} > {output} + """ -if config['data']['dnaseq'] is not None: - if config['hlatyping']['mode'] == 'BOTH' or config['hlatyping']['mode'] == 'DNA': - if config['data']['dnaseq_readtype'] == 'PE': - rule get_hla_filtering_input_paired_DNA: - input: - unpack(get_hla_flt_dna_pe), - dna=multiext("resources/hla/yara/idx/dna", - ".lf.drp", ".lf.drs", ".lf.drv", - ".lf.pst", ".rid.concat", ".rid.limits", - ".sa.ind", ".sa.len", ".sa.val", - ".txt.concat", ".txt.limits", ".txt.size"), - output: - fwd="results/{sample}/hla/{group}_R1_flt_DNA.bam", - rev="results/{sample}/hla/{group}_R2_flt_DNA.bam", - message: - "Mapping HLA reads against reference" - log: - "logs/{sample}/{group}_hla_reads_filtering_dna" - conda: - "../envs/yara.yml" - threads: config['threads'] - shell: - """ - samtools fastq {input.r1} > results/{wildcards.sample}/hla/{wildcards.group}_flt_R1_DNA.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/dna \ - results/{wildcards.sample}/hla/{wildcards.group}_flt_R1_DNA.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output.fwd} - samtools index {output.fwd} - samtools fastq {input.r2} > results/{wildcards.sample}/hla/{wildcards.group}_flt_R2_DNA.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/dna \ - results/{wildcards.sample}/hla/{wildcards.group}_flt_R2_DNA.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output.rev} - samtools index {output.rev} - """ +rule get_reads_hlatyping_PE: + input: + unpack(get_input_hlatyping_PE) + output: + fwd="results/{sample}/hla/reads/{group}_{type}_PE_{readtype}.fq", + message: + "Retrieve paired-end reads ({wildcards.type}) for HLA genotyping - sample:{wildcards.sample} group:{wildcards.group}" + log: + "logs/{sample}/hla/get_reads_hlatyping_PE_{group}_{type}_{readtype}.log" + conda: + "../envs/samtools.yml" + shell: + """ + samtools fastq {input.fwd} > {output.fwd} + """ -# map reads against reference RNA -if config['data']['rnaseq'] is not None: - if config['hlatyping']['mode'] == 'BOTH' or config['hlatyping']['mode'] == 'RNA': - if config['data']['rnaseq_readtype'] == 'SE': - rule get_hla_filtering_input_single_RNA: - input: - get_hla_flt_rna_se, - dna=multiext("resources/hla/yara/idx/rna", - ".lf.drp", ".lf.drs", ".lf.drv", - ".lf.pst", ".rid.concat", ".rid.limits", - ".sa.ind", ".sa.len", ".sa.val", - ".txt.concat", ".txt.limits", ".txt.size"), - output: - "results/{sample}/hla/{group}_flt_RNA.bam" - message: - "Mapping HLA reads against reference" - log: - "logs/{sample}/{group}_hla_reads_filtering_rna.log" - threads: config['threads'] - conda: - "../envs/yara.yml" - shell: - """ - samtools fastq {input[0]} > results/{wildcards.sample}/hla/{wildcards.group}.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/rna \ - results/{wildcards.sample}/hla/{wildcards.group}.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output} - samtools index {output} - """ +######### single-end reads ######### +rule filter_reads_mhcI_SE: + input: + reads="results/{sample}/hla/reads/{group}_{type}_SE.fq", + panel=multiext("resources/hla/yara_index/{type}", + ".lf.drp", ".lf.drs", ".lf.drv", + ".lf.pst", ".rid.concat", ".rid.limits", + ".sa.ind", ".sa.len", ".sa.val", + ".txt.concat", ".txt.limits", ".txt.size"), + output: + 
"results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE.bam" + message: + "Filter the {wildcards.type} reads of group:{wildcards.group} of sample:{wildcards.sample} against the HLA panel" + log: + "logs/{sample}/{group}_hla_reads_filtering_{type}" + threads: + config["threads"] + conda: + "../envs/yara.yml" + shell: + """ + if [ "{wildcards.type}" == "DNA" ]; then + yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.type} \ + {input.reads} | samtools view -h -F 4 -b1 - | samtools sort - -o {output} > {log} + elif [ "{wildcards.type}" == "RNA" ]; then + yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.type} \ + {input.reads} | samtools view -h -F 4 -b1 - | samtools sort - -o {output} > {log} + fi + """ -if config['data']['rnaseq'] is not None: - if config['hlatyping']['mode'] == 'BOTH' or config['hlatyping']['mode'] == 'RNA': - if config['data']['rnaseq_readtype'] == 'PE': - rule get_hla_filtering_input_paired_RNA: - input: - unpack(get_hla_flt_rna_pe), - dna=multiext("resources/hla/yara/idx/rna", - ".lf.drp", ".lf.drs", ".lf.drv", - ".lf.pst", ".rid.concat", ".rid.limits", - ".sa.ind", ".sa.len", ".sa.val", - ".txt.concat", ".txt.limits", ".txt.size"), - output: - fwd="results/{sample}/hla/{group}_R1_flt_RNA.bam", - rev="results/{sample}/hla/{group}_R2_flt_RNA.bam", - message: - "Mapping HLA reads against reference" - log: - "logs/{sample}/{group}_hla_reads_filtering_RNA" - conda: - "../envs/yara.yml" - threads: config['threads'] - shell: - """ - samtools fastq {input.r1} > results/{wildcards.sample}/hla/{wildcards.group}_r1.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/rna \ - results/{wildcards.sample}/hla/{wildcards.group}_r1.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output.fwd} - samtools index -m1g {output.fwd} - samtools fastq {input.r2} > results/{wildcards.sample}/hla/{wildcards.group}_r2.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/rna \ - results/{wildcards.sample}/hla/{wildcards.group}_r2.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output.rev} - samtools index -m1g {output.rev} - """ +rule index_reads_mhcI_SE: + input: + "results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE.bam" + output: + "results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE.bam.bai" + log: + "logs/{sample}/{group}_hla_reads_filtering_{type}_index" + params: + extra="", # optional params string + threads: 4 # This value - 1 will be sent to -@ + wrapper: + "v2.3.0/bio/samtools/index" -checkpoint split_bam_single: +checkpoint split_reads_mhcI_SE: input: - "results/{sample}/hla/{group}_flt_{type}.bam", + reads="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE.bam", + index="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE.bam.bai" output: - directory("results/{sample}/hla/{group}_flt_{type}_splitted"), + directory("results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE/") message: "Splitting filtered BAM files for HLA typing" log: @@ -183,21 +124,20 @@ checkpoint split_bam_single: threads: 1 shell: """ - mkdir -p results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_splitted/ + mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.group}_{wildcards.type}_flt_SE/ gatk SplitSamByNumberOfReads \ - -I {input[0]} \ + -I {input.reads} \ --OUTPUT {output} \ --OUT_PREFIX R \ --SPLIT_TO_N_READS 100000 """ - -rule hla_genotyping_single: +rule hlatyping_mhcI_SE: input: - "results/{sample}/hla/{group}_flt_{type}_splitted/R_{no}.bam", + 
reads="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE/R_{no}.bam", output: - pdf="results/{sample}/hla/{group}_flt_{type}_typed/{no}_coverage_plot.pdf", - tsv="results/{sample}/hla/{group}_flt_{type}_typed/{no}_result.tsv" + pdf="results/{sample}/hla/mhc-I/genotyping/{group}_{type}_flt_SE/{no}_coverage_plot.pdf", + tsv="results/{sample}/hla/mhc-I/genotyping/{group}_{type}_flt_SE/{no}_result.tsv" message: "HLA typing from splitted BAM files" log: @@ -207,23 +147,23 @@ rule hla_genotyping_single: threads: config['threads'] shell: """ - samtools index {input} - if [ "{wildcards.type}" == "RNA" ]; then - OptiTypePipeline.py --input {input} \ - --outdir results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_typed/ \ - --prefix {wildcards.no} --rna -v > {log} - elif [ "{wildcards.type}" == "DNA" ]; then - OptiTypePipeline.py --input {input} \ - --outdir results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_typed/ \ + samtools index {input.reads} + if [ "{wildcards.type}" == "DNA" ]; then + OptiTypePipeline.py --input {input.reads} \ + --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.type}_flt_SE/ \ --prefix {wildcards.no} --dna -v > {log} + elif [ "{wildcards.type}" == "RNA" ]; then + OptiTypePipeline.py --input {input.reads} \ + --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.type}_flt_SE/ \ + --prefix {wildcards.no} --rna -v > {log} fi """ -rule combine_genotyping_single: +rule combine_mhcI_SE: input: - aggregate_genotyping_single + aggregate_mhcI_SE output: - "results/{sample}/hla/alleles/classI_{group}_{type}_SE.tsv" + "results/{sample}/hla/mhc-I/genotyping/{group}_{type}_SE.tsv" log: "logs/{sample}/optitype/{group}_{type}_call.log" conda: @@ -231,16 +171,55 @@ rule combine_genotyping_single: threads: 1 shell: """ - python3 workflow/scripts/merge_classI_alleles.py \ + python3 workflow/scripts/combine_optitype_results.py \ '{input}' {output} - """ + """ + +############# paired-end reads ########### +rule filter_reads_mhcI_PE: + input: + reads="results/{sample}/hla/reads/{group}_{type}_PE_{readtype}.fq", + panel=multiext("resources/hla/yara_index/{type}", + ".lf.drp", ".lf.drs", ".lf.drv", + ".lf.pst", ".rid.concat", ".rid.limits", + ".sa.ind", ".sa.len", ".sa.val", + ".txt.concat", ".txt.limits", ".txt.size"), + output: + reads="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_{readtype}.bam", + message: + "Filter the {wildcards.type} reads of group:{wildcards.group} of sample:{wildcards.sample} against the HLA panel" + log: + "logs/{sample}/{group}_hla_reads_filtering_{type}_{readtype}.log" + conda: + "../envs/yara.yml" + threads: config['threads'] + shell: + """ + yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.type} \ + {input.reads} | samtools view -h -F 4 -b1 - | samtools sort -O bam - -o {output.reads} > {log} + """ + +rule index_reads_mhcI_PE: + input: + "results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_{readtype}.bam" + output: + "results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_{readtype}.bam.bai" + log: + "logs/{sample}/{group}_hla_reads_filtering_{type}_index_{readtype}" + params: + extra="", # optional params string + threads: 4 # This value - 1 will be sent to -@ + wrapper: + "v2.3.0/bio/samtools/index" -checkpoint split_bam_paired: +checkpoint split_reads_mhcI_PE: input: - fwd="results/{sample}/hla/{group}_R1_flt_{type}.bam", - rev="results/{sample}/hla/{group}_R2_flt_{type}.bam" + 
fwd="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_R1.bam", + fwd_idx="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_R1.bam.bai", + rev="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_R2.bam", + rev_idx="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_R2.bam.bai" output: - directory("results/{sample}/hla/{group}_flt_{type}_splitted"), + directory("results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE/") message: "Splitting filtered BAM files for HLA typing" log: @@ -250,12 +229,13 @@ checkpoint split_bam_paired: threads: 1 shell: """ - mkdir -p results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_splitted/ + mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.group}_{wildcards.type}_flt_PE/ gatk SplitSamByNumberOfReads \ -I {input.fwd} \ --OUTPUT {output} \ --OUT_PREFIX R1 \ --SPLIT_TO_N_READS 100000 + gatk SplitSamByNumberOfReads \ -I {input.rev} \ --OUTPUT {output} \ @@ -263,14 +243,13 @@ checkpoint split_bam_paired: --SPLIT_TO_N_READS 100000 """ - -rule hla_genotyping_paired: +rule hlatyping_mhcI_PE: input: - fwd="results/{sample}/hla/{group}_flt_{type}_splitted/R1_{no}.bam", - rev="results/{sample}/hla/{group}_flt_{type}_splitted/R2_{no}.bam" + fwd="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE/R1_{no}.bam", + rev="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE/R2_{no}.bam", output: - pdf="results/{sample}/hla/{group}_flt_{type}_typed/{no}_coverage_plot.pdf", - tsv="results/{sample}/hla/{group}_flt_{type}_typed/{no}_result.tsv" + pdf="results/{sample}/hla/mhc-I/genotyping/{group}_{type}_flt_PE/{no}_coverage_plot.pdf", + tsv="results/{sample}/hla/mhc-I/genotyping/{group}_{type}_flt_PE/{no}_result.tsv" message: "HLA typing from splitted BAM files" log: @@ -282,36 +261,38 @@ rule hla_genotyping_paired: """ samtools index {input.fwd} samtools index {input.rev} - if [ "{wildcards.type}" == "RNA" ]; then - OptiTypePipeline.py --input {input.fwd} {input.rev} \ - --outdir results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_typed/ \ - --prefix {wildcards.no} --rna -v > {log} - elif [ "{wildcards.type}" == "DNA" ]; then + if [ "{wildcards.type}" == "DNA" ]; then OptiTypePipeline.py --input {input.fwd} {input.rev} \ - --outdir results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_typed/ \ + --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.type}_flt_PE/ \ --prefix {wildcards.no} --dna -v > {log} + elif [ "{wildcards.type}" == "RNA" ]; then + OptiTypePipeline.py --input {input.fwd} {input.rev} \ + --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.type}_flt_PE/ \ + --prefix {wildcards.no} --rna -v > {log} fi """ -rule combine_genotyping_paired: +rule combine_mhcI_PE: input: - aggregate_genotyping_paired + aggregate_mhcI_PE output: - "results/{sample}/hla/alleles/classI_{group}_{type}_PE.tsv" + "results/{sample}/hla/mhc-I/genotyping/{group}_{type}_PE.tsv" log: "logs/{sample}/optitype/{group}_{type}_call.log" + conda: + "../envs/basic.yml" + threads: 1 shell: """ - python3 workflow/scripts/merge_classI_alleles.py \ + python3 workflow/scripts/combine_optitype_results.py \ '{input}' {output} - """ - + """ -rule merge_allels: +rule merge_mhcI_allels: input: - get_alleles + get_mhcI_alleles output: - "results/{sample}/hla/classI_alleles.tsv", + "results/{sample}/hla/mhc-I.tsv", message: "Merging HLA alleles from different sources" log: @@ -321,27 +302,103 @@ rule merge_allels: threads: 1 shell: """ - cat {input} | sort | 
uniq > {output} + python workflow/scripts/merge_mhcI_alleles.py \ + '{input}' {output} """ + +######### MHC-II HLA GENOTYPING ########### +rule filter_reads_mhcII_PE: + input: + sample=["results/{sample}/hla/reads/{group}_{type}_PE_R1.fq", "results/{sample}/hla/reads/{group}_{type}_PE_R2.fq"], + idx=multiext( + "resources/hla/bowtie2_index", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ), + output: + "results/{sample}/hla/mhc-II/reads/{group}_{type}_hla.bam" + log: + "logs/bowtie2/{sample}/{group}_{type}.log", + params: + extra="", # optional parameters + threads: config['threads'] # Use at least two threads + wrapper: + "v2.11.1/bio/bowtie2/align" -####### CLASS II HLA GENOTYPING ########### -#rule classII_get_reads_DNA_SE: +rule bam2fastq_reads_mhcII: + input: + "results/{sample}/hla/mhc-II/reads/{group}_{type}_hla.bam" + output: + fwd="results/{sample}/hla/mhc-II/reads/{group}_{type}_hla_R1.fastq", + rev="results/{sample}/hla/mhc-II/reads/{group}_{type}_hla_R2.fastq", + log: + "logs/{sample}/{group}_{type}_hla_sam.log" + conda: + "../envs/samtools.yml" + threads: 1 + shell: + """ + samtools fastq -F 4 {input} -1 {output.fwd} -2 {output.rev} -s /dev/null + """ + +rule hlatyping_mhcII: + input: + fwd="results/{sample}/hla/mhc-II/reads/{group}_{type}_hla_R1.fastq", + rev="results/{sample}/hla/mhc-II/reads/{group}_{type}_hla_R2.fastq" + output: + "results/{sample}/hla/mhc-II/genotyping/{group}_{type}/result/{group}_{type}_final.result.txt" + log: + "logs/{sample}/hla/{group}_{type}_hlahd.log" + conda: + "../envs/hlahd.yml" + threads: config['threads'] + shell: + """ + hlahd.sh \ + -t {threads} \ + -m 100 \ + -c 0.95 \ + -f {config[hlatyping][freqdata]} \ + {input.fwd} {input.rev} \ + {config[hlatyping][split]} \ + {config[hlatyping][dict]} \ + {wildcards.group}_{wildcards.type} \ + results/{wildcards.sample}/hla/mhc-II/genotyping/ + """ + +#rule merge_mhcII_allels: #input: - #get_hla_flt_dna_se + #get_mhcII_alleles #output: - #"results/{sample}/hla/classII/{group}_DNA.fastq" + #"results/{sample}/hla/mhc-I.tsv", #message: - #"Extracting DNA reads for HLA classII typing" + #"Merging HLA alleles from different sources" #log: - #"logs/{sample}/optitype/{group}_DNA_call.log" + #"logs/{sample}/optitype/merge_classI_alleles.log" #conda: #"../envs/basic.yml" #threads: 1 #shell: #""" - + #python workflow/scripts/merge_mhcI_alleles.py \ + #'{input}' {output} #""" -#def get_hla_flt_dna_se(wildcards): + + + + + + + + + + + + diff --git a/workflow/rules/neoantigen.smk b/workflow/rules/neoantigen.smk index fc04f7b..5d875c0 100644 --- a/workflow/rules/neoantigen.smk +++ b/workflow/rules/neoantigen.smk @@ -75,7 +75,7 @@ rule priorization: pred_aff="workflow/scripts/mhc_i/", pred_imm="workflow/scripts/immunogenicity/", peptides="results/{sample}/priorization/peptides.tsv", - alleles="results/{sample}/hla/classI_alleles.tsv", + alleles="results/{sample}/hla/mhc-I.tsv", output: "results/{sample}/priorization/neoantigens.tsv" message: diff --git a/workflow/rules/preproc.smk b/workflow/rules/preproc.smk index e2f471e..849372e 100644 --- a/workflow/rules/preproc.smk +++ b/workflow/rules/preproc.smk @@ -36,8 +36,8 @@ rule fastqc_forward: input: get_qc_input_fwd output: - html="results/{sample}/{seqtype}/qualitycontrol/{group}_r1_fastqc_raw.html", - zip="results/{sample}/{seqtype}/qualitycontrol/{group}_r1_fastqc_raw.zip" + html="results/{sample}/{seqtype}/qualitycontrol/{group}_R1_fastqc_raw.html", + zip="results/{sample}/{seqtype}/qualitycontrol/{group}_R1_fastqc_raw.zip" 
params: extra = "" log: @@ -52,8 +52,8 @@ rule fastqc_reverse: input: get_qc_input_rev output: - html="results/{sample}/{seqtype}/qualitycontrol/{group}_r2_fastqc_raw.html", - zip="results/{sample}/{seqtype}/qualitycontrol/{group}_r2_fastqc_raw.zip" + html="results/{sample}/{seqtype}/qualitycontrol/{group}_R2_fastqc_raw.html", + zip="results/{sample}/{seqtype}/qualitycontrol/{group}_R2_fastqc_raw.zip" params: extra = "--quiet" log: @@ -67,13 +67,13 @@ rule fastqc_reverse: rule preproc_paired_end: input: unpack(get_preproc_input), - qc1="results/{sample}/{seqtype}/qualitycontrol/{group}_r1_fastqc_raw.html", - qc2="results/{sample}/{seqtype}/qualitycontrol/{group}_r2_fastqc_raw.html" + qc1="results/{sample}/{seqtype}/qualitycontrol/{group}_R1_fastqc_raw.html", + qc2="results/{sample}/{seqtype}/qualitycontrol/{group}_R2_fastqc_raw.html" output: - trimmed=["results/{sample}/{seqtype}/reads/{group}_r1_preproc.fq.gz", - "results/{sample}/{seqtype}/reads/{group}_r2_preproc.fq.gz"], - unpaired1="results/{sample}/{seqtype}/reads/{group}_r1_preproc_unpaired.fq.gz", - unpaired2="results/{sample}/{seqtype}/reads/{group}_r2_preproc_unpaired.fq.gz", + trimmed=["results/{sample}/{seqtype}/reads/{group}_R1_preproc.fq.gz", + "results/{sample}/{seqtype}/reads/{group}_R2_preproc.fq.gz"], + unpaired1="results/{sample}/{seqtype}/reads/{group}_R1_preproc_unpaired.fq.gz", + unpaired2="results/{sample}/{seqtype}/reads/{group}_R2_preproc_unpaired.fq.gz", failed="results/{sample}/{seqtype}/reads/{group}_preproc_failed.fq.gz", html="results/{sample}/{seqtype}/reads/{group}_preproc_report.html", json="results/{sample}/{seqtype}/reads/{group}_preproc_report.json", @@ -96,10 +96,10 @@ rule preproc_paired_end: # fastqc after pre-processing (forward) rule fastqc_forward_after: input: - "results/{sample}/{seqtype}/reads/{group}_r1_preproc.fq.gz" + "results/{sample}/{seqtype}/reads/{group}_R1_preproc.fq.gz" output: - html="results/{sample}/{seqtype}/qualitycontrol/{group}_r1_fastqc.html", - zip="results/{sample}/{seqtype}/qualitycontrol/{group}_r1_fastqc.zip" + html="results/{sample}/{seqtype}/qualitycontrol/{group}_R1_fastqc.html", + zip="results/{sample}/{seqtype}/qualitycontrol/{group}_R1_fastqc.zip" params: extra = "" log: @@ -113,10 +113,10 @@ rule fastqc_forward_after: # fastqc after pre-processing (reverse) rule fastqc_reverse_after: input: - "results/{sample}/{seqtype}/reads/{group}_r2_preproc.fq.gz" + "results/{sample}/{seqtype}/reads/{group}_R2_preproc.fq.gz" output: - html="results/{sample}/{seqtype}/qualitycontrol/{group}_r2_fastqc.html", - zip="results/{sample}/{seqtype}/qualitycontrol/{group}_r2_fastqc.zip" + html="results/{sample}/{seqtype}/qualitycontrol/{group}_R2_fastqc.html", + zip="results/{sample}/{seqtype}/qualitycontrol/{group}_R2_fastqc.zip" params: extra = "--quiet" log: diff --git a/workflow/scripts/combine_vcf.py b/workflow/scripts/combine_vcf.py deleted file mode 100644 index 21d21b4..0000000 --- a/workflow/scripts/combine_vcf.py +++ /dev/null @@ -1,51 +0,0 @@ -import vcfpy -import sys -from pathlib import Path - -''' -This script combines multiple vcfs into one vcf. 
It adds a new INFO field for the group name - -Usage: -python combine_vcf.py '' -''' - -def main(): - vcfs = sys.argv[1].split(' ') - # iterate through all vcfs - for i,v in enumerate(vcfs): - # extract the replicate (e.g., {rep1}_*.vcf) - if '_sliprem.vcf' in str(Path(v).name): - rpl = Path(v).name.split('_sliprem.vcf')[0] - elif '_somatic.short.indels.vcf' in str(Path(v).name): - rpl = Path(v).name.split('_somatic.short.indels.vcf')[0] - elif '_somatic.snvs.vcf' in str(Path(v).name): - rpl = Path(v).name.split('_somatic.snvs.vcf')[0] - elif '_fusions.vcf' in str(Path(v).name): - rpl = Path(v).name.split('_fusions.vcf')[0] - elif '_exitron.vcf' in str(Path(v).name): - rpl = Path(v).name.split('_exitron.vcf')[0] - - reader = vcfpy.Reader.from_path(v) - reader.header.add_info_line( - vcfpy.OrderedDict( - [('ID', 'GRP'), - ('Number', '1'), - ('Type', 'String'), - ('Description', 'Name of the group')])) - reader.header.add_info_line( - vcfpy.OrderedDict( - [('ID', 'SRC'), - ('Number', '1'), - ('Type', 'String'), - ('Description', 'Source of the variant')])) - - writer = vcfpy.Writer.from_path(sys.argv[3], reader.header) - - # iterate through all records - for record in reader: - print(record) - record.INFO['GRP'] = rpl - record.INFO['SRC'] = sys.argv[2] - writer.write_record(record) - -main() diff --git a/workflow/scripts/exitron2vcf.py b/workflow/scripts/exitron2vcf.py index cccea60..674048d 100644 --- a/workflow/scripts/exitron2vcf.py +++ b/workflow/scripts/exitron2vcf.py @@ -111,6 +111,7 @@ def main(): [('SVTYPE', 'DEL'), ('END', end-1), ('DP', dp), + ('AO', ao), ('STRAND', strand), ('SplicedSite', spliced_site), ('GeneName', gene_name), diff --git a/workflow/scripts/merge_classI_alleles.py b/workflow/scripts/merge_classI_alleles.py deleted file mode 100644 index b534074..0000000 --- a/workflow/scripts/merge_classI_alleles.py +++ /dev/null @@ -1,40 +0,0 @@ -import sys - -def main(): - alleles = [] - - # load alleles - allele_list = load_alleles('workflow/scripts/valid_alleles/netmhcpan.txt') - - #print(allele_list) - - for alfile in sys.argv[1].split(' '): - with open(alfile, 'r') as f: - next(f) - for line in f: - line_list = line.rstrip().split('\t') - for i in range(1,7): - # modify header - modname = 'HLA-' + line_list[i] - if modname in allele_list: - if modname not in alleles: - alleles.append(modname) - # remove duplicates - alleles = list(set(alleles)) - - - output = open(sys.argv[2], 'w') - for i in alleles: - output.write(i + '\n') - output.close() - - -def load_alleles(alleles_file): - alleles = [] - with open(alleles_file, 'r') as f: - for line in f: - alleles.append(line.rstrip()) - return alleles - - -main() diff --git a/workflow/scripts/priorization/__pycache__/prediction.cpython-37.pyc b/workflow/scripts/priorization/__pycache__/prediction.cpython-37.pyc new file mode 100644 index 0000000..a7330a6 Binary files /dev/null and b/workflow/scripts/priorization/__pycache__/prediction.cpython-37.pyc differ diff --git a/workflow/scripts/priorization/__pycache__/reference.cpython-311.pyc b/workflow/scripts/priorization/__pycache__/reference.cpython-311.pyc new file mode 100644 index 0000000..93b0531 Binary files /dev/null and b/workflow/scripts/priorization/__pycache__/reference.cpython-311.pyc differ diff --git a/workflow/scripts/priorization/__pycache__/reference.cpython-37.pyc b/workflow/scripts/priorization/__pycache__/reference.cpython-37.pyc new file mode 100644 index 0000000..36d9dfe Binary files /dev/null and 
b/workflow/scripts/priorization/__pycache__/reference.cpython-37.pyc differ
diff --git a/workflow/scripts/priorization/__pycache__/variants.cpython-37.pyc b/workflow/scripts/priorization/__pycache__/variants.cpython-37.pyc
new file mode 100644
index 0000000..cf8b0f1
Binary files /dev/null and b/workflow/scripts/priorization/__pycache__/variants.cpython-37.pyc differ
diff --git a/workflow/scripts/priorization/compile.py b/workflow/scripts/priorization/compile.py
new file mode 100644
index 0000000..2f0855b
--- /dev/null
+++ b/workflow/scripts/priorization/compile.py
@@ -0,0 +1,61 @@
+import configargparse
+import reference
+import variants
+import prediction
+
+
+def main():
+    options = parse_arguments()
+
+    proteome = reference.Proteome(options.proteome)
+    annotation = reference.Annotations(options.anno)
+
+    variant_effects = variants.VariantEffects(options.output_dir)
+    variant_effects.print_header()
+
+    if options.fusion is not None:
+        fusions = variants.Fusions(options.fusion,
+                                   options.confidence,
+                                   proteome.proteome,
+                                   annotation.anno,
+                                   variant_effects)
+
+    if options.input is not None:
+        other_variants = variants.Variants(options.input,
+                                           variant_effects)
+
+    variant_effects.close_file()
+
+    # predict binding affinities
+    binding = prediction.BindingAffinities(variant_effects.variantEffectsFile,
+                                           options.mhcI,
+                                           options.mhcII,
+                                           options.mhcI_len,
+                                           options.mhcII_len,
+                                           options.threads)
+
+
+def parse_arguments():
+    p = configargparse.ArgParser()
+
+    # define command line arguments
+    p.add('-i', '--input', required=False, help='input file')
+    # when fusion file is provided (remains optional)
+    p.add('-f', '--fusion', required=False, help='fusion file')
+    p.add('-c', '--confidence', required=False, choices=['high', 'medium', 'low'], help='confidence level of fusion events')
+    p.add("--mhcI", required=False, help='MHC-I allele')
+    p.add("--mhcII", required=False, help='MHC-II allele')
+    p.add("--mhcI_len", required=False, help='MHC-I peptide length')
+    p.add("--mhcII_len", required=False, help='MHC-II peptide length')
+    p.add('-t', '--threads', required=False, default=1, help='number of threads for binding predictions')
+    p.add('-p', '--proteome', required=True, help='proteome file')
+    p.add('-a', '--anno', required=True, help='annotation file')
+    p.add('-o', '--output_dir', required=True, help='output directory')
+
+    return p.parse_args()
+
+
+main()
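# NOTE (editor's illustration, not part of the patch): prediction.py below
# expands the --mhcI_len option string into concrete epitope lengths. A
# self-contained sketch of that parsing, with a hypothetical input:

def parse_epitope_lengths(len_string):
    epilens = []
    for lens in len_string.split(','):
        if '-' in lens:  # a range such as "8-11"
            lower, upper = lens.split('-', 1)
            epilens.extend(range(int(lower), int(upper) + 1))
        else:            # a single length such as "13"
            epilens.append(int(lens))
    return epilens

print(parse_epitope_lengths("8-11,13"))  # [8, 9, 10, 11, 13]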
diff --git a/workflow/scripts/priorization/prediction.py b/workflow/scripts/priorization/prediction.py
new file mode 100644
index 0000000..398938c
--- /dev/null
+++ b/workflow/scripts/priorization/prediction.py
@@ -0,0 +1,189 @@
+import tempfile
+import subprocess
+import os
+import concurrent.futures
+
+class BindingAffinities:
+    def __init__(self, effects, mhcI, mhcII, mhcI_len, mhcII_len, threads=1):
+
+        with tempfile.TemporaryDirectory() as tmp_seqs:
+            if mhcI is not None:
+                mhcI_alleles = {}
+                fh_mhcI_alleles = open(mhcI, 'r')
+                for allele in fh_mhcI_alleles:
+                    line = allele.rstrip().split('\t')
+                    mhcI_alleles[line[1]] = line[0]
+
+                wt_fname = {}
+                mt_fname = {}
+
+                # counters to store number of sequence in fa
+                wt_cnt = {}
+                mt_cnt = {}
+
+                # file handler for wt and mt sequences
+                fh_wt = {}
+                fh_mt = {}
+
+                # extract mhcI lens
+                epilens = []
+                for lens in mhcI_len.split(','):
+                    if '-' in lens:
+                        lower, upper = lens.split('-',1)
+                        epilens.extend(range(int(lower), int(upper)+1))
+                    else:
+                        epilens.append(int(lens))
+
+                for epilen in epilens:
+                    wt_fname[epilen] = os.path.join(tmp_seqs, f'wt_{epilen}.fa')
+                    mt_fname[epilen] = os.path.join(tmp_seqs, f'mt_{epilen}.fa')
+
+                    fh_wt[epilen] = open(wt_fname[epilen], 'w')
+                    fh_mt[epilen] = open(mt_fname[epilen], 'w')
+
+                    # initialise counter
+                    wt_cnt[epilen] = 1
+                    mt_cnt[epilen] = 1
+
+                subseqs = []
+
+                # iterate through the file
+                fh = open(effects, 'r')
+                next(fh)   # skip header
+                for line in fh:
+                    entries = line.rstrip().split('\t')
+
+                    for epilen in epilens:
+                        local_var_start = int(entries[15])
+                        local_var_end = int(entries[16])
+
+                        wt_subseq = entries[12]
+                        mt_subseq = entries[13]
+
+                        # adjust length of seqs (according to epilen)
+                        if local_var_start >= epilen + 1:
+                            left = local_var_start - (epilen + 1)
+                        else:
+                            left = 0
+
+                        if local_var_end + (epilen - 1) <= len(mt_subseq):
+                            right = local_var_end + (epilen - 1)
+                        else:
+                            right = len(mt_subseq) - 1
+
+                        wt_subseq_adj = wt_subseq[left:right]
+                        mt_subseq_adj = mt_subseq[left:right]
+
+                        # determine the epitope sequences
+                        if '$' in wt_subseq_adj:
+                            wt_epitope_seq = wt_subseq_adj.split('$')[0]
+                        else:
+                            wt_epitope_seq = wt_subseq_adj
+                        mt_epitope_seq = mt_subseq_adj
+
+                        if len(wt_epitope_seq) >= epilen+1:
+                            fh_wt[epilen].write(f'>{wt_cnt[epilen]}\n{wt_epitope_seq}\n')
+                            entries.append(wt_cnt[epilen])
+                            wt_cnt[epilen] += 1
+                        else:
+                            entries.append(0)
+
+                        if len(mt_epitope_seq) >= epilen+1:
+                            fh_mt[epilen].write(f'>{mt_cnt[epilen]}\n{mt_epitope_seq}\n')
+                            entries.append(mt_cnt[epilen])
+                            mt_cnt[epilen] += 1
+                        else:
+                            entries.append(0)
+
+                    subseqs.append(entries)
+
+                # it's necessary to close the files
+                fh.close()
+                for epilen in epilens:
+                    fh_wt[epilen].close()
+                    fh_mt[epilen].close()
+
+                wt_affinities = self.collect_binding_affinities(mhcI_alleles, wt_fname, epilens, 'wt', threads)
+                mt_affinities = self.collect_binding_affinities(mhcI_alleles, mt_fname, epilens, 'mt', threads)
+
+
+    @staticmethod
+    def collect_binding_affinities(alleles, fnames, epilens, group, threads):
+        affinities_results = {}
+        number_of_threads = int(threads)
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
+            futures = {}
+            for allele in alleles:
+                for epilen in epilens:
+                    future = executor.submit(BindingAffinities.calc_binding_affinities, fnames[epilen], allele, epilen, group)
+                    futures[future] = epilen
+
+            for future in concurrent.futures.as_completed(futures):
+                epilen = futures[future]
+                binding_affinities = future.result()
+
+                # check epilen already present
+                if epilen not in affinities_results.keys():
+                    affinities_results[epilen] = {}
+
+                for seqnum in binding_affinities.keys():
+                    if seqnum not in affinities_results[epilen].keys():
+                        affinities_results[epilen][seqnum] = binding_affinities[seqnum]
+                    else:
+                        for seq in binding_affinities[seqnum].keys():
+                            if seq not in affinities_results[epilen][seqnum].keys():
+                                affinities_results[epilen][seqnum][seq] = binding_affinities[seqnum][seq]
+
+        return affinities_results
+
+
+    @staticmethod
+    def calc_binding_affinities(fa_file, allele, epilen, group):
+        binding_affinities = {}
+        print(f'calculate binding affinities - allele:{allele} epilen:{epilen} group: {group}')
+        call = ['python',
+                'workflow/scripts/mhc_i/src/predict_binding.py',
+                'netmhcpan',
+                allele,
+                str(epilen),
+                fa_file]
+
+        result = subprocess.run(call,
+                                stdout = subprocess.PIPE,
+                                universal_newlines=True)
+        predictions = result.stdout.rstrip().split('\n')[1:]
+
+        for line in predictions:
+            entries = line.split('\t')
+            if group == 'mt':
+                if float(entries[8]) >= 500:
+                    continue
+
+            # sequence number in results used as key
+            seqnum = int(entries[1])
+            epitope_seq = entries[5]
+            allele = entries[0]
+
+            # start and end in sequence (0-based)
+            start = int(entries[2])-1
+            end = int(entries[3])-1
+            ic50 = float(entries[8])
+            rank = float(entries[9])
+
+            if seqnum not in binding_affinities:
+                binding_affinities[seqnum] = {}
+
+            if epitope_seq not in binding_affinities[seqnum]:
+                binding_affinities[seqnum][epitope_seq] = (allele, start, end, ic50, rank)
+
+        return binding_affinities
diff --git a/workflow/scripts/priorization/reference.py b/workflow/scripts/priorization/reference.py
new file mode 100644
index 0000000..e22d6fb
--- /dev/null
+++ b/workflow/scripts/priorization/reference.py
@@ -0,0 +1,38 @@
+import pyfaidx
+import re
+
+class Proteome:
+    def __init__(self, fastaFile):
+        # parse proteome
+        ref = pyfaidx.Fasta(fastaFile)
+        self.proteome = {}
+        for record in ref:
+            identifier = record.long_name
+            match = re.search(r'transcript:([^.\s]+)', identifier)
+            if match:
+                tid = match.group(1)
+                self.proteome[tid] = str(record)
+
+
+class Annotations:
+    def __init__(self, gtfFile):
+        # parse annotations
+        anno_fh = open(gtfFile, 'r')
+        self.anno = {}
+        for line in anno_fh:
+            # ignore header
+            if not line.startswith('#'):
+                l = line.rstrip().split('\t')
+
+                if l[2] == 'exon':
+                    exon_number = re.search(r'exon_number "([^.\s]+)"', l[8]).group(1)
+                    transcript_id = re.search(r'transcript_id "([^.\s]+)"', l[8]).group(1)
+
+                    if transcript_id not in self.anno:
+                        self.anno[transcript_id] = {}
+
+                    if not exon_number in self.anno[transcript_id]:
+                        self.anno[transcript_id][int(exon_number)] = [l[3],l[4]]
+
+        anno_fh.close()
diff --git a/workflow/scripts/priorization/variants.py b/workflow/scripts/priorization/variants.py
new file mode 100644
index 0000000..149f3a4
--- /dev/null
+++ b/workflow/scripts/priorization/variants.py
@@ -0,0 +1,869 @@
+from pathlib import Path
+import re
+import vcfpy
+import sys
+
+
+def find_all_lowercase(seq):
+    """return the indices of all lowercase characters in seq
+    (arriba writes variant bases in lower case)"""
+    return [i for i, c in enumerate(seq) if c.islower()]
+
+
+class VariantEffectsEntry:
+    def __init__(self, chrom, start, end, gene_id, gene_name,
+                 transcript_id, transcript, cds, cds_breakpoint, source,
+                 group, var_type, wt_subseq, mt_subseq, var_start,
+                 local_var_start, local_var_end, vaf, ao, dp):
+        self.data = {}
+        self.data['chrom'] = chrom
+        self.data['start'] = start
+        self.data['end'] = end
+        self.data['gene_id'] = gene_id
+        self.data['gene_name'] = gene_name
+        self.data['transcript_id'] = transcript_id
+        self.data['transcript'] = transcript
+        self.data['cds'] = cds
+        self.data['cds_breakpoint'] = cds_breakpoint
+        self.data['source'] = source
+        self.data['group'] = group
+        self.data['var_type'] = var_type
+        self.data['wt_subseq'] = wt_subseq
+        self.data['mt_subseq'] = mt_subseq
+        self.data["var_start"] = var_start
+        self.data["local_var_start"] = local_var_start
+        self.data["local_var_end"] = local_var_end
+        self.data['vaf'] = vaf
+        self.data['ao'] = ao
+        self.data['dp'] = dp
+        self.data['NMD'] = "."
+        self.data['PTC_dist_ejc'] = "."
+        self.data['PTC_exon_number'] = "."
+        self.data['NMD_esc_rule'] = "."
+
+
+    # getter & setter
+    def get_nmd(self):
+        return self.data['NMD']
+
+    def set_nmd(self, nmd):
+        self.data['NMD'] = nmd
+
+
+    def scan_nmd(self, anno):
+        # check for NMD in frameshift events
+        if "frameshift" in self.data["var_type"]:
+
+            if self.data["transcript"] == ".":
+                self.data["NMD"] = "unknown"
+                return
+            else:
+                seg2_start = self.data["start"].split('|')[1]
+                # search for premature stop codon in cds
+                stop_pos, stop_coord = self.find_stop_codon(self.data["cds"],
+                                                            self.data["cds_breakpoint"],
+                                                            seg2_start)
+
+                # check if stop codon is PTC
+                if stop_pos != -1:
+                    tid2 = self.data["transcript_id"].split('|')[1]
+
+                    if tid2 == '.':
+                        self.data['NMD'] = "unknown"
+                        return
+
+                    exoninfo_tid2 = anno[tid2]
+                    exons_tid2 = list(exoninfo_tid2.keys())
+
+                    exon_num, dist_ejc = self.annotate_stop_codon(exoninfo_tid2, stop_coord)
+                    if exon_num != -1:
+                        self.data["PTC_exon_number"] = f'{exon_num}'
+                        if max(exons_tid2) > 1:
+                            self.data["PTC_dist_ejc"] = dist_ejc
+
+                        # check if NMD is escaped
+                        nmd_escape = self.check_escape(exoninfo_tid2,
+                                                       stop_coord,
+                                                       exon_num,
+                                                       dist_ejc)
+
+                        if nmd_escape != -1:
+                            self.data["NMD"] = "NMD_escaping_variant"
+                            self.data["NMD_esc_rule"] = nmd_escape
+                        else:
+                            self.data["NMD"] = "NMD_variant"
+
+
+    def find_stop_codon(self, cds, breakpoint, seg2_start):
+        """search for stop codon in coding sequence
+        returns the index of the stop codon and
+        genomic coordinate of the end of the second segment
+        """
+        stop_pos = -1   # position of (detected) stop_codon --> within cds
+        for i in range(0, len(cds), 3):
+            codon = cds[i:i+3]
+            if codon == 'TAA' or codon == 'TAG' or codon == 'TGA':
+                stop_pos = i
+                break
+
+        # stop codon found before the breakpoint (should not occur)
+        if stop_pos <= breakpoint:
+            return -1, -1
+        else:
+            segment2 = stop_pos - breakpoint
+            stop_coord = int(seg2_start) + segment2
+            return stop_pos, stop_coord
+
+
+    def annotate_stop_codon(self, exoninfo, stop_coord):
+        stop_exon = -1
+        for exon_number in exoninfo:
+            exon_start = int(exoninfo[exon_number][0])
+            exon_end = int(exoninfo[exon_number][1])
+            if stop_coord >= exon_start and stop_coord <= exon_end:
+                stop_exon = exon_number
+                dist_ejc = exon_end - stop_coord
+                return stop_exon, dist_ejc
+
+        # no exon information found for stop_codon
+        return -1, -1
+
+
+    def check_escape(self, exoninfo, stop_coord, exon_num, dist_ejc):
+        """
+        1. If the variant is in an intronless transcript, meaning only one exon exist in the transcript.
+        2. The variant falls in the first 100 coding bases in the transcript.
+                 vvv
+        ..ES...EE..I.ES...EE.I.ES....EE.I.ES....EE
+        (ES= exon_start,EE = exon_end, I = intron, v = variant location)
+        3. The variant location falls 50 bases upstream of the penultimate (second to the last) exon.
+                              vvv
+        ES...EE..I.ES...EE.I.ES....EE.I.ES....EE
+        (ES= exon_start,EE = exon_end, I = intron, v = variant location)
+        4. The variant location falls in the last exon of the transcript.
+                                             vvvv
+        ES...EE..I.ES...EE.I.ES....EE.I.ES....EE
+        (ES= exon_start,EE = exon_end, I = intron, v = variant location)
+
+        """
+        exons = list(exoninfo.keys())
+        last_exon = int(max(exons))
+
+        if exon_num == 1:
+            if last_exon == 1:   # there is only one exon
+                return 1
+            elif last_exon > 1:
+                exon_start = int(exoninfo[1][0])
+                # check if PTC is within the first 100 coding bases
+                if stop_coord - exon_start < 100:
+                    return 2
+        if exon_num == last_exon-1:
+            exon_start = int(exoninfo[last_exon-1][0])
+            exon_end = int(exoninfo[last_exon-1][1])
+            if exon_end - stop_coord <= 50:
+                return 3
+        elif exon_num == last_exon:
+            return 4
+
+        return -1
+
+
+    def print_vee(self):
+        for k,v in self.data.items():
+            print(f'{k}: {v}\t', end='')
+        print('\n')
+
+
+class Fusions:
+    def __init__(self, fusionsFile, confidence, proteome, annotation, effects):
+        group = Path(fusionsFile).stem.split('_fusions')[0]
+        confidenceLevel = {"low": ["low", "medium", "high"],
+                           "medium": ["medium", "high"],
+                           "high": ["high"]}
+
+        with open(fusionsFile) as fh:
+            next(fh)   # ignore header
+            for line in fh:
+                cols = line.rstrip().split('\t')
+                if cols[14] in confidenceLevel[confidence]:
+
+                    eventType = cols[8]
+                    readingFrame = cols[15]
+                    consequence = self.determineConsequence(readingFrame, eventType)
+                    if consequence is None:
+                        continue
+
+                    peptideSeq = cols[28]
+                    if peptideSeq == '.':
+                        continue
+
+                    # get genomic coordinates
+                    chrom, start, end = self.determineGenomicCoordinates(cols[4],cols[5])
+                    gene_name = cols[0] + '|' + cols[1]
+                    gene_id = cols[20] + '|' + cols[21]
+
+                    tid1 = cols[22]
+                    tid2 = cols[23]
+
+                    transcript_id = tid1 + '|' + tid2
+
+                    # supporting reads (read identifiers) and depth (sum of coverages)
+                    ao = str(len(cols[29].split(',')))
+                    dp = str(int(cols[12]) + int(cols[13]))
+
+                    transcript = cols[27].upper()
+                    cds, cds_breakpoint = self.determineCodingSequence(transcript)
+
+                    # retrieve polypeptide sequence of the fusion
+                    fusion_pepseq = cols[28]
+                    if fusion_pepseq == '.':
+                        continue
+
+                    # determine (complete) wildtype peptide sequence (which is the wt of first gene)
+                    gene1_wt_seq = '.'
+                    if tid1 in proteome:
+                        gene1_wt_seq = proteome[tid1]
+                    wt_subseq, mt_subseq, var_start, local_var_start, local_var_end = self.determine_peptide_sequence(fusion_pepseq,
+                                                                                                                      gene1_wt_seq)
+                    vee = VariantEffectsEntry(chrom, start, end, gene_id,
+                                              gene_name, transcript_id,
+                                              transcript, cds, cds_breakpoint,
+                                              "fusion", group, consequence,   # var_type
+                                              wt_subseq, mt_subseq, var_start,
+                                              local_var_start, local_var_end,
+                                              ".",   # vaf not available for fusions
+                                              ao, dp)
+
+                    vee.scan_nmd(annotation)
+                    effects.print_entry(vee)
+
+
+    def determineCodingSequence(self, transcript):
+        """ determines the coding sequence """
+        # search for start codon in transcript
+        start_codon = re.search(r'ATG', transcript)
+        if start_codon:   # start codon could have been found
+            cds = transcript[start_codon.start()+3:]   # determine the coding sequence
+            cds_bp = cds.find('|')
+            cds = cds.replace('|', '')
+            return cds, cds_bp
+        else:
+            return '.', '.'
+
+
+    def determineConsequence(self, readingFrame, event):
+        consequence = ""
or readingFrame == "stop_codon": + return None + else: + if readingFrame == "in-frame": + consequence = "inframe_" + elif readingFrame == "out-of-frame": + consequence = "frameshift_" + + evts = event.split('/') + for evt in evts: + if (evt == "read-through" or + evt == "ITD" or + evt == "5'-5'" or + evt == "3'-3'"): + return None + elif evt == "translocation": + consequence += "trs" + return consequence + elif evt == "duplication": + consequence += "dup" + return consequence + elif evt == "deletion": + consequence += "del" + return consequence + elif evt == "inversion": + consequence += "inv" + return consequence + return None + + + def determineGenomicCoordinates(self, breakpoint1, breakpoint2): + chrom1 = breakpoint1.split(':')[0] + chrom2 = breakpoint2.split(':')[0] + + start1 = breakpoint1.split(':')[1] + start2 = breakpoint2.split(':')[1] + + chrom = chrom1 + '|' + chrom2 + start = start1 + '|' + start2 + end = -1 + + return chrom, start, end + + def determine_peptide_sequence(self, fusion_seq, wildtype_seq): + fusion_seg1 = fusion_seq.split('|')[0] + fusion_seg2 = fusion_seq.split('|')[1] + var_start = len(fusion_seg1) + 1 # position of the fusion + + # extract sequences + fusion_seg1_sub = fusion_seg1.split('?')[-1] + fusion_seg2_sub = fusion_seg2.split('?')[0].split('*')[0] + + # mt sequence + if len(fusion_seg1_sub) >= 24: + mt = fusion_seg1_sub[-24:] + local_var_start = 25 # first base of fused transcript + if len(fusion_seg2_sub) >= 24: + mt += fusion_seg2_sub[:24] + else: + mt += fusion_seg2_sub + elif len(fusion_seg1_sub) < 24: + mt = fusion_seg1_sub + local_var_start = len(fusion_seg1_sub) + 1 + if len(fusion_seg2_sub) >= 24: + mt += fusion_seg2_sub[:24] + else: + mt += fusion_seg2_sub + + # arriba returns variants (in lower case) + mt = mt.upper() + + # end of variant is always the length of the sequence + local_var_end = len(mt) + + # wt sequences + wt = '.' + # assume that the fusion sequence is long + + if len(fusion_seg1_sub) > 10: + # search for variants in the first segment + variants = find_all_lowercase(fusion_seg1_sub) + if variants != []: # variants within the segment + # extract sequence that is wildtype (in fusion - before first lowercase) + wt_prefix = fusion_seg1_sub[:variants[0]] + else: + wt_prefix = fusion_seg1_sub + + wt = self.extract_wildtype_sequence(wildtype_seq, wt_prefix, len(mt)) + return wt, mt, var_start, local_var_start, local_var_end + + + # extract the wildtype sequence (using ensemble info and fusion sequence) + def extract_wildtype_sequence(self,wt_seq, wt_prefix, mt_len): + # search for subsequence in wildtype peptide sequences + wt_start = wt_seq.find(wt_prefix) + if wt_start != -1: # wildtype sequence (before first variant) can be found in ensemble + return self.fillup_wildtype_sequence(wt_seq[wt_start:], mt_len) + else: # wildrtype sequence cannot be found in ensemble + if wt_prefix.isupper(): + return self.fillup_wildtype_sequence(wt_prefix, mt_len) + else: # if not all in uppercase wt cannot be determined + return '.' 
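# NOTE (editor's illustration, not part of the patch): a toy run of the
# segment handling in determine_peptide_sequence above. The peptide string
# below is made up; arriba separates the fused genes with '|', marks
# non-template regions with '?', stop codons with '*', and writes variant
# residues in lower case.
fusion_seq = "MKTAYIAKQRQISFVk|SGDWLVNE*XYZ"
seg1, seg2 = fusion_seq.split('|')
seg1_sub = seg1.split('?')[-1]                # "MKTAYIAKQRQISFVk"
seg2_sub = seg2.split('?')[0].split('*')[0]   # "SGDWLVNE"
lowercase_idx = [i for i, c in enumerate(seg1_sub) if c.islower()]
wt_prefix = seg1_sub[:lowercase_idx[0]]       # "MKTAYIAKQRQISFV"
print(seg1_sub, seg2_sub, wt_prefix)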
+
+    # makes sure the wildtype sequence has the same length as the mutant sequence
+    def fillup_wildtype_sequence(self, wt_seq, mt_len):
+        if len(wt_seq) >= mt_len:
+            # wildtype sequence exceeds the mutant length - truncate it
+            wt = wt_seq[:mt_len]
+        else:
+            # pad the wildtype sequence with '$' up to the mutant length
+            wt = wt_seq + '$' * (mt_len - len(wt_seq))
+        return wt
+
+
+class Variants:
+    def __init__(self, variants, effects):
+        input_vcfs = variants.split(' ')
+        for vcf in input_vcfs:
+            vcf_reader = vcfpy.Reader.from_path(vcf)
+            csq_format = self.parse_csq_format(vcf_reader.header)
+
+            transcript_count = {}
+            for entry in vcf_reader:
+                # FILTER (when applicable)
+                if (entry.INFO['SRC'] == 'snv' or
+                        entry.INFO['SRC'] == 'short_indel'):
+                    if 'PASS' not in entry.FILTER:
+                        continue
+
+                # resolve the alleles the way VEP represents them
+                alleles_vep = self.resolve_alleles(entry)
+
+                chromosome = entry.CHROM
+                start = entry.affected_start
+                stop = entry.affected_end
+                ref = entry.REF
+                alts = entry.ALT
+
+                for alt_i, alt_v in enumerate(alts):
+                    csq_allele = alleles_vep[str(alt_v.value)]
+                    csq_fields = self.parse_csq_entries(
+                        entry.INFO["CSQ"],
+                        csq_format,
+                        csq_allele
+                    )
+
+                    for field in csq_fields:
+                        gene_name = field["SYMBOL"]
+                        gene_id = field["Gene"]
+                        transcript_id = field['Feature']
+                        transcript_count[transcript_id] = transcript_count.get(transcript_id, 0) + 1
+
+                        nmd = '.'
+                        csq = self.resolve_consequence(field['Consequence'])
+                        if csq is None:
+                            continue
+                        elif csq == "frameshift":
+                            # only keep frameshifts predicted to escape NMD
+                            if field["NMD"] != 'NMD_escaping_variant':
+                                continue
+                            nmd = "NMD_escaping_variant"
+
+                            if field["DownstreamProtein"] == "":
+                                continue
+
+                        vaf = self.determine_variant_allele_frequency(entry, alt_i)
+                        # determine allele depth (number of reads)
+                        mt_ad = self.determine_allele_depth(entry, alt_i)
+                        dp = self.determine_read_depth(entry)
+
+                        if not field["Amino_acids"]:
+                            continue
+                        aa_change = field["Amino_acids"]
+
+                        if csq == "frameshift":
+                            var_start = self.get_variant_startpos(field['Protein_position'])
+                            wt_seq = field["WildtypeProtein"]
+                            # the mutant sequence is wildtype up to the mutation
+                            # start, followed by the downstream protein
+                            mt_seq = wt_seq[:var_start] + field["DownstreamProtein"]
+
+                            # makes sure that wt and mt are of the same length
+                            wt_subseq, mt_subseq, n_var_start = self.determine_fs_subseq(
+                                wt_seq,
+                                mt_seq,
+                                var_start
+                            )
+
+                            vee = VariantEffectsEntry(entry.CHROM,
+                                                      start,
+                                                      stop,
+                                                      gene_id,
+                                                      gene_name,
+                                                      transcript_id,
+                                                      '.',  # transcript
+                                                      '.',  # cds
+                                                      '.',  # cds_breakpoint
+                                                      entry.INFO["SRC"],
+                                                      entry.INFO["GRP"],
+                                                      csq,
+                                                      wt_subseq,
+                                                      mt_subseq,
+                                                      var_start,
+                                                      n_var_start,
+                                                      len(mt_subseq)-1,
+                                                      vaf,
+                                                      mt_ad,
+                                                      dp)
+
+                            vee.set_nmd(nmd)
+                            effects.print_entry(vee)
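+
+                        # Illustration of determine_fs_subseq (hypothetical
+                        # sequences): keep up to 24 aa upstream of the
+                        # frameshift; the wildtype is trimmed or '$'-padded
+                        # to the mutant length:
+                        #   determine_fs_subseq("MKTAYIAKQR", "MKTAVW", 4)
+                        #   ->  ("MKTAYI", "MKTAVW", 4)
+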
+                        elif (csq == "missense" or
+                              csq == "inframe_ins" or
+                              csq == "inframe_del"):
+
+                            # retrieve the initial variant start and the
+                            # wildtype peptide sequence
+                            var_start = self.get_variant_startpos(field["Protein_position"])
+                            wt_seq = field["WildtypeProtein"]
+
+                            wt_aa_change, mt_aa_change = self.determine_aa_change(aa_change)
+
+                            # scan for stop codons
+                            wt_aa_change, wt_stop_codon = self.scan_stop_codon(wt_aa_change)
+                            mt_aa_change, mt_stop_codon = self.scan_stop_codon(mt_aa_change)
+
+                            mt_seq = wt_seq[:var_start] + mt_aa_change
+                            if not mt_stop_codon:
+                                mt_seq += wt_seq[var_start+len(wt_aa_change):]
+
+                            # generate the subsequences
+                            wt_subseq, mt_subseq, n_var_start, n_var_end = self.determine_subseq(wt_seq,
+                                                                                                 mt_seq,
+                                                                                                 var_start,
+                                                                                                 len(mt_aa_change))
+
+                            vee = VariantEffectsEntry(entry.CHROM,
+                                                      start,
+                                                      stop,
+                                                      gene_id,
+                                                      gene_name,
+                                                      transcript_id,
+                                                      '.',  # transcript
+                                                      '.',  # cds
+                                                      '.',  # cds_breakpoint
+                                                      entry.INFO["SRC"],
+                                                      entry.INFO["GRP"],
+                                                      csq,
+                                                      wt_subseq,
+                                                      mt_subseq,
+                                                      var_start,
+                                                      n_var_start,
+                                                      n_var_end,
+                                                      vaf,
+                                                      mt_ad,
+                                                      dp)
+
+                            effects.print_entry(vee)
+
+    @staticmethod
+    def determine_subseq(wt_seq, mt_seq, var_start, var_len):
+        """ determines the subsequence to consider for prioritization """
+        # left/right are the start/end of the subsequence;
+        # new_var_start is the start of the variant after subsetting
+        if var_start <= 24:
+            left = 0  # start of the subsequence
+            new_var_start = var_start
+        else:
+            left = var_start - 24
+            new_var_start = 24
+
+        if var_start + var_len + 24 >= len(mt_seq):
+            right = len(mt_seq)
+        else:
+            right = var_start + var_len + 24
+        new_var_end = new_var_start + var_len
+
+        # subset the mutant and wildtype sequences
+        mt_subseq = mt_seq[left:right]
+        if right >= len(wt_seq):
+            wt_subseq = wt_seq[left:]
+        else:
+            wt_subseq = wt_seq[left:right]
+
+        # pad the wildtype subsequence with '$' to the mutant length
+        if len(wt_subseq) < len(mt_subseq):
+            wt_subseq += '$' * (len(mt_subseq) - len(wt_subseq))
+
+        return wt_subseq, mt_subseq, new_var_start, new_var_end
+
+    @staticmethod
+    def determine_fs_subseq(wt_seq, mt_seq, start_pos_var):
+        if start_pos_var < 24:
+            subseq_start = 0
+        else:
+            subseq_start = start_pos_var - 24
+
+        wt_subseq = wt_seq[subseq_start:]
+        mt_subseq = mt_seq[subseq_start:]
+
+        # make sure wt and mt subsequences are of the same length
+        if len(wt_subseq) > len(mt_subseq):
+            wt_subseq = wt_subseq[:len(mt_subseq)]
+        else:
+            wt_subseq += '$' * (len(mt_subseq) - len(wt_subseq))
+
+        # determine the (new) variant start within the subsequence
+        new_start_pos_var = start_pos_var - subseq_start
+
+        return wt_subseq, mt_subseq, new_start_pos_var
+
+
+    @staticmethod
+    def scan_stop_codon(sequence):
+        """ checks for stop codons and removes them """
+        stop_codon = False
+        if '*' in sequence:
+            sequence = sequence.split('*')[0]
+            stop_codon = True
+        if 'X' in sequence:
+            sequence = sequence.split('X')[0]
+            stop_codon = True
+
+        return sequence, stop_codon
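+
+    # Illustration of scan_stop_codon (hypothetical inputs; a stop may be
+    # encoded as '*' or 'X'):
+    #   scan_stop_codon("MKT*AY")  ->  ("MKT", True)
+    #   scan_stop_codon("MKTAY")   ->  ("MKTAY", False)
+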
+    @staticmethod
+    def determine_aa_change(aa_change):
+        wt_aa_change, mt_aa_change = aa_change.split('/')
+        if wt_aa_change == '-':  # '-' means no wildtype residue (insertion)
+            wt_aa_change = ''
+        if mt_aa_change == '-':  # '-' means no mutant residue (deletion)
+            mt_aa_change = ''
+
+        return wt_aa_change, mt_aa_change
+
+
+    def get_variant_startpos(self, protein_position):
+        if '-' not in protein_position:  # single position
+            startpos = int(protein_position) - 1
+        else:  # position range
+            if protein_position.split('/')[0] == '-':
+                startpos = int(protein_position.split('-')[0])
+            else:
+                startpos = int(protein_position.split('-')[0]) - 1
+
+        return startpos
+
+
+    def parse_csq_entries(self, csq_entries, csq_format, csq_allele):
+        csq_format_array = csq_format.split("|")
+
+        transcripts = []
+        for entry in csq_entries:
+            values = entry.split('|')
+            transcript = dict(zip(csq_format_array, values))
+            if transcript['Allele'] == csq_allele:
+                transcripts.append(transcript)
+
+        return transcripts
+
+
+    def parse_csq_format(self, vcf_header):
+        if vcf_header.get_info_field_info('CSQ') is None:
+            sys.exit("Failed to extract the format string from the INFO description of the CSQ tag")
+        csq_header = vcf_header.get_info_field_info('CSQ')
+        format_pattern = re.compile("Format: (.*)")
+        match = format_pattern.search(csq_header.description)
+        return match.group(1)
+
+
+    def resolve_alleles(self, entry):
+        alleles = {}
+        for alt in entry.ALT:
+            alt = str(alt.value)
+            if alt[0:1] != entry.REF[0:1]:  # most likely an SNV
+                alleles[alt] = alt
+            elif alt[1:] == "":
+                alleles[alt] = "-"
+            else:
+                alleles[alt] = alt[1:]
+
+        return alleles
+
+
+    def resolve_consequence(self, consequence_string):
+        consequences = {
+            consequence.lower() for consequence in consequence_string.split("&")
+        }
+        if "start_lost" in consequences:
+            consequence = None
+        elif "frameshift_variant" in consequences:
+            consequence = "frameshift"
+        elif "missense_variant" in consequences:
+            consequence = "missense"
+        elif "inframe_insertion" in consequences:
+            consequence = "inframe_ins"
+        elif "inframe_deletion" in consequences:
+            consequence = "inframe_del"
+        else:
+            consequence = None
+
+        return consequence
+
+    def determine_variant_allele_frequency(self, entry, i):
+        vaf = -1
+        if (entry.INFO['SRC'] == 'short_indel' or
+                entry.INFO['SRC'] == 'snv'):
+            vaf = entry.calls[0].data['AF'][i]
+
+        if entry.INFO['SRC'] == 'long_indel':
+            vaf = entry.INFO['AB']
+
+        return vaf
+
+    def determine_read_depth(self, entry):
+        dp = -1
+        if (entry.INFO['SRC'] == 'short_indel' or
+                entry.INFO['SRC'] == 'snv'):
+            dp = entry.calls[0].data['DP']
+
+        if (entry.INFO['SRC'] == 'long_indel' or
+                entry.INFO['SRC'] == 'exitron'):
+            dp = entry.INFO['DP']
+
+        return dp
+
+
+    def determine_allele_depth(self, entry, i):
+        mt_ad = -1
+
+        # GATK: allele depth from the AD format field
+        if (entry.INFO['SRC'] == 'short_indel' or
+                entry.INFO['SRC'] == 'snv'):
+            mt_ad = entry.calls[0].data['AD'][i+1]
+
+        # transIndel: allele depth from the AO info field
+        if (entry.INFO['SRC'] == 'long_indel' or
+                entry.INFO['SRC'] == 'exitron'):
+            mt_ad = entry.INFO['AO']
+
+        return mt_ad
+
+
+# writes the variant effects table used downstream for prioritization
+class VariantEffects:
+    def __init__(self, output_dir):
+        # create the output directory if it does not exist
+        output_dir_path = Path(output_dir)
+        if not output_dir_path.exists():
+            output_dir_path.mkdir()
+
+        self.variantEffectsFile = Path(output_dir_path, "variant_effects.tsv")
+        self.fh = open(self.variantEffectsFile, 'w')
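+
+    # Illustrative usage (hypothetical output directory):
+    #   effects = VariantEffects("results/sampleA/annotation")
+    #   effects.print_header()
+    #   ...                      # one print_entry() call per variant effect
+    #   effects.close_file()
+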
+    # TODO: avoid hard-coding the output columns
+    # column order of the output table (keeps header and rows in sync)
+    FIELDS = ["chrom", "start", "end", "gene_id", "gene_name",
+              "transcript_id", "transcript", "cds", "cds_breakpoint",
+              "source", "group", "var_type", "wt_subseq", "mt_subseq",
+              "var_start", "local_var_start", "local_var_end",
+              "vaf", "ao", "dp", "NMD", "PTC_dist_ejc",
+              "PTC_exon_number", "NMD_esc_rule"]
+
+    def print_header(self):
+        self.fh.write('\t'.join(self.FIELDS) + '\n')
+
+    def print_entry(self, entry):
+        self.fh.write('\t'.join(str(entry.data[f]) for f in self.FIELDS) + '\n')
+
+
+    def close_file(self):
+        self.fh.close()
+
+
+# determine the positions of lower-case characters in a sequence
+def find_all_lowercase(seq):
+    return [i for i, c in enumerate(seq) if c.islower()]
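+
+# Illustration (hypothetical input): find_all_lowercase("AAgTcC") -> [2, 4]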