diff --git a/workflow/Snakefile b/workflow/Snakefile
index 88dbcb8..66291c9 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -4,18 +4,20 @@ from snakemake.utils import min_version
 min_version("6.4.1")

 #### setup #######
-configfile: "config/config.yaml"
+#configfile: "config/config.yaml"

 rule all:
     input:
         expand("results/{sample}/priorization/neoantigens.tsv", sample=config['data']['name'])

 #### load rules ####
+include: "rules/prelim.smk"
 include: "rules/ref.smk"
 include: "rules/common.smk"
 include: "rules/hlatyping.smk"
 include: "rules/preproc.smk"
 include: "rules/align.smk"
+include: "rules/quantification.smk"
 include: "rules/genefusion.smk"
 include: "rules/altsplicing.smk"
 include: "rules/exitron.smk"
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
index 1363e16..188a33f 100644
--- a/workflow/rules/align.smk
+++ b/workflow/rules/align.smk
@@ -114,25 +114,45 @@ if config['data']['rnaseq_filetype'] == '.bam':
             "v1.32.1/bio/samtools/merge"

 # post-processing and realignment
-rule postproc:
+rule rnaseq_postproc_fixmate:
     input:
         "results/{sample}/rnaseq/align/{group}_aligned_STAR.bam"
     output:
-        "results/{sample}/rnaseq/align/{group}_final_STAR.bam"
+        "results/{sample}/rnaseq/align/{group}_fixmate_STAR.bam"
     conda:
         "../envs/samtools.yml"
     log:
-        "logs/{sample}/postproc/rnaseq_{group}.log"
-    threads: config['threads']
+        "logs/{sample}/postproc/rnaseq_{group}_fixmate.log"
+    threads: 4
     params:
         mapq="--min-MQ config['mapq']"
     shell:
         """
-        samtools view -bh -F 4 {params.mapq} {input} -o - \
-        | samtools sort -n -@ {threads} -m1g -O bam - -o - \
-        | samtools fixmate -pcmu -O bam -@ {threads} - - \
-        | samtools sort -@ {threads} -m1g -O bam - -o - \
-        | samtools markdup -r -@ {threads} - {output} > {log} 2>&1
+        samtools view -h -F 4 {params.mapq} {input} -o - \
+        | samtools sort -n -@4 -m4g -O SAM - -o - \
+        | samtools fixmate -pcmu -O bam -@ {threads} - {output} > {log} 2>&1
+        """
+
+# sort and markdup are kept in a separate rule; running them in the same
+# pipe as fixmate caused intermittent core dumps
+rule rnaseq_postproc_markdup:
+    input:
+        bam="results/{sample}/rnaseq/align/{group}_fixmate_STAR.bam",
+        tmp="tmp/"
+    output:
+        "results/{sample}/rnaseq/align/{group}_final_STAR.bam"
+    conda:
+        "../envs/samtools.yml"
+    log:
+        "logs/{sample}/postproc/rnaseq_{group}_markdup.log"
+    threads: 4
+    resources:
+        mem_mb_per_cpu=4000
+    shell:
+        """
+        samtools sort -@4 -m4G -O BAM -T tmp/ {input.bam} \
+            -o tmp/{wildcards.sample}_{wildcards.group}_sorted.bam > {log} 2>&1
+        samtools markdup -r -@4 tmp/{wildcards.sample}_{wildcards.group}_sorted.bam {output} >> {log} 2>&1
+        rm tmp/{wildcards.sample}_{wildcards.group}_sorted.bam
         """

 rule postproc_bam_index:
diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index d2d42ad..de23fec 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -31,18 +31,16 @@ rule download_vep_cache:

 rule index_variants:
     input:
-        "results/{sample}/variants/{vartype}.vcf"
+        "results/{sample}/variants/{vartype}.vcf.gz"
     output:
-        bgzip="results/{sample}/variants/{vartype}.vcf.gz",
-        idx="results/{sample}/variants/{vartype}.vcf.gz.tbi"
+        "results/{sample}/variants/{vartype}.vcf.gz.tbi"
     log:
         "logs/indexvcf/{sample}_{vartype}.log"
     conda:
         "../envs/samtools.yml"
     shell:
         """
-        bgzip {input}
-        tabix {input}.gz
+        tabix {input}
         """

 rule annotate_variants:
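# NOTE (editor's illustration, not part of the patch): index_variants above now
# expects its {vartype}.vcf.gz input to already exist, so every rule that
# produces a variant set must emit a bgzip-compressed VCF itself. A minimal
# sketch of such a producer rule, with hypothetical rule name and paths,
# mirroring the bcftools pattern used by the exitron rules below:
#
#   rule compress_variants:
#       input:
#           "results/{sample}/variants/{vartype}.vcf"
#       output:
#           "results/{sample}/variants/{vartype}.vcf.gz"
#       conda:
#           "../envs/samtools.yml"
#       shell:
#           "bcftools view -O z -o {output} {input}"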
diff --git a/workflow/rules/exitron.smk b/workflow/rules/exitron.smk
index 169aed1..4f17ba8 100644
--- a/workflow/rules/exitron.smk
+++ b/workflow/rules/exitron.smk
@@ -13,7 +13,6 @@ rule download_genome:
         curl -L https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gtf.gz \
         | gzip -d > resources/refs/gencode.v37.annotation.gtf
         """
-
 rule prepare_cds:
     output:
@@ -24,10 +23,16 @@
         "../envs/basic.yml"
     shell:
         """
-        cat resources/refs/gencode.v37.annotation.gtf \
+        cat resources/refs/genome.gtf \
         | awk 'OFS="\\t" {{if ($3=="CDS") {{print $1,$4-1,$5,$10,$16,$7}}}}' \
         | tr -d '";' > {output}
         """

 rule prepare_scanexitron_config:
     output:
@@ -39,17 +44,24 @@
     shell:
         """
         python3 workflow/scripts/prep_scanexitron_config.py \
-        resources/refs/hg38.fa \
-        resources/refs/gencode.v37.annotation.gtf \
+        resources/refs/genome.fasta \
+        resources/refs/genome.gtf \
         resources/refs/CDS.bed \
         {output} > {log}
         """

 rule scanexitron:
     input:
         bam = "results/{sample}/rnaseq/align/{group}_final_STAR.bam",
-        fasta = "resources/refs/hg38.fa",
-        gtf = "resources/refs/gencode.v37.annotation.gtf",
+        idx = "results/{sample}/rnaseq/align/{group}_final_STAR.bam.bai",
+        fasta = "resources/refs/genome.fasta",
+        gtf = "resources/refs/genome.gtf",
         cds = "resources/refs/CDS.bed",
         config="resources/scanexitron_config.ini"
     output:
@@ -84,46 +96,64 @@ rule exitron_to_vcf:
     input:
         "results/{sample}/rnaseq/exitron/{group}.exitron"
     output:
-        "results/{sample}/rnaseq/exitron/{group}_exitron.vcf"
+        "results/{sample}/rnaseq/exitron/{group}_exitrons.vcf"
     log:
         "logs/exitron2vcf_{sample}_{group}.log"
     conda:
         "../envs/scanexitron.yml"
     shell:
         """
-        python workflow/scripts/exitron2vcf2.py \
+        python workflow/scripts/exitron2vcf.py \
         {input} \
         {output} \
-        resources/refs/hg38.fa > {log}
+        resources/refs/genome.fasta > {log}
         """

-rule combine_exitrons:
+rule exitron_augment:
     input:
-        get_exitrons,
+        "results/{sample}/rnaseq/exitron/{group}_exitrons.vcf"
     output:
-        "results/{sample}/rnaseq/exitron/exitrons_combined.vcf"
+        "results/{sample}/rnaseq/exitron/{group}_exitrons_augmented.vcf"
     message:
-        "Combining exitrons on sample:{wildcards.sample}"
+        "Augmenting exitrons on sample:{wildcards.sample} of group:{wildcards.group}"
     log:
-        "logs/{sample}/scanexitron/combine_groups.log"
+        "logs/exitron_augment_{sample}_{group}.log"
     conda:
        "../envs/manipulate_vcf.yml"
     shell:
         """
-        python workflow/scripts/combine_vcf.py '{input}' exitron {output} > {log} 2>&1
+        python workflow/scripts/add_infos_to_vcf.py {input} exitron {output} > {log} 2>&1
         """

-rule sort_exitrons:
+rule sort_exitron:
     input:
-        "results/{sample}/rnaseq/exitron/exitrons_combined.vcf",
+        "results/{sample}/rnaseq/exitron/{group}_exitrons_augmented.vcf"
     output:
-        "results/{sample}/variants/exitrons.vcf"
+        "results/{sample}/rnaseq/exitron/{group}_exitrons.vcf.gz"
+    message:
+        "Sorting and compressing exitrons on sample:{wildcards.sample} of group:{wildcards.group}"
+    log:
+        "logs/exitron_sort_{sample}_{group}.log"
+    conda:
+        "../envs/samtools.yml"
+    shell:
+        """
+        bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1
+        """
+
+rule combine_exitrons:
+    input:
+        get_exitrons
+    output:
+        "results/{sample}/variants/exitrons.vcf.gz"
+    message:
+        "Combining exitrons on sample:{wildcards.sample}"
     log:
-        "logs/{sample}/scanexitron/sort_exitrons.log"
+        "logs/{sample}/exitrons/combine_exitrons.log"
     conda:
         "../envs/samtools.yml"
     shell:
         """
-        bcftools sort {input} -o {output}
+        bcftools concat --naive -O z {input} -o - |
bcftools sort -O z -o {output} > {log} 2>&1 """ diff --git a/workflow/rules/genefusion.smk b/workflow/rules/genefusion.smk index e0d4cdd..401fb38 100644 --- a/workflow/rules/genefusion.smk +++ b/workflow/rules/genefusion.smk @@ -14,7 +14,7 @@ rule arriba: default_blacklist=False, # optional default_known_fusions=True, # optional sv_file="", # optional - extra=f"""-G 'gene_name=gene_name gene_id=gene_id transcript_id=transcript_id feature_exon=exon feature_CDS=CDS' \ + extra=f"""-G 'gene_name=gene_name|gene_id gene_id=gene_id transcript_id=transcript_id feature_exon=exon feature_CDS=CDS' \ -E {config['genefusion']['maxevalue']} \ -S {config['genefusion']['suppreads']} \ -L {config['genefusion']['maxidentity']} \ @@ -30,54 +30,52 @@ rule arriba: wrapper: "v2.6.0/bio/arriba" - #-V {config['genefusion']['maxmismatch']} \ +#rule fusions_to_vcf: + #input: + #"results/{sample}/rnaseq/genefusion/{group}_fusions.tsv", + #output: + #"results/{sample}/rnaseq/genefusion/{group}_fusions.vcf" + #log: + #"logs/{sample}/genefusion/fusion_to_vcf_{group}.log" + #conda: + #"../envs/basic.yml" + #shell: + #""" + #workflow/scripts/convert_fusions_to_vcf.sh \ + #resources/refs/genome.fasta \ + #{input} \ + #{output} > {log} 2>&1 + #""" -rule fusions_to_vcf: - input: - "results/{sample}/rnaseq/genefusion/{group}_fusions.tsv", - output: - "results/{sample}/rnaseq/genefusion/{group}_fusions.vcf" - log: - "logs/{sample}/genefusion/fusion_to_vcf_{group}.log" - conda: - "../envs/basic.yml" - shell: - """ - workflow/scripts/convert_fusions_to_vcf.sh \ - resources/refs/genome.fasta \ - {input} \ - {output} > {log} 2>&1 - """ - -rule combine_fusions: - input: - get_fusions, - output: - "results/{sample}/rnaseq/genefusion/fusions.vcf" - message: - "Combining gene fusion events on sample:{wildcards.sample}" - log: - "logs/{sample}/combine/fusions.log" - conda: - "../envs/manipulate_vcf.yml" - shell: - """ - python workflow/scripts/combine_vcf.py '{input}' fusion {output} > {log} 2>&1 +#rule combine_fusions: + #input: + #get_fusions, + #output: + #"results/{sample}/rnaseq/genefusion/fusions.vcf" + #message: + #"Combining gene fusion events on sample:{wildcards.sample}" + #log: + #"logs/{sample}/combine/fusions.log" + #conda: + #"../envs/manipulate_vcf.yml" + #shell: + #""" + #python workflow/scripts/combine_vcf.py '{input}' fusion {output} > {log} 2>&1 - """ + #""" -rule sort_fusions: - input: - "results/{sample}/rnaseq/genefusion/fusions.vcf" - output: - "results/{sample}/variants/fusions.vcf" - message: - "Sorting gene fusion events on sample:{wildcards.sample}" - log: - "logs/{sample}/genefusion/sort_fusions.log" - conda: - "../envs/samtools.yml" - shell: - """ - bcftools sort {input} -o {output} > {log} 2>&1 - """ +#rule sort_fusions: + #input: + #"results/{sample}/rnaseq/genefusion/fusions.vcf" + #output: + #"results/{sample}/variants/fusions.vcf" + #message: + #"Sorting gene fusion events on sample:{wildcards.sample}" + #log: + #"logs/{sample}/genefusion/sort_fusions.log" + #conda: + #"../envs/samtools.yml" + #shell: + #""" + #bcftools sort {input} -o {output} > {log} 2>&1 + #""" diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk index 402e3dc..d7c4786 100644 --- a/workflow/rules/germline.smk +++ b/workflow/rules/germline.smk @@ -37,6 +37,7 @@ rule detect_variants_htc_first_round: input: # single or list of bam files bam="results/{sample}/{seqtype}/align/{group}_final_BWA.bam", + idx="results/{sample}/{seqtype}/align/{group}_final_BWA.bam.bai", ref="resources/refs/genome.fasta", 
#known="resources/vqsr/dbSNP_b150.vcf.gz" # optional output: @@ -48,7 +49,7 @@ rule detect_variants_htc_first_round: params: extra="", java_opts="", - threads: config['threads'] + threads: 4 resources: mem_mb=1024, wrapper: @@ -82,7 +83,7 @@ rule recalibrate_variants_first_round: }, annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], extra="" - threads: config['threads'] + threads: 4 resources: mem_mb=1024, wrapper: @@ -183,7 +184,7 @@ rule detect_variants_htc_final_round: params: extra="", # optional java_opts="", # optional - threads: config['threads'] + threads: 4 resources: mem_mb=1024, wrapper: @@ -217,7 +218,7 @@ rule recalibrate_variants_final_round: }, annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], extra="" - threads: config['threads'] + threads: 1 resources: mem_mb=1024, wrapper: diff --git a/workflow/rules/hlatyping.smk b/workflow/rules/hlatyping.smk index 32765cb..41fa1a3 100644 --- a/workflow/rules/hlatyping.smk +++ b/workflow/rules/hlatyping.smk @@ -1,8 +1,8 @@ ####### CLASS I HLA GENOTYPING ########### rule get_hla_panel: output: - dna="resources/hla/hla_ref_dna.fasta", - rna="resources/hla/hla_ref_rna.fasta" + dna="resources/hla/hla_ref_DNA.fa", + rna="resources/hla/hla_ref_RNA.fa" message: "Downloading HLA reference panels" conda: @@ -17,163 +17,104 @@ rule get_hla_panel: rule index_hla_panel: input: - dna="resources/hla/hla_ref_dna.fasta", - rna="resources/hla/hla_ref_rna.fasta" + fa="resources/hla/hla_ref_{type}.fa", output: - dna=multiext("resources/hla/yara/idx/dna", + panel=multiext("resources/hla/yara_index/{type}", ".lf.drp", ".lf.drs", ".lf.drv", ".lf.pst", ".rid.concat", ".rid.limits", ".sa.ind", ".sa.len", ".sa.val", ".txt.concat", ".txt.limits", ".txt.size"), - rna=multiext("resources/hla/yara/idx/rna", - ".lf.drp", ".lf.drs", ".lf.drv", - ".lf.pst", ".rid.concat", ".rid.limits", - ".sa.ind", ".sa.len", ".sa.val", - ".txt.concat", ".txt.limits", ".txt.size") log: - "logs/yara_indexer.log" + "logs/yara_indexer_{type}.log" conda: "../envs/yara.yml" shell: - """ - yara_indexer -o resources/hla/yara/idx/dna {input.dna} > {log} - yara_indexer -o resources/hla/yara/idx/rna {input.rna} >> {log} - """ + """ + yara_indexer -o resources/hla/yara_index/{wildcards.type} {input.fa} > {log} + """ -# map reads against reference DNA -if config['data']['dnaseq'] is not None: - if config['hlatyping']['mode'] == 'BOTH' or config['hlatyping']['mode'] == 'DNA': - if config['data']['dnaseq_readtype'] == 'SE': - rule get_hla_filtering_input_single_DNA: - input: - get_hla_flt_dna_se, - dna=multiext("resources/hla/yara/idx/dna", - ".lf.drp", ".lf.drs", ".lf.drv", - ".lf.pst", ".rid.concat", ".rid.limits", - ".sa.ind", ".sa.len", ".sa.val", - ".txt.concat", ".txt.limits", ".txt.size"), - output: - "results/{sample}/hla/{group}_flt_DNA.bam" - message: - "Mapping HLA reads against reference" - log: - "logs/{sample}/{group}_hla_reads_filtering_DNA" - conda: - "../envs/yara.yml" - shell: - """ - samtools fastq {input[0]} > results/{wildcards.sample}/hla/{wildcards.group}_DNA.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/dna \ - results/{wildcards.sample}/hla/{wildcards.group}_DNA.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output} - samtools index {output} - """ +######### get input reads ######## +rule get_reads_hlatyping_SE: + input: + get_input_hlatyping_SE + output: + "results/{sample}/hla/reads/{group}_{type}_SE.fq" + message: + "Retrieve reads for HLA genotyping by converting the alignments 
({wildcards.type}) of group:{wildcards.group} from sample:{wildcards.sample} to reads in FASTQ format" + log: + "logs/{sample}/hla/get_hla_input_reads_{group}_{type}.log" + conda: + "../envs/samtools.yml" + shell: + """ + samtools fastq {input} > {output} + """ -if config['data']['dnaseq'] is not None: - if config['hlatyping']['mode'] == 'BOTH' or config['hlatyping']['mode'] == 'DNA': - if config['data']['dnaseq_readtype'] == 'PE': - rule get_hla_filtering_input_paired_DNA: - input: - unpack(get_hla_flt_dna_pe), - dna=multiext("resources/hla/yara/idx/dna", - ".lf.drp", ".lf.drs", ".lf.drv", - ".lf.pst", ".rid.concat", ".rid.limits", - ".sa.ind", ".sa.len", ".sa.val", - ".txt.concat", ".txt.limits", ".txt.size"), - output: - fwd="results/{sample}/hla/{group}_R1_flt_DNA.bam", - rev="results/{sample}/hla/{group}_R2_flt_DNA.bam", - message: - "Mapping HLA reads against reference" - log: - "logs/{sample}/{group}_hla_reads_filtering_dna" - conda: - "../envs/yara.yml" - threads: config['threads'] - shell: - """ - samtools fastq {input.r1} > results/{wildcards.sample}/hla/{wildcards.group}_flt_R1_DNA.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/dna \ - results/{wildcards.sample}/hla/{wildcards.group}_flt_R1_DNA.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output.fwd} - samtools index {output.fwd} - samtools fastq {input.r2} > results/{wildcards.sample}/hla/{wildcards.group}_flt_R2_DNA.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/dna \ - results/{wildcards.sample}/hla/{wildcards.group}_flt_R2_DNA.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output.rev} - samtools index {output.rev} - """ +rule get_reads_hlatyping_PE: + input: + unpack(get_input_hlatyping_PE) + output: + fwd="results/{sample}/hla/reads/{group}_{type}_PE_{readtype}.fq", + message: + "Retrieve paired-end reads ({wildcards.type}) for HLA genotyping - sample:{wildcards.sample} group:{wildcards.group}" + log: + "logs/{sample}/hla/get_reads_hlatyping_PE_{group}_{type}_{readtype}.log" + conda: + "../envs/samtools.yml" + shell: + """ + samtools fastq {input.fwd} > {output.fwd} + """ -# map reads against reference RNA -if config['data']['rnaseq'] is not None: - if config['hlatyping']['mode'] == 'BOTH' or config['hlatyping']['mode'] == 'RNA': - if config['data']['rnaseq_readtype'] == 'SE': - rule get_hla_filtering_input_single_RNA: - input: - get_hla_flt_rna_se, - dna=multiext("resources/hla/yara/idx/rna", - ".lf.drp", ".lf.drs", ".lf.drv", - ".lf.pst", ".rid.concat", ".rid.limits", - ".sa.ind", ".sa.len", ".sa.val", - ".txt.concat", ".txt.limits", ".txt.size"), - output: - "results/{sample}/hla/{group}_flt_RNA.bam" - message: - "Mapping HLA reads against reference" - log: - "logs/{sample}/{group}_hla_reads_filtering_rna.log" - threads: config['threads'] - conda: - "../envs/yara.yml" - shell: - """ - samtools fastq {input[0]} > results/{wildcards.sample}/hla/{wildcards.group}.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/rna \ - results/{wildcards.sample}/hla/{wildcards.group}.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output} - samtools index {output} - """ +######### single-end reads ######### +rule filter_reads_mhcI_SE: + input: + reads="results/{sample}/hla/reads/{group}_{type}_SE.fq", + panel=multiext("resources/hla/yara_index/{type}", + ".lf.drp", ".lf.drs", ".lf.drv", + ".lf.pst", ".rid.concat", ".rid.limits", + ".sa.ind", ".sa.len", ".sa.val", + ".txt.concat", ".txt.limits", ".txt.size"), + output: + 
"results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE.bam" + message: + "Filter the {wildcards.type} reads of group:{wildcards.group} of sample:{wildcards.sample} against the HLA panel" + log: + "logs/{sample}/{group}_hla_reads_filtering_{type}" + threads: + config["threads"] + conda: + "../envs/yara.yml" + shell: + """ + if [ "{wildcards.type}" == "DNA" ]; then + yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.type} \ + {input.reads} | samtools view -h -F 4 -b1 - | samtools sort - -o {output} > {log} + elif [ "{wildcards.type}" == "RNA" ]; then + yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.type} \ + {input.reads} | samtools view -h -F 4 -b1 - | samtools sort - -o {output} > {log} + fi + """ -if config['data']['rnaseq'] is not None: - if config['hlatyping']['mode'] == 'BOTH' or config['hlatyping']['mode'] == 'RNA': - if config['data']['rnaseq_readtype'] == 'PE': - rule get_hla_filtering_input_paired_RNA: - input: - unpack(get_hla_flt_rna_pe), - dna=multiext("resources/hla/yara/idx/rna", - ".lf.drp", ".lf.drs", ".lf.drv", - ".lf.pst", ".rid.concat", ".rid.limits", - ".sa.ind", ".sa.len", ".sa.val", - ".txt.concat", ".txt.limits", ".txt.size"), - output: - fwd="results/{sample}/hla/{group}_R1_flt_RNA.bam", - rev="results/{sample}/hla/{group}_R2_flt_RNA.bam", - message: - "Mapping HLA reads against reference" - log: - "logs/{sample}/{group}_hla_reads_filtering_RNA" - conda: - "../envs/yara.yml" - threads: config['threads'] - shell: - """ - samtools fastq {input.r1} > results/{wildcards.sample}/hla/{wildcards.group}_r1.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/rna \ - results/{wildcards.sample}/hla/{wildcards.group}_r1.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output.fwd} - samtools index -m1g {output.fwd} - samtools fastq {input.r2} > results/{wildcards.sample}/hla/{wildcards.group}_r2.fastq - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara/idx/rna \ - results/{wildcards.sample}/hla/{wildcards.group}_r2.fastq \ - | samtools view -h -F 4 -b1 - | samtools sort - -o {output.rev} - samtools index -m1g {output.rev} - """ +rule index_reads_mhcI_SE: + input: + "results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE.bam" + output: + "results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE.bam.bai" + log: + "logs/{sample}/{group}_hla_reads_filtering_{type}_index" + params: + extra="", # optional params string + threads: 4 # This value - 1 will be sent to -@ + wrapper: + "v2.3.0/bio/samtools/index" -checkpoint split_bam_single: +checkpoint split_reads_mhcI_SE: input: - "results/{sample}/hla/{group}_flt_{type}.bam", + reads="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE.bam", + index="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE.bam.bai" output: - directory("results/{sample}/hla/{group}_flt_{type}_splitted"), + directory("results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE/") message: "Splitting filtered BAM files for HLA typing" log: @@ -183,21 +124,20 @@ checkpoint split_bam_single: threads: 1 shell: """ - mkdir -p results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_splitted/ + mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.group}_{wildcards.type}_flt_SE/ gatk SplitSamByNumberOfReads \ - -I {input[0]} \ + -I {input.reads} \ --OUTPUT {output} \ --OUT_PREFIX R \ --SPLIT_TO_N_READS 100000 """ - -rule hla_genotyping_single: +rule hlatyping_mhcI_SE: input: - "results/{sample}/hla/{group}_flt_{type}_splitted/R_{no}.bam", + 
reads="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_SE/R_{no}.bam", output: - pdf="results/{sample}/hla/{group}_flt_{type}_typed/{no}_coverage_plot.pdf", - tsv="results/{sample}/hla/{group}_flt_{type}_typed/{no}_result.tsv" + pdf="results/{sample}/hla/mhc-I/genotyping/{group}_{type}_flt_SE/{no}_coverage_plot.pdf", + tsv="results/{sample}/hla/mhc-I/genotyping/{group}_{type}_flt_SE/{no}_result.tsv" message: "HLA typing from splitted BAM files" log: @@ -207,23 +147,23 @@ rule hla_genotyping_single: threads: config['threads'] shell: """ - samtools index {input} - if [ "{wildcards.type}" == "RNA" ]; then - OptiTypePipeline.py --input {input} \ - --outdir results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_typed/ \ - --prefix {wildcards.no} --rna -v > {log} - elif [ "{wildcards.type}" == "DNA" ]; then - OptiTypePipeline.py --input {input} \ - --outdir results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_typed/ \ + samtools index {input.reads} + if [ "{wildcards.type}" == "DNA" ]; then + OptiTypePipeline.py --input {input.reads} \ + --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.type}_flt_SE/ \ --prefix {wildcards.no} --dna -v > {log} + elif [ "{wildcards.type}" == "RNA" ]; then + OptiTypePipeline.py --input {input.reads} \ + --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.type}_flt_SE/ \ + --prefix {wildcards.no} --rna -v > {log} fi """ -rule combine_genotyping_single: +rule combine_mhcI_SE: input: - aggregate_genotyping_single + aggregate_mhcI_SE output: - "results/{sample}/hla/alleles/classI_{group}_{type}_SE.tsv" + "results/{sample}/hla/mhc-I/genotyping/{group}_{type}_SE.tsv" log: "logs/{sample}/optitype/{group}_{type}_call.log" conda: @@ -231,16 +171,55 @@ rule combine_genotyping_single: threads: 1 shell: """ - python3 workflow/scripts/merge_classI_alleles.py \ + python3 workflow/scripts/combine_optitype_results.py \ '{input}' {output} - """ + """ + +############# paired-end reads ########### +rule filter_reads_mhcI_PE: + input: + reads="results/{sample}/hla/reads/{group}_{type}_PE_{readtype}.fq", + panel=multiext("resources/hla/yara_index/{type}", + ".lf.drp", ".lf.drs", ".lf.drv", + ".lf.pst", ".rid.concat", ".rid.limits", + ".sa.ind", ".sa.len", ".sa.val", + ".txt.concat", ".txt.limits", ".txt.size"), + output: + reads="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_{readtype}.bam", + message: + "Filter the {wildcards.type} reads of group:{wildcards.group} of sample:{wildcards.sample} against the HLA panel" + log: + "logs/{sample}/{group}_hla_reads_filtering_{type}_{readtype}.log" + conda: + "../envs/yara.yml" + threads: config['threads'] + shell: + """ + yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.type} \ + {input.reads} | samtools view -h -F 4 -b1 - | samtools sort -O bam - -o {output.reads} > {log} + """ + +rule index_reads_mhcI_PE: + input: + "results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_{readtype}.bam" + output: + "results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_{readtype}.bam.bai" + log: + "logs/{sample}/{group}_hla_reads_filtering_{type}_index_{readtype}" + params: + extra="", # optional params string + threads: 4 # This value - 1 will be sent to -@ + wrapper: + "v2.3.0/bio/samtools/index" -checkpoint split_bam_paired: +checkpoint split_reads_mhcI_PE: input: - fwd="results/{sample}/hla/{group}_R1_flt_{type}.bam", - rev="results/{sample}/hla/{group}_R2_flt_{type}.bam" + 
fwd="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_R1.bam", + fwd_idx="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_R1.bam.bai", + rev="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_R2.bam", + rev_idx="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE_R2.bam.bai" output: - directory("results/{sample}/hla/{group}_flt_{type}_splitted"), + directory("results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE/") message: "Splitting filtered BAM files for HLA typing" log: @@ -250,12 +229,13 @@ checkpoint split_bam_paired: threads: 1 shell: """ - mkdir -p results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_splitted/ + mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.group}_{wildcards.type}_flt_PE/ gatk SplitSamByNumberOfReads \ -I {input.fwd} \ --OUTPUT {output} \ --OUT_PREFIX R1 \ --SPLIT_TO_N_READS 100000 + gatk SplitSamByNumberOfReads \ -I {input.rev} \ --OUTPUT {output} \ @@ -263,14 +243,13 @@ checkpoint split_bam_paired: --SPLIT_TO_N_READS 100000 """ - -rule hla_genotyping_paired: +rule hlatyping_mhcI_PE: input: - fwd="results/{sample}/hla/{group}_flt_{type}_splitted/R1_{no}.bam", - rev="results/{sample}/hla/{group}_flt_{type}_splitted/R2_{no}.bam" + fwd="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE/R1_{no}.bam", + rev="results/{sample}/hla/mhc-I/reads/{group}_{type}_flt_PE/R2_{no}.bam", output: - pdf="results/{sample}/hla/{group}_flt_{type}_typed/{no}_coverage_plot.pdf", - tsv="results/{sample}/hla/{group}_flt_{type}_typed/{no}_result.tsv" + pdf="results/{sample}/hla/mhc-I/genotyping/{group}_{type}_flt_PE/{no}_coverage_plot.pdf", + tsv="results/{sample}/hla/mhc-I/genotyping/{group}_{type}_flt_PE/{no}_result.tsv" message: "HLA typing from splitted BAM files" log: @@ -282,36 +261,38 @@ rule hla_genotyping_paired: """ samtools index {input.fwd} samtools index {input.rev} - if [ "{wildcards.type}" == "RNA" ]; then - OptiTypePipeline.py --input {input.fwd} {input.rev} \ - --outdir results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_typed/ \ - --prefix {wildcards.no} --rna -v > {log} - elif [ "{wildcards.type}" == "DNA" ]; then + if [ "{wildcards.type}" == "DNA" ]; then OptiTypePipeline.py --input {input.fwd} {input.rev} \ - --outdir results/{wildcards.sample}/hla/{wildcards.group}_flt_{wildcards.type}_typed/ \ + --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.type}_flt_PE/ \ --prefix {wildcards.no} --dna -v > {log} + elif [ "{wildcards.type}" == "RNA" ]; then + OptiTypePipeline.py --input {input.fwd} {input.rev} \ + --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.type}_flt_PE/ \ + --prefix {wildcards.no} --rna -v > {log} fi """ -rule combine_genotyping_paired: +rule combine_mhcI_PE: input: - aggregate_genotyping_paired + aggregate_mhcI_PE output: - "results/{sample}/hla/alleles/classI_{group}_{type}_PE.tsv" + "results/{sample}/hla/mhc-I/genotyping/{group}_{type}_PE.tsv" log: "logs/{sample}/optitype/{group}_{type}_call.log" + conda: + "../envs/basic.yml" + threads: 1 shell: """ - python3 workflow/scripts/merge_classI_alleles.py \ + python3 workflow/scripts/combine_optitype_results.py \ '{input}' {output} - """ - + """ -rule merge_allels: +rule merge_mhcI_allels: input: - get_alleles + get_mhcI_alleles output: - "results/{sample}/hla/classI_alleles.tsv", + "results/{sample}/hla/mhc-I.tsv", message: "Merging HLA alleles from different sources" log: @@ -321,27 +302,103 @@ rule merge_allels: threads: 1 shell: """ - cat {input} | sort | 
uniq > {output} + python workflow/scripts/merge_mhcI_alleles.py \ + '{input}' {output} """ + +######### MHC-II HLA GENOTYPING ########### +rule filter_reads_mhcII_PE: + input: + sample=["results/{sample}/hla/reads/{group}_{type}_PE_R1.fq", "results/{sample}/hla/reads/{group}_{type}_PE_R2.fq"], + idx=multiext( + "resources/hla/bowtie2_index", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ), + output: + "results/{sample}/hla/mhc-II/reads/{group}_{type}_hla.bam" + log: + "logs/bowtie2/{sample}/{group}_{type}.log", + params: + extra="", # optional parameters + threads: config['threads'] # Use at least two threads + wrapper: + "v2.11.1/bio/bowtie2/align" -####### CLASS II HLA GENOTYPING ########### -#rule classII_get_reads_DNA_SE: +rule bam2fastq_reads_mhcII: + input: + "results/{sample}/hla/mhc-II/reads/{group}_{type}_hla.bam" + output: + fwd="results/{sample}/hla/mhc-II/reads/{group}_{type}_hla_R1.fastq", + rev="results/{sample}/hla/mhc-II/reads/{group}_{type}_hla_R2.fastq", + log: + "logs/{sample}/{group}_{type}_hla_sam.log" + conda: + "../envs/samtools.yml" + threads: 1 + shell: + """ + samtools fastq -F 4 {input} -1 {output.fwd} -2 {output.rev} -s /dev/null + """ + +rule hlatyping_mhcII: + input: + fwd="results/{sample}/hla/mhc-II/reads/{group}_{type}_hla_R1.fastq", + rev="results/{sample}/hla/mhc-II/reads/{group}_{type}_hla_R2.fastq" + output: + "results/{sample}/hla/mhc-II/genotyping/{group}_{type}/result/{group}_{type}_final.result.txt" + log: + "logs/{sample}/hla/{group}_{type}_hlahd.log" + conda: + "../envs/hlahd.yml" + threads: config['threads'] + shell: + """ + hlahd.sh \ + -t {threads} \ + -m 100 \ + -c 0.95 \ + -f {config[hlatyping][freqdata]} \ + {input.fwd} {input.rev} \ + {config[hlatyping][split]} \ + {config[hlatyping][dict]} \ + {wildcards.group}_{wildcards.type} \ + results/{wildcards.sample}/hla/mhc-II/genotyping/ + """ + +#rule merge_mhcII_allels: #input: - #get_hla_flt_dna_se + #get_mhcII_alleles #output: - #"results/{sample}/hla/classII/{group}_DNA.fastq" + #"results/{sample}/hla/mhc-I.tsv", #message: - #"Extracting DNA reads for HLA classII typing" + #"Merging HLA alleles from different sources" #log: - #"logs/{sample}/optitype/{group}_DNA_call.log" + #"logs/{sample}/optitype/merge_classI_alleles.log" #conda: #"../envs/basic.yml" #threads: 1 #shell: #""" - + #python workflow/scripts/merge_mhcI_alleles.py \ + #'{input}' {output} #""" -#def get_hla_flt_dna_se(wildcards): + + + + + + + + + + + + diff --git a/workflow/rules/neoantigen.smk b/workflow/rules/neoantigen.smk index fc04f7b..5d875c0 100644 --- a/workflow/rules/neoantigen.smk +++ b/workflow/rules/neoantigen.smk @@ -75,7 +75,7 @@ rule priorization: pred_aff="workflow/scripts/mhc_i/", pred_imm="workflow/scripts/immunogenicity/", peptides="results/{sample}/priorization/peptides.tsv", - alleles="results/{sample}/hla/classI_alleles.tsv", + alleles="results/{sample}/hla/mhc-I.tsv", output: "results/{sample}/priorization/neoantigens.tsv" message: diff --git a/workflow/rules/preproc.smk b/workflow/rules/preproc.smk index e2f471e..849372e 100644 --- a/workflow/rules/preproc.smk +++ b/workflow/rules/preproc.smk @@ -36,8 +36,8 @@ rule fastqc_forward: input: get_qc_input_fwd output: - html="results/{sample}/{seqtype}/qualitycontrol/{group}_r1_fastqc_raw.html", - zip="results/{sample}/{seqtype}/qualitycontrol/{group}_r1_fastqc_raw.zip" + html="results/{sample}/{seqtype}/qualitycontrol/{group}_R1_fastqc_raw.html", + zip="results/{sample}/{seqtype}/qualitycontrol/{group}_R1_fastqc_raw.zip" 
params: extra = "" log: @@ -52,8 +52,8 @@ rule fastqc_reverse: input: get_qc_input_rev output: - html="results/{sample}/{seqtype}/qualitycontrol/{group}_r2_fastqc_raw.html", - zip="results/{sample}/{seqtype}/qualitycontrol/{group}_r2_fastqc_raw.zip" + html="results/{sample}/{seqtype}/qualitycontrol/{group}_R2_fastqc_raw.html", + zip="results/{sample}/{seqtype}/qualitycontrol/{group}_R2_fastqc_raw.zip" params: extra = "--quiet" log: @@ -67,13 +67,13 @@ rule fastqc_reverse: rule preproc_paired_end: input: unpack(get_preproc_input), - qc1="results/{sample}/{seqtype}/qualitycontrol/{group}_r1_fastqc_raw.html", - qc2="results/{sample}/{seqtype}/qualitycontrol/{group}_r2_fastqc_raw.html" + qc1="results/{sample}/{seqtype}/qualitycontrol/{group}_R1_fastqc_raw.html", + qc2="results/{sample}/{seqtype}/qualitycontrol/{group}_R2_fastqc_raw.html" output: - trimmed=["results/{sample}/{seqtype}/reads/{group}_r1_preproc.fq.gz", - "results/{sample}/{seqtype}/reads/{group}_r2_preproc.fq.gz"], - unpaired1="results/{sample}/{seqtype}/reads/{group}_r1_preproc_unpaired.fq.gz", - unpaired2="results/{sample}/{seqtype}/reads/{group}_r2_preproc_unpaired.fq.gz", + trimmed=["results/{sample}/{seqtype}/reads/{group}_R1_preproc.fq.gz", + "results/{sample}/{seqtype}/reads/{group}_R2_preproc.fq.gz"], + unpaired1="results/{sample}/{seqtype}/reads/{group}_R1_preproc_unpaired.fq.gz", + unpaired2="results/{sample}/{seqtype}/reads/{group}_R2_preproc_unpaired.fq.gz", failed="results/{sample}/{seqtype}/reads/{group}_preproc_failed.fq.gz", html="results/{sample}/{seqtype}/reads/{group}_preproc_report.html", json="results/{sample}/{seqtype}/reads/{group}_preproc_report.json", @@ -96,10 +96,10 @@ rule preproc_paired_end: # fastqc after pre-processing (forward) rule fastqc_forward_after: input: - "results/{sample}/{seqtype}/reads/{group}_r1_preproc.fq.gz" + "results/{sample}/{seqtype}/reads/{group}_R1_preproc.fq.gz" output: - html="results/{sample}/{seqtype}/qualitycontrol/{group}_r1_fastqc.html", - zip="results/{sample}/{seqtype}/qualitycontrol/{group}_r1_fastqc.zip" + html="results/{sample}/{seqtype}/qualitycontrol/{group}_R1_fastqc.html", + zip="results/{sample}/{seqtype}/qualitycontrol/{group}_R1_fastqc.zip" params: extra = "" log: @@ -113,10 +113,10 @@ rule fastqc_forward_after: # fastqc after pre-processing (reverse) rule fastqc_reverse_after: input: - "results/{sample}/{seqtype}/reads/{group}_r2_preproc.fq.gz" + "results/{sample}/{seqtype}/reads/{group}_R2_preproc.fq.gz" output: - html="results/{sample}/{seqtype}/qualitycontrol/{group}_r2_fastqc.html", - zip="results/{sample}/{seqtype}/qualitycontrol/{group}_r2_fastqc.zip" + html="results/{sample}/{seqtype}/qualitycontrol/{group}_R2_fastqc.html", + zip="results/{sample}/{seqtype}/qualitycontrol/{group}_R2_fastqc.zip" params: extra = "--quiet" log: diff --git a/workflow/scripts/combine_vcf.py b/workflow/scripts/combine_vcf.py deleted file mode 100644 index 21d21b4..0000000 --- a/workflow/scripts/combine_vcf.py +++ /dev/null @@ -1,51 +0,0 @@ -import vcfpy -import sys -from pathlib import Path - -''' -This script combines multiple vcfs into one vcf. 
It adds a new INFO field for the group name - -Usage: -python combine_vcf.py '' -''' - -def main(): - vcfs = sys.argv[1].split(' ') - # iterate through all vcfs - for i,v in enumerate(vcfs): - # extract the replicate (e.g., {rep1}_*.vcf) - if '_sliprem.vcf' in str(Path(v).name): - rpl = Path(v).name.split('_sliprem.vcf')[0] - elif '_somatic.short.indels.vcf' in str(Path(v).name): - rpl = Path(v).name.split('_somatic.short.indels.vcf')[0] - elif '_somatic.snvs.vcf' in str(Path(v).name): - rpl = Path(v).name.split('_somatic.snvs.vcf')[0] - elif '_fusions.vcf' in str(Path(v).name): - rpl = Path(v).name.split('_fusions.vcf')[0] - elif '_exitron.vcf' in str(Path(v).name): - rpl = Path(v).name.split('_exitron.vcf')[0] - - reader = vcfpy.Reader.from_path(v) - reader.header.add_info_line( - vcfpy.OrderedDict( - [('ID', 'GRP'), - ('Number', '1'), - ('Type', 'String'), - ('Description', 'Name of the group')])) - reader.header.add_info_line( - vcfpy.OrderedDict( - [('ID', 'SRC'), - ('Number', '1'), - ('Type', 'String'), - ('Description', 'Source of the variant')])) - - writer = vcfpy.Writer.from_path(sys.argv[3], reader.header) - - # iterate through all records - for record in reader: - print(record) - record.INFO['GRP'] = rpl - record.INFO['SRC'] = sys.argv[2] - writer.write_record(record) - -main() diff --git a/workflow/scripts/exitron2vcf.py b/workflow/scripts/exitron2vcf.py index cccea60..674048d 100644 --- a/workflow/scripts/exitron2vcf.py +++ b/workflow/scripts/exitron2vcf.py @@ -111,6 +111,7 @@ def main(): [('SVTYPE', 'DEL'), ('END', end-1), ('DP', dp), + ('AO', ao), ('STRAND', strand), ('SplicedSite', spliced_site), ('GeneName', gene_name), diff --git a/workflow/scripts/merge_classI_alleles.py b/workflow/scripts/merge_classI_alleles.py deleted file mode 100644 index b534074..0000000 --- a/workflow/scripts/merge_classI_alleles.py +++ /dev/null @@ -1,40 +0,0 @@ -import sys - -def main(): - alleles = [] - - # load alleles - allele_list = load_alleles('workflow/scripts/valid_alleles/netmhcpan.txt') - - #print(allele_list) - - for alfile in sys.argv[1].split(' '): - with open(alfile, 'r') as f: - next(f) - for line in f: - line_list = line.rstrip().split('\t') - for i in range(1,7): - # modify header - modname = 'HLA-' + line_list[i] - if modname in allele_list: - if modname not in alleles: - alleles.append(modname) - # remove duplicates - alleles = list(set(alleles)) - - - output = open(sys.argv[2], 'w') - for i in alleles: - output.write(i + '\n') - output.close() - - -def load_alleles(alleles_file): - alleles = [] - with open(alleles_file, 'r') as f: - for line in f: - alleles.append(line.rstrip()) - return alleles - - -main() diff --git a/workflow/scripts/priorization/__pycache__/prediction.cpython-37.pyc b/workflow/scripts/priorization/__pycache__/prediction.cpython-37.pyc new file mode 100644 index 0000000..a7330a6 Binary files /dev/null and b/workflow/scripts/priorization/__pycache__/prediction.cpython-37.pyc differ diff --git a/workflow/scripts/priorization/__pycache__/reference.cpython-311.pyc b/workflow/scripts/priorization/__pycache__/reference.cpython-311.pyc new file mode 100644 index 0000000..93b0531 Binary files /dev/null and b/workflow/scripts/priorization/__pycache__/reference.cpython-311.pyc differ diff --git a/workflow/scripts/priorization/__pycache__/reference.cpython-37.pyc b/workflow/scripts/priorization/__pycache__/reference.cpython-37.pyc new file mode 100644 index 0000000..36d9dfe Binary files /dev/null and 
b/workflow/scripts/priorization/__pycache__/reference.cpython-37.pyc differ
diff --git a/workflow/scripts/priorization/__pycache__/variants.cpython-37.pyc b/workflow/scripts/priorization/__pycache__/variants.cpython-37.pyc
new file mode 100644
index 0000000..cf8b0f1
Binary files /dev/null and b/workflow/scripts/priorization/__pycache__/variants.cpython-37.pyc differ
diff --git a/workflow/scripts/priorization/compile.py b/workflow/scripts/priorization/compile.py
new file mode 100644
index 0000000..2f0855b
--- /dev/null
+++ b/workflow/scripts/priorization/compile.py
@@ -0,0 +1,61 @@
+import configargparse
+import reference
+import variants
+import prediction
+
+
+def main():
+    options = parse_arguments()
+
+    proteome = reference.Proteome(options.proteome)
+    annotation = reference.Annotations(options.anno)
+
+    variant_effects = variants.VariantEffects(options.output_dir)
+    variant_effects.print_header()
+
+    if options.fusion is not None:
+        fusions = variants.Fusions(options.fusion,
+                                   options.confidence,
+                                   proteome.proteome,
+                                   annotation.anno,
+                                   variant_effects)
+
+    if options.input is not None:
+        other_variants = variants.Variants(options.input,
+                                           variant_effects)
+
+    variant_effects.close_file()
+
+    # predict binding affinities
+    binding = prediction.BindingAffinities(variant_effects.variantEffectsFile,
+                                           options.mhcI,
+                                           options.mhcII,
+                                           options.mhcI_len,
+                                           options.mhcII_len,
+                                           options.threads)
+
+
+def parse_arguments():
+    p = configargparse.ArgParser()
+
+    # define command line arguments
+    p.add('-i', '--input', required=False, help='input file')
+    # when fusion file is provided (remains optional)
+    p.add('-f', '--fusion', required=False, help='fusion file')
+    p.add('-c', '--confidence', required=False, choices=['high', 'medium', 'low'], help='confidence level of fusion events')
+    p.add("--mhcI", required=False, help='MHC-I allele')
+    p.add("--mhcII", required=False, help='MHC-II allele')
+    p.add("--mhcI_len", required=False, help='MHC-I peptide length')
+    p.add("--mhcII_len", required=False, help='MHC-II peptide length')
+    p.add('-t', '--threads', required=False, default=1, help='number of threads for binding predictions')
+    p.add('-p', '--proteome', required=True, help='proteome file')
+    p.add('-a', '--anno', required=True, help='annotation file')
+    p.add('-o', '--output_dir', required=True, help='output directory')
+
+    return p.parse_args()
+
+
+main()
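# NOTE (editor's illustration, not part of the patch): prediction.py below
# expands the --mhcI_len option string into concrete epitope lengths. A
# self-contained sketch of that parsing, with a hypothetical input:

def parse_epitope_lengths(len_string):
    epilens = []
    for lens in len_string.split(','):
        if '-' in lens:  # a range such as "8-11"
            lower, upper = lens.split('-', 1)
            epilens.extend(range(int(lower), int(upper) + 1))
        else:            # a single length such as "13"
            epilens.append(int(lens))
    return epilens

print(parse_epitope_lengths("8-11,13"))  # [8, 9, 10, 11, 13]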
diff --git a/workflow/scripts/priorization/prediction.py b/workflow/scripts/priorization/prediction.py
new file mode 100644
index 0000000..398938c
--- /dev/null
+++ b/workflow/scripts/priorization/prediction.py
@@ -0,0 +1,189 @@
+import tempfile
+import subprocess
+import os
+import concurrent.futures
+
+class BindingAffinities:
+    def __init__(self, effects, mhcI, mhcII, mhcI_len, mhcII_len, threads=1):
+
+        with tempfile.TemporaryDirectory() as tmp_seqs:
+            if mhcI is not None:
+                mhcI_alleles = {}
+                fh_mhcI_alleles = open(mhcI, 'r')
+                for allele in fh_mhcI_alleles:
+                    line = allele.rstrip().split('\t')
+                    mhcI_alleles[line[1]] = line[0]
+
+                wt_fname = {}
+                mt_fname = {}
+
+                # counters to store number of sequence in fa
+                wt_cnt = {}
+                mt_cnt = {}
+
+                # file handler for wt and mt sequences
+                fh_wt = {}
+                fh_mt = {}
+
+                # extract mhcI lens
+                epilens = []
+                for lens in mhcI_len.split(','):
+                    if '-' in lens:
+                        lower, upper = lens.split('-',1)
+                        epilens.extend(range(int(lower), int(upper)+1))
+                    else:
+                        epilens.append(int(lens))
+
+                for epilen in epilens:
+                    wt_fname[epilen] = os.path.join(tmp_seqs, f'wt_{epilen}.fa')
+                    mt_fname[epilen] = os.path.join(tmp_seqs, f'mt_{epilen}.fa')
+
+                    fh_wt[epilen] = open(wt_fname[epilen], 'w')
+                    fh_mt[epilen] = open(mt_fname[epilen], 'w')
+
+                    # initialise counter
+                    wt_cnt[epilen] = 1
+                    mt_cnt[epilen] = 1
+
+                subseqs = []
+
+                # iterate through the file
+                fh = open(effects, 'r')
+                next(fh)   # skip header
+                for line in fh:
+                    entries = line.rstrip().split('\t')
+
+                    for epilen in epilens:
+                        local_var_start = int(entries[15])
+                        local_var_end = int(entries[16])
+
+                        wt_subseq = entries[12]
+                        mt_subseq = entries[13]
+
+                        # adjust length of seqs (according to epilen)
+                        if local_var_start >= epilen + 1:
+                            left = local_var_start - (epilen + 1)
+                        else:
+                            left = 0
+
+                        if local_var_end + (epilen - 1) <= len(mt_subseq):
+                            right = local_var_end + (epilen - 1)
+                        else:
+                            right = len(mt_subseq) - 1
+
+                        wt_subseq_adj = wt_subseq[left:right]
+                        mt_subseq_adj = mt_subseq[left:right]
+
+                        # determine the epitope sequences
+                        if '$' in wt_subseq_adj:
+                            wt_epitope_seq = wt_subseq_adj.split('$')[0]
+                        else:
+                            wt_epitope_seq = wt_subseq_adj
+                        mt_epitope_seq = mt_subseq_adj
+
+                        if len(wt_epitope_seq) >= epilen+1:
+                            fh_wt[epilen].write(f'>{wt_cnt[epilen]}\n{wt_epitope_seq}\n')
+                            entries.append(wt_cnt[epilen])
+                            wt_cnt[epilen] += 1
+                        else:
+                            entries.append(0)
+
+                        if len(mt_epitope_seq) >= epilen+1:
+                            fh_mt[epilen].write(f'>{mt_cnt[epilen]}\n{mt_epitope_seq}\n')
+                            entries.append(mt_cnt[epilen])
+                            mt_cnt[epilen] += 1
+                        else:
+                            entries.append(0)
+
+                    subseqs.append(entries)
+
+                # it's necessary to close the files
+                fh.close()
+                for epilen in epilens:
+                    fh_wt[epilen].close()
+                    fh_mt[epilen].close()
+
+                wt_affinities = self.collect_binding_affinities(mhcI_alleles, wt_fname, epilens, 'wt', threads)
+                mt_affinities = self.collect_binding_affinities(mhcI_alleles, mt_fname, epilens, 'mt', threads)
+
+
+    @staticmethod
+    def collect_binding_affinities(alleles, fnames, epilens, group, threads):
+        affinities_results = {}
+        number_of_threads = int(threads)
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
+            futures = {}
+            for allele in alleles:
+                for epilen in epilens:
+                    future = executor.submit(BindingAffinities.calc_binding_affinities, fnames[epilen], allele, epilen, group)
+                    futures[future] = epilen
+
+            for future in concurrent.futures.as_completed(futures):
+                epilen = futures[future]
+                binding_affinities = future.result()
+
+                # check epilen already present
+                if epilen not in affinities_results.keys():
+                    affinities_results[epilen] = {}
+
+                for seqnum in binding_affinities.keys():
+                    if seqnum not in affinities_results[epilen].keys():
+                        affinities_results[epilen][seqnum] = binding_affinities[seqnum]
+                    else:
+                        for seq in binding_affinities[seqnum].keys():
+                            if seq not in affinities_results[epilen][seqnum].keys():
+                                affinities_results[epilen][seqnum][seq] = binding_affinities[seqnum][seq]
+
+        return affinities_results
+
+
+    @staticmethod
+    def calc_binding_affinities(fa_file, allele, epilen, group):
+        binding_affinities = {}
+        print(f'calculate binding affinities - allele:{allele} epilen:{epilen} group: {group}')
+        call = ['python',
+                'workflow/scripts/mhc_i/src/predict_binding.py',
+                'netmhcpan',
+                allele,
+                str(epilen),
+                fa_file]
+
+        result = subprocess.run(call,
+                                stdout = subprocess.PIPE,
+                                universal_newlines=True)
+        predictions = result.stdout.rstrip().split('\n')[1:]
+
+        for line in predictions:
+            entries = line.split('\t')
+            if group == 'mt':
+                if float(entries[8]) >= 500:
+                    continue
+
+            # sequence number in results used as key
+            seqnum = int(entries[1])
+            epitope_seq = entries[5]
+            allele = entries[0]
+
+            # start and end in sequence (0-based)
+            start = int(entries[2])-1
+            end = int(entries[3])-1
+            ic50 = float(entries[8])
+            rank = float(entries[9])
+
+            if seqnum not in binding_affinities:
+                binding_affinities[seqnum] = {}
+
+            if epitope_seq not in binding_affinities[seqnum]:
+                binding_affinities[seqnum][epitope_seq] = (allele, start, end, ic50, rank)
+
+        return binding_affinities
diff --git a/workflow/scripts/priorization/reference.py b/workflow/scripts/priorization/reference.py
new file mode 100644
index 0000000..e22d6fb
--- /dev/null
+++ b/workflow/scripts/priorization/reference.py
@@ -0,0 +1,38 @@
+import pyfaidx
+import re
+
+class Proteome:
+    def __init__(self, fastaFile):
+        # parse proteome
+        ref = pyfaidx.Fasta(fastaFile)
+        self.proteome = {}
+        for record in ref:
+            identifier = record.long_name
+            match = re.search(r'transcript:([^.\s]+)', identifier)
+            if match:
+                tid = match.group(1)
+                self.proteome[tid] = str(record)
+
+
+class Annotations:
+    def __init__(self, gtfFile):
+        # parse annotations
+        anno_fh = open(gtfFile, 'r')
+        self.anno = {}
+        for line in anno_fh:
+            # ignore header
+            if not line.startswith('#'):
+                l = line.rstrip().split('\t')
+
+                if l[2] == 'exon':
+                    exon_number = re.search(r'exon_number "([^.\s]+)"', l[8]).group(1)
+                    transcript_id = re.search(r'transcript_id "([^.\s]+)"', l[8]).group(1)
+
+                    if transcript_id not in self.anno:
+                        self.anno[transcript_id] = {}
+
+                    if not exon_number in self.anno[transcript_id]:
+                        self.anno[transcript_id][int(exon_number)] = [l[3],l[4]]
+
+        anno_fh.close()
diff --git a/workflow/scripts/priorization/variants.py b/workflow/scripts/priorization/variants.py
new file mode 100644
index 0000000..149f3a4
--- /dev/null
+++ b/workflow/scripts/priorization/variants.py
@@ -0,0 +1,869 @@
+from pathlib import Path
+import re
+import vcfpy
+import sys
+
+
+def find_all_lowercase(seq):
+    """return the indices of all lowercase characters in seq
+    (arriba writes variant bases in lower case)"""
+    return [i for i, c in enumerate(seq) if c.islower()]
+
+
+class VariantEffectsEntry:
+    def __init__(self, chrom, start, end, gene_id, gene_name,
+                 transcript_id, transcript, cds, cds_breakpoint, source,
+                 group, var_type, wt_subseq, mt_subseq, var_start,
+                 local_var_start, local_var_end, vaf, ao, dp):
+        self.data = {}
+        self.data['chrom'] = chrom
+        self.data['start'] = start
+        self.data['end'] = end
+        self.data['gene_id'] = gene_id
+        self.data['gene_name'] = gene_name
+        self.data['transcript_id'] = transcript_id
+        self.data['transcript'] = transcript
+        self.data['cds'] = cds
+        self.data['cds_breakpoint'] = cds_breakpoint
+        self.data['source'] = source
+        self.data['group'] = group
+        self.data['var_type'] = var_type
+        self.data['wt_subseq'] = wt_subseq
+        self.data['mt_subseq'] = mt_subseq
+        self.data["var_start"] = var_start
+        self.data["local_var_start"] = local_var_start
+        self.data["local_var_end"] = local_var_end
+        self.data['vaf'] = vaf
+        self.data['ao'] = ao
+        self.data['dp'] = dp
+        self.data['NMD'] = "."
+        self.data['PTC_dist_ejc'] = "."
+        self.data['PTC_exon_number'] = "."
+        self.data['NMD_esc_rule'] = "."
+
+
+    # getter & setter
+    def get_nmd(self):
+        return self.data['NMD']
+
+    def set_nmd(self, nmd):
+        self.data['NMD'] = nmd
+
+
+    def scan_nmd(self, anno):
+        # check for NMD in frameshift events
+        if "frameshift" in self.data["var_type"]:
+
+            if self.data["transcript"] == ".":
+                self.data["NMD"] = "unknown"
+                return
+            else:
+                seg2_start = self.data["start"].split('|')[1]
+                # search for premature stop codon in cds
+                stop_pos, stop_coord = self.find_stop_codon(self.data["cds"],
+                                                            self.data["cds_breakpoint"],
+                                                            seg2_start)
+
+                # check if stop codon is PTC
+                if stop_pos != -1:
+                    tid2 = self.data["transcript_id"].split('|')[1]
+
+                    if tid2 == '.':
+                        self.data['NMD'] = "unknown"
+                        return
+
+                    exoninfo_tid2 = anno[tid2]
+                    exons_tid2 = list(exoninfo_tid2.keys())
+
+                    exon_num, dist_ejc = self.annotate_stop_codon(exoninfo_tid2, stop_coord)
+                    if exon_num != -1:
+                        self.data["PTC_exon_number"] = f'{exon_num}'
+                        if max(exons_tid2) > 1:
+                            self.data["PTC_dist_ejc"] = dist_ejc
+
+                        # check if NMD is escaped
+                        nmd_escape = self.check_escape(exoninfo_tid2,
+                                                       stop_coord,
+                                                       exon_num,
+                                                       dist_ejc)
+
+                        if nmd_escape != -1:
+                            self.data["NMD"] = "NMD_escaping_variant"
+                            self.data["NMD_esc_rule"] = nmd_escape
+                        else:
+                            self.data["NMD"] = "NMD_variant"
+
+
+    def find_stop_codon(self, cds, breakpoint, seg2_start):
+        """search for stop codon in coding sequence
+        returns the index of the stop codon and
+        genomic coordinate of the end of the second segment
+        """
+        stop_pos = -1   # position of (detected) stop_codon --> within cds
+        for i in range(0, len(cds), 3):
+            codon = cds[i:i+3]
+            if codon == 'TAA' or codon == 'TAG' or codon == 'TGA':
+                stop_pos = i
+                break
+
+        # stop codon found before the breakpoint (should not occur)
+        if stop_pos <= breakpoint:
+            return -1, -1
+        else:
+            segment2 = stop_pos - breakpoint
+            stop_coord = int(seg2_start) + segment2
+            return stop_pos, stop_coord
+
+
+    def annotate_stop_codon(self, exoninfo, stop_coord):
+        stop_exon = -1
+        for exon_number in exoninfo:
+            exon_start = int(exoninfo[exon_number][0])
+            exon_end = int(exoninfo[exon_number][1])
+            if stop_coord >= exon_start and stop_coord <= exon_end:
+                stop_exon = exon_number
+                dist_ejc = exon_end - stop_coord
+                return stop_exon, dist_ejc
+
+        # no exon information found for stop_codon
+        return -1, -1
+
+
+    def check_escape(self, exoninfo, stop_coord, exon_num, dist_ejc):
+        """
+        1. If the variant is in an intronless transcript, meaning only one exon exist in the transcript.
+        2. The variant falls in the first 100 coding bases in the transcript.
+                 vvv
+        ..ES...EE..I.ES...EE.I.ES....EE.I.ES....EE
+        (ES= exon_start,EE = exon_end, I = intron, v = variant location)
+        3. The variant location falls 50 bases upstream of the penultimate (second to the last) exon.
+                              vvv
+        ES...EE..I.ES...EE.I.ES....EE.I.ES....EE
+        (ES= exon_start,EE = exon_end, I = intron, v = variant location)
+        4. The variant location falls in the last exon of the transcript.
+                                             vvvv
+        ES...EE..I.ES...EE.I.ES....EE.I.ES....EE
+        (ES= exon_start,EE = exon_end, I = intron, v = variant location)
+
+        """
+        exons = list(exoninfo.keys())
+        last_exon = int(max(exons))
+
+        if exon_num == 1:
+            if last_exon == 1:   # there is only one exon
+                return 1
+            elif last_exon > 1:
+                exon_start = int(exoninfo[1][0])
+                # check if PTC is within the first 100 coding bases
+                if stop_coord - exon_start < 100:
+                    return 2
+        if exon_num == last_exon-1:
+            exon_start = int(exoninfo[last_exon-1][0])
+            exon_end = int(exoninfo[last_exon-1][1])
+            if exon_end - stop_coord <= 50:
+                return 3
+        elif exon_num == last_exon:
+            return 4
+
+        return -1
+
+
+    def print_vee(self):
+        for k,v in self.data.items():
+            print(f'{k}: {v}\t', end='')
+        print('\n')
+
+
+class Fusions:
+    def __init__(self, fusionsFile, confidence, proteome, annotation, effects):
+        group = Path(fusionsFile).stem.split('_fusions')[0]
+        confidenceLevel = {"low": ["low", "medium", "high"],
+                           "medium": ["medium", "high"],
+                           "high": ["high"]}
+
+        with open(fusionsFile) as fh:
+            next(fh)   # ignore header
+            for line in fh:
+                cols = line.rstrip().split('\t')
+                if cols[14] in confidenceLevel[confidence]:
+
+                    eventType = cols[8]
+                    readingFrame = cols[15]
+                    consequence = self.determineConsequence(readingFrame, eventType)
+                    if consequence is None:
+                        continue
+
+                    peptideSeq = cols[28]
+                    if peptideSeq == '.':
+                        continue
+
+                    # get genomic coordinates
+                    chrom, start, end = self.determineGenomicCoordinates(cols[4],cols[5])
+                    gene_name = cols[0] + '|' + cols[1]
+                    gene_id = cols[20] + '|' + cols[21]
+
+                    tid1 = cols[22]
+                    tid2 = cols[23]
+
+                    transcript_id = tid1 + '|' + tid2
+
+                    # supporting reads (read identifiers) and depth (sum of coverages)
+                    ao = str(len(cols[29].split(',')))
+                    dp = str(int(cols[12]) + int(cols[13]))
+
+                    transcript = cols[27].upper()
+                    cds, cds_breakpoint = self.determineCodingSequence(transcript)
+
+                    # retrieve polypeptide sequence of the fusion
+                    fusion_pepseq = cols[28]
+                    if fusion_pepseq == '.':
+                        continue
+
+                    # determine (complete) wildtype peptide sequence (which is the wt of first gene)
+                    gene1_wt_seq = '.'
+                    if tid1 in proteome:
+                        gene1_wt_seq = proteome[tid1]
+                    wt_subseq, mt_subseq, var_start, local_var_start, local_var_end = self.determine_peptide_sequence(fusion_pepseq,
+                                                                                                                      gene1_wt_seq)
+                    vee = VariantEffectsEntry(chrom, start, end, gene_id,
+                                              gene_name, transcript_id,
+                                              transcript, cds, cds_breakpoint,
+                                              "fusion", group, consequence,   # var_type
+                                              wt_subseq, mt_subseq, var_start,
+                                              local_var_start, local_var_end,
+                                              ".",   # vaf not available for fusions
+                                              ao, dp)
+
+                    vee.scan_nmd(annotation)
+                    effects.print_entry(vee)
+
+
+    def determineCodingSequence(self, transcript):
+        """ determines the coding sequence """
+        # search for start codon in transcript
+        start_codon = re.search(r'ATG', transcript)
+        if start_codon:   # start codon could have been found
+            cds = transcript[start_codon.start()+3:]   # determine the coding sequence
+            cds_bp = cds.find('|')
+            cds = cds.replace('|', '')
+            return cds, cds_bp
+        else:
+            return '.', '.'
+
+
+    def determineConsequence(self, readingFrame, event):
+        consequence = ""
or readingFrame == "stop_codon": + return None + else: + if readingFrame == "in-frame": + consequence = "inframe_" + elif readingFrame == "out-of-frame": + consequence = "frameshift_" + + evts = event.split('/') + for evt in evts: + if (evt == "read-through" or + evt == "ITD" or + evt == "5'-5'" or + evt == "3'-3'"): + return None + elif evt == "translocation": + consequence += "trs" + return consequence + elif evt == "duplication": + consequence += "dup" + return consequence + elif evt == "deletion": + consequence += "del" + return consequence + elif evt == "inversion": + consequence += "inv" + return consequence + return None + + + def determineGenomicCoordinates(self, breakpoint1, breakpoint2): + chrom1 = breakpoint1.split(':')[0] + chrom2 = breakpoint2.split(':')[0] + + start1 = breakpoint1.split(':')[1] + start2 = breakpoint2.split(':')[1] + + chrom = chrom1 + '|' + chrom2 + start = start1 + '|' + start2 + end = -1 + + return chrom, start, end + + def determine_peptide_sequence(self, fusion_seq, wildtype_seq): + fusion_seg1 = fusion_seq.split('|')[0] + fusion_seg2 = fusion_seq.split('|')[1] + var_start = len(fusion_seg1) + 1 # position of the fusion + + # extract sequences + fusion_seg1_sub = fusion_seg1.split('?')[-1] + fusion_seg2_sub = fusion_seg2.split('?')[0].split('*')[0] + + # mt sequence + if len(fusion_seg1_sub) >= 24: + mt = fusion_seg1_sub[-24:] + local_var_start = 25 # first base of fused transcript + if len(fusion_seg2_sub) >= 24: + mt += fusion_seg2_sub[:24] + else: + mt += fusion_seg2_sub + elif len(fusion_seg1_sub) < 24: + mt = fusion_seg1_sub + local_var_start = len(fusion_seg1_sub) + 1 + if len(fusion_seg2_sub) >= 24: + mt += fusion_seg2_sub[:24] + else: + mt += fusion_seg2_sub + + # arriba returns variants (in lower case) + mt = mt.upper() + + # end of variant is always the length of the sequence + local_var_end = len(mt) + + # wt sequences + wt = '.' + # assume that the fusion sequence is long + + if len(fusion_seg1_sub) > 10: + # search for variants in the first segment + variants = find_all_lowercase(fusion_seg1_sub) + if variants != []: # variants within the segment + # extract sequence that is wildtype (in fusion - before first lowercase) + wt_prefix = fusion_seg1_sub[:variants[0]] + else: + wt_prefix = fusion_seg1_sub + + wt = self.extract_wildtype_sequence(wildtype_seq, wt_prefix, len(mt)) + return wt, mt, var_start, local_var_start, local_var_end + + + # extract the wildtype sequence (using ensemble info and fusion sequence) + def extract_wildtype_sequence(self,wt_seq, wt_prefix, mt_len): + # search for subsequence in wildtype peptide sequences + wt_start = wt_seq.find(wt_prefix) + if wt_start != -1: # wildtype sequence (before first variant) can be found in ensemble + return self.fillup_wildtype_sequence(wt_seq[wt_start:], mt_len) + else: # wildrtype sequence cannot be found in ensemble + if wt_prefix.isupper(): + return self.fillup_wildtype_sequence(wt_prefix, mt_len) + else: # if not all in uppercase wt cannot be determined + return '.' 
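# NOTE (editor's illustration, not part of the patch): a toy run of the
# segment handling in determine_peptide_sequence above. The peptide string
# below is made up; arriba separates the fused genes with '|', marks
# non-template regions with '?', stop codons with '*', and writes variant
# residues in lower case.
fusion_seq = "MKTAYIAKQRQISFVk|SGDWLVNE*XYZ"
seg1, seg2 = fusion_seq.split('|')
seg1_sub = seg1.split('?')[-1]                # "MKTAYIAKQRQISFVk"
seg2_sub = seg2.split('?')[0].split('*')[0]   # "SGDWLVNE"
lowercase_idx = [i for i, c in enumerate(seg1_sub) if c.islower()]
wt_prefix = seg1_sub[:lowercase_idx[0]]       # "MKTAYIAKQRQISFV"
print(seg1_sub, seg2_sub, wt_prefix)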
+
+    # makes sure the wildtype sequence has the same length as the mutant sequence
+    def fillup_wildtype_sequence(self, wt_seq, mt_len):
+        if len(wt_seq) >= mt_len:
+            # wildtype sequence exceeds the mutant length - truncate it
+            wt = wt_seq[:mt_len]
+        else:
+            # pad the wildtype sequence with '$' up to the mutant length
+            wt = wt_seq + '$' * (mt_len - len(wt_seq))
+        return wt
+
+
+class Variants:
+    def __init__(self, variants, effects):
+        input_vcfs = variants.split(' ')
+        for vcf in input_vcfs:
+            vcf_reader = vcfpy.Reader.from_path(vcf)
+            csq_format = self.parse_csq_format(vcf_reader.header)
+
+            transcript_count = {}
+            for entry in vcf_reader:
+                # FILTER (when applicable)
+                if (entry.INFO['SRC'] == 'snv' or
+                        entry.INFO['SRC'] == 'short_indel'):
+                    if 'PASS' not in entry.FILTER:
+                        continue
+
+                # resolve the alleles the way VEP represents them
+                alleles_vep = self.resolve_alleles(entry)
+
+                chromosome = entry.CHROM
+                start = entry.affected_start
+                stop = entry.affected_end
+                ref = entry.REF
+                alts = entry.ALT
+
+                for alt_i, alt_v in enumerate(alts):
+                    csq_allele = alleles_vep[str(alt_v.value)]
+                    csq_fields = self.parse_csq_entries(
+                        entry.INFO["CSQ"],
+                        csq_format,
+                        csq_allele
+                    )
+
+                    for field in csq_fields:
+                        gene_name = field["SYMBOL"]
+                        gene_id = field["Gene"]
+                        transcript_id = field['Feature']
+                        transcript_count[transcript_id] = transcript_count.get(transcript_id, 0) + 1
+
+                        nmd = '.'
+                        csq = self.resolve_consequence(field['Consequence'])
+                        if csq is None:
+                            continue
+                        elif csq == "frameshift":
+                            # only keep frameshifts predicted to escape NMD
+                            if field["NMD"] != 'NMD_escaping_variant':
+                                continue
+                            nmd = "NMD_escaping_variant"
+
+                            if field["DownstreamProtein"] == "":
+                                continue
+
+                        vaf = self.determine_variant_allele_frequency(entry, alt_i)
+                        # determine allele depth (number of reads)
+                        mt_ad = self.determine_allele_depth(entry, alt_i)
+                        dp = self.determine_read_depth(entry)
+
+                        if not field["Amino_acids"]:
+                            continue
+                        aa_change = field["Amino_acids"]
+
+                        if csq == "frameshift":
+                            var_start = self.get_variant_startpos(field['Protein_position'])
+                            wt_seq = field["WildtypeProtein"]
+                            # the mutant sequence is wildtype up to the mutation
+                            # start, followed by the downstream protein
+                            mt_seq = wt_seq[:var_start] + field["DownstreamProtein"]
+
+                            # makes sure that wt and mt are of the same length
+                            wt_subseq, mt_subseq, n_var_start = self.determine_fs_subseq(
+                                wt_seq,
+                                mt_seq,
+                                var_start
+                            )
+
+                            vee = VariantEffectsEntry(entry.CHROM,
+                                                      start,
+                                                      stop,
+                                                      gene_id,
+                                                      gene_name,
+                                                      transcript_id,
+                                                      '.',  # transcript
+                                                      '.',  # cds
+                                                      '.',  # cds_breakpoint
+                                                      entry.INFO["SRC"],
+                                                      entry.INFO["GRP"],
+                                                      csq,
+                                                      wt_subseq,
+                                                      mt_subseq,
+                                                      var_start,
+                                                      n_var_start,
+                                                      len(mt_subseq)-1,
+                                                      vaf,
+                                                      mt_ad,
+                                                      dp)
+
+                            vee.set_nmd(nmd)
+                            effects.print_entry(vee)
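+
+                        # Illustration of determine_fs_subseq (hypothetical
+                        # sequences): keep up to 24 aa upstream of the
+                        # frameshift; the wildtype is trimmed or '$'-padded
+                        # to the mutant length:
+                        #   determine_fs_subseq("MKTAYIAKQR", "MKTAVW", 4)
+                        #   ->  ("MKTAYI", "MKTAVW", 4)
+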
+                        elif (csq == "missense" or
+                              csq == "inframe_ins" or
+                              csq == "inframe_del"):
+
+                            # retrieve the initial variant start and the
+                            # wildtype peptide sequence
+                            var_start = self.get_variant_startpos(field["Protein_position"])
+                            wt_seq = field["WildtypeProtein"]
+
+                            wt_aa_change, mt_aa_change = self.determine_aa_change(aa_change)
+
+                            # scan for stop codons
+                            wt_aa_change, wt_stop_codon = self.scan_stop_codon(wt_aa_change)
+                            mt_aa_change, mt_stop_codon = self.scan_stop_codon(mt_aa_change)
+
+                            mt_seq = wt_seq[:var_start] + mt_aa_change
+                            if not mt_stop_codon:
+                                mt_seq += wt_seq[var_start+len(wt_aa_change):]
+
+                            # generate the subsequences
+                            wt_subseq, mt_subseq, n_var_start, n_var_end = self.determine_subseq(wt_seq,
+                                                                                                 mt_seq,
+                                                                                                 var_start,
+                                                                                                 len(mt_aa_change))
+
+                            vee = VariantEffectsEntry(entry.CHROM,
+                                                      start,
+                                                      stop,
+                                                      gene_id,
+                                                      gene_name,
+                                                      transcript_id,
+                                                      '.',  # transcript
+                                                      '.',  # cds
+                                                      '.',  # cds_breakpoint
+                                                      entry.INFO["SRC"],
+                                                      entry.INFO["GRP"],
+                                                      csq,
+                                                      wt_subseq,
+                                                      mt_subseq,
+                                                      var_start,
+                                                      n_var_start,
+                                                      n_var_end,
+                                                      vaf,
+                                                      mt_ad,
+                                                      dp)
+
+                            effects.print_entry(vee)
+
+    @staticmethod
+    def determine_subseq(wt_seq, mt_seq, var_start, var_len):
+        """ determines the subsequence to consider for prioritization """
+        # left/right are the start/end of the subsequence;
+        # new_var_start is the start of the variant after subsetting
+        if var_start <= 24:
+            left = 0  # start of the subsequence
+            new_var_start = var_start
+        else:
+            left = var_start - 24
+            new_var_start = 24
+
+        if var_start + var_len + 24 >= len(mt_seq):
+            right = len(mt_seq)
+        else:
+            right = var_start + var_len + 24
+        new_var_end = new_var_start + var_len
+
+        # subset the mutant and wildtype sequences
+        mt_subseq = mt_seq[left:right]
+        if right >= len(wt_seq):
+            wt_subseq = wt_seq[left:]
+        else:
+            wt_subseq = wt_seq[left:right]
+
+        # pad the wildtype subsequence with '$' to the mutant length
+        if len(wt_subseq) < len(mt_subseq):
+            wt_subseq += '$' * (len(mt_subseq) - len(wt_subseq))
+
+        return wt_subseq, mt_subseq, new_var_start, new_var_end
+
+    @staticmethod
+    def determine_fs_subseq(wt_seq, mt_seq, start_pos_var):
+        if start_pos_var < 24:
+            subseq_start = 0
+        else:
+            subseq_start = start_pos_var - 24
+
+        wt_subseq = wt_seq[subseq_start:]
+        mt_subseq = mt_seq[subseq_start:]
+
+        # make sure wt and mt subsequences are of the same length
+        if len(wt_subseq) > len(mt_subseq):
+            wt_subseq = wt_subseq[:len(mt_subseq)]
+        else:
+            wt_subseq += '$' * (len(mt_subseq) - len(wt_subseq))
+
+        # determine the (new) variant start within the subsequence
+        new_start_pos_var = start_pos_var - subseq_start
+
+        return wt_subseq, mt_subseq, new_start_pos_var
+
+
+    @staticmethod
+    def scan_stop_codon(sequence):
+        """ checks for stop codons and removes them """
+        stop_codon = False
+        if '*' in sequence:
+            sequence = sequence.split('*')[0]
+            stop_codon = True
+        if 'X' in sequence:
+            sequence = sequence.split('X')[0]
+            stop_codon = True
+
+        return sequence, stop_codon
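+
+    # Illustration of scan_stop_codon (hypothetical inputs; a stop may be
+    # encoded as '*' or 'X'):
+    #   scan_stop_codon("MKT*AY")  ->  ("MKT", True)
+    #   scan_stop_codon("MKTAY")   ->  ("MKTAY", False)
+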
+    @staticmethod
+    def determine_aa_change(aa_change):
+        wt_aa_change, mt_aa_change = aa_change.split('/')
+        if wt_aa_change == '-':  # '-' means no wildtype residue (insertion)
+            wt_aa_change = ''
+        if mt_aa_change == '-':  # '-' means no mutant residue (deletion)
+            mt_aa_change = ''
+
+        return wt_aa_change, mt_aa_change
+
+
+    def get_variant_startpos(self, protein_position):
+        if '-' not in protein_position:  # single position
+            startpos = int(protein_position) - 1
+        else:  # position range
+            if protein_position.split('/')[0] == '-':
+                startpos = int(protein_position.split('-')[0])
+            else:
+                startpos = int(protein_position.split('-')[0]) - 1
+
+        return startpos
+
+
+    def parse_csq_entries(self, csq_entries, csq_format, csq_allele):
+        csq_format_array = csq_format.split("|")
+
+        transcripts = []
+        for entry in csq_entries:
+            values = entry.split('|')
+            transcript = dict(zip(csq_format_array, values))
+            if transcript['Allele'] == csq_allele:
+                transcripts.append(transcript)
+
+        return transcripts
+
+
+    def parse_csq_format(self, vcf_header):
+        if vcf_header.get_info_field_info('CSQ') is None:
+            sys.exit("Failed to extract the format string from the INFO description of the CSQ tag")
+        csq_header = vcf_header.get_info_field_info('CSQ')
+        format_pattern = re.compile("Format: (.*)")
+        match = format_pattern.search(csq_header.description)
+        return match.group(1)
+
+
+    def resolve_alleles(self, entry):
+        alleles = {}
+        for alt in entry.ALT:
+            alt = str(alt.value)
+            if alt[0:1] != entry.REF[0:1]:  # most likely an SNV
+                alleles[alt] = alt
+            elif alt[1:] == "":
+                alleles[alt] = "-"
+            else:
+                alleles[alt] = alt[1:]
+
+        return alleles
+
+
+    def resolve_consequence(self, consequence_string):
+        consequences = {
+            consequence.lower() for consequence in consequence_string.split("&")
+        }
+        if "start_lost" in consequences:
+            consequence = None
+        elif "frameshift_variant" in consequences:
+            consequence = "frameshift"
+        elif "missense_variant" in consequences:
+            consequence = "missense"
+        elif "inframe_insertion" in consequences:
+            consequence = "inframe_ins"
+        elif "inframe_deletion" in consequences:
+            consequence = "inframe_del"
+        else:
+            consequence = None
+
+        return consequence
+
+    def determine_variant_allele_frequency(self, entry, i):
+        vaf = -1
+        if (entry.INFO['SRC'] == 'short_indel' or
+                entry.INFO['SRC'] == 'snv'):
+            vaf = entry.calls[0].data['AF'][i]
+
+        if entry.INFO['SRC'] == 'long_indel':
+            vaf = entry.INFO['AB']
+
+        return vaf
+
+    def determine_read_depth(self, entry):
+        dp = -1
+        if (entry.INFO['SRC'] == 'short_indel' or
+                entry.INFO['SRC'] == 'snv'):
+            dp = entry.calls[0].data['DP']
+
+        if (entry.INFO['SRC'] == 'long_indel' or
+                entry.INFO['SRC'] == 'exitron'):
+            dp = entry.INFO['DP']
+
+        return dp
+
+
+    def determine_allele_depth(self, entry, i):
+        mt_ad = -1
+
+        # GATK: allele depth from the AD format field
+        if (entry.INFO['SRC'] == 'short_indel' or
+                entry.INFO['SRC'] == 'snv'):
+            mt_ad = entry.calls[0].data['AD'][i+1]
+
+        # transIndel: allele depth from the AO info field
+        if (entry.INFO['SRC'] == 'long_indel' or
+                entry.INFO['SRC'] == 'exitron'):
+            mt_ad = entry.INFO['AO']
+
+        return mt_ad
+
+
+# writes the variant effects table used downstream for prioritization
+class VariantEffects:
+    def __init__(self, output_dir):
+        # create the output directory if it does not exist
+        output_dir_path = Path(output_dir)
+        if not output_dir_path.exists():
+            output_dir_path.mkdir()
+
+        self.variantEffectsFile = Path(output_dir_path, "variant_effects.tsv")
+        self.fh = open(self.variantEffectsFile, 'w')
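+
+    # Illustrative usage (hypothetical output directory):
+    #   effects = VariantEffects("results/sampleA/annotation")
+    #   effects.print_header()
+    #   ...                      # one print_entry() call per variant effect
+    #   effects.close_file()
+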
+    # TODO: avoid hard-coding the output columns
+    # column order of the output table (keeps header and rows in sync)
+    FIELDS = ["chrom", "start", "end", "gene_id", "gene_name",
+              "transcript_id", "transcript", "cds", "cds_breakpoint",
+              "source", "group", "var_type", "wt_subseq", "mt_subseq",
+              "var_start", "local_var_start", "local_var_end",
+              "vaf", "ao", "dp", "NMD", "PTC_dist_ejc",
+              "PTC_exon_number", "NMD_esc_rule"]
+
+    def print_header(self):
+        self.fh.write('\t'.join(self.FIELDS) + '\n')
+
+    def print_entry(self, entry):
+        self.fh.write('\t'.join(str(entry.data[f]) for f in self.FIELDS) + '\n')
+
+
+    def close_file(self):
+        self.fh.close()
+
+
+# determine the positions of lower-case characters in a sequence
+def find_all_lowercase(seq):
+    return [i for i, c in enumerate(seq) if c.islower()]
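+
+# Illustration (hypothetical input): find_all_lowercase("AAgTcC") -> [2, 4]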