Merge branch 'main' into vep_cache_speedup

merge recent changes into branch to be up to date
ylab-hi · Jan 17, 2024 · 9edea0b · 9edea0b
2 parents c1abdb9 + a5f6a6c
commit 9edea0b
Show file tree

Hide file tree

Showing 14 changed files with 124 additions and 40 deletions.
diff --git a/.tests/integration/config_basic/config.yaml b/.tests/integration/config_basic/config.yaml
@@ -5,14 +5,14 @@ basequal: 20  # overall required base quality
 
 ### data
 data:
-  name:  patient2_test
+  name:  basic_sample
   dnaseq: 
     dna_normal: TESLA_testdata/patient2/WES/TESLA_9_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_9_2.fastq.gz
     dna_tumor: TESLA_testdata/patient2/WES/TESLA_10_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_10_2.fastq.gz
   rnaseq:
     rna_tumor: TESLA_testdata/patient2/RNA/TESLA_11_1.fastq.gz TESLA_testdata/patient2/RNA/TESLA_11_2.fastq.gz
   normal: dna_normal
-
+  
   custom:
     variants:
     hlatyping:
@@ -84,16 +84,15 @@ quantification:
   mode: BOTH # RNA, RNA or BOTH
 
 hlatyping:
-  class: BOTH # I, II or BOTH
-  mode: BOTH  # DNA, RNA or BOTH
+  class: I # I, II or BOTH
   # specific path for class II hlatyping (only required when class: II, or BOTH)
-  MHC-I_mode: BOTH # DNA, RNA, or BOTH (if empty alleles have to be specified in custom)
+  MHC-I_mode: DNA, RNA # DNA, RNA, or BOTH (if empty alleles have to be specified in custom)
   MHC-II_mode: BOTH # DNA, RNA, or BOTH (if empty alleles have to be specified in custom)
   freqdata: ./hlahd_files/freq_data/ 
   split: ./hlahd_files/HLA_gene.split.txt
   dict: ./hlahd_files/dictionary/
 
-priorization:
+prioritization:
   class: I # I, II or BOTH
   lengths:
     MHC-I: 8,9,10,11

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,15 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-[0.1.0] - 2023-08-17
+## [0.1.0] - 2023-08-17
 
 ### Added
 
 - Comprehensive workflow with different modules to detect variants from sequencing data
-- Preprocessing
-- Alignment
-- MHCI+II HLA typing 
-- Alternative splicing
-- Gene fusion 
-- Exitron 
-- Indels and SNVs (GATK)
+- Different modules for each step
+- Support data in single-end, paired-end .fastq or BAM files
+- preprocessing, alignment
+- genotyping
+- alternative splicing
+- gene fusion, exitron, SNVs and indels
diff --git a/README.md b/README.md
@@ -69,6 +69,15 @@ cd /path/to/your/working/directory/
 snakemake --cores all --use-conda
 ```
 
+As mentioned above, when exitron detection is activated the singularity option `--use-singularity` has to be used as well.
+
+```bash
+snakemake --cores all --use-conda --use-singularity
+```
+
+In addition, custom configfiles can be configured using `--configfile <path/to/configfile>`. In principle, this merely 
+overwrites the default config, and should include all key/value pairs of the valid config file.
+
 For more detailed instructions and explanations on how to use ScanNeo2, please consult the [wiki](https://github.com/ylab-hi/ScanNeo2/wiki).
 
 ## Docker Support

diff --git a/config/config.yaml b/config/config.yaml
@@ -22,7 +22,6 @@ data:
 preproc: 
   activate: true  # whether (=true) or not (=false) to include pre-processing
   minlen: 10
-  qual: 20
   slidingwindow:
     activate: true
     wsize: 3
@@ -77,6 +76,10 @@ indel:
   sliplen: 8  # min number of reference bases to suspect slippage event
   sliprate: 0.1  # frequency of slippage when it is supsected
 
+quantification:
+  activate: true
+  mode: BOTH # RNA, RNA or BOTH
+
 hlatyping:
   class: BOTH # I, II or BOTH
   mode: BOTH  # DNA, RNA or BOTH
@@ -87,7 +90,7 @@ hlatyping:
   split: ./hlahd_files/HLA_gene.split.txt
   dict: ./hlahd_files/dictionary/
 
-priorization:
+prioritization:
   class: I # I, II or BOTH
   lengths:
     MHC-I: 8,9,10,11

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -4,7 +4,7 @@ from snakemake.utils import min_version
 min_version("6.4.1")
 
 #### setup #######
-#configfile: "config/config.yaml"
+configfile: "config/config.yaml"
 
 rule all:
   input:

diff --git a/workflow/envs/prioritization.yml b/workflow/envs/prioritization.yml
@@ -0,0 +1,13 @@
+channels:
+ - bioconda
+ - conda-forge
+ - anaconda
+dependencies:
+ - python>=3.6
+ - vcfpy
+ - pyfaidx
+ - configargparse
+ - pandas
+ - pip
+ - pip:
+   - gffutils
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -199,11 +199,11 @@ def aggregate_mhcI_PE(wildcards):
     no=glob_wildcards(os.path.join(checkpoint_output, "R1_{no}.bam")).no)
 
 
-def get_mhcI_alleles(wildcards):
+def get_predicted_mhcI_alleles(wildcards):
   values = []
 
   # routines to genotype from DNA
-  if config['hlatyping']['MHC-I_mode'] in ['DNA', 'BOTH']:
+  if "DNA" in config['hlatyping']['MHC-I_mode']:
     if config['data']['dnaseq'] is not None:
       for key in config['data']['dnaseq'].keys():
         if key not in config['data']['normal']:
@@ -216,7 +216,7 @@ def get_mhcI_alleles(wildcards):
       print('dnaseq data has not been specified in the config file, but specified mode for hla genotyping in config file is DNA or BOTH -- will be ignored')
 
   # routines to genotype from RNA
-  if config['hlatyping']['MHC-I_mode'] in ['RNA', 'BOTH']:
+  if "RNA" in config['hlatyping']['MHC-I_mode']:
     if config['data']['rnaseq'] is not None:
       for key in config['data']['rnaseq'].keys():
         if key not in config['data']['normal']:
@@ -230,8 +230,11 @@ def get_mhcI_alleles(wildcards):
 
 
   # if alleles have been specified in the config file, add them to the list
-  if config['data']['custom']['hlatyping']['MHC-I'] is not None:
-    values.append(config['custom']['hlatyping']['MHC-I'])
+  #if "custom" in config['hlatyping']['MHC-I_mode']:
+    #if config['data']['custom']['hlatyping']['MHC-I'] is not None:
+      #values.append(config['custom']['hlatyping']['MHC-I'])
+  #if config['data']['custom']['hlatyping']['MHC-I'] is not None:
+    #values.append(config['custom']['hlatyping']['MHC-I'])
 
 
   if len(values) == 0:
@@ -240,13 +243,27 @@ def get_mhcI_alleles(wildcards):
 
   return values
 
+def get_all_mhcI_alleles(wildcards):
+  values = []
 
-##### MHC CLASS I
+  if ("DNA" in config['hlatyping']['MHC-I_mode'] or
+      "RNA" in config['hlatyping']['MHC-I_mode']):
+    values += expand("results/{sample}/hla/mhc-I/genotyping/mhc-I.tsv",
+                    sample = wildcards.sample)
 
+  if "custom" in config["hlatyping"]["MHC-I_mode"]:
+    values += [config["data"]["custom"]["hlatyping"]["MHC-I"]]
 
+  if len(values) == 0:
+    print('No hla data found. Check config file for correct specification of data and hla genotyping mode')
+    sys.exit(1)
+
+  return values
 
 
 
+##### MHC CLASS I
+
 
 # returns list of hla typing results for the given sample and group
 
@@ -329,7 +346,12 @@ def aggregate_aligned_rg(wildcards):
 def get_readgroups_input(wildcards):
   # return only bam from STAR align
   if config['data'][f'{wildcards.seqtype}_filetype'] in ['.fq','.fastq']:
-    return ["results/{sample}/{seqtype}/align/{group}_final_STAR.bam".format(**wildcards)]
+    return expand("results/{sample}/{seqtype}/align/{group}_final_STAR.bam",
+                  sample=wildcards.sample,
+                  seqtype=wildcards.seqtype,
+                  group=wildcards.group)
+
+#    return ["results/{sample}/{seqtype}/align/{group}_final_STAR.bam".format(**wildcards)]
 
   elif config['data'][f'{wildcards.seqtype}_filetype'] in ['.bam']:
     val = []
@@ -341,6 +363,7 @@ def get_readgroups_input(wildcards):
       # needs both the raw data and star aligned bam 
       val.append(str(config['data']['rnaseq'][wildcards.group]))
       val += expand("results/{sample}/{seqtype}/align/{group}_final_STAR.bam",
+          sample=wildcards.sample,
           seqtype='rnaseq',
           group=wildcards.group)
 
@@ -555,6 +578,7 @@ def get_mhcI(wildcards):
   if config['prioritization']['class'] in ['I', 'BOTH']:
     alleles += expand("results/{sample}/hla/mhc-I.tsv",
                       sample=config['data']['name'])
+
   return alleles
 
 def get_mhcII(wildcards):

diff --git a/workflow/rules/hlatyping.smk b/workflow/rules/hlatyping.smk
@@ -288,23 +288,41 @@ rule combine_mhcI_PE:
           '{input}' {output}
     """
 
-rule merge_mhcI_allels:
+rule merge_predicted_mhcI_allels:
   input:
-    get_mhcI_alleles
+    get_predicted_mhcI_alleles
   output:
-    "results/{sample}/hla/mhc-I.tsv",
+    "results/{sample}/hla/genotyping/mhc-I.tsv",
   message:
     "Merging HLA alleles from different sources"
   log:
-    "logs/{sample}/optitype/merge_classI_alleles.log"
+    "logs/{sample}/optitype/merge_predicted_mhc-I.log"
   conda:
     "../envs/basic.yml"
   threads: 1
   shell:
     """
-      python workflow/scripts/merge_mhcI_alleles.py \
+      python workflow/scripts/genotyping/merge_predicted_mhcI.py \
           '{input}' {output}
     """
+
+rule combine_all_mhcI_alleles:
+  input:
+    get_all_mhcI_alleles
+  output:
+    "results/{sample}/hla/mhc-I.tsv"
+  message:
+    "Combining HLA alleles from different sources"
+  log:
+    "logs/{sample}/genotyping/combine_all_mhc-I.log"
+  conda:
+    "../envs/basic.yml"
+  threads: 1
+  shell:
+    """
+      python workflow/scripts/genotyping/combine_all_alleles.py \
+          '{input}' {output} > {log} 2>&1\
+    """
 
 ######### MHC-II HLA GENOTYPING ###########
 rule filter_reads_mhcII_PE:

diff --git a/workflow/rules/preproc.smk b/workflow/rules/preproc.smk
@@ -82,7 +82,7 @@ rule preproc_paired_end:
     params:
       dapters="",
       extra=lambda wildcards: "-u 100 -e {0} -l {1} {2} ".format(
-          config['preproc']['qual'], 
+          config['basequal'], 
           config['preproc']['minlen'],
           "-3 --cut_tail_window_size {} cut_tail_mean_quality {}".format(
               config['preproc']['slidingwindow']['wsize'],

diff --git a/workflow/rules/prioritization.smk b/workflow/rules/prioritization.smk
@@ -38,7 +38,7 @@ rule priorization:
     annotation="resources/refs/genome_tmp.gtf",
     counts="results/{sample}/quantification/allcounts.txt"
   output:
-    "results/{sample}/priorization/",
+    "results/{sample}/prioritization/",
   message:
     "Predicting binding affinities on sample:{wildcards.sample}"
   conda:

diff --git a/workflow/scripts/genotyping/combine_all_alleles.py b/workflow/scripts/genotyping/combine_all_alleles.py
@@ -0,0 +1,28 @@
+import os
+import sys
+
+def main():
+    infiles = sys.argv[1].split(' ')
+    alleles = {}
+    for mhc in infiles:
+        fh_in = open(mhc, 'r')
+        # skip header
+        for line in fh_in:
+            cols = line.rstrip().split('\t')
+            source = cols[0]
+            mhc = cols[1]
+
+            if mhc not in alleles:
+                alleles[mhc] = source
+            else:
+                alleles[mhc] += ',' + source
+        fh_in.close()
+
+    outfile = sys.argv[2]
+    fh_out = open(outfile, 'w')
+    for allele in alleles:
+        fh_out.write(f'{alleles[allele]}\t{allele}\n')
+    fh_out.close()
+
+
+main()
diff --git a/workflow/scripts/merge_mhcI_alleles.py → ...cripts/genotyping/merge_predicted_mhcI.py b/workflow/scripts/merge_mhcI_alleles.py → ...cripts/genotyping/merge_predicted_mhcI.py
@@ -29,9 +29,4 @@ def main():
                 out.write(f',{v}')
         out.write(f'\t{al}\n')
 
-
-
-
-
-
 main()
diff --git a/workflow/scripts/prioritization/compile.py b/workflow/scripts/prioritization/compile.py
@@ -5,9 +5,6 @@
 import prediction
 import filtering
 
-import pdb
-
-
 def main():
     options = parse_arguments()
 

diff --git a/workflow/scripts/prioritization/prediction.py b/workflow/scripts/prioritization/prediction.py
@@ -1,5 +1,4 @@
 import tempfile
-import pdb
 import os
 import concurrent.futures
 import subprocess
Original file line number	Diff line number	Diff line change
Expand Up		@@ -29,9 +29,4 @@ def main():
		out.write(f',{v}')
		out.write(f'\t{al}\n')






		main()