From a5f6a6c2429787287459eb000952a8bf163dcc97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20A=2E=20Sch=C3=A4fer?= Date: Tue, 16 Jan 2024 15:28:10 -0600 Subject: [PATCH] Minor fixes (#5) * added instructions for exitron detection and using custom configfiles * fixed formatting errors * separated routines for merging hla alleles to allow incorporating custom alleles more easier * typo in output folder * wrong parentheses added * syntax error resolved - excessive colon * added sample wildcards that was missing * fixed error in custom input of mhc * fixed missing librarues and removed debugger * refactor genotyping scripts * added instructions and more detail in changelog * removed local base quality parameter in preproc --- .tests/integration/config_basic/config.yaml | 11 +++--- CHANGELOG.md | 15 ++++---- README.md | 9 +++++ config/config.yaml | 7 +++- workflow/Snakefile | 2 +- workflow/envs/prioritization.yml | 13 +++++++ workflow/rules/common.smk | 38 +++++++++++++++---- workflow/rules/hlatyping.smk | 28 +++++++++++--- workflow/rules/preproc.smk | 2 +- workflow/rules/prioritization.smk | 2 +- .../scripts/genotyping/combine_all_alleles.py | 28 ++++++++++++++ .../merge_predicted_mhcI.py} | 5 --- workflow/scripts/prioritization/compile.py | 3 -- workflow/scripts/prioritization/prediction.py | 1 - 14 files changed, 124 insertions(+), 40 deletions(-) create mode 100644 workflow/envs/prioritization.yml create mode 100644 workflow/scripts/genotyping/combine_all_alleles.py rename workflow/scripts/{merge_mhcI_alleles.py => genotyping/merge_predicted_mhcI.py} (98%) diff --git a/.tests/integration/config_basic/config.yaml b/.tests/integration/config_basic/config.yaml index 872e344..fe4d4a8 100644 --- a/.tests/integration/config_basic/config.yaml +++ b/.tests/integration/config_basic/config.yaml @@ -5,14 +5,14 @@ basequal: 20 # overall required base quality ### data data: - name: patient2_test + name: basic_sample dnaseq: dna_normal: TESLA_testdata/patient2/WES/TESLA_9_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_9_2.fastq.gz dna_tumor: TESLA_testdata/patient2/WES/TESLA_10_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_10_2.fastq.gz rnaseq: rna_tumor: TESLA_testdata/patient2/RNA/TESLA_11_1.fastq.gz TESLA_testdata/patient2/RNA/TESLA_11_2.fastq.gz normal: dna_normal - + custom: variants: hlatyping: @@ -84,16 +84,15 @@ quantification: mode: BOTH # RNA, RNA or BOTH hlatyping: - class: BOTH # I, II or BOTH - mode: BOTH # DNA, RNA or BOTH + class: I # I, II or BOTH # specific path for class II hlatyping (only required when class: II, or BOTH) - MHC-I_mode: BOTH # DNA, RNA, or BOTH (if empty alleles have to be specified in custom) + MHC-I_mode: DNA, RNA # DNA, RNA, or BOTH (if empty alleles have to be specified in custom) MHC-II_mode: BOTH # DNA, RNA, or BOTH (if empty alleles have to be specified in custom) freqdata: ./hlahd_files/freq_data/ split: ./hlahd_files/HLA_gene.split.txt dict: ./hlahd_files/dictionary/ -priorization: +prioritization: class: I # I, II or BOTH lengths: MHC-I: 8,9,10,11 diff --git a/CHANGELOG.md b/CHANGELOG.md index a1fbc16..4e34ac8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,15 +5,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -[0.1.0] - 2023-08-17 +## [0.1.0] - 2023-08-17 ### Added - Comprehensive workflow with different modules to detect variants from sequencing data -- Preprocessing -- Alignment -- MHCI+II HLA typing -- Alternative splicing -- Gene fusion -- Exitron -- Indels and SNVs (GATK) +- Different modules for each step +- Support data in single-end, paired-end .fastq or BAM files +- preprocessing, alignment +- genotyping +- alternative splicing +- gene fusion, exitron, SNVs and indels diff --git a/README.md b/README.md index 1ad3e70..1964a2a 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,15 @@ cd /path/to/your/working/directory/ snakemake --cores all --use-conda ``` +As mentioned above, when exitron detection is activated the singularity option `--use-singularity` has to be used as well. + +```bash +snakemake --cores all --use-conda --use-singularity +``` + +In addition, custom configfiles can be configured using `--configfile `. In principle, this merely +overwrites the default config, and should include all key/value pairs of the valid config file. + For more detailed instructions and explanations on how to use ScanNeo2, please consult the [wiki](https://github.com/ylab-hi/ScanNeo2/wiki). ## Docker Support diff --git a/config/config.yaml b/config/config.yaml index e5cf8e1..804c358 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -22,7 +22,6 @@ data: preproc: activate: true # whether (=true) or not (=false) to include pre-processing minlen: 10 - qual: 20 slidingwindow: activate: true wsize: 3 @@ -77,6 +76,10 @@ indel: sliplen: 8 # min number of reference bases to suspect slippage event sliprate: 0.1 # frequency of slippage when it is supsected +quantification: + activate: true + mode: BOTH # RNA, RNA or BOTH + hlatyping: class: BOTH # I, II or BOTH mode: BOTH # DNA, RNA or BOTH @@ -87,7 +90,7 @@ hlatyping: split: ./hlahd_files/HLA_gene.split.txt dict: ./hlahd_files/dictionary/ -priorization: +prioritization: class: I # I, II or BOTH lengths: MHC-I: 8,9,10,11 diff --git a/workflow/Snakefile b/workflow/Snakefile index 626eafd..7057dbd 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -4,7 +4,7 @@ from snakemake.utils import min_version min_version("6.4.1") #### setup ####### -#configfile: "config/config.yaml" +configfile: "config/config.yaml" rule all: input: diff --git a/workflow/envs/prioritization.yml b/workflow/envs/prioritization.yml new file mode 100644 index 0000000..d337067 --- /dev/null +++ b/workflow/envs/prioritization.yml @@ -0,0 +1,13 @@ +channels: + - bioconda + - conda-forge + - anaconda +dependencies: + - python>=3.6 + - vcfpy + - pyfaidx + - configargparse + - pandas + - pip + - pip: + - gffutils diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d620fd8..276163d 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -199,11 +199,11 @@ def aggregate_mhcI_PE(wildcards): no=glob_wildcards(os.path.join(checkpoint_output, "R1_{no}.bam")).no) -def get_mhcI_alleles(wildcards): +def get_predicted_mhcI_alleles(wildcards): values = [] # routines to genotype from DNA - if config['hlatyping']['MHC-I_mode'] in ['DNA', 'BOTH']: + if "DNA" in config['hlatyping']['MHC-I_mode']: if config['data']['dnaseq'] is not None: for key in config['data']['dnaseq'].keys(): if key not in config['data']['normal']: @@ -216,7 +216,7 @@ def get_mhcI_alleles(wildcards): print('dnaseq data has not been specified in the config file, but specified mode for hla genotyping in config file is DNA or BOTH -- will be ignored') # routines to genotype from RNA - if config['hlatyping']['MHC-I_mode'] in ['RNA', 'BOTH']: + if "RNA" in config['hlatyping']['MHC-I_mode']: if config['data']['rnaseq'] is not None: for key in config['data']['rnaseq'].keys(): if key not in config['data']['normal']: @@ -230,8 +230,11 @@ def get_mhcI_alleles(wildcards): # if alleles have been specified in the config file, add them to the list - if config['data']['custom']['hlatyping']['MHC-I'] is not None: - values.append(config['custom']['hlatyping']['MHC-I']) + #if "custom" in config['hlatyping']['MHC-I_mode']: + #if config['data']['custom']['hlatyping']['MHC-I'] is not None: + #values.append(config['custom']['hlatyping']['MHC-I']) + #if config['data']['custom']['hlatyping']['MHC-I'] is not None: + #values.append(config['custom']['hlatyping']['MHC-I']) if len(values) == 0: @@ -240,13 +243,27 @@ def get_mhcI_alleles(wildcards): return values +def get_all_mhcI_alleles(wildcards): + values = [] -##### MHC CLASS I + if ("DNA" in config['hlatyping']['MHC-I_mode'] or + "RNA" in config['hlatyping']['MHC-I_mode']): + values += expand("results/{sample}/hla/mhc-I/genotyping/mhc-I.tsv", + sample = wildcards.sample) + if "custom" in config["hlatyping"]["MHC-I_mode"]: + values += [config["data"]["custom"]["hlatyping"]["MHC-I"]] + if len(values) == 0: + print('No hla data found. Check config file for correct specification of data and hla genotyping mode') + sys.exit(1) + + return values +##### MHC CLASS I + # returns list of hla typing results for the given sample and group @@ -329,7 +346,12 @@ def aggregate_aligned_rg(wildcards): def get_readgroups_input(wildcards): # return only bam from STAR align if config['data'][f'{wildcards.seqtype}_filetype'] in ['.fq','.fastq']: - return ["results/{sample}/{seqtype}/align/{group}_final_STAR.bam".format(**wildcards)] + return expand("results/{sample}/{seqtype}/align/{group}_final_STAR.bam", + sample=wildcards.sample, + seqtype=wildcards.seqtype, + group=wildcards.group) + +# return ["results/{sample}/{seqtype}/align/{group}_final_STAR.bam".format(**wildcards)] elif config['data'][f'{wildcards.seqtype}_filetype'] in ['.bam']: val = [] @@ -341,6 +363,7 @@ def get_readgroups_input(wildcards): # needs both the raw data and star aligned bam val.append(str(config['data']['rnaseq'][wildcards.group])) val += expand("results/{sample}/{seqtype}/align/{group}_final_STAR.bam", + sample=wildcards.sample, seqtype='rnaseq', group=wildcards.group) @@ -555,6 +578,7 @@ def get_mhcI(wildcards): if config['prioritization']['class'] in ['I', 'BOTH']: alleles += expand("results/{sample}/hla/mhc-I.tsv", sample=config['data']['name']) + return alleles def get_mhcII(wildcards): diff --git a/workflow/rules/hlatyping.smk b/workflow/rules/hlatyping.smk index 578f648..c4d7dbc 100644 --- a/workflow/rules/hlatyping.smk +++ b/workflow/rules/hlatyping.smk @@ -288,23 +288,41 @@ rule combine_mhcI_PE: '{input}' {output} """ -rule merge_mhcI_allels: +rule merge_predicted_mhcI_allels: input: - get_mhcI_alleles + get_predicted_mhcI_alleles output: - "results/{sample}/hla/mhc-I.tsv", + "results/{sample}/hla/genotyping/mhc-I.tsv", message: "Merging HLA alleles from different sources" log: - "logs/{sample}/optitype/merge_classI_alleles.log" + "logs/{sample}/optitype/merge_predicted_mhc-I.log" conda: "../envs/basic.yml" threads: 1 shell: """ - python workflow/scripts/merge_mhcI_alleles.py \ + python workflow/scripts/genotyping/merge_predicted_mhcI.py \ '{input}' {output} """ + +rule combine_all_mhcI_alleles: + input: + get_all_mhcI_alleles + output: + "results/{sample}/hla/mhc-I.tsv" + message: + "Combining HLA alleles from different sources" + log: + "logs/{sample}/genotyping/combine_all_mhc-I.log" + conda: + "../envs/basic.yml" + threads: 1 + shell: + """ + python workflow/scripts/genotyping/combine_all_alleles.py \ + '{input}' {output} > {log} 2>&1\ + """ ######### MHC-II HLA GENOTYPING ########### rule filter_reads_mhcII_PE: diff --git a/workflow/rules/preproc.smk b/workflow/rules/preproc.smk index 849372e..0c8e4b0 100644 --- a/workflow/rules/preproc.smk +++ b/workflow/rules/preproc.smk @@ -82,7 +82,7 @@ rule preproc_paired_end: params: dapters="", extra=lambda wildcards: "-u 100 -e {0} -l {1} {2} ".format( - config['preproc']['qual'], + config['basequal'], config['preproc']['minlen'], "-3 --cut_tail_window_size {} cut_tail_mean_quality {}".format( config['preproc']['slidingwindow']['wsize'], diff --git a/workflow/rules/prioritization.smk b/workflow/rules/prioritization.smk index b024286..9e2c9fc 100644 --- a/workflow/rules/prioritization.smk +++ b/workflow/rules/prioritization.smk @@ -38,7 +38,7 @@ rule priorization: annotation="resources/refs/genome_tmp.gtf", counts="results/{sample}/quantification/allcounts.txt" output: - "results/{sample}/priorization/", + "results/{sample}/prioritization/", message: "Predicting binding affinities on sample:{wildcards.sample}" conda: diff --git a/workflow/scripts/genotyping/combine_all_alleles.py b/workflow/scripts/genotyping/combine_all_alleles.py new file mode 100644 index 0000000..e8c68fe --- /dev/null +++ b/workflow/scripts/genotyping/combine_all_alleles.py @@ -0,0 +1,28 @@ +import os +import sys + +def main(): + infiles = sys.argv[1].split(' ') + alleles = {} + for mhc in infiles: + fh_in = open(mhc, 'r') + # skip header + for line in fh_in: + cols = line.rstrip().split('\t') + source = cols[0] + mhc = cols[1] + + if mhc not in alleles: + alleles[mhc] = source + else: + alleles[mhc] += ',' + source + fh_in.close() + + outfile = sys.argv[2] + fh_out = open(outfile, 'w') + for allele in alleles: + fh_out.write(f'{alleles[allele]}\t{allele}\n') + fh_out.close() + + +main() diff --git a/workflow/scripts/merge_mhcI_alleles.py b/workflow/scripts/genotyping/merge_predicted_mhcI.py similarity index 98% rename from workflow/scripts/merge_mhcI_alleles.py rename to workflow/scripts/genotyping/merge_predicted_mhcI.py index 591070f..ed94b4e 100644 --- a/workflow/scripts/merge_mhcI_alleles.py +++ b/workflow/scripts/genotyping/merge_predicted_mhcI.py @@ -29,9 +29,4 @@ def main(): out.write(f',{v}') out.write(f'\t{al}\n') - - - - - main() diff --git a/workflow/scripts/prioritization/compile.py b/workflow/scripts/prioritization/compile.py index eaf2d73..3119a09 100644 --- a/workflow/scripts/prioritization/compile.py +++ b/workflow/scripts/prioritization/compile.py @@ -5,9 +5,6 @@ import prediction import filtering -import pdb - - def main(): options = parse_arguments() diff --git a/workflow/scripts/prioritization/prediction.py b/workflow/scripts/prioritization/prediction.py index 741c4e2..ca5a44d 100644 --- a/workflow/scripts/prioritization/prediction.py +++ b/workflow/scripts/prioritization/prediction.py @@ -1,5 +1,4 @@ import tempfile -import pdb import os import concurrent.futures import subprocess