diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile index c73b42d2..d2a42f64 100644 --- a/pipelines/annotations.snakefile +++ b/pipelines/annotations.snakefile @@ -4,6 +4,7 @@ from pathlib import Path configfile: "config/deeprvat_annotation_config.yaml" + # init general species = config.get("species") or "homo_sapiens" @@ -26,8 +27,12 @@ saved_deepripe_models_path = ( merge_nthreads = int(config.get("merge_nthreads") or 64) # If modules are used we load them here -load_bfc = " ".join([config["bcftools_load_cmd"], "&&" if config["bcftools_load_cmd"] else ""]) -load_hts = " ".join([config["htslib_load_cmd"], "&&" if config["htslib_load_cmd"] else ""]) +load_bfc = " ".join( + [config["bcftools_load_cmd"], "&&" if config["bcftools_load_cmd"] else ""] +) +load_hts = " ".join( + [config["htslib_load_cmd"], "&&" if config["htslib_load_cmd"] else ""] +) load_perl = " ".join([config["perl_load_cmd"], "&&" if config["perl_load_cmd"] else ""]) load_vep = " ".join([config["vep_load_cmd"], "&&" if config["vep_load_cmd"] else ""]) @@ -53,7 +58,7 @@ vep_cache_dir = Path(config.get("vep_cache_dir")) or vep_source_dir / "cache" vep_plugin_dir = Path(config.get("vep_plugin_dir")) or vep_source_dir / "Plugin" vep_input_format = config.get("vep_input_format") or "vcf" vep_nfork = int(config.get("vep_nfork") or 5) -af_mode= config.get("af_mode") or "af" +af_mode = config.get("af_mode") or "af" # init plugIns spliceAI_snv_file = config["spliceAI_snv_file"] @@ -72,11 +77,11 @@ pvcf_blocks_df = pd.read_csv( absplice_repo_dir = Path(config["absplice_repo_dir"]) n_cores_absplice = int(config.get("n_cores_absplice") or 4) ncores_merge_absplice = int(config.get("n_cores_merge_absplice") or 64) -#init deepripe +# init deepripe n_jobs_deepripe = int(config.get("n_jobs_deepripe") or 8) # init kipoi-veff2 kipoi_repo_dir = Path(config["kipoiveff_repo_dir"]) -ncores_addis = int(config.get("n_jobs_addids") or 32) +ncores_addis = int(config.get("n_jobs_addids") or 32) # Filter out which chromosomes to work with pvcf_blocks_df = pvcf_blocks_df[ pvcf_blocks_df["Chromosome"].isin([str(c) for c in included_chromosomes]) @@ -103,16 +108,19 @@ rule all: rule aggregate_and_merge_absplice: input: abscore_files=expand( - [anno_tmp_dir / "absplice" / (source_variant_file_pattern + "_AbSplice_DNA.csv")], + [ + anno_tmp_dir + / "absplice" + / (source_variant_file_pattern + "_AbSplice_DNA.csv") + ], zip, chr=chromosomes, block=block, ), - current_annotation_file=anno_dir / "vep_deepripe_deepsea.parquet" + current_annotation_file=anno_dir / "vep_deepripe_deepsea.parquet", output: annotations=anno_dir / "current_annotations_absplice.parquet", - scores=anno_tmp_dir / "abSplice_score_file.parquet" - + scores=anno_tmp_dir / "abSplice_score_file.parquet", shell: " ".join( [ @@ -123,10 +131,9 @@ rule aggregate_and_merge_absplice: str(anno_tmp_dir / "absplice"), "{output.annotations}", "{output.scores}", - f"{ncores_merge_absplice}" - ]) - - + f"{ncores_merge_absplice}", + ] + ) rule merge_deepsea_pcas: @@ -134,7 +141,7 @@ rule merge_deepsea_pcas: annotations=anno_dir / "vep_deepripe.parquet", deepsea_pcas=anno_dir / "deepSea_pca" / "deepsea_pca.parquet", output: - anno_dir / "vep_deepripe_deepsea.parquet" + anno_dir / "vep_deepripe_deepsea.parquet", shell: " ".join( [ @@ -147,40 +154,55 @@ rule merge_deepsea_pcas: ] ) + rule concat_annotations: input: - pvcf = metadata_dir / config['pvcf_blocks_file'], - anno_dir = anno_dir, - vcf_files= - expand([anno_dir / f"{source_variant_file_pattern}_merged.parquet"], + pvcf=metadata_dir / config["pvcf_blocks_file"], + anno_dir=anno_dir, + vcf_files=expand( + [anno_dir / f"{source_variant_file_pattern}_merged.parquet"], zip, chr=chromosomes, - block=block) - output: anno_dir / "vep_deepripe.parquet" + block=block, + ), + output: + anno_dir / "vep_deepripe.parquet", shell: - " ".join([ - "python", - str(annotation_python_file), - "concat-annotations", - "{input.pvcf}", - "{input.anno_dir}", - f"{str(source_variant_file_pattern + '_merged.parquet').format(chr='{{chr}}', block='{{block}}')}", - "{output}", - f" --included-chromosomes {','.join(included_chromosomes)}" - ]) + " ".join( + [ + "python", + str(annotation_python_file), + "concat-annotations", + "{input.pvcf}", + "{input.anno_dir}", + f"{str(source_variant_file_pattern + '_merged.parquet').format(chr='{{chr}}', block='{{block}}')}", + "{output}", + f" --included-chromosomes {','.join(included_chromosomes)}", + ] + ) + rule merge_annotations: input: - vep = anno_dir / (source_variant_file_pattern + "_vep_anno.tsv"), - deepripe_parclip = anno_dir / (source_variant_file_pattern + "_variants.parclip_deepripe.csv.gz"), - deepripe_k5 = anno_dir / (source_variant_file_pattern + "_variants.eclip_k5_deepripe.csv.gz"), - deepripe_hg2 = anno_dir / (source_variant_file_pattern + "_variants.eclip_hg2_deepripe.csv.gz"), - variant_file = variant_file - - + vep=anno_dir / (source_variant_file_pattern + "_vep_anno.tsv"), + deepripe_parclip=anno_dir + / (source_variant_file_pattern + "_variants.parclip_deepripe.csv.gz"), + deepripe_k5=anno_dir + / (source_variant_file_pattern + "_variants.eclip_k5_deepripe.csv.gz"), + deepripe_hg2=anno_dir + / (source_variant_file_pattern + "_variants.eclip_hg2_deepripe.csv.gz"), + variant_file=variant_file, output: anno_dir / f"{source_variant_file_pattern}_merged.parquet", - shell: "HEADER=$(grep -n '#Uploaded_variation' "+"{input.vep}" +"| head | cut -f 1 -d ':') && python "+f"{annotation_python_file} "+"merge-annotations $(($HEADER-1)) {input.vep} {input.deepripe_parclip} {input.deepripe_hg2} {input.deepripe_k5} {input.variant_file} {output}" + shell: + ( + "HEADER=$(grep -n '#Uploaded_variation' " + + "{input.vep}" + + "| head | cut -f 1 -d ':') && python " + + f"{annotation_python_file} " + + "merge-annotations $(($HEADER-1)) {input.vep} {input.deepripe_parclip} {input.deepripe_hg2} {input.deepripe_k5} {input.variant_file} {output}" + ) + rule mv_absplice_files: input: @@ -223,7 +245,6 @@ rule absplice: block=block, ), config=absplice_repo_dir / "example" / "workflow" / "mv_config.done", - output: expand( [ @@ -242,7 +263,10 @@ rule absplice: ), threads: n_cores_absplice shell: - f"""python -m snakemake -s {str(absplice_repo_dir/"example"/"workflow"/"Snakefile")} -j 1 --use-conda --rerun-incomplete --directory {str(absplice_repo_dir /"example"/"workflow")} -c"""+"{threads}" + ( + f"""python -m snakemake -s {str(absplice_repo_dir/"example"/"workflow"/"Snakefile")} -j 1 --use-conda --rerun-incomplete --directory {str(absplice_repo_dir /"example"/"workflow")} -c""" + + "{threads}" + ) rule mod_config_absplice: @@ -251,6 +275,7 @@ rule mod_config_absplice: shell: f""" rm {absplice_repo_dir}/example/workflow/config.yaml && cp {deeprvat_parent_path}/pipelines/resources/absplice_config.yaml {absplice_repo_dir}/example/workflow/config.yaml && touch {absplice_repo_dir}/example/workflow/mv_config.done""" + rule link_files_absplice: input: anno_tmp_dir / (source_variant_file_pattern + "_variants_header.vcf.gz"), @@ -262,13 +287,11 @@ rule link_files_absplice: f"mkdir -p {absplice_repo_dir/'example/data/resources/analysis_files/input_files'} && ln -s -r {{input}} {{output}}" - rule deepSea_PCA: input: anno_dir / "all_variants.wID.deepSea.csv", output: anno_dir / "deepSea_pca" / "deepsea_pca.parquet", - shell: " ".join( [ @@ -280,7 +303,7 @@ rule deepSea_PCA: "deepsea-pca", f"--n-components {config['deepsea_pca_n_components']}", "{input}", - str(anno_dir / "deepSea_pca"/ "pca_matrix"), + str(anno_dir / "deepSea_pca" / "pca_matrix"), str(anno_dir / "deepSea_pca"), ] ) @@ -291,9 +314,8 @@ rule add_ids_deepSea: variant_file=variant_file, annotation_file=anno_dir / "all_variants.deepSea.csv", output: - anno_dir / "all_variants.wID.deepSea.csv" + anno_dir / "all_variants.wID.deepSea.csv", threads: ncores_addis - shell: " ".join( [ @@ -312,7 +334,8 @@ rule concat_deepSea: input: expand( [ - anno_dir / (source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv"), + anno_dir + / (source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv"), ], zip, chr=chromosomes, @@ -320,7 +343,6 @@ rule concat_deepSea: ), output: anno_dir / "all_variants.deepSea.csv", - shell: " ".join( [ @@ -331,9 +353,9 @@ rule concat_deepSea: ",".join(included_chromosomes), "--sep '\t'", f"{anno_dir}", - str(source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv").format( - chr="{{chr}}", block="{{block}}" - ), + str( + source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv" + ).format(chr="{{chr}}", block="{{block}}"), str(metadata_dir / config["pvcf_blocks_file"]), str( anno_dir / "all_variants.deepSea.csv", @@ -344,7 +366,8 @@ rule concat_deepSea: rule deepSea: input: - variants=anno_tmp_dir / (source_variant_file_pattern + "_variants_header.vcf.gz"), + variants=anno_tmp_dir + / (source_variant_file_pattern + "_variants_header.vcf.gz"), fasta=fasta_dir / fasta_file_name, output: anno_dir / (source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv"), @@ -354,15 +377,12 @@ rule deepSea: "kipoi_veff2_predict {input.variants} {input.fasta} {output} -l 1000 -m 'DeepSEA/predict' -s 'diff'" - - rule deepRiPe_parclip: input: variants=anno_tmp_dir / (source_variant_file_pattern + "_variants.vcf"), fasta=fasta_dir / fasta_file_name, output: anno_dir / (source_variant_file_pattern + "_variants.parclip_deepripe.csv.gz"), - shell: f"mkdir -p {pybedtools_tmp_path/'parclip'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path/'parclip'} {saved_deepripe_models_path} {{threads}} 'parclip'" @@ -384,13 +404,11 @@ rule deepRiPe_eclip_k5: fasta=fasta_dir / fasta_file_name, output: anno_dir / (source_variant_file_pattern + "_variants.eclip_k5_deepripe.csv.gz"), - threads: lambda wildcards, attempt: n_jobs_deepripe * attempt shell: f"mkdir -p {pybedtools_tmp_path/'k5'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path/'k5'} {saved_deepripe_models_path} {{threads}} 'eclip_k5'" - rule vep: input: vcf=anno_tmp_dir / (source_variant_file_pattern + "_stripped.vcf.gz"), @@ -398,7 +416,6 @@ rule vep: output: anno_dir / (source_variant_file_pattern + "_vep_anno.tsv"), threads: vep_nfork - shell: " ".join( [ @@ -442,13 +459,15 @@ rule vep: f"--plugin CADD,{cadd_snv_file},{cadd_indel_file}", f"--plugin SpliceAI,snv={spliceAI_snv_file},indel={spliceAI_indel_file}", f"--plugin PrimateAI,{primateAIfile}", - f"--plugin Condel,{condel_config_path},s,2" + f"--plugin Condel,{condel_config_path},s,2", ] ) + rule extract_with_header: input: - source_variant_dir / (source_variant_file_pattern + f".{config['source_variant_file_type']}"), + source_variant_dir + / (source_variant_file_pattern + f".{config['source_variant_file_type']}"), output: anno_tmp_dir / (source_variant_file_pattern + "_variants_header.vcf.gz"), shell: @@ -473,7 +492,8 @@ rule strip_chr_name: rule extract_variants: input: - source_variant_dir / (source_variant_file_pattern + f".{config['source_variant_file_type']}"), + source_variant_dir + / (source_variant_file_pattern + f".{config['source_variant_file_type']}"), output: anno_tmp_dir / (source_variant_file_pattern + "_variants.vcf"), shell: