From 11c69a538e0dc52695266d0ffb134ed38e7d4a1a Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Fri, 13 Oct 2023 16:27:38 +0200 Subject: [PATCH 1/6] Fix processing of Uploaded_variation col --- deeprvat/annotations/annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index 766ce98f..a616799e 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -1142,7 +1142,7 @@ def process_deepripe(deepripe_df:object, column_prefix:str)->object: return deepripe_df def process_vep(vep_file:object)->object: - vep_file[["chrom", "pos", "ref", "alt"]] = vep_file["#Uploaded_variation"].str.split( + vep_file[["chrom", "pos", "ref", "alt"]] = vep_file["#Uploaded_variation"].str.replace('_',':').replace('/',':').split( ":", expand=True ) From 81324a020474f54d81921d1e5b1fe6b0fe49016f Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Fri, 13 Oct 2023 16:31:47 +0200 Subject: [PATCH 2/6] add vep to conda env --- deeprvat_annotations.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deeprvat_annotations.yml b/deeprvat_annotations.yml index c7b5e1bc..0c4b6896 100644 --- a/deeprvat_annotations.yml +++ b/deeprvat_annotations.yml @@ -18,4 +18,5 @@ dependencies: - fastparquet=2023.4.0 #comment out lines below if you want to use preinstalled bcftools or samtools - bcftools=1.17 - - samtools=1.17 \ No newline at end of file + - samtools=1.17 + - ensembl-vep=110.1 \ No newline at end of file From 15f50bc2a2f5809f1f12cf235b3e638aeb11b054 Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Fri, 13 Oct 2023 16:37:40 +0200 Subject: [PATCH 3/6] Fix snakemake typos --- pipelines/annotations.snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile index 651dcf01..8d3f17e3 100644 --- a/pipelines/annotations.snakefile +++ 
b/pipelines/annotations.snakefile @@ -111,7 +111,7 @@ rule aggregate_and_merge_absplice: ), current_annotation_file=anno_dir / "vep_deepripe_deepsea.parquet" output: - annotations=anno_dir / "vcomplete_annotations.parquet.parquet" + annotations=anno_dir / "vcomplete_annotations.parquet.parquet", scores=anno_tmp_dir / "abSplice_score_file.parquet", shell: @@ -129,7 +129,7 @@ rule aggregate_and_merge_absplice: rule merge_deepsea_pcas: input: - annotations=anno_dir / "vep_deepripe.parquet" + annotations=anno_dir / "vep_deepripe.parquet", deepsea_pcas=anno_dir / "deepSea_pca" / "deepsea_pca.parquet", output: anno_dir / "vep_deepripe_deepsea.parquet" From 3bb17c66f1acbd73ec69d9ebaad57eae54aa22ec Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Fri, 13 Oct 2023 16:38:05 +0200 Subject: [PATCH 4/6] Remove unused import --- pipelines/annotations.snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile index 8d3f17e3..4bf8cabe 100644 --- a/pipelines/annotations.snakefile +++ b/pipelines/annotations.snakefile @@ -1,5 +1,4 @@ import pandas as pd -import os from pathlib import Path From e29fcc09da6436d05f12a987e80c826f3be8ad15 Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Sun, 15 Oct 2023 12:24:56 +0200 Subject: [PATCH 5/6] Fix split of cols --- deeprvat/annotations/annotations.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index 09981973..d83bf7c3 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -1090,9 +1090,12 @@ def process_deepripe(deepripe_df:object, column_prefix:str)->object: deepripe_df.drop_duplicates(subset=["chrom", "pos", "ref", "alt"], inplace=True) return deepripe_df -def process_vep(vep_file:object)->object: - vep_file[["chrom", "pos", "ref", "alt"]] = vep_file["#Uploaded_variation"].str.replace('_',':').replace('/',':').split( - ":", 
expand=True +def process_vep(vep_file: object) -> object: + vep_file[["chrom", "pos", "ref", "alt"]] = ( + vep_file["#Uploaded_variation"] + .str.replace("_", ":") + .str.replace("/", ":") + .str.split(":", expand=True) ) vep_file["pos"] = vep_file["pos"].astype(int) From 7be19158d708735073ec96ef530490639e521bc0 Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Sun, 15 Oct 2023 12:25:59 +0200 Subject: [PATCH 6/6] Only read the file in one place --- deeprvat/annotations/annotations.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index d83bf7c3..c2061752 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -1139,16 +1139,14 @@ def concat_annotations(pvcf_blocks_file:str, annotation_dir:str, filename_patter ] for f in tqdm(file_paths): logger.info(f"processing file {f}") + file = pd.read_parquet(f) + logger.info(file.shape) + logger.info(file.columns) + if f == file_paths[0]: logger.info("creating new file") - file = pd.read_parquet(f) - logger.info(file.shape) - logger.info(file.columns) file.to_parquet(out_file, engine= "fastparquet") else: - file = pd.read_parquet(f) - logger.info(file.shape) - logger.info(file.columns) try: file.to_parquet(out_file, engine= "fastparquet", append=True) except ValueError: