Add plof column anno df (#90)

* Add --chromosomes flag to add_variants_id * fixup! Format Python code with psf/black pull_request * Revert "Change actions to us ubuntu-latest" This reverts commit 2f821a8. * Add logging * added compute_plof function, added it to pipeline, created tests * plof cast to int values, tests reflect expected change * added plof column to annotation_colnames_filling_values.yaml file, added test for select_rename_fill function with input containing plof column * reset snakefile to main version * fixup! Format Python code with psf/black pull_request * Update github-actions.yml * Update autoblack_pull_request.yml * Update docs-tests.yml * Update test-runner.yml * Update .readthedocs.yaml * added plof rule to snakefile, updated rulegraph in docs * added test that actually contains plof row --------- Co-authored-by: Magnus Wahlberg <endast@gmail.com> Co-authored-by: PMBio <PMBio@users.noreply.github.com> Co-authored-by: Mück <m991k@b260-pc003.inet.dkfz-heidelberg.de>
PMBio · May 16, 2024 · 92ea94d · 92ea94d
1 parent 7d92e9b
commit 92ea94d
Show file tree

Hide file tree

Showing 14 changed files with 477 additions and 438 deletions.
diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
@@ -1898,6 +1898,36 @@ def process_vep(
     return vep_file
 
 
+@cli.command()
+@click.argument("anno_df_in", type=click.Path(exists=True))
+@click.argument("anno_df_out", type=click.Path())
+def compute_plof(anno_df_in, anno_df_out):
+    """
+    Computes and adds the is_plof column (1 if any pLoF consequence column equals 1, else 0).
+
+    Parameters:
+    - anno_df_in(str): File path of annotation file to read in
+    - anno_df_out(str): File path of output file
+
+    Returns:
+    None
+
+    Example: deeprvat_annotations compute-plof annotations.parquet annotations_plof.parquet
+    """
+    anno_df = pd.read_parquet(anno_df_in)
+    PLOF_COLS = [
+        "Consequence_stop_gained",
+        "Consequence_frameshift_variant",
+        "Consequence_stop_lost",
+        "Consequence_start_lost",
+        "Consequence_splice_acceptor_variant",
+        "Consequence_splice_donor_variant",
+    ]
+
+    anno_df["is_plof"] = anno_df[PLOF_COLS].eq(1).any(axis=1).astype(int)
+    anno_df.to_parquet(anno_df_out)
+
+
 @cli.command()
 @click.argument("filenames", type=str)
 @click.argument("out_file", type=click.Path())

diff --git a/docs/_static/annotations_rulegraph.svg b/docs/_static/annotations_rulegraph.svg
diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
@@ -132,7 +132,7 @@ ncores_agg_absplice = int(config.get("ncores_agg_absplice") or 4)
 source_variant_file_pattern_complete = (
     source_variant_file_pattern + "." + source_variant_file_type
 )
-print(f"{included_chromosomes=}")
+
 file_paths = [
     glob(
         str(
@@ -187,7 +187,7 @@ with open(absplice_main_conf_path, "r") as fd:
 
 rule all:
     input:
-        anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_filled.parquet",
+        anno_dir / "complete_annotations.parquet",
 
 
 if not gene_id_file:
@@ -629,13 +629,21 @@ rule filter_by_exon_distance:
             ]
         )
 
+rule compute_plof_column:
+    input: rules.filter_by_exon_distance.output,
+    output: anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_plof.parquet",
+    resources: mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
+    shell: 'deeprvat_annotations compute-plof {input} {output}'
+
+
+
 
 rule select_rename_fill_columns:
     input:
         yaml_file=annotation_columns_yaml_file,
-        annotations_path=rules.filter_by_exon_distance.output,
+        annotations_path=rules.compute_plof_column.output,
     output:
-        anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_filled.parquet",
+        anno_dir / "complete_annotations.parquet",
     resources:
         mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
@@ -647,4 +655,4 @@ rule select_rename_fill_columns:
                 "{input.annotations_path}",
                 "{output}",
             ]
-        )
+        )
diff --git a/pipelines/config/annotation_colnames_filling_values.yaml b/pipelines/config/annotation_colnames_filling_values.yaml
@@ -75,3 +75,5 @@ annotation_column_names:
     'DeepSEA_PC_6' : 0
   'AF' :
     'AF' : 0
+  'is_plof':
+    'is_plof' : 0
diff --git a/pipelines/resources/absplice_splicing_pred_RNA.snakefile b/pipelines/resources/absplice_splicing_pred_RNA.snakefile
diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py
@@ -623,6 +623,18 @@ def test_filter_by_exon_distance(
             "annotations.parquet",
             "expected.parquet",
         ),
+        (
+            "select_rename_fill_columns_plof",
+            "annotation_colnames_filling_values.yaml",
+            "annotations.parquet",
+            "expected.parquet",
+        ),
+        (
+            "select_rename_fill_columns_plof2",
+            "annotation_colnames_filling_values.yaml",
+            "annotations.parquet",
+            "expected.parquet",
+        ),
     ],
 )
 def test_select_rename_fill_annotations(
@@ -650,3 +662,34 @@ def test_select_rename_fill_annotations(
     assert_frame_equal(
         written_results, expected_results[written_results.columns], check_exact=False
     )
+
+
+@pytest.mark.parametrize(
+    "test_data_name_dir, annotations_in, expected",
+    [
+        (
+            "compute_plof_small",
+            "annotations.parquet",
+            "expected.parquet",
+        ),
+    ],
+)
+def test_compute_plof(test_data_name_dir, annotations_in, expected, tmp_path):
+    current_test_data_dir = tests_data_dir / "compute_plof" / test_data_name_dir
+    annotations_in_path = current_test_data_dir / "input" / annotations_in
+    expected_path = current_test_data_dir / "expected" / expected
+    output_path = tmp_path / "out.parquet"
+    cli_runner = CliRunner()
+    cli_parameters = [
+        "compute-plof",
+        annotations_in_path.as_posix(),
+        output_path.as_posix(),
+    ]
+    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
+    assert result.exit_code == 0
+    written_results = pd.read_parquet(output_path)
+    expected_results = pd.read_parquet(expected_path)
+    assert written_results.shape == expected_results.shape
+    assert_frame_equal(
+        written_results, expected_results[written_results.columns], check_exact=False
+    )
diff --git a/tests/annotations/test_data/compute_plof/compute_plof_small/expected/expected.parquet b/tests/annotations/test_data/compute_plof/compute_plof_small/expected/expected.parquet
diff --git a/tests/annotations/test_data/compute_plof/compute_plof_small/input/annotations.parquet b/tests/annotations/test_data/compute_plof/compute_plof_small/input/annotations.parquet
diff --git a/...data/select_rename_fill_columns/select_rename_fill_columns_plof/expected/expected.parquet b/...data/select_rename_fill_columns/select_rename_fill_columns_plof/expected/expected.parquet
diff --git a/...ill_columns/select_rename_fill_columns_plof/input/annotation_colnames_filling_values.yaml b/...ill_columns/select_rename_fill_columns_plof/input/annotation_colnames_filling_values.yaml
@@ -0,0 +1,65 @@
+annotation_column_names: 
+  'af' : 
+    'combined_UKB_NFE_AF' : 0
+  'maf_mb' : 
+    'combined_UKB_NFE_AF_MB' : 10000
+  'maf' :
+    'combined_UKB_NFE_MAF' : 0
+  'PolyPhen' : 
+    'polyphen_score' : 0
+  'SIFT' : 
+    'sift_score' : 1
+  'QKI_hg2' : 
+    'DeepRipe_plus_QKI_lip_hg2' : 0
+  'QKI_k5' : 
+    'DeepRipe_plus_QKI_clip_k5' : 0
+  'KHDRBS1_k5' : 
+    'DeepRipe_plus_KHDRBS1_clip_k5' : 0
+  'ELAVL1_parclip' : 
+    'DeepRipe_plus_ELAVL1_parclip' : 0
+  'TARDBP_parclip' : 
+    'DeepRipe_plus_TARDBP_parclip' : 0
+  'HNRNPD_parclip' : 
+    'DeepRipe_plus_HNRNPD_parclip' : 0
+  'MBNL1_parclip' : 
+    'DeepRipe_plus_MBNL1_parclip' : 0
+  'QKI_parclip' : 
+    'DeepRipe_plus_QKI_parclip' : 0
+  'Consequence_splice_acceptor_variant' :
+    'Consequence_splice_acceptor_variant' : 0
+  'Consequence_splice_donor_variant' :
+    'Consequence_splice_donor_variant' : 0
+  'Consequence_stop_gained' :
+    'Consequence_stop_gained' : 0
+  'Consequence_frameshift_variant' :
+    'Consequence_frameshift_variant' : 0
+  'Consequence_stop_lost' :
+    'Consequence_stop_lost' : 0
+  'Consequence_start_lost' :
+    'Consequence_start_lost' : 0
+  'Consequence_inframe_insertion' :
+    'Consequence_inframe_insertion' : 0
+  'Consequence_inframe_deletion' :
+    'Consequence_inframe_deletion' : 0
+  'Consequence_missense_variant' :
+    'Consequence_missense_variant' : 0
+  'Consequence_protein_altering_variant' :
+    'Consequence_protein_altering_variant' : 0
+  'Consequence_splice_region_variant' :
+    'Consequence_splice_region_variant' : 0
+  'DeepSEA_PC_1' :
+    'DeepSEA_PC_1' : 0
+  'DeepSEA_PC_2' :
+    'DeepSEA_PC_2' : 0
+  'DeepSEA_PC_3' :
+    'DeepSEA_PC_3' : 0
+  'DeepSEA_PC_4' :
+    'DeepSEA_PC_4' : 0
+  'DeepSEA_PC_5' :
+    'DeepSEA_PC_5' : 0
+  'DeepSEA_PC_6' :
+    'DeepSEA_PC_6' : 0
+  'AF' :
+    'AF' : 0
+  'is_plof':
+    'is_plof' : 0
diff --git a/...data/select_rename_fill_columns/select_rename_fill_columns_plof/input/annotations.parquet b/...data/select_rename_fill_columns/select_rename_fill_columns_plof/input/annotations.parquet
diff --git a/...ata/select_rename_fill_columns/select_rename_fill_columns_plof2/expected/expected.parquet b/...ata/select_rename_fill_columns/select_rename_fill_columns_plof2/expected/expected.parquet
diff --git a/...ll_columns/select_rename_fill_columns_plof2/input/annotation_colnames_filling_values.yaml b/...ll_columns/select_rename_fill_columns_plof2/input/annotation_colnames_filling_values.yaml
@@ -0,0 +1,65 @@
+annotation_column_names: 
+  'af' : 
+    'combined_UKB_NFE_AF' : 0
+  'maf_mb' : 
+    'combined_UKB_NFE_AF_MB' : 10000
+  'maf' :
+    'combined_UKB_NFE_MAF' : 0
+  'PolyPhen' : 
+    'polyphen_score' : 0
+  'SIFT' : 
+    'sift_score' : 1
+  'QKI_hg2' : 
+    'DeepRipe_plus_QKI_lip_hg2' : 0
+  'QKI_k5' : 
+    'DeepRipe_plus_QKI_clip_k5' : 0
+  'KHDRBS1_k5' : 
+    'DeepRipe_plus_KHDRBS1_clip_k5' : 0
+  'ELAVL1_parclip' : 
+    'DeepRipe_plus_ELAVL1_parclip' : 0
+  'TARDBP_parclip' : 
+    'DeepRipe_plus_TARDBP_parclip' : 0
+  'HNRNPD_parclip' : 
+    'DeepRipe_plus_HNRNPD_parclip' : 0
+  'MBNL1_parclip' : 
+    'DeepRipe_plus_MBNL1_parclip' : 0
+  'QKI_parclip' : 
+    'DeepRipe_plus_QKI_parclip' : 0
+  'Consequence_splice_acceptor_variant' :
+    'Consequence_splice_acceptor_variant' : 0
+  'Consequence_splice_donor_variant' :
+    'Consequence_splice_donor_variant' : 0
+  'Consequence_stop_gained' :
+    'Consequence_stop_gained' : 0
+  'Consequence_frameshift_variant' :
+    'Consequence_frameshift_variant' : 0
+  'Consequence_stop_lost' :
+    'Consequence_stop_lost' : 0
+  'Consequence_start_lost' :
+    'Consequence_start_lost' : 0
+  'Consequence_inframe_insertion' :
+    'Consequence_inframe_insertion' : 0
+  'Consequence_inframe_deletion' :
+    'Consequence_inframe_deletion' : 0
+  'Consequence_missense_variant' :
+    'Consequence_missense_variant' : 0
+  'Consequence_protein_altering_variant' :
+    'Consequence_protein_altering_variant' : 0
+  'Consequence_splice_region_variant' :
+    'Consequence_splice_region_variant' : 0
+  'DeepSEA_PC_1' :
+    'DeepSEA_PC_1' : 0
+  'DeepSEA_PC_2' :
+    'DeepSEA_PC_2' : 0
+  'DeepSEA_PC_3' :
+    'DeepSEA_PC_3' : 0
+  'DeepSEA_PC_4' :
+    'DeepSEA_PC_4' : 0
+  'DeepSEA_PC_5' :
+    'DeepSEA_PC_5' : 0
+  'DeepSEA_PC_6' :
+    'DeepSEA_PC_6' : 0
+  'AF' :
+    'AF' : 0
+  'is_plof':
+    'is_plof' : 0
diff --git a/...ata/select_rename_fill_columns/select_rename_fill_columns_plof2/input/annotations.parquet b/...ata/select_rename_fill_columns/select_rename_fill_columns_plof2/input/annotations.parquet