added further tests

PMBio · May 8, 2024 · 88a2f09 · 88a2f09
1 parent 7e7ff8f
commit 88a2f09
Show file tree

Hide file tree

Showing 20 changed files with 276 additions and 47 deletions.
diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
@@ -2000,19 +2000,19 @@ def calculate_maf(annotations_path: str, out_file: str):
 
 
 @cli.command()
-@click.argument("protein_id_file", type=click.Path(exists=True))
+@click.argument("gene_id_file", type=click.Path(exists=True))
 @click.argument("annotations_path", type=click.Path(exists=True))
 @click.argument("out_file", type=click.Path())
-def add_protein_ids(protein_id_file: str, annotations_path: str, out_file: str):
+def add_gene_ids(gene_id_file: str, annotations_path: str, out_file: str):
     """
-    Add protein IDs to the annotations based on protein ID mapping file.
+    Add gene IDs to the annotations based on gene ID mapping file.
 
     Parameters:
-    - protein_id_file (str): Path to the protein ID mapping file.
+    - gene_id_file (str): Path to the gene ID mapping file.
     - annotations_path (str): Path to the annotations file.
     - out_file (str): Path to the output file to save the annotations with gene IDs.
     """
-    genes = pd.read_parquet(protein_id_file)
+    genes = pd.read_parquet(gene_id_file)
     genes[["gene_base", "feature"]] = genes["gene"].str.split(".", expand=True)
     genes.drop(columns=["feature", "gene", "gene_name", "gene_type"], inplace=True)
     genes.rename(columns={"id": "gene_id"}, inplace=True)
@@ -2027,7 +2027,7 @@ def add_protein_ids(protein_id_file: str, annotations_path: str, out_file: str):
 @cli.command()
 @click.argument("gtf_filepath", type=click.Path(exists=True))
 @click.argument("out_file", type=click.Path())
-def create_protein_id_file(gtf_filepath: str, out_file: str):
+def create_gene_id_file(gtf_filepath: str, out_file: str):
     """
     Create a gene ID mapping file from the GTF file.
 

diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
@@ -185,7 +185,7 @@ if not gene_id_file:
         shell:
             " ".join([
                 f"deeprvat_annotations", 
-                "create-protein-id-file",
+                "create-gene-id-file",
                 "{input}",
                 "{output}"
             ])
@@ -215,7 +215,7 @@ rule add_gene_ids:
     shell:
         " ".join([
             f"deeprvat_annotations", 
-            "add-protein-ids",
+            "add-gene-ids",
             "{input.gene_id_file}",
             "{input.annotations_path}",
             "{output}"

diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py
@@ -372,6 +372,8 @@ def test_merge_absplice_scores(
     assert written_results.shape == expected_results.shape
     assert_frame_equal(written_results, expected_results, check_exact = False)
 
+
+
 @pytest.mark.parametrize(
     "test_data_name_dir, genotype_file, variant_file, expected",
     [
@@ -402,47 +404,211 @@ def test_calculate_allele_frequencies(
     written_results = pd.read_parquet(output_path)
     expected_results = pd.read_parquet(expected_path)
     assert written_results.shape == expected_results.shape
+    assert_frame_equal(written_results, expected_results, check_exact = False)
+
+
+
+@pytest.mark.parametrize(
+    "test_data_name_dir, af_df, annotaton_df, expected",
+    [
+        (   "merge_af_small",
+            "af_df.parquet",
+            "vep_deepripe_deepsea_absplice.parquet",
+            "vep_deepripe_deepsea_absplice_af.parquet",
+        ),
+    ]
+)
+def test_merge_af(
+     test_data_name_dir, af_df, annotaton_df, expected, tmp_path
+):
+    current_test_data_dir = tests_data_dir / 'merge_af' / test_data_name_dir
+    af_path = current_test_data_dir / 'input' /  af_df
+    annotaions_path = current_test_data_dir / 'input' /annotaton_df
+    expected_path = current_test_data_dir / 'expected' / expected
+    output_path = tmp_path / 'out.parquet'
+    cli_runner = CliRunner()
+    cli_parameters = [
+        'merge-af',
+        af_path.as_posix(),
+        annotaions_path.as_posix(),
+        output_path.as_posix()
+        ]
+    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
+    assert result.exit_code == 0
+    written_results = pd.read_parquet(output_path)
+    expected_results = pd.read_parquet(expected_path)
+    assert written_results.shape == expected_results.shape
     assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False)
 
 
 
+@pytest.mark.parametrize(
+    "test_data_name_dir, annotations, expected",
+    [
+        (   "calculate_MAF_small",
+            "annotations.parquet",
+            "expected.parquet",
+        ),
+    ]
+)
+def test_calculate_maf(
+     test_data_name_dir, annotations, expected, tmp_path
+):
+    current_test_data_dir = tests_data_dir / 'calculate_MAF' / test_data_name_dir
+    annotations_path = current_test_data_dir / 'input' /  annotations
+    expected_path = current_test_data_dir / 'expected' / expected
+    output_path = tmp_path / 'out.parquet'
+    cli_runner = CliRunner()
+    cli_parameters = [
+        'calculate-maf',
+        annotations_path.as_posix(),
+        output_path.as_posix()
+        ]
+    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
+    assert result.exit_code == 0
+    written_results = pd.read_parquet(output_path)
+    expected_results = pd.read_parquet(expected_path)
+    assert written_results.shape == expected_results.shape
+    assert_frame_equal(written_results, expected_results, check_exact = False)
+
+
+
+@pytest.mark.parametrize(
+    "test_data_name_dir, gtf_file, expected",
+    [
+        (   "create_gene_id_file_small",
+            "gencode.v44.annotation.gtf.gz",
+            "protein_coding_genes.parquet",
+        ),
+    ]
+)
+def test_create_gene_id_file(
+     test_data_name_dir, gtf_file, expected, tmp_path
+):
+    current_test_data_dir = tests_data_dir / 'create_gene_id_file' / test_data_name_dir
+    input_path1 = current_test_data_dir / 'input' /  gtf_file
+    expected_path = current_test_data_dir / 'expected' / expected
+    output_path = tmp_path / 'out.parquet'
+    cli_runner = CliRunner()
+    cli_parameters = [
+        'create-gene-id-file',
+        input_path1.as_posix(),
+        output_path.as_posix(),
+        ]
+    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
+    assert result.exit_code == 0
+    written_results = pd.read_parquet(output_path)
+    expected_results = pd.read_parquet(expected_path)
+    assert written_results.shape == expected_results.shape
+    assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False)
+
+
+@pytest.mark.parametrize(
+    "test_data_name_dir, annotations, gene_id_file , expected",
+    [
+        (   "add_gene_ids_small",
+            "annotations.parquet",
+            "protein_coding_genes.parquet",
+            "expected.parquet",
+        ),
+    ]
+)
+def test_add_gene_ids(
+     test_data_name_dir, annotations, gene_id_file , expected, tmp_path
+):
+    current_test_data_dir = tests_data_dir / 'add_gene_ids' / test_data_name_dir
+    annotations_path = current_test_data_dir / 'input' /  annotations
+    gene_id_path = current_test_data_dir / 'input' /gene_id_file 
+    expected_path = current_test_data_dir / 'expected' / expected
+    output_path = tmp_path / 'out.parquet'
+    cli_runner = CliRunner()
+    cli_parameters = [
+        'add-gene-ids',
+        gene_id_path.as_posix(),
+        annotations_path.as_posix(),
+        output_path.as_posix(),
+        ]
+    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
+    assert result.exit_code == 0
+    written_results = pd.read_parquet(output_path)
+    expected_results = pd.read_parquet(expected_path)
+    assert written_results.shape == expected_results.shape
+    assert_frame_equal(written_results, expected_results, check_exact = False)
+
+
+@pytest.mark.parametrize(
+    "test_data_name_dir, gtf_file, annotations, gene_id_file, expected",
+    [
+        (   "filter_by_exon_distance_small",
+            "gencode.v44.annotation.gtf.gz",
+            "annotations.parquet",
+            "protein_coding_genes.parquet",
+            "expected.parquet",
+        ),
+    ]
+)
+def test_filter_by_exon_distance(
+     test_data_name_dir, gtf_file, annotations, gene_id_file, expected, tmp_path
+):
+    current_test_data_dir = tests_data_dir / 'filter_by_exon_distance' / test_data_name_dir
+    gtf_file_path = current_test_data_dir / 'input' /  gtf_file
+    annotations_path = current_test_data_dir / 'input' / annotations
+    gene_id_path = current_test_data_dir / 'input' / gene_id_file
+
+    expected_path = current_test_data_dir / 'expected' / expected
+    output_path = tmp_path / 'out.parquet'
+    cli_runner = CliRunner()
+    cli_parameters = [
+        'filter-annotations-by-exon-distance',
+        annotations_path.as_posix(),
+        gtf_file_path.as_posix(),
+        gene_id_path.as_posix(),
+        output_path.as_posix(),
+        ]
+    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
+    assert result.exit_code == 0
+    written_results = pd.read_parquet(output_path)
+    expected_results = pd.read_parquet(expected_path)
+    assert written_results.shape == expected_results.shape
+    assert_frame_equal(written_results, expected_results, check_exact = False)
+
+
+@pytest.mark.parametrize(
+    "test_data_name_dir, yaml_file, annotations, expected",
+    [
+        (   "select_rename_fill_columns_small",
+            "annotation_colnames_filling_values.yaml",
+            "annotations.parquet",
+            "expected.parquet",
+        ),
+    ]
+)
+def test_select_rename_fill_annotations(
+     test_data_name_dir, yaml_file, annotations, expected, tmp_path
+):
+    current_test_data_dir = tests_data_dir / 'select_rename_fill_columns' / test_data_name_dir
+    yaml_file_path = current_test_data_dir / 'input' /  yaml_file
+    annotations_path = current_test_data_dir / 'input' /annotations
+    expected_path = current_test_data_dir / 'expected' / expected
+    output_path = tmp_path / 'out.parquet'
+    cli_runner = CliRunner()
+    cli_parameters = [
+        'select-rename-fill-annotations',
+        yaml_file_path.as_posix(),
+        annotations_path.as_posix(),
+        output_path.as_posix(),
+        ]
+    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
+    assert result.exit_code == 0
+    written_results = pd.read_parquet(output_path)
+    expected_results = pd.read_parquet(expected_path)
+    assert written_results.shape == expected_results.shape
+    assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False)
+
 
-# @pytest.mark.parametrize(
-#     "test_data_name_dir, input_file_1, input_file_2, parameter1, expected",
-#     [
-#         (   "test_name_dir",
-#             "input_file1.parquet",
-#             "input_file2.parquet",
-#             "8",
-#             "expected.parquet",
-#         ),
-#     ]
-# )
-# def template(
-#      test_data_name_dir, input_file_1, input_file_2, parameter1, expected, tmp_path
-# ):
-#     current_test_data_dir = tests_data_dir / 'test_name' / test_data_name_dir
-#     input_path1 = current_test_data_dir / 'input' /  input_file_1
-#     input_path2 = current_test_data_dir / 'input' /input_file_2
-#     expected_path = current_test_data_dir / 'expected' / expected
-#     output_path = tmp_path / 'out.parquet'
-#     cli_runner = CliRunner()
-#     cli_parameters = [
-#         'function-name',
-#         input_path1.as_posix(),
-#         input_path2.as_posix(),
-#         output_path.as_posix(),
-#         parameter1,
-#         ]
-#     result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
-#     assert result.exit_code == 0
-#     written_results = pd.read_parquet(output_path)
-#     expected_results = pd.read_parquet(expected_path)
-#     assert written_results.shape == expected_results.shape
-#     assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False)
 
 # @pytest.mark.parametrize(
-#     "test_name_dir, input_file_1, input_file_2, parameter1, expected",
+#     "test_data_name_dir, input_file_1, input_file_2, parameter1, expected",
 #     [
 #         (   "test_name_dir",
 #             "input_file1.parquet",
@@ -456,15 +622,15 @@ def test_calculate_allele_frequencies(
 #      test_data_name_dir, input_file_1, input_file_2, parameter1, expected, tmp_path
 # ):
 #     current_test_data_dir = tests_data_dir / 'test_name' / test_data_name_dir
-#     input_path1 = current_test_data_dir / 'input' /  input_file_1
-#     input_path2 = current_test_data_dir / 'input' /input_file_2
+#     input_file_1_path = current_test_data_dir / 'input' /  input_file_1
+#     input_file_2_path = current_test_data_dir / 'input' /input_file_2
 #     expected_path = current_test_data_dir / 'expected' / expected
 #     output_path = tmp_path / 'out.parquet'
 #     cli_runner = CliRunner()
 #     cli_parameters = [
 #         'function-name',
-#         input_path1.as_posix(),
-#         input_path2.as_posix(),
+#         input_file_1_path.as_posix(),
+#         input_file_2_path.as_posix(),
 #         output_path.as_posix(),
 #         parameter1,
 #         ]
@@ -473,4 +639,4 @@ def test_calculate_allele_frequencies(
 #     written_results = pd.read_parquet(output_path)
 #     expected_results = pd.read_parquet(expected_path)
 #     assert written_results.shape == expected_results.shape
-#     assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False)
+#     assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False)
diff --git a/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/expected/expected.parquet b/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/expected/expected.parquet
diff --git a/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/input/annotations.parquet b/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/input/annotations.parquet
diff --git a/.../annotations/test_data/add_gene_ids/add_gene_ids_small/input/protein_coding_genes.parquet b/.../annotations/test_data/add_gene_ids/add_gene_ids_small/input/protein_coding_genes.parquet
diff --git a/tests/annotations/test_data/calculate_MAF/calculate_MAF_small/expected/expected.parquet b/tests/annotations/test_data/calculate_MAF/calculate_MAF_small/expected/expected.parquet
diff --git a/tests/annotations/test_data/calculate_MAF/calculate_MAF_small/input/annotations.parquet b/tests/annotations/test_data/calculate_MAF/calculate_MAF_small/input/annotations.parquet
diff --git a/..._data/create_gene_id_file/create_gene_id_file_small/expected/protein_coding_genes.parquet b/..._data/create_gene_id_file/create_gene_id_file_small/expected/protein_coding_genes.parquet
diff --git a/...st_data/create_gene_id_file/create_gene_id_file_small/input/gencode.v44.annotation.gtf.gz b/...st_data/create_gene_id_file/create_gene_id_file_small/input/gencode.v44.annotation.gtf.gz
diff --git a/...test_data/filter_by_exon_distance/filter_by_exon_distance_small/expected/expected.parquet b/...test_data/filter_by_exon_distance/filter_by_exon_distance_small/expected/expected.parquet
diff --git a/...test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/annotations.parquet b/...test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/annotations.parquet
diff --git a/...filter_by_exon_distance/filter_by_exon_distance_small/input/gencode.v44.annotation.gtf.gz b/...filter_by_exon_distance/filter_by_exon_distance_small/input/gencode.v44.annotation.gtf.gz
diff --git a/.../filter_by_exon_distance/filter_by_exon_distance_small/input/protein_coding_genes.parquet b/.../filter_by_exon_distance/filter_by_exon_distance_small/input/protein_coding_genes.parquet
diff --git a/...tions/test_data/merge_af/merge_af_small/expected/vep_deepripe_deepsea_absplice_af.parquet b/...tions/test_data/merge_af/merge_af_small/expected/vep_deepripe_deepsea_absplice_af.parquet
diff --git a/tests/annotations/test_data/merge_af/merge_af_small/input/af_df.parquet b/tests/annotations/test_data/merge_af/merge_af_small/input/af_df.parquet
diff --git a/...annotations/test_data/merge_af/merge_af_small/input/vep_deepripe_deepsea_absplice.parquet b/...annotations/test_data/merge_af/merge_af_small/input/vep_deepripe_deepsea_absplice.parquet
diff --git a/...ata/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected.parquet b/...ata/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected.parquet
diff --git a/...ll_columns/select_rename_fill_columns_small/input/annotation_colnames_filling_values.yaml b/...ll_columns/select_rename_fill_columns_small/input/annotation_colnames_filling_values.yaml
@@ -0,0 +1,63 @@
+annotation_column_names: 
+  'af' : 
+    'combined_UKB_NFE_AF' : 0
+  'maf_mb' : 
+    'combined_UKB_NFE_AF_MB' : 10000
+  'maf' :
+    'combined_UKB_NFE_MAF' : 0
+  'PolyPhen' : 
+    'polyphen_score' : 0
+  'SIFT' : 
+    'sift_score' : 1
+  'QKI_hg2' : 
+    'DeepRipe_plus_QKI_lip_hg2' : 0
+  'QKI_k5' : 
+    'DeepRipe_plus_QKI_clip_k5' : 0
+  'KHDRBS1_k5' : 
+    'DeepRipe_plus_KHDRBS1_clip_k5' : 0
+  'ELAVL1_parclip' : 
+    'DeepRipe_plus_ELAVL1_parclip' : 0
+  'TARDBP_parclip' : 
+    'DeepRipe_plus_TARDBP_parclip' : 0
+  'HNRNPD_parclip' : 
+    'DeepRipe_plus_HNRNPD_parclip' : 0
+  'MBNL1_parclip' : 
+    'DeepRipe_plus_MBNL1_parclip' : 0
+  'QKI_parclip' : 
+    'DeepRipe_plus_QKI_parclip' : 0
+  'Consequence_splice_acceptor_variant' :
+    'Consequence_splice_acceptor_variant' : 0
+  'Consequence_splice_donor_variant' :
+    'Consequence_splice_donor_variant' : 0
+  'Consequence_stop_gained' :
+    'Consequence_stop_gained' : 0
+  'Consequence_frameshift_variant' :
+    'Consequence_frameshift_variant' : 0
+  'Consequence_stop_lost' :
+    'Consequence_stop_lost' : 0
+  'Consequence_start_lost' :
+    'Consequence_start_lost' : 0
+  'Consequence_inframe_insertion' :
+    'Consequence_inframe_insertion' : 0
+  'Consequence_inframe_deletion' :
+    'Consequence_inframe_deletion' : 0
+  'Consequence_missense_variant' :
+    'Consequence_missense_variant' : 0
+  'Consequence_protein_altering_variant' :
+    'Consequence_protein_altering_variant' : 0
+  'Consequence_splice_region_variant' :
+    'Consequence_splice_region_variant' : 0
+  'DeepSEA_PC_1' :
+    'DeepSEA_PC_1' : 0
+  'DeepSEA_PC_2' :
+    'DeepSEA_PC_2' : 0
+  'DeepSEA_PC_3' :
+    'DeepSEA_PC_3' : 0
+  'DeepSEA_PC_4' :
+    'DeepSEA_PC_4' : 0
+  'DeepSEA_PC_5' :
+    'DeepSEA_PC_5' : 0
+  'DeepSEA_PC_6' :
+    'DeepSEA_PC_6' : 0
+  'AF' :
+    'AF' : 0
diff --git a/...ata/select_rename_fill_columns/select_rename_fill_columns_small/input/annotations.parquet b/...ata/select_rename_fill_columns/select_rename_fill_columns_small/input/annotations.parquet