With these changes, a copy of the annotations.parquet file containing NA values is saved before these are filled
PMBio · Dec 10, 2024 · ab2d25c · ab2d25c
1 parent 38c09db
commit ab2d25c
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 1 deletion.
diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
@@ -2029,8 +2029,9 @@ def create_gene_id_file(gtf_filepath: str, out_file: str):
 @click.argument("annotation_columns_yaml_file", type=click.Path(exists=True))
 @click.argument("annotations_path", type=click.Path(exists=True))
 @click.argument("out_file", type=click.Path())
+@click.option("--keep_unfilled", type=click.Path(), default=None)
 def select_rename_fill_annotations(
-    annotation_columns_yaml_file: str, annotations_path: str, out_file: str
+    annotation_columns_yaml_file: str, annotations_path: str, out_file: str, keep_unfilled: str
 ):
     """
     Select, rename, and fill missing values in annotation columns based on a YAML configuration file.
@@ -2039,6 +2040,7 @@ def select_rename_fill_annotations(
     - annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings.
     - annotations_path (str): Path to the annotations file.
     - out_file (str): Path to save the modified annotations file.
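+    - keep_unfilled (str, optional): Path at which to save a copy of the annotations data frame that still contains NA values, written before they are filled. If not given, no copy is saved.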
     """
 
     logger.info(
@@ -2052,6 +2054,7 @@ def select_rename_fill_annotations(
         annotations_path, columns=list(set(prior_names + key_cols))
     )
     anno_df.rename(columns=column_name_mapping, inplace=True)
+    if (keep_unfilled is not None): anno_df.to_parquet(keep_unfilled)
     anno_df.fillna(fill_value_mapping, inplace=True)
     anno_df.to_parquet(out_file)
 

diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
@@ -587,6 +587,7 @@ rule select_rename_fill_columns:
     params: 
         annotations_in=rules.compute_plof_column.params.annotations_out,
         annotations_out = anno_dir / "annotations.parquet",
+        unfilled = anno_dir / "unfilled_annotations.parquet"
     resources:
         mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
@@ -597,6 +598,7 @@ rule select_rename_fill_columns:
                 "{input.yaml_file}",
                 "{params.annotations_in}",
                 "{params.annotations_out}",
+                "--keep_unfilled {params.unfilled}"
             ]
         ) +" && touch {output.chckpt}"
 

diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py
@@ -735,6 +735,53 @@ def test_select_rename_fill_annotations(
         written_results, expected_results[written_results.columns], check_exact=False
     )
 
+@pytest.mark.parametrize(
+    "test_data_name_dir, yaml_file, annotations, expected, expected_unfilled",
+    [
+        (
+            "select_rename_fill_columns_small",
+            "annotation_colnames_filling_values.yaml",
+            "annotations.parquet",
+            "expected.parquet",
+            "expected_unfilled.parquet",
+        ),
+    ],
+)
+def test_select_rename_fill_annotations_unfilled(
+    test_data_name_dir, yaml_file, annotations, expected, expected_unfilled, tmp_path
+):
+    """Check ``select-rename-fill-annotations`` with ``--keep_unfilled``.
+
+    Runs the CLI command on the test input and verifies that both the
+    filled output file and the additional unfilled copy match their
+    expected parquet files.
+
+    Parameters:
+    - test_data_name_dir: Name of the test data sub-directory.
+    - yaml_file: File name of the YAML column/fill-value configuration.
+    - annotations: File name of the input annotations parquet file.
+    - expected: File name of the expected filled output.
+    - expected_unfilled: File name of the expected unfilled output.
+    - tmp_path: pytest-provided temporary directory for written files.
+    """
+    current_test_data_dir = (
+        tests_data_dir / "select_rename_fill_columns" / test_data_name_dir
+    )
+    yaml_file_path = current_test_data_dir / "input" / yaml_file
+    annotations_path = current_test_data_dir / "input" / annotations
+    expected_path = current_test_data_dir / "expected" / expected
+    expected_unfilled_path = current_test_data_dir / "expected" / expected_unfilled
+    output_path = tmp_path / "out.parquet"
+    unfilled_path = tmp_path / "unfilled.parquet"
+    cli_runner = CliRunner()
+    # Command-line arguments are strings; pass every path in the same form.
+    cli_parameters = [
+        "select-rename-fill-annotations",
+        yaml_file_path.as_posix(),
+        annotations_path.as_posix(),
+        output_path.as_posix(),
+        "--keep_unfilled",
+        unfilled_path.as_posix(),
+    ]
+    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
+    assert result.exit_code == 0
+
+    # Filled output must match the expected filled annotations.
+    written_results = pd.read_parquet(output_path)
+    expected_results = pd.read_parquet(expected_path)
+    assert written_results.shape == expected_results.shape
+    assert_frame_equal(
+        written_results, expected_results[written_results.columns], check_exact=False
+    )
+
+    # Unfilled copy must match the expected data frame that still has NA values.
+    # Use a distinct local name so the parametrized file name is not shadowed.
+    written_unfilled = pd.read_parquet(unfilled_path)
+    expected_unfilled_results = pd.read_parquet(expected_unfilled_path)
+    assert written_unfilled.shape == expected_unfilled_results.shape
+    assert_frame_equal(
+        written_unfilled,
+        expected_unfilled_results[written_unfilled.columns],
+        check_exact=False,
+    )
+
 
 @pytest.mark.parametrize(
     "test_data_name_dir, annotations_in, expected",

diff --git a/...t_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet b/...t_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet