PMBio · meyerkm · Dec 11, 2024 · Dec 10, 2024 · Dec 10, 2024 · Dec 11, 2024
diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
@@ -745,7 +745,7 @@ def deepsea_pca(
     X = df[deepSEAcols].to_numpy()
     del df
     logger.info(
-        "checking wether input contains data frame with pre-calculated means and SDs"
+        "checking whether input contains data frame with pre-calculated means and SDs"
     )
     if os.path.exists(means_sd_df):
         logger.info("standardizing values using existing mean and SD")
@@ -2029,8 +2029,12 @@ def create_gene_id_file(gtf_filepath: str, out_file: str):
 @click.argument("annotation_columns_yaml_file", type=click.Path(exists=True))
 @click.argument("annotations_path", type=click.Path(exists=True))
 @click.argument("out_file", type=click.Path())
+@click.option("--keep_unfilled", type=click.Path(), default=None)
 def select_rename_fill_annotations(
-    annotation_columns_yaml_file: str, annotations_path: str, out_file: str
+    annotation_columns_yaml_file: str,
+    annotations_path: str,
+    out_file: str,
+    keep_unfilled: str,
 ):
     """
     Select, rename, and fill missing values in annotation columns based on a YAML configuration file.
@@ -2039,6 +2043,7 @@ def select_rename_fill_annotations(
     - annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings.
     - annotations_path (str): Path to the annotations file.
     - out_file (str): Path to save the modified annotations file.
+    - keep_unfilled (str, optional): Path to which the renamed annotations data frame is saved before NA values are filled. If omitted, no unfilled copy is written.
     """
 
     logger.info(
@@ -2052,6 +2057,8 @@ def select_rename_fill_annotations(
         annotations_path, columns=list(set(prior_names + key_cols))
     )
     anno_df.rename(columns=column_name_mapping, inplace=True)
+    if keep_unfilled is not None:
+        anno_df.to_parquet(keep_unfilled)
     anno_df.fillna(fill_value_mapping, inplace=True)
     anno_df.to_parquet(out_file)
 

diff --git a/docs/annotations.md b/docs/annotations.md
@@ -110,6 +110,9 @@ Data for VEP plugins and the CADD cache are stored in `annotation_data`.
 
 ## Running the pipeline on your own data 
 Modify the path in the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml) s.t. they point to the output directory of the preprocessing pipeline run on your data. 
+
+## Configuring the annotation pipeline
+
 You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the config by adding /removing plugin commands to be added to the vep run command. You can omit absplice/deepSea by setting `include_absplice`/ `include_deepSEA` to `False`in the config. When you add/remove annotations you have to alter the values in `example/config/annotation_colnames_filling_values.yaml`. This file consist of  the names of the columns of the tool used, the name to be used in the output data frame, the default value replacing all `NA` values as well as the data type, for example:
 ```shell
   'CADD_RAW' : 
@@ -119,6 +122,14 @@ You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the c
 ```
 Here `CADD_RAW` is the name of the column of the VEP output when the plugin is used, it is then renamed in the final annotation dataframe to `CADD_raw`, all `NA` values are set to `0` and the values are of type `float`. 
 
+You can also modify the `example/config/annotation_colnames_filling_values.yaml` file to choose a custom fill value for each annotation.
+For each annotation, the second value is the value used to replace `NA` values; e.g., in the example above, `NA` values in the `CADD_raw` column are filled with `0`.
+If you want to keep a copy of the annotations data before any `NA` values are filled, you can add 
+```shell 
+keep_unfilled: True
+```
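+to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). The unfilled copy is then written as `unfilled_annotations.parquet` to the same directory as `annotations.parquet`.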
+
 You can also change the way the allele frequencies are calculated by adding `af_mode` key to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). By default, the allele frequencies are calculated from the data the annotation pipeline is run with. To use gnomade or gnomadg allele frequncies (from VEP ) instead, add 
 ```shell
 af_mode : 'af_gnomade'

diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
@@ -587,6 +587,7 @@ rule select_rename_fill_columns:
     params: 
         annotations_in=rules.compute_plof_column.params.annotations_out,
         annotations_out = anno_dir / "annotations.parquet",
+        unfilled = lambda w: f"--keep_unfilled {anno_dir / 'unfilled_annotations.parquet'}" if (config.get('keep_unfilled')) else ""
     resources:
         mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
@@ -597,6 +598,7 @@ rule select_rename_fill_columns:
                 "{input.yaml_file}",
                 "{params.annotations_in}",
                 "{params.annotations_out}",
+                "{params.unfilled}"
             ]
         ) +" && touch {output.chckpt}"
 

diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py
@@ -736,6 +736,55 @@ def test_select_rename_fill_annotations(
     )
 
 
+@pytest.mark.parametrize(
+    "test_data_name_dir, yaml_file, annotations, expected, expected_unfilled",
+    [
+        (
+            "select_rename_fill_columns_small",
+            "annotation_colnames_filling_values.yaml",
+            "annotations.parquet",
+            "expected.parquet",
+            "expected_unfilled.parquet",
+        ),
+    ],
+)
+def test_select_rename_fill_annotations_unfilled(
+    test_data_name_dir, yaml_file, annotations, expected, expected_unfilled, tmp_path
+):
+    """Check that --keep_unfilled also writes the annotations before NA filling."""
+    data_dir = tests_data_dir / "select_rename_fill_columns" / test_data_name_dir
+    yaml_file_path = data_dir / "input" / yaml_file
+    annotations_path = data_dir / "input" / annotations
+    expected_path = data_dir / "expected" / expected
+    expected_unfilled_path = data_dir / "expected" / expected_unfilled
+    output_path = tmp_path / "out.parquet"
+    unfilled_path = tmp_path / "unfilled.parquet"
+    cli_runner = CliRunner()
+    cli_parameters = [
+        "select-rename-fill-annotations",
+        yaml_file_path.as_posix(),
+        annotations_path.as_posix(),
+        output_path.as_posix(),
+        "--keep_unfilled",
+        unfilled_path.as_posix(),
+    ]
+    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
+    assert result.exit_code == 0
+    # Both outputs must match their fixtures: the filled result and the pre-fill copy.
+    written_results = pd.read_parquet(output_path)
+    expected_results = pd.read_parquet(expected_path)
+    written_unfilled = pd.read_parquet(unfilled_path)
+    expected_unfilled = pd.read_parquet(expected_unfilled_path)
+    assert written_results.shape == expected_results.shape
+    assert_frame_equal(
+        written_results, expected_results[written_results.columns], check_exact=False
+    )
+    assert written_unfilled.shape == expected_unfilled.shape
+    assert_frame_equal(
+        written_unfilled, expected_unfilled[written_unfilled.columns], check_exact=False
+    )
+
+
 @pytest.mark.parametrize(
     "test_data_name_dir, annotations_in, expected",
     [

diff --git a/...t_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet b/...t_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet