Skip to content

Commit

Permalink
Saving an unfilled copy of the annotations (#153)
Browse files Browse the repository at this point in the history
* with these changes, a copy of the annotations.parquet file is  saved containing NA values before filling these

* fixup! Format Python code with psf/black pull_request

* make unfilled option configurable, default: no copy saved

* fixup description text

---------

Co-authored-by: Mück <m991k@b260-pc003.inet.dkfz-heidelberg.de>
Co-authored-by: PMBio <PMBio@users.noreply.github.com>
Co-authored-by: Kayla Meyer <meyer.kmt@gmail.com>
  • Loading branch information
4 people authored Dec 11, 2024
1 parent 38c09db commit a85ee57
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 2 deletions.
11 changes: 9 additions & 2 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ def deepsea_pca(
X = df[deepSEAcols].to_numpy()
del df
logger.info(
"checking wether input contains data frame with pre-calculated means and SDs"
"checking whether input contains data frame with pre-calculated means and SDs"
)
if os.path.exists(means_sd_df):
logger.info("standardizing values using existing mean and SD")
Expand Down Expand Up @@ -2029,8 +2029,12 @@ def create_gene_id_file(gtf_filepath: str, out_file: str):
@click.argument("annotation_columns_yaml_file", type=click.Path(exists=True))
@click.argument("annotations_path", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
@click.option("--keep_unfilled", type=click.Path(), default=None)
def select_rename_fill_annotations(
annotation_columns_yaml_file: str, annotations_path: str, out_file: str
annotation_columns_yaml_file: str,
annotations_path: str,
out_file: str,
keep_unfilled: str,
):
"""
Select, rename, and fill missing values in annotation columns based on a YAML configuration file.
Expand All @@ -2039,6 +2043,7 @@ def select_rename_fill_annotations(
- annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings.
- annotations_path (str): Path to the annotations file.
- out_file (str): Path to save the modified annotations file.
- keep_unfilled (str, optional): Path to save annotations data frame containing NA values before filling them
"""

logger.info(
Expand All @@ -2052,6 +2057,8 @@ def select_rename_fill_annotations(
annotations_path, columns=list(set(prior_names + key_cols))
)
anno_df.rename(columns=column_name_mapping, inplace=True)
if keep_unfilled is not None:
anno_df.to_parquet(keep_unfilled)
anno_df.fillna(fill_value_mapping, inplace=True)
anno_df.to_parquet(out_file)

Expand Down
11 changes: 11 additions & 0 deletions docs/annotations.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ Data for VEP plugins and the CADD cache are stored in `annotation_data`.

## Running the pipeline on your own data
Modify the paths in the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml) so that they point to the output directory of the preprocessing pipeline run on your data.

## Configuring the annotation pipeline

You can add or remove VEP plugins in the `additional_vep_plugin_cmds` part of the config by adding or removing the plugin commands that are appended to the VEP run command. You can omit absplice/deepSEA by setting `include_absplice`/`include_deepSEA` to `False` in the config. When you add or remove annotations, you have to alter the values in `example/config/annotation_colnames_filling_values.yaml`. This file consists of the names of the columns of the tool used, the name to be used in the output data frame, the default value replacing all `NA` values, as well as the data type, for example:
```shell
'CADD_RAW' :
Expand All @@ -119,6 +122,14 @@ You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the c
```
Here `CADD_RAW` is the name of the column of the VEP output when the plugin is used, it is then renamed in the final annotation dataframe to `CADD_raw`, all `NA` values are set to `0` and the values are of type `float`.

You can also modify the `example/config/annotation_colnames_filling_values.yaml` file to choose custom filling values for each of the annotations.
For each of the annotations the second value represents the value to use to fill in `NA` values, i.e. in the example above, in the `CADD_raw` column `NA` values are filled using `0`.
If you want to keep a copy of the annotations data before any `NA` values are filled, you can add
```shell
keep_unfilled: True
```
to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml).

You can also change the way the allele frequencies are calculated by adding the `af_mode` key to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). By default, the allele frequencies are calculated from the data the annotation pipeline is run with. To use gnomade or gnomadg allele frequencies (from VEP) instead, add
```shell
af_mode : 'af_gnomade'
Expand Down
2 changes: 2 additions & 0 deletions pipelines/annotations.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,7 @@ rule select_rename_fill_columns:
params:
annotations_in=rules.compute_plof_column.params.annotations_out,
annotations_out = anno_dir / "annotations.parquet",
unfilled = lambda w: f"--keep_unfilled {anno_dir / 'unfilled_annotations.parquet'}" if (config.get('keep_unfilled')) else ""
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
Expand All @@ -597,6 +598,7 @@ rule select_rename_fill_columns:
"{input.yaml_file}",
"{params.annotations_in}",
"{params.annotations_out}",
"{params.unfilled}"
]
) +" && touch {output.chckpt}"

Expand Down
49 changes: 49 additions & 0 deletions tests/annotations/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,6 +736,55 @@ def test_select_rename_fill_annotations(
)


@pytest.mark.parametrize(
    "test_data_name_dir, yaml_file, annotations, expected, expected_unfilled",
    [
        (
            "select_rename_fill_columns_small",
            "annotation_colnames_filling_values.yaml",
            "annotations.parquet",
            "expected.parquet",
            "expected_unfilled.parquet",
        ),
    ],
)
def test_select_rename_fill_annotations_unfilled(
    test_data_name_dir, yaml_file, annotations, expected, expected_unfilled, tmp_path
):
    """Check that ``--keep_unfilled`` writes an extra, unfilled parquet copy.

    Runs the ``select-rename-fill-annotations`` command with the
    ``--keep_unfilled`` option and compares both written parquet files
    (the regular output and the copy saved before ``NA`` filling) against
    the expected parquet files stored in the test data directory.

    Parameters:
    - test_data_name_dir: Name of the test case directory.
    - yaml_file: File name of the column name / fill value YAML (in ``input``).
    - annotations: File name of the input annotations parquet (in ``input``).
    - expected: File name of the expected filled output (in ``expected``).
    - expected_unfilled: File name of the expected unfilled output (in ``expected``).
    - tmp_path: pytest-provided temporary directory receiving both outputs.
    """
    current_test_data_dir = (
        tests_data_dir / "select_rename_fill_columns" / test_data_name_dir
    )
    input_dir = current_test_data_dir / "input"
    expected_dir = current_test_data_dir / "expected"

    yaml_file_path = input_dir / yaml_file
    annotations_path = input_dir / annotations
    expected_path = expected_dir / expected
    expected_unfilled_path = expected_dir / expected_unfilled
    output_path = tmp_path / "out.parquet"
    unfilled_path = tmp_path / "unfilled.parquet"

    cli_runner = CliRunner()
    # Every argv element is passed as a string, exactly as a shell would
    # hand it to the command line interface.
    cli_parameters = [
        "select-rename-fill-annotations",
        yaml_file_path.as_posix(),
        annotations_path.as_posix(),
        output_path.as_posix(),
        "--keep_unfilled",
        unfilled_path.as_posix(),
    ]
    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
    # Surface the captured CLI output when the command fails.
    assert result.exit_code == 0, result.output

    written_results = pd.read_parquet(output_path)
    expected_results = pd.read_parquet(expected_path)
    written_unfilled = pd.read_parquet(unfilled_path)
    # Distinct name so the parametrized file name is not shadowed.
    expected_unfilled_results = pd.read_parquet(expected_unfilled_path)

    assert written_results.shape == expected_results.shape
    # Column order is not part of the contract: align the expected frame to
    # the written one before comparing.
    assert_frame_equal(
        written_results, expected_results[written_results.columns], check_exact=False
    )
    assert written_unfilled.shape == expected_unfilled_results.shape
    assert_frame_equal(
        written_unfilled,
        expected_unfilled_results[written_unfilled.columns],
        check_exact=False,
    )


@pytest.mark.parametrize(
"test_data_name_dir, annotations_in, expected",
[
Expand Down
Binary file not shown.

0 comments on commit a85ee57

Please sign in to comment.