Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Saving an unfilled copy of the annotations #153

Merged
merged 5 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ def deepsea_pca(
X = df[deepSEAcols].to_numpy()
del df
logger.info(
"checking wether input contains data frame with pre-calculated means and SDs"
"checking whether input contains data frame with pre-calculated means and SDs"
)
if os.path.exists(means_sd_df):
logger.info("standardizing values using existing mean and SD")
Expand Down Expand Up @@ -2029,8 +2029,12 @@ def create_gene_id_file(gtf_filepath: str, out_file: str):
@click.argument("annotation_columns_yaml_file", type=click.Path(exists=True))
@click.argument("annotations_path", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
@click.option("--keep_unfilled", type=click.Path(), default=None)
def select_rename_fill_annotations(
annotation_columns_yaml_file: str, annotations_path: str, out_file: str
annotation_columns_yaml_file: str,
annotations_path: str,
out_file: str,
keep_unfilled: str,
):
"""
Select, rename, and fill missing values in annotation columns based on a YAML configuration file.
Expand All @@ -2039,6 +2043,7 @@ def select_rename_fill_annotations(
- annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings.
- annotations_path (str): Path to the annotations file.
- out_file (str): Path to save the modified annotations file.
- keep_unfilled (str, optional): Path to save annotations data frame containing NA values before filling them
"""

logger.info(
Expand All @@ -2052,6 +2057,8 @@ def select_rename_fill_annotations(
annotations_path, columns=list(set(prior_names + key_cols))
)
anno_df.rename(columns=column_name_mapping, inplace=True)
if keep_unfilled is not None:
anno_df.to_parquet(keep_unfilled)
anno_df.fillna(fill_value_mapping, inplace=True)
anno_df.to_parquet(out_file)

Expand Down
11 changes: 11 additions & 0 deletions docs/annotations.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ Data for VEP plugins and the CADD cache are stored in `annotation_data`.

## Running the pipeline on your own data
Modify the paths in the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml) so that they point to the output directory of the preprocessing pipeline run on your data.

## Configuring the annotation pipeline

You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the config by adding/removing plugin commands to be added to the vep run command. You can omit absplice/deepSea by setting `include_absplice`/`include_deepSEA` to `False` in the config. When you add/remove annotations, you have to alter the values in `example/config/annotation_colnames_filling_values.yaml`. This file consists of the names of the columns of the tool used, the name to be used in the output data frame, the default value replacing all `NA` values, as well as the data type, for example:
```shell
'CADD_RAW' :
Expand All @@ -119,6 +122,14 @@ You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the c
```
Here `CADD_RAW` is the name of the column of the VEP output when the plugin is used, it is then renamed in the final annotation dataframe to `CADD_raw`, all `NA` values are set to `0` and the values are of type `float`.

You can also modify the `example/config/annotation_colnames_filling_values.yaml` file to choose custom filling values for each of the annotations.
For each of the annotations the second value represents the value to use to fill in `NA` values, i.e. in the example above, in the `CADD_raw` column `NA` values are filled using `0`.
If you want to keep a copy of the annotations data before any `NA` values are filled, you can add
```shell
keep_unfilled: True
```
to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml).

You can also change the way the allele frequencies are calculated by adding the `af_mode` key to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). By default, the allele frequencies are calculated from the data the annotation pipeline is run with. To use gnomade or gnomadg allele frequencies (from VEP) instead, add
```shell
af_mode : 'af_gnomade'
Expand Down
2 changes: 2 additions & 0 deletions pipelines/annotations.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,7 @@ rule select_rename_fill_columns:
params:
annotations_in=rules.compute_plof_column.params.annotations_out,
annotations_out = anno_dir / "annotations.parquet",
unfilled = lambda w: f"--keep_unfilled {anno_dir / 'unfilled_annotations.parquet'}" if (config.get('keep_unfilled')) else ""
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
Expand All @@ -597,6 +598,7 @@ rule select_rename_fill_columns:
"{input.yaml_file}",
"{params.annotations_in}",
"{params.annotations_out}",
"{params.unfilled}"
]
) +" && touch {output.chckpt}"

Expand Down
49 changes: 49 additions & 0 deletions tests/annotations/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,6 +736,55 @@ def test_select_rename_fill_annotations(
)


@pytest.mark.parametrize(
    "test_data_name_dir, yaml_file, annotations, expected, expected_unfilled",
    [
        (
            "select_rename_fill_columns_small",
            "annotation_colnames_filling_values.yaml",
            "annotations.parquet",
            "expected.parquet",
            "expected_unfilled.parquet",
        ),
    ],
)
def test_select_rename_fill_annotations_unfilled(
    test_data_name_dir, yaml_file, annotations, expected, expected_unfilled, tmp_path
):
    """Check `select-rename-fill-annotations` when `--keep_unfilled` is given.

    Runs the CLI command on the test input and verifies that both the regular
    output file and the additional file requested via `--keep_unfilled` match
    the corresponding expected parquet files.

    Parameters:
    - test_data_name_dir: Name of the test-case directory below
      `select_rename_fill_columns`.
    - yaml_file: File name of the column name / fill value YAML in `input`.
    - annotations: File name of the input annotations parquet in `input`.
    - expected: File name of the expected output parquet in `expected`.
    - expected_unfilled: File name of the expected `--keep_unfilled` parquet
      in `expected`.
    - tmp_path: pytest-provided temporary directory the outputs are written to.
    """
    current_test_data_dir = (
        tests_data_dir / "select_rename_fill_columns" / test_data_name_dir
    )
    yaml_file_path = current_test_data_dir / "input" / yaml_file
    annotations_path = current_test_data_dir / "input" / annotations
    expected_path = current_test_data_dir / "expected" / expected
    expected_unfilled_path = current_test_data_dir / "expected" / expected_unfilled
    output_path = tmp_path / "out.parquet"
    unfilled_path = tmp_path / "unfilled.parquet"

    cli_runner = CliRunner()
    # Command-line arguments are strings; convert every path the same way
    # instead of handing a raw Path object to the CLI parser.
    cli_parameters = [
        "select-rename-fill-annotations",
        yaml_file_path.as_posix(),
        annotations_path.as_posix(),
        output_path.as_posix(),
        "--keep_unfilled",
        unfilled_path.as_posix(),
    ]
    # catch_exceptions=False lets errors raised inside the command surface
    # directly as test failures instead of being folded into `result`.
    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
    assert result.exit_code == 0

    written_results = pd.read_parquet(output_path)
    expected_results = pd.read_parquet(expected_path)
    written_unfilled = pd.read_parquet(unfilled_path)
    expected_unfilled = pd.read_parquet(expected_unfilled_path)

    # Reorder the expected columns to the written order so that only the
    # content, not the column order, is compared.
    assert written_results.shape == expected_results.shape
    assert_frame_equal(
        written_results, expected_results[written_results.columns], check_exact=False
    )
    assert written_unfilled.shape == expected_unfilled.shape
    assert_frame_equal(
        written_unfilled, expected_unfilled[written_unfilled.columns], check_exact=False
    )


@pytest.mark.parametrize(
"test_data_name_dir, annotations_in, expected",
[
Expand Down
Binary file not shown.
Loading