Skip to content

Commit

Permalink
with these changes, a copy of the annotations.parquet file is saved c…
Browse files Browse the repository at this point in the history
…ontaining NA values before filling these
  • Loading branch information
Mück committed Dec 10, 2024
1 parent 38c09db commit ab2d25c
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 1 deletion.
5 changes: 4 additions & 1 deletion deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2029,8 +2029,9 @@ def create_gene_id_file(gtf_filepath: str, out_file: str):
@click.argument("annotation_columns_yaml_file", type=click.Path(exists=True))
@click.argument("annotations_path", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
@click.option("--keep_unfilled", type=click.Path(), default=None)
def select_rename_fill_annotations(
annotation_columns_yaml_file: str, annotations_path: str, out_file: str
annotation_columns_yaml_file: str, annotations_path: str, out_file: str, keep_unfilled: str
):
"""
Select, rename, and fill missing values in annotation columns based on a YAML configuration file.
Expand All @@ -2039,6 +2040,7 @@ def select_rename_fill_annotations(
- annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings.
- annotations_path (str): Path to the annotations file.
- out_file (str): Path to save the modified annotations file.
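- keep_unfilled (str, optional): Path at which to save a copy of the annotations data frame that still contains NA values, written before they are filled. If not given, no such copy is saved.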
"""

logger.info(
Expand All @@ -2052,6 +2054,7 @@ def select_rename_fill_annotations(
annotations_path, columns=list(set(prior_names + key_cols))
)
anno_df.rename(columns=column_name_mapping, inplace=True)
if (keep_unfilled is not None): anno_df.to_parquet(keep_unfilled)
anno_df.fillna(fill_value_mapping, inplace=True)
anno_df.to_parquet(out_file)

Expand Down
2 changes: 2 additions & 0 deletions pipelines/annotations.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,7 @@ rule select_rename_fill_columns:
params:
annotations_in=rules.compute_plof_column.params.annotations_out,
annotations_out = anno_dir / "annotations.parquet",
unfilled = anno_dir / "unfilled_annotations.parquet"
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
Expand All @@ -597,6 +598,7 @@ rule select_rename_fill_columns:
"{input.yaml_file}",
"{params.annotations_in}",
"{params.annotations_out}",
"--keep_unfilled {params.unfilled}"
]
) +" && touch {output.chckpt}"

Expand Down
47 changes: 47 additions & 0 deletions tests/annotations/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,53 @@ def test_select_rename_fill_annotations(
written_results, expected_results[written_results.columns], check_exact=False
)

@pytest.mark.parametrize(
    "test_data_name_dir, yaml_file, annotations, expected, expected_unfilled",
    [
        (
            "select_rename_fill_columns_small",
            "annotation_colnames_filling_values.yaml",
            "annotations.parquet",
            "expected.parquet",
            "expected_unfilled.parquet",
        ),
    ],
)
def test_select_rename_fill_annotations_unfilled(
    test_data_name_dir, yaml_file, annotations, expected, expected_unfilled, tmp_path
):
    """Check ``select-rename-fill-annotations`` when ``--keep_unfilled`` is given.

    The command is expected to write two parquet files: the regular output
    (compared against ``expected``) and the additional file passed via
    ``--keep_unfilled`` (compared against ``expected_unfilled``).

    Parameters:
    - test_data_name_dir (str): Sub-directory of the test data for this case.
    - yaml_file (str): File name of the YAML configuration in ``input``.
    - annotations (str): File name of the input annotations in ``input``.
    - expected (str): File name of the expected regular output in ``expected``.
    - expected_unfilled (str): File name of the expected ``--keep_unfilled``
      output in ``expected``.
    - tmp_path (pathlib.Path): pytest-provided temporary directory.
    """
    current_test_data_dir = (
        tests_data_dir / "select_rename_fill_columns" / test_data_name_dir
    )
    input_dir = current_test_data_dir / "input"
    expected_dir = current_test_data_dir / "expected"

    yaml_file_path = input_dir / yaml_file
    annotations_path = input_dir / annotations
    expected_path = expected_dir / expected
    expected_unfilled_path = expected_dir / expected_unfilled
    output_path = tmp_path / "out.parquet"
    unfilled_path = tmp_path / "unfilled.parquet"

    cli_runner = CliRunner()
    # All CLI arguments are passed as strings, matching how a shell would
    # hand them to the command.
    cli_parameters = [
        "select-rename-fill-annotations",
        yaml_file_path.as_posix(),
        annotations_path.as_posix(),
        output_path.as_posix(),
        "--keep_unfilled",
        unfilled_path.as_posix(),
    ]
    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
    assert result.exit_code == 0

    written_results = pd.read_parquet(output_path)
    expected_results = pd.read_parquet(expected_path)
    written_unfilled = pd.read_parquet(unfilled_path)
    # Use a separate name so the parametrized file name is not shadowed.
    expected_unfilled_results = pd.read_parquet(expected_unfilled_path)

    # Regular output.
    assert written_results.shape == expected_results.shape
    assert_frame_equal(
        written_results, expected_results[written_results.columns], check_exact=False
    )

    # Additional output requested via --keep_unfilled.
    assert written_unfilled.shape == expected_unfilled_results.shape
    assert_frame_equal(
        written_unfilled,
        expected_unfilled_results[written_unfilled.columns],
        check_exact=False,
    )


@pytest.mark.parametrize(
"test_data_name_dir, annotations_in, expected",
Expand Down
Binary file not shown.

0 comments on commit ab2d25c

Please sign in to comment.