diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index 56e88eac..9ceec7df 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -745,7 +745,7 @@ def deepsea_pca( X = df[deepSEAcols].to_numpy() del df logger.info( - "checking wether input contains data frame with pre-calculated means and SDs" + "checking whether input contains data frame with pre-calculated means and SDs" ) if os.path.exists(means_sd_df): logger.info("standardizing values using existing mean and SD") @@ -2029,8 +2029,12 @@ def create_gene_id_file(gtf_filepath: str, out_file: str): @click.argument("annotation_columns_yaml_file", type=click.Path(exists=True)) @click.argument("annotations_path", type=click.Path(exists=True)) @click.argument("out_file", type=click.Path()) +@click.option("--keep_unfilled", type=click.Path(), default=None) def select_rename_fill_annotations( - annotation_columns_yaml_file: str, annotations_path: str, out_file: str + annotation_columns_yaml_file: str, + annotations_path: str, + out_file: str, + keep_unfilled: str, ): """ Select, rename, and fill missing values in annotation columns based on a YAML configuration file. @@ -2039,6 +2043,7 @@ def select_rename_fill_annotations( - annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings. - annotations_path (str): Path to the annotations file. - out_file (str): Path to save the modified annotations file. 
+ - keep_unfilled (str, optional): Path to save annotations data frame containing NA values before filling them """ logger.info( @@ -2052,6 +2057,8 @@ def select_rename_fill_annotations( annotations_path, columns=list(set(prior_names + key_cols)) ) anno_df.rename(columns=column_name_mapping, inplace=True) + if keep_unfilled is not None: + anno_df.to_parquet(keep_unfilled) anno_df.fillna(fill_value_mapping, inplace=True) anno_df.to_parquet(out_file) diff --git a/docs/annotations.md b/docs/annotations.md index 77125e38..5004b96d 100644 --- a/docs/annotations.md +++ b/docs/annotations.md @@ -110,6 +110,9 @@ Data for VEP plugins and the CADD cache are stored in `annotation_data`. ## Running the pipeline on your own data Modify the path in the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml) s.t. they point to the output directory of the preprocessing pipeline run on your data. + +## Configuring the annotation pipeline + You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the config by adding /removing plugin commands to be added to the vep run command. You can omit absplice/deepSea by setting `include_absplice`/ `include_deepSEA` to `False`in the config. When you add/remove annotations you have to alter the values in `example/config/annotation_colnames_filling_values.yaml`. This file consist of the names of the columns of the tool used, the name to be used in the output data frame, the default value replacing all `NA` values as well as the data type, for example: ```shell 'CADD_RAW' : @@ -119,6 +122,14 @@ You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the c ``` Here `CADD_RAW` is the name of the column of the VEP output when the plugin is used, it is then renamed in the final annotation dataframe to `CADD_raw`, all `NA` values are set to `0` and the values are of type `float`. 
+You can also modify the `example/config/annotation_colnames_filling_values.yaml` file to choose custom filling values for each of the annotations. +For each annotation, the second value is the value used to fill in `NA` values; e.g., in the example above, `NA` values in the `CADD_raw` column are filled with `0`. +If you want to keep a copy of the annotations data before any `NA` values are filled, you can add +```shell +keep_unfilled: True +``` + to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). The unfilled copy is then written to `unfilled_annotations.parquet`, next to `annotations.parquet`. + You can also change the way the allele frequencies are calculated by adding `af_mode` key to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). By default, the allele frequencies are calculated from the data the annotation pipeline is run with. To use gnomade or gnomadg allele frequncies (from VEP ) instead, add ```shell af_mode : 'af_gnomade' diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile index 82bb4722..e3860c11 100644 --- a/pipelines/annotations.snakefile +++ b/pipelines/annotations.snakefile @@ -587,6 +587,7 @@ rule select_rename_fill_columns: params: annotations_in=rules.compute_plof_column.params.annotations_out, annotations_out = anno_dir / "annotations.parquet", + unfilled = lambda w: f"--keep_unfilled {anno_dir / 'unfilled_annotations.parquet'}" if (config.get('keep_unfilled')) else "" resources: mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1), shell: @@ -597,6 +598,7 @@ rule select_rename_fill_columns: "{input.yaml_file}", "{params.annotations_in}", "{params.annotations_out}", + "{params.unfilled}" ] ) +" && touch {output.chckpt}" diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py index 58b2957c..82dce87f 100644 --- a/tests/annotations/test_annotations.py +++ b/tests/annotations/test_annotations.py @@ -736,6 +736,55 @@ def 
test_select_rename_fill_annotations( ) +@pytest.mark.parametrize( + "test_data_name_dir, yaml_file, annotations, expected, expected_unfilled", + [ + ( + "select_rename_fill_columns_small", + "annotation_colnames_filling_values.yaml", + "annotations.parquet", + "expected.parquet", + "expected_unfilled.parquet", + ), + ], +) +def test_select_rename_fill_annotations_unfilled( + test_data_name_dir, yaml_file, annotations, expected, expected_unfilled, tmp_path +): + current_test_data_dir = ( + tests_data_dir / "select_rename_fill_columns" / test_data_name_dir + ) + yaml_file_path = current_test_data_dir / "input" / yaml_file + annotations_path = current_test_data_dir / "input" / annotations + expected_path = current_test_data_dir / "expected" / expected + expected_unfilled_path = current_test_data_dir / "expected" / expected_unfilled + output_path = tmp_path / "out.parquet" + unfilled_path = tmp_path / "unfilled.parquet" + cli_runner = CliRunner() + cli_parameters = [ + "select-rename-fill-annotations", + yaml_file_path.as_posix(), + annotations_path.as_posix(), + output_path.as_posix(), + "--keep_unfilled", + unfilled_path, + ] + result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) + assert result.exit_code == 0 + written_results = pd.read_parquet(output_path) + expected_results = pd.read_parquet(expected_path) + written_unfilled = pd.read_parquet(unfilled_path) + expected_unfilled = pd.read_parquet(expected_unfilled_path) + assert written_results.shape == expected_results.shape + assert_frame_equal( + written_results, expected_results[written_results.columns], check_exact=False + ) + assert written_unfilled.shape == expected_unfilled.shape + assert_frame_equal( + written_unfilled, expected_unfilled[written_unfilled.columns], check_exact=False + ) + + @pytest.mark.parametrize( "test_data_name_dir, annotations_in, expected", [ diff --git 
a/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet b/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet new file mode 100644 index 00000000..3a831933 Binary files /dev/null and b/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet differ