Skip to content

Commit

Permalink
Add plof column anno df (#90)
Browse files Browse the repository at this point in the history
* Add --chromosomes flag to add_variants_id

* fixup! Format Python code with psf/black pull_request

* Revert "Change actions to us ubuntu-latest"

This reverts commit 2f821a8.

* Add logging

* added compute_plof function, added it to pipeline, created tests

* plof cast to int values, tests reflect expected change

* added plof column to annotation_colnames_filling_values.yaml file, added test for select_rename_fill function with input containing plof column

* reset snakefile to main version

* fixup! Format Python code with psf/black pull_request

* Update github-actions.yml

* Update autoblack_pull_request.yml

* Update docs-tests.yml

* Update test-runner.yml

* Update .readthedocs.yaml

* added plof rule to snakefile, updated rulegraph in docs

* added test that actually contains plof row

---------

Co-authored-by: Magnus Wahlberg <endast@gmail.com>
Co-authored-by: PMBio <PMBio@users.noreply.github.com>
Co-authored-by: Mück <m991k@b260-pc003.inet.dkfz-heidelberg.de>
  • Loading branch information
4 people authored May 16, 2024
1 parent 7d92e9b commit 92ea94d
Show file tree
Hide file tree
Showing 14 changed files with 477 additions and 438 deletions.
30 changes: 30 additions & 0 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1898,6 +1898,36 @@ def process_vep(
return vep_file


@cli.command()
@click.argument("anno_df_in", type=click.Path(exists=True))
@click.argument("anno_df_out", type=click.Path())
def compute_plof(anno_df_in, anno_df_out):
    """
    Computes the is_plof indicator column and adds it to the annotation data frame.

    A row gets is_plof = 1 if at least one of the consequence columns listed
    in the function body equals 1, and is_plof = 0 otherwise.

    Parameters:
    - anno_df_in(str): File path of the annotation parquet file to read in
    - anno_df_out(str): File path the data frame including is_plof is written to

    Returns:
    None

    Example: deeprvat_annotations compute-plof annotations.parquet annotations_plof.parquet
    """
    plof_consequence_columns = [
        "Consequence_stop_gained",
        "Consequence_frameshift_variant",
        "Consequence_stop_lost",
        "Consequence_start_lost",
        "Consequence_splice_acceptor_variant",
        "Consequence_splice_donor_variant",
    ]

    annotations = pd.read_parquet(anno_df_in)

    # Row-wise check: is any of the selected consequence flags equal to 1?
    has_plof_consequence = annotations[plof_consequence_columns].eq(1).any(axis=1)

    # Stored as integer 0/1 rather than bool.
    annotations["is_plof"] = has_plof_consequence.astype(int)
    annotations.to_parquet(anno_df_out)


@cli.command()
@click.argument("filenames", type=str)
@click.argument("out_file", type=click.Path())
Expand Down
576 changes: 259 additions & 317 deletions docs/_static/annotations_rulegraph.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18 changes: 13 additions & 5 deletions pipelines/annotations.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ ncores_agg_absplice = int(config.get("ncores_agg_absplice") or 4)
source_variant_file_pattern_complete = (
source_variant_file_pattern + "." + source_variant_file_type
)
print(f"{included_chromosomes=}")

file_paths = [
glob(
str(
Expand Down Expand Up @@ -187,7 +187,7 @@ with open(absplice_main_conf_path, "r") as fd:

rule all:
input:
anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_filled.parquet",
anno_dir / "complete_annotations.parquet",


if not gene_id_file:
Expand Down Expand Up @@ -629,13 +629,21 @@ rule filter_by_exon_distance:
]
)

# Runs `deeprvat_annotations compute-plof` on the output of
# filter_by_exon_distance and writes the result to a separate parquet file.
rule compute_plof_column:
input: rules.filter_by_exon_distance.output,
output: anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_plof.parquet",
# mem_mb is 15_000 * (attempt + 1), so the request grows with the attempt number.
resources: mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell: 'deeprvat_annotations compute-plof {input} {output}'




rule select_rename_fill_columns:
input:
yaml_file=annotation_columns_yaml_file,
annotations_path=rules.filter_by_exon_distance.output,
annotations_path=rules.compute_plof_column.output,
output:
anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_filled.parquet",
anno_dir / "complete_annotations.parquet",
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
Expand All @@ -647,4 +655,4 @@ rule select_rename_fill_columns:
"{input.annotations_path}",
"{output}",
]
)
)
2 changes: 2 additions & 0 deletions pipelines/config/annotation_colnames_filling_values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,5 @@ annotation_column_names:
'DeepSEA_PC_6' : 0
'AF' :
'AF' : 0
'is_plof':
'is_plof' : 0
116 changes: 0 additions & 116 deletions pipelines/resources/absplice_splicing_pred_RNA.snakefile

This file was deleted.

43 changes: 43 additions & 0 deletions tests/annotations/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,18 @@ def test_filter_by_exon_distance(
"annotations.parquet",
"expected.parquet",
),
(
"select_rename_fill_columns_plof",
"annotation_colnames_filling_values.yaml",
"annotations.parquet",
"expected.parquet",
),
(
"select_rename_fill_columns_plof2",
"annotation_colnames_filling_values.yaml",
"annotations.parquet",
"expected.parquet",
),
],
)
def test_select_rename_fill_annotations(
Expand Down Expand Up @@ -650,3 +662,34 @@ def test_select_rename_fill_annotations(
assert_frame_equal(
written_results, expected_results[written_results.columns], check_exact=False
)


@pytest.mark.parametrize(
    "test_data_name_dir, annotations_in, expected",
    [
        ("compute_plof_small", "annotations.parquet", "expected.parquet"),
    ],
)
def test_compute_plof(test_data_name_dir, annotations_in, expected, tmp_path):
    """Run the compute-plof command on a fixture and compare with the stored expected frame."""
    case_dir = tests_data_dir / "compute_plof" / test_data_name_dir
    input_file = case_dir / "input" / annotations_in
    expected_file = case_dir / "expected" / expected
    output_file = tmp_path / "out.parquet"

    # catch_exceptions=False lets errors raised inside the command surface in the test.
    outcome = CliRunner().invoke(
        annotations_cli,
        ["compute-plof", input_file.as_posix(), output_file.as_posix()],
        catch_exceptions=False,
    )
    assert outcome.exit_code == 0

    actual_df = pd.read_parquet(output_file)
    expected_df = pd.read_parquet(expected_file)
    assert actual_df.shape == expected_df.shape

    # Align the expected columns to the written column order before comparing.
    assert_frame_equal(actual_df, expected_df[actual_df.columns], check_exact=False)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
annotation_column_names:
'af' :
'combined_UKB_NFE_AF' : 0
'maf_mb' :
'combined_UKB_NFE_AF_MB' : 10000
'maf' :
'combined_UKB_NFE_MAF' : 0
'PolyPhen' :
'polyphen_score' : 0
'SIFT' :
'sift_score' : 1
'QKI_hg2' :
'DeepRipe_plus_QKI_lip_hg2' : 0
'QKI_k5' :
'DeepRipe_plus_QKI_clip_k5' : 0
'KHDRBS1_k5' :
'DeepRipe_plus_KHDRBS1_clip_k5' : 0
'ELAVL1_parclip' :
'DeepRipe_plus_ELAVL1_parclip' : 0
'TARDBP_parclip' :
'DeepRipe_plus_TARDBP_parclip' : 0
'HNRNPD_parclip' :
'DeepRipe_plus_HNRNPD_parclip' : 0
'MBNL1_parclip' :
'DeepRipe_plus_MBNL1_parclip' : 0
'QKI_parclip' :
'DeepRipe_plus_QKI_parclip' : 0
'Consequence_splice_acceptor_variant' :
'Consequence_splice_acceptor_variant' : 0
'Consequence_splice_donor_variant' :
'Consequence_splice_donor_variant' : 0
'Consequence_stop_gained' :
'Consequence_stop_gained' : 0
'Consequence_frameshift_variant' :
'Consequence_frameshift_variant' : 0
'Consequence_stop_lost' :
'Consequence_stop_lost' : 0
'Consequence_start_lost' :
'Consequence_start_lost' : 0
'Consequence_inframe_insertion' :
'Consequence_inframe_insertion' : 0
'Consequence_inframe_deletion' :
'Consequence_inframe_deletion' : 0
'Consequence_missense_variant' :
'Consequence_missense_variant' : 0
'Consequence_protein_altering_variant' :
'Consequence_protein_altering_variant' : 0
'Consequence_splice_region_variant' :
'Consequence_splice_region_variant' : 0
'DeepSEA_PC_1' :
'DeepSEA_PC_1' : 0
'DeepSEA_PC_2' :
'DeepSEA_PC_2' : 0
'DeepSEA_PC_3' :
'DeepSEA_PC_3' : 0
'DeepSEA_PC_4' :
'DeepSEA_PC_4' : 0
'DeepSEA_PC_5' :
'DeepSEA_PC_5' : 0
'DeepSEA_PC_6' :
'DeepSEA_PC_6' : 0
'AF' :
'AF' : 0
'is_plof':
'is_plof' : 0
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
annotation_column_names:
'af' :
'combined_UKB_NFE_AF' : 0
'maf_mb' :
'combined_UKB_NFE_AF_MB' : 10000
'maf' :
'combined_UKB_NFE_MAF' : 0
'PolyPhen' :
'polyphen_score' : 0
'SIFT' :
'sift_score' : 1
'QKI_hg2' :
'DeepRipe_plus_QKI_lip_hg2' : 0
'QKI_k5' :
'DeepRipe_plus_QKI_clip_k5' : 0
'KHDRBS1_k5' :
'DeepRipe_plus_KHDRBS1_clip_k5' : 0
'ELAVL1_parclip' :
'DeepRipe_plus_ELAVL1_parclip' : 0
'TARDBP_parclip' :
'DeepRipe_plus_TARDBP_parclip' : 0
'HNRNPD_parclip' :
'DeepRipe_plus_HNRNPD_parclip' : 0
'MBNL1_parclip' :
'DeepRipe_plus_MBNL1_parclip' : 0
'QKI_parclip' :
'DeepRipe_plus_QKI_parclip' : 0
'Consequence_splice_acceptor_variant' :
'Consequence_splice_acceptor_variant' : 0
'Consequence_splice_donor_variant' :
'Consequence_splice_donor_variant' : 0
'Consequence_stop_gained' :
'Consequence_stop_gained' : 0
'Consequence_frameshift_variant' :
'Consequence_frameshift_variant' : 0
'Consequence_stop_lost' :
'Consequence_stop_lost' : 0
'Consequence_start_lost' :
'Consequence_start_lost' : 0
'Consequence_inframe_insertion' :
'Consequence_inframe_insertion' : 0
'Consequence_inframe_deletion' :
'Consequence_inframe_deletion' : 0
'Consequence_missense_variant' :
'Consequence_missense_variant' : 0
'Consequence_protein_altering_variant' :
'Consequence_protein_altering_variant' : 0
'Consequence_splice_region_variant' :
'Consequence_splice_region_variant' : 0
'DeepSEA_PC_1' :
'DeepSEA_PC_1' : 0
'DeepSEA_PC_2' :
'DeepSEA_PC_2' : 0
'DeepSEA_PC_3' :
'DeepSEA_PC_3' : 0
'DeepSEA_PC_4' :
'DeepSEA_PC_4' : 0
'DeepSEA_PC_5' :
'DeepSEA_PC_5' : 0
'DeepSEA_PC_6' :
'DeepSEA_PC_6' : 0
'AF' :
'AF' : 0
'is_plof':
'is_plof' : 0
Binary file not shown.

0 comments on commit 92ea94d

Please sign in to comment.