Skip to content

Commit

Permalink
added further tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Mück committed May 8, 2024
1 parent 7e7ff8f commit 88a2f09
Show file tree
Hide file tree
Showing 20 changed files with 276 additions and 47 deletions.
12 changes: 6 additions & 6 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2000,19 +2000,19 @@ def calculate_maf(annotations_path: str, out_file: str):


@cli.command()
@click.argument("protein_id_file", type=click.Path(exists=True))
@click.argument("gene_id_file", type=click.Path(exists=True))
@click.argument("annotations_path", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
def add_protein_ids(protein_id_file: str, annotations_path: str, out_file: str):
def add_gene_ids(gene_id_file: str, annotations_path: str, out_file: str):
"""
Add protein IDs to the annotations based on protein ID mapping file.
Add gene IDs to the annotations based on gene ID mapping file.
Parameters:
- protein_id_file (str): Path to the protein ID mapping file.
- gene_id_file (str): Path to the gene ID mapping file.
- annotations_path (str): Path to the annotations file.
- out_file (str): Path to the output file to save the annotations with gene IDs.
"""
genes = pd.read_parquet(protein_id_file)
genes = pd.read_parquet(gene_id_file)
genes[["gene_base", "feature"]] = genes["gene"].str.split(".", expand=True)
genes.drop(columns=["feature", "gene", "gene_name", "gene_type"], inplace=True)
genes.rename(columns={"id": "gene_id"}, inplace=True)
Expand All @@ -2027,7 +2027,7 @@ def add_protein_ids(protein_id_file: str, annotations_path: str, out_file: str):
@cli.command()
@click.argument("gtf_filepath", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
def create_protein_id_file(gtf_filepath: str, out_file: str):
def create_gene_id_file(gtf_filepath: str, out_file: str):
"""
Create a gene ID mapping file from the GTF file.
Expand Down
4 changes: 2 additions & 2 deletions pipelines/annotations.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ if not gene_id_file:
shell:
" ".join([
f"deeprvat_annotations",
"create-protein-id-file",
"create-gene-id-file",
"{input}",
"{output}"
])
Expand Down Expand Up @@ -215,7 +215,7 @@ rule add_gene_ids:
shell:
" ".join([
f"deeprvat_annotations",
"add-protein-ids",
"add-gene-ids",
"{input.gene_id_file}",
"{input.annotations_path}",
"{output}"
Expand Down
244 changes: 205 additions & 39 deletions tests/annotations/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,8 @@ def test_merge_absplice_scores(
assert written_results.shape == expected_results.shape
assert_frame_equal(written_results, expected_results, check_exact = False)



@pytest.mark.parametrize(
"test_data_name_dir, genotype_file, variant_file, expected",
[
Expand Down Expand Up @@ -402,47 +404,211 @@ def test_calculate_allele_frequencies(
written_results = pd.read_parquet(output_path)
expected_results = pd.read_parquet(expected_path)
assert written_results.shape == expected_results.shape
assert_frame_equal(written_results, expected_results, check_exact = False)



@pytest.mark.parametrize(
    "test_data_name_dir, af_df, annotaton_df, expected",
    [
        (
            "merge_af_small",
            "af_df.parquet",
            "vep_deepripe_deepsea_absplice.parquet",
            "vep_deepripe_deepsea_absplice_af.parquet",
        ),
    ],
)
def test_merge_af(test_data_name_dir, af_df, annotaton_df, expected, tmp_path):
    """Invoke ``merge-af`` on fixture data and compare the written parquet with the expected one."""
    case_dir = tests_data_dir / "merge_af" / test_data_name_dir
    input_dir = case_dir / "input"
    out_parquet = tmp_path / "out.parquet"

    # Positional CLI arguments: allele-frequency table, annotation table, output file.
    argv = [
        "merge-af",
        (input_dir / af_df).as_posix(),
        (input_dir / annotaton_df).as_posix(),
        out_parquet.as_posix(),
    ]
    outcome = CliRunner().invoke(annotations_cli, argv, catch_exceptions=False)
    assert outcome.exit_code == 0

    actual = pd.read_parquet(out_parquet)
    reference = pd.read_parquet(case_dir / "expected" / expected)
    assert actual.shape == reference.shape
    # Reorder the reference columns to the output's column order before comparing.
    assert_frame_equal(actual, reference[actual.columns], check_exact=False)



@pytest.mark.parametrize(
    "test_data_name_dir, annotations, expected",
    [
        (
            "calculate_MAF_small",
            "annotations.parquet",
            "expected.parquet",
        ),
    ],
)
def test_calculate_maf(test_data_name_dir, annotations, expected, tmp_path):
    """Invoke ``calculate-maf`` on fixture data and compare the written parquet with the expected one."""
    case_dir = tests_data_dir / "calculate_MAF" / test_data_name_dir
    out_parquet = tmp_path / "out.parquet"

    # Positional CLI arguments: annotation table, output file.
    argv = [
        "calculate-maf",
        (case_dir / "input" / annotations).as_posix(),
        out_parquet.as_posix(),
    ]
    outcome = CliRunner().invoke(annotations_cli, argv, catch_exceptions=False)
    assert outcome.exit_code == 0

    actual = pd.read_parquet(out_parquet)
    reference = pd.read_parquet(case_dir / "expected" / expected)
    assert actual.shape == reference.shape
    assert_frame_equal(actual, reference, check_exact=False)



@pytest.mark.parametrize(
    "test_data_name_dir, gtf_file, expected",
    [
        (
            "create_gene_id_file_small",
            "gencode.v44.annotation.gtf.gz",
            "protein_coding_genes.parquet",
        ),
    ],
)
def test_create_gene_id_file(test_data_name_dir, gtf_file, expected, tmp_path):
    """Invoke ``create-gene-id-file`` on a fixture GTF and compare the written parquet with the expected one."""
    case_dir = tests_data_dir / "create_gene_id_file" / test_data_name_dir
    out_parquet = tmp_path / "out.parquet"

    # Positional CLI arguments: GTF file, output file.
    argv = [
        "create-gene-id-file",
        (case_dir / "input" / gtf_file).as_posix(),
        out_parquet.as_posix(),
    ]
    outcome = CliRunner().invoke(annotations_cli, argv, catch_exceptions=False)
    assert outcome.exit_code == 0

    actual = pd.read_parquet(out_parquet)
    reference = pd.read_parquet(case_dir / "expected" / expected)
    assert actual.shape == reference.shape
    # Reorder the reference columns to the output's column order before comparing.
    assert_frame_equal(actual, reference[actual.columns], check_exact=False)


@pytest.mark.parametrize(
    "test_data_name_dir, annotations, gene_id_file , expected",
    [
        (
            "add_gene_ids_small",
            "annotations.parquet",
            "protein_coding_genes.parquet",
            "expected.parquet",
        ),
    ],
)
def test_add_gene_ids(test_data_name_dir, annotations, gene_id_file, expected, tmp_path):
    """Invoke ``add-gene-ids`` on fixture data and compare the written parquet with the expected one."""
    case_dir = tests_data_dir / "add_gene_ids" / test_data_name_dir
    input_dir = case_dir / "input"
    out_parquet = tmp_path / "out.parquet"

    # The command takes the gene ID file first, then the annotations, then the output file.
    argv = [
        "add-gene-ids",
        (input_dir / gene_id_file).as_posix(),
        (input_dir / annotations).as_posix(),
        out_parquet.as_posix(),
    ]
    outcome = CliRunner().invoke(annotations_cli, argv, catch_exceptions=False)
    assert outcome.exit_code == 0

    actual = pd.read_parquet(out_parquet)
    reference = pd.read_parquet(case_dir / "expected" / expected)
    assert actual.shape == reference.shape
    assert_frame_equal(actual, reference, check_exact=False)


@pytest.mark.parametrize(
    "test_data_name_dir, gtf_file, annotations, gene_id_file, expected",
    [
        (
            "filter_by_exon_distance_small",
            "gencode.v44.annotation.gtf.gz",
            "annotations.parquet",
            "protein_coding_genes.parquet",
            "expected.parquet",
        ),
    ],
)
def test_filter_by_exon_distance(
    test_data_name_dir, gtf_file, annotations, gene_id_file, expected, tmp_path
):
    """Invoke ``filter-annotations-by-exon-distance`` on fixture data and compare the output with the expected parquet."""
    case_dir = tests_data_dir / "filter_by_exon_distance" / test_data_name_dir
    input_dir = case_dir / "input"
    out_parquet = tmp_path / "out.parquet"

    # Positional CLI arguments: annotations, GTF file, gene ID file, output file.
    argv = [
        "filter-annotations-by-exon-distance",
        (input_dir / annotations).as_posix(),
        (input_dir / gtf_file).as_posix(),
        (input_dir / gene_id_file).as_posix(),
        out_parquet.as_posix(),
    ]
    outcome = CliRunner().invoke(annotations_cli, argv, catch_exceptions=False)
    assert outcome.exit_code == 0

    actual = pd.read_parquet(out_parquet)
    reference = pd.read_parquet(case_dir / "expected" / expected)
    assert actual.shape == reference.shape
    assert_frame_equal(actual, reference, check_exact=False)


@pytest.mark.parametrize(
    "test_data_name_dir, yaml_file, annotations, expected",
    [
        (
            "select_rename_fill_columns_small",
            "annotation_colnames_filling_values.yaml",
            "annotations.parquet",
            "expected.parquet",
        ),
    ],
)
def test_select_rename_fill_annotations(
    test_data_name_dir, yaml_file, annotations, expected, tmp_path
):
    """Invoke ``select-rename-fill-annotations`` on fixture data and compare the output with the expected parquet."""
    case_dir = tests_data_dir / "select_rename_fill_columns" / test_data_name_dir
    input_dir = case_dir / "input"
    out_parquet = tmp_path / "out.parquet"

    # Positional CLI arguments: YAML config, annotations, output file.
    argv = [
        "select-rename-fill-annotations",
        (input_dir / yaml_file).as_posix(),
        (input_dir / annotations).as_posix(),
        out_parquet.as_posix(),
    ]
    outcome = CliRunner().invoke(annotations_cli, argv, catch_exceptions=False)
    assert outcome.exit_code == 0

    actual = pd.read_parquet(out_parquet)
    reference = pd.read_parquet(case_dir / "expected" / expected)
    assert actual.shape == reference.shape
    # Reorder the reference columns to the output's column order before comparing.
    assert_frame_equal(actual, reference[actual.columns], check_exact=False)


# @pytest.mark.parametrize(
# "test_data_name_dir, input_file_1, input_file_2, parameter1, expected",
# [
# ( "test_name_dir",
# "input_file1.parquet",
# "input_file2.parquet",
# "8",
# "expected.parquet",
# ),
# ]
# )
# def template(
# test_data_name_dir, input_file_1, input_file_2, parameter1, expected, tmp_path
# ):
# current_test_data_dir = tests_data_dir / 'test_name' / test_data_name_dir
# input_path1 = current_test_data_dir / 'input' / input_file_1
# input_path2 = current_test_data_dir / 'input' /input_file_2
# expected_path = current_test_data_dir / 'expected' / expected
# output_path = tmp_path / 'out.parquet'
# cli_runner = CliRunner()
# cli_parameters = [
# 'function-name',
# input_path1.as_posix(),
# input_path2.as_posix(),
# output_path.as_posix(),
# parameter1,
# ]
# result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
# assert result.exit_code == 0
# written_results = pd.read_parquet(output_path)
# expected_results = pd.read_parquet(expected_path)
# assert written_results.shape == expected_results.shape
# assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False)

# @pytest.mark.parametrize(
# "test_name_dir, input_file_1, input_file_2, parameter1, expected",
# "test_data_name_dir, input_file_1, input_file_2, parameter1, expected",
# [
# ( "test_name_dir",
# "input_file1.parquet",
Expand All @@ -456,15 +622,15 @@ def test_calculate_allele_frequencies(
# test_data_name_dir, input_file_1, input_file_2, parameter1, expected, tmp_path
# ):
# current_test_data_dir = tests_data_dir / 'test_name' / test_data_name_dir
# input_path1 = current_test_data_dir / 'input' / input_file_1
# input_path2 = current_test_data_dir / 'input' /input_file_2
# input_file_1_path = current_test_data_dir / 'input' / input_file_1
# input_file_2_path = current_test_data_dir / 'input' /input_file_2
# expected_path = current_test_data_dir / 'expected' / expected
# output_path = tmp_path / 'out.parquet'
# cli_runner = CliRunner()
# cli_parameters = [
# 'function-name',
# input_path1.as_posix(),
# input_path2.as_posix(),
# input_file_1_path.as_posix(),
# input_file_2_path.as_posix(),
# output_path.as_posix(),
# parameter1,
# ]
Expand All @@ -473,4 +639,4 @@ def test_calculate_allele_frequencies(
# written_results = pd.read_parquet(output_path)
# expected_results = pd.read_parquet(expected_path)
# assert written_results.shape == expected_results.shape
# assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False)
# assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
annotation_column_names:
'af' :
'combined_UKB_NFE_AF' : 0
'maf_mb' :
'combined_UKB_NFE_AF_MB' : 10000
'maf' :
'combined_UKB_NFE_MAF' : 0
'PolyPhen' :
'polyphen_score' : 0
'SIFT' :
'sift_score' : 1
'QKI_hg2' :
'DeepRipe_plus_QKI_lip_hg2' : 0
'QKI_k5' :
'DeepRipe_plus_QKI_clip_k5' : 0
'KHDRBS1_k5' :
'DeepRipe_plus_KHDRBS1_clip_k5' : 0
'ELAVL1_parclip' :
'DeepRipe_plus_ELAVL1_parclip' : 0
'TARDBP_parclip' :
'DeepRipe_plus_TARDBP_parclip' : 0
'HNRNPD_parclip' :
'DeepRipe_plus_HNRNPD_parclip' : 0
'MBNL1_parclip' :
'DeepRipe_plus_MBNL1_parclip' : 0
'QKI_parclip' :
'DeepRipe_plus_QKI_parclip' : 0
'Consequence_splice_acceptor_variant' :
'Consequence_splice_acceptor_variant' : 0
'Consequence_splice_donor_variant' :
'Consequence_splice_donor_variant' : 0
'Consequence_stop_gained' :
'Consequence_stop_gained' : 0
'Consequence_frameshift_variant' :
'Consequence_frameshift_variant' : 0
'Consequence_stop_lost' :
'Consequence_stop_lost' : 0
'Consequence_start_lost' :
'Consequence_start_lost' : 0
'Consequence_inframe_insertion' :
'Consequence_inframe_insertion' : 0
'Consequence_inframe_deletion' :
'Consequence_inframe_deletion' : 0
'Consequence_missense_variant' :
'Consequence_missense_variant' : 0
'Consequence_protein_altering_variant' :
'Consequence_protein_altering_variant' : 0
'Consequence_splice_region_variant' :
'Consequence_splice_region_variant' : 0
'DeepSEA_PC_1' :
'DeepSEA_PC_1' : 0
'DeepSEA_PC_2' :
'DeepSEA_PC_2' : 0
'DeepSEA_PC_3' :
'DeepSEA_PC_3' : 0
'DeepSEA_PC_4' :
'DeepSEA_PC_4' : 0
'DeepSEA_PC_5' :
'DeepSEA_PC_5' : 0
'DeepSEA_PC_6' :
'DeepSEA_PC_6' : 0
'AF' :
'AF' : 0
Binary file not shown.

0 comments on commit 88a2f09

Please sign in to comment.