diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index e31016cc..0f772c4f 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -2000,19 +2000,19 @@ def calculate_maf(annotations_path: str, out_file: str): @cli.command() -@click.argument("protein_id_file", type=click.Path(exists=True)) +@click.argument("gene_id_file", type=click.Path(exists=True)) @click.argument("annotations_path", type=click.Path(exists=True)) @click.argument("out_file", type=click.Path()) -def add_protein_ids(protein_id_file: str, annotations_path: str, out_file: str): +def add_gene_ids(gene_id_file: str, annotations_path: str, out_file: str): """ - Add protein IDs to the annotations based on protein ID mapping file. + Add gene IDs to the annotations based on gene ID mapping file. Parameters: - - protein_id_file (str): Path to the protein ID mapping file. + - gene_id_file (str): Path to the gene ID mapping file. - annotations_path (str): Path to the annotations file. - - out_file (str): Path to the output file to save the annotations with protein IDs. + - out_file (str): Path to the output file to save the annotations with gene IDs. """ - genes = pd.read_parquet(protein_id_file) + genes = pd.read_parquet(gene_id_file) genes[["gene_base", "feature"]] = genes["gene"].str.split(".", expand=True) genes.drop(columns=["feature", "gene", "gene_name", "gene_type"], inplace=True) genes.rename(columns={"id": "gene_id"}, inplace=True) @@ -2027,7 +2027,7 @@ def add_protein_ids(protein_id_file: str, annotations_path: str, out_file: str): @cli.command() @click.argument("gtf_filepath", type=click.Path(exists=True)) @click.argument("out_file", type=click.Path()) -def create_protein_id_file(gtf_filepath: str, out_file: str): +def create_gene_id_file(gtf_filepath: str, out_file: str): """ - Create a protein ID mapping file from the GTF file. + Create a gene ID mapping file from the GTF file. 
diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile index 5e500c14..869c4d31 100644 --- a/pipelines/annotations.snakefile +++ b/pipelines/annotations.snakefile @@ -185,7 +185,7 @@ if not gene_id_file: shell: " ".join([ f"deeprvat_annotations", - "create-protein-id-file", + "create-gene-id-file", "{input}", "{output}" ]) @@ -215,7 +215,7 @@ rule add_gene_ids: shell: " ".join([ f"deeprvat_annotations", - "add-protein-ids", + "add-gene-ids", "{input.gene_id_file}", "{input.annotations_path}", "{output}" diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py index 962ca10d..4fe3849a 100644 --- a/tests/annotations/test_annotations.py +++ b/tests/annotations/test_annotations.py @@ -372,6 +372,8 @@ def test_merge_absplice_scores( assert written_results.shape == expected_results.shape assert_frame_equal(written_results, expected_results, check_exact = False) + + @pytest.mark.parametrize( "test_data_name_dir, genotype_file, variant_file, expected", [ @@ -402,47 +404,211 @@ def test_calculate_allele_frequencies( written_results = pd.read_parquet(output_path) expected_results = pd.read_parquet(expected_path) assert written_results.shape == expected_results.shape + assert_frame_equal(written_results, expected_results, check_exact = False) + + + +@pytest.mark.parametrize( + "test_data_name_dir, af_df, annotaton_df, expected", + [ + ( "merge_af_small", + "af_df.parquet", + "vep_deepripe_deepsea_absplice.parquet", + "vep_deepripe_deepsea_absplice_af.parquet", + ), + ] +) +def test_merge_af( + test_data_name_dir, af_df, annotaton_df, expected, tmp_path +): + current_test_data_dir = tests_data_dir / 'merge_af' / test_data_name_dir + af_path = current_test_data_dir / 'input' / af_df + annotaions_path = current_test_data_dir / 'input' /annotaton_df + expected_path = current_test_data_dir / 'expected' / expected + output_path = tmp_path / 'out.parquet' + cli_runner = CliRunner() + cli_parameters = [ + 'merge-af', + 
af_path.as_posix(), + annotaions_path.as_posix(), + output_path.as_posix() + ] + result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) + assert result.exit_code == 0 + written_results = pd.read_parquet(output_path) + expected_results = pd.read_parquet(expected_path) + assert written_results.shape == expected_results.shape assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False) +@pytest.mark.parametrize( + "test_data_name_dir, annotations, expected", + [ + ( "calculate_MAF_small", + "annotations.parquet", + "expected.parquet", + ), + ] +) +def test_calculate_maf( + test_data_name_dir, annotations, expected, tmp_path +): + current_test_data_dir = tests_data_dir / 'calculate_MAF' / test_data_name_dir + annotations_path = current_test_data_dir / 'input' / annotations + expected_path = current_test_data_dir / 'expected' / expected + output_path = tmp_path / 'out.parquet' + cli_runner = CliRunner() + cli_parameters = [ + 'calculate-maf', + annotations_path.as_posix(), + output_path.as_posix() + ] + result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) + assert result.exit_code == 0 + written_results = pd.read_parquet(output_path) + expected_results = pd.read_parquet(expected_path) + assert written_results.shape == expected_results.shape + assert_frame_equal(written_results, expected_results, check_exact = False) + + + +@pytest.mark.parametrize( + "test_data_name_dir, gtf_file, expected", + [ + ( "create_gene_id_file_small", + "gencode.v44.annotation.gtf.gz", + "protein_coding_genes.parquet", + ), + ] +) +def test_create_gene_id_file( + test_data_name_dir, gtf_file, expected, tmp_path +): + current_test_data_dir = tests_data_dir / 'create_gene_id_file' / test_data_name_dir + input_path1 = current_test_data_dir / 'input' / gtf_file + expected_path = current_test_data_dir / 'expected' / expected + output_path = tmp_path / 'out.parquet' + cli_runner = CliRunner() + 
cli_parameters = [ + 'create-gene-id-file', + input_path1.as_posix(), + output_path.as_posix(), + ] + result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) + assert result.exit_code == 0 + written_results = pd.read_parquet(output_path) + expected_results = pd.read_parquet(expected_path) + assert written_results.shape == expected_results.shape + assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False) + + +@pytest.mark.parametrize( + "test_data_name_dir, annotations, gene_id_file , expected", + [ + ( "add_gene_ids_small", + "annotations.parquet", + "protein_coding_genes.parquet", + "expected.parquet", + ), + ] +) +def test_add_gene_ids( + test_data_name_dir, annotations, gene_id_file , expected, tmp_path +): + current_test_data_dir = tests_data_dir / 'add_gene_ids' / test_data_name_dir + annotations_path = current_test_data_dir / 'input' / annotations + gene_id_path = current_test_data_dir / 'input' /gene_id_file + expected_path = current_test_data_dir / 'expected' / expected + output_path = tmp_path / 'out.parquet' + cli_runner = CliRunner() + cli_parameters = [ + 'add-gene-ids', + gene_id_path.as_posix(), + annotations_path.as_posix(), + output_path.as_posix(), + ] + result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) + assert result.exit_code == 0 + written_results = pd.read_parquet(output_path) + expected_results = pd.read_parquet(expected_path) + assert written_results.shape == expected_results.shape + assert_frame_equal(written_results, expected_results, check_exact = False) + + +@pytest.mark.parametrize( + "test_data_name_dir, gtf_file, annotations, gene_id_file, expected", + [ + ( "filter_by_exon_distance_small", + "gencode.v44.annotation.gtf.gz", + "annotations.parquet", + "protein_coding_genes.parquet", + "expected.parquet", + ), + ] +) +def test_filter_by_exon_distance( + test_data_name_dir, gtf_file, annotations, gene_id_file, expected, tmp_path +): + 
current_test_data_dir = tests_data_dir / 'filter_by_exon_distance' / test_data_name_dir + gtf_file_path = current_test_data_dir / 'input' / gtf_file + annotations_path = current_test_data_dir / 'input' / annotations + gene_id_path = current_test_data_dir / 'input' / gene_id_file + + expected_path = current_test_data_dir / 'expected' / expected + output_path = tmp_path / 'out.parquet' + cli_runner = CliRunner() + cli_parameters = [ + 'filter-annotations-by-exon-distance', + annotations_path.as_posix(), + gtf_file_path.as_posix(), + gene_id_path.as_posix(), + output_path.as_posix(), + ] + result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) + assert result.exit_code == 0 + written_results = pd.read_parquet(output_path) + expected_results = pd.read_parquet(expected_path) + assert written_results.shape == expected_results.shape + assert_frame_equal(written_results, expected_results, check_exact = False) + + +@pytest.mark.parametrize( + "test_data_name_dir, yaml_file, annotations, expected", + [ + ( "select_rename_fill_columns_small", + "annotation_colnames_filling_values.yaml", + "annotations.parquet", + "expected.parquet", + ), + ] +) +def test_select_rename_fill_annotations( + test_data_name_dir, yaml_file, annotations, expected, tmp_path +): + current_test_data_dir = tests_data_dir / 'select_rename_fill_columns' / test_data_name_dir + yaml_file_path = current_test_data_dir / 'input' / yaml_file + annotations_path = current_test_data_dir / 'input' /annotations + expected_path = current_test_data_dir / 'expected' / expected + output_path = tmp_path / 'out.parquet' + cli_runner = CliRunner() + cli_parameters = [ + 'select-rename-fill-annotations', + yaml_file_path.as_posix(), + annotations_path.as_posix(), + output_path.as_posix(), + ] + result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) + assert result.exit_code == 0 + written_results = pd.read_parquet(output_path) + expected_results = 
pd.read_parquet(expected_path) + assert written_results.shape == expected_results.shape + assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False) + -# @pytest.mark.parametrize( -# "test_data_name_dir, input_file_1, input_file_2, parameter1, expected", -# [ -# ( "test_name_dir", -# "input_file1.parquet", -# "input_file2.parquet", -# "8", -# "expected.parquet", -# ), -# ] -# ) -# def template( -# test_data_name_dir, input_file_1, input_file_2, parameter1, expected, tmp_path -# ): -# current_test_data_dir = tests_data_dir / 'test_name' / test_data_name_dir -# input_path1 = current_test_data_dir / 'input' / input_file_1 -# input_path2 = current_test_data_dir / 'input' /input_file_2 -# expected_path = current_test_data_dir / 'expected' / expected -# output_path = tmp_path / 'out.parquet' -# cli_runner = CliRunner() -# cli_parameters = [ -# 'function-name', -# input_path1.as_posix(), -# input_path2.as_posix(), -# output_path.as_posix(), -# parameter1, -# ] -# result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) -# assert result.exit_code == 0 -# written_results = pd.read_parquet(output_path) -# expected_results = pd.read_parquet(expected_path) -# assert written_results.shape == expected_results.shape -# assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False) # @pytest.mark.parametrize( -# "test_name_dir, input_file_1, input_file_2, parameter1, expected", +# "test_data_name_dir, input_file_1, input_file_2, parameter1, expected", # [ # ( "test_name_dir", # "input_file1.parquet", @@ -456,15 +622,15 @@ def test_calculate_allele_frequencies( # test_data_name_dir, input_file_1, input_file_2, parameter1, expected, tmp_path # ): # current_test_data_dir = tests_data_dir / 'test_name' / test_data_name_dir -# input_path1 = current_test_data_dir / 'input' / input_file_1 -# input_path2 = current_test_data_dir / 'input' /input_file_2 +# input_file_1_path = 
current_test_data_dir / 'input' / input_file_1 +# input_file_2_path = current_test_data_dir / 'input' /input_file_2 # expected_path = current_test_data_dir / 'expected' / expected # output_path = tmp_path / 'out.parquet' # cli_runner = CliRunner() # cli_parameters = [ # 'function-name', -# input_path1.as_posix(), -# input_path2.as_posix(), +# input_file_1_path.as_posix(), +# input_file_2_path.as_posix(), # output_path.as_posix(), # parameter1, # ] @@ -473,4 +639,4 @@ def test_calculate_allele_frequencies( # written_results = pd.read_parquet(output_path) # expected_results = pd.read_parquet(expected_path) # assert written_results.shape == expected_results.shape -# assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False) +# assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False) \ No newline at end of file diff --git a/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/expected/expected.parquet b/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/expected/expected.parquet new file mode 100644 index 00000000..cf3bad1a Binary files /dev/null and b/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/expected/expected.parquet differ diff --git a/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/input/annotations.parquet b/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/input/annotations.parquet new file mode 100644 index 00000000..33835b07 Binary files /dev/null and b/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/input/annotations.parquet differ diff --git a/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/input/protein_coding_genes.parquet b/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/input/protein_coding_genes.parquet new file mode 100644 index 00000000..b693bbc4 Binary files /dev/null and b/tests/annotations/test_data/add_gene_ids/add_gene_ids_small/input/protein_coding_genes.parquet differ diff 
--git a/tests/annotations/test_data/calculate_MAF/calculate_MAF_small/expected/expected.parquet b/tests/annotations/test_data/calculate_MAF/calculate_MAF_small/expected/expected.parquet new file mode 100644 index 00000000..33835b07 Binary files /dev/null and b/tests/annotations/test_data/calculate_MAF/calculate_MAF_small/expected/expected.parquet differ diff --git a/tests/annotations/test_data/calculate_MAF/calculate_MAF_small/input/annotations.parquet b/tests/annotations/test_data/calculate_MAF/calculate_MAF_small/input/annotations.parquet new file mode 100644 index 00000000..2a0d5962 Binary files /dev/null and b/tests/annotations/test_data/calculate_MAF/calculate_MAF_small/input/annotations.parquet differ diff --git a/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_small/expected/protein_coding_genes.parquet b/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_small/expected/protein_coding_genes.parquet new file mode 100644 index 00000000..b693bbc4 Binary files /dev/null and b/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_small/expected/protein_coding_genes.parquet differ diff --git a/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_small/input/gencode.v44.annotation.gtf.gz b/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_small/input/gencode.v44.annotation.gtf.gz new file mode 100644 index 00000000..f1796e6a Binary files /dev/null and b/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_small/input/gencode.v44.annotation.gtf.gz differ diff --git a/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/expected/expected.parquet b/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/expected/expected.parquet new file mode 100644 index 00000000..cf3bad1a Binary files /dev/null and b/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/expected/expected.parquet differ 
diff --git a/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/annotations.parquet b/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/annotations.parquet new file mode 100644 index 00000000..cf3bad1a Binary files /dev/null and b/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/annotations.parquet differ diff --git a/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/gencode.v44.annotation.gtf.gz b/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/gencode.v44.annotation.gtf.gz new file mode 100644 index 00000000..f1796e6a Binary files /dev/null and b/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/gencode.v44.annotation.gtf.gz differ diff --git a/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/protein_coding_genes.parquet b/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/protein_coding_genes.parquet new file mode 100644 index 00000000..b693bbc4 Binary files /dev/null and b/tests/annotations/test_data/filter_by_exon_distance/filter_by_exon_distance_small/input/protein_coding_genes.parquet differ diff --git a/tests/annotations/test_data/merge_af/merge_af_small/expected/vep_deepripe_deepsea_absplice_af.parquet b/tests/annotations/test_data/merge_af/merge_af_small/expected/vep_deepripe_deepsea_absplice_af.parquet new file mode 100644 index 00000000..2a0d5962 Binary files /dev/null and b/tests/annotations/test_data/merge_af/merge_af_small/expected/vep_deepripe_deepsea_absplice_af.parquet differ diff --git a/tests/annotations/test_data/merge_af/merge_af_small/input/af_df.parquet b/tests/annotations/test_data/merge_af/merge_af_small/input/af_df.parquet new file mode 100644 index 00000000..d453427a Binary files /dev/null and 
b/tests/annotations/test_data/merge_af/merge_af_small/input/af_df.parquet differ diff --git a/tests/annotations/test_data/merge_af/merge_af_small/input/vep_deepripe_deepsea_absplice.parquet b/tests/annotations/test_data/merge_af/merge_af_small/input/vep_deepripe_deepsea_absplice.parquet new file mode 100644 index 00000000..3766aa55 Binary files /dev/null and b/tests/annotations/test_data/merge_af/merge_af_small/input/vep_deepripe_deepsea_absplice.parquet differ diff --git a/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected.parquet b/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected.parquet new file mode 100644 index 00000000..710d4112 Binary files /dev/null and b/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected.parquet differ diff --git a/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/input/annotation_colnames_filling_values.yaml b/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/input/annotation_colnames_filling_values.yaml new file mode 100644 index 00000000..fe0c483b --- /dev/null +++ b/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/input/annotation_colnames_filling_values.yaml @@ -0,0 +1,63 @@ +annotation_column_names: + 'af' : + 'combined_UKB_NFE_AF' : 0 + 'maf_mb' : + 'combined_UKB_NFE_AF_MB' : 10000 + 'maf' : + 'combined_UKB_NFE_MAF' : 0 + 'PolyPhen' : + 'polyphen_score' : 0 + 'SIFT' : + 'sift_score' : 1 + 'QKI_hg2' : + 'DeepRipe_plus_QKI_lip_hg2' : 0 + 'QKI_k5' : + 'DeepRipe_plus_QKI_clip_k5' : 0 + 'KHDRBS1_k5' : + 'DeepRipe_plus_KHDRBS1_clip_k5' : 0 + 'ELAVL1_parclip' : + 'DeepRipe_plus_ELAVL1_parclip' : 0 + 'TARDBP_parclip' : + 'DeepRipe_plus_TARDBP_parclip' : 0 + 'HNRNPD_parclip' : + 'DeepRipe_plus_HNRNPD_parclip' : 0 + 'MBNL1_parclip' : + 'DeepRipe_plus_MBNL1_parclip' 
: 0 + 'QKI_parclip' : + 'DeepRipe_plus_QKI_parclip' : 0 + 'Consequence_splice_acceptor_variant' : + 'Consequence_splice_acceptor_variant' : 0 + 'Consequence_splice_donor_variant' : + 'Consequence_splice_donor_variant' : 0 + 'Consequence_stop_gained' : + 'Consequence_stop_gained' : 0 + 'Consequence_frameshift_variant' : + 'Consequence_frameshift_variant' : 0 + 'Consequence_stop_lost' : + 'Consequence_stop_lost' : 0 + 'Consequence_start_lost' : + 'Consequence_start_lost' : 0 + 'Consequence_inframe_insertion' : + 'Consequence_inframe_insertion' : 0 + 'Consequence_inframe_deletion' : + 'Consequence_inframe_deletion' : 0 + 'Consequence_missense_variant' : + 'Consequence_missense_variant' : 0 + 'Consequence_protein_altering_variant' : + 'Consequence_protein_altering_variant' : 0 + 'Consequence_splice_region_variant' : + 'Consequence_splice_region_variant' : 0 + 'DeepSEA_PC_1' : + 'DeepSEA_PC_1' : 0 + 'DeepSEA_PC_2' : + 'DeepSEA_PC_2' : 0 + 'DeepSEA_PC_3' : + 'DeepSEA_PC_3' : 0 + 'DeepSEA_PC_4' : + 'DeepSEA_PC_4' : 0 + 'DeepSEA_PC_5' : + 'DeepSEA_PC_5' : 0 + 'DeepSEA_PC_6' : + 'DeepSEA_PC_6' : 0 + 'AF' : + 'AF' : 0 diff --git a/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/input/annotations.parquet b/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/input/annotations.parquet new file mode 100644 index 00000000..cf3bad1a Binary files /dev/null and b/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/input/annotations.parquet differ