Skip to content

Commit

Permalink
added tests for annotation pipeline, variant file now parquet
Browse files Browse the repository at this point in the history
  • Loading branch information
Mück committed Apr 25, 2024
1 parent a8fcd47 commit a66b5b6
Show file tree
Hide file tree
Showing 26 changed files with 491 additions and 15 deletions.
10 changes: 4 additions & 6 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1469,16 +1469,14 @@ def add_ids(annotation_file: str, variant_file: str, njobs: int, out_file: str):
@cli.command()
@click.argument("annotation_file", type=click.Path(exists=True))
@click.argument("variant_file", type=click.Path(exists=True))
@click.argument("njobs", type=int)
@click.argument("out_file", type=click.Path())
def add_ids_dask(annotation_file: str, variant_file: str, njobs: int, out_file: str):
def add_ids_dask(annotation_file: str, variant_file: str, out_file: str):
"""
Add identifiers from a variant file to an annotation file using Dask and save the result.
Parameters:
- annotation_file (str): Path to the input annotation file in Parquet format.
- variant_file (str): Path to the input variant file in Parquet format.
- njobs (int): Number of parallel jobs to process the data.
- out_file (str): Path to save the processed data in Parquet format.
Returns:
Expand All @@ -1494,7 +1492,7 @@ def add_ids_dask(annotation_file: str, variant_file: str, njobs: int, out_file:
$ python annotations.py add_ids_dask annotation_data.parquet variant_data.parquet 4 processed_data.parquet
"""
data = dd.read_parquet(annotation_file, blocksize=25e9)
all_variants = pd.read_table(variant_file)
all_variants = pd.read_parquet(variant_file)
data = data.rename(
columns={
"#CHROM": "chrom",
Expand Down Expand Up @@ -1667,7 +1665,7 @@ def merge_annotations(
logger.info("load variant_file")

logger.info(f"reading in {variant_file}")
variants = pd.read_csv(variant_file, sep="\t")
variants = pd.read_parquet(variant_file)

logger.info("merge vep to variants M:1")
ca = vep_df.merge(
Expand Down Expand Up @@ -1932,7 +1930,7 @@ def get_af_from_gt(genotype_file: str, variants_filepath: str, out_file: str):
"""
import h5py

variants = pd.read_table(variants_filepath)
variants = pd.read_parquet(variants_filepath)
max_variant_id = variants["id"].max()

logger.info("Computing allele frequencies")
Expand Down
12 changes: 6 additions & 6 deletions pipelines/annotations.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ included_chromosomes = config.get(
)

preprocess_dir = Path(config.get("preprocessing_workdir", ""))
variant_file = config.get("variant_file_path") or preprocess_dir / 'norm' / 'variants' / 'variants.tsv.gz'
variant_pq = config.get("variant_parquetfile_path") or preprocess_dir / 'norm' / 'variants' / 'variants.parquet'
genotype_file = config.get("genotype_file_path") or preprocess_dir / 'preprocessed' / 'genotypes.h5'
saved_deepripe_models_path = (
Path(config["faatpipe_repo_dir"]) / "data" / "deepripe_models"
Expand Down Expand Up @@ -118,7 +118,8 @@ file_paths = [

file_paths = list(chain.from_iterable(file_paths))
human_sort(file_paths)
file_stems = [Path(p).stem.split('.')[0] for p in file_paths]
file_stems = [re.compile(source_variant_file_pattern.format(chr = "(\d+|X|Y)",block='\d+')).search(i).group() for i in file_paths]


absplice_download_dir = config.get('absplice_download_dir') or absplice_repo_dir /'example'/'data'/'resources'/'downloaded_files'
absplice_output_dir = config.get('absplice_output_dir', anno_tmp_dir /'absplice')
Expand Down Expand Up @@ -261,7 +262,7 @@ rule merge_allele_frequency:
rule calculate_allele_frequency:
input:
genotype_file = genotype_file,
variants = variant_file
variants = variant_pq
output:
allele_frequencies = anno_tmp_dir / "af_df.parquet"
shell:
Expand Down Expand Up @@ -361,7 +362,7 @@ rule merge_annotations:
/ (source_variant_file_pattern + "_variants.eclip_k5_deepripe.csv.gz"),
deepripe_hg2=anno_dir
/ (source_variant_file_pattern + "_variants.eclip_hg2_deepripe.csv.gz"),
variant_file=variant_file,
variant_file=variant_pq,
vcf_file= anno_tmp_dir / (source_variant_file_pattern + "_variants.vcf"),
output:
anno_dir / f"{source_variant_file_pattern}_merged.parquet",
Expand Down Expand Up @@ -396,7 +397,7 @@ rule deepSea_PCA:

rule add_ids_deepSea:
input:
variant_file=variant_file,
variant_file=variant_pq,
annotation_file=deepSEA_tmp_dir / "deepsea_pca.parquet",
output:
directory(anno_dir / "all_variants.wID.deepSea.parquet"),
Expand All @@ -408,7 +409,6 @@ rule add_ids_deepSea:
"add-ids-dask",
"{input.annotation_file}",
"{input.variant_file}",
"{threads}",
"{output}",
]
)
Expand Down
180 changes: 177 additions & 3 deletions tests/annotations/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,16 @@
"out_scores.parquet",
"expected.parquet",
),
(
"concatenate_deepseascores_medium",
"deepsea_scores_1.tsv",
"deepsea_scores_2.tsv",
"out_scores.parquet",
"expected.parquet",
),
],
)

def test_concatenate_deepsea(
test_data_name_dir, deepsea_scores_1, deepsea_scores_2, out_scores, expected_out_scores, tmp_path
):
Expand All @@ -31,19 +39,185 @@ def test_concatenate_deepsea(

deepsea_score_file_1 = current_test_data_dir / "input" / deepsea_scores_1
deepsea_score_file_2 = current_test_data_dir / "input" / deepsea_scores_2
out_out_scores_file = tmp_path / out_scores
out_scores_file = tmp_path / out_scores
expected_out_scores_file = current_test_data_dir / "expected" / expected_out_scores

cli_parameters = [
"concatenate-deepsea",
",".join([deepsea_score_file_1.as_posix(), deepsea_score_file_2.as_posix()]),
out_out_scores_file.as_posix(),"8"
out_scores_file.as_posix(),
"8"
]

result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
assert result.exit_code == 0

written_results = pd.read_parquet(out_out_scores_file)
written_results = pd.read_parquet(out_scores_file)

expected_scores = pd.read_parquet(expected_out_scores_file)
assert_frame_equal(written_results, expected_scores, check_exact = False)





@pytest.mark.parametrize(
    "test_data_name_dir, deepsea_scores_file, variant_file, out_df, expected_out_df",
    [
        (
            "add_ids_small",
            "deepseascores.parquet",
            "variants.parquet",
            "out_df.parquet",
            "expected.parquet",
        ),
        (
            "add_ids_medium",
            "deepseascores.parquet",
            "variants.parquet",
            "out_df.parquet",
            "expected.parquet",
        ),
    ],
)
def test_add_ids_dask(
    test_data_name_dir, deepsea_scores_file, variant_file, out_df, expected_out_df, tmp_path
):
    """Run the ``add-ids-dask`` CLI command on a fixture dataset.

    Invokes the command with a deepSEA scores parquet file and a variants
    parquet file, then compares the parquet it writes to ``tmp_path``
    against the expected parquet stored next to the inputs.
    """
    cli_runner = CliRunner()

    current_test_data_dir = tests_data_dir / "add_ids_dask" / test_data_name_dir

    deepsea_score_file = current_test_data_dir / "input" / deepsea_scores_file
    variant_path = current_test_data_dir / "input" / variant_file
    out_scores_file = tmp_path / out_df
    expected_out_scores_file = current_test_data_dir / "expected" / expected_out_df

    cli_parameters = [
        "add-ids-dask",
        deepsea_score_file.as_posix(),
        variant_path.as_posix(),
        out_scores_file.as_posix(),
    ]

    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
    assert result.exit_code == 0

    written_results = pd.read_parquet(out_scores_file)
    expected_scores = pd.read_parquet(expected_out_scores_file)
    # check_exact=False: score columns are floats, so compare with tolerance.
    assert_frame_equal(written_results, expected_scores, check_exact=False)





@pytest.mark.parametrize(
    "test_data_name_dir, deepsea_scores_file, pca_file, mean_sds_file, expected_out_df",
    [
        (
            "deepsea_pca_small",
            "deepseascores.parquet",
            "deepsea_pca/pca.npy",
            "deepsea_pca/mean_sds.parquet",
            "deepsea_pca.parquet",
        ),
        (
            "deepsea_pca_small",
            "deepseascores.parquet",
            "deepsea_pca/pca.npy",
            "{tmp_path}/mean_sds.parquet",
            "deepsea_pca.parquet",
        ),
        (
            "deepsea_pca_small",
            "deepseascores.parquet",
            "{tmp_path}/pca.npy",
            "deepsea_pca/mean_sds.parquet",
            "deepsea_pca.parquet",
        ),
        (
            "deepsea_pca_small",
            "deepseascores.parquet",
            "{tmp_path}/pca.npy",
            "{tmp_path}/mean_sds.parquet",
            "deepsea_pca.parquet",
        ),
    ],
)
def test_deepsea_pca(
    test_data_name_dir, deepsea_scores_file, pca_file, mean_sds_file, expected_out_df, tmp_path
):
    """Run the ``deepsea-pca`` CLI command on a fixture dataset.

    The ``pca_file`` / ``mean_sds_file`` parameters may contain a
    ``{tmp_path}`` placeholder: after ``.format()`` the result is an
    absolute path, and joining an absolute path onto
    ``current_test_data_dir`` with ``/`` discards the left-hand side
    (pathlib semantics). This lets the same expression cover both
    pre-existing fixture files and files the command must create fresh.
    """
    cli_runner = CliRunner()

    current_test_data_dir = tests_data_dir / "deepsea_pca" / test_data_name_dir

    deepsea_score_file = current_test_data_dir / "input" / deepsea_scores_file
    # Use new names instead of rebinding the parameters (avoids shadowing).
    pca_path = current_test_data_dir / pca_file.format(tmp_path=tmp_path)
    mean_sds_path = current_test_data_dir / mean_sds_file.format(tmp_path=tmp_path)
    expected_out_scores_file = current_test_data_dir / "expected" / expected_out_df

    cli_parameters = [
        "deepsea-pca",
        deepsea_score_file.as_posix(),
        pca_path.as_posix(),
        mean_sds_path.as_posix(),
        tmp_path.as_posix(),
    ]

    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
    assert result.exit_code == 0

    # The command writes its output into tmp_path under the expected name.
    written_results = pd.read_parquet(tmp_path / expected_out_df)
    expected_scores = pd.read_parquet(expected_out_scores_file)
    # check_exact=False: PCA scores are floats, so compare with tolerance.
    assert_frame_equal(written_results, expected_scores, check_exact=False)


@pytest.mark.parametrize(
    "test_data_name_dir, expected, hg2_output, k5_output, parclip_output, variants, vcf, vep_output, vep_header_line",
    [
        (
            "merge_annotations_small",
            "merged_annotations_expected.parquet",
            "test_hg2_deepripe.csv.gz",
            "test_k5_deepripe.csv.gz",
            "test_parclip.csv.gz",
            "variants.parquet",
            "test.vcf",
            "test_vep.tsv",
            "49",
        ),
    ]
)
def test_merge_annotations(
    test_data_name_dir, expected, hg2_output, k5_output, parclip_output, variants, vcf, vep_output, vep_header_line, tmp_path
):
    """Run the ``merge-annotations`` CLI command on a fixture dataset.

    Feeds the VEP, deepRiPe (parclip / hg2 / k5), variants, and VCF inputs
    to the command, then checks the merged parquet written to ``tmp_path``
    against the expected parquet (shape first, then values column-by-column
    in the expected column order).
    """
    data_dir = tests_data_dir / "merge_annotations" / test_data_name_dir
    input_dir = data_dir / "input"

    expected_file = data_dir / "expected" / expected
    merged_output = tmp_path / "out_merged.parquet"

    runner = CliRunner()
    args = [
        "merge-annotations",
        vep_header_line,
        (input_dir / vep_output).as_posix(),
        (input_dir / parclip_output).as_posix(),
        (input_dir / hg2_output).as_posix(),
        (input_dir / k5_output).as_posix(),
        (input_dir / variants).as_posix(),
        (input_dir / vcf).as_posix(),
        merged_output.as_posix(),
    ]

    outcome = runner.invoke(annotations_cli, args, catch_exceptions=False)
    assert outcome.exit_code == 0

    produced = pd.read_parquet(merged_output)
    reference = pd.read_parquet(expected_file)
    assert produced.shape == reference.shape
    # Align column order to the reference before comparing values.
    assert_frame_equal(produced[reference.columns], reference, check_exact=False)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
chr3 3474106 rs111 C T
chr3 6134790 rs881 A G
chr3 6492413 rs94 A G
chr3 7479092 rs200 T A
chr3 10151779 rs337 G T
chr3 10963200 rs632 C A
chr3 13336897 rs178 T A
chr3 25565017 rs37 C G
chr3 28027872 rs721 T C
chr3 30429305 rs135 T C
chr3 39059372 rs23 A C
chr3 47378509 rs727 T A
chr3 47839379 rs268 C T
chr3 55062103 rs873 A G
chr3 56288165 rs664 G C
chr3 64813843 rs815 A G
chr3 70306576 rs107 C G
chr3 72140079 rs492 A T
chr3 72906610 rs930 T G
chr3 74562325 rs523 G T
chr3 78839934 rs583 G A
chr3 81414874 rs170 A T
chr3 97458263 rs548 A T
chr3 97649369 rs546 C G
chr3 97949211 rs543 G A
chr3 99075824 rs838 T C
chr3 101580812 rs311 A C
chr3 103151123 rs382 C A
chr3 103329532 rs179 T C
chr3 103928516 rs19 A T
chr3 105180981 rs341 A G
chr3 111113126 rs470 A G
chr3 111866541 rs467 T A
chr3 117455785 rs718 C A
chr3 120258434 rs506 A C
chr3 120364684 rs367 T G
chr3 122803142 rs488 A C
chr3 125013245 rs146 A G
chr3 127342540 rs318 G T
chr3 133734681 rs104 G A
chr3 139349025 rs665 T C
chr3 140275153 rs791 G C
chr3 145304395 rs102 C G
chr3 147901161 rs274 C T
chr3 150051584 rs123 C A
chr3 150399452 rs648 T A
chr3 158349305 rs748 T A
chr3 158851780 rs408 A T
chr3 160382108 rs963 A C
chr3 168465216 rs751 G C
chr3 171089322 rs197 A T
chr3 177499702 rs376 G C
chr3 185836100 rs581 G T
chr3 191856146 rs596 G C
chr3 192824921 rs701 C G
chr3 193390684 rs434 A C
chr3 194577309 rs70 T G
chr3 194762766 rs598 G A
chr3 197345633 rs356 T A
chr3 197732094 rs693 G T
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit a66b5b6

Please sign in to comment.