Skip to content

Commit

Permalink
Add checks for empty samples and variants
Browse files (browse the repository at this point in the history)
  • Loading branch information
endast committed May 22, 2024
1 parent cd7037a commit bacc7d2
Show file tree
Hide file tree
Showing 14 changed files with 97 additions and 3 deletions.
5 changes: 5 additions & 0 deletions deeprvat/preprocessing/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,8 @@ def process_sparse_gt(
variants = variants[~variants["id"].isin(variant_ids_to_exclude)]
if not skip_sanity_checks:
assert total_variants - len(variants) == len(variant_ids_to_exclude)
if variants.empty:
raise ValueError("All variants have been filtered out.")

logging.info(f"Dropped {total_variants - len(variants)} variants")
logging.info(f"...done ({time.time() - start_time} s)")
Expand Down Expand Up @@ -313,6 +315,9 @@ def process_sparse_gt(

samples = sorted(list(samples))

if len(samples) == 0:
raise ValueError("All samples have been excluded.")

logging.info("Processing sparse GT files by chromosome")
total_calls_dropped = 0
variant_groups = variants.groupby("chrom")
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
100096
100097
100099
100100
100101
100102
100103
100104
100105
100106
100107
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
100096
100097
100099
100100
100101
100102
100103
100104
100105
100106
100107
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
chr1 16103 T G
chr1 51479 T A
chr1 51898 C A
chr1 51928 G A
chr1 51954 G C
chr1 54490 G A
chr1 54669 C T
chr1 54708 G C
chr1 54716 C T
chr1 54725 T G
chr1 54727 T C
chr1 54753 T G
chr1 55299 C T
chr1 55326 T C
chr1 55330 G A
chr1 55351 T A
chr1 55365 A G
chr1 55367 G A
chr1 55385 A G
chr1 55388 C T
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
100096
100097
100099
100100
100101
100102
100103
100104
100105
100106
100107
Binary file not shown.
Empty file.
42 changes: 39 additions & 3 deletions tests/preprocessing/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def load_h5_archive(h5_path):


@pytest.mark.parametrize(
"test_data_name_dir, extra_cli_params, genotype_file_name",
"test_data_name_dir, extra_cli_params, genotype_file_name, should_fail",
[
(
"no_filters_minimal",
Expand All @@ -32,6 +32,7 @@ def load_h5_archive(h5_path):
"1",
],
"genotypes_chr1.h5",
False,
),
(
"no_filters_minimal_str_samples",
Expand All @@ -40,6 +41,7 @@ def load_h5_archive(h5_path):
"1",
],
"genotypes_chr1.h5",
False,
),
(
"filter_variants_minimal",
Expand All @@ -50,6 +52,18 @@ def load_h5_archive(h5_path):
f"{(tests_data_dir / 'process_sparse_gt/filter_variants_minimal/input/qc').as_posix()}",
],
"genotypes_chr1.h5",
False,
),
(
"filter_variants_all",
[
"--chromosomes",
"1",
"--exclude-variants",
f"{(tests_data_dir / 'process_sparse_gt/filter_variants_all/input/qc').as_posix()}",
],
"genotypes_chr1.h5",
True,
),
(
"filter_variants_multiple",
Expand All @@ -60,6 +74,7 @@ def load_h5_archive(h5_path):
f"{(tests_data_dir / 'process_sparse_gt/filter_variants_multiple/input/qc').as_posix()}",
],
"genotypes_chr1.h5",
False,
),
(
"filter_samples_minimal",
Expand All @@ -70,6 +85,18 @@ def load_h5_archive(h5_path):
f"{(tests_data_dir / 'process_sparse_gt/filter_samples_minimal/input/qc').as_posix()}",
],
"genotypes_chr1.h5",
False,
),
(
"filter_samples_all",
[
"--chromosomes",
"1",
"--exclude-samples",
f"{(tests_data_dir / 'process_sparse_gt/filter_samples_all/input/qc').as_posix()}",
],
"genotypes_chr1.h5",
True,
),
(
"filter_calls_minimal",
Expand All @@ -80,6 +107,7 @@ def load_h5_archive(h5_path):
f"{(tests_data_dir / 'process_sparse_gt/filter_calls_minimal/input/qc').as_posix()}",
],
"genotypes_chr1.h5",
False,
),
(
"filter_calls_vars_samples_minimal",
Expand All @@ -94,11 +122,12 @@ def load_h5_archive(h5_path):
f"{(tests_data_dir / 'process_sparse_gt/filter_calls_vars_samples_minimal/input/qc/variants/').as_posix()}",
],
"genotypes_chr1.h5",
False,
),
],
)
def test_process_sparse_gt_file(
test_data_name_dir, extra_cli_params, genotype_file_name, tmp_path
test_data_name_dir, extra_cli_params, genotype_file_name, should_fail, tmp_path
):
cli_runner = CliRunner()

Expand Down Expand Up @@ -127,7 +156,14 @@ def test_process_sparse_gt_file(
out_file_base.as_posix(),
]

result = cli_runner.invoke(preprocess_cli, cli_parameters, catch_exceptions=False)
result = cli_runner.invoke(preprocess_cli, cli_parameters, catch_exceptions=True)

if should_fail:
assert isinstance(result.exception, ValueError)
return
else:
assert result.exception is None

assert result.exit_code == 0

h5_file = out_file_base.as_posix().replace("genotypes", genotype_file_name)
Expand Down

0 comments on commit bacc7d2

Please sign in to comment.