diff --git a/deeprvat/preprocessing/preprocess.py b/deeprvat/preprocessing/preprocess.py index e979912b..9f5f22d1 100644 --- a/deeprvat/preprocessing/preprocess.py +++ b/deeprvat/preprocessing/preprocess.py @@ -277,6 +277,8 @@ def process_sparse_gt( variants = variants[~variants["id"].isin(variant_ids_to_exclude)] if not skip_sanity_checks: assert total_variants - len(variants) == len(variant_ids_to_exclude) + if variants.empty: + raise ValueError("All variants have been filtered out.") logging.info(f"Dropped {total_variants - len(variants)} variants") logging.info(f"...done ({time.time() - start_time} s)") @@ -313,6 +315,9 @@ def process_sparse_gt( samples = sorted(list(samples)) + if len(samples) == 0: + raise ValueError("All samples have been excluded.") + logging.info("Processing sparse GT files by chromosome") total_calls_dropped = 0 variant_groups = variants.groupby("chrom") diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/expected/expected_data.npz b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/expected/expected_data.npz new file mode 100644 index 00000000..607f68df Binary files /dev/null and b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/expected/expected_data.npz differ diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/qc/excluded_samples.csv b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/qc/excluded_samples.csv new file mode 100644 index 00000000..ea0a0082 --- /dev/null +++ b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/qc/excluded_samples.csv @@ -0,0 +1,11 @@ +100096 +100097 +100099 +100100 +100101 +100102 +100103 +100104 +100105 +100106 +100107 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/samples_chr.csv b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/samples_chr.csv new file mode 100644 index 00000000..ea0a0082 --- /dev/null +++ b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/samples_chr.csv @@ -0,0 +1,11 @@ +100096 +100097 +100099 +100100 +100101 +100102 +100103 +100104 +100105 +100106 +100107 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz new file mode 100644 index 00000000..0fee2c66 Binary files /dev/null and b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz differ diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.parquet b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.parquet new file mode 100644 index 00000000..df779fb3 Binary files /dev/null and b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.parquet differ diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.tsv.gz new file mode 100644 index 00000000..6da9e9cd Binary files /dev/null and b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.tsv.gz differ diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/expected/expected_data.npz b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/expected/expected_data.npz new file mode 100644 index 00000000..e69de29b diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/qc/input_c1_b1.tsv b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/qc/input_c1_b1.tsv new file mode 100644 index 00000000..0f863dc6 --- /dev/null +++ b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/qc/input_c1_b1.tsv @@ -0,0 +1,20 @@ +chr1 16103 T G +chr1 51479 T A +chr1 51898 C A +chr1 51928 G A +chr1 51954 G C +chr1 54490 G A +chr1 54669 C T +chr1 54708 G C +chr1 54716 C T +chr1 54725 T G +chr1 54727 T C +chr1 54753 T G +chr1 55299 C T +chr1 55326 T C +chr1 55330 G A +chr1 55351 T A +chr1 55365 A G +chr1 55367 G A +chr1 55385 A G +chr1 55388 C T diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/samples_chr.csv b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/samples_chr.csv new file mode 100644 index 00000000..ea0a0082 --- /dev/null +++ b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/samples_chr.csv @@ -0,0 +1,11 @@ +100096 +100097 +100099 +100100 +100101 +100102 +100103 +100104 +100105 +100106 +100107 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz new file mode 100644 index 00000000..e69de29b diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.parquet b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.parquet new file mode 100644 index 00000000..df779fb3 Binary files /dev/null and b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.parquet differ diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.tsv.gz new file mode 100644 index 00000000..e69de29b diff --git a/tests/preprocessing/test_preprocess.py b/tests/preprocessing/test_preprocess.py index a8fc0415..cd5828b4 100644 --- a/tests/preprocessing/test_preprocess.py +++ b/tests/preprocessing/test_preprocess.py @@ -23,7 +23,7 @@ def load_h5_archive(h5_path): @pytest.mark.parametrize( - "test_data_name_dir, extra_cli_params, genotype_file_name", + "test_data_name_dir, extra_cli_params, genotype_file_name, should_fail", [ ( "no_filters_minimal", @@ -32,6 +32,7 @@ def load_h5_archive(h5_path): "1", ], "genotypes_chr1.h5", + False, ), ( "no_filters_minimal_str_samples", @@ -40,6 +41,7 @@ def load_h5_archive(h5_path): "1", ], "genotypes_chr1.h5", + False, ), ( "filter_variants_minimal", @@ -50,6 +52,18 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_variants_minimal/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, + ), + ( + "filter_variants_all", + [ + "--chromosomes", + "1", + "--exclude-variants", + f"{(tests_data_dir / 'process_sparse_gt/filter_variants_all/input/qc').as_posix()}", + ], + "genotypes_chr1.h5", + True, ), ( "filter_variants_multiple", @@ -60,6 +74,7 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_variants_multiple/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, ), ( "filter_samples_minimal", @@ -70,6 +85,18 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_samples_minimal/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, + ), + ( + "filter_samples_all", + [ + "--chromosomes", + "1", + "--exclude-samples", + f"{(tests_data_dir / 'process_sparse_gt/filter_samples_all/input/qc').as_posix()}", + ], + "genotypes_chr1.h5", + True, ), ( "filter_calls_minimal", @@ -80,6 +107,7 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_calls_minimal/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, ), ( "filter_calls_vars_samples_minimal", @@ -94,11 +122,12 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_calls_vars_samples_minimal/input/qc/variants/').as_posix()}", ], "genotypes_chr1.h5", + False, ), ], ) def test_process_sparse_gt_file( - test_data_name_dir, extra_cli_params, genotype_file_name, tmp_path + test_data_name_dir, extra_cli_params, genotype_file_name, should_fail, tmp_path ): cli_runner = CliRunner() @@ -127,7 +156,14 @@ def test_process_sparse_gt_file( out_file_base.as_posix(), ] - result = cli_runner.invoke(preprocess_cli, cli_parameters, catch_exceptions=False) + result = cli_runner.invoke(preprocess_cli, cli_parameters, catch_exceptions=True) + + if should_fail: + assert isinstance(result.exception, ValueError) + return + else: + assert result.exception is None + assert result.exit_code == 0 h5_file = out_file_base.as_posix().replace("genotypes", genotype_file_name)