From 8640c046b84f06565dd5de883cbee8155f609570 Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Fri, 24 May 2024 10:35:25 +0200 Subject: [PATCH] Add checks for empty samples and variants (#98) --- deeprvat/preprocessing/preprocess.py | 5 +++ .../expected/expected_data.npz | Bin 0 -> 777 bytes .../input/qc/excluded_samples.csv | 11 +++++ .../filter_samples_all/input/samples_chr.csv | 11 +++++ .../input/sparse_gt/chr1/input_c1_b1.tsv.gz | Bin 0 -> 286 bytes .../filter_samples_all/input/variants.parquet | Bin 0 -> 4039 bytes .../filter_samples_all/input/variants.tsv.gz | Bin 0 -> 198 bytes .../expected/expected_data.npz | 0 .../input/qc/input_c1_b1.tsv | 20 +++++++++ .../filter_variants_all/input/samples_chr.csv | 11 +++++ .../input/sparse_gt/chr1/input_c1_b1.tsv.gz | 0 .../input/variants.parquet | Bin 0 -> 4039 bytes .../filter_variants_all/input/variants.tsv.gz | 0 tests/preprocessing/test_preprocess.py | 42 ++++++++++++++++-- 14 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/expected/expected_data.npz create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/qc/excluded_samples.csv create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/samples_chr.csv create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.parquet create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.tsv.gz create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/expected/expected_data.npz create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/qc/input_c1_b1.tsv create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/samples_chr.csv create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.parquet create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.tsv.gz diff --git a/deeprvat/preprocessing/preprocess.py b/deeprvat/preprocessing/preprocess.py index e979912b..9f5f22d1 100644 --- a/deeprvat/preprocessing/preprocess.py +++ b/deeprvat/preprocessing/preprocess.py @@ -277,6 +277,8 @@ def process_sparse_gt( variants = variants[~variants["id"].isin(variant_ids_to_exclude)] if not skip_sanity_checks: assert total_variants - len(variants) == len(variant_ids_to_exclude) + if variants.empty: + raise ValueError("All variants have been filtered out.") logging.info(f"Dropped {total_variants - len(variants)} variants") logging.info(f"...done ({time.time() - start_time} s)") @@ -313,6 +315,9 @@ def process_sparse_gt( samples = sorted(list(samples)) + if len(samples) == 0: + raise ValueError("All samples have been excluded.") + logging.info("Processing sparse GT files by chromosome") total_calls_dropped = 0 variant_groups = variants.groupby("chrom") diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/expected/expected_data.npz b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/expected/expected_data.npz new file mode 100644 index 0000000000000000000000000000000000000000..607f68df517b3bd3b2d7496c593468000e3230d7 GIT binary patch literal 777 zcmWIWW@Zs#U|`??Vnv4PpNsF!1F|ZZ7#M^YL>S5vi!u}QO5$@9ONufp^zsTS85sn? zvOq-)AeA66`;GW3X@Qd`0*(c&oi{0JPRx?Lr3>;VXD(c~c)`4B@gYG|W{A&^pE8L{ zsNDOErXOpUc)E(%5~iP0)2z6j&9dU!%9X;b=f=j7&~)icz${U(&CZ#88fUwm=kR;| zS7*5Jwa__JPD7VrY3qeE&gJsEENwDgbzW8HSlW8=i}OyN4IdI+Cm(0MaCP&;4bvys zm3*BOFiYCkUh#jSW9F9+j0^!NKD2XRA727=GsuU+Kp&>3=H-`E7Np|vB1kC+P|J%7 zQ`4LlCqDYm$>8*C;b+d5&dV>pbuL%mWxL~xaJJpXf66I*Db#z~2F<)XaE_8Et0Cm(c4#OY}1<;wE1{(}`tUB}p zxj~lBz5h@$Bz!lv)S-H^%(qe4KVM4JD3`RQccPq+zx3D-7ph&SKM&8^T#xE0*T5z4 zzI^CNFXvqF`3sx^=fti@{UML?nA^GM0XKP!Q{a%tc+?;BNS*%r_gXc3InnXHtk=LL za5%R$ooTjPpPt8=)8sMUuRT~-w;uOwodRdKf0WbAYuwLo_Ry)Fd$9AO)pUU+^0Kxlk1(a^Gfmips#a7%Q6dwO1v74lX(sgagLdK|;4{7jEoW>O(t=-sJhmgZ-M+gbkdYD4gIm$Il1;IQ{je8eR(2n(PGQuXO z&kAGEVhN(aU5l5R^2BD)xW_#PyJ{+Bbp>7TSzLE4fyIF1_-W59ghiAU!7u|agkb%h zm3m~!LO=}U3yb5Ax6zoze&NTl2?zT2vD1MLryc08X@~vSUz~3HM^8ZW4QL*KJOudx zT!75n3=^C>q(1 zP5AE6AI5`UJ^XY(^w^0|;L9;DWjAYK(`yY+ov78-14=MC7_iMyX#UfV~34;=l@+3M0JL3_{Q z`rH!udLr;U>AP|GbXW`oJwR)x!m@V%^_$6_*g)(cV<6)o6Ce%{C&(F)NsuX!X^^uZFM-T}ybQbnHj?KR^F7R)uEp&z zw;;@$VJ%os=FKN(`@FHrVwX1W+X8=11@27(ZH6755N1@XJW33?#~VUbQaS1jqNXld z%}@0r%6b7AWd{u=tYO1s=vwez8#JW5#ybHbas@qJ&#mY1^}pbc=9yUjOtcKV(5|{3 zOuB*rhwIRJ9wE=HdtB!O^3VGG z8mCI2&{~QDt45QnH(?nf^O_FRh%uM&Jbdsj7z~FS#JgnrJ?dstmK1U0sLsH((d_6r zp1-BZLQOOvv{b#*Gt~GkNoZnds?_8}uF1jneZQo^c3G?7v+HZAON;)40We2V_{q-? zc5wtqU;Rv2aH$Sv0g{=YAHJ92Sf=t=Bw1@N%#*So$+@Qp>e2l|kOy#jKPgmTb zNbs2tM>3VqHH z=oC#?=wyZH5{V=BHdewJ)6P_`8fzGd7h-25+ZShbPDR8+ABu$F#)hb?PQUS={Ppv#Cbz~ z!)>WBkeeEpIkm7QCW*aSF6OJ?BUI|uYd4-v;vD)hg>-!HBs@Las5$~K#H*!{+4C@8 zOvDDULVZHHt&ue(yx4?31Bd#x(K1R|Cb)))X+;*)R7b*`N@m&;o9Y0kql5=q zSrN3H&PV16Usu+aS}V!QZoyyaga6QzNrbM$f8qZHwH4x< literal 0 HcmV?d00001 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..6da9e9cd4a8e235b6ec1086605868903df410051 GIT binary patch literal 198 zcmV;%06G63iwFoAmWgBn|8`+=X<=@3b1rmqb^t`r!4AS83Y^dCd469D^^b^3KL*(A?uF4E5&Mi zu$QG2*;2_RSvkxug!W}}VbZg{%%>Df6&E>X(b@&4S0RR6309QVUxN`vj0QIF? A)Bpeg literal 0 HcmV?d00001 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/expected/expected_data.npz b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/expected/expected_data.npz new file mode 100644 index 00000000..e69de29b diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/qc/input_c1_b1.tsv b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/qc/input_c1_b1.tsv new file mode 100644 index 00000000..0f863dc6 --- /dev/null +++ b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/qc/input_c1_b1.tsv @@ -0,0 +1,20 @@ +chr1 16103 T G +chr1 51479 T A +chr1 51898 C A +chr1 51928 G A +chr1 51954 G C +chr1 54490 G A +chr1 54669 C T +chr1 54708 G C +chr1 54716 C T +chr1 54725 T G +chr1 54727 T C +chr1 54753 T G +chr1 55299 C T +chr1 55326 T C +chr1 55330 G A +chr1 55351 T A +chr1 55365 A G +chr1 55367 G A +chr1 55385 A G +chr1 55388 C T diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/samples_chr.csv b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/samples_chr.csv new file mode 100644 index 00000000..ea0a0082 --- /dev/null +++ b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/samples_chr.csv @@ -0,0 +1,11 @@ +100096 +100097 +100099 +100100 +100101 +100102 +100103 +100104 +100105 +100106 +100107 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz new file mode 100644 index 00000000..e69de29b diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.parquet b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.parquet new file mode 100644 index 0000000000000000000000000000000000000000..df779fb34a6a6da307133e7fc4234f7e1f7631fb GIT binary patch literal 4039 zcmcgvO>7%Q6dwO1v74lX(sgagLdK|;4{7jEoW>O(t=-sJhmgZ-M+gbkdYD4gIm$Il1;IQ{je8eR(2n(PGQuXO z&kAGEVhN(aU5l5R^2BD)xW_#PyJ{+Bbp>7TSzLE4fyIF1_-W59ghiAU!7u|agkb%h zm3m~!LO=}U3yb5Ax6zoze&NTl2?zT2vD1MLryc08X@~vSUz~3HM^8ZW4QL*KJOudx zT!75n3=^C>q(1 zP5AE6AI5`UJ^XY(^w^0|;L9;DWjAYK(`yY+ov78-14=MC7_iMyX#UfV~34;=l@+3M0JL3_{Q z`rH!udLr;U>AP|GbXW`oJwR)x!m@V%^_$6_*g)(cV<6)o6Ce%{C&(F)NsuX!X^^uZFM-T}ybQbnHj?KR^F7R)uEp&z zw;;@$VJ%os=FKN(`@FHrVwX1W+X8=11@27(ZH6755N1@XJW33?#~VUbQaS1jqNXld z%}@0r%6b7AWd{u=tYO1s=vwez8#JW5#ybHbas@qJ&#mY1^}pbc=9yUjOtcKV(5|{3 zOuB*rhwIRJ9wE=HdtB!O^3VGG z8mCI2&{~QDt45QnH(?nf^O_FRh%uM&Jbdsj7z~FS#JgnrJ?dstmK1U0sLsH((d_6r zp1-BZLQOOvv{b#*Gt~GkNoZnds?_8}uF1jneZQo^c3G?7v+HZAON;)40We2V_{q-? zc5wtqU;Rv2aH$Sv0g{=YAHJ92Sf=t=Bw1@N%#*So$+@Qp>e2l|kOy#jKPgmTb zNbs2tM>3VqHH z=oC#?=wyZH5{V=BHdewJ)6P_`8fzGd7h-25+ZShbPDR8+ABu$F#)hb?PQUS={Ppv#Cbz~ z!)>WBkeeEpIkm7QCW*aSF6OJ?BUI|uYd4-v;vD)hg>-!HBs@Las5$~K#H*!{+4C@8 zOvDDULVZHHt&ue(yx4?31Bd#x(K1R|Cb)))X+;*)R7b*`N@m&;o9Y0kql5=q zSrN3H&PV16Usu+aS}V!QZoyyaga6QzNrbM$f8qZHwH4x< literal 0 HcmV?d00001 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.tsv.gz new file mode 100644 index 00000000..e69de29b diff --git a/tests/preprocessing/test_preprocess.py b/tests/preprocessing/test_preprocess.py index a8fc0415..cd5828b4 100644 --- a/tests/preprocessing/test_preprocess.py +++ b/tests/preprocessing/test_preprocess.py @@ -23,7 +23,7 @@ def load_h5_archive(h5_path): @pytest.mark.parametrize( - "test_data_name_dir, extra_cli_params, genotype_file_name", + "test_data_name_dir, extra_cli_params, genotype_file_name, should_fail", [ ( "no_filters_minimal", @@ -32,6 +32,7 @@ def load_h5_archive(h5_path): "1", ], "genotypes_chr1.h5", + False, ), ( "no_filters_minimal_str_samples", @@ -40,6 +41,7 @@ def load_h5_archive(h5_path): "1", ], "genotypes_chr1.h5", + False, ), ( "filter_variants_minimal", @@ -50,6 +52,18 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_variants_minimal/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, + ), + ( + "filter_variants_all", + [ + "--chromosomes", + "1", + "--exclude-variants", + f"{(tests_data_dir / 'process_sparse_gt/filter_variants_all/input/qc').as_posix()}", + ], + "genotypes_chr1.h5", + True, ), ( "filter_variants_multiple", @@ -60,6 +74,7 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_variants_multiple/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, ), ( "filter_samples_minimal", @@ -70,6 +85,18 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_samples_minimal/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, + ), + ( + "filter_samples_all", + [ + "--chromosomes", + "1", + "--exclude-samples", + f"{(tests_data_dir / 'process_sparse_gt/filter_samples_all/input/qc').as_posix()}", + ], + "genotypes_chr1.h5", + True, ), ( "filter_calls_minimal", @@ -80,6 +107,7 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_calls_minimal/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, ), ( "filter_calls_vars_samples_minimal", @@ -94,11 +122,12 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_calls_vars_samples_minimal/input/qc/variants/').as_posix()}", ], "genotypes_chr1.h5", + False, ), ], ) def test_process_sparse_gt_file( - test_data_name_dir, extra_cli_params, genotype_file_name, tmp_path + test_data_name_dir, extra_cli_params, genotype_file_name, should_fail, tmp_path ): cli_runner = CliRunner() @@ -127,7 +156,14 @@ def test_process_sparse_gt_file( out_file_base.as_posix(), ] - result = cli_runner.invoke(preprocess_cli, cli_parameters, catch_exceptions=False) + result = cli_runner.invoke(preprocess_cli, cli_parameters, catch_exceptions=True) + + if should_fail: + assert isinstance(result.exception, ValueError) + return + else: + assert result.exception is None + assert result.exit_code == 0 h5_file = out_file_base.as_posix().replace("genotypes", genotype_file_name)