From 7e7ff8f868ad1bc5cb5274bce97cab9657153dd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=BCck?= Date: Wed, 8 May 2024 10:51:45 +0200 Subject: [PATCH] added test for calculate_allele frequencies --- deeprvat/annotations/annotations.py | 9 ++- tests/annotations/test_annotations.py | 68 ++++++++++++++++++ .../expected/af_df.parquet | Bin 0 -> 2313 bytes .../input/genotypes.h5 | Bin 0 -> 6924 bytes .../input/variants.parquet | Bin 0 -> 3522 bytes 5 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 tests/annotations/test_data/calculate_allele_frequency/calculate_allele_frequency_small/expected/af_df.parquet create mode 100644 tests/annotations/test_data/calculate_allele_frequency/calculate_allele_frequency_small/input/genotypes.h5 create mode 100644 tests/annotations/test_data/calculate_allele_frequency/calculate_allele_frequency_small/input/variants.parquet diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index 72eba673..e31016cc 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -1738,7 +1738,14 @@ def process_vep( ) if "#Uploaded_variation" in vep_file.columns: vep_file = vep_file.merge(vcf_df, on="#Uploaded_variation", how = 'left') - vep_file.loc[vep_file.chrom.isna(),['chrom','pos','ref','alt']]=vep_file[vep_file['chrom'].isna()]['#Uploaded_variation'].str.replace("_", ":").str.replace("/", ":").str.split(':', expand=True).values + if vep_file.chrom.isna().sum()>0: + vep_file.loc[vep_file.chrom.isna(),['chrom','pos','ref','alt']]=vep_file[vep_file['chrom'].isna()]['#Uploaded_variation'].str.replace("_", ":").str.replace("/", ":").str.split(':', expand=True).values + assert vep_file.chrom.isna().sum() == 0 + assert vep_file.pos.isna().sum() == 0 + assert vep_file.ref.isna().sum() == 0 + assert vep_file.alt.isna().sum() == 0 + + if "pos" in vep_file.columns: vep_file["pos"] = vep_file["pos"].astype(int) diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py index 83ae128e..962ca10d 100644 --- a/tests/annotations/test_annotations.py +++ b/tests/annotations/test_annotations.py @@ -372,6 +372,74 @@ def test_merge_absplice_scores( assert written_results.shape == expected_results.shape assert_frame_equal(written_results, expected_results, check_exact = False) +@pytest.mark.parametrize( + "test_data_name_dir, genotype_file, variant_file, expected", + [ + ( "calculate_allele_frequency_small", + "genotypes.h5", + "variants.parquet", + "af_df.parquet", + ), + ] +) +def test_calculate_allele_frequencies( + test_data_name_dir, genotype_file, variant_file, expected, tmp_path +): + current_test_data_dir = tests_data_dir / 'calculate_allele_frequency' / test_data_name_dir + genotype_filepath = current_test_data_dir / 'input' / genotype_file + variant_filepath = current_test_data_dir / 'input' /variant_file + expected_path = current_test_data_dir / 'expected' / expected + output_path = tmp_path / 'out.parquet' + cli_runner = CliRunner() + cli_parameters = [ + 'get-af-from-gt', + genotype_filepath.as_posix(), + variant_filepath.as_posix(), + output_path.as_posix(), + ] + result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) + assert result.exit_code == 0 + written_results = pd.read_parquet(output_path) + expected_results = pd.read_parquet(expected_path) + assert written_results.shape == expected_results.shape + assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False) + + + + +# @pytest.mark.parametrize( +# "test_data_name_dir, input_file_1, input_file_2, parameter1, expected", +# [ +# ( "test_name_dir", +# "input_file1.parquet", +# "input_file2.parquet", +# "8", +# "expected.parquet", +# ), +# ] +# ) +# def template( +# test_data_name_dir, input_file_1, input_file_2, parameter1, expected, tmp_path +# ): +# current_test_data_dir = tests_data_dir / 'test_name' / test_data_name_dir +# input_path1 = current_test_data_dir / 'input' / input_file_1 +# input_path2 = current_test_data_dir / 'input' /input_file_2 +# expected_path = current_test_data_dir / 'expected' / expected +# output_path = tmp_path / 'out.parquet' +# cli_runner = CliRunner() +# cli_parameters = [ +# 'function-name', +# input_path1.as_posix(), +# input_path2.as_posix(), +# output_path.as_posix(), +# parameter1, +# ] +# result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) +# assert result.exit_code == 0 +# written_results = pd.read_parquet(output_path) +# expected_results = pd.read_parquet(expected_path) +# assert written_results.shape == expected_results.shape +# assert_frame_equal(written_results, expected_results[written_results.columns], check_exact = False) # @pytest.mark.parametrize( # "test_name_dir, input_file_1, input_file_2, parameter1, expected", diff --git a/tests/annotations/test_data/calculate_allele_frequency/calculate_allele_frequency_small/expected/af_df.parquet b/tests/annotations/test_data/calculate_allele_frequency/calculate_allele_frequency_small/expected/af_df.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d453427a39e9a9ec5f384163b1b70ff646ff7825 GIT binary patch literal 2313 zcmcIm-EQJW6dnknC~72%)(E6rAZv?yK~jD~T2_%N5Sa~w51xlk& zp0ENy@-zz9R3x^WEMLxXeE#R?0Ts(ulPI3eM0J&lrc>!(qv;=``R^k6#Q%ZM7C;!H zq6jWYxFH04;ChD5pfh<2CNe%E9VIYFT;KTHazUkF5Z+_dL`R+5Ib$|7G`p z+FPlRMnuJLk@WA8{7*J8e3jp-j^=~O^90s*9`S~ z+ZNKg_7*lyNBfqp%+!#9Z`zBsVrX zI0ttE0(Wu&$i#F&3=G! zI)jpm{ljw2RfUp2D2KVl!8)fu6iNn;m94%jvpAnAp!`JMCI{TPH-k#Asx&bt zMpuQZ4fr?WYaZtYj-^2B*NWPpPH=g%rb}jjs65d(E@Uu|9ghU0Fap=Ry5eyck^)x;oY;nuZ}adn4K8Jm7HxJZv${x2*s9NNpLx z$|qah@_Fcgg0JDLFNejj_9a7Ya`60mlXZ<+MHlnl$TbmlqRo_qzAJLIbxnDbGYO7W zhvx~;th8Ad(Ur?w@q8L;fckq%%h=>;%FWv3zqu@*Z>4Pj$GM`1HI*uj*`CW;-(5m zgC$AotA1SBajVsB5w3s-4F=}}+=t5nh^v7zIa_kR^)>HMvF$_hVaY8&9J%n5k}Vt{ O1V7Y)KQtHcC+Ke`jYA3m literal 0 HcmV?d00001 diff --git a/tests/annotations/test_data/calculate_allele_frequency/calculate_allele_frequency_small/input/genotypes.h5 b/tests/annotations/test_data/calculate_allele_frequency/calculate_allele_frequency_small/input/genotypes.h5 new file mode 100644 index 0000000000000000000000000000000000000000..614d7a383cfd5d1446abefd710eae16cc58c66dc GIT binary patch literal 6924 zcmeHMOKuZE5bd^OFd>12f5JyzX4y0TOKcD^WPu1GEZBh;RuqYZD6vr3WeGRoC@eXG zkH8JWysj#{yW)5iDU#`mnCkbcrf2%Orsl)Xt<6`$%Gk+mX@TE#!gtoqD~sd`*05nbi^%C-o28_B5=G;*adykfWWc6-yB zvZ=}0U7LMEyeLyE&(4ERu#*b;dEcI|s|~+oTcV#|>PgC5<8o11$XqD|rq)>V`%mhN zCYsdOXw4^>zvRZ-)A8sk2bZY_DM*!fu^xZcX}!@v9 z{E5O36-K8F^HG2y#t#r;^nehrDtt%by9(b^_`bpq6h@td$HTQ2VzjIf<2nm5T2+YA z3PXHdVZ_2XWDhYO!a{sg;U$Ha6~3kL?E=4kx&6jp58Pns+MweUVYB04nUg+)a?bH>zKN)O*WqTW`3Ao>C_ZbWcfjJ~W_9!&Sj@Yw-2Y-e%PHHK zn*4A=B%c&W3M2)R0!e|SKvEznkQ7J?BnAF61@OKv|Cch$<2@X4$b)w*$;ErSjHh{g m9{`W<0n_66t}uYiJZfOn5~$^IA#n@WzA+7JTpw#ag;-) z^j4`=FKu(|DTn@qUYb+X_Rv#R^<1e{Px%9t`n@-70UD)B(kIVbzI~UujY|>E&IPzj z1vt`|={gMs#4Tf<%V?y6B ze$M5KyOGo9bpSzA=WL!0&lim65fjRW*m-u}7E#RE5NvKZ5Q6vT4EK;JfJS0b)+PP% z2r&$cKmL(fb)&yOZLczGuD1(CmSJ7>-j{Yb+`V@l49kA@NzM`2ExdDW4Fn+y1&*cOUo0@7GDEgIWW!`jfk$4>KlKQ5!rbvP7;vsF%U+v9{+2JbWPK5 zQ)TMMQ=8{2Tj++zVWij_@apZNjrMjXE+&bc1mu;g?x{L~b;*T95 zr0y&n$?6zGXjdS%<2RuM6Om`b=l9)x4bAz6YyXwX^_40t^hUGldCGb=LagU8dl5m> zy-rgwWTEZ-+HpUq8nQAtk(ydpH`>ALfe%003Iwg^3B@UMQ2M+dGz47%@*o|uLEt%6 z6-_=_09&d~Izy9^cMVl)$_imHy1F@>Aev%BkvasfD;+@=Is!;Hx|;T2D=^gtWw)qn z1f0{P8l9`LMLiSni_}0(FRB><=hVESL(naYnnu9at4Wm?6P zT8hwVwjgEqgPJO~gw~K0hdC5lt!AGn?kTOd+B66>64?%Khl7yV;7Ii>ayz~q2|hqx z7cP}<@z=n`5eAafO`{s5vQy->{JTH z=P>SL;DeJ2fs-=9UV$&!uki7+%JDwN`7_{8Ud!oPx0)%(j-$n%m@RiiV_&_cCdK2! zel4euU|PZy^^&G!%ICR8v&Kil?OZC~5Tk>L$j2?%PFofAbfrx~UR*sVv-Y9nkzSyf~j?_X5tD;fvNLlSv z&T9RU{9Z*jwaU(ch3CuR3SX+rnLT2ASLO%pQ?Oqd=BGN1WKSCYi#=1^C_cqmU8ZOp zM2jiP3$i19r95LR#819i2~%J6h$d&!5Z_Xr_z8Bm#Vo`s`&?{>RlY*)uIid5W=f|s zahu?zOd9f%K0n?m#U+TtEfuRWA2%z}#2LhZ#C3@+302U`4cW4_Qw2LfMz}(9l8Hye z%pg%nk(|ftB5%p8@Pm=PfTm(KIE;dHBuvjaz z`CA&3QLRJl(_wKZM`{RSG%zbhzAhPOH1<$`aL@N-tA?W0Jj` z9`s3AfFoKTicuOW%3|Sf+(6!dDYWi&mChB(G_TNs9T*MdfHkUpci|tu**~-izie*9 HueW~yD?CdN literal 0 HcmV?d00001