From ab2d25ca2fcceda2712e563a93a0e1599a338c51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=BCck?= <m991k@b260-pc003.inet.dkfz-heidelberg.de>
Date: Tue, 10 Dec 2024 15:38:52 +0100
Subject: [PATCH 1/4] with these changes, a copy of the annotations.parquet
 file is saved containing NA values before these are filled

---
 deeprvat/annotations/annotations.py           |   5 +-
 pipelines/annotations.snakefile               |   2 +
 tests/annotations/test_annotations.py         |  47 ++++++++++++++++++
 .../expected/expected_unfilled.parquet        | Bin 0 -> 27377 bytes
 4 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet

diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
index 56e88eac..83c76e7d 100644
--- a/deeprvat/annotations/annotations.py
+++ b/deeprvat/annotations/annotations.py
@@ -2029,8 +2029,9 @@ def create_gene_id_file(gtf_filepath: str, out_file: str):
 @click.argument("annotation_columns_yaml_file", type=click.Path(exists=True))
 @click.argument("annotations_path", type=click.Path(exists=True))
 @click.argument("out_file", type=click.Path())
+@click.option("--keep_unfilled", type=click.Path(), default=None)
 def select_rename_fill_annotations(
-    annotation_columns_yaml_file: str, annotations_path: str, out_file: str
+    annotation_columns_yaml_file: str, annotations_path: str, out_file: str, keep_unfilled: str
 ):
     """
     Select, rename, and fill missing values in annotation columns based on a YAML configuration file.
@@ -2039,6 +2040,7 @@ def select_rename_fill_annotations(
     - annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings.
     - annotations_path (str): Path to the annotations file.
     - out_file (str): Path to save the modified annotations file.
+    - wether to keep annotations data frame containing NA values before filling them
     """
 
     logger.info(
@@ -2052,6 +2054,7 @@ def select_rename_fill_annotations(
         annotations_path, columns=list(set(prior_names + key_cols))
     )
     anno_df.rename(columns=column_name_mapping, inplace=True)
+    if (keep_unfilled is not None): anno_df.to_parquet(keep_unfilled)
     anno_df.fillna(fill_value_mapping, inplace=True)
     anno_df.to_parquet(out_file)
 
diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
index 82bb4722..e4e1db9b 100644
--- a/pipelines/annotations.snakefile
+++ b/pipelines/annotations.snakefile
@@ -587,6 +587,7 @@ rule select_rename_fill_columns:
     params: 
         annotations_in=rules.compute_plof_column.params.annotations_out,
         annotations_out = anno_dir / "annotations.parquet",
+        unfilled = anno_dir / "unfilled_annotations.parquet"
     resources:
         mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
@@ -597,6 +598,7 @@ rule select_rename_fill_columns:
                 "{input.yaml_file}",
                 "{params.annotations_in}",
                 "{params.annotations_out}",
+                "--keep_unfilled {params.unfilled}"
             ]
         ) +" && touch {output.chckpt}"
 
diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py
index 58b2957c..eb9d1d03 100644
--- a/tests/annotations/test_annotations.py
+++ b/tests/annotations/test_annotations.py
@@ -735,6 +735,53 @@ def test_select_rename_fill_annotations(
         written_results, expected_results[written_results.columns], check_exact=False
     )
 
+@pytest.mark.parametrize(
+    "test_data_name_dir, yaml_file, annotations, expected, expected_unfilled",
+    [
+        (
+            "select_rename_fill_columns_small",
+            "annotation_colnames_filling_values.yaml",
+            "annotations.parquet",
+            "expected.parquet",
+            "expected_unfilled.parquet",
+        ),
+    ],
+)
+def test_select_rename_fill_annotations_unfilled(
+    test_data_name_dir, yaml_file, annotations, expected, expected_unfilled, tmp_path
+):
+    current_test_data_dir = (
+        tests_data_dir / "select_rename_fill_columns" / test_data_name_dir
+    )
+    yaml_file_path = current_test_data_dir / "input" / yaml_file
+    annotations_path = current_test_data_dir / "input" / annotations
+    expected_path = current_test_data_dir / "expected" / expected
+    expected_unfilled_path = current_test_data_dir / "expected" / expected_unfilled
+    output_path = tmp_path / "out.parquet"
+    unfilled_path = tmp_path / "unfilled.parquet"
+    cli_runner = CliRunner()
+    cli_parameters = [
+        "select-rename-fill-annotations",
+        yaml_file_path.as_posix(),
+        annotations_path.as_posix(),
+        output_path.as_posix(),
+        "--keep_unfilled",
+        unfilled_path
+
+    ]
+    result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
+    assert result.exit_code == 0
+    written_results = pd.read_parquet(output_path)
+    expected_results = pd.read_parquet(expected_path)
+    written_unfilled = pd.read_parquet(unfilled_path)
+    expected_unfilled = pd.read_parquet(expected_unfilled_path)
+    assert written_results.shape == expected_results.shape
+    assert_frame_equal(
+        written_results, expected_results[written_results.columns], check_exact=False
+    )
+    assert written_unfilled.shape == expected_unfilled.shape
+    assert_frame_equal(written_unfilled, expected_unfilled[written_unfilled.columns],check_exact=False)
+
 
 @pytest.mark.parametrize(
     "test_data_name_dir, annotations_in, expected",
diff --git a/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet b/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..3a8319333db41bae2ad1ee940f1c10d4e5eba39a
GIT binary patch
literal 27377
zcmc&d4R}-KxoIsWAh5znO&KB-Idg(ENn2<flAioFp-tPgP15+&<+n|HlEx<KAF?sT
ztK!DQsW2aP>gM!v@4Sl63vBabx_KE_om1wcKK{Ti*PlB!QJ8-q-uF8{$vNjcX)Cm8
z9-AiL_n!Cfd%yR6zxVwrEu>4UDpB29q`I@HQMEv&SfEfe9Jk#_zVX?!-&>gZ>*kg}
zJ-29S{XZYuHhlePRlP2)*?6IOdFI}CUVmf9kB`&U4G$gmRLse^X8!QuFKU-&zFqtL
z!F})5)bHrovh|Ezn^7rLOH~)IQY}%HsY+F)i`7aVLapRs$`zN$j4Bk0vUSDqzNDyR
zhGJQ@DLizOl6a$dv1)cjZJDC9LOsVE4ky|yWLJx+t6EiDzNGwMarvQQ;;AA+`Bey2
ztO5lnRK*H-D}!GOg^WV#uHSgYt;b$AytnZE^Jm^-s8`;Te)+w<+RU979UXS;{M68+
zomuIAXleaj%@4(Hne(OL#TBd5!OE)4!%bH&tDC(jQ#^Wk)gRxUC!-J^qgu)1L<+%2
z<rQ+KR9+!EfkF<?Aa<94MsS>YD9LK1Otr94Rm>QOaxpN&yg+3OROQ0;<!97N?$c&O
z1RsTwSxedVEn1==w$IMVvBcCLPlm5ehvUI;S7g8+3nzP`kyO{9e<13Qrx?lh7L{j;
zh%-v!1+>;U`aDEsC6gLIUHtd@J74=I{k8WVe9#;I=&6ol8?XCv$)&XC6Y}FB!wvKa
z@>a9{eXZd;mo1)QF4fSxK77jb#KArEzGd5g_vLp|^lnG&<L_B(47l<0GioLO4B`_X
z1$ktQ|CM<;9bJfw-xW=SyAo77*|nj;>@v1#yMk0S(bZQgSp2<ni7j(LML3Z>q-9k!
zwTpk#f}CI%3l~2z(4PuN<6V9#6&{GjyJbw^(=y^f**Tj4IH^YD1;G22t>dj}KJ(gv
z)^VXqEi`g|CTn?&k>jI)Y4*jOk<U2&J>qp$PSEq3NGrQeMddFQ5eF6$H!TD?a3pA*
z@KJ^AI)N!)m389^#oQuA(OjjX7=F%!pAz_)0Y9a4mFs45Qu2ETH&%WoE2IK(f{)4@
z>!THmRVAFKE{=v6Ne>j4?<yv4Qxm%`07-F7dGP0xR1~r1%vt|T?7=!ircZOjJIhb5
z@3sBQ;qa+t^eZ#AR=VDNgm(0AzWbV%7FvD$^BJp7{%!rf*WWsQV#OVXcR#K!xnxl_
zU3tsuuWJmgG_D68Qmy21AwKa@c|Bw<%q{t1@uqEXx|}WMu7rO8ZCk;fePA(hY!PS(
zr;&%UtcIp`&mLczlS?Jsv&mF{qKoQJ3jOrqi-<=rI_FI~wF2FCaZaEMg#tz6ObqIZ
zLQon=Mf>B-O0fZG%W~qS%5xS>8i00ux+8e9t0vRF==42JzpkyXYu%<vzBoJc;{6-H
zwbMjoPQJ2Ill-7I6IgembLAZ?G8_Nv-<q%6u&#dV{OAt@r{~oFXjbskqPJJp<F?F0
zs+BwrWKsC2ye+?aNlrz=>`N=K-TmJrwi2Kb9A_R%vbHPLNO1d`-~F3vJ$>QN|7dPr
zzR+O3{pPo$A6V%X`(9Q)bN}^*A3oST{7LW4bj?M7xuNSn-G=1M+xE^_;HKYwz2`fZ
zKHpC7`q~Qz?p{yJ>~0<dgk%l|@rjShYvlB*oJK0dySsIRy$eO-o^D|-_SQ<`;Y&e1
zIDtH5C#0TjA9>TxyMpw89G-b<^YcOa3;i8GUwrspL-)!XO+QOtWk@tcemd{WV|4I2
z?Ti!Ot)`#6<F!+Zzjbo`FHcnLG{1B`eXXzI?|Xi7rHp!bOll>M1M!KE%ByF;Hm4qm
z^J^zvW@C$t^SfI^yt5iKgwx1Fc|saGRyuHGXY!=s|F*6;`m1<Z=FRrj&u_Tn(#+@Y
z&VP89<>LCwEBoH|UcD^ybaCaYhF^V2*Zt=GXOCQ3p1J?pKd&9QL0kXg&V#G|@W=w0
zEyY8ql{`+wCq62#p)<8P4GFiDFoio@L+q{wjo>)*P?FWi)OyeD`kWl)LhqSO`3F+c
zcyMMd@xoe=2z0`{vY+xD7Gx!oP1=e$lz7s{&t!=)`3yb^2y#hViGuk3x?Co+SSVU7
znoNcvD2TGsgG@HJ=i9_bmxF*fcC=*psC+gzwM%x=kXtg5TztR_g{iP4-W^>}JhA?q
z7fvMC#)XgAj*M=V$+Gzw2qqj%`C<rG+~?dVvtZ}J`-~DrQL%#fg()Z4>?ArlCWHM0
zVJ1lJFDie!h`8NI93?>v99JF+vi2tugEKo+M3|ft@g8eViseGHOD3o&9E|<JU^tQL
z9}rm|o0U9ZAwDvn^D0SW@XdEPHg68ErjPyX$?tzw9j1>z^4d?gY>Cmo`bMH?&)PMH
z)o<(@`OxpDm9`tLTaE|lZNHeaJhNyXeeB!z@AtK@p$!+t&!?jyw9t~85I>_<@;K2h
z!AC*IGRNYr8*+-0iNSb!{f?b@v;owE6UakWcJ=QoQVlf1!4RDpTNH|-BISbJ^+hw4
z3iBhglqD^X&QdCFoU1JP=-+3RmHh27`1S>StJ^cH?2;po*E81g&&N?XsZ<fdqa{DX
zc`3kIKAWqb0DNu~;O5seM*q|WB{Rx&?Ss2uh`6^or;M51;dr=<JA1ueT>fY=ai)p*
zM<Z_;B7AwM$ciqtjE8PMaVGfR^|bNtmSy|De~%&f*6pQ+<IVJK)zw=^b-$#Sq~d#i
zKjTOAle0?x`P5%F8$P(YijEw($6#+eaejE`HT02FmGO1P8)Ynmhg2(hTxf~#Q4n3m
zGWNFRlp#q!ERCeQQ7e;v+~p#Uwt$9k8hI$oYG`U*!+Y+WTo;I313G(k`=jwNd*Ei>
z!K)p_!yWvhgQl_p@GC`I>7rNt__O2to^7D1@Ul&(!|Uh)`je-9b(`p&UA8l-*GlPw
z$^H776@J5#L#L}B8NQm{e%Jd~RLptOP+DgF&5qgE80Iv7{#(nuT{4T#L#mZLPNaH#
zRDRJPxH6}lZ-`U!CZkh!&A0l97rdY#oJ1bNvTH6?(E59?+IR4<DpNM=Pk*V1ziarv
ze>|Xh{m0Ae58N4h{{4R}$ZWIy@7?e1ydbmUf`_W!`n9V5@n>Gzp}AF)+4B1{hfmyp
zir)3w(Fc$8>ts?+enze2pFw=$qw)$m+Lcp~Fy*WfoZ_CnhPd@=&<Kt*4<%WRh;~7s
z%&ULmz&kQ=o}W=G`A;Yc^Ph90AkOTI1#$l4Ku(Z(iGFG%(G!kivHqx^c)|~&;P~=T
z8DA7&gn3O`tlt*NNwQS5q1jj;>W|9?`p-ke!O%JHY&?&?>6!W?FLnOU^Xo4{)+Z@`
zGNJ>7TFJv8o4`k5B$r1+@$*nLCuAuUKf9WYti>@I^V2=VmL6V|iC~vqCR`MTFZ6aC
zSzftn5f*G=M$RJ&N8$1FFX8IAYQ=d)#4#!-%QBe-8whtpp-7aPJy%qIsEByAkJ#16
zi#ZX>vSLbI4RzfI$xl8kt52W0D!jdTe*FV;_ogd$UsNBOv;Fbz-+rt9C!byKIP_N{
z^Y!x%c`HBudS>C0FX}g)_>#Ud^VZAG0~MLo_Ypr&@0y<h4M|cn9)nuR<3NjzkAj9|
z0`UH8b7~U0nl%Dfb9aJxCk`6Haps{kA&r<l&y}wD14+O9TEpOdBbONdxSKkq+qsth
zDgCdT@A>lxeb<xEUs3zvcMXpna^JEm+F>9@?*GH@es?>4|C{&x;Hocfpg%lra(FY3
z(zr(WC)G+G2JwlH%4_7UbWS5O$tvEz$)t#FNl*_?AP-qt^~mlEzNvt3aehX;FYr;2
zQMASH8p%mlENXAuG)x>A0x@tLNDO>b{y==ZNc96~Q!inX?%}ce8;Y6G=TO4WEclrX
zKj*{G9O!$@1)kAs9{WpyFEBro6`rXR;G=*bXYxz}zW<GR#C@A`)q?q5P&5vOhr6io
zV3_LCbagRyzxzDsR1;gSBVHK=jo^rpM(|NtjZ~bjtW~H>lfj;F%&#&kRI^tUiyPzk
zkd7T1r_0I$jooHsVJ6hEV?j~R3%cvTBeaCrAxT~??uoD<^6dq7ZtO}u3nTPAT6Rcg
z>v6SOc3#4jxLAo}t!0Pu6uwxAMI{z#>4>8$j3m-33=1NtXRrekDGjiYGBR^#V|JcD
zF0$Vw8W5kCSde^24|aB}+?$0Eu1gI&BuV4g!ErMp7L3a&*e~M;Wfo4z!y0x-Mo*ku
zX6M1Nu`7yiZ$<f%ip3SDmlf!q3Q?fjAJca`gP5*655zRzNtoiR0>qTI=ZR@*dJ~0i
zN)yx7G$snviY2C_!AVS0+mI+wGma>DvuwJ0Sj05-m<rPZBBr-RLrh2GqW}#CVw%eK
zMB&To#Pk%ai2{{?iGo#ViD|5$PJLl>TFZ?@fr@y<bd<)3DXmx)v=~&r*Id4YP?m4A
zEL1?PNL`xn$3y<4`Y~c-C0m6G_EYItJXv}9(km;m%2egmtCm)>5Ny#Z%m8kzjKlqV
zcqE>tD1eAW!&InCJjN30N{u8CU}ZWU4fcn^l>i-2#}XsiiT*%uIG94?v2e;C@~8X&
zzOfRn`S*w5ayNRmquqL0UFD`tIcCIMRHB9+|2H29lZYQ|R>E0?>i4JC)Zm;6CIqY@
zkvFXZ4moj!=yhWfXJ#jvxUe>a|4{_a&iL??%%Hf}D*1~9oWF6wCz(B=KJxut0nX(_
zpPFQ5p-zhP!VOL`sb~nN#EwY_MdPx2Cp$de&dL3?inz#&lNXx?4lQ9!uzST0M=&uK
zbHX*w-d@MK$_qAy?C?e4?A`OR*^&jHWcCCeSc`Kbot*;WIXe<uv6kl~FX)uACGHlC
z#~BVi1<Y|FvsRuN;p`+67jGK=DlpDQez+-QNZN%Nn=LtL33FLrDE33y;fZ=7ejoEU
zaK~m$4qC#Pz!QpD6HJW7T<#Vv&Wd<$k_n6KLjK|_&d~%Pnq*eRQ49GduDymcI40;M
zv&CHk#hDOHOfpHac;eO}&c}GrlgymRICA$TalXa?oMe{78W7!5#Mv4LcnVnK!UJ<|
z%6r~oW+$1r_)Z_L3VeQ&84#I(%vC&`rHMQ-$xO0W>2OwZ<C9EKMn>^{H=LIVJu=B`
zqK(3t?i!pU>Fg8`7iXW`9Rhhy@`6q>TXM%rc5HG9$yJMFYEDqVk<iViO5!vnJ1k97
zm_pG&*bjdQ!xWRKXkfrUFoJ4Hi4p(6K>rX@_+WS-3HLG4s8+jLvszOLbse6xaB_9+
z>gvi(3iTY)<!bg^&fcq1?{!q6*Ge)#l4L9Vpw|YHtRaovq?IJQNmDm7W;BZChtQxU
zJ8$c5XTQTcgN2#5ljINy&%tj73H)Xrg7;4NF%ETlYWq69O(bcy!1Knn_867++1xc=
zopUhI;7$eNP0{A4F<__k5kJ8CygiYC&8_o!h9gZ@eZ)>_l6I4$H=rBV20XPq3>6%)
zud!QGfUmYE;ORg((jm`qve86FeKCp(*j$l7%$oAG0i3N?8?X%_JRCpHNT`7tV&+T^
zZ6HPgeivnr4s`=QZNO`+^;WxTgEoYhj0SX$YkeN4hMm_^A)6Isa`o7I`@6#<4k{3H
z4*F~$7vjm}aJwy?sb-TyA90I#A4Tw<>SQ#_m&sxAdE7%C8n@A_HJZJnO;K6?^S}jd
zR?ww&RLH9@HaytrZPhtVb|f$8L^~9A>osAokwWzCF<&yEvud2ZtqiSz$F1q~xOzf1
zOIC-Gwjm@}y3?a&pXqZ^KCiLe)nScvSe%ijD1$rZ_qdZjtA0E<Ii1G>HhoXX+{x(_
z*SSQFln?X`vT8cB>p84%G$DK9*Po2G>h$TL&OIFRxYM0Fcf=QWfNi)({WdBVvg&o6
zUPl7#k)!ELfQ|Keb!Yi}+}IO{xq5=yF?1Vur-BWX(AFe!i)|?ocP9f4U^5LeHWCAS
zb?b*<&y0FCDZ9(+Fm;StBb~b8M9``qaJG%PCT)ILe{Z!h8S*+KZJHiSN1vm;$*u1V
z*r*||E~#Z~yKpj@1G?H#pVzAOd1`x+T)~)L<MU8yug;nb+1w-G3*z2tUr#Vb^#tOq
ztz!Gy<Txulh1)k;qbu)T$<7sQfA~Fl^_cLn{z8tI1n7SZe?p~m`}l0_xu?0KueQ}?
z)epfQWo(l5wfG)&+1ymn>PQ4_?nux!tcCq&Z>Jp19TpdSw~jFTA-^n={Wmp!Cs^$o
zggu!IRP!+dmqY5SuxF-kFD3#p-w5*kULA$}=}5>%4F>YsF~sq>lL~s>C=N2uaDErX
zYS`lrjo+hB7udf@;;LnWag~KDoPS{As>nBYbfg?!)~EOTM(tzsEsP0TYh#;Rp9DW2
z@!KFfLb*)gY>3sjh#!}+qt9jPIJ-UF(du?sS}EhS?P=D}IIXR2eQT?>j*X8od>_lg
z6;AKc_%{l<=?LUHU)A^Y1mln|+w`NMY4)7~4>bxt2l<^yC}xE`z?$ZGlF2uvzDJt~
zRy!%=kEFSS+@1#il*@zhTum7F&MN-|{IMW9gShp5kZZ!Z;q3e<oR^Z2|0V*u8pa>U
z$s);@CnxWZI}<*ziy)lUg0YT#zG&3%@oBu(&fY+^gFQn{qy0lpkU+VAr^nd~XBG|c
z^wsUC+BpPyG0IDOI&~1+8X%rilJi61WR&r(jITeNT*=j@p{C8J`e0v!PWfCD-_NlS
z<PdJi8`wOxV0O-)Z$)v~)iG>z_YDohnF-DyPKc{yKKsYlN8@g4b^Ds&d?j9^!r6>W
zT+qWgFVq7*iQ;m7T>pI%e+v0)pxPMm>8y|wy8EU*r_g9k?Pt9v^3N97#<DdOu0A8P
z2I5)^2Pbzfo079k(56p9t*9T)<>UGNA*jhgjjV?XQ2NnMoqh!BY58hwQ^G5LhB7Ao
z9&`qA4F+QR5or!Ef%?HHA3uCHeOe}V6l@*ZJFGS@*n7TnuxNf7{R*$IV9Xsy=e%>#
zc|IJ|wny`xc_n+4g)5kC$kafXxPCUZK=!<e;v1YBN7)>&aQe2|^4WK5M`uBEU8eSG
zLiK<AjOW$rA^y3jFr2G=ULO^()(v^Jb$R(a*K>sz>hJcLbHwMdLj1L(x;)gmA+Lpc
zb}-(@^dz9J44)l?p%KRC>Y&a7c{tQrP>sLWGThiprkkJ#f%*x#_4H9e$m!8J0`)bT
z+G@0oy-m>w>gSlC)dQU!6TFR(LDbP{fR8msX!EQ=y`07&8v=r!5!r&iw+un=$fPmC
zvk|fb`a9k5-8=-nATo^_L=D{tVG@0>C87V*NSdMl(+BM$Kn>qpNFZhEM{OfNY9E;p
zX>$@innG<STQ_>N27O-(AIL`d0R1HN(S_caCX~stvZbmDz8J|KX0Q*vqX)Sc1Da?3
z0K{CgGQvDhA`##_nZ-BEU|>cC_>g)?uubTo1!}6Q>foE5T+`B9RfRZfZ9+_fsOZIf
zWIox@M@zn1Pj>GCkU7TDH<6~^S{A>>%&ZJU87YQ;A%aFpoeTBcKy`z0mY;58kdx0y
zvhrxqyg(jCYkA`+oacNxII{}QKIr+*EPhj;1V6)o06*9aN%FA@7P*;&H<3=WSUxkO
zubh00yhbdK<>L(&pAjL_?8)Lc_O2OQe>DJa#B&&6!!$U+>#u3$AZK5!{=iyA`U67-
zdt_?T*;)~!W752qqi-j>TR43A>;o;OjD56sIGWqd*4%lGG_K_MF_BGXiF^!eIsF-f
zh*L~|A)DYFK$_NM@wJSK@xg9ITruzHjoEsfy$S1lGG`s;nU<XVtu3HL!Tw;3La;v=
z)VA!|kK8cG!FL;zE1`9bsB`w4WLRMcFmFEeo-ZDP-(&j$2*@_Dmfx=f5`LU$Ucir?
zSr6<iG-geJY;4Tp*GuptM=He6^xGRmwH1<V&OPsF85QY^F){&v3_gp5Kd9C7b$3C%
zh@tEl<nVPH`>VwM5=kob2gqnhsXvgd$ss_}++QW~pQfQKes;g~BPfAC@SyKf{LuTz
z)gNL0+AP23qy*nY_MaBfAf0RL`1Q88R8K&kxPNwAtZgofqg7A?U{Ko!Iegv5K2H9#
zvQM`6+7oX=QrQ~yJiiGEzs#iy!TK{>ll2$b^CTO2OdgIO@UOibJ}!P>`x&_@%s<HY
zeh`1ofh>OwEnw<`{S22$;KlfF=93kDl-eg(OJHiu2+A0Mt<A<yj|88*z2ocCW~ps!
zr;HZq`gjI8{<=-UDsUx;8kZk1_ARhSCdxp)2|&70f&!`zqQr!IK}rKJjf6jTG$$|0
zX-Et7bGuDCj-H8Z>E-0(_0O1Y&Yzml^O*j}>w_b9ND^dX^xti?psPJF!rPy8KZg*w
zjDKuwv$)z?sitALhr{Ro2#t-yXCm#8(=hA9^N%nI;{g*trSXNWcYB@vY)v@ZzX#1i
z4CK~(6nv1-ekNq!jQUI{93Xj3$f>|Oz@wcgG%+Rv%H<b@*M~G8l#AD+fX{~3A97px
z&Ah+_uDC*>UJ93>;h*wJT^39vE@LiaE#)p;EoHA_!T&A?{yCwk{!g)L1N?7#;UByD
Ef2$f0KL7v#

literal 0
HcmV?d00001


From f22d20a2da2eedc7a5a6810db99079763198d063 Mon Sep 17 00:00:00 2001
From: PMBio <PMBio@users.noreply.github.com>
Date: Tue, 10 Dec 2024 14:57:30 +0000
Subject: [PATCH 2/4] fixup! Format Python code with psf/black pull_request

---
 deeprvat/annotations/annotations.py   | 8 ++++++--
 tests/annotations/test_annotations.py | 8 +++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
index 83c76e7d..f8894184 100644
--- a/deeprvat/annotations/annotations.py
+++ b/deeprvat/annotations/annotations.py
@@ -2031,7 +2031,10 @@ def create_gene_id_file(gtf_filepath: str, out_file: str):
 @click.argument("out_file", type=click.Path())
 @click.option("--keep_unfilled", type=click.Path(), default=None)
 def select_rename_fill_annotations(
-    annotation_columns_yaml_file: str, annotations_path: str, out_file: str, keep_unfilled: str
+    annotation_columns_yaml_file: str,
+    annotations_path: str,
+    out_file: str,
+    keep_unfilled: str,
 ):
     """
     Select, rename, and fill missing values in annotation columns based on a YAML configuration file.
@@ -2054,7 +2057,8 @@ def select_rename_fill_annotations(
         annotations_path, columns=list(set(prior_names + key_cols))
     )
     anno_df.rename(columns=column_name_mapping, inplace=True)
-    if (keep_unfilled is not None): anno_df.to_parquet(keep_unfilled)
+    if keep_unfilled is not None:
+        anno_df.to_parquet(keep_unfilled)
     anno_df.fillna(fill_value_mapping, inplace=True)
     anno_df.to_parquet(out_file)
 
diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py
index eb9d1d03..82dce87f 100644
--- a/tests/annotations/test_annotations.py
+++ b/tests/annotations/test_annotations.py
@@ -735,6 +735,7 @@ def test_select_rename_fill_annotations(
         written_results, expected_results[written_results.columns], check_exact=False
     )
 
+
 @pytest.mark.parametrize(
     "test_data_name_dir, yaml_file, annotations, expected, expected_unfilled",
     [
@@ -766,8 +767,7 @@ def test_select_rename_fill_annotations_unfilled(
         annotations_path.as_posix(),
         output_path.as_posix(),
         "--keep_unfilled",
-        unfilled_path
-
+        unfilled_path,
     ]
     result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False)
     assert result.exit_code == 0
@@ -780,7 +780,9 @@ def test_select_rename_fill_annotations_unfilled(
         written_results, expected_results[written_results.columns], check_exact=False
     )
     assert written_unfilled.shape == expected_unfilled.shape
-    assert_frame_equal(written_unfilled, expected_unfilled[written_unfilled.columns],check_exact=False)
+    assert_frame_equal(
+        written_unfilled, expected_unfilled[written_unfilled.columns], check_exact=False
+    )
 
 
 @pytest.mark.parametrize(

From 99d0df6eade509b8345477307a5bad96ac0c2638 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=BCck?= <m991k@b260-pc003.inet.dkfz-heidelberg.de>
Date: Wed, 11 Dec 2024 11:02:09 +0100
Subject: [PATCH 3/4] make unfilled option configurable, default: no copy saved

---
 docs/annotations.md             | 11 +++++++++++
 pipelines/annotations.snakefile |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/docs/annotations.md b/docs/annotations.md
index 77125e38..5004b96d 100644
--- a/docs/annotations.md
+++ b/docs/annotations.md
@@ -110,6 +110,9 @@ Data for VEP plugins and the CADD cache are stored in `annotation_data`.
 
 ## Running the pipeline on your own data 
 Modify the path in the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml) s.t. they point to the output directory of the preprocessing pipeline run on your data. 
+
+## Configuring the annotation pipeline
+
 You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the config by adding /removing plugin commands to be added to the vep run command. You can omit absplice/deepSea by setting `include_absplice`/ `include_deepSEA` to `False`in the config. When you add/remove annotations you have to alter the values in `example/config/annotation_colnames_filling_values.yaml`. This file consist of  the names of the columns of the tool used, the name to be used in the output data frame, the default value replacing all `NA` values as well as the data type, for example:
 ```shell
   'CADD_RAW' : 
@@ -119,6 +122,14 @@ You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the c
 ```
 Here `CADD_RAW` is the name of the column of the VEP output when the plugin is used, it is then renamed in the final annotation dataframe to `CADD_raw`, all `NA` values are set to `0` and the values are of type `float`. 
 
+You can also modify `example/config/annotation_colnames_filling_values.yaml` to choose a custom fill value for each annotation.
+For each annotation, the second value is the one used to replace `NA` values; e.g. in the example above, `NA` values in the `CADD_raw` column are replaced with `0`.
+If you want to keep a copy of the annotations data before any `NA` values are filled, you can add 
+```shell 
+keep_unfilled: True
+```
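+to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). The unfilled copy is then written to `unfilled_annotations.parquet` in the same directory as `annotations.parquet`.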
+
 You can also change the way the allele frequencies are calculated by adding `af_mode` key to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). By default, the allele frequencies are calculated from the data the annotation pipeline is run with. To use gnomade or gnomadg allele frequncies (from VEP ) instead, add 
 ```shell
 af_mode : 'af_gnomade'
diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
index e4e1db9b..e3860c11 100644
--- a/pipelines/annotations.snakefile
+++ b/pipelines/annotations.snakefile
@@ -587,7 +587,7 @@ rule select_rename_fill_columns:
     params: 
         annotations_in=rules.compute_plof_column.params.annotations_out,
         annotations_out = anno_dir / "annotations.parquet",
-        unfilled = anno_dir / "unfilled_annotations.parquet"
+        unfilled = lambda w: f"--keep_unfilled {anno_dir / 'unfilled_annotations.parquet'}" if (config.get('keep_unfilled')) else ""
     resources:
         mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
@@ -598,7 +598,7 @@ rule select_rename_fill_columns:
                 "{input.yaml_file}",
                 "{params.annotations_in}",
                 "{params.annotations_out}",
-                "--keep_unfilled {params.unfilled}"
+                "{params.unfilled}"
             ]
         ) +" && touch {output.chckpt}"
 

From d368ae599f1936c4f3985720cd048312538b358a Mon Sep 17 00:00:00 2001
From: Kayla Meyer <meyer.kmt@gmail.com>
Date: Wed, 11 Dec 2024 16:00:50 +0100
Subject: [PATCH 4/4] fixup description text

---
 deeprvat/annotations/annotations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
index f8894184..9ceec7df 100644
--- a/deeprvat/annotations/annotations.py
+++ b/deeprvat/annotations/annotations.py
@@ -745,7 +745,7 @@ def deepsea_pca(
     X = df[deepSEAcols].to_numpy()
     del df
     logger.info(
-        "checking wether input contains data frame with pre-calculated means and SDs"
+        "checking whether input contains data frame with pre-calculated means and SDs"
     )
     if os.path.exists(means_sd_df):
         logger.info("standardizing values using existing mean and SD")
@@ -2043,7 +2043,7 @@ def select_rename_fill_annotations(
     - annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings.
     - annotations_path (str): Path to the annotations file.
     - out_file (str): Path to save the modified annotations file.
-    - wether to keep annotations data frame containing NA values before filling them
+    - keep_unfilled (str, optional): Path to save annotations data frame containing NA values before filling them
     """
 
     logger.info(