From ab2d25ca2fcceda2712e563a93a0e1599a338c51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=BCck?= Date: Tue, 10 Dec 2024 15:38:52 +0100 Subject: [PATCH 1/4] with these changes, a copy of the annotations.parquet file is saved containing NA values before filling these --- deeprvat/annotations/annotations.py | 5 +- pipelines/annotations.snakefile | 2 + tests/annotations/test_annotations.py | 47 ++++++++++++++++++ .../expected/expected_unfilled.parquet | Bin 0 -> 27377 bytes 4 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index 56e88eac..83c76e7d 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -2029,8 +2029,9 @@ def create_gene_id_file(gtf_filepath: str, out_file: str): @click.argument("annotation_columns_yaml_file", type=click.Path(exists=True)) @click.argument("annotations_path", type=click.Path(exists=True)) @click.argument("out_file", type=click.Path()) +@click.option("--keep_unfilled", type=click.Path(), default=None) def select_rename_fill_annotations( - annotation_columns_yaml_file: str, annotations_path: str, out_file: str + annotation_columns_yaml_file: str, annotations_path: str, out_file: str, keep_unfilled: str ): """ Select, rename, and fill missing values in annotation columns based on a YAML configuration file. @@ -2039,6 +2040,7 @@ def select_rename_fill_annotations( - annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings. - annotations_path (str): Path to the annotations file. - out_file (str): Path to save the modified annotations file. + - wether to keep annotations data frame containing NA values before filling them """ logger.info( @@ -2052,6 +2054,7 @@ def select_rename_fill_annotations( annotations_path, columns=list(set(prior_names + key_cols)) ) anno_df.rename(columns=column_name_mapping, inplace=True) + if (keep_unfilled is not None): anno_df.to_parquet(keep_unfilled) anno_df.fillna(fill_value_mapping, inplace=True) anno_df.to_parquet(out_file) diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile index 82bb4722..e4e1db9b 100644 --- a/pipelines/annotations.snakefile +++ b/pipelines/annotations.snakefile @@ -587,6 +587,7 @@ rule select_rename_fill_columns: params: annotations_in=rules.compute_plof_column.params.annotations_out, annotations_out = anno_dir / "annotations.parquet", + unfilled = anno_dir / "unfilled_annotations.parquet" resources: mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1), shell: @@ -597,6 +598,7 @@ rule select_rename_fill_columns: "{input.yaml_file}", "{params.annotations_in}", "{params.annotations_out}", + "--keep_unfilled {params.unfilled}" ] ) +" && touch {output.chckpt}" diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py index 58b2957c..eb9d1d03 100644 --- a/tests/annotations/test_annotations.py +++ b/tests/annotations/test_annotations.py @@ -735,6 +735,53 @@ def test_select_rename_fill_annotations( written_results, expected_results[written_results.columns], check_exact=False ) +@pytest.mark.parametrize( + "test_data_name_dir, yaml_file, annotations, expected, expected_unfilled", + [ + ( + "select_rename_fill_columns_small", + "annotation_colnames_filling_values.yaml", + "annotations.parquet", + "expected.parquet", + "expected_unfilled.parquet", + ), + ], +) +def test_select_rename_fill_annotations_unfilled( + test_data_name_dir, yaml_file, annotations, expected, expected_unfilled, tmp_path +): + current_test_data_dir = ( + tests_data_dir / "select_rename_fill_columns" / test_data_name_dir + ) + yaml_file_path = current_test_data_dir / "input" / yaml_file + annotations_path = current_test_data_dir / "input" / annotations + expected_path = current_test_data_dir / "expected" / expected + expected_unfilled_path = current_test_data_dir / "expected" / expected_unfilled + output_path = tmp_path / "out.parquet" + unfilled_path = tmp_path / "unfilled.parquet" + cli_runner = CliRunner() + cli_parameters = [ + "select-rename-fill-annotations", + yaml_file_path.as_posix(), + annotations_path.as_posix(), + output_path.as_posix(), + "--keep_unfilled", + unfilled_path + + ] + result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) + assert result.exit_code == 0 + written_results = pd.read_parquet(output_path) + expected_results = pd.read_parquet(expected_path) + written_unfilled = pd.read_parquet(unfilled_path) + expected_unfilled = pd.read_parquet(expected_unfilled_path) + assert written_results.shape == expected_results.shape + assert_frame_equal( + written_results, expected_results[written_results.columns], check_exact=False + ) + assert written_unfilled.shape == expected_unfilled.shape + assert_frame_equal(written_unfilled, expected_unfilled[written_unfilled.columns],check_exact=False) + @pytest.mark.parametrize( "test_data_name_dir, annotations_in, expected", diff --git a/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet b/tests/annotations/test_data/select_rename_fill_columns/select_rename_fill_columns_small/expected/expected_unfilled.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3a8319333db41bae2ad1ee940f1c10d4e5eba39a GIT binary patch literal 27377 zcmc&d4R}-KxoIsWAh5znO&KB-Idg(ENn2gM!v@4Sl63vBabx_KE_om1wcKK{Ti*PlB!QJ8-q-uF8{$vNjcX)Cm8 z9-AiL_n!Cfd%yR6zxVwrEu>4UDpB29q`I@HQMEv&SfEfe9Jk#_zVX?!-&>gZ>*kg} zJ-29S{XZYuHhlePRlP2)*?6IOdFI}CUVmf9kB`&U4G$gmRLse^X8!QuFKU-&zFqtL z!F})5)bHrovh|Ezn^7rLOH~)IQY}%HsY+F)i`7aVLapRs$`zN$j4Bk0vUSDqzNDyR zhGJQ@DLizOl6a$dv1)cjZJDC9LOsVE4ky|yWLJx+t6EiDzNGwMarvQQ;;AA+`Bey2 ztO5lnRK*H-D}!GOg^WV#uHSgYt;b$AytnZE^Jm^-s8`;Te)+w<+RU979UXS;{M68+ zomuIAXleaj%@4(Hne(OL#TBd5!OE)4!%bH&tDC(jQ#^Wk)gRxUC!-J^qgu)1L<+%2 zYD9LK1Otr94Rm>QOaxpN&yg+3OROQ0;;U`aDEsC6gLIUHtd@J74=I{k8WVe9#;I=&6ol8?XCv$)&XC6Y}FB!wvKa z@>a9{eXZd;mo1)QF4fSxK77jb#KArEzGd5g_vLp|^lnG&@v1#yMk0S(bZQgSp2q-9k! zwTpk#f}CI%3l~2z(4PuN<6V9#6&{GjyJbw^(=y^f**Tj4IH^YD1;G22t>dj}KJ(gv z)^VXqEi`g|CTn?&k>jI)Y4*jOkqp$PSEq3NGrQeMddFQ5eF6$H!TD?a3pA* z@KJ^AI)N!)m389^#oQuA(OjjX7=F%!pAz_)0Y9a4mFs45Qu2ETH&%WoE2IK(f{)4@ z>!THmRVAFKE{=v6Ne>j4?B-O0fZG%W~qS%5xS>8i00ux+8e9t0vRF==42JzpkyXYu%`N=K-TmJrwi2Kb9A_R%vbHPLNO1d`-~F3vJ$>QN|7dPr zzR+O3{pPo$A6V%X`(9Q)bN}^*A3oST{7LW4bj?M7xuNSn-G=1M+xE^_;HKYwz2`fZ zKHpC7`q~Qz?p{yJ>~0TxyMpw89G-b<^YcOa3;i8GUwrspL-)!XO+QOtWk@tcemd{WV|4I2 z?Ti!Ot)`#6M1M!KE%ByF;Hm4qm z^J^zvW@C$t^SfI^yt5iKgwx1Fc|saGRyuHGXY!=s|F*6;`m1LCwEBoH|UcD^ybaCaYhF^V2*Zt=GXOCQ3p1J?pKd&9QL0kXg&V#G|@W=w0 zEyY8ql{`+wCq62#p)<8P4GFiDFoio@L+q{wjo>)*P?FWi)OyeD`kWl)LhqSO`3F+c zcyMMd@xoe=2z0`{vY+xD7Gx!oP1=e$lz7s{&t!=)`3yb^2y#hViGuk3x?Co+SSVU7 znoNcvD2TGsgG@HJ=i9_bmxF*fcC=*psC+gzwM%x=kXtg5TztR_g{iP4-W^>}JhA?q z7fvMC#)XgAj*M=V$+Gzw2qqj%`C?ArlCWHM0 zVJ1lJFDie!h`8NI93?>v99JF+vi2tugEKo+M3|ft@g8eViseGHOD3o&9E|z^4d?gY>Cmo`bMH?&)PMH z)o<(@`OxpDm9`tLTaE|lZNHeaJhNyXeeB!z@AtK@p$!+t&!?jyw9t~85I>_<@;K2h z!AC*IGRNYr8*+-0iNSb!{f?b@v;owE6UakWcJ=QoQVlf1!4RDpTNH|-BISbJ^+hw4 z3iBhglqD^X&QdCFoU1JP=-+3RmHh27`1S>StJ^cH?2;po*E81g&&N?XsZfY=ai)p* zM z!K)p_!yWvhgQl_p@GC`I>7rNt__O2to^7D1@Ul&(!|Uh)`je-9b(`p&UA8l-*GlPw z$^H776@J5#L#L}B8NQm{e%Jd~RLptOP+DgF&5qgE80Iv7{#(nuT{4T#L#mZLPNaH# zRDRJPxH6}lZ-`U!CZkh!&A0l97rdY#oJ1bNvTH6?(E59?+IR4|Xh{m0Ae58N4h{{4R}$ZWIy@7?e1ydbmUf`_W!`n9V5@n>Gzp}AF)+4B1{hfmyp zir)3w(Fc$8>ts?+enze2pFw=$qw)$m+Lcp~Fy*WfoZ_CnhPd@=&W|9?`p-ke!O%JHY&?&?>6!W?FLnOU^Xo4{)+Z@` zGNJ>7TFJv8o4`k5B$r1+@$*nLCuAuUKf9WYti>@I^V2=VmL6V|iC~vqCR`MTFZ6aC zSzftn5f*G=M$RJ&N8$1FFX8IAYQ=d)#4#!-%QBe-8whtpp-7aPJy%qIsEByAkJ#16 zi#ZX>vSLbI4RzfI$xl8kt52W0D!jdTe*FV;_ogd$UsNBOv;Fbz-+rt9C!byKIP_N{ z^Y!x%c`HBudS>C0FX}g)_>#Ud^VZAG0~MLo_Ypr&@0ylxeb9@?*GH@es?>4|C{&x;Hocfpg%lra(FY3 z(zr(WC)G+G2JwlH%4_7UbWS5O$tvEz$)t#FNl*_?AP-qt^~mlEzNvt3aehX;FYr;2 zQMASH8p%mlENXAuG)x>A0x@tLNDO>b{y==ZNc96~Q!inX?%}ce8;Y6G=TO4WEclrX zKj*{G9O!$@1)kAs9{WpyFEBro6`rXR;G=*bXYxz}zWlfj;F%&#&kRI^tUiyPzk zkd7T1r_0I$jooHsVJ6hEV?j~R3%cvTBeaCrAxT~??uoD<^6dq7ZtO}u3nTPAT6Rcg z>v6SOc3#4jxLAo}t!0Pu6uwxAMI{z#>4>8$j3m-33=1NtXRrekDGjiYGBR^#V|JcD zF0$Vw8W5kCSde^24|aB}+?$0Eu1gI&BuV4g!ErMp7L3a&*e~M;Wfo4z!y0x-Mo*ku zX6M1Nu`7yiZ$qTI=ZR@*dJ~0i zN)yx7G$snviY2C_!AVS0+mI+wGma>DvuwJ0Sj05-mTFZ?@fr@y5Ny#Z%m8kzjKlqV zcqE>tD1eAW!&InCJjN30N{u8CU}ZWU4fcn^l>i-2#}XsiiT*%uIG94?v2e;C@~8X& zzOfRn`S*w5ayNRmquqL0UFD`tIcCIMRHB9+|2H29lZYQ|R>E0?>i4JC)Zm;6CIqY@ zkvFXZ4moj!=yhWfXJ#jvxUe>a|4{_a&iL??%%Hf}D*1~9oWF6wCz(B=KJxut0nX(_ zpPFQ5p-zhP!VOL`sb~nN#EwY_MdPx2Cp$de&dL3?inz#&lNXx?4lQ9!uzST0M=&uK zbHX*w-d@MK$_qAy?C?e4?A`OR*^&jHWcCCeSc`Kbot*;WIXe2OwZ^{H=LIVJu=B` zqK(3t?i!pU>Fg8`7iXW`9Rhhy@`6q>TXM%rc5HG9$yJMFYEDqVkrX@_+WS-3HLG4s8+jLvszOLbse6xaB_9+ z>gvi(3iTY)_l6I4$H=rBV20XPq3>6%) zud!QGfUmYE;ORg((jm`qve86FeKCp(*j$l7%$oAG0i3N?8?X%_JRCpHNT`7tV&+T^ zZ6HPgeivnr4s`=QZNO`+^;WxTgEoYhj0SX$YkeN4hMm_^A)6Isa`o7I`@6#<4k{3H z4*F~$7vjm}aJwy?sb-TyA90I#A4Tw<>SQ#_m&sxAdE7%C8n@A_HJZJnO;K6?^S}jd zR?ww&RLH9@HaytrZPhtVb|f$8L^~9A>osAokwWzCF<&yEvud2ZtqiSz$F1q~xOzf1 zOIC-Gwjm@}y3?a&pXqZ^KCiLe)nScvSe%ijD1$rZ_qdZjtA0EHhoXX+{x(_ z*SSQFln?X`vT8cB>p84%G$DK9*Po2G>h$TL&OIFRxYM0Fcf=QWfNi)({WdBVvg&o6 zUPl7#k)!ELfQ|Keb!Yi}+}IO{xq5=yF?1Vur-BWX(AFe!i)|?ocP9f4U^5LeHWCAS zb?b*<&y0FCDZ9(+Fm;StBb~b8M9``qaJG%PCT)ILe{Z!h8S*+KZJHiSN1vm;$*u1V z*r*||E~#Z~yKpj@1G?H#pVzAOd1`x+T)~)LPQ4_?nux!tcCq&Z>Jp19TpdSw~jFTA-^n={Wmp!Cs^$o zggu!IRP!+dmqY5SuxF-kFD3#p-w5*kULA$}=}5>%4F>YsF~sq>lL~s>C=N2uaDErX zYS`lrjo+hB7udf@;;LnWag~KDoPS{As>nBYbfg?!)~EOTM(tzsEsP0TYh#;Rp9DW2 z@!KFfLb*)gY>3sjh#!}+qt9jPIJ-UF(du?sS}EhS?P=D}IIXR2eQT?>j*X8od>_lg z6;AKc_%{l<=?LUHU)A^Y1mln|+w`NMY4)7~4>bxt2l<^yC}xE`z?$ZGlF2uvzDJt~ zRy!%=kEFSS+@1#il*@zhTum7F&MN-|{IMW9gShp5kZZ!Z;q3eU z$s);@CnxWZI}<*ziy)lUg0YT#zG&3%@oBu(&fY+^gFQn{qy0lpkU+VAr^nd~XBG|c z^wsUC+BpPyG0IDOI&~1+8X%rilJi61WR&r(jITeNT*=j@p{C8J`e0v!PWfCD-_NlS zz_YDohnF-DyPKc{yKKsYlN8@g4b^Ds&d?j9^!r6>W zT+qWgFVq7*iQ;m7T>pI%e+v0)pxPMm>8y|wy8EU*r_g9k?Pt9v^3N97#UGNA*jhgjjV?XQ2NnMoqh!BY58hwQ^G5LhB7Ao z9&`qA4F+QR5or!Ef%?HHA3uCHeOe}V6l@*ZJFGS@*n7TnuxNf7{R*$IV9Xsy=e%># zc|IJ|wny`xc_n+4g)5kC$kafXxPCUZK=!&aQe2|^4WK5M`uBEU8eSG zLiKsz>hJcLbHwMdLj1L(x;)gmA+Lpc zb}-(@^dz9J44)l?p%KRC>Y&a7c{tQrP>sLWGThiprkkJ#f%*x#_4H9e$m!8J0`)bT z+G@0oy-m>w>gSlC)dQU!6TFR(LDbP{fR8msX!EQ=y`07&8v=r!5!r&iw+un=$fPmC zvk|fb`a9k5-8=-nATo^_L=D{tVG@0>C87V*NSdMl(+BM$Kn>qpNFZhEM{OfNY9E;p zX>$@innGN27O-(AIL`d0R1HN(S_caCX~stvZbmDz8J|KX0Q*vqX)Sc1Da?3 z0K{CgGQvDhA`##_nZ-BEU|>cC_>g)?uubTo1!}6Q>foE5T+`B9RfRZfZ9+_fsOZIf zWIox@M@zn1Pj>GCkU7TDH<6~^S{A>>%&ZJU87YQ;A%aFpoeTBcKy`z0mY;58kdx0y zvhrxqyg(jCYkA`+oacNxII{}QKIr+*EPhj;1V6)o06*9aN%FA@7P*;&H<3=WSUxkO zubh00yhbdK<>L(&pAjL_?8)Lc_O2OQe>DJa#B&&6!!$U+>#u3$AZK5!{=iyA`U67- zdt_?T*;)~!W752qqi-j>TR43A>;o;OjD56sIGWqd*4%lGG_K_MF_BGXiF^!eIsF-f zh*L~|A)DYFK$_NM@wJSK@xg9ITruzHjoEsfy$S1lGG`s;nUVwM5=kob2gqnhsXvgd$ss_}++QW~pQfQKes;g~BPfAC@SyKf{LuTz z)gNL0+AP23qy*nY_MaBfAf0RL`1Q88R8K&kxPNwAtZgofqg7A?U{Ko!Iegv5K2H9# zvQM`6+7oX=QrQ~yJiiGEzs#iy!TK{>ll2$b^CTO2OdgIO@UOibJ}!P>`x&_@%sOwEnw<`{S22$;KlfF=93kDl-eg(OJHiu2+A0MtE-0(_0O1Y&Yzml^O*j}>w_b9ND^dX^xti?psPJF!rPy8KZg*w zjDKuwv$)z?sitALhr{Ro2#t-yXCm#8(=hA9^N%nI;{g*trSXNWcYB@vY)v@ZzX#1i z4CK~(6nv1-ekNq!jQUI{93Xj3$f>|Oz@wcgG%+Rv%HUJ93>;h*wJT^39vE@LiaE#)p;EoHA_!T&A?{yCwk{!g)L1N?7#;UByD Ef2$f0KL7v# literal 0 HcmV?d00001 From f22d20a2da2eedc7a5a6810db99079763198d063 Mon Sep 17 00:00:00 2001 From: PMBio Date: Tue, 10 Dec 2024 14:57:30 +0000 Subject: [PATCH 2/4] fixup! Format Python code with psf/black pull_request --- deeprvat/annotations/annotations.py | 8 ++++++-- tests/annotations/test_annotations.py | 8 +++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index 83c76e7d..f8894184 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -2031,7 +2031,10 @@ def create_gene_id_file(gtf_filepath: str, out_file: str): @click.argument("out_file", type=click.Path()) @click.option("--keep_unfilled", type=click.Path(), default=None) def select_rename_fill_annotations( - annotation_columns_yaml_file: str, annotations_path: str, out_file: str, keep_unfilled: str + annotation_columns_yaml_file: str, + annotations_path: str, + out_file: str, + keep_unfilled: str, ): """ Select, rename, and fill missing values in annotation columns based on a YAML configuration file. @@ -2054,7 +2057,8 @@ def select_rename_fill_annotations( annotations_path, columns=list(set(prior_names + key_cols)) ) anno_df.rename(columns=column_name_mapping, inplace=True) - if (keep_unfilled is not None): anno_df.to_parquet(keep_unfilled) + if keep_unfilled is not None: + anno_df.to_parquet(keep_unfilled) anno_df.fillna(fill_value_mapping, inplace=True) anno_df.to_parquet(out_file) diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py index eb9d1d03..82dce87f 100644 --- a/tests/annotations/test_annotations.py +++ b/tests/annotations/test_annotations.py @@ -735,6 +735,7 @@ def test_select_rename_fill_annotations( written_results, expected_results[written_results.columns], check_exact=False ) + @pytest.mark.parametrize( "test_data_name_dir, yaml_file, annotations, expected, expected_unfilled", [ @@ -766,8 +767,7 @@ def test_select_rename_fill_annotations_unfilled( annotations_path.as_posix(), output_path.as_posix(), "--keep_unfilled", - unfilled_path - + unfilled_path, ] result = cli_runner.invoke(annotations_cli, cli_parameters, catch_exceptions=False) assert result.exit_code == 0 @@ -780,7 +780,9 @@ def test_select_rename_fill_annotations_unfilled( written_results, expected_results[written_results.columns], check_exact=False ) assert written_unfilled.shape == expected_unfilled.shape - assert_frame_equal(written_unfilled, expected_unfilled[written_unfilled.columns],check_exact=False) + assert_frame_equal( + written_unfilled, expected_unfilled[written_unfilled.columns], check_exact=False + ) @pytest.mark.parametrize( From 99d0df6eade509b8345477307a5bad96ac0c2638 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=BCck?= Date: Wed, 11 Dec 2024 11:02:09 +0100 Subject: [PATCH 3/4] make unfilled option configurable, default: no copy saved --- docs/annotations.md | 11 +++++++++++ pipelines/annotations.snakefile | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/annotations.md b/docs/annotations.md index 77125e38..5004b96d 100644 --- a/docs/annotations.md +++ b/docs/annotations.md @@ -110,6 +110,9 @@ Data for VEP plugins and the CADD cache are stored in `annotation_data`. ## Running the pipeline on your own data Modify the path in the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml) s.t. they point to the output directory of the preprocessing pipeline run on your data. + +## Configuring the annotation pipeline + You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the config by adding /removing plugin commands to be added to the vep run command. You can omit absplice/deepSea by setting `include_absplice`/ `include_deepSEA` to `False`in the config. When you add/remove annotations you have to alter the values in `example/config/annotation_colnames_filling_values.yaml`. This file consist of the names of the columns of the tool used, the name to be used in the output data frame, the default value replacing all `NA` values as well as the data type, for example: ```shell 'CADD_RAW' : @@ -119,6 +122,14 @@ You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the c ``` Here `CADD_RAW` is the name of the column of the VEP output when the plugin is used, it is then renamed in the final annotation dataframe to `CADD_raw`, all `NA` values are set to `0` and the values are of type `float`. +You can also modify the `example/config/annotation_colnames_filling_values.yaml` file to choose custom filling values for each of the annotations. +For each of the annotations the second value represents the value to use to fill in `NA` values, i.e. in the example above, in the `CADD_raw` column `NA` values are filled using `0`. +If you want to keep a copy of the annotations data before any `NA` values are filled, you can add +```shell +keep_unfilled: True +``` + to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). + You can also change the way the allele frequencies are calculated by adding `af_mode` key to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). By default, the allele frequencies are calculated from the data the annotation pipeline is run with. To use gnomade or gnomadg allele frequncies (from VEP ) instead, add ```shell af_mode : 'af_gnomade' diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile index e4e1db9b..e3860c11 100644 --- a/pipelines/annotations.snakefile +++ b/pipelines/annotations.snakefile @@ -587,7 +587,7 @@ rule select_rename_fill_columns: params: annotations_in=rules.compute_plof_column.params.annotations_out, annotations_out = anno_dir / "annotations.parquet", - unfilled = anno_dir / "unfilled_annotations.parquet" + unfilled = lambda w: f"--keep_unfilled {anno_dir / 'unfilled_annotations.parquet'}" if (config.get('keep_unfilled')) else "" resources: mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1), shell: @@ -598,7 +598,7 @@ rule select_rename_fill_columns: "{input.yaml_file}", "{params.annotations_in}", "{params.annotations_out}", - "--keep_unfilled {params.unfilled}" + "{params.unfilled}" ] ) +" && touch {output.chckpt}" From d368ae599f1936c4f3985720cd048312538b358a Mon Sep 17 00:00:00 2001 From: Kayla Meyer Date: Wed, 11 Dec 2024 16:00:50 +0100 Subject: [PATCH 4/4] fixup description text --- deeprvat/annotations/annotations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index f8894184..9ceec7df 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -745,7 +745,7 @@ def deepsea_pca( X = df[deepSEAcols].to_numpy() del df logger.info( - "checking wether input contains data frame with pre-calculated means and SDs" + "checking whether input contains data frame with pre-calculated means and SDs" ) if os.path.exists(means_sd_df): logger.info("standardizing values using existing mean and SD") @@ -2043,7 +2043,7 @@ def select_rename_fill_annotations( - annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings. - annotations_path (str): Path to the annotations file. - out_file (str): Path to save the modified annotations file. - - wether to keep annotations data frame containing NA values before filling them + - keep_unfilled (str, optional): Path to save annotations data frame containing NA values before filling them """ logger.info(