From 38226666f0d87a1274490435a59fc422e17caef2 Mon Sep 17 00:00:00 2001 From: ChongLu121 Date: Tue, 13 Jun 2023 17:00:57 +0800 Subject: [PATCH] secse v1.3 update --- README.md | 9 +++++++-- demo/phgdh_demo_vina.ini | 3 ++- demo/subtructure_filter_demo.xls | Bin 0 -> 32256 bytes secse/growing/filter.py | 21 ++++++++------------- secse/utilities/substructure_filter.py | 8 ++++---- 5 files changed, 21 insertions(+), 20 deletions(-) create mode 100755 demo/subtructure_filter_demo.xls diff --git a/README.md b/README.md index 701e059..7ba4ae5 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ further validation. ---------------------------- 1. Setting up dependencies - python ~=3.9, perl ~=5.32 + python ~=3.9, perl ~=5.32 ```bash conda create --name secse -c conda-forge parallel tqdm biopandas openbabel chemprop xlrd=2 pandarallel rdkit=2022.09 conda activate secse @@ -114,6 +114,10 @@ further validation. - _spiro_site_count_, maximum of spiro ring site count, default=1, type=int - _fused_site_count_, maximum of fused ring site count, default=3, type=int - _rdkit_sa_score_, synthetic accessibility score (calculated by RDKit) cutoff, default=5, type=float + - _substructure_filter_, path to a file containing customized unwanted substructure SMARTS in "*.xls" format; set the + value to 0 if you do not have any additional unwanted substructures. PAINS filters are already included by default. The file + should include columns for **`Pattern`**, **`ID`**, and **`Max`**, where the **`ID`** should be unique for each SMARTS. You can + refer to the example file [subtructure_filter_demo.xls](demo/subtructure_filter_demo.xls), default=0, type=string Config file of a demo case [phgdh_demo_vina.ini](demo/phgdh_demo_vina.ini) Customized rule json template [rules.json](demo/rules.json). 
Rule ID should be in the form G-001-XXXX, like @@ -139,7 +143,8 @@ GNU Parallel installation python ~=3.9, perl ~=5.32 -numpy~=1.24.3, pandas~=1.3.3, xlrd~=2.0.1, pandarallel~=1.5.2, tqdm~=4.65.0, biopandas~=0.4.1, openbabel~=3.1.1, rdkit~=2022.09, chemprop~=1.5.2, pytorch~=2.0.0+cu117 +numpy~=1.24.3, pandas~=1.3.3, xlrd~=2.0.1, pandarallel~=1.5.2, tqdm~=4.65.0, biopandas~=0.4.1, openbabel~=3.1.1, rdkit~ +=2022.09, chemprop~=1.5.2, pytorch~=2.0.0+cu117 Linux server with CPUs only also works. diff --git a/demo/phgdh_demo_vina.ini b/demo/phgdh_demo_vina.ini index 02f35d0..8e9d039 100644 --- a/demo/phgdh_demo_vina.ini +++ b/demo/phgdh_demo_vina.ini @@ -48,4 +48,5 @@ ring_system_count = 4 bridged_site_count = 2 spiro_site_count = 1 fused_site_count = 3 -rdkit_sa_score = 5 \ No newline at end of file +rdkit_sa_score = 5 +substructure_filter = 0 \ No newline at end of file diff --git a/demo/subtructure_filter_demo.xls b/demo/subtructure_filter_demo.xls new file mode 100755 index 0000000000000000000000000000000000000000..72ebc09a0035c896c63e08956ead0d01e2607434 GIT binary patch literal 32256 zcmeHw2UrtX*Z)ljAxIHKk)jeQ5|EC9D;-#WdqdZ{tL8grCX-Ai6NLA_|L=Le=V9<>=Kk)v=bU@)xpi*h`E!={mT$GY zPdHa|VnDttjft3r3*b4JiCYjzS1K{e+?&BOKvmcOAqyBwpd!VF%6ud!@*59?;uq^5S&J{-K&`W?i90JD+h9?7(52z)vr8KKSeOghU zn<=_|)MqfMLH4=ytgFme71acC6Cxm8g`pJPZ`7wF^=S^D08Kze6|u0V?FfMc+#Bsc@!v8%xpm_y^WoIGB&Nv|8C0eR&e>J{Y51yk3U#JIP zqz7N82j8Fv*Qd{1J@o7K;QILMqu;8B&;M_I_Ne3<)9vCjU0pCMp>QLJQtjovHV)G$ z8ox|pL#3e(7AksUZQPo{t7&j)sPvGH4$k3k%FwI*WFw^ybP>Gzm@v4Sp00Q3lNddF zg-V;N^%O&ANtKIJUNQ98d^mbIh0nIMoM$r6#6m>-Lk%^aoP5yXpo4{I7o}%S2IrK^ zN)K+y;Fn2Hu%{hkA2IaEeENuCaNxtu7(Qw{(Ee7P!9jmoPg;L-7EYIo_BR+SY5pAj ztr=X+{{U59U3`ZgoVJfH{dPU{bi2@{XZ%*BXTGJSwTU&<3*A4!Tmbqdu<95~OR%TZ zLhM2Dr*Wt?4sJuyLoMS2UaEeK#(_^06(21e_;7F-2@r#(_(P)8*A?2Nr*=@`9ba(rmQ(uTo`q|b zi^WF^FGnt~-GC2vXn29#G)Ya8#>6SGE4e0;2c%0pNxBwJWguoSuP}f|n646P0s@mZ zbC{0`bu%D|ure8N(Nn1ms7R$Uh$@@Gu(BC&&Q&Ruj4GP}E2UBraG0u81{}XCl>rB& 
zN@c(iuTmMXtFKfB?CUF)0SAq8Gqm8e2WUso67;qQ5!Fx>$>?(QI!7u^mvEUtW3LDk zbVL12pa-Z36Ld}eOmMKSKylDr^)sniF(&B3`kCM~rUIIvTkB_nW9Sc=lth(ZJ2-$= zfJynagPm{%n3P{TI3!hoN%^&d<5vZklwUhIDXjpL@@ofYTt8&u=U;y9!0tswn3P{T z^(w}s{MvD-7?bjAM_Mr^<=0OAiZRiy9qvdE8$RWCh1?8IS{ZOuW={8iQMwL0u#;Hp;H{)^2^jxan6%Z#7efoggAC!g7U;! zrKXGHN_Eqcii5Tm6J4q9Kti`VWDXmO{8EpR22k&dA76GOA;7xR@~u19IxB z#)G9xxJr2gEv*##4PIKc$hbm5L4l4^At50;O0h9UDa2^jgfL36G2OcG0dl%7n1X64 zrJKQ5D+5T=b}e0B^n(lUD_fx6hbQvKCZ(RLMZnMOa#z@48p)>1RS3#VL6 z?I>U5v()5PaR4yU_#Ika%bE-mKEGpECu0*fR`Dw4Diz34Y81ax%8~`@lKuOQf!n1u z(W)O@R%lsD*NuJx63s*cBH9_ywCEb;j{!|9#-#j-L{P<;ls}PZRxu{!PbBcQeg$k& z{zRgA#h7SMB)C2n0!%c0n@;;=9ABv{(ebfu+qUWSZETEje8fNsbuoP#8`E`}P@vVA zhgT^BQW90Zd0PB8Ot{`2223h7;>jzUA-u8~TBYZ2OfdefCDmWH`R5k$R9I73*n`i(s4D3g47>qqRWKqR?)yjGkEii z#oP?-Dw`ojD?Rs8T2?V*VG*#!w*dT<4W)N-!S2tzmr<^3YN8V zWixasKf^4cExgsNMg)Guhqw?a@q`dMI(32+UXbTWT0;N^s>@JRPgxCK^y^ZhJq2(W z!bqe@zJC9m(h#4qe+36|!7XBxW63Yak4KIpuN->fNXao&B!B($D!-f>{BrzxiY$*!tA#){+zZ*K6*u|_$Kc)Up0-yA>=AEjE3s3fZJ_!E9P zwfW`n`I{539JareM14NJkzbAtzZ^b)bLEx8_P3I#n>TOr%dzE`!{=`edF8PEtt3kM zcmcm0JAOHQ{^rdqhwX1AQTt0@^UJa4m&508{=9P7{#Fuo{9YlyoH{5cphEs81vz|_ z8h`WipT1U4{nX``!{=|#ymHw7=I8%@OBR25>ha6r^EVl<9Jast`4<-#^UHDIm&508 z9=vkc{^sYeyq?T2N6IgU&)# zqQ`PMTB`znqNG#t@4xg2Y#Inw^9ha=tKgp`i18jk3N-keffsX3zO z=~@nEAx1<76BWVhgggoQ7ghy*84B!`0`fY6Y3YVId4@e$)C!+*qgFMO0kdTisA26& zgG0JdX-#_kk)sx^JpmFCw@e9S^VE_n5K6!?vVZ$NyC6xO5jJ!UlMXZTjNq&e=$zh7 zS7$rWIlUWOXWYSMbjEF7mCnX+z6QLlbxKfAol(}n78J)6+$^DLfvvch;#jXr&oz#bybs6K^nr;84Fu zeLHoKw{O-jPco9DMQxC$NoVVVTeXx0p(yz60WDo{^G-}z5Otwrj4pJH(S?qoE!Yz*==>5K<4g;mwQ8FT~CUnEa$+mYtNX*og@XgN$O zn+NF0cu;ki_=7I!r;J@fIJ($?E}C?v-$7l}^@9F`0B-Km{^Or%0G?4#ij7*&7O-TMIUOEDF#Ys4y9OWq^8vNqsHh*!d%aGd6Gb_OooCw;76FkZaKJ-%lHxQ z_NrS>0P>kNL+_q)P5fU;=TYlx+gNQWAGzPnCcb5X8G1&cG!ojJp=VXoIbYHM}J0~WUF`yaw z37t=~s0`As3{hREYbKSc5Y5;G3rVPdbz{Pzs;Pz_4pm%!_0{~?6DmSprTbEb(cu@Q z;fDhim!FH8AA90N2ybIY{hDd`;V{MJ=ceYzo*+RP4-tO-f;IfG7P$O8)%@5KAwuF8 zUE$}~T*D71KU{vkYJThq4w1$E_>M#e_%_X0>M0sx 
zec*c(Bgr7>yJF#K6d3}k8VdIJZvse90q*)_6DyK}d!fXHB==7lp-7dcj!aQ3l-Q7k zEWz{;Nxyko317Zi$x6e_)4;}^;ct@we4hwJwV|0ad2HYt0QDg&zT{A0X7~<);D?YL z7Dylv()hVKc|m?Aa6G9j4wH$v3{R9mGEmWPHZVH|+>G}pCS^6?eh@@S$ap~uIPG^x z5DK56q~2RRvjj(}Byj#p0*6~9p9rz|O1cQ00LK8WBDU8EDDGh3-6q_Ckmz1xfORW~ z%2VLog}jxgkB=Y3!cv)-Kcu%|B8?#u%R~a0hd^K?cap;k6uHc)og2U%kdH6(;Kw}p zi-`zzY)K_kgFEzm%mq|b0+Le?Ku$k2VjdbZ4NMhVGNC!S%Vq9v!SOL3?R1o~R4Lr9tAB!Q zT#Orrc&7jzIz}g1ygb4`!A<+d6>Fg?fDDQ!i;oEh#rb#|&}1>=bSz-3*2*u=Cm~qQ z6`+JktAvC>4}u5&M#CN4k!*!KOx}nY)NCo-ec-L%-{3b%5^D$ms4i>OT}b9il1%5a zQp~0aNo=Hn2*28aaDDRZPW6K@qaby_HLc(tFCQ{^gktcB#ExT!DuyR`jTw}(Vdm*} zM|~_>ev0@y@sVfS{B5BUmuC}BOh2(A?a;{u&ep}(wyo~;&!?!19mCd0ty4U%`+r&z z^e}b5gVBzo^Yd4?-TW@Bq4Nv-)(Hi717ogl?3^>+&Zb$fg6i|%7aiCq3!IeMCTC$> z!LN^p_Kn&WJUutnA$Qfqznch8Hy+g>^IN9*nUU>p*}TY^+2mody!4U5&MkGj2ETsv zWxn%++`^)efOZ?&$|V!GAIiJ@a?O`k#nxe7r}nsh@LJU{XZe|KM>>A7-SV`DqVa{s zYwk$jnM|v3bc<(3ThGS_gO=pHym_@-%D&svw@ptA=uy({u(D>)4JLr10|*^Qigkb;b+#2a|4e0m`-|=y?^3`^!68z+PIAJ zF36R8uS_TRWf}b(?Qhz@xEXUoTy3u;Yt@@iYlFXUn)>Db>?cy|h3mveAIL*a_q#tT z_+Z9_;_OwcT_o{ihW=;euQzuj?u)&!ve_A5`+|s_K67eRpL{)LU)-j!J{O8DF1BrX zvr%v7jMX!TC6+qOUt)VNAnMWw>$uXuo||T@DOPSj>Qk~L=5WSlk4G&Z?da z4>2u?u69&v5qdv;`RLs1fk}I1EqiY}y=m`9BG0#LabnrUr8i3-J)9VL_HNB9iz3as zdE^_JO;Mcjn!Kwl&11Fwni~;yj(McqKT+%PvoSWsg}+(dZF6#X`*f!(x!D_c+?HP5 z(eO(BHy*cp>~S6tJhgR~r~?#geQS#h@Anx^Uhm)9=Y>;72AKkczaXfln@^m&z-+rc=av$@h<=2{ww#}2RgL8 z9CW0+g>~uI2RkObxxf3I*x7u*_iuy$nf2m;tMe42$=hs7zZzSAKXYbANYVI!K9ikZ zj!*Llh&@xiAS$49=WvI)Gj|`aQRnFX`VTGhh89npBWQhMS>TF>>$AfYE@93e_uI5w zHY%fubNb6~U!L#UbvIb~@a>bbCqvGr6v}L-6vUdOp9(r9+$m~2%E#S3YyM>MqdS%> z8atGjT^m+AB=11$nYlw(KCb*w|(4(--ZuW!ICZAK&_unb}Ffew?W{-R8vo|KM9q_DlS#I8# z!c4E(4F~>qeX#$@+zW$T)An~C(51`jQ^RZak&3*4$w|(LGne{i05m;^Avg-4g z73bPG4*XvC{B!%<(#<^9jh(us-jta~?HPSH$XNQzwPCNPlqMb=^Vhp$FP%8s zh0`C_XAGwF&Jos0dbal0l&q~uqpKTl{aO_HcfN7J>^t%YS1gAWt&04mv*V+%o6Zw< zM|Bs~zdU!yD2Fj4TP5rr{J_cHc!uE$^LGhRo`HA5l5!m9o${Kp=2=-?XL;@7vss^_ zyGKS$9+Bzx*UttXS&_+0kB<9o+1>1F56%DaS{xpb7wMP!>{g3u3r;q$ZyZ+B#Vcjf 
zAIo$0d$!75aWbgqMZ>#_k5(%-)i;;*uzPJJiof4{zs0Rd4+g}YvI@U#JG`~WkY34; zhV%`aB-y*J#Cpbt=S9bI{DYo9Jk<78-Dz%3TNzB-|IeuZ;WpXr@*tNxL;*UinZ)o8wV)}AB&Erw-%_B->d#mGxfJfEhXjVp@z z$H{7U>z>uJ(uOrvBxagB?U^?qSTgg<-pf0NK6#N9|2F8fzxdpvz!~K{gc6r_^q9`o&D*#M?<%p!wt^*-O3ocq5q0uX_?Dsd0gGFe(mD{>)t9q zbvU!F@0S5{y9EsEe#WnPiA|U7CGYz@N=;snmNve))#6dBRyX^*#P#Q2{`7aAJ?hV_ z2X&Xnq>Iv4+7|X|U;L~?OnzNKgZgP52br68-JR*=`EXc+5Es9fS+emfm#^47Thx19 zQ;Qx)MEN$)6srekuYTBXp4r7&XM49_JmB}moumsk*FJT61)7U8Ic}n}A+AMuG;Zhyv_h-Gbj}=WkHnL!x^*TfAP|vB=(jI-h zH}#h@-t$65S@(R*kN-7(N(Z~=-A0V`@%F!XI={byOuM$-^rX}NO}o8rFS(Jha6sS5 zg)NUn1TM<@IIUh;xFj)a_4T!#&xJkQx3$MP`G5-@Hl6<{-H~c{<;9z%VHdhiyp=X+ z+174u{a)5B{d($W+qmCnX$2!R*4jy|(${IS@{Uj->+oxHb2L~D*9lSvn zvwB|a{YyXJ-?#plw|VEh6w~{!vnD2OZhhiwy$LlNcA0u=yn*ArV=a6?JkC6{Xv>D& zM_&R<=l`xKO{-NfT$a$fk@1DXmGh4^%YG5JtD%)?}RfX|cDQyY(+c%|f6?4=U(rE_0@ z$=lm}{m9|N!>+m^~%d<>uv`n+c(R;(QRQ?`wtEM zcMKnQ`j@Orbv$~k`3nqo^~lAc0bK^fya;^qIDXL8psWWMvfM^T6`Wt5HTr6u!=aZi z__ygCQ26w52n=?*W4YOd+@(7q#U4g_SI%%qPhhA4k~N~MCN$C2j`-Bf`ZQs*WXO<9 zMHVv~+@5@X<7I=aKO36`ggCc<`O0jQhs0}AuLBGE@13`}*ec2Ij&Hwpc9N#g<_ueS zc+l>b(?jE|+vQvj>a(Tigy};9U)Ab6Gg21az&TFl$%Gy~WAB=OaFiOkehPks$4u^Zwozzg{7>b7L1Ow|{G5m($nV!8+l4`xe8mXB)~^ z951=&v}j$F=u~`B(I0o!MHdJ?;rN=q87`2qE}r>rY@{{=?vziH+Xw8)ngDV$a;H$5 zjJ7bh)R~j8%+KY-ozG}EUFG9M&2G3nxK5zq@d(zxY;DSMiP%U(m{#F_gdu@76uAeR z?Sw3XCHx}RG6NFGG{B~@Fr-6b`nOgrw0>*FM2Is?`f3fg4Xp_238!%yz{0&%2V8l> z76~kcgs>w~oqD#U*6yfv^DI+yc^cxzdX9$QsxXjZd2tV+4}`8D8Utb& zmJ>%v>jTgDCxo=FLVxPI6vEsEBwx}10+8#un&IXdb#($B)kr$EFeD_?CxX%&wB^o@&T<7zX?X*7c}sKIFSf( zhXJ2Mv0%WhI)!0`i=q9w81mp^SRY&rkBf3KbUH2udoDT{7KuxTHN(Zw6S){J=eQUy zzquIf1Lz6VzqfN z+#%!A(O;p`u57_8m2booa0A7Br_387;G*cd@0 z(J>*!(DN{cs2uV$F#!pO-$Wv0oDkE55QE?I2}MfGAqT&wDTR<4PHlxCCAJ}?lu(q& zO%APuro_WK;V{t9JPlNogrY=la;l#|3Al+FPW3g&Q>3CS3&Aw>b~*g43C$DtdBdqC zD^kMIFG@Pi6XnVw2TduaD6tNaQmn-jTSg?2swnYgUpVzuaHKTSqQv$h$DK{eX1K{3 zPHBXc5-mz>MRNS%8;a7HqC`1JX$+KY;hDA)YSxkxwWswu~9ki|)MTxnQ62}n@ zCCZiKFV0Yu<`gBiS)|0@#X?Ft4N*D%@C-#+oub5+jFdPYX($nm#O-N{(t@JI_KlPl 
zKp6$kgd6~!5p4xITLX%5^i4U$!&Px??d&-E89R=xoE^s&&W@vJu;bXW*>Uu^Rv?{` zhjw76V`LbIq_i5JeUg))_@0d<-ue;m>Un~&cZSU?WmEop%xZRVIC}u8F4*%FfSg=n+NmZ z!F+izKNglLXiw>alMu$H4AzK+G5XPO(n1B0q@E=Nu;?-c9Vvc+ER4wud)jIl4E7Tb z)|3Ye;=!8nV8J|Ca~>>&g=GplQ|(j=7~|L2ZqfJHar8TO9DR-*M}K3tEA%yX9Q{l~ zUk%o4*w-i7VyB}=vE%4X>^OQ7JC0t&j-v;$6h&*+C< zgS`sVtD(oRJvgJoEx~I(h)can_B!*f*m0D>*uNC)a1G*pSoU~H z*5L?Q)*1Q&Sgn=ed;?5KB=Aw47?d>!ygCt;pJaI9o#cIJoDT z(m1>$rk0j+c!P_hTn79mph!T!$A#D9FpoKCK)>^~px*ETq`L5lj#-u=lmK;I3r6Ca zfC}oXh^U(>HP53FT+87c2S;}+@)`#^oZ(S2uzYyEq~1Jh>nW_Xo{XkK;K;8h%vS$( zJq;?YC+de6m_U!7FjxM6(zCg+204J!O%jS9s%t9jF-&DlPGPtLz`DV;FxGip^fH>7 zHXxxOIA(%{6u^uSoOpYIqCcFi7y;)fnwFDyY}fJcgX(5P zZ1$Lt@JF_55@aRW>j~)#ClC6vN(*(52YU>{>5Ua6tGfOx3w&2XeF_-|(Xc?|UiQ*= zUt;=Mte+_%4PCci#}RN11awAB4~2lf6afJ%ry~TM%lCnR>yF6~&{t+dz&ZUg2ur0X}2?BL#(j6`rT#IOmS;pOiXe_>klg(&%1E{RfX2GQ6L(<(MQz3Z#WX zAin3We*St@Ks|qcQuo;6>(~`lT~!vSvOtvusw_}tfhr4BS)j@SRTikSK$QimEKp^E zDhqHdpl|(;>vZ9jGgtDx>@8-^gZ011mrbzSLJT?Bd%(6p!1X)s2}D4^J)&3$_>QwH z1Pcf~A>bZBKM44~ZYTuY8yEut_X=_$Fna^^8s37+53t^QJz$J7F+b}YH^dZhR38e6hJeokjJfRRUj?c(9F}6*3VnkG%c99(@4EHWt0NxXV z7y|M!h3^Y7{MdI)$1T#D5GuL@hV&@E>Z-Cpl?AFSP-THC3shO4$^umusIown1*$Ai zWq~RS{F@eF*1EWE#rN@e$_Ur3xNgQZJ)TL%(;&E($MrC-+41x+o(qO$Hg%R4-}&R( z9@n{eHW%0TxCekIlJKl8p0C9-wfIgS&(7ky9M`#02=yU2LU4lM3<1vsHh=)jY&cs& zs8fFKaL3bp`2S3M!W~cQ;psdd2zUw)Pu}5aJ50w@b$F5v&$8f}9#6^P$v8X>hw0xS z;JxaiEf5DMa`3lJq;U3aFx*q&H&7~bSrbF*C=PPNJrk(}KM6TReZrwKo(t{Iz+FIk zDj7(6Q+f?WdyhJ&+wX9=HtQMB)(mUwcsp1yuu;)2GRQQT|3KSS9~|019$AWdHyG literal 0 HcmV?d00001 diff --git a/secse/growing/filter.py b/secse/growing/filter.py index 59d96eb..ec79ce0 100755 --- a/secse/growing/filter.py +++ b/secse/growing/filter.py @@ -33,10 +33,17 @@ def __init__(self, gen, config_path): self.input_smiles = None self.mol = None self.pains_smarts = None - self.strutFilter = StructureFilter() config = configparser.ConfigParser() config.read(config_path) + + substructure_filter_file = config.get("properties", "substructure_filter") + if 
substructure_filter_file == "0": + self.strutFilter = StructureFilter() + else: + # print("Use additional substructure filter patters.") + self.strutFilter = StructureFilter(substructure_filter_file) + self.MW = config.getfloat("properties", "MW") self.logP_lower = config.getfloat("properties", "logP_lower") self.logP_upper = config.getfloat("properties", "logP_upper") @@ -143,19 +150,7 @@ def alert_filter(self): yield "PAINS" yield "PASS" - def element_filter(self): - f_count = self.input_smiles.count("F") - br_count = self.input_smiles.count("Br") - cl_count = self.input_smiles.count("Cl") - i_count = self.input_smiles.count("I") - s_count = self.input_smiles.count("S") + self.input_smiles.count("s") - p_count = self.input_smiles.count("P") - if not all([f_count <= 5, br_count <= 2, cl_count <= 3, i_count <= 1, s_count <= 2, p_count <= 1]): - yield "element" - yield "PASS" - def substructure_filter(self): - # self.element_filter() yield self.strutFilter.sfilter(self.mol) def ring_system_filter(self): diff --git a/secse/utilities/substructure_filter.py b/secse/utilities/substructure_filter.py index fc98543..bb2f5d4 100755 --- a/secse/utilities/substructure_filter.py +++ b/secse/utilities/substructure_filter.py @@ -9,13 +9,13 @@ import pandas as pd from rdkit import Chem -FILTER_FILE = "Structure Filter_20211015_v1.12.xls" +FILTER_FILE = os.path.join(os.getenv("SECSE"), "utilities", "Structure Filter_20211015_v1.12.xls") class StructureFilter: - def __init__(self): - df = pd.read_excel(os.path.join(os.getenv("SECSE"), "utilities", FILTER_FILE), - usecols=["Pattern", "ID", "Max"]).dropna() + def __init__(self, filter_lst=FILTER_FILE): + df = pd.read_excel(filter_lst, usecols=["Pattern", "ID", "Max"]).dropna() + df["ID"] = df["ID"].astype(str) df = df.set_index("ID") df["Pattern_sma"] = df["Pattern"].apply(lambda x: Chem.MolFromSmarts(x)) self.fdic = df[["Pattern_sma", "Max"]].T.to_dict()