diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 00000000..10636716 --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,25 @@ +name: JOSS paper build + +on: [push] + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: joss_paper/paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: joss_paper/paper.pdf diff --git a/joss_paper/Fig1.pdf b/joss_paper/Fig1.pdf new file mode 100644 index 00000000..c36d291f Binary files /dev/null and b/joss_paper/Fig1.pdf differ diff --git a/joss_paper/paper.bib b/joss_paper/paper.bib new file mode 100644 index 00000000..75b74b6f --- /dev/null +++ b/joss_paper/paper.bib @@ -0,0 +1,322 @@ +@article{bennett1976efficient, + title={Efficient estimation of free energy differences from Monte Carlo data}, + author={Bennett, C. 
H.}, + journal={Journal of Computational Physics}, + volume={22}, + number={2}, + pages={245-268}, + year={1976}, + doi={10.1016/0021-9991(76)90078-4}, +} + + +@article{Case2005uq, + author = {Case, David A and Cheatham, 3rd, Thomas E and Darden, Tom and Gohlke, Holger and Luo, Ray and Merz, Jr, Kenneth M and Onufriev, Alexey and Simmerling, Carlos and Wang, Bing and Woods, Robert J}, + journal = {Journal of Computational Chemistry}, + number = {16}, + pages = {1668-1688}, + title = {The {Amber} biomolecular simulation programs}, + volume = {26}, + year = {2005}, + doi = {10.1002/jcc.20290} +} + + +@article{chodera2007use, + title={Use of the Weighted Histogram Analysis Method for the Analysis of Simulated and Parallel Tempering Simulations}, + author={J. D. Chodera and Swope, W. C. and Pitera, J. W. and Seok, C. and Dill, K. A.}, + journal={Journal of Chemical Theory and Computation}, + volume={3}, + number={1}, + pages={26-41}, + year={2007}, + doi={10.1021/ct0502864}, +} + + +@article{Nejahi2021aa, + author = {Younes Nejahi and Mohammad Soroush Barhaghi and Gregory Schwing and Loren Schwiebert and Jeffrey Potoff}, + journal = {SoftwareX}, + pages = {100627}, + title = {Update 2.70 to ``{GOMC}: {GPU} Optimized {Monte Carlo} for the simulation of phase equilibria and physical properties of complex fluids''}, + volume = {13}, + doi = {10.1016/j.softx.2020.100627}, + year = {2021} +} + + +@article{deng2009computations, + title={Computations of standard binding free energies with molecular dynamics simulations}, + author={Deng, Y. and Roux, B.}, + journal={The Journal of Physical Chemistry B}, + volume={113}, + number={8}, + pages={2234-2246}, + year={2009}, + doi={10.1021/jp807701h}, +} + +@article{gusev2023active, + title={Active Learning Guided Drug Design Lead Optimization Based on Relative Binding Free Energy Modeling}, + author={Gusev, F. and Gutkin, E. and Kurnikova, M. G. 
and Isayev, O.}, + journal={Journal of Chemical Information and Modeling}, + volume={63}, + number={2}, + pages={583-594}, + year={2023}, + doi={10.1021/acs.jcim.2c01052}, +} + + +@article{Hedges2019aa, + author = {Lester O. Hedges and Antonia S.J.S. Mey and Charles A. Laughton and Francesco L. Gervasio and Adrian J. Mulholland and Christopher J. Woods and Julien Michel}, + journal = {Journal of Open Source Software}, + keywords = {free energy, framework, python, MD SIMULATION}, + number = {43}, + pages = {1831}, + publisher = {The Open Journal}, + title = {{BioSimSpace}: An interoperable {Python} framework for biomolecular simulation}, + volume = {4}, + year = {2019}, + doi = {10.21105/joss.01831} +} + +@article{kirkwood1935statistical, + title={Statistical mechanics of fluid mixtures}, + author={Kirkwood, J. G.}, + journal={The Journal of Chemical Physics}, + volume={3}, + number={5}, + pages={300-313}, + year={1935}, + doi={10.1063/1.1749657}, +} + +@article{klimovich2015guidelines, + title={Guidelines for the analysis of free energy calculations}, + author={Klimovich, P. V. and Shirts, M. R. and Mobley, D. L.}, + journal={Journal of Computer-Aided Molecular Design}, + volume={29}, + number={5}, + pages={397-411}, + year={2015}, + doi={10.1007/s10822-015-9840-9}, +} + +@book{merz2010drug, + title={Drug design: structure- and ligand-based approaches}, + author={Merz, Jr, K. M. and Ringe, D. and Reynolds, C. H.}, + year={2010}, + publisher={Cambridge University Press}, + doi = {10.1017/CBO9780511730412}, +} + +@article{paliwal2011benchmark, + title={A Benchmark Test Set for Alchemical Free Energy Transformations and Its Use to Quantify Error in Common Free Energy Methods}, + author={Paliwal, H. and Shirts, M.
R.}, + journal={Journal of Chemical Theory and Computation}, + volume={7}, + number={12}, + pages={4115-4134}, + year={2011}, + doi={10.1021/ct2003995}, +} + +@article{pham2011identifying, + title={Identifying low variance pathways for free energy calculations of molecular transformations in solution phase}, + author={Pham, T. T. and Shirts, M. R.}, + journal={The Journal of Chemical Physics}, + volume={135}, + number={3}, + pages={034114}, + year={2011}, + doi={10.1063/1.3607597}, +} + +@article{phillips2020scalable, + title={Scalable molecular dynamics on CPU and GPU architectures with NAMD}, + author = {Phillips,James C. and Hardy,David J. and Maia,Julio D. C. and Stone,John E. and Ribeiro,Jo{\~a}o V. and Bernardi,Rafael C. and Buch,Ronak and Fiorin,Giacomo and H{\'e}nin,J{\'e}r{\^o}me and Jiang,Wei and McGreevy,Ryan and Melo,Marcelo C. R. and Radak,Brian K. and Skeel,Robert D. and Singharoy,Abhishek and Wang,Yi and Roux,Beno{\^\i}t and Aksimentiev,Aleksei and Luthey-Schulten,Zaida and Kal{\'e},Laxmikant V. and Schulten,Klaus and Chipot,Christophe and Tajkhorshid,Emad}, + journal={The Journal of Chemical Physics}, + volume={153}, + number={4}, + pages={044130}, + year={2020}, + doi={10.1063/5.0014475}, +} + +@article{pohorille2010good, + title={Good practices in free-energy calculations}, + author={Pohorille, A. and Jarzynski, C. and Chipot, C.}, + journal={The Journal of Physical Chemistry B}, + volume={114}, + number={32}, + pages={10235-10253}, + year={2010}, + doi={10.1021/jp102971x}, +} + + +@article{Abraham2015aa, + author = {Abraham, Mark James and Murtola, Teemu and Schulz, Roland and P{\'a}ll, Szil{\'a}rd and Smith, Jeremy C. 
and Hess, Berk and Lindahl, Erik}, + journal = {SoftwareX}, + pages = {19--25}, + title = {{GROMACS}: High performance molecular simulations through multi-level parallelism from laptops to supercomputers}, + volume = {1--2}, + year = {2015}, + doi = {10.1016/j.softx.2015.06.001} +} + +@article{shirts2008statistically, + title={Statistically optimal analysis of samples from multiple equilibrium states}, + author={Shirts, M. R. and Chodera, J. D.}, + journal={The Journal of Chemical Physics}, + volume={129}, + number={12}, + pages={124105}, + year={2008}, + doi={10.1063/1.2978177}, +} + +@article{yang2004free, + title={Free energy simulations: use of reverse cumulative averaging to determine the equilibrated region and the time required for convergence}, + author={Yang, W. and Bitetti-Putzer, R. and Karplus, M.}, + journal={The Journal of Chemical Physics}, + volume={120}, + number={6}, + pages={2618-2628}, + year={2004}, + doi={10.1063/1.1638996}, +} + +@article{zwanzig1954high, + title={High-temperature equation of state by a perturbation method. I. Nonpolar gases}, + author={Zwanzig, R. W.}, + journal={The Journal of Chemical Physics}, + volume={22}, + number={8}, + pages={1420-1426}, + year={1954}, + doi={10.1063/1.1740409}, +} + +@article{fan2020aa, + author = {Fan, Shujie and Iorga, Bogdan I. and Beckstein, Oliver}, + journal = {Journal of Computer-Aided Molecular Design}, + pages = {543--560}, + title = {Prediction of octanol-water partition coefficients for the {SAMPL6}-{$\log P$} molecules using molecular dynamics simulations with {OPLS-AA}, {AMBER} and {CHARMM} force fields}, + volume = {34}, + year = {2020}, + doi = {10.1007/s10822-019-00267-z} +} + + +@article{Mey2020aa, + author = {Antonia S. J. S. Mey and Bryce Allen and Hannah E. Bruce Macdonald and John D. Chodera and Maximilian Kuhn and Julien Michel and David L. Mobley and Levi N. Naden and Samarjeet Prasad and Andrea Rizzi and Jenke Scheen and Michael R.
Shirts and Gary Tresadern and Huafeng Xu}, + journal = {Living Journal of Computational Molecular Science}, + number = {1}, + pages = {18378}, + title = {Best Practices for Alchemical Free Energy Calculations}, + volume = {2}, + year = {2020}, + doi = {10.33011/livecoms.2.1.18378} +} + + +@article{Chodera2016aa, + author = {J. D. Chodera}, + journal = {Journal of Chemical Theory and Computation}, + month = {Apr}, + number = {4}, + pages = {1799--1805}, + title = {A Simple Method for Automated Equilibration Detection in Molecular Simulations}, + volume = {12}, + year = {2016}, + doi = {10.1021/acs.jctc.5b00784} +} + +@InProceedings{ mckinney-proc-scipy-2010, + author = { {W}es {M}c{K}inney }, + title = { {D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython }, + booktitle = { {P}roceedings of the 9th {P}ython in {S}cience {C}onference }, + pages = { 56 - 61 }, + year = { 2010 }, + editor = { {S}t\'efan van der {W}alt and {J}arrod {M}illman }, + doi = { 10.25080/Majora-92bf1922-00a } +} + + +@article{scikitlearn2011, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} + +@inproceedings{sklearn2013api, + author = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and + Fabian Pedregosa and Andreas Mueller and Olivier Grisel and + Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort + and Jaques Grobler and Robert Layton and Jake VanderPlas and + Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux}, + title = {{API} design for machine learning software: experiences from the scikit-learn + project}, + booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning}, + year = {2013}, + pages = {108--122}, + doi = {10.48550/arXiv.1309.0238}, +} + + +@incollection{Schlaich2015aa, + address = {J{\"u}lich}, + author = {Schlaich, A and Kowalik, B and Kandu{\v{c}}, M and Schneck, Emanuel and Netz, RR}, + booktitle = {Computational Trends in Solvation and Transport in Liquids-Lecture Notes (IAS Series 28)}, + editor = {Sutmann, G. and Grotendorst, J. and Gompper, G. and Marx, D.}, + pages = {155--185}, + publisher = {Forschungszentrum J{\"u}lich GmbH}, + title = {Simulation techniques for solvation-induced surface-interactions at prescribed water chemical potential}, + volume = 28, + year = 2015 +} + + +@article{Thompson2022aa, + author = {Aidan P. Thompson and H. Metin Aktulga and Richard Berger and Dan S. Bolintineanu and W. Michael Brown and Paul S. Crozier and Pieter J. {in 't Veld} and Axel Kohlmeyer and Stan G. Moore and Trung Dac Nguyen and Ray Shan and Mark J. Stevens and Julien Tranchida and Christian Trott and Steven J. 
Plimpton}, + journal = {Computer Physics Communications}, + pages = 108171, + title = {{LAMMPS} - a flexible simulation tool for particle-based materials modeling at the atomic, meso, and continuum scales}, + volume = 271, + year = 2022, + doi = {10.1016/j.cpc.2021.108171}, +} + + +@Article{Salari2018, + author = {Salari, Reza and Joseph, Thomas and Lohia, Ruchi and Hénin, Jérôme and Brannigan, Grace}, + title = {A Streamlined, General Approach for Computing Ligand Binding Free Energies and Its Application to GPCR-Bound Cholesterol}, + journal = {Journal of Chemical Theory and Computation}, + year = {2018}, + volume = {14}, + number = {12}, + pages = {6560--6573}, + doi = {10.1021/acs.jctc.8b00447}, +} + + +@article{santiagomcrae2023, + author={Santiago-McRae, E. and Ebrahimi, M. and Sandberg, J. W. and Brannigan, G. and Hénin, J.}, + title={Computing Absolute Binding Affinities by Streamlined Alchemical Free Energy Perturbation (SAFEP) [Article v1.0]}, + url={https://livecomsjournal.org/index.php/livecoms/article/view/v5i1e2067}, + doi={10.33011/livecoms.5.1.2067}, + journal={Living Journal of Computational Molecular Science}, + year={2023}, + month={Oct.}, + volume={5}, + number={1}, + pages={2067} +} diff --git a/joss_paper/paper.md b/joss_paper/paper.md new file mode 100644 index 00000000..b58591eb --- /dev/null +++ b/joss_paper/paper.md @@ -0,0 +1,228 @@ +--- +title: 'alchemlyb: the simple alchemistry library' +tags: + - Python + - alchemistry + - molecular dynamics + - free energy +authors: + - name: Zhiyi Wu + orcid: 0000-0002-7615-7851 + equal-contrib: true + affiliation: 1 + - name: David L. Dotson + orcid: 0000-0001-5879-2942 + equal-contrib: true + affiliation: "2, 3" + - name: Irfan Alibay + orcid: 0000-0001-5787-9130 + affiliation: 4 + - name: Bryce K.
Allen + orcid: 0000-0002-0804-8127 + affiliation: 5 + - name: Mohammad Soroush Barhaghi + orcid: 0000-0001-8226-7347 + affiliation: 6 + - name: Jérôme Hénin + orcid: 0000-0003-2540-4098 + affiliation: 7 + - name: Thomas T. Joseph + orcid: 0000-0003-1323-3244 + affiliation: 8 + - name: Ian M. Kenney + orcid: 0000-0002-9749-8866 + affiliation: 2 + - name: Hyungro Lee + orcid: 0000-0002-4221-7094 + affiliation: 9 + - name: Haoxi Li + orcid: 0009-0004-8369-1042 + affiliation: 10 + - name: Victoria Lim + orcid: 0000-0003-4030-9312 + affiliation: 11 + - name: Shuai Liu + orcid: 0000-0002-8632-633X + affiliation: 12 + - name: Domenico Marson + orcid: 0000-0003-1839-9868 + affiliation: 13 + - name: Pascal T. Merz + orcid: 0000-0002-7045-8725 + affiliation: 14 + - name: Alexander Schlaich + orcid: 0000-0002-4250-363X + affiliation: 15 + - name: David Mobley + orcid: 0000-0002-1083-5533 + affiliation: 11 + - name: Michael R. Shirts + orcid: 0000-0003-3249-1097 + affiliation: 16 + - name: Oliver Beckstein + orcid: 0000-0003-1340-0831 + corresponding: true + affiliation: "2,17" +affiliations: + - name: Exscientia plc, Schroedinger Building, Oxford, United Kingdom + index: 1 + - name: Department of Physics, Arizona State University, Tempe, Arizona, United States of America + index: 2 + - name: Datryllic LLC, Phoenix, Arizona, United States of America (present affiliation) + index: 3 + - name: Open Free Energy, Open Molecular Software Foundation, Davis, California, United States + index: 4 + - name: Differentiated Therapeutics, San Diego, CA + index: 5 + - name: Department of Chemical Engineering and Materials Science, Wayne State University, Detroit, Michigan, United States of America + index: 6 + - name: Université Paris Cité, CNRS, Laboratoire de Biochimie Théorique, Paris, France + index: 7 + - name: Department of Anesthesiology and Critical Care, Perelman School of Medicine, University of Pennsylvania, Philadelphia, Pennsylvania, United States of America + index: 8 + - 
name: Pacific Northwest National Laboratory, Richland, Washington, United States of America + index: 9 + - name: UNC Eshelman School of Pharmacy, University of North Carolina, Chapel Hill, NC, United States of America + index: 10 + - name: Departments of Pharmaceutical Sciences and Chemistry, University of California Irvine, Irvine, California, United States of America + index: 11 + - name: Silicon Therapeutics LLC, Boston, United States of America + index: 12 + - name: Molecular Biology and Nanotechnology Laboratory (MolBNL@UniTS), DEA, University of Trieste, Trieste, Italy + index: 13 + - name: PM Scientific Consulting, Basel, Switzerland + index: 14 + - name: Stuttgart Center for Simulation Science (SC SimTech) & Institute for Computational Physics, University of Stuttgart, Stuttgart, Germany + index: 15 + - name: University of Colorado Boulder, Boulder, Colorado, United States of America + index: 16 + - name: Center for Biological Physics, Arizona State University, Tempe, AZ, United States of America + index: 17 + +date: 4 June 2024 +bibliography: paper.bib + +--- + +# Summary + +*alchemlyb* is an open-source Python software package for the analysis of alchemical free energy calculations, an important method in computational chemistry and biology, most notably in the field of drug discovery [@merz2010drug]. +Its functionality contains individual composable building blocks for all aspects of a full typical free energy analysis workflow, starting with the extraction of raw data from the output of diverse molecular simulation packages, moving on to data preprocessing tasks such as decorrelation of time series, using various estimators to derive free energy estimates from simulation samples, and finally providing quality analysis tools for data convergence checking and visualization. 
+*alchemlyb* also contains high-level end-to-end workflows that combine multiple building blocks into a user-friendly analysis pipeline from the initial data input stage to the final results. This workflow functionality enhances accessibility by enabling researchers from diverse scientific backgrounds, and not solely computational chemistry specialists, to use *alchemlyb* effectively. + + +# Statement of need + +In the pharmaceutical sector, computational chemistry techniques are integral for evaluating potential drug compounds based on their protein binding affinity [@deng2009computations]. +Notably, absolute binding free energy calculations between proteins and ligands or relative binding affinity of ligands to the same protein are routinely employed for this purpose [@merz2010drug]. +The resultant estimates of these free energies are essential for understanding binding affinity throughout various stages of drug discovery, such as hit identification and lead optimization [@merz2010drug]. +Other free energies extracted from simulations are useful in solution thermodynamics, chemical engineering, environmental science, and material science [@Schlaich2015aa]. + +Molecular simulation packages such as [GROMACS](https://www.gromacs.org/) [@Abraham2015aa], [Amber](https://ambermd.org/) [@Case2005uq], [NAMD](https://www.ks.uiuc.edu/Research/namd/) [@phillips2020scalable], [LAMMPS](https://lammps.org/) [@Thompson2022aa], and [GOMC](https://gomc-wsu.org/) [@Nejahi2021aa] are used to run free energy simulations and many of these packages also contain tools for the subsequent processing of simulation data into free energies. +However, there are no standard output formats and analysis tools implement different algorithms for the different stages of the free energy data processing pipeline. +Therefore, it is very difficult to analyze data from different simulation packages in a consistent manner. 
+Furthermore, the native analysis tools do not always implement current best practices [@klimovich2015guidelines; @Mey2020aa] or are out of date. +Overall, the coupling between data generation and analysis in most simulation packages hinders seamless collaboration and comparison of results across different implementations of data generation for free energy calculations. + +*alchemlyb* addresses this problem by focusing only on the data analysis portion of this process with the goal of providing a unified interface for working with free energy data generated from different software packages. +In an initial step, data are read from the native package file formats and then organized into a common standard data structure, implemented as a [*pandas*](https://pandas.pydata.org) `DataFrame` [@mckinney-proc-scipy-2010]. +Functions are provided for pre-processing data by subsampling or decorrelation. +Statistical mechanical estimators are available to extract free energies and thermodynamic expectations as well as associated metrics of quality; these estimators are implemented as classes with the same API as estimators in [scikit-learn](https://scikit-learn.org) [@scikitlearn2011; @sklearn2013api]. +*alchemlyb* implements modular building blocks to simplify the process of extracting crucial thermodynamic insights from molecular simulations in a uniform manner. + +*alchemlyb* succeeds the widely used but now deprecated [`alchemical-analysis.py` tool](https://github.com/MobleyLab/alchemical-analysis) [@klimovich2015guidelines], which combined pre-processing, free energy estimation, and plotting in a single script. +`alchemical-analysis.py` was not thoroughly tested, was hard to integrate into modern workflows due to its monolithic design, and only supported (now outdated) Python 2. +*alchemlyb* improves over its predecessor with a modular, function-based design and thorough testing of all components using continuous integration.
+Thus, *alchemlyb* is a library that enables users to easily use well-tested building blocks within their own tools while additionally providing examples of complete end-to-end workflows. +This innovation enables consistent processing of free energy data from diverse simulation packages, facilitating streamlined comparison and combination of results. + +Notably, *alchemlyb*'s robust and user-friendly nature has led to its integration into other automated workflow libraries such as BioSimSpace [@Hedges2019aa] or MDPOW [@fan2020aa], demonstrating its accessibility and usability within broader scientific workflows and reinforcing its position as a versatile tool in the field of computational chemistry. + + +# Background: Alchemical free energy calculations + +Free energy differences are fundamental to understanding many different processes at the molecular scale, ranging from the binding of drug molecules to their receptor proteins or nucleic acids through the partitioning of molecules into different solvents or phases to the stability of crystals and biomolecules [@deng2009computations]. +The calculation of such transfer free energies involves constructing two end states where a target molecule interacts with different environments. +For example, in a solvation free energy calculation, in one state (the coupled state) the target molecule interacts with a solvent (in the case of hydration free energies, water), while in the other state (the decoupled state) the target molecule has no intermolecular interactions, which mimics the transfer of the molecule from infinite dilution in the solvent to the gas phase. +The solvation free energy is then obtained by calculating the free energy difference between these two end states, but it is crucial to ensure sufficient overlap in phase space between the coupled and decoupled states, a condition often challenging to achieve.
+ +Stratified alchemical free energy calculations have emerged as a de-facto standard approach whereby non-physical intermediate states are introduced to bridge between the physical end states of the process [@Mey2020aa]. +In such free energy calculations, overlapping states are created by the introduction of a parameter $\lambda$ that continuously connects the functional form (the Hamiltonian of the system) of the two end-states, resulting in a series of intermediate states each with a different $\lambda$ value between 0 and 1 and with the physically realizable end states at $\lambda=0$ and $\lambda=1$. +In general, $N$ alchemical parameters are used to describe the alchemical transformation with a parameter vector $\vec{\lambda}=(\lambda_1, \lambda_2, \dots, \lambda_N)$, so that $\vec{\lambda}=(0, 0, \dots, 0)$ indicates the initial and $\vec{\lambda} = (1, 1, \dots, 1)$ the final state. +The intermediate states are non-physical but required for converging the calculations. +At each $\vec{\lambda}$-value (or "window"), the system configurations are sampled in the relevant thermodynamic ensemble, typically using Molecular Dynamics (MD) or Monte Carlo (MC) simulations, while generating and accumulating free energy data discussed below. +Estimators are then applied to these data to compute free energy differences between states, including the difference between the final and initial state, thus yielding the desired free energy difference of the physical process of interest. + +# Implementation +## Core design principles + +*alchemlyb* is a Python library that seeks to make alchemical free energy calculations easier and less error prone. +It includes functionality for parsing data from file formats of widely used simulation packages, subsampling these data, and fitting these data with an estimator to obtain free energies. 
+Functions are simple in usage and pure in scope, and can be chained together to build customized analyses of data while estimators are implemented as classes that follow the tried-and-tested scikit-learn API [@sklearn2013api]. +General and robust workflows following best practices are also provided, which can be used as reference implementations and examples. + +First and foremost, scientific code must be correct and we try to ensure this requirement by following best software engineering practices during development, close to full test coverage of all code in the library (currently 99%), and providing citations to published papers for included algorithms. +We use a curated, public data set ([*alchemtest*](https://github.com/alchemistry/alchemtest)) for automated testing; code in *alchemtest* is published under the open source BSD-3 clause license while all data are included under an [open license](https://opendefinition.org/licenses/#recommended-conformant-licenses) such as [CC0](https://creativecommons.org/publicdomain/zero/1.0/) (public domain) or [CC-BY](http://opendefinition.org/licenses/cc-by/) (attribution required). + +The guiding design principles are summarized as: + +1. Use functions when possible, classes only when necessary (or for estimators, see (2)). +2. For estimators, mimic the object-oriented scikit-learn API as much as possible. +3. Aim for a consistent interface throughout, e.g. all parsers take similar inputs and yield a common set of outputs, using the `pandas.DataFrame` as the underlying data structure. +4. Have *all* functionality tested. + +*alchemlyb* supports recent versions of Python 3 and follows the [SPEC 0 (Minimum Supported Dependencies)](https://scientific-python.org/specs/spec-0000/) Scientific Python Ecosystem Coordination community standard for deciding on when to drop support for older versions of Python and dependencies. 
+Releases are numbered following the [Semantic Versioning 2.0.0](https://semver.org/) standard of MAJOR.MINOR.PATCHLEVEL, which ensures that users immediately understand if a release may break backwards compatibility (increase of the major version), adds new features (increase of the minor version), or only contains bug fixes or other changes that do not directly affect users. +All code is published under the open source BSD-3 clause license. + +## Library structure + +*alchemlyb* offers specific parsers in `alchemlyb.parsing` to load raw free energy data from various molecular simulation packages ([GROMACS](https://www.gromacs.org/) [@Abraham2015aa], [Amber](https://ambermd.org/) [@Case2005uq], [NAMD](https://www.ks.uiuc.edu/Research/namd/) [@phillips2020scalable], and [GOMC](https://gomc-wsu.org/) [@Nejahi2021aa]) and provides a general structure for implementing parsers for other packages that are not yet supported. +The raw data are converted into a standard format as a `pandas.DataFrame` [@mckinney-proc-scipy-2010], with energies converted from the native energy unit of the simulation package to units of $k T$, where $k = 1.380649 \times 10^{-23}\,\text{J}\,\text{K}^{-1}$ is Boltzmann's constant and $T$ is the temperature at which the simulation was performed. +Metadata such as $T$ and the energy unit are stored in `DataFrame` attributes and propagated through *alchemlyb*, which enables seamless unit conversion with functions in the `alchemlyb.postprocessing` module. +Two types of free energy data are considered: Hamiltonian gradients (`dHdl`, $dH/d\lambda$) at all lambda states, suitable for thermodynamic integration (TI) estimators [@kirkwood1935statistical], and reduced potential energy differences between lambda states (`u_nk`, $u_{nk}$), which are used for free energy perturbation (FEP) estimators [@zwanzig1954high]. + +Both types of estimators assume uncorrelated samples in order to give unbiased estimates of the uncertainties, which requires subsampling of the raw data.
+The `alchemlyb.preprocessing.subsampling` module provides tools for data subsampling based on autocorrelation times [@chodera2007use; @Chodera2016aa] as well as simple slicing of the `dHdl` and `u_nk` DataFrames. + +The two major classes of commonly used estimators are implemented in `alchemlyb.estimators`. +Unlike other components of *alchemlyb* that are implemented as pure functions, estimators are implemented as classes and follow the well-known scikit-learn API [@sklearn2013api] where instantiation sets the parameters (e.g., `estimator = MBAR(maximum_iterations=10000)`) and calling the `fit()` method (e.g., `estimator.fit(u_nk)`) applies the estimator to the data and populates output attributes of the class; these result attributes are customarily indicated with a trailing underscore (e.g., `estimator.delta_f_` for the matrix of free energy differences between all states). +In *alchemlyb*, TI [@paliwal2011benchmark] and TI with Gaussian quadrature [@gusev2023active] estimators are implemented in the TI category of estimators (module `alchemlyb.estimators.TI`). +FEP category estimators (module `alchemlyb.estimators.FEP`) include Bennett Acceptance Ratio (BAR) [@bennett1976efficient] and Multistate BAR (MBAR) [@shirts2008statistically], which are implemented in the [*pymbar*](https://github.com/choderalab/pymbar) package [@shirts2008statistically] and called from *alchemlyb*. + +To evaluate the accuracy of the free energy estimate, *alchemlyb* offers a range of assessment tools. +The error of the TI method is correlated with the average curvature [@pham2011identifying], while the error of FEP estimators depends on the overlap in sampled energy distributions [@pohorille2010good].
+*alchemlyb* creates visualizations of the smoothness of the integrand for TI estimators and the overlap matrix for FEP estimators, which can be qualitatively and quantitatively analyzed to determine the degree of overlap between simulated alchemical states, and suggest whether additional simulations should be run. +For statistical validity, the accumulated samples should be collected from equilibrated simulations and *alchemlyb* contains tools for assessing (`alchemlyb.convergence`) and plotting (`alchemlyb.visualisation`) the convergence of the free energy estimate as a function of simulation time [@yang2004free] and means to compute the "fractional equilibration time" [@fan2020aa] to detect potentially non-equilibrated data. + +*alchemlyb* offers all these tools as a library for users to customize each stage of the analysis (\autoref{fig:buildingblocks}). + +![The building blocks of *alchemlyb*. Raw data from simulation packages are parsed into common data structures depending on the free energy quantities, pre-processed, and processed with a free energy estimator. The resulting free energy differences are analyzed for convergence and plotted for quality assessment.\label{fig:buildingblocks}](Fig1.pdf) + + +## Workflows + +The building blocks are sufficient to compute free energies from alchemical free energy simulations and assess their reliability. +This functionality is used, for example, by the Streamlined Alchemical Free Energy Perturbation (SAFEP) analysis scripts [@Salari2018; @santiagomcrae2023]. + +*alchemlyb* also provides a structure to combine the building blocks into full end-to-end workflows (module `alchemlyb.workflows`). +As an example, the `ABFE` workflow for absolute binding free energy estimation reads in the raw input data and performs decorrelation, estimation, and quality plotting of the estimate. 
+It can directly estimate quantities such as solvation free energies and makes it easy to calculate more complex quantities such as absolute binding free energies (as the difference between the solvation free energy of the ligand in water and the solvation free energy of the ligand in the protein's binding pocket). + + +# Acknowledgements + +Some work on *alchemlyb* was supported by grants from the National +Institutes of Health (Award No. R01GM118772 to O.B., R35GM148236 to +D.M., K08GM139031 to T.T.J.) and the National Science Foundation (award ACI-1443054 to O.B.). A.S. acknowledges funding by Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Germany's Excellence Strategy - EXC 2075 – 390740016 and support by the Stuttgart Center for Simulation Science (SimTech). +The sponsors were not involved in any aspects of the research or the writing of the manuscript. + +We thank Dominik Wille, Travis Jensen, and Jennifer A. Clark for substantial code contributions, Helmut Carter and Wei-Tse Hsu for small fixes, Shujie Fan for initial code for fractional equilibration time calculation, and Jan Janssen for creating the initial conda-forge package. + +# Author contributions + +D.L.D., M.R.S., D.M., and O.B. designed the project. Z.W., D.L.D., I.A., B.K.A., M.S.B., J.H., T.T.J., I.M.K., H.L., H.L., V.L., S.L., D.M., P.T.M., and A.S. contributed to new features. Z.W., D.L.D., and O.B. maintained the code base. Z.W., D.L.D., M.R.S., A.S., P.T.M., and O.B. wrote the manuscript. + +# References + +