diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 0000000..118a9e4 --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,29 @@ +name: documentation + +on: [push, pull_request, workflow_dispatch] + +permissions: + contents: write + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - name: Install dependencies + run: | + sudo apt-get install pandoc + pip install -r ./docs/requirements.txt + - name: Sphinx build + run: | + cd ./docs/ + make html + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + with: + publish_branch: gh-pages + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: docs/build/html + force_orphan: true\ diff --git a/.gitignore b/.gitignore index ed0313b..544ae91 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ -# currently useful info, not for the package -misc/ -nist_map/ +# project-specific +update/data/* # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index abe9556..bcfa3ad 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,13 @@ # NistChemPy: Python API for NIST Chemistry WebBook -[NIST Chemistry WebBook](https://webbook.nist.gov/) is a public database containing physico-chemical data which was carifully verified by specialists. However, the only way to retrieve information is to use browser, and no API is available. Moreover, search results are limited to 400 found compounds which is convenient for manual search of several compounds but unsuitable for chemoinformatics. +**NistChemPy** is an unofficial API for the [NIST Chemistry WebBook](https://webbook.nist.gov/). -NistChemPy is designed to solve this problem. It supports search by compound name, InChI/InChI-key, CAS RN, and chemical formula, and downloading key properties of retrieved compounds. 
Search object is designed in a way that it is easy to automate the search for all the necessary substances without exceeding the limit of 400 pieces. +This package not only automates the search and data extraction processes but also bypasses the WebBook's limitation of 400 compounds per search. + +Currently, **NistChemPy** enables the extraction of basic compound properties as well as IR, THz, MS, and UV-Vis spectra. + +Additional properties are available via URLs that link to their respective web pages, with potential support for direct extraction in future updates. -At the moment the code only supports IR, MS and UV/Vis spectra; support for other thermodynamic properties may be added later. ## Installation @@ -14,6 +17,12 @@ Install NistChemPy using [pip](https://pypi.org/project/NistChemPy/): pip install nistchempy ``` +> [!WARNING] +> Please note that versions starting with 1.0.0 are not backward compatible with the older alpha versions due to significant changes in the code structure. +> You may need to update your nistchempy-based code or use the older nistchempy versions. + + ## How To -The main NistChemPy features including search and compound manipulations are shown in the [tutorial](https://github.com/EPiCs-group/NistChemPy/blob/main/tutorial.ipynb). +The primary features of NistChemPy, such as search capabilities and compound manipulations, are detailed in the [documentation](https://ivanchernyshov.github.io/NistChemPy/source/api.html). + diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..6f6062a --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = ./ +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". 
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..7898853 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,75 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import sys, os +sys.path.insert(0, os.path.abspath('../')) + + +# -- Project information ----------------------------------------------------- + +project = 'NistChemPy' +copyright = '2023, Ivan Yu. Chernyshov' +author = 'Ivan Yu. Chernyshov' + +# The full version, including alpha/beta/rc tags +release = '0.3.0' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'nbsphinx', + #'sphinx_gallery.load_style', + #'myst_parser' +] + +# Add any paths that contain templates here, relative to this directory. +#templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. 
+# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', '**.ipynb_checkpoints'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +#html_static_path = ['_static'] + +# -- Logo ---------------------------------------------------------- + +#html_theme_options = { +# 'logo_only': False, +#} +#html_logo = 'source/images/mace_favicon.png' +#html_favicon = 'source/images/mace_favicon.png' + + +# -- Options for exts ---------------------------------------------- + +# myst-nb option; NOTE(review): nbsphinx, the notebook extension enabled in this file, does not read it (its counterpart is nbsphinx_execute = 'never') - confirm +nb_execution_mode = "off" +#nb_execution_cache_path = "source/cache" + diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..bf5b1d3 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,74 @@ +========================================== +Welcome to NistChemPy's documentation! +========================================== + +.. toctree:: + :hidden: + :maxdepth: 2 + :caption: Cookbook: + + Basic Search + Compound Properties + Advanced Search + + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Package details: + + Package API + Changelog + + +**NistChemPy** is an unofficial API for the `NIST Chemistry WebBook`_. +This package not only automates the search and data extraction processes but also bypasses the WebBook's limitation of 400 compounds per search. +Currently, NistChemPy enables the extraction of basic compound properties as well as IR, THz, MS, and UV-Vis spectra. +Additional properties are available via URLs that link to their respective web pages, with potential support for direct extraction in future updates. 
+ + +Installation +============ + +**NistChemPy** can be installed as a `PyPI package`_: + +.. code-block:: + + > pip install nistchempy + + +Requirements +============ + +1. Python 3.7+; + +2. requests; + +3. bs4; + +4. pandas; + +5. importlib_resources (for Python 3.7 and 3.8). + + +Useful links +============ + +1. `NIST Chemistry WebBook`_: webapp accessing the NIST Chemistry WebBook database. +2. `GitHub`_: GitHub page of the package. +3. `PyPI package`_: PyPI page of the package. +4. `Update tools`_: script for semi-automatic update of structural information of new NIST Chemistry WebBook compounds. + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` + + + +.. _NIST Chemistry WebBook: https://webbook.nist.gov/chemistry/ +.. _GitHub: https://github.com/IvanChernyshov/NistChemPy +.. _PyPI package: https://pypi.org/project/nistchempy/ +.. _Update tools: https://github.com/IvanChernyshov/NistChemPy/tree/main/update diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..390a749 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=./ +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..03783c8 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,13 @@ +# nistchempy +requests +pandas +bs4 +importlib_resources ; python_version < '3.9' + +# docs +sphinx +sphinx_gallery +sphinx-rtd-theme +notebook +pandoc +nbsphinx diff --git a/docs/source/advanced_search.ipynb b/docs/source/advanced_search.ipynb new file mode 100644 index 0000000..029c668 --- /dev/null +++ b/docs/source/advanced_search.ipynb @@ -0,0 +1,1735 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8d7cb2ed-a704-4b35-aa29-6996a9e80272", + "metadata": {}, + "source": [ + "# Advanced search" + ] + }, + { + "cell_type": "markdown", + "id": "69985863-21be-43fb-90fb-7c79a58c188b", + "metadata": {}, + "source": [ + "The NIST Chemistry WebBook supports structure search; however, to the best of our knowledge, there is no straightforward way to implement it as a Python API. To overcome this problem, as well as the WebBook's limit on the number of found compounds, the **NistChemPy** package contains a dataframe with the main info on all NIST Chemistry WebBook compounds:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6d78af72-40e0-4f28-b615-0d5bb1be52cc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDnamesynonymsformulamol_weightinchiinchi_keycas_rnmol2Dmol3DGas phase thermochemistry dataCondensed phase thermochemistry dataPhase change dataReaction thermochemistry dataGas phase ion energetics dataIon clustering dataIR SpectrumTHz IR spectrumMass spectrum (electron ionization)UV/Visible spectrumGas ChromatographyVibrational and/or electronic energy levelsConstants of diatomic moleculesHenry's Law dataFluid PropertiesComputational Chemistry Comparison and Benchmark DatabaseElectron-Impact Ionization Cross Sections (on physics web site)Gas Phase Kinetics DatabaseMicrowave spectra (on physics lab web site)NIST Atomic Spectra Database - Ground states and ionization energies (on physics web site)NIST Atomic Spectra Database - Levels Holdings (on physics web site)NIST Atomic Spectra Database - Lines Holdings (on physics web site)NIST Polycyclic Aromatic Hydrocarbon Structure IndexReference simulationReference simulation: SPC/E WaterReference simulation: TraPPE Carbon DioxideX-ray Photoelectron Spectroscopy Database, version 5.0NIST / TRC Web Thermo Tables, \"lite\" edition (thermophysical and thermochemical data)NIST / TRC Web Thermo Tables, professional edition (thermophysical and thermochemical data)
0B100iron oxide anionNaNFeO-71.8450NaNNaNNaNNaNNaNhttps://webbook.nist.gov/cgi/cbook.cgi?ID=B100...NaNNaNhttps://webbook.nist.gov/cgi/cbook.cgi?ID=B100...https://webbook.nist.gov/cgi/cbook.cgi?ID=B100...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1B1000AsF3..Cl anionNaNAsClF3-167.3700NaNNaNNaNNaNNaNhttps://webbook.nist.gov/cgi/cbook.cgi?ID=B100...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2B1000000AgH2-NaNAgH2-109.8846NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://webbook.nist.gov/cgi/cbook.cgi?ID=B100...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3B1000001HAg(H2)NaNAgH3110.8920NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://webbook.nist.gov/cgi/cbook.cgi?ID=B100...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4B1000002AgNO+NaNAgNO+137.8738NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://webbook.nist.gov/cgi/cbook.cgi?ID=B100...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
........................................................................................................................
129323U99777Methyl 3-hydroxycholest-5-en-26-oate, TMS deri...Methyl (25RS)-3β-hydroxy-5-cholesten-26-oate, ...C31 H54 O3 Si502.8442InChI=1S/C31H54O3Si/c1-21(10-9-11-22(2)29(32)3...DNXGNXYNSBCWGX-QBUYVTDMSA-NNaNhttps://webbook.nist.gov/cgi/cbook.cgi?Str2Fil...NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://webbook.nist.gov/cgi/cbook.cgi?ID=U997...NaNhttps://webbook.nist.gov/cgi/cbook.cgi?ID=U997...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
129324U998302-Methyl-3-oxovaleric acid, O,O'-bis(trimethyl...3-Oxopentanoic acid, 2-methyl, TMS\\n2-Methyl-3...C12 H26 O3 Si2274.5040InChI=1S/C12H26O3Si2/c1-9-11(14-16(3,4)5)10(2)...LXAIQDVPXKOIGO-KHPPLWFESA-NNaNhttps://webbook.nist.gov/cgi/inchi?Str2File=U9...NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://webbook.nist.gov/cgi/inchi?ID=U99830&M...NaNhttps://webbook.nist.gov/cgi/inchi?ID=U99830&M...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
129325U999423-Hydroxy-3-(4'-hydroxy-3'-methoxyphenyl)propi...Vanillylhydracrylic acid, tri-TMS\\nVanillylhyd...C19 H36 O5 Si3428.7426InChI=1S/C19H36O5Si3/c1-21-18-13-15(11-12-16(1...QCMUGKOFXVYNCF-UHFFFAOYSA-NNaNhttps://webbook.nist.gov/cgi/inchi?Str2File=U9...NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://webbook.nist.gov/cgi/inchi?ID=U99942&M...NaNhttps://webbook.nist.gov/cgi/inchi?ID=U99942&M...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
129326U999472-Propylpentanoic acid, 2,3,4,6-tetra(trimethy...Valproic acid, glucuronide, TMSC26 H58 O7 Si4595.0765InChI=1S/C26H58O7Si4/c1-15-17-20(18-16-2)25(27...OVXMRISJDUWFKB-UHFFFAOYSA-NNaNhttps://webbook.nist.gov/cgi/inchi?Str2File=U9...NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://webbook.nist.gov/cgi/inchi?ID=U99947&M...NaNhttps://webbook.nist.gov/cgi/inchi?ID=U99947&M...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
129327xY5O2 radicalNaNO2 Y5476.5281NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://webbook.nist.gov/cgi/cbook.cgi?ID=x&Ma...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

129328 rows × 39 columns

\n", + "
" + ], + "text/plain": [ + " ID name \\\n", + "0 B100 iron oxide anion \n", + "1 B1000 AsF3..Cl anion \n", + "2 B1000000 AgH2- \n", + "3 B1000001 HAg(H2) \n", + "4 B1000002 AgNO+ \n", + "... ... ... \n", + "129323 U99777 Methyl 3-hydroxycholest-5-en-26-oate, TMS deri... \n", + "129324 U99830 2-Methyl-3-oxovaleric acid, O,O'-bis(trimethyl... \n", + "129325 U99942 3-Hydroxy-3-(4'-hydroxy-3'-methoxyphenyl)propi... \n", + "129326 U99947 2-Propylpentanoic acid, 2,3,4,6-tetra(trimethy... \n", + "129327 x Y5O2 radical \n", + "\n", + " synonyms formula \\\n", + "0 NaN FeO- \n", + "1 NaN AsClF3- \n", + "2 NaN AgH2- \n", + "3 NaN AgH3 \n", + "4 NaN AgNO+ \n", + "... ... ... \n", + "129323 Methyl (25RS)-3β-hydroxy-5-cholesten-26-oate, ... C31 H54 O3 Si \n", + "129324 3-Oxopentanoic acid, 2-methyl, TMS\\n2-Methyl-3... C12 H26 O3 Si2 \n", + "129325 Vanillylhydracrylic acid, tri-TMS\\nVanillylhyd... C19 H36 O5 Si3 \n", + "129326 Valproic acid, glucuronide, TMS C26 H58 O7 Si4 \n", + "129327 NaN O2 Y5 \n", + "\n", + " mol_weight inchi \\\n", + "0 71.8450 NaN \n", + "1 167.3700 NaN \n", + "2 109.8846 NaN \n", + "3 110.8920 NaN \n", + "4 137.8738 NaN \n", + "... ... ... \n", + "129323 502.8442 InChI=1S/C31H54O3Si/c1-21(10-9-11-22(2)29(32)3... \n", + "129324 274.5040 InChI=1S/C12H26O3Si2/c1-9-11(14-16(3,4)5)10(2)... \n", + "129325 428.7426 InChI=1S/C19H36O5Si3/c1-21-18-13-15(11-12-16(1... \n", + "129326 595.0765 InChI=1S/C26H58O7Si4/c1-15-17-20(18-16-2)25(27... \n", + "129327 476.5281 NaN \n", + "\n", + " inchi_key cas_rn \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... \n", + "129323 DNXGNXYNSBCWGX-QBUYVTDMSA-N NaN \n", + "129324 LXAIQDVPXKOIGO-KHPPLWFESA-N NaN \n", + "129325 QCMUGKOFXVYNCF-UHFFFAOYSA-N NaN \n", + "129326 OVXMRISJDUWFKB-UHFFFAOYSA-N NaN \n", + "129327 NaN NaN \n", + "\n", + " mol2D mol3D \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... 
\n", + "129323 https://webbook.nist.gov/cgi/cbook.cgi?Str2Fil... NaN \n", + "129324 https://webbook.nist.gov/cgi/inchi?Str2File=U9... NaN \n", + "129325 https://webbook.nist.gov/cgi/inchi?Str2File=U9... NaN \n", + "129326 https://webbook.nist.gov/cgi/inchi?Str2File=U9... NaN \n", + "129327 NaN NaN \n", + "\n", + " Gas phase thermochemistry data \\\n", + "0 https://webbook.nist.gov/cgi/cbook.cgi?ID=B100... \n", + "1 https://webbook.nist.gov/cgi/cbook.cgi?ID=B100... \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " Condensed phase thermochemistry data Phase change data \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... \n", + "129323 NaN NaN \n", + "129324 NaN NaN \n", + "129325 NaN NaN \n", + "129326 NaN NaN \n", + "129327 NaN NaN \n", + "\n", + " Reaction thermochemistry data \\\n", + "0 https://webbook.nist.gov/cgi/cbook.cgi?ID=B100... \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " Gas phase ion energetics data Ion clustering data \\\n", + "0 https://webbook.nist.gov/cgi/cbook.cgi?ID=B100... NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... \n", + "129323 NaN NaN \n", + "129324 NaN NaN \n", + "129325 NaN NaN \n", + "129326 NaN NaN \n", + "129327 https://webbook.nist.gov/cgi/cbook.cgi?ID=x&Ma... NaN \n", + "\n", + " IR Spectrum THz IR spectrum \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... 
\n", + "129323 NaN NaN \n", + "129324 NaN NaN \n", + "129325 NaN NaN \n", + "129326 NaN NaN \n", + "129327 NaN NaN \n", + "\n", + " Mass spectrum (electron ionization) UV/Visible spectrum \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... \n", + "129323 https://webbook.nist.gov/cgi/cbook.cgi?ID=U997... NaN \n", + "129324 https://webbook.nist.gov/cgi/inchi?ID=U99830&M... NaN \n", + "129325 https://webbook.nist.gov/cgi/inchi?ID=U99942&M... NaN \n", + "129326 https://webbook.nist.gov/cgi/inchi?ID=U99947&M... NaN \n", + "129327 NaN NaN \n", + "\n", + " Gas Chromatography \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 https://webbook.nist.gov/cgi/cbook.cgi?ID=U997... \n", + "129324 https://webbook.nist.gov/cgi/inchi?ID=U99830&M... \n", + "129325 https://webbook.nist.gov/cgi/inchi?ID=U99942&M... \n", + "129326 https://webbook.nist.gov/cgi/inchi?ID=U99947&M... \n", + "129327 NaN \n", + "\n", + " Vibrational and/or electronic energy levels \\\n", + "0 NaN \n", + "1 NaN \n", + "2 https://webbook.nist.gov/cgi/cbook.cgi?ID=B100... \n", + "3 https://webbook.nist.gov/cgi/cbook.cgi?ID=B100... \n", + "4 https://webbook.nist.gov/cgi/cbook.cgi?ID=B100... \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " Constants of diatomic molecules Henry's Law data Fluid Properties \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "... ... ... ... \n", + "129323 NaN NaN NaN \n", + "129324 NaN NaN NaN \n", + "129325 NaN NaN NaN \n", + "129326 NaN NaN NaN \n", + "129327 NaN NaN NaN \n", + "\n", + " Computational Chemistry Comparison and Benchmark Database \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... 
\n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " Electron-Impact Ionization Cross Sections (on physics web site) \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " Gas Phase Kinetics Database \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " Microwave spectra (on physics lab web site) \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " NIST Atomic Spectra Database - Ground states and ionization energies (on physics web site) \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " NIST Atomic Spectra Database - Levels Holdings (on physics web site) \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " NIST Atomic Spectra Database - Lines Holdings (on physics web site) \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " NIST Polycyclic Aromatic Hydrocarbon Structure Index \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... 
\n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " Reference simulation Reference simulation: SPC/E Water \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... \n", + "129323 NaN NaN \n", + "129324 NaN NaN \n", + "129325 NaN NaN \n", + "129326 NaN NaN \n", + "129327 NaN NaN \n", + "\n", + " Reference simulation: TraPPE Carbon Dioxide \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " X-ray Photoelectron Spectroscopy Database, version 5.0 \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " NIST / TRC Web Thermo Tables, \"lite\" edition (thermophysical and thermochemical data) \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + " NIST / TRC Web Thermo Tables, professional edition (thermophysical and thermochemical data) \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "129323 NaN \n", + "129324 NaN \n", + "129325 NaN \n", + "129326 NaN \n", + "129327 NaN \n", + "\n", + "[129328 rows x 39 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nistchempy as nist\n", + "import pandas as pd\n", + "\n", + "pd.set_option('display.max_columns', None)\n", + "df = nist.get_all_data()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "eed873f7-7df9-44f9-b19f-7acf7da28dc6", + "metadata": {}, + "source": [ + "Its columns can be divided in 5 groups:\n", + "\n", + "1. General properties\n", + "\n", + "2. 
Molecular files\n", + "\n", + "3. NIST Chemistry WebBook data\n", + "\n", + "4. NIST public data\n", + "\n", + "5. NIST subscription data\n", + "\n", + "All columns except for the first group contain URLs for the corresponding data, allowing one to parse the relevant pages without the need to preload the compounds themselves:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a6dc7b7a-65f4-493f-bdfc-917fa3ef76d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDinchiNIST Atomic Spectra Database - Ground states and ionization energies (on physics web site)
11504C10028145InChI=1S/Nohttps://physics.nist.gov/cgi-bin/ASD/ie.pl?spe...
11587C10043922InChI=1S/Rnhttps://physics.nist.gov/cgi-bin/ASD/ie.pl?spe...
11743C10097322InChI=1S/Brhttps://physics.nist.gov/cgi-bin/ASD/ie.pl?spe...
16920C12385136InChI=1S/Hhttps://physics.nist.gov/cgi-bin/ASD/ie.pl?spe...
18616C13494809InChI=1S/Tehttps://physics.nist.gov/cgi-bin/ASD/ie.pl?spe...
............
59684C7440735InChI=1S/Frhttps://physics.nist.gov/cgi-bin/ASD/ie.pl?spe...
59685C7440746InChI=1S/Inhttps://physics.nist.gov/cgi-bin/ASD/ie.pl?spe...
60897C7704349InChI=1S/Shttps://physics.nist.gov/cgi-bin/ASD/ie.pl?spe...
60995C7723140InChI=1S/Phttps://physics.nist.gov/cgi-bin/ASD/ie.pl?spe...
61257C7782492InChI=1S/Sehttps://physics.nist.gov/cgi-bin/ASD/ie.pl?spe...
\n", + "

101 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " ID inchi \\\n", + "11504 C10028145 InChI=1S/No \n", + "11587 C10043922 InChI=1S/Rn \n", + "11743 C10097322 InChI=1S/Br \n", + "16920 C12385136 InChI=1S/H \n", + "18616 C13494809 InChI=1S/Te \n", + "... ... ... \n", + "59684 C7440735 InChI=1S/Fr \n", + "59685 C7440746 InChI=1S/In \n", + "60897 C7704349 InChI=1S/S \n", + "60995 C7723140 InChI=1S/P \n", + "61257 C7782492 InChI=1S/Se \n", + "\n", + " NIST Atomic Spectra Database - Ground states and ionization energies (on physics web site) \n", + "11504 https://physics.nist.gov/cgi-bin/ASD/ie.pl?spe... \n", + "11587 https://physics.nist.gov/cgi-bin/ASD/ie.pl?spe... \n", + "11743 https://physics.nist.gov/cgi-bin/ASD/ie.pl?spe... \n", + "16920 https://physics.nist.gov/cgi-bin/ASD/ie.pl?spe... \n", + "18616 https://physics.nist.gov/cgi-bin/ASD/ie.pl?spe... \n", + "... ... \n", + "59684 https://physics.nist.gov/cgi-bin/ASD/ie.pl?spe... \n", + "59685 https://physics.nist.gov/cgi-bin/ASD/ie.pl?spe... \n", + "60897 https://physics.nist.gov/cgi-bin/ASD/ie.pl?spe... \n", + "60995 https://physics.nist.gov/cgi-bin/ASD/ie.pl?spe... \n", + "61257 https://physics.nist.gov/cgi-bin/ASD/ie.pl?spe... \n", + "\n", + "[101 rows x 3 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "col = 'NIST Atomic Spectra Database - Ground states and ionization energies (on physics web site)'\n", + "df.loc[~df[col].isna(), ['ID', 'inchi', col]]" + ] + }, + { + "cell_type": "markdown", + "id": "cbfd6766-a8bb-4817-8df4-e7680beceb7a", + "metadata": {}, + "source": [ + "This dataframe can be used to limit all entries to those ones with desired properties. 
To use short names for NIST Chemistry WebBook properties, one can use the `nist.get_search_parameters` function:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9a6f4527-66c8-4606-bc91-536be0ced28c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'use_SI': 'Units for thermodynamic data, \"SI\" if True and \"calories\" if False',\n", + " 'match_isotopes': 'Exactly match the specified isotopes (formula search only)',\n", + " 'allow_other': 'Allow elements not specified in formula (formula search only)',\n", + " 'allow_extra': 'Allow more atoms of elements in formula than specified (formula search only)',\n", + " 'no_ion': 'Exclude ions from the search (formula search only)',\n", + " 'cTG': 'Gas phase thermochemistry data',\n", + " 'cTC': 'Condensed phase thermochemistry data',\n", + " 'cTP': 'Phase change data',\n", + " 'cTR': 'Reaction thermochemistry data',\n", + " 'cIE': 'Gas phase ion energetics data',\n", + " 'cIC': 'Ion clustering data',\n", + " 'cIR': 'IR Spectrum',\n", + " 'cTZ': 'THz IR spectrum',\n", + " 'cMS': 'Mass spectrum (electron ionization)',\n", + " 'cUV': 'UV/Visible spectrum',\n", + " 'cGC': 'Gas Chromatography',\n", + " 'cES': 'Vibrational and/or electronic energy levels',\n", + " 'cDI': 'Constants of diatomic molecules',\n", + " 'cSO': \"Henry's Law data\"}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ps = nist.get_search_parameters()\n", + "ps" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c8912156-a244-45de-bc29-0b832a080806", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDnamesynonymsformulamol_weightinchiinchi_keycas_rnmol2Dmol3D...NIST Atomic Spectra Database - Ground states and ionization energies (on physics web site)NIST Atomic Spectra Database - Levels Holdings (on physics web site)NIST Atomic Spectra Database - Lines Holdings (on physics web site)NIST Polycyclic Aromatic Hydrocarbon Structure IndexReference simulationReference simulation: SPC/E WaterReference simulation: TraPPE Carbon DioxideX-ray Photoelectron Spectroscopy Database, version 5.0NIST / TRC Web Thermo Tables, \"lite\" edition (thermophysical and thermochemical data)NIST / TRC Web Thermo Tables, professional edition (thermophysical and thermochemical data)
11392C100016p-NitroanilineBenzenamine, 4-nitro-\\nAniline, p-nitro-\\np-Am...C6 H6 N2 O2138.1240InChI=1S/C6H6N2O2/c7-5-1-3-6(4-2-5)8(9)10/h1-4...TYMLOMAKGOJONV-UHFFFAOYSA-N100-01-6https://webbook.nist.gov/cgi/inchi?Str2File=C1...https://webbook.nist.gov/cgi/inchi?Str3File=C1......NaNNaNNaNNaNNaNNaNNaNhttps://srdata.nist.gov/xps/SpectralByCompdDd/...NaNhttps://wtt-pro.nist.gov/wtt-pro/index.html?cm...
11393C100027Phenol, 4-nitro-Phenol, p-nitro-\\np-Hydroxynitrobenzene\\np-Nit...C6 H5 NO3139.1088InChI=1S/C6H5NO3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,8HBTJIUGUIPKRLHP-UHFFFAOYSA-N100-02-7https://webbook.nist.gov/cgi/inchi?Str2File=C1...https://webbook.nist.gov/cgi/inchi?Str3File=C1......NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://wtt-pro.nist.gov/wtt-pro/index.html?cm...
11412C100094Benzoic acid, 4-methoxy-p-Anisic acid\\np-Methoxybenzoic acid\\nDraconic...C8 H8 O3152.1473InChI=1S/C8H8O3/c1-11-7-4-2-6(3-5-7)8(9)10/h2-...ZEYHEAKUIGZSGI-UHFFFAOYSA-N100-09-4https://webbook.nist.gov/cgi/inchi?Str2File=C1...https://webbook.nist.gov/cgi/inchi?Str3File=C1......NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://wtt-pro.nist.gov/wtt-pro/index.html?cm...
11417C100107Benzaldehyde, 4-(dimethylamino)-Benzaldehyde, p-(dimethylamino)-\\np-(Dimethyla...C9 H11 NO149.1897InChI=1S/C9H11NO/c1-10(2)9-5-3-8(7-11)4-6-9/h3...BGNGWHSBYQYVRX-UHFFFAOYSA-N100-10-7https://webbook.nist.gov/cgi/inchi?Str2File=C1...https://webbook.nist.gov/cgi/inchi?Str3File=C1......NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://wtt-pro.nist.gov/wtt-pro/index.html?cm...
11422C100129Benzene, 1-ethyl-4-nitro-p-Ethylnitrobenzene\\np-Nitroethylbenzene\\np-Ni...C8 H9 NO2151.1626InChI=1S/C8H9NO2/c1-2-7-3-5-8(6-4-7)9(10)11/h3...RESTWAHJFMZUIZ-UHFFFAOYSA-N100-12-9https://webbook.nist.gov/cgi/inchi?Str2File=C1...https://webbook.nist.gov/cgi/inchi?Str3File=C1......NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://wtt-pro.nist.gov/wtt-pro/index.html?cm...
..................................................................
66252C99923Acetophenone, 4'-amino-Ethanone, 1-(4-aminophenyl)-\\np-Acetylaniline\\...C8 H9 NO135.1632InChI=1S/C8H9NO/c1-6(10)7-2-4-8(9)5-3-7/h2-5H,...GPRYKVSEZCQIHD-UHFFFAOYSA-N99-92-3https://webbook.nist.gov/cgi/inchi?Str2File=C9...https://webbook.nist.gov/cgi/inchi?Str3File=C9......NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://wtt-pro.nist.gov/wtt-pro/index.html?cm...
66254C99934Acetophenone, 4'-hydroxy-Ethanone, 1-(4-hydroxyphenyl)-\\np-Hydroxyaceto...C8 H8 O2136.1479InChI=1S/C8H8O2/c1-6(9)7-2-4-8(10)5-3-7/h2-5,1...TXFPEBPIARQUIG-UHFFFAOYSA-N99-93-4https://webbook.nist.gov/cgi/inchi?Str2File=C9...https://webbook.nist.gov/cgi/inchi?Str3File=C9......NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://wtt-pro.nist.gov/wtt-pro/index.html?cm...
66257C99945Benzoic acid, 4-methyl-p-Toluic acid\\np-Methylbenzoic acid\\np-Toluyli...C8 H8 O2136.1479InChI=1S/C8H8O2/c1-6-2-4-7(5-3-6)8(9)10/h2-5H,...LPNBBFKOUUSUDB-UHFFFAOYSA-N99-94-5https://webbook.nist.gov/cgi/inchi?Str2File=C9...https://webbook.nist.gov/cgi/inchi?Str3File=C9......NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://wtt-pro.nist.gov/wtt-pro/index.html?cm...
66264C99967Benzoic acid, 4-hydroxy-Benzoic acid, p-hydroxy-\\np-Hydroxybenzoic aci...C7 H6 O3138.1207InChI=1S/C7H6O3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,8...FJKROLUGYXJWQN-UHFFFAOYSA-N99-96-7https://webbook.nist.gov/cgi/inchi?Str2File=C9...https://webbook.nist.gov/cgi/inchi?Str3File=C9......NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://wtt-pro.nist.gov/wtt-pro/index.html?cm...
66269C99978Benzenamine, N,N,4-trimethyl-p-Toluidine, N,N-dimethyl-\\np-Methyl-N,N-dimet...C9 H13 N135.2062InChI=1S/C9H13N/c1-8-4-6-9(7-5-8)10(2)3/h4-7H,...GYVGXEWAOAAJEU-UHFFFAOYSA-N99-97-8https://webbook.nist.gov/cgi/inchi?Str2File=C9...https://webbook.nist.gov/cgi/inchi?Str3File=C9......NaNNaNNaNNaNNaNNaNNaNNaNNaNhttps://wtt-pro.nist.gov/wtt-pro/index.html?cm...
\n", + "

1469 rows × 39 columns

\n", + "
" + ], + "text/plain": [ + " ID name \\\n", + "11392 C100016 p-Nitroaniline \n", + "11393 C100027 Phenol, 4-nitro- \n", + "11412 C100094 Benzoic acid, 4-methoxy- \n", + "11417 C100107 Benzaldehyde, 4-(dimethylamino)- \n", + "11422 C100129 Benzene, 1-ethyl-4-nitro- \n", + "... ... ... \n", + "66252 C99923 Acetophenone, 4'-amino- \n", + "66254 C99934 Acetophenone, 4'-hydroxy- \n", + "66257 C99945 Benzoic acid, 4-methyl- \n", + "66264 C99967 Benzoic acid, 4-hydroxy- \n", + "66269 C99978 Benzenamine, N,N,4-trimethyl- \n", + "\n", + " synonyms formula \\\n", + "11392 Benzenamine, 4-nitro-\\nAniline, p-nitro-\\np-Am... C6 H6 N2 O2 \n", + "11393 Phenol, p-nitro-\\np-Hydroxynitrobenzene\\np-Nit... C6 H5 NO3 \n", + "11412 p-Anisic acid\\np-Methoxybenzoic acid\\nDraconic... C8 H8 O3 \n", + "11417 Benzaldehyde, p-(dimethylamino)-\\np-(Dimethyla... C9 H11 NO \n", + "11422 p-Ethylnitrobenzene\\np-Nitroethylbenzene\\np-Ni... C8 H9 NO2 \n", + "... ... ... \n", + "66252 Ethanone, 1-(4-aminophenyl)-\\np-Acetylaniline\\... C8 H9 NO \n", + "66254 Ethanone, 1-(4-hydroxyphenyl)-\\np-Hydroxyaceto... C8 H8 O2 \n", + "66257 p-Toluic acid\\np-Methylbenzoic acid\\np-Toluyli... C8 H8 O2 \n", + "66264 Benzoic acid, p-hydroxy-\\np-Hydroxybenzoic aci... C7 H6 O3 \n", + "66269 p-Toluidine, N,N-dimethyl-\\np-Methyl-N,N-dimet... C9 H13 N \n", + "\n", + " mol_weight inchi \\\n", + "11392 138.1240 InChI=1S/C6H6N2O2/c7-5-1-3-6(4-2-5)8(9)10/h1-4... \n", + "11393 139.1088 InChI=1S/C6H5NO3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,8H \n", + "11412 152.1473 InChI=1S/C8H8O3/c1-11-7-4-2-6(3-5-7)8(9)10/h2-... \n", + "11417 149.1897 InChI=1S/C9H11NO/c1-10(2)9-5-3-8(7-11)4-6-9/h3... \n", + "11422 151.1626 InChI=1S/C8H9NO2/c1-2-7-3-5-8(6-4-7)9(10)11/h3... \n", + "... ... ... \n", + "66252 135.1632 InChI=1S/C8H9NO/c1-6(10)7-2-4-8(9)5-3-7/h2-5H,... \n", + "66254 136.1479 InChI=1S/C8H8O2/c1-6(9)7-2-4-8(10)5-3-7/h2-5,1... \n", + "66257 136.1479 InChI=1S/C8H8O2/c1-6-2-4-7(5-3-6)8(9)10/h2-5H,... 
\n", + "66264 138.1207 InChI=1S/C7H6O3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,8... \n", + "66269 135.2062 InChI=1S/C9H13N/c1-8-4-6-9(7-5-8)10(2)3/h4-7H,... \n", + "\n", + " inchi_key cas_rn \\\n", + "11392 TYMLOMAKGOJONV-UHFFFAOYSA-N 100-01-6 \n", + "11393 BTJIUGUIPKRLHP-UHFFFAOYSA-N 100-02-7 \n", + "11412 ZEYHEAKUIGZSGI-UHFFFAOYSA-N 100-09-4 \n", + "11417 BGNGWHSBYQYVRX-UHFFFAOYSA-N 100-10-7 \n", + "11422 RESTWAHJFMZUIZ-UHFFFAOYSA-N 100-12-9 \n", + "... ... ... \n", + "66252 GPRYKVSEZCQIHD-UHFFFAOYSA-N 99-92-3 \n", + "66254 TXFPEBPIARQUIG-UHFFFAOYSA-N 99-93-4 \n", + "66257 LPNBBFKOUUSUDB-UHFFFAOYSA-N 99-94-5 \n", + "66264 FJKROLUGYXJWQN-UHFFFAOYSA-N 99-96-7 \n", + "66269 GYVGXEWAOAAJEU-UHFFFAOYSA-N 99-97-8 \n", + "\n", + " mol2D \\\n", + "11392 https://webbook.nist.gov/cgi/inchi?Str2File=C1... \n", + "11393 https://webbook.nist.gov/cgi/inchi?Str2File=C1... \n", + "11412 https://webbook.nist.gov/cgi/inchi?Str2File=C1... \n", + "11417 https://webbook.nist.gov/cgi/inchi?Str2File=C1... \n", + "11422 https://webbook.nist.gov/cgi/inchi?Str2File=C1... \n", + "... ... \n", + "66252 https://webbook.nist.gov/cgi/inchi?Str2File=C9... \n", + "66254 https://webbook.nist.gov/cgi/inchi?Str2File=C9... \n", + "66257 https://webbook.nist.gov/cgi/inchi?Str2File=C9... \n", + "66264 https://webbook.nist.gov/cgi/inchi?Str2File=C9... \n", + "66269 https://webbook.nist.gov/cgi/inchi?Str2File=C9... \n", + "\n", + " mol3D ... \\\n", + "11392 https://webbook.nist.gov/cgi/inchi?Str3File=C1... ... \n", + "11393 https://webbook.nist.gov/cgi/inchi?Str3File=C1... ... \n", + "11412 https://webbook.nist.gov/cgi/inchi?Str3File=C1... ... \n", + "11417 https://webbook.nist.gov/cgi/inchi?Str3File=C1... ... \n", + "11422 https://webbook.nist.gov/cgi/inchi?Str3File=C1... ... \n", + "... ... ... \n", + "66252 https://webbook.nist.gov/cgi/inchi?Str3File=C9... ... \n", + "66254 https://webbook.nist.gov/cgi/inchi?Str3File=C9... ... \n", + "66257 https://webbook.nist.gov/cgi/inchi?Str3File=C9... ... 
\n", + "66264 https://webbook.nist.gov/cgi/inchi?Str3File=C9... ... \n", + "66269 https://webbook.nist.gov/cgi/inchi?Str3File=C9... ... \n", + "\n", + " NIST Atomic Spectra Database - Ground states and ionization energies (on physics web site) \\\n", + "11392 NaN \n", + "11393 NaN \n", + "11412 NaN \n", + "11417 NaN \n", + "11422 NaN \n", + "... ... \n", + "66252 NaN \n", + "66254 NaN \n", + "66257 NaN \n", + "66264 NaN \n", + "66269 NaN \n", + "\n", + " NIST Atomic Spectra Database - Levels Holdings (on physics web site) \\\n", + "11392 NaN \n", + "11393 NaN \n", + "11412 NaN \n", + "11417 NaN \n", + "11422 NaN \n", + "... ... \n", + "66252 NaN \n", + "66254 NaN \n", + "66257 NaN \n", + "66264 NaN \n", + "66269 NaN \n", + "\n", + " NIST Atomic Spectra Database - Lines Holdings (on physics web site) \\\n", + "11392 NaN \n", + "11393 NaN \n", + "11412 NaN \n", + "11417 NaN \n", + "11422 NaN \n", + "... ... \n", + "66252 NaN \n", + "66254 NaN \n", + "66257 NaN \n", + "66264 NaN \n", + "66269 NaN \n", + "\n", + " NIST Polycyclic Aromatic Hydrocarbon Structure Index \\\n", + "11392 NaN \n", + "11393 NaN \n", + "11412 NaN \n", + "11417 NaN \n", + "11422 NaN \n", + "... ... \n", + "66252 NaN \n", + "66254 NaN \n", + "66257 NaN \n", + "66264 NaN \n", + "66269 NaN \n", + "\n", + " Reference simulation Reference simulation: SPC/E Water \\\n", + "11392 NaN NaN \n", + "11393 NaN NaN \n", + "11412 NaN NaN \n", + "11417 NaN NaN \n", + "11422 NaN NaN \n", + "... ... ... \n", + "66252 NaN NaN \n", + "66254 NaN NaN \n", + "66257 NaN NaN \n", + "66264 NaN NaN \n", + "66269 NaN NaN \n", + "\n", + " Reference simulation: TraPPE Carbon Dioxide \\\n", + "11392 NaN \n", + "11393 NaN \n", + "11412 NaN \n", + "11417 NaN \n", + "11422 NaN \n", + "... ... \n", + "66252 NaN \n", + "66254 NaN \n", + "66257 NaN \n", + "66264 NaN \n", + "66269 NaN \n", + "\n", + " X-ray Photoelectron Spectroscopy Database, version 5.0 \\\n", + "11392 https://srdata.nist.gov/xps/SpectralByCompdDd/... 
\n", + "11393 NaN \n", + "11412 NaN \n", + "11417 NaN \n", + "11422 NaN \n", + "... ... \n", + "66252 NaN \n", + "66254 NaN \n", + "66257 NaN \n", + "66264 NaN \n", + "66269 NaN \n", + "\n", + " NIST / TRC Web Thermo Tables, \"lite\" edition (thermophysical and thermochemical data) \\\n", + "11392 NaN \n", + "11393 NaN \n", + "11412 NaN \n", + "11417 NaN \n", + "11422 NaN \n", + "... ... \n", + "66252 NaN \n", + "66254 NaN \n", + "66257 NaN \n", + "66264 NaN \n", + "66269 NaN \n", + "\n", + " NIST / TRC Web Thermo Tables, professional edition (thermophysical and thermochemical data) \n", + "11392 https://wtt-pro.nist.gov/wtt-pro/index.html?cm... \n", + "11393 https://wtt-pro.nist.gov/wtt-pro/index.html?cm... \n", + "11412 https://wtt-pro.nist.gov/wtt-pro/index.html?cm... \n", + "11417 https://wtt-pro.nist.gov/wtt-pro/index.html?cm... \n", + "11422 https://wtt-pro.nist.gov/wtt-pro/index.html?cm... \n", + "... ... \n", + "66252 https://wtt-pro.nist.gov/wtt-pro/index.html?cm... \n", + "66254 https://wtt-pro.nist.gov/wtt-pro/index.html?cm... \n", + "66257 https://wtt-pro.nist.gov/wtt-pro/index.html?cm... \n", + "66264 https://wtt-pro.nist.gov/wtt-pro/index.html?cm... \n", + "66269 https://wtt-pro.nist.gov/wtt-pro/index.html?cm... \n", + "\n", + "[1469 rows x 39 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.set_option('display.max_columns', 20)\n", + "sub = df.loc[~df.inchi.isna() & ~df.mol2D.isna() & ~df[ps['cMS']].isna() & ~df[ps['cUV']].isna()]\n", + "sub" + ] + }, + { + "cell_type": "markdown", + "id": "32a852f5-190c-4ad6-897f-3a0bd11b29f1", + "metadata": {}, + "source": [ + "Also one can run a substructure search, e.g. 
to get only non-aromatic compounds:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4e92f88d-c88c-4703-8a38-b4d079f7654a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "320 of 1469 compounds were selected\n" + ] + } + ], + "source": [ + "from rdkit import Chem\n", + "\n", + "# supress rdkit warnings\n", + "from rdkit import RDLogger\n", + "RDLogger.DisableLog('rdApp.*')\n", + "\n", + "# prepare molecules for search\n", + "mols = [(ID, Chem.MolFromInchi(inchi)) for ID, inchi in zip(sub.ID, sub.inchi)]\n", + "mols = [(ID, mol) for ID, mol in mols if mol]\n", + "\n", + "# search\n", + "pat = Chem.MolFromSmarts('[a]')\n", + "hits = [ID for ID, mol in mols if not mol.HasSubstructMatch(pat)]\n", + "print(f'{len(hits)} of {len(sub)} compounds were selected')" + ] + }, + { + "cell_type": "markdown", + "id": "0b0c6c3d-0887-472c-94ee-2cb161aecb3f", + "metadata": {}, + "source": [ + "Those compounds can be retrieved via `nist.get_compound` function." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/api.rst b/docs/source/api.rst new file mode 100644 index 0000000..5478d2c --- /dev/null +++ b/docs/source/api.rst @@ -0,0 +1,64 @@ +NistChemPy API +============== + + +nistchempy +---------- + +.. automodule:: nistchempy + :imported-members: + :members: + :undoc-members: + :show-inheritance: + :member-order: bysource + :noindex: + + +nistchempy.requests +------------------- + +.. 
automodule:: nistchempy.requests + :imported-members: + :members: + :undoc-members: + :show-inheritance: + :member-order: bysource + :noindex: + + +nistchempy.compound +------------------- + +.. automodule:: nistchempy.compound + :imported-members: + :members: + :undoc-members: + :show-inheritance: + :member-order: bysource + :noindex: + + +nistchempy.search +----------------------- + +.. automodule:: nistchempy.search + :imported-members: + :members: + :undoc-members: + :show-inheritance: + :member-order: bysource + :noindex: + + +nistchempy.compound_list +------------------------ + +.. automodule:: nistchempy.compound_list + :imported-members: + :members: + :undoc-members: + :show-inheritance: + :member-order: bysource + :noindex: + + diff --git a/docs/source/basic_search.ipynb b/docs/source/basic_search.ipynb new file mode 100644 index 0000000..cc3ac5b --- /dev/null +++ b/docs/source/basic_search.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "239e7050-7764-4555-8ccd-59a4d434e1d8", + "metadata": {}, + "source": [ + "# Basic Search" + ] + }, + { + "cell_type": "markdown", + "id": "31604519-9cfe-4045-b8bf-9da518253d1b", + "metadata": {}, + "source": [ + "## Basic search\n", + "\n", + "There are five available search types:\n", + " - by [name](https://webbook.nist.gov/chemistry/name-ser/) (`search_type = 'name'`);\n", + " - by [InChI](https://webbook.nist.gov/chemistry/inchi-ser/) (`search_type = 'inchi'`);\n", + " - by [CAS RN](https://webbook.nist.gov/chemistry/cas-ser/) (`search_type = 'cas'`);\n", + " - by [chemical formula](https://webbook.nist.gov/chemistry/form-ser/) (`search_type = 'formula'`);\n", + " - and by NIST Compound ID (`search_type = 'id'`):" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "aa15ff6a-cb87-45e7-8048-e3f4a81bdbe7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NistSearch(success=True, num_compounds=10, lost=False)" + ] + }, + "execution_count": 1, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nistchempy as nist\n", + "\n", + "s = nist.run_search(identifier = '1,2,3*-butane', search_type = 'name')\n", + "s" + ] + }, + { + "cell_type": "markdown", + "id": "4778b207-50c9-42f4-9607-dd9c2014d6a4", + "metadata": {}, + "source": [ + "List of found compounds is stored in the `compound_ids` attribute, and the compounds can be retrieved via the `load_found_compounds` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54164007-8e62-40da-9be6-4ab08e9b0802", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['C1871585',\n", + " 'C18338404',\n", + " 'C298180',\n", + " 'C1529686',\n", + " 'C632053',\n", + " 'C13138517',\n", + " 'C62521691',\n", + " 'C76397234',\n", + " 'C101257798',\n", + " 'C1464535']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s.compound_ids" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9a2f8642-03f7-42d5-9736-7d56ac151633", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[NistCompound(ID=C1871585),\n", + " NistCompound(ID=C18338404),\n", + " NistCompound(ID=C298180),\n", + " NistCompound(ID=C1529686),\n", + " NistCompound(ID=C632053),\n", + " NistCompound(ID=C13138517),\n", + " NistCompound(ID=C62521691),\n", + " NistCompound(ID=C76397234),\n", + " NistCompound(ID=C101257798),\n", + " NistCompound(ID=C1464535)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s.load_found_compounds()\n", + "s.compounds" + ] + }, + { + "cell_type": "markdown", + "id": "f53f8bfc-f14e-4acc-93c0-91753944dd6d", + "metadata": {}, + "source": [ + "## Search Parameters" + ] + }, + { + "cell_type": "markdown", + "id": "26fb4af0-5bf0-4081-bfb5-fd8128df0cb6", + "metadata": {}, + "source": [ + "In addition to the main identifier, you can limit the search using several 
parameters, which can be listed using the `print_search_parameters` function:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bfc9a41e-0574-4667-a710-7845287345c8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "use_SI : Units for thermodynamic data, \"SI\" if True and \"calories\" if False\n", + "match_isotopes : Exactly match the specified isotopes (formula search only)\n", + "allow_other : Allow elements not specified in formula (formula search only)\n", + "allow_extra : Allow more atoms of elements in formula than specified (formula search only)\n", + "no_ion : Exclude ions from the search (formula search only)\n", + "cTG : Gas phase thermochemistry data\n", + "cTC : Condensed phase thermochemistry data\n", + "cTP : Phase change data\n", + "cTR : Reaction thermochemistry data\n", + "cIE : Gas phase ion energetics data\n", + "cIC : Ion clustering data\n", + "cIR : IR Spectrum\n", + "cTZ : THz IR spectrum\n", + "cMS : Mass spectrum (electron ionization)\n", + "cUV : UV/Visible spectrum\n", + "cGC : Gas Chromatography\n", + "cES : Vibrational and/or electronic energy levels\n", + "cDI : Constants of diatomic molecules\n", + "cSO : Henry's Law data\n" + ] + } + ], + "source": [ + "nist.print_search_parameters()" + ] + }, + { + "cell_type": "markdown", + "id": "73eccd8e-d428-4665-b684-dc0d90ba6a2c", + "metadata": {}, + "source": [ + "These options can be specified as arguments of the `nist.run_search` function or defined in a `nist.NistSearchParameters` object:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7c02fd86-2ef5-437d-8e1c-fa8fad99a886", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['C110565', 'C110576', 'C1190223', 'C4028562', 'C4279225', 'C541333', 'C594376', 'C616217', 
'C7581977', 'C760236', 'C764410', 'C821103', 'C926578']\n", + "['C110565', 'C110576', 'C1190223', 'C4028562', 'C4279225', 'C541333', 'C594376', 'C616217', 'C7581977', 'C760236', 'C764410', 'C821103', 'C926578']\n" + ] + } + ], + "source": [ + "# query\n", + "identifier = 'C4H?Cl2'\n", + "search_type = 'formula'\n", + "\n", + "# direct search (entries with IR spectra)\n", + "s1 = nist.run_search(identifier, search_type, cIR = True)\n", + "\n", + "# search with NistSearchParameters\n", + "params = nist.NistSearchParameters(cIR = True)\n", + "s2 = nist.run_search(identifier, search_type, params)\n", + "\n", + "# compare searches\n", + "print(sorted(s1.compound_ids))\n", + "print(sorted(s2.compound_ids))" + ] + }, + { + "cell_type": "markdown", + "id": "b49baaf8-95a3-4460-b633-da6ed52055aa", + "metadata": {}, + "source": [ + "## Limit of Found Compounds\n", + "\n", + "NIST Chemistry WebBook limits the search results to 400 compounds. To check if that happened for your search, you need to check the `lost` property:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c36b7e24-0d50-42fe-b7a9-04599a47951a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NistSearch(success=True, num_compounds=400, lost=True)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "params = nist.NistSearchParameters(no_ion = True, cMS = True)\n", + "s = nist.run_search('C6H?O?', 'formula', params)\n", + "s" + ] + }, + { + "cell_type": "markdown", + "id": "aea437a5-a825-4207-9ae3-ee0fac61969f", + "metadata": {}, + "source": [ + "To overcome that when searching for a large number of substances, try to break the chemical formula into subsets:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "14e641b8-7360-470c-b12a-cdeee7deb6ae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(170, False), (178, False), (80, False), (42, False), (7, False), (24, False)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sub_searches = []\n", + "for i in range(1, 
7):\n", + " s = nist.run_search(f'C6H?O{i}', 'formula', params)\n", + " sub_searches.append( (len(s.compound_ids), s.lost) )\n", + "sub_searches" + ] + }, + { + "cell_type": "markdown", + "id": "40506388-ba9f-412a-99fc-d24360817966", + "metadata": {}, + "source": [ + "A better way to overcome this problem is to use the pre-prepared compound list. For more details see the `Structure Search` page of the CookBook." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst new file mode 100644 index 0000000..b92918d --- /dev/null +++ b/docs/source/changelog.rst @@ -0,0 +1,9 @@ +Changelog +========= + +1.0.0 +----- + +First tracked release. 
+ + diff --git a/docs/source/compound_properties.ipynb b/docs/source/compound_properties.ipynb new file mode 100644 index 0000000..9ae0dfe --- /dev/null +++ b/docs/source/compound_properties.ipynb @@ -0,0 +1,404 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cb74ce1f-c7c9-4916-a1b7-baab8ae661b0", + "metadata": {}, + "source": [ + "# Compound properties" + ] + }, + { + "cell_type": "markdown", + "id": "9ffa0885-f936-47fb-a8a1-0acf229c6919", + "metadata": {}, + "source": [ + "## Initialization\n", + "\n", + "A NIST Chemistry WebBook compound can be initialized via NIST Compound ID, CAS Registry Number, or InChI string:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dbf6b78d-dc00-4ec2-ab0d-6008e3912258", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NistCompound(ID=C632053)" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nistchempy as nist\n", + "\n", + "X = nist.get_compound('C632053')\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bea0e054-e0ec-4958-a3ea-6b8a344fb8db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NistCompound(ID=C632053)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = nist.get_compound('632-05-3')\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f7a4c154-96d1-4749-9b15-288856ca9cfd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NistCompound(ID=C632053)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = nist.get_compound('InChI=1S/C4H7Br3/c1-3(6)4(7)2-5/h3-4H,2H2,1H3')\n", + "X" + ] + }, + { + "cell_type": "markdown", + "id": "078b752f-6f62-4920-b18b-5aec58022c2a", + "metadata": {}, + "source": [ + "If there is no compound with the given identifier in the NIST Chemistry WebBook database, 
`nist.get_compound` will return `None`. The same result will occur if multiple substances correspond to the given InChI." + ] + }, + { + "cell_type": "markdown", + "id": "9d715d52-7687-4bd6-938d-ca353b659f48", + "metadata": {}, + "source": [ + "The other way of compound initialization is to run the search and load found compounds (see the **Basic Search** section of the CookBook)." + ] + }, + { + "cell_type": "markdown", + "id": "6b65ad21-fefd-4ce2-9097-fbf3d1c1bf06", + "metadata": {}, + "source": [ + "## Properties\n", + "\n", + "The `nist.compound.NistCompound` object contains information extracted from the NIST Chemistry WebBook's compound web page. It can be divided into three groups:\n", + " - **Basic properties** — properties which are already extracted from the compound web page:\n", + " - `ID`: NIST Compound ID;\n", + " - `name`: chemical name;\n", + " - `synonyms`: synonyms;\n", + " - `formula`: chemical formula;\n", + " - `mol_weight`: molecular weight;\n", + " - `inchi` / `inchi_key`: InChI / InChIKey strings;\n", + " - `cas_rn`: CAS Registry Number.\n", + " - **Reference properties** — dictionaries {property name => URL}. There are four subgroups:\n", + " - `mol_refs`: molecular properties, which are 2D and 3D MOL-files;\n", + " - `data_refs`: WebBook properties, which are stored in NIST Chemistry WebBook;\n", + " - `nist_public_refs`: other properties, which are stored on public NIST websites;\n", + " - `nist_subscription_refs`: other properties, which are stored on paid NIST websites.\n", + " - **Extracted properties** — properties extracted from the URLs provided by **reference properties**:\n", + " - `mol2D` / `mol3D`: text blocks of 2D / 3D MOL-files;\n", + " - `ir_specs` / `thz_specs` / `ms_specs` / `uv_specs`: JDX-formatted text blocks of IR / THz / MS / UV spectra." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "184aa682-2382-4111-a251-6e516d9f2afa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ID': 'C120127',\n", + " 'name': 'Anthracene',\n", + " 'synonyms': ['Anthracin',\n", + " 'Green Oil',\n", + " 'Paranaphthalene',\n", + " 'Tetra Olive N2G',\n", + " 'Anthracene oil',\n", + " 'p-Naphthalene',\n", + " 'Anthracen',\n", + " 'Coal tar pitch volatiles:anthracene',\n", + " 'Sterilite hop defoliant'],\n", + " 'formula': 'C14 H10',\n", + " 'mol_weight': 178.2292,\n", + " 'inchi': 'InChI=1S/C14H10/c1-2-6-12-10-14-8-4-3-7-13(14)9-11(12)5-1/h1-10H',\n", + " 'inchi_key': 'MWPLVEDNUUSJAV-UHFFFAOYSA-N',\n", + " 'cas_rn': '120-12-7',\n", + " 'mol_refs': {'mol2D': 'https://webbook.nist.gov/cgi/cbook.cgi?Str2File=C120127',\n", + " 'mol3D': 'https://webbook.nist.gov/cgi/cbook.cgi?Str3File=C120127'},\n", + " 'data_refs': {'cTG': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=1#Thermo-Gas',\n", + " 'cTC': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=2#Thermo-Condensed',\n", + " 'cTP': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=4#Thermo-Phase',\n", + " 'cTR': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=8#Thermo-React',\n", + " 'cSO': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=10#Solubility',\n", + " 'cIE': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=20#Ion-Energetics',\n", + " 'cIC': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=40#Ion-Cluster',\n", + " 'cIR': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=80#IR-Spec',\n", + " 'cMS': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=200#Mass-Spec',\n", + " 'cUV': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=400#UV-Vis-Spec',\n", + " 'cGC': 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C120127&Units=SI&Mask=2000#Gas-Chrom'},\n", + " 'nist_public_refs': 
{'Gas Phase Kinetics Database': 'https://kinetics.nist.gov/kinetics/rpSearch?cas=120127',\n", + " 'X-ray Photoelectron Spectroscopy Database, version 5.0': 'https://srdata.nist.gov/xps/SpectralByCompdDd/21197',\n", + " 'NIST Polycyclic Aromatic Hydrocarbon Structure Index': 'https://pah.nist.gov/?q=pah015'},\n", + " 'nist_subscription_refs': {'NIST / TRC Web Thermo Tables, \"lite\" edition (thermophysical and thermochemical data)': 'https://wtt-lite.nist.gov/wtt-lite/index.html?cmp=anthracene',\n", + " 'NIST / TRC Web Thermo Tables, professional edition (thermophysical and thermochemical data)': 'https://wtt-pro.nist.gov/wtt-pro/index.html?cmp=anthracene'},\n", + " 'nist_response': NistResponse(ok=True, content_type='text/html; charset=UTF-8'),\n", + " 'mol2D': None,\n", + " 'mol3D': None,\n", + " 'ir_specs': [],\n", + " 'thz_specs': [],\n", + " 'ms_specs': [],\n", + " 'uv_specs': []}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# anthracene example\n", + "s = nist.run_search('anthracene', 'name')\n", + "X = s.compounds[0]\n", + "X.__dict__" + ] + }, + { + "cell_type": "markdown", + "id": "fc7d0742-ae64-422a-999e-e7f1da5f2330", + "metadata": {}, + "source": [ + "## MOL-files\n", + "\n", + "To load MOL-files, one can use `get_mol2D`, `get_mol3D`, or `get_molfiles` methods:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "369bc56f-fb93-4a61-90be-7bda1e432376", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(False, False)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.get_molfiles()\n", + "X.mol2D is None, X.mol3D is None" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b80185db-4e89-4d9e-8d59-f62b7579b8c1", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAAABmJLR0QA/wD/AP+gvaeTAAAWrklEQVR4nO3db0xTVxsA8EsBmRVfCoj8E6KiFhb+uAh1UxOisIDAzL6wxMRGsyzdErXMOALRJWz7sHU6Z0dIJlvmrPvG9kHK4gQc+CdKAUsciqydThYBwdGpjIBQ2vt+OHvv24Fi7z23955z+/w+LEvswcdr+5w/z3lqCMuyDAAAAKFUcgcAAAB0gzQKAABYII0CAAAWSKMAAIAF0igAAGAJkzsAulmt1tHR0ZKSkuTkZLljUY7e3l6bzabVavPz8+WOReGGhobOnj0bHx+/Y8cOuWOhGQsEcTqdmZmZ6BmqVKo9e/bMzs7KHRT1Zmdnd+/eHRISgh5sdna20+mUOyhl8ng8e/fuDQ0NRY967dq1/f39cgdFK0ijvLndbrPZrFL9cx6yaNEi9D/Jycnd3d1yR0ex3t5eblEfERHBTVFHjx6dnp6WOzpF6e7u5h419wZWqVRms9ntdssdHX0gjfJz4cKF7Oxs9LaLjo4+c+aMx+M5cOBAWFgYwzAhISF6vX5kZETuMCkzOTlZU1ODPs9hYWEGg4FlWavVunz5cvSoU1NTLRaL1+uVO1LquVwuo9GIFqGhoaG7du1yu90tLS1xcXHoUaenp587d07uMCkDadRfg4ODer0e7TdTUlJOnTrl+6vj4+M1NTVoDaXRaEwmEyyg/NTW1rZu3Tq0GjIYDA8fPvT91ebmZm7e0ul0ly5dkitO2rnd7vr6+mXLljEMEx4ebjQa5zzqxsbG1atXo0ddVlZ29+5dmSKlD6TR55uenjabzUuXLmUYRq1W19TUTE1NPfWVTqeztLQUvRG1Wi3M6gu7f/++Xq/njkE7Ojqe+jKPx2OxWBISErhP+O3btyUOlXa+u6iCgoKbN28+9WX+v9WBL0ijz2G1WvlO0a2trenp6TCrL8Dr9VosltjYWO7j+tzF+8TEhMlkQp/w8PBwg8Hw4MEDaaKlmu8uKi0traGhgdeQlJQUi8UiQZxUgzT6TDhLy5mZGW5WX7x4Mczqvnp7e1955RX0YEtLS3lNM4ODgwaDAR3tRUdHm0ymJ0+eBCxSuqGlZWRkpLClpZ8LWMBCGn2qiYkJUQ46h4aGuFl9xYoVMKv7lpISExMFP5C+vr6SkhKoPi1gzi5qYGBAwA+Zf5z6+PFj0UNVAEij/+L1ehsaGlJSUriy++joKObP9J3Vt23bFrSz+pxSEv4HsrW1FapP8zmdTm6OSU9Pb25uxvyBvsX9xMTE+vp6j8cjSqiKAWn0/3p6ejZv3ozef7m5uc+qeAiAiiS+s/qjR4/E+uHk87OUJABUn3zN2UWJewnUbrcH6NOhAJBGWfbf821CQkKA5ltpfheiCCglCQDVJ99dlEqlEmUXJdfvQqNgT6Pz14l8N5vd3d12u93/1/f09GzZsoWb1a9evcozZGr09vZu2rRJWCmJZdlvv/12YmLC/9cHbfXJd52Yl5fHd504MzNjsVj8b2UO6JqXUkGdRvFPLWdnZ3NycvjOzGhWT01NZRTa+IRfSjp79izDMElJSfX19by+rCCoqk+inFoeP34cnaK2tLT4P0r0E1iqBWka9a2h49yMm5ycPHjwYHh4OMMwMTExdXV1/s/MYt0HIE1bW5tWq8UsJXV1dW3cuBF9SrOysvg2Mii++iRiDb2pqWnlypVoRt+5c+fg4KD/Y0W5D6AAQZdGA3GjE2dmVlLj08jIiLilJKvVmpaWhn5gYWHh9evX/R+r4OqT6Dc6JycnTSaTsBummLdTlSG40qjvx1L0/iKcmZn2xqfAlZJmZma4ZRc6PBkeHvZ/uMKqTwJakvj+cPQm5PvDg7zxKVjSqDSLPpyZmd7GJ8xSkj9cLldVVRU6AFGr1VVVVbz2sAqoPkm26Gtvb8/KyuJ2AH19ff6PDdrGJ+WnUemPIHGWDHQ1PonVleSngYEB7uE
EVfVJ4iNIdPCK9hZ8D16Ds/FJyWk0EC1J/sOZmS9evEh+4xNXSgoJCRGlK8lPNpuNuzGWkZHR1NTEazhd1ScZC+JjY2O+1wB4zTrB1vik2DQ653qmLE0XODMzyY1PopeS+EITJLdAU2T1iZDrmXa7nTuxycvLs9lsvMbiXGiliALTKGnNQjgzs+/Y2NhYs9ks759Fmq4kP6HjQo1Go7DqE3rIKMuT0Cw0Px7/H1SQND4pKo2SvILDmZkJaXySoJQkwPzq0/j4uP/DSas+EbuCQ6tjdA4eHR1tNpuh8YmjnDRK/hcp4awy5G188i0lJSQkEFj4cjqd5eXlVFefqDhPdDgc27dvRw8qJyeH18myghuflJBGxWpJkgbOzCxL45NcpSQBbDYbt5SjqPpEXXXbarWuWrVK2M0BRTY+0Z1G6b1rSUXjk+ylJAGoqz61t7fTeNcSGp98UZxGae/8YQlufCKqlCQAFdWngLYkSePevXvcRLtmzRrBjU+U/vE5VKZRJfWhE9j4RGYpSQBiq08KW461tbUFeeMTZWlUqd+KREjjE/mlJAFIqz4p8nAwyBufqEmj81uSFPYdnazcjU++pSS9Xu9yufj+BJKRUH1ScKkaCdrGJzrSKCEXJyUgS+MTjaUkAWSsPin+4qSvOY1PnZ2dvMaSeW12YaSnUdJakqQhWeMT7aUkASSuPpHWkiQN9KeOj49ngqPxidw0SnJLkjQC3fjkcDi2bt2KXlNSUkJvKUkAaapPlK6txBI8jU+EptG6ujqu9kdmS5I0cGbmBRqfFFlKEiBw1SeqT/rE5dv4tH79esGNT1qt9tixY4GLEwdxabSnp4c7vSL/CzelIW7jU0tLi4JLSQKIW31qb2+nve4cCGI1PsXHx1+5ciVwcQpDVhq12+0qlQo9r6KiIqov04kOp87rcDiKi4vRWLT4ysnJ4fWlZ8qGWX1yu90nTpzgjgLRDykuLnY4HIGLmTo4jU9TU1NFRUXowapUKl7/pLkEyEqj+fn5DMMsXbo02E6R/Idz67CxsVGj0URGRlZXV5N80iQXzOrT+Pj43r17lyxZotFoGhsbAxcn1XAanzo6OlBxLz8/P2ABCkFWGtXpdKjcIXcgRMPpgUHb+f7+/oBGSDWc6lN/fz86xQtohAoguPEJbch0Ol1Aw+Prnw0IoMiiRYsqKip+/fVXvV4/NTX14YcfZmZmfv/993LHpRAxMTEmk+nGjRvl5eVTU1Offvppenr6V1995fF45A5NObZu3drT02M2m6Oios6fP79+/fqKiorx8XG54xKIrDS6bt067r9gYcnJyadPn0bfD3Tnzp033ngDzepyx6UQa9eubWho6Ojo2Lx58/Dw8Ntvv52VlfXjjz/KHZdyhIWFVVRU3Llzx2g0er3e2trajIyM06dPsyy7wCgyUwRZaTQxMZH7L/BHfn6+3W5HpeGff/75pZdeonpWJ83GjRsvX76Mqk/9/f2vvfbaq6+++ssvv8gdl3LExsZ+8cUXnZ2dmzZtGh4e3r1798svv9zV1fWs15OZIshKowubnp4eGxubmJiQOxCyhIWFGQyGW7duvfXWWx6Pp7a21m63yx2UcoSEhJSXl9+6devIkSMajeb8+fPbtm2bnJyUOy5F2bBhw+XLl7/55pv4+Piurq66ujq5I+KHpjT63XffxcXFvfvuu3IHQqK4uLivv/66s7OzpqaG600CYomIiKisrLx9+7bRaDx8+LBarZY7IqVRqVRvvvmmw+F47733TCaT3OHwEyZ3AEBMubm5ubm5ckehWGgHKncUShYVFXX06FG5o+CNptUoAAAQCNIoAABggTQKAABYII0CAAAWSKMAAIAF0igAAGCBNAoAAFggjQIAABZIowAAgAXSKAAAYIE0CgAAWCCNAgAAFkijAACABdIoAABggTQKAABYII0CAAAWSKMAAIAF0igAAGCBNAoAAFggjQIAABZIowAAgAXSKAAAYIE0CgAAWCCNAgA
AFkijAACABdIoAABggTQKAABYII0CAAAWSKMAAIAF0igAAGCBNAoAAFggjQIAABZIowAAgAXSKAAAYIE0CgAAWCCNAgAAFkijAACABdIoAABggTQKAABYII0CAAAWSKMAAIAF0igAAGCBNAoAAFggjQIAABZIowAAgAXSKAAAYIE0CgAAWCCNAgAAFkijAACABdIoAABggTQKAABYII0qyrVr1z744AO5o1Asl8tVUVHx+eefyx2IYj1+/LiysnJ4eFjuQPgJkzsAHvR6/euvv/7CCy/IHQiJ/vzzz0OHDp08edLr9ebn52/dulXuiBRlenq6trb2448/fvToUUxMzDvvvKNWq+UOSlG8Xu+pU6cOHTo0Ojo6Ojp6+vRpuSPigaY0GhERERERIXcUxJmdnT158uThw4fHxsbCw8P37du3YcMGuYNSDpZlf/jhh+rq6t9//51hmMLCws8++wxyqLjsdvv+/fs7OjoYhtHpdPv27ZM7In7ISqP379/n/gv8ceHCBaPReOPGDYZhCgoKamtrX3zxRbmDUo7Ozs6DBw9euXKFYZiMjIwjR46UlZXJHZSiuFyujz76qK6uzuv1JiUlffLJJ3q9PiQk5FmvJzRFsCTR6XQMw5SUlMgdCAUGBwe5N1xaWlpDQ4M/o7RaLcMw/f39gQ6Pdk6ns7y8HD3epKSk+vr62dnZ547q7+9nGEar1UoQIe3cbrfZbI6KimIYJjw83Gg0jo+PP3dUSUkJwzA6nU6CCP1H1moU+GNmZubLL798//33JyYm1Gp1ZWVldXU1HBmL5a+//jpy5IjZbJ6enlar1fv37z98+PDSpUvljktR2tvbjUbjzZs3GYYpLCysra3NyMiQOygMcufxf8nPz2cYZsmSJVeuXJE7FkJZrdbVq1ejv7uysrKBgQFeYzUaTWRkZHV1tdvtDlyQlJqenjabzRqNhmEYlUql1+uHh4f9Hz4xMVFZWRkZGRkVFWWxWAIXJ9Xu3bun1+vRG3jNmjVNTU3+j+3s7PzPf/7DMEx+fn7AAhSCrDRqt9tVqn/uYBUVFU1NTckdEUEcDgfa0TAMk56e3tzczGtscXExGos2qjk5OTabLXDR0sXr9TY0NHDzU2Fh4fXr1/0f7na7T5w4ER8fj/Iv+iHFxcUOhyNwMVNncnLSZDJFRkYyDKNWq2tqavz/gE9NTRUVFaEHq1Kp7HZ7QEPli6w0yrJsT08P925esWIFzOosy05MTNTU1KBbCtHR0Waz2f+1pO9YjUZjMplaW1vRCWlISIher3e5XAENnnw2m23z5s3oLZeRkcFrfcSybGtra3Z2Nhqu0+na29vr6+uXLVvG/O/I7/HjxwGKnCJWq3XVqlXcLuqPP/7gNZbLCfHx8QRuVYlLo0hdXR331ty2bduNGzfkjkgeXq/XYrEkJCRw28zR0VH/xzY0NKSkpHAZkxs7OTlZU1OzaNEihmESEhKCdq4SVkfi9PX1cfuD1NRUi8Xi9XrRL7lcLqPRGBoayjBMYmJifX29x+MJzB+CdL47ofXr11+6dInXWO4Ja7XaY8eOBS5OHISmUZZlPR6PxWKJi4vjZvVHjx7JHZSk7Hb7pk2b0HsoLy+P1x68p6dny5YtaGxubm5HR8f81zgcDu6WfklJyd27d0ULnXgul6uqqgot0tVqdVVVlT9lYs7g4KDBYEBZMjo62mQyPXnyZP7L7HY7t87Ny8t76t+Cgv3999/cbI12Uf7PUjg7MOmRm0YR31k9NjbWbDYHw6yOs5bxHZuQkLDwWLTajY2NZRhm8eLFNTU109PTIv0hCIVfRzKZTKhwHx4ebjAYHjx4sMDrffcEfPcT9ELvK+6wWK/XL/yU5o8VtgOTC+lpFJmztrp69arcEQWK2+0WfLKG1u++Y/1cv4+MjHDF0+zsbKUumjDrSOjxoo83OuC7ffu2n2PnnFATvrbCdO3aNW4XpdPpOjs7/R87Z/1OSxWUjjTK/u8zkJqayp30jYy
MyB2UyNrb27OystB7qKCgoK+vz/+xFy5c8D1NvnnzpoDfXcGlJ3HrSLwO+DhOp1PwXQsqjI2NcTuhpKQk38Pi56L6NJmaNIrMrzsrYxMqrCUJGRoa4sampKTg1IsUWXoKXB1JGJybv8Sa35Lk/y4KZwdGCMrSKPLbb7+VlpZy9btz587JHZFw6KhO2GU6NBad06GTTVFu2iqm9CRNHUkAnL90ArW1tWVmZnJHJbx2UTg7MHJQmUaR1tZWroGsrKyMxk87ZkuS71hx//i0l54kriMJ47sFwdxGyGVOSxKvXRTODow0FKdRlmVnZmYCsRyTAE5LktPplGYxTmPpScY6kjC+h9oFBQUCDrVlgdOSpLDFOEt7GkV8DwfJb3wStyVJgkUiRaUnEupIAlB3OChWS5JijoaVkEaRixcvEt74FKCWJAmQX3oirY4kABWlaofDsX37dvSgcFqSFHZRQTlplCW78QmzJYlbZD2rJUkCZJaeiK0jCeN7cVLGv+v50E4oGFqSBFBUGkVIa3ySrCVJAkSVnqioIwlAWuNTsLUkCaDANIqQ0PgkbksSOedlspeeqKsjCUBI45NvS1JeXl4wtCQJoNg0ysrd+CRvS5IE2traZCk92Ww2boKkqI4kjIyNT0HbkiSAktMoIn11G+dCHF13CSUuPQ0MDHAPh9I6kjABvSM8X5C3JAmg/DSKSHPXksCWJAlIUHqaX0fi9eEkrY4kgGR3Ldva2rhdFGZLEpm7qEAIljSKBLTxidiWJAn4lp7Qh1ysJf/MzAy3ulFSHUmYgG5WoCVJsOBKo2xgGp+oaEmSgOilJ6vVmpaWpuA6kjCiNz5BSxKmoEujiFiNT5OTkwcPHgwPD2cYJiYmpq6ujvCWJAmIUnrq6urauHEjyhRZWVl8Zxe66kgCiHgE2dTUtHLlSvT3tXPnzsHBQf/HKrIlSYAgTaPInMYnAbP67OxsTk4OXS1JEsAvPf3000/BVkcSQJSC+PHjxxmGycnJ4TXZKP67U3kJ6jTKYnxjPKe7u5vXP/dKSEuSBHp7e7krh6WlpXwPfL/99tuJiQn/X6+AOpIwmI1PMzMzFotFWEuS4r/J30/BnkYRaRqfSGtJkkDgSk++FFZHEkCaxqcgaUkSANLo/wWu8YnkliQJBK7rScF1JAECuk4M8n/ldGGQRv8lEI1PVLQkScC39GQwGPAnEsXXkYSZc2qJf/cj2FqSBIA0+hRi1dDpakmSgFhdT0FVRxJGlJvIwdmSJACk0WfCudHp25IUtJfpngWn9BS0dSQBMN+ElH4tvywgjT5Ha2treno6r1md9pYkCQgoPUEdSRgBW6Igb0kSANLo8/nf+KSkliQJ+Fl6gjoSPj+XltCSJAykUX/NaXw6deqU76+Oj48rsiVJAnNKTw8fPvT91ebmZqgjiWL+QeecR93Y2AgtScJAGuXHd1aPjo4+c+aMx+M5cOBAWFgYo9CWJAn4lp7CwsIMBgPLslardfny5VBHEpdv2T00NHTXrl1ut7ulpQX90zsMtCQJAmmUN/RtjCqVCr3t0IefYZjk5OTu7m65o6NYb29vcnIyephoXc8wjEqlOnr0KCztxdXd3c09au4NrFKpoCVJGEijAjmdzszMTO79t2fPHl6t3+CpZmdnd+/ejU5O0IGp0+mUOyhl8ng8e/fuRctShmHWrl3b398vd1C0CmFZlgFCWa3W0dHRkpISbm4H+Hp7e202m1arzc/PlzsWhRsaGjp79mx8fPyOHTvkjoVikEYBAACLSu4AAACAbpBGAQAAC6RRAADAAmkUAACwQBoFAAAs/wUx08zANz9xbQAAAOJ6VFh0cmRraXRQS0wgcmRraXQgMjAyMy4wOS41AAB4nHu/b+09BiDgZ0AAPiAWAOIGRjaHDCDNzIyfoQFisCBoclSgM7gZGTKYGJgSmBkTmFgzmFiYgZg1gZ0lgZUtgY0zg4mDHYg5E7g5Eji5Erh4M5h4uDOYeHkSRJiB+tk
YmFhZgIaxcXKws7CycfHycHNwim8CyjAi+5FB8+1ee5jA0X9MYLbZ6jj77pz/cDZMvnbdPXuYGhAbptdgt7IDTD2IDVN/a02cA0w9iA1Tv1SnG64exIap52FaDlcPYsPUiwEAjUNAAxurvUYAAAFWelRYdE1PTCByZGtpdCAyMDIzLjA5LjUAAHichZPbasMwDIbv8xR6gGEsW47t3eVQRhlNYOt2X0Zhu+nF6PszKSOxOornHLCVz85v/XJ3uX5+nz7Ol/MD7MdHGNBZdLEBaS/j89cVtubGhuO2cuec4d1ba5sDSAf63dN+guHY9WtkmN+m4ysgAbY8h69btjvOhzWCMIA1dmmAhpJP3Nki20ynOWsCYrzL+YVLbU7LejmHcJcjzamF/3KBOTTR51j/b6u5yj4ic86EkHxdX9JcRV9mzhuK2df1odVgRSCKI2Q8ZVdXiE6DFYkongTjMP/jHZIGKxp303hTRL9l1c/TWMqKuGSwVA/XIvhSJCTfSy0Qh9piOclTnJVhWwwkNicUn4iJWOwgJlCnnTiYVHZljDqLJK+sskUyDVGlhWQibpJQJDKI2xZQVMqB8zpHOiMyXg8p95sfe+rD7uaS1yIAAAB/elRYdFNNSUxFUyByZGtpdCAyMDIzLjA5LjUAAHicPY5RDsAgCEOvss8tIQREJsTjcA0PP40Ovtq8hjY4IkqExDpZmq9xExAqc4NOwFhNbCq0121ad1XojE28JS2oapJUsDaXpBWlekmqWNgTbvcXnijljx+cBsrqnLiHETzjA99yLCQAxHnlAAAAAElFTkSuQmCC", + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from rdkit import Chem\n", + "\n", + "mol = Chem.MolFromMolBlock(X.mol2D)\n", + "mol" + ] + }, + { + "cell_type": "markdown", + "id": "9e5fc177-d6d4-4f3d-abab-463ece22baff", + "metadata": {}, + "source": [ + "## Spectra\n", + "\n", + "To load spectra, one can use `get_ir_spectra`, `get_thz_spectra`, `get_ms_spectra`, `get_uv_spectra`, and `get_all_spectra` methods:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1144e47e-a659-49a3-81e1-6876706eaf65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "([], [], [], [])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.ir_specs, X.thz_specs, X.ms_specs, X.uv_specs" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3960b362-89cd-4153-a658-24aaf4e50ee5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "([], [], [Spectrum(C120127, Mass spectrum #0)], [])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } 
+ ], + "source": [ + "X.get_ms_spectra()\n", + "X.ir_specs, X.thz_specs, X.ms_specs, X.uv_specs" + ] + }, + { + "cell_type": "markdown", + "id": "58cc075d-5efc-4558-8b88-e1f159a4bb74", + "metadata": {}, + "source": [ + "Spectrum object contains JDX-formatted text block of the spectrum which includes both meta-information and spectral data:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "33060848-c874-448a-aa12-47b8c5c4cdd4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "##TITLE=Anthracene\n", + "##JCAMP-DX=4.24\n", + "##DATA TYPE=MASS SPECTRUM\n", + "##ORIGIN=Japan AIST/NIMC Database- Spectrum MS-NW- 132\n", + "##OWNER=NIST Mass Spectrometry Data Center\n", + "Collection (C) 2014 copyright by the U.S. Secretary of Commerce\n", + "on behalf of the United States of America. All rights reserved.\n", + "##CAS REGISTRY NO=120-12-7\n", + "##$NIST MASS SPEC NO=228201\n", + "##MOLFORM=C14 H10\n", + "##MW=178\n", + "##$NIST SOURCE=MSDC\n", + "##XUNITS=M/Z\n", + "##YUNITS=RELATIVE INTENSITY\n", + "##XFACTOR=1\n", + "##YFACTOR=1\n", + "##FIRSTX=27\n", + "##LASTX=181\n", + "##FIRSTY=20\n", + "##MAXX=181\n", + "##MINX=27\n", + "##MAXY=9999\n", + "##MINY=10\n", + "##NPOINTS=62\n", + "##PEAK TABLE=(XY..XY)\n", + "27,20 28,10 38,30 39,109\n", + "50,129 51,129 52,30 61,40\n", + "62,129 63,289 64,20 65,20\n", + "69,20 73,10 74,219 75,299\n", + "76,619 77,80 78,10 83,50\n", + "85,30 86,99 87,169 88,439\n", + "89,759 90,10 98,119 99,90\n", + "100,50 101,50 102,60 110,40\n", + "111,50 113,60 114,20 115,50\n", + "122,40 123,20 124,20 125,50\n", + "126,149 127,60 128,80 137,30\n", + "138,30 139,209 140,80 149,70\n", + "150,419 151,629 152,689 153,80\n", + "163,50 164,20 174,129 175,199\n", + "176,1409 177,799 178,9999 179,1569\n", + "180,149 181,30\n", + "##END=\n", + "\n" + ] + } + ], + "source": [ + "ms = X.ms_specs[0]\n", + "print(ms.jdx_text)" + ] + }, + { + "cell_type": "markdown", + "id": 
"f294f4e8-ada0-4a96-a3eb-16f7bbc7cfc5", + "metadata": {}, + "source": [ + "Spectra of the given compound can be saved to the given directory via the `save_ir_spectra`, `save_thz_spectra`, `save_ms_spectra`, `save_uv_spectra`, and `save_all_spectra` methods of the `nist.compound.NistCompound` object. To save a specific spectrum, one can use the `save` method of the `nist.compound.Spectrum` object." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nistchempy/__init__.py b/nistchempy/__init__.py index e4a8ce8..7b54f80 100644 --- a/nistchempy/__init__.py +++ b/nistchempy/__init__.py @@ -1,14 +1,17 @@ -# imports -from .nistchempy import __version__, \ - get_all_data, Compound, Spectrum, \ - print_search_parameters, \ - SearchParameters, Search - -# module functions -__all__ = [ - '__version__', - 'get_all_data', - 'Compound', 'Spectrum' - 'print_search_parameters', 'SearchParameters', 'Search' -] +'''This package is a Python interface for the NIST Chemistry WebBook database +that provides additional data for the efficient compound search and +automatic retrieval of the stored physico-chemical data + +''' + +__version__ = '1.0.0' +__updated__ = 'September 25, 2024' +__license__ = 'MIT' + + +from nistchempy.compound_list import get_all_data +from nistchempy.compound import get_compound +from nistchempy.search import run_search, NistSearchParameters +from nistchempy.search import get_search_parameters, print_search_parameters + diff --git a/nistchempy/compound.py b/nistchempy/compound.py new file mode 100644 index 0000000..bde8e14 --- /dev/null +++ b/nistchempy/compound.py @@ 
-0,0 +1,370 @@
+'''The module contains compound-related functionality
+
+Attributes:
+    SPEC_TYPES (dict): dictionary containing abbreviations for spectra types used
+        in compound page (keys) or urls for downloading JDX-files (values)
+
+'''
+
+#%% Imports
+
+from __future__ import annotations
+
+import re as _re
+import os as _os
+
+from urllib.parse import urlparse, parse_qs
+
+import nistchempy.requests as _ncpr
+import nistchempy.parsing as _parsing
+
+import dataclasses as _dcs
+import typing as _tp
+
+
+#%% Attributes
+
+SPEC_TYPES = {'IR': 'IR', 'TZ': 'THz', 'MS': 'Mass', 'UV': 'UVVis'}
+
+
+
+#%% Classes
+
+@_dcs.dataclass(eq = False, repr = False)
+class Spectrum():
+    '''Wrapper for IR, THz, MS, and UV-Vis spectra extracted from NIST Chemistry WebBook
+
+    Attributes:
+        compound (NistCompound): parent NistCompound object
+        spec_type (str): IR / TZ (THz IR) / MS / UV (UV-Vis)
+        spec_idx (str): index of the spectrum
+        jdx_text (str): text block of the corresponding JDX-file
+
+    '''
+
+    compound: NistCompound
+    spec_type: str
+    spec_idx: str
+    jdx_text: str
+
+
+    def __str__(self):
+        pretty_names = {'IR': 'IR spectrum', 'TZ': 'THz IR spectrum',
+                        'MS': 'Mass spectrum', 'UV': 'UV-Vis spectrum'}
+
+        return f'Spectrum({self.compound.ID}, {pretty_names[self.spec_type]} #{self.spec_idx})'
+
+
+    def __repr__(self):
+        return self.__str__()
+
+
+    def save(self, name = None, path_dir = None):
+        '''Saves spectrum in JDX format; the default file name is
+        {ID}_{spec_type}_{spec_idx}.jdx, joined with path_dir when it is given
+        '''
+        path = name if name else f'{self.compound.ID}_{self.spec_type}_{self.spec_idx}.jdx'
+        if path_dir:
+            path = _os.path.join(path_dir, path)
+        with open(path, 'w') as outf:
+            outf.write(self.jdx_text)
+
+
+
+@_dcs.dataclass(eq = False, repr = False)
+class NistCompound():
+    '''Stores info on NIST Chemistry WebBook compound
+
+    Attributes:
+        ID (_tp.Optional[str]): NIST compound ID
+        name (_tp.Optional[str]): chemical name
+        synonyms (_tp.List[str]): synonyms of the chemical name
+        formula (_tp.Optional[str]): chemical formula
+        mol_weight (_tp.Optional[float]): molecular weight, g/mol
+        inchi (_tp.Optional[str]): InChI string
+        inchi_key (_tp.Optional[str]): InChI key string
+        cas_rn (_tp.Optional[str]): CAS registry number
+        mol_refs (_tp.Dict[str, str]): references to 2D and 3D MOL-files
+        data_refs (_tp.Dict[str, str]): references to the webpages containing
+            physical chemical data for the given compound
+        nist_public_refs (_tp.Dict[str, str]): references to webpages of other
+            public NIST databases containing data for the given compound
+        nist_subscription_refs (_tp.Dict[str, str]): references to webpages of
+            subscription NIST databases containing data for the given compound
+        nist_response (NistResponse): response to the GET request
+        mol2D (_tp.Optional[str]): text block of a MOL-file containing 2D atomic coordinates
+        mol3D (_tp.Optional[str]): text block of a MOL-file containing 3D atomic coordinates
+        ir_specs (_tp.List[Spectrum]): list of IR Spectrum objects
+        thz_specs (_tp.List[Spectrum]): list of THz Spectrum objects
+        ms_specs (_tp.List[Spectrum]): list of MS Spectrum objects
+        uv_specs (_tp.List[Spectrum]): list of UV-Vis Spectrum objects
+
+    '''
+
+    ID: _tp.Optional[str]
+    name: _tp.Optional[str]
+    synonyms: _tp.List[str]
+    formula: _tp.Optional[str]
+    mol_weight: _tp.Optional[float]
+    inchi: _tp.Optional[str]
+    inchi_key: _tp.Optional[str]
+    cas_rn: _tp.Optional[str]
+    mol_refs: _tp.Dict[str, str]
+    data_refs: _tp.Dict[str, str]
+    nist_public_refs: _tp.Dict[str, str]
+    nist_subscription_refs: _tp.Dict[str, str]
+    nist_response: _ncpr.NistResponse
+    mol2D: _tp.Optional[str] = _dcs.field(init = False)
+    mol3D: _tp.Optional[str] = _dcs.field(init = False)
+    ir_specs: _tp.List[Spectrum] = _dcs.field(init = False)
+    thz_specs: _tp.List[Spectrum] = _dcs.field(init = False)
+    ms_specs: _tp.List[Spectrum] = _dcs.field(init = False)
+    uv_specs: _tp.List[Spectrum] = _dcs.field(init = False)
+
+
+    def __post_init__(self):
+        self.mol2D = None
+        self.mol3D = None
+        self.ir_specs = []
+        self.thz_specs = []
+        self.ms_specs = []
+        self.uv_specs = []
+
+
+    def __str__(self):
+        return f'NistCompound(ID={self.ID})'
+
+
+    def __repr__(self):
+        return self.__str__()
+
+
+##### Loading MOL-files #######################################################
+
+    def get_molfile(self, dim: int, **kwargs) -> None:
+        '''Loads text block of 2D / 3D molfile
+
+        Arguments:
+            dim (int): dimensionality of molfile (2D / 3D)
+            kwargs: requests.get kwargs parameters
+
+        '''
+        if dim not in (2, 3):
+            raise ValueError(f'Bad dimensionality (must be 2 or 3): {dim}')
+        key = f'mol{dim}D'
+        if key not in self.mol_refs:
+            return
+        nr = _ncpr.make_nist_request(self.mol_refs[key], **kwargs)
+        if nr.ok:
+            setattr(self, key, nr.text)
+
+
+    def get_mol2D(self, **kwargs) -> None:
+        '''Loads text block of 2D molfile
+
+        Arguments:
+            kwargs: requests.get kwargs parameters
+
+        '''
+        self.get_molfile(2, **kwargs)
+
+
+    def get_mol3D(self, **kwargs) -> None:
+        '''Loads text block of 3D molfile
+
+        Arguments:
+            kwargs: requests.get kwargs parameters
+
+        '''
+        self.get_molfile(3, **kwargs)
+
+
+    def get_molfiles(self, **kwargs) -> None:
+        '''Loads text block of all available molfiles
+
+        Arguments:
+            kwargs: requests.get kwargs parameters
+
+        '''
+        self.get_mol2D(**kwargs)
+        self.get_mol3D(**kwargs)
+
+
+##### Loading spectra #########################################################
+
+    def get_spectrum(self, spec_type: str, spec_idx: str) -> _tp.Optional[Spectrum]:
+        '''Loads spectrum of given type (IR / TZ / MS / UV) and index
+
+        Arguments:
+            spec_type (str): spectrum type [ IR / TZ / MS / UV ]
+            spec_idx (str): spectrum index
+
+        Returns:
+            _tp.Optional[Spectrum]: wrapper for the JDX text block, None if the request failed
+
+        '''
+        # prepare params
+        if spec_type not in SPEC_TYPES:
+            raise ValueError(f'spec_type must be one of IR / TZ / MS / UV: {spec_type}')
+        params = {'JCAMP': self.ID, 'Index': spec_idx,
+                  'Type': SPEC_TYPES[spec_type]}
+        # request
+        nr = _ncpr.make_nist_request(_ncpr.SEARCH_URL, params)
+        spec = Spectrum(self, spec_type, spec_idx, nr.text) if nr.ok else None
+
+        return spec
+
+
+    def get_spectra(self, spec_type: str) -> None:
+        '''Loads all available spectra of given type (IR / TZ / MS / UV)
+
+        Arguments:
+            spec_type (str): spectrum type [ IR / TZ / MS / UV ]
+
+        '''
+        # prepare
+        if spec_type not in SPEC_TYPES:
+            raise ValueError(f'spec_type must be one of IR / TZ / MS / UV: {spec_type}')
+        key = 'c' + spec_type
+        if key not in self.data_refs:
+            return None
+        # request
+        nr = _ncpr.make_nist_request(self.data_refs[key])
+        if not nr.ok:
+            return None
+        # extract spectra indexes
+        refs = nr.soup.findAll(attrs = {'href': _re.compile('Index=')})
+        refs = [ref.attrs['href'] for ref in refs]
+        idxs = [parse_qs(urlparse(ref).query)['Index'][0] for ref in refs]
+        idxs = sorted(set(idxs))
+        # load spectra (the target list is reset, so repeated calls do not duplicate)
+        key = ('thz' if spec_type == 'TZ' else spec_type.lower()) + '_specs'
+        setattr(self, key, [])
+        for idx in idxs:
+            X = self.get_spectrum(spec_type, idx)
+            if X: getattr(self, key).append(X)
+
+
+    def get_ir_spectra(self):
+        '''Loads all available IR spectra'''
+        self.get_spectra('IR')
+
+
+    def get_thz_spectra(self):
+        '''Loads all available THz spectra'''
+        self.get_spectra('TZ')
+
+
+    def get_ms_spectra(self):
+        '''Loads all available MS spectra'''
+        self.get_spectra('MS')
+
+
+    def get_uv_spectra(self):
+        '''Loads all available UV-Vis spectra'''
+        self.get_spectra('UV')
+
+
+    def get_all_spectra(self):
+        '''Loads all available spectra'''
+        self.get_ir_spectra()
+        self.get_thz_spectra()
+        self.get_ms_spectra()
+        self.get_uv_spectra()
+
+
+    def save_spectra(self, spec_type, path_dir = './') -> None:
+        '''Saves all spectra of given type to the specified folder
+
+        Arguments:
+            spec_type (str): spectrum type [ IR / TZ / MS / UV ]
+            path_dir (str): directory to save spectra
+
+        '''
+        # check input
+        if spec_type not in SPEC_TYPES:
+            raise ValueError(f'spec_type must be one of IR / TZ / MS / UV: {spec_type}')
+        if not _os.path.isdir(path_dir):
+            raise ValueError(f'"{path_dir}" must be a directory')
+        key = ('thz' if spec_type == 'TZ' else spec_type.lower()) + '_specs'  # same attribute naming as in get_spectra
+        for spec in getattr(self, key):
+            spec.save(f'{self.ID}_{spec_type}_{spec.spec_idx}.jdx', path_dir)
+
+
+    def save_ir_spectra(self, path_dir = './') -> None:
+        '''Saves IR spectra to the specified folder'''
+        self.save_spectra('IR', path_dir)
+
+
+    def save_thz_spectra(self, path_dir = './') -> None:
+        '''Saves THz spectra to the specified folder'''
+        self.save_spectra('TZ', path_dir)
+
+
+    def save_ms_spectra(self, path_dir = './') -> None:
+        '''Saves mass spectra to the specified folder'''
+        self.save_spectra('MS', path_dir)
+
+
+    def save_uv_spectra(self, path_dir = './') -> None:
+        '''Saves all UV-Vis spectra to the specified folder'''
+        self.save_spectra('UV', path_dir)
+
+
+    def save_all_spectra(self, path_dir = './') -> None:
+        '''Saves all loaded IR, THz, MS, and UV-Vis spectra to the specified folder'''
+        self.save_ir_spectra(path_dir)
+        self.save_thz_spectra(path_dir)
+        self.save_ms_spectra(path_dir)
+        self.save_uv_spectra(path_dir)
+
+
+
+#%% Initialization
+
+def compound_from_response(nr: _ncpr.NistResponse) -> _tp.Optional[NistCompound]:
+    '''Initializes NistCompound object from the corresponding response
+
+    Arguments:
+        nr (_ncpr.NistResponse): response to the GET request for a compound
+
+    Returns:
+        _tp.Optional[NistCompound]: NistCompound object, and None if there are
+            several compounds corresponding to the given ID
+
+    '''
+    # check if it's compound page
+    if not _parsing.is_compound_page(nr.soup):
+        return None
+    # extract data
+    info = {**_parsing.parse_compound_page(nr.soup),
+            'nist_response': nr}
+    nc = NistCompound(**info)
+
+    return nc
+
+
+def get_compound(ID: str, **kwargs) -> _tp.Optional[NistCompound]:
+    '''Loads the main info on the given NIST compound
+
+    Arguments:
+        ID (str): NIST compound ID, CAS RN or InChI
+        kwargs: requests.get kwargs parameters
+
+    Returns:
+        _tp.Optional[NistCompound]: NistCompound object, and None if there are
+            several compounds corresponding to the given ID
+
+    '''
+    if ID.startswith('InChI='):
+        url = f'{_ncpr.INCHI_URL}/{ID}'
+        params = {}
+    else:
+        url = _ncpr.SEARCH_URL
+        params = {'ID': ID}
+    nr = _ncpr.make_nist_request(url, params, **kwargs)
+    X = compound_from_response(nr)
+
+    return X
+
+
diff --git a/nistchempy/compound_list.py b/nistchempy/compound_list.py
new file mode 100644
index 0000000..4e5b88f
--- /dev/null
+++ b/nistchempy/compound_list.py
@@ -0,0 +1,34 @@
+'''Loads pre-prepared info on compounds structure and data availability'''
+
+#%% Imports
+
+import sys as _sys
+if _sys.version_info < (3, 9):
+    import importlib_resources as _importlib_resources
+else:
+    import importlib.resources as _importlib_resources
+
+import zipfile as _zipfile
+import pandas as _pd
+
+
+#%% Functions
+
+def get_all_data() -> _pd.core.frame.DataFrame:
+    '''Returns pandas dataframe containing info on all NIST Chem WebBook compounds
+
+    Returns:
+        _pd.core.frame.DataFrame: dataframe containing pre-extracted compound info
+
+    '''
+    pkg = _importlib_resources.files('nistchempy')
+    data_file = pkg / 'nist_data.zip'
+    with _importlib_resources.as_file(data_file) as path:
+        # context managers close the archive and its member even if parsing fails
+        with _zipfile.ZipFile(path) as zf, zf.open('nist_data.csv') as inpf:
+            df = _pd.read_csv(inpf, dtype = 'str')
+    df['mol_weight'] = df['mol_weight'].astype(float)
+
+    return df
+
+
diff --git a/nistchempy/nist_data.zip b/nistchempy/nist_data.zip
index 29f6d32..3b2c56e 100644
Binary files a/nistchempy/nist_data.zip and b/nistchempy/nist_data.zip differ
diff --git a/nistchempy/nistchempy.py b/nistchempy/nistchempy.py
deleted file mode 100644
index 693f8c1..0000000
--- a/nistchempy/nistchempy.py
+++ /dev/null
@@ -1,513 +0,0 @@
-'''
-Python API for NIST Chemistry WebBook
-'''
-
-#%% Imports
-
-import sys
-if sys.version_info < (3, 9):
-    import importlib_resources
-else:
-    import importlib.resources as importlib_resources
-
-import re, os, requests, zipfile
-from urllib.parse import urlparse, parse_qs
-from bs4 import BeautifulSoup, Comment
-import pandas as pd
-
-
-#%% Package info
-
-__version__ = '0.2.3' - - -#%% All NIST Data - -def get_all_data(): - ''' - Returns pandas dataframe containing info on all NIST Chem WebBook compounds - ''' - dt0 = {'mol_weight': 'float64'} - dt1 = {k: 'string' for k in ('ID', 'name', 'formula', 'inchi', 'inchi_key', 'cas_rn')} - dt2 = {k: 'bool' for k in ('mol2D', 'mol3D', 'cIR', 'cTZ', 'cMS', 'cUV', 'cGC', - 'cTG', 'cTC', 'cTP', 'cSO', 'cTR', 'cIE', 'cIC', 'cES', 'cDI')} - dtypes = {**dt0, **dt1, **dt2} - pkg = importlib_resources.files('nistchempy') - data_file = pkg / 'nist_data.zip' - with importlib_resources.as_file(data_file) as path: - zf = zipfile.ZipFile(path) - df = pd.read_csv(zf.open('nist_data.csv'), dtype = dtypes) - zf.close() - - return df - - -#%% Support functions - -def _is_compound(soup): - ''' - Checks if html is a single compound page and returns NIST ID if yes - ''' - header = soup.findAll('h1', {'id': 'Top'}) - if not header: - return None - # get info - header = header[0] - info = header.findNext('ul') - if not info: - return None - # extract NIST ID - for comment in soup.findAll(text = lambda text: isinstance(text, Comment)): - comment = str(comment).replace('\r\n', '').replace('\n', '') - if not '/cgi/cbook.cgi' in comment: - continue - return re.search(r'/cgi/cbook.cgi\?Form=(.*?)&', comment).group(1) - - return None - - -#%% Compound-related classes - -class Spectrum(): - ''' - Class for IR, MS, and UV-Vis extracted from NIST Chemistry WebBook - ''' - - _pretty_names = {'IR': 'IR spectrum', - 'TZ': 'THz IR spectrum', - 'MS': 'Mass spectrum', - 'UV': 'UV-Vis spectrum'} - - def __init__(self, compound, spec_type, spec_idx, jdx): - self.compound = compound - self.spec_type = spec_type - self.spec_idx = spec_idx - self.jdx_text = jdx - - def save(self, name = None, path_dir = None): - ''' - Saves spectrum in JDX format - ''' - path = name if name else f'{self.compound.ID}_{self.spec_type}_{self.spec_idx}.jdx' - if path_dir: - path = os.path.join(path_dir, path) - with open(path, 'w') as 
outf: - outf.write(self.jdx_text) - - def __str__(self): - return f'Spectrum({self.compound.ID}, {self._pretty_names[self.spec_type]} #{self.spec_idx})' - - def __repr__(self): - return f'Spectrum({self.compound.ID}, {self._pretty_names[self.spec_type]} #{self.spec_idx})' - - -class Compound(): - ''' - Object for NIST Chemistry WebBook compound - ''' - - # NIST URLs - _NIST_URL = 'https://webbook.nist.gov' - _COMP_ID = '/cgi/cbook.cgi' - - # mappings for spectra - _MASKS = {'1': 'cTG', '2': 'cTC', '4': 'cTP', '8': 'cTR', '10': 'cSO', - '20': 'cIE', '40': 'cIC', '80': 'cIR', '100': 'cTZ', '200': 'cMS', - '400': 'cUV', '800': 'cES', '1000': 'cDI', '2000': 'cGC'} - _SPECS = {'IR': 'IR', 'TZ': 'THz', 'MS': 'Mass', 'UV': 'UVVis'} - - def _load_compound_info(self): - ''' - Loads main compound info - ''' - r = requests.get(self._NIST_URL + self._COMP_ID, {'ID': self.ID, 'Units': 'SI'}) - if not r.ok: - raise ConnectionError(f'Bad NIST response, status code: {r.status_code}') - # check if it is compound page - soup = BeautifulSoup(re.sub('clss=', 'class=', r.text), - features = 'html.parser') - header = soup.findAll('h1', {'id': 'Top'}) - if not header: - raise ValueError(f'Bad compound ID: {self.ID}') - header = header[0] - # get info - info = header.findNext('ul') - if not info: - raise ValueError(f'Bad compound ID: {self.ID}') - # name - self.name = header.text - # synonyms - hits = info.findChildren(text = re.compile('Other names')) - if hits: - text = hits[0].findParent('li').text.replace('Other names:', '') - synonyms = [_.strip(';').strip() for _ in text.split('\n')] - self.synonyms = [_ for _ in synonyms if _] - # formula - hits = info.findChildren(text = re.compile('Formula')) - if hits: - text = hits[0].findParent('li').text.replace('Formula:', '') - self.formula = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', text.strip()) - # mol weight - hits = info.findChildren(text = re.compile('Molecular weight')) - if hits: - text = hits[0].findParent('li').text.replace('Molecular 
weight:', '') - self.mol_weight = float(text) - # InChI and InChI key - hits = info.findChildren(attrs = {'class': 'inchi-text'}) - if hits: - for hit in hits: - if 'InChI=' in hit.text: - self.inchi = hit.text - elif re.search(r'', hit.text): - self.inchi_key = hit.text - # CAS RN - hits = info.findChildren(text = re.compile('CAS Registry Number:')) - if hits: - text = hits[0].findParent('li').text.replace('CAS Registry Number:', '') - self.cas_rn = text.strip() - # 2D structure - hits = info.findChildren(attrs = {'href': re.compile('Str2File')}) - if hits: - self.data_refs['mol2D'] = self._NIST_URL + hits[0].attrs['href'] - # 3D structure - hits = info.findChildren(attrs = {'href': re.compile('Str3File')}) - if hits: - self.data_refs['mol3D'] = self._NIST_URL + hits[0].attrs['href'] - # other data and spectroscopy - hits = info.findChildren(attrs = {'href': re.compile('/cgi/cbook.cgi.*Mask=\d')}) - for hit in hits: - mask = re.search('Mask=(\d+)', hit.attrs['href']).group(1) - key = self._MASKS.get(mask, hit.text) - if key in self.data_refs: - self.data_refs[key] += [self._NIST_URL + hit.attrs['href']] - else: - self.data_refs[key] = [self._NIST_URL + hit.attrs['href']] - - def get_2D(self): - ''' - Loads 2D structure in MOL2 format - ''' - if 'mol2D' not in self.data_refs: - return - r = requests.get(self.data_refs['mol2D']) - if r.ok: - self.mol2D = r.text - - def get_3D(self): - ''' - Loads 3D structure in MOL2 format - ''' - if 'mol3D' not in self.data_refs: - return - r = requests.get(self.data_refs['mol3D']) - if r.ok: - self.mol3D = r.text - - def get_spectra(self, spec_type): - ''' - Loads available mass spectra in JCAMP-DX format - ''' - if spec_type not in self._SPECS: - raise ValueError(f'Bad spec_type value: {spec_type}') - if 'c'+spec_type not in self.data_refs: - return - r = requests.get(self.data_refs['c'+spec_type][0]) - if not r.ok: - return - soup = BeautifulSoup(re.sub('clss=', 'class=', r.text), - features = 'html.parser') - # get available 
spectrum indexes - idxs = soup.findAll(attrs = {'href': re.compile('Index=')}) - idxs = [re.search(r'Index=(\d+)', _.attrs['href']).group(1) for _ in idxs] - idxs = sorted(list(set(idxs))) - # load jdxs - for idx in idxs: - spec = requests.get(self._NIST_URL + self._COMP_ID, - {'JCAMP': self.ID, 'Index': idx, - 'Type': self._SPECS[spec_type]}) - if spec.ok: - spec = Spectrum(self, spec_type, idx, spec.text) - getattr(self, spec_type).append(spec) - - def get_ir_spectra(self): - ''' - Loads available IR spectra in JCAMP-DX format - ''' - - return self.get_spectra('IR') - - def get_tz_spectra(self): - ''' - Loads available IR spectra in JCAMP-DX format - ''' - - return self.get_spectra('TZ') - - def get_ms_spectra(self): - ''' - Loads available mass spectra in JCAMP-DX format - ''' - - return self.get_spectra('MS') - - def get_uv_spectra(self): - ''' - Loads available UV-Vis spectra in JCAMP-DX format - ''' - - return self.get_spectra('UV') - - def get_all_spectra(self): - ''' - Loads available spectroscopic data - ''' - self.get_ir_spectra() - self.get_tz_spectra() - self.get_ms_spectra() - self.get_uv_spectra() - - def get_all_data(self): - ''' - Loads available structural and spectroscopic data - ''' - self.get_2D() - self.get_3D() - self.get_all_spectra() - - def save_spectra(self, spec_type, path_dir = './'): - ''' - Saves all spectra of given type to the specified folder - ''' - if not os.path.isdir(path_dir): - raise ValueError(f'"{path_dir}" must be directory') - for spec in getattr(self, spec_type): - spec.save(f'{self.ID}_{spec_type}_{spec.spec_idx}.jdx', path_dir) - - def save_ir_spectra(self, path_dir = './'): - ''' - Saves IR spectra to the specified folder - ''' - self.save_spectra('IR', path_dir) - - def save_tz_spectra(self, path_dir = './'): - ''' - Saves IR spectra to the specified folder - ''' - self.save_spectra('TZ', path_dir) - - def save_ms_spectra(self, path_dir = './'): - ''' - Saves mass spectra to the specified folder - ''' - 
self.save_spectra('MS', path_dir) - - def save_uv_spectra(self, path_dir = './'): - ''' - Saves all UV-Vis spectra to the specified folder - ''' - self.save_spectra('UV', path_dir) - - def save_all_spectra(self, path_dir = './'): - ''' - Saves all UV-Vis spectra to the specified folder - ''' - self.save_ir_spectra(path_dir) - self.save_tz_spectra(path_dir) - self.save_ms_spectra(path_dir) - self.save_uv_spectra(path_dir) - - def __init__(self, ID): - self.ID = ID - for prop, val in [('name', None), ('synonyms', []), ('formula', None), ('mol_weight', None), - ('inchi', None), ('inchi_key', None), ('cas_rn', None), - ('IR', []), ('TZ', []), ('MS', []), ('UV', []), - ('mol2D', None), ('mol3D', None), - ('data_refs', {})]: - setattr(self, prop, val) - self._load_compound_info() - - def __str__(self): - return f'Compound({self.ID})' - - def __repr__(self): - return f'Compound({self.ID})' - - -#%% Search-related classes - -def print_search_parameters(): - ''' - Prints available search parameters - ''' - info = {'Units': 'Units for thermodynamic data, "SI" or "CAL" for calorie-based', - 'MatchIso': 'Exactly match the specified isotopes (formula search only)', - 'AllowOther': 'Allow elements not specified in formula (formula search only)', - 'AllowExtra': 'Allow more atoms of elements in formula than specified (formula search only)', - 'NoIon': 'Exclude ions from the search (formula search only)', - 'cTG': 'Contains gas-phase thermodynamic data', - 'cTC': 'Contains condensed-phase thermodynamic data', - 'cTP': 'Contains phase-change thermodynamic data', - 'cTR': 'Contains reaction thermodynamic data', - 'cIE': 'Contains ion energetics thermodynamic data', - 'cIC': 'Contains ion cluster thermodynamic data', - 'cIR': 'Contains IR data', - 'cTZ': 'Contains THz IR data', - 'cMS': 'Contains MS data', - 'cUV': 'Contains UV/Vis data', - 'cGC': 'Contains gas chromatography data', - 'cES': 'Contains vibrational and electronic energy levels', - 'cDI': 'Contains constants of diatomic 
molecules', - 'cSO': 'Contains info on Henry\'s law'} - max_len = max([len(_) for _ in info]) - spaces = [' '*(max_len - len(_) + 1) for _ in info] - for (key, val), space in zip(info.items(), spaces): - print(f'{key}{space}: {val}') - - -class SearchParameters(): - ''' - Object containing parameters for compound search in NIST WebBook - To get the full description of available options please use - "print_search_parameters" function - ''' - info = {'Units': 'SI', - 'MatchIso': False, 'AllowOther': False, 'AllowExtra': False, 'NoIon': False, - 'cTG': False, 'cTC': False, 'cTP': False, 'cTR': False, 'cIE': False, 'cIC': False, - 'cIR': False, 'cTZ': False, 'cMS': False, 'cUV': False, 'cGC': False, - 'cES': False, 'cDI': False, 'cSO': False} - - def get_request_parameters(self): - ''' - Returns dictionary with GET parameters - ''' - params = {'Units': self.Units} - for key in self.info: - if key == 'Units': - continue - val = getattr(self, key) - if val: - params[key] = 'on' - - return params - - def __init__(self, **kwargs): - # set default - for key, val in self.info.items(): - setattr(self, key, val) - # check kwargs - for key, val in kwargs.items(): - if key not in self.info: - raise TypeError(f'"{key}" is an invalid keyword argument for SearchParameters') - if key == 'Units' and val not in ('SI', 'CAL'): - raise ValueError(f'Bad value for "Units" parameter: {val}') - if key != 'Units' and type(val) is not bool: - raise ValueError(f'Bad value for "{key}" parameter: {val}') - setattr(self, key, val) - - def __str__(self): - sep = ', ' # ',\n' + ' '*17 - text = [f'SearchParameters(Units={self.Units}'] + \ - [f'{key}={getattr(self, key)}' for key in self.info if key != 'Units' and getattr(self, key)] - text[-1] = text[-1] + ')' - - return sep.join(text) - - def __repr__(self): - sep = ', ' # ',\n' + ' '*17 - text = [f'SearchParameters(Units={self.Units}'] + \ - [f'{key}={getattr(self, key)}' for key in self.info if key != 'Units' and getattr(self, key)] - text[-1] = 
text[-1] + ')' - - return sep.join(text) - - -class Search(): - ''' - Object for searching in NIST Chemistry WebBook - ''' - - # NIST URLs - _NIST_URL = 'https://webbook.nist.gov' - _COMP_ID = '/cgi/cbook.cgi' - - # parameters data - search_types = {'formula': 'Formula', 'name': 'Name', 'inchi': 'InChI', 'cas': 'ID'} - formula_only = ('MatchIso', 'AllowOther', 'AllowExtra', 'NoIon') - - def __init__(self, **kwargs): - self.parameters = SearchParameters(**kwargs) - self.IDs = [] - self.compounds = [] - self.lost = False - self.success = True - - def find_compounds(self, identifier, search_type, **kwargs): - ''' - Search for species data by chemical name - search_type must be one of 'formula', 'name', 'inchi', 'cas' - clear_found: clear all found compounds - raise_lost: raise exception if limit of 400 compounds per search - was achieved - ''' - if search_type not in self.search_types: - raise ValueError(f'Bad search_type value: {search_type}') - # prepare GET parameters - params = {self.search_types[search_type]: identifier} - params.update(self.parameters.get_request_parameters()) - addend = SearchParameters(**kwargs) - params.update(addend.get_request_parameters()) - # load webpage - r = requests.get(self._NIST_URL + self._COMP_ID, params) - if not r.ok: - self.success = False - self.IDs = [] - self.compounds = [] - self.lost = False - return - soup = BeautifulSoup(re.sub('clss=', 'class=', r.text), - features = 'html.parser') - # check if no compounds - if search_type == 'inchi': - errs = ['information from the inchi', 'no matching species found'] - else: - errs = ['not found'] - err_flag = False - for err in errs: - if sum([err in _.text.lower() for _ in soup.findAll('h1')]): - err_flag = True - break - if err_flag: - self.success = True - self.IDs = [] - self.compounds = [] - self.lost = False - return - # check if one compound - flag = _is_compound(soup) - if flag: - self.success = True - self.IDs = [flag] - self.compounds = [] - self.lost = False - return - # 
def get_found_compounds(soup: _bs4.BeautifulSoup) -> dict:
    '''Collects NIST IDs of the compounds listed on a search results page

    Arguments:
        soup (_bs4.BeautifulSoup): bs4-parsed web-page

    Returns:
        dict: 'IDs' maps to the list of found compound IDs; 'lost' is True if
            the page text contains the "due to the large number of matching
            species" notice

    '''
    found = {'IDs': [], 'lost': False}
    try:
        anchors = soup.find('ol').findChildren('a', href = _re.compile('/cgi/cbook.cgi'))
        compound_ids = []
        for anchor in anchors:
            query = _uparse.urlparse(anchor.attrs['href']).query
            compound_ids.append(_uparse.parse_qs(query)['ID'][0])
        truncated = 'due to the large number of matching species' in soup.text.lower()
    except AttributeError: # no ol with compound refs
        return found
    found['IDs'] = compound_ids
    found['lost'] = truncated

    return found
def get_compound_id_from_units_switch(soup: _bs4.BeautifulSoup) -> _tp.Optional[str]:
    '''Extracts compound ID from url to switch energy units

    Arguments:
        soup (_bs4.BeautifulSoup): bs4-parsed web-page

    Returns:
        _tp.Optional[str]: NIST compound ID, None if not detected (including
            pages that lack the compound header or its info list)

    '''
    # get info block; return None instead of raising when it is missing
    headers = soup.findAll('h1', {'id': 'Top'})
    if not headers:
        return None
    info = headers[0].findNext('ul')
    if not info:
        return None
    # get switch link
    refs = info.findChildren(name = 'a', string = _re.compile('witch to'))
    # extract ID
    for ref in refs:
        # non-greedy group: stop at the first '&' so that any further query
        # parameters of the link do not end up inside the ID
        match = _re.search(r'/cgi/.*\?ID=(.*?)&', str(ref))
        if match:
            return match.group(1)

    return None
def get_compound_mol_weight(soup: _bs4.BeautifulSoup) -> _tp.Optional[float]:
    '''Extracts molecular weight from compound page

    Arguments:
        soup (_bs4.BeautifulSoup): bs4-parsed web-page

    Returns:
        _tp.Optional[float]: molecular weight, and None if not found or if
            the entry does not contain a parsable number

    '''
    # prepare
    header = soup.findAll('h1', {'id': 'Top'})[0]
    info = header.findNext('ul')
    # find molecular weight entry
    hits = info.findChildren(string = _re.compile('Molecular weight'))
    if not hits:
        return None
    text = hits[0].findParent('li').text.replace('Molecular weight:', '')
    # keep digits and dots only; the first remaining token is the candidate
    tokens = _re.sub(r'[^0-9.]', ' ', text).split()
    if not tokens:
        return None
    try:
        return float(tokens[0])
    except ValueError:
        # token is not a valid float as a whole (e.g. stray dots):
        # fall back to the first decimal number inside it, if any
        match = _re.search(r'\d+\.\d+', tokens[0])
        return float(match.group(0)) if match else None
def get_compound_mol_refs(soup: _bs4.BeautifulSoup) -> _tp.Dict[str, str]:
    '''Collects URLs of the 2D and 3D MOL-files referenced on a compound page

    Arguments:
        soup (_bs4.BeautifulSoup): bs4-parsed web-page

    Returns:
        _tp.Dict[str, str]: mol2D / mol3D are keys, URLs are values; a key is
            absent if the page has no matching link

    '''
    info = soup.findAll('h1', {'id': 'Top'})[0].findNext('ul')
    mol_refs = {}
    # each structure file is recognized by a marker inside its href
    for key, marker in (('mol2D', 'Str2File'), ('mol3D', 'Str3File')):
        links = info.findChildren(attrs = {'href': _re.compile(marker)})
        if links:
            mol_refs[key] = _ncpr.BASE_URL + links[0].attrs['href']

    return mol_refs
def parse_compound_page(soup: _bs4.BeautifulSoup) -> _tp.Optional[dict]:
    '''Parses NIST compound webpage and returns dictionary with extracted info

    Arguments:
        soup (_bs4.BeautifulSoup): bs4-parsed web-page

    Returns:
        _tp.Optional[dict]: dictionary with extracted info and None if webpage
            does not correspond to single compound

    '''
    # the field extractors below index into the <h1 id="Top"> header and the
    # list that follows it, so return None (as documented) when they are absent
    if not is_compound_page(soup):
        return None
    # extract
    info = {'ID': get_compound_id(soup),
            'name': get_compound_name(soup),
            'synonyms': get_compound_synonyms(soup),
            'formula': get_compound_formula(soup),
            'mol_weight': get_compound_mol_weight(soup),
            'inchi': get_compound_inchi(soup),
            'inchi_key': get_compound_inchi_key(soup),
            'cas_rn': get_compound_casrn(soup),
            'mol_refs': get_compound_mol_refs(soup),
            'data_refs': get_compound_data_refs(soup),
            'nist_public_refs': get_compound_nist_public_refs(soup),
            'nist_subscription_refs': get_compound_nist_subscription_refs(soup)}

    return info
@_dcs.dataclass(eq = False)
class NistResponse():
    '''Describes response to the GET request to the NIST Chemistry WebBook

    Attributes:
        response (_requests.models.Response): request's response
        ok (bool): True if request's status code is less than 400
        content_type (_tp.Optional[str]): content type of the response,
            None if the response has no content-type header
        text (_tp.Optional[str]): text of the response
        soup (_tp.Optional[_bs4.BeautifulSoup]): BeautifulSoup object of the
            html response, None for non-html responses

    '''

    response: _requests.models.Response = _dcs.field(repr = False)
    ok: bool = _dcs.field(init = False, repr = True)
    content_type: _tp.Optional[str] = _dcs.field(init = False, repr = True)
    text: _tp.Optional[str] = _dcs.field(init = False, repr = False)
    soup: _tp.Optional[_bs4.BeautifulSoup] = _dcs.field(default = None, init = False, repr = False)


    def __post_init__(self):
        self.ok = self.response.ok
        self.text = self.response.text
        self.content_type = self.response.headers.get('content-type', None)
        # the header may be missing, in which case there is nothing to parse
        if self.content_type is not None and 'html' in self.content_type.lower():
            self.text = fix_html(self.text)
            self.soup = _bs4.BeautifulSoup(self.text, features = 'html.parser')


    def _save_response(self, path: str) -> None:
        '''Saves response HTML page for testing purposes

        Arguments:
            path (str): path of the output file

        '''
        # explicit encoding: the platform default may fail on non-ASCII pages
        with open(path, 'w', encoding = 'utf-8') as outf:
            outf.write(self.response.text)
@_dcs.dataclass
class NistSearchParameters():
    '''GET parameters for compound search of NIST Chemistry WebBook

    Attributes:
        use_SI (bool): if True, returns results in SI units. otherwise calories are used
        match_isotopes (bool): if True, exactly matches the specified isotopes (formula search only)
        allow_other (bool): if True, allows elements not specified in formula (formula search only)
        allow_extra (bool): if True, allows more atoms of elements in formula than specified (formula search only)
        no_ion (bool): if True, excludes ions from the search (formula search only)
        cTG (bool): if True, returns entries containing gas-phase thermodynamic data
        cTC (bool): if True, returns entries containing condensed-phase thermodynamic data
        cTP (bool): if True, returns entries containing phase-change thermodynamic data
        cTR (bool): if True, returns entries containing reaction thermodynamic data
        cIE (bool): if True, returns entries containing ion energetics thermodynamic data
        cIC (bool): if True, returns entries containing ion cluster thermodynamic data
        cIR (bool): if True, returns entries containing IR data
        cTZ (bool): if True, returns entries containing THz IR data
        cMS (bool): if True, returns entries containing MS data
        cUV (bool): if True, returns entries containing UV/Vis data
        cGC (bool): if True, returns entries containing gas chromatography data
        cES (bool): if True, returns entries containing vibrational and electronic energy levels
        cDI (bool): if True, returns entries containing constants of diatomic molecules
        cSO (bool): if True, returns entries containing info on Henry\'s law

    '''

    use_SI: bool = True # Units = SI/CAL
    match_isotopes: bool = False
    allow_other: bool = False
    allow_extra: bool = False
    no_ion: bool = False
    cTG: bool = False
    cTC: bool = False
    cTP: bool = False
    cTR: bool = False
    cIE: bool = False
    cIC: bool = False
    cIR: bool = False
    cTZ: bool = False
    cMS: bool = False
    cUV: bool = False
    cGC: bool = False
    cES: bool = False
    cDI: bool = False
    cSO: bool = False


    def __str__(self):
        params = [f'{k}={v}' for k, v in self.__dict__.items() if v]
        text = f'SearchParameters({", ".join(params)})'

        return text


    def __repr__(self):
        return self.__str__()


    def get_request_parameters(self) -> dict:
        '''Returns dictionary containing GET parameters

        Returns:
            dict: dictionary of GET parameters relevant to the search;
                'Units' is always present, every enabled flag maps to 'on'

        '''
        # formula-search options are named differently in the WebBook query
        # string than in this dataclass; data-availability flags (cXX) are
        # sent under their own names
        nist_keys = {'match_isotopes': 'MatchIso', 'allow_other': 'AllowOther',
                     'allow_extra': 'AllowExtra', 'no_ion': 'NoIon'}
        params = {'Units': 'SI' if self.use_SI else 'CAL'}
        for key, val in self.__dict__.items():
            # use_SI is already encoded by the 'Units' parameter
            if key == 'use_SI' or not val:
                continue
            params[nist_keys.get(key, key)] = 'on'

        return params
run_search(identifier: str, search_type: str, + search_parameters: _tp.Optional[NistSearchParameters] = None, + use_SI: bool = True, match_isotopes: bool = False, + allow_other: bool = False, allow_extra: bool = False, + no_ion: bool = False, cTG: bool = False, cTC: bool = False, + cTP: bool = False, cTR: bool = False, cIE: bool = False, + cIC: bool = False, cIR: bool = False, cTZ: bool = False, + cMS: bool = False, cUV: bool = False, cGC: bool = False, + cES: bool = False, cDI: bool = False, cSO: bool = False, + **kwargs) -> NistSearch: + '''Searches compounds in NIST Chemistry WebBook + + Arguments: + identifier (str): NIST compound ID / formula / name / inchi / CAS RN + search_type (str): identifier type, available options are: + - 'formula' + - 'name' + - 'inchi' + - 'cas' + - 'id' + search_parameters (_tp.Optional[NistSearchParameters]): search parameters; if provided, the following search parameter arguments are ignored + use_SI (bool): if True, returns results in SI units. otherwise calories are used + match_isotopes (bool): if True, exactly matches the specified isotopes (formula search only) + allow_other (bool): if True, allows elements not specified in formula (formula search only) + allow_extra (bool): if True, allows more atoms of elements in formula than specified (formula search only) + no_ion (bool): if True, excludes ions from the search (formula search only) + cTG (bool): if True, returns entries containing gas-phase thermodynamic data + cTC (bool): if True, returns entries containing condensed-phase thermodynamic data + cTP (bool): if True, returns entries containing phase-change thermodynamic data + cTR (bool): if True, returns entries containing reaction thermodynamic data + cIE (bool): if True, returns entries containing ion energetics thermodynamic data + cIC (bool): if True, returns entries containing ion cluster thermodynamic data + cIR (bool): if True, returns entries containing IR data + cTZ (bool): if True, returns entries containing THz 
IR data + cMS (bool): if True, returns entries containing MS data + cUV (bool): if True, returns entries containing UV/Vis data + cGC (bool): if True, returns entries containing gas chromatography data + cES (bool): if True, returns entries containing vibrational and electronic energy levels + cDI (bool): if True, returns entries containing constants of diatomic molecules + cSO (bool): if True, returns entries containing info on Henry\'s law + kwargs: requests.get parameters + + Returns: + NistSearch: search object containing info on found compounds + + ''' + # parameters + search_types = {'formula': 'Formula', 'name': 'Name', + 'inchi': 'InChI', 'cas': 'ID', 'id': 'ID'} + if search_type not in search_types: + raise ValueError(f'Bad search_type value: {search_type}') + # prepare search parameters + if search_parameters is None: + search_parameters = NistSearchParameters(use_SI = use_SI, + match_isotopes = match_isotopes if search_type == 'formula' else False, + allow_other = allow_other if search_type == 'formula' else False, + allow_extra = allow_extra if search_type == 'formula' else False, + no_ion = no_ion if search_type == 'formula' else False, + cTG = cTG, cTC = cTC, cTP = cTP, cTR = cTR, cIE = cIE, cIC = cIC, + cIR = cIR, cTZ = cTZ, cMS = cMS, cUV = cUV, cGC = cGC, cES = cES, + cDI = cDI, cSO = cSO) + # prepare GET parameters + params = {search_types[search_type]: identifier, + **search_parameters.get_request_parameters()} + # load webpage + nr = _ncpr.make_nist_request(_ncpr.SEARCH_URL, params, **kwargs) + if not nr.ok: + return NistSearch(nist_response = nr, search_parameters = search_parameters, + compound_ids = [], success = False, lost = False) + + # XXX: there are possible "search errors" which follows the

tag: + # 1) 'information from the inchi' and 'no matching species found' + # for inchi search + # 2) 'not found' for other searches + # possibly in the future we need to catch them explicitly + + # check if response is a compound page + if _parsing.is_compound_page(nr.soup): + X = _compound.compound_from_response(nr) + nsearch = NistSearch(nist_response = nr, search_parameters = search_parameters, + compound_ids = [X.ID], success = True, lost = False) + nsearch.compounds = [X] + return nsearch + # extract IDs + info = _parsing.get_found_compounds(nr.soup) + + return NistSearch(nist_response = nr, search_parameters = search_parameters, + compound_ids = info['IDs'], success = True, lost = info['lost']) + + diff --git a/setup.cfg b/setup.cfg index 013dedd..e755657 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,19 +1,19 @@ [metadata] name = NistChemPy -version = 0.2.3 +version = attr:nistchempy.__version__ author = Ivan Chernyshov author_email = ivan.chernyshoff@gmail.com description = A simple Python wrapper around the NIST Chemistry WebBook long_description = file: README.md long_description_content_type = text/markdown -url = https://github.com/EPiCs-group/NistChemPy +url = https://github.com/IvanChernyshov/NistChemPy project_urls = - Bug Tracker = https://github.com/EPiCs-group/NistChemPy/issues + Bug Tracker = https://github.com/IvanChernyshov/NistChemPy/issues keywords = nist python api chemistry cheminformatics classifiers = License :: OSI Approved :: MIT License Programming Language :: Python :: 3 - Development Status :: 3 - Alpha + Development Status :: 4 - Beta Operating System :: OS Independent Intended Audience :: Science/Research Topic :: Scientific/Engineering @@ -31,7 +31,7 @@ install_requires = requests beautifulsoup4 pandas -python_requires = >=3.6 +python_requires = >=3.7 [options.package_data] * = *.zip diff --git a/tests/test_compound.py b/tests/test_compound.py new file mode 100644 index 0000000..fbce42a --- /dev/null +++ b/tests/test_compound.py @@ 
-0,0 +1,45 @@ +'''Unit tests for nistchempy.compound''' + +import nistchempy as nist + + +class TestCompoundInitialization: + + def test_correct_id(self): + X = nist.get_compound('C71432') + assert X is not None + + def test_correct_casrn(self): + X = nist.get_compound('71-43-2') + assert X is not None + + def test_correct_inchi(self): + X = nist.get_compound('InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H') + assert X is not None + + def test_nonunique_inchi(self): + X = nist.get_compound('InChI=1S/C10H14O2/c1-6-3-4-8-7(2)5-12-10(11)9(6)8/h5-6,8-9H,3-4H2,1-2H3') + assert X is None + + def test_incorrect_id(self): + X = nist.get_compound('qwe-qwe-qwe') + assert X is None + + + +class TestPropertyExtraction: + + X = nist.get_compound('C71432') + + def test_mol2D(self): + assert self.X.mol2D is None + self.X.get_mol2D() + assert self.X.mol2D is not None + + def test_ms_spec(self): + assert not self.X.ms_specs + self.X.get_ms_spectra() + assert self.X.ms_specs + assert self.X.ms_specs[0].jdx_text is not None + + diff --git a/tests/test_parsing.py b/tests/test_parsing.py new file mode 100644 index 0000000..ef99644 --- /dev/null +++ b/tests/test_parsing.py @@ -0,0 +1,11 @@ +'''Unit tests for nistchempy.parsing formed from validation of NistChemPy +via all NIST Chemistry WebBook compounds''' + +import nistchempy as nist + + +def test_mw(): + X = nist.get_compound('C25085534') + assert X.mol_weight + + diff --git a/tests/test_search.py b/tests/test_search.py new file mode 100644 index 0000000..623fb34 --- /dev/null +++ b/tests/test_search.py @@ -0,0 +1,46 @@ +'''Unit tests for nistchempy.search''' + +import nistchempy as nist + + +class TestSearch: + + def test_search_id(self): + s = nist.run_search('C71432', 'id') + assert len(s.compounds) == 1 + assert s.compounds[0].name.lower() == 'benzene' + + def test_search_casrn(self): + s = nist.run_search('71-43-2', 'cas') + assert len(s.compounds) == 1 + assert s.compounds[0].name.lower() == 'benzene' + + def test_search_name(self): + s = 
nist.run_search('*butadiene*', 'name') + assert len(s.compound_ids) > 0 + X = nist.get_compound(s.compound_ids[0]) + names = [X.name] + X.synonyms + assert any(['butadiene' in name.lower() for name in names]) + + def test_search_formula(self): + s = nist.run_search('C6H?Cl3', 'formula') + assert s.compound_ids + + def test_search_inchi(self): + s = nist.run_search('InChI=1S/C10H14O2/c1-6-3-4-8-7(2)5-12-10(11)9(6)8/h5-6,8-9H,3-4H2,1-2H3', 'inchi') + assert s.compound_ids + + def test_search_bad_inchi(self): + s = nist.run_search('qwe-qwe-qwe', 'inchi') + assert not s.compound_ids + + def test_search_lost(self): + s = nist.run_search('C?H?O?', 'formula') + assert s.lost + + def test_load_compounds(self): + s = nist.run_search('InChI=1S/C10H14O2/c1-6-3-4-8-7(2)5-12-10(11)9(6)8/h5-6,8-9H,3-4H2,1-2H3', 'inchi') + s.load_found_compounds() + assert all([X.ID is not None for X in s.compounds]) + + diff --git a/tutorial.ipynb b/tutorial.ipynb deleted file mode 100644 index 34f4148..0000000 --- a/tutorial.ipynb +++ /dev/null @@ -1,1101 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "73cc5915", - "metadata": {}, - "source": [ - "# NistChemPy Tutorial\n", - "\n", - "## Compound and Spectrum\n", - "\n", - "To get NIST compound initialize `Compound` object with NIST ID. 
The main properties including name, chemical formula, InChI, and links to physico-chemical data will be parsed:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "e9ada870", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'ID': 'C85018',\n", - " 'name': 'Phenanthrene',\n", - " 'synonyms': ['Phenanthren', 'Phenanthrin', 'Phenantrin'],\n", - " 'formula': 'C14 H10',\n", - " 'mol_weight': 178.2292,\n", - " 'inchi': 'InChI=1S/C14H10/c1-3-7-13-11(5-1)9-10-12-6-2-4-8-14(12)13/h1-10H',\n", - " 'inchi_key': 'YNPNZTXNASCQKK-UHFFFAOYSA-N',\n", - " 'cas_rn': '85-01-8',\n", - " 'IR': [],\n", - " 'MS': [],\n", - " 'UV': [],\n", - " 'mol2D': None,\n", - " 'mol3D': None,\n", - " 'data_refs': {'mol2D': 'https://webbook.nist.gov/cgi/cbook.cgi?Str2File=C85018',\n", - " 'mol3D': 'https://webbook.nist.gov/cgi/cbook.cgi?Str3File=C85018',\n", - " 'cTG': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=1#Thermo-Gas'],\n", - " 'cTC': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=2#Thermo-Condensed'],\n", - " 'cTP': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=4#Thermo-Phase'],\n", - " 'cTR': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=8#Thermo-React'],\n", - " 'cSO': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=10#Solubility'],\n", - " 'cIE': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=20#Ion-Energetics'],\n", - " 'cIC': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=40#Ion-Cluster'],\n", - " 'cIR': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=80#IR-Spec'],\n", - " 'cMS': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=200#Mass-Spec'],\n", - " 'cUV': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=400#UV-Vis-Spec'],\n", - " 'cGC': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C85018&Units=SI&Mask=2000#Gas-Chrom']}}" - ] - }, - "execution_count": 1, - "metadata": 
{}, - "output_type": "execute_result" - } - ], - "source": [ - "import nistchempy as nist\n", - "X = nist.Compound('C85018')\n", - "X.__dict__" - ] - }, - { - "cell_type": "markdown", - "id": "1f07886f", - "metadata": {}, - "source": [ - "Abbreviations of available data types can be viewed using the `print_search_params` function:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "106804f3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Units : Units for thermodynamic data, \"SI\" or \"CAL\" for calorie-based\n", - "MatchIso : Exactly match the specified isotopes (formula search only)\n", - "AllowOther : Allow elements not specified in formula (formula search only)\n", - "AllowExtra : Allow more atoms of elements in formula than specified (formula search only)\n", - "NoIon : Exclude ions from the search (formula search only)\n", - "cTG : Contains gas-phase thermodynamic data\n", - "cTC : Contains condensed-phase thermodynamic data\n", - "cTP : Contains phase-change thermodynamic data\n", - "cTR : Contains reaction thermodynamic data\n", - "cIE : Contains ion energetics thermodynamic data\n", - "cIC : Contains ion cluster thermodynamic data\n", - "cIR : Contains IR data\n", - "cTZ : Contains THz IR data\n", - "cMS : Contains MS data\n", - "cUV : Contains UV/Vis data\n", - "cGC : Contains gas chromatography data\n", - "cES : Contains vibrational and electronic energy levels\n", - "cDI : Contains constants of diatomic molecules\n", - "cSO : Contains info on Henry's law\n" - ] - } - ], - "source": [ - "nist.print_search_parameters()" - ] - }, - { - "cell_type": "markdown", - "id": "6bc36359", - "metadata": {}, - "source": [ - "MOL files containing 2D and 3D coordinates and spectroscopic data will not be loaded due to the additional request required for each property. 
They can be downloaded later:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "eebfc20b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r\n", - " NIST 07011517253D 1 1.00000 -539.53865\r\n", - "Copyright by the U.S. Sec. Commerce on behalf of U.S.A. All rights reserved.\r\n", - " 24 26 0 0 0 0 0 0 0 0999 V2000\r\n", - " 4.2671 4.2111 6.0319 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 3.4011 3.3615 5.3683 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 3.4337 3.2256 3.9602 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 5.2115 4.9687 5.3136 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 5.2684 4.8584 3.9386 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 4.3927 3.9962 3.2378 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 4.4609 3.8894 1.8079 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 2.5375 2.3405 3.2259 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 2.6439 2.2686 1.8051 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 1.5565 1.5396 3.8570 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 3.6253 3.0639 1.1234 C 0 0 0 0 0 0 0 0 0 0 0 0\r\n", - " 1.7801 1.4135 1.0811 C 0 \n" - ] - } - ], - "source": [ - "X.get_3D() # X.get_2d() for 2D coordinates\n", - "print(X.mol3D[:1000])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "96ef503d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Spectrum(C85018, IR spectrum #0),\n", - " Spectrum(C85018, IR spectrum #1),\n", - " Spectrum(C85018, IR spectrum #2),\n", - " Spectrum(C85018, IR spectrum #3),\n", - " Spectrum(C85018, IR spectrum #4),\n", - " Spectrum(C85018, IR spectrum #5),\n", - " Spectrum(C85018, IR spectrum #6)]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X.get_spectra('IR')\n", - "X.IR" - ] - }, - { - "cell_type": "markdown", - "id": "b183a068", - "metadata": {}, - "source": [ - "The spectra are stored as a list, and each contains the text of a JCAMP-DX file:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "17a2f820", - 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Compound(C85018) IR 2\n", - "====================\n", - "##TITLE=PHENANTHRENE\n", - "##JCAMP-DX=4.24\n", - "##DATA TYPE=INFRARED SPECTRUM\n", - "##CLASS=COBLENTZ\n", - "##ORIGIN=CENTRE D'ETUDES NUCLEAIRES DE GRENOBLE\n", - "##OWNER=COBLENTZ SOCIETY\n", - "Collection (C) 2018 copyright by the U.S. Secretary of Commerce\n", - "on behalf of the United States of America. All rights reserved.\n", - "##DATE=Not specified, most likely prior to 1970\n", - "##CAS REGISTRY NO=85-01-8\n", - "##MOLFORM=C14 H10\n", - "##SOURCE REFERENCE=COBLENTZ NO. 4253\n", - "##$NIST SOURCE=COBLENTZ\n", - "##$NIST IMAGE=cob4253\n", - "##SPECTROMETER/DATA SYSTEM=Not specified, most likely a prism, grating, or hybrid spectrometer.\n", - "##STATE=SOLUTION (SATURATED IN HEPTANE)\n", - "##PATH LENGTH=0.05 CM\n", - "$$PURITY 99.99%\n", - "##SAMPLING PROCEDURE=TRANSMISSION\n", - "##RESOLUTION=4\n", - "##DATA PROCESSING=DIGITIZED BY NIST FROM HARD COPY\n", - "##XUNITS=MICROMETERS\n", - "##YUNITS=TRANSMITTANCE\n", - "##XFACTOR=1.000000\n", - "##YFACTOR=1\n", - "##DELTAX=000.011124\n", - "##FIRSTX=14.665\n", - "##LASTX=35.1221\n", - "##FIRSTY=0.843\n", - "##MAXX=35.1221\n", - "##MINX=14.665\n", - "##MAXY=0.93\n", - "##MINY=0.358\n", - "##NPOINTS=1840\n", - "##XYDATA=(X++(Y..Y))\n", - "14.665000 0.8430 0.8430 0.8450 0.8\n" - ] - } - ], - "source": [ - "spec = X.IR[2]\n", - "print(spec.compound, spec.spec_type, spec.spec_idx)\n", - "print('='*20)\n", - "print(spec.jdx_text[:1000])" - ] - }, - { - "cell_type": "markdown", - "id": "c9e3a4a6", - "metadata": {}, - "source": [ - "## Search\n", - "\n", - "There are four available search types: by [name](https://webbook.nist.gov/chemistry/name-ser/), [InChI](https://webbook.nist.gov/chemistry/inchi-ser/), [CAS RN](https://webbook.nist.gov/chemistry/cas-ser/), and [chemical formula](https://webbook.nist.gov/chemistry/form-ser/). 
In addition to the main identifier, you can limit the search using several parameters, which can be using the `print_search_params` function:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "77787d19", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Units : Units for thermodynamic data, \"SI\" or \"CAL\" for calorie-based\n", - "MatchIso : Exactly match the specified isotopes (formula search only)\n", - "AllowOther : Allow elements not specified in formula (formula search only)\n", - "AllowExtra : Allow more atoms of elements in formula than specified (formula search only)\n", - "NoIon : Exclude ions from the search (formula search only)\n", - "cTG : Contains gas-phase thermodynamic data\n", - "cTC : Contains condensed-phase thermodynamic data\n", - "cTP : Contains phase-change thermodynamic data\n", - "cTR : Contains reaction thermodynamic data\n", - "cIE : Contains ion energetics thermodynamic data\n", - "cIC : Contains ion cluster thermodynamic data\n", - "cIR : Contains IR data\n", - "cTZ : Contains THz IR data\n", - "cMS : Contains MS data\n", - "cUV : Contains UV/Vis data\n", - "cGC : Contains gas chromatography data\n", - "cES : Contains vibrational and electronic energy levels\n", - "cDI : Contains constants of diatomic molecules\n", - "cSO : Contains info on Henry's law\n" - ] - } - ], - "source": [ - "nist.print_search_parameters()" - ] - }, - { - "cell_type": "markdown", - "id": "9e73ff08", - "metadata": {}, - "source": [ - "These options can be specified when initializing the `Search` object or later in the find_compounds method as `**kwargs`:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "99f4a811", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "SearchParameters(Units=SI, NoIon=True, cMS=True)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "search = nist.Search(NoIon = 
True, cMS = True)\n", - "search.parameters" - ] - }, - { - "cell_type": "markdown", - "id": "16ee93f3", - "metadata": {}, - "source": [ - "After setting parameters you can start searching compounds. Let's start with the name search. Search object have four properties, which are updated after each run of `find_compounds` method:\n", - "* `success`: was the search successful?\n", - "* `lost`: did the search stay within the limit of 400 compounds?\n", - "* `IDs`: NIST IDs of found compounds (`Compound` objects are not initialized here to prevent wasting time on internet requests);\n", - "* `compounds`: list of `Compound` objects, which is empty after search." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d8a41351", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Search(Success=True, Lost=False, Found=4)\n", - "True False ['C1871585', 'C298180', 'C1529686', 'C1464535'] []\n" - ] - } - ], - "source": [ - "search.find_compounds(identifier = '1,2,3*-butane', search_type = 'name')\n", - "print(search)\n", - "print(search.success, search.lost, search.IDs, search.compounds)" - ] - }, - { - "cell_type": "markdown", - "id": "3ddb9f34", - "metadata": {}, - "source": [ - "After search finished, you can initialize `Compound` objects:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "5b761c5b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Compound(C1871585), Compound(C298180), Compound(C1529686), Compound(C1464535)]\n", - "Propane, 1,2,3-trichloro-2-methyl-\n", - "['1,2,3-Trichloro-2-methylpropane', '1,2,3-Trichloroisobutane']\n" - ] - } - ], - "source": [ - "search.load_found_compounds()\n", - "print(search.compounds)\n", - "print(search.compounds[0].name)\n", - "print(search.compounds[0].synonyms)" - ] - }, - { - "cell_type": "markdown", - "id": "06bf9558", - "metadata": {}, - "source": [ - "Search by CAS registry number and InChI 
ignores some search parameters. Let's exemplify this on AgCl. Even though there are no available MS data for AgCl, the output contains it:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "fa5e59c3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "SearchParameters(Units=SI, NoIon=True, cMS=True)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "search.find_compounds('7783-90-6', 'cas')\n", - "search.parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "e7c8c596", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'mol2D': 'https://webbook.nist.gov/cgi/cbook.cgi?Str2File=C7783906',\n", - " 'mol3D': 'https://webbook.nist.gov/cgi/cbook.cgi?Str3File=C7783906',\n", - " 'cTC': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C7783906&Units=SI&Mask=2#Thermo-Condensed'],\n", - " 'cTP': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C7783906&Units=SI&Mask=4#Thermo-Phase'],\n", - " 'cTR': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C7783906&Units=SI&Mask=8#Thermo-React'],\n", - " 'cIE': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C7783906&Units=SI&Mask=20#Ion-Energetics'],\n", - " 'cDI': ['https://webbook.nist.gov/cgi/cbook.cgi?ID=C7783906&Units=SI&Mask=1000#Diatomic']}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "search.load_found_compounds()\n", - "X = search.compounds[0]\n", - "X.data_refs" - ] - }, - { - "cell_type": "markdown", - "id": "8be1edb0", - "metadata": {}, - "source": [ - "Search by chemical formula is the most powerful way of retrieving data. The only problem is the possibility that the number of found entries will exceed the limit of 400 compounds. 
To check if this happened, you need to get the `lost` property:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "191aed3e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Search(Success=True, Lost=True, Found=400)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "search = nist.Search(NoIon = True, cMS = True)\n", - "search.find_compounds('C6H*O?', 'formula')\n", - "search" - ] - }, - { - "cell_type": "markdown", - "id": "c7466a73", - "metadata": {}, - "source": [ - "To overcome that when searching for a large number of substances, try to break the chemical formula into subsets:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d24573fb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[(170, False), (178, False), (80, False), (42, False), (7, False), (24, False)]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "overflows = []\n", - "for i in range(1, 7):\n", - " search.find_compounds(f'C6H?O{i}', 'formula')\n", - " overflows.append( (len(search.IDs), search.lost) )\n", - "overflows" - ] - }, - { - "cell_type": "markdown", - "id": "e0da51d0", - "metadata": {}, - "source": [ - "This strategy can be used to combine search results and use the found identifiers to collect spectroscopic data.\n", - "\n", - "## Extracted data on NIST compounds\n", - "\n", - "Limiting search results to 400 substances and the impossibility to create an external API for the search by substructure brings significant inconvenience to the search process. To overcome this problem, we extracted all NIST Chemistry WebBook compounds using the [sitemap](https://webbook.nist.gov/sitemap_index.xml) and organized the data as a pandas data frame. 
It consists of 24 columns:\n", - "* columns **1–7** contains the compound description:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "b3567315", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDnameformulamol_weightinchiinchi_keycas_rn
0B100iron oxide anionFeO-71.8450<NA><NA><NA>
1B1000AsF3..Cl anionAsClF3-167.3700<NA><NA><NA>
2B1000000AgH2-AgH2-109.8846<NA><NA><NA>
3B1000001HAg(H2)AgH3110.8920<NA><NA><NA>
4B1000002AgNO+AgNO+137.8738<NA><NA><NA>
........................
129000U99777Methyl 3-hydroxycholest-5-en-26-oate, TMS deri...C31 H54 O3 Si502.8442InChI=1S/C31H54O3Si/c1-21(10-9-11-22(2)29(32)3...DNXGNXYNSBCWGX-QBUYVTDMSA-N<NA>
129001U998302-Methyl-3-oxovaleric acid, O,O'-bis(trimethyl...C12 H26 O3 Si2274.5040InChI=1S/C12H26O3Si2/c1-9-11(14-16(3,4)5)10(2)...LXAIQDVPXKOIGO-KHPPLWFESA-N<NA>
129002U999423-Hydroxy-3-(4'-hydroxy-3'-methoxyphenyl)propi...C19 H36 O5 Si3428.7426InChI=1S/C19H36O5Si3/c1-21-18-13-15(11-12-16(1...QCMUGKOFXVYNCF-UHFFFAOYSA-N<NA>
129003U999472-Propylpentanoic acid, 2,3,4,6-tetra(trimethy...C26 H58 O7 Si4595.0765InChI=1S/C26H58O7Si4/c1-15-17-20(18-16-2)25(27...OVXMRISJDUWFKB-UHFFFAOYSA-N<NA>
129004xY5O2 radicalO2 Y5476.5281<NA><NA><NA>
\n", - "

129005 rows × 7 columns

\n", - "
" - ], - "text/plain": [ - " ID name \\\n", - "0 B100 iron oxide anion \n", - "1 B1000 AsF3..Cl anion \n", - "2 B1000000 AgH2- \n", - "3 B1000001 HAg(H2) \n", - "4 B1000002 AgNO+ \n", - "... ... ... \n", - "129000 U99777 Methyl 3-hydroxycholest-5-en-26-oate, TMS deri... \n", - "129001 U99830 2-Methyl-3-oxovaleric acid, O,O'-bis(trimethyl... \n", - "129002 U99942 3-Hydroxy-3-(4'-hydroxy-3'-methoxyphenyl)propi... \n", - "129003 U99947 2-Propylpentanoic acid, 2,3,4,6-tetra(trimethy... \n", - "129004 x Y5O2 radical \n", - "\n", - " formula mol_weight \\\n", - "0 FeO- 71.8450 \n", - "1 AsClF3- 167.3700 \n", - "2 AgH2- 109.8846 \n", - "3 AgH3 110.8920 \n", - "4 AgNO+ 137.8738 \n", - "... ... ... \n", - "129000 C31 H54 O3 Si 502.8442 \n", - "129001 C12 H26 O3 Si2 274.5040 \n", - "129002 C19 H36 O5 Si3 428.7426 \n", - "129003 C26 H58 O7 Si4 595.0765 \n", - "129004 O2 Y5 476.5281 \n", - "\n", - " inchi \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "... ... \n", - "129000 InChI=1S/C31H54O3Si/c1-21(10-9-11-22(2)29(32)3... \n", - "129001 InChI=1S/C12H26O3Si2/c1-9-11(14-16(3,4)5)10(2)... \n", - "129002 InChI=1S/C19H36O5Si3/c1-21-18-13-15(11-12-16(1... \n", - "129003 InChI=1S/C26H58O7Si4/c1-15-17-20(18-16-2)25(27... \n", - "129004 \n", - "\n", - " inchi_key cas_rn \n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "... ... ... 
\n", - "129000 DNXGNXYNSBCWGX-QBUYVTDMSA-N \n", - "129001 LXAIQDVPXKOIGO-KHPPLWFESA-N \n", - "129002 QCMUGKOFXVYNCF-UHFFFAOYSA-N \n", - "129003 OVXMRISJDUWFKB-UHFFFAOYSA-N \n", - "129004 \n", - "\n", - "[129005 rows x 7 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = nist.get_all_data()\n", - "df.loc[:, df.columns[:7]]" - ] - }, - { - "cell_type": "markdown", - "id": "e2e5261f", - "metadata": {}, - "source": [ - "* columns **8–23** correspond to the available compound properties, including atomic coordinates, spectra, and thermodynamic data (for the full description see the `print_search_params` function):" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "0ed8bf6a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mol2Dmol3DcIRcTZcMScUVcGCcTGcTCcTPcSOcTRcIEcICcEScDI
0FalseFalseFalseFalseFalseFalseFalseTrueFalseFalseFalseTrueTrueFalseFalseFalse
1FalseFalseFalseFalseFalseFalseFalseTrueFalseFalseFalseFalseFalseFalseFalseFalse
2FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
3FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
4FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
...................................................
129000TrueFalseFalseFalseTrueFalseTrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
129001TrueFalseFalseFalseTrueFalseTrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
129002TrueFalseFalseFalseTrueFalseTrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
129003TrueFalseFalseFalseTrueFalseTrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
129004FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalseFalseFalse
\n", - "

129005 rows × 16 columns

\n", - "
" - ], - "text/plain": [ - " mol2D mol3D cIR cTZ cMS cUV cGC cTG cTC cTP \\\n", - "0 False False False False False False False True False False \n", - "1 False False False False False False False True False False \n", - "2 False False False False False False False False False False \n", - "3 False False False False False False False False False False \n", - "4 False False False False False False False False False False \n", - "... ... ... ... ... ... ... ... ... ... ... \n", - "129000 True False False False True False True False False False \n", - "129001 True False False False True False True False False False \n", - "129002 True False False False True False True False False False \n", - "129003 True False False False True False True False False False \n", - "129004 False False False False False False False False False False \n", - "\n", - " cSO cTR cIE cIC cES cDI \n", - "0 False True True False False False \n", - "1 False False False False False False \n", - "2 False False False False True False \n", - "3 False False False False True False \n", - "4 False False False False True False \n", - "... ... ... ... ... ... ... 
\n", - "129000 False False False False False False \n", - "129001 False False False False False False \n", - "129002 False False False False False False \n", - "129003 False False False False False False \n", - "129004 False False True False False False \n", - "\n", - "[129005 rows x 16 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.loc[:, df.columns[7:]]" - ] - }, - { - "cell_type": "markdown", - "id": "53520657", - "metadata": {}, - "source": [ - "This data can be easily used to get the full list of compounds with the desired properties, and the use of chemoinformatic libraries will allow filtering substances by structure:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "ff06ecdb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Compound(C100016),\n", - " Compound(C100027),\n", - " Compound(C100094),\n", - " Compound(C100107),\n", - " Compound(C100129)]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "IDs = df.ID[~df.inchi.isna() & df.cMS & df.cUV]\n", - "compounds = [nist.Compound(ID) for ID in IDs[:5]]\n", - "compounds" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/update/README.md b/update/README.md new file mode 100644 index 0000000..f992e59 --- /dev/null +++ b/update/README.md @@ -0,0 +1,34 @@ +# NistChemPy Update Scripts + +This directory contains scripts whose purpose is to update the pre-downloaded info on NIST Chemistry WebBook compounds. 
+ + +## Requirements + +All requirements are listed in [requirements.txt](requirements.txt). + +[tqdm](https://tqdm.github.io/) is the only addition to NistChemPy and its dependencies. + + +## How to use + +Update scripts use NIST Chemistry WebBook sitemaps to get URLs of compound web pages and use them to extract data. + +All scripts require a root directory for storing all interim data, which is passed as the first CLI parameter. + +The data update pipeline consists of the following steps: + +1. [get_nist_compounds.py](get_nist_compounds.py): downloads sitemaps, extracts and saves compound URLs. + +2. [get_compound_htmls.py](get_compound_htmls.py): downloads HTML pages of found compounds. + +3. [check_compound_initialization.py](check_compound_initialization.py): initializes Compound objects from the downloaded HTML files. +Possible errors must be manually verified to fix bugs in NistChemPy code. + +4. [process_nonload_errors.py](process_nonload_errors.py): processes errors related to broken links in sitemaps. + +5. [get_missing_stereoisomers.py](get_missing_stereoisomers.py): extracts info on stereoisomers for each pre-downloaded compound. This fixes almost all errors with broken compound URLs. + +6. [extract_info_from_htmls.py](extract_info_from_htmls.py): extracts info on compounds from prepared compound HTMLs and saves it as the final nist_data.csv file required for the package. 
def get_unreadable_compounds(dir_html: str) -> List[int]:
    '''Iterates through loaded HTML-files and returns indexes of non-readable ones

    Files are expected to be named "<row index>.html". Directory entries which
    do not follow this scheme (other extensions, non-integer names) are
    skipped instead of aborting the whole scan with a ValueError.

    Arguments:
        dir_html (str): path to the directory containing compound HTML-files

    Returns:
        List[int]: integer file name stems (compounds.csv row indexes) of the
            HTML-files which did not pass check_soup

    '''
    errors = []
    # cycle over files
    fs = os.listdir(dir_html)
    for f in tqdm(fs, total = len(fs)):
        # the file name stem is the compounds.csv row index
        stem, ext = os.path.splitext(f)
        if ext != '.html':
            continue
        try:
            idx = int(stem)
        except ValueError:
            continue
        # parse the page
        path = os.path.join(dir_html, f)
        with open(path, 'r') as inpf:
            soup = BeautifulSoup(inpf.read(), 'html.parser')
        # check
        if not check_soup(soup):
            errors.append(idx)

    return errors
def check_arguments(args: argparse.Namespace) -> None:
    '''Validates CLI arguments; nothing is created on disk

    dir_data must be an existing directory which contains both the
    compounds.csv file and the htmls/ folder read by this script.

    Arguments:
        args (argparse.Namespace): input parameters

    Raises:
        ValueError: if dir_data does not exist, is not a directory, or does
            not contain compounds.csv or htmls/

    '''
    # check root dir
    if not os.path.exists(args.dir_data):
        raise ValueError(f'Given dir_data argument does not exist: {args.dir_data}')
    if not os.path.isdir(args.dir_data):
        raise ValueError(f'Given dir_data argument is not a directory: {args.dir_data}')
    # check compounds.csv
    path_csv = os.path.join(args.dir_data, 'compounds.csv')
    if not os.path.isfile(path_csv):
        raise ValueError('Given dir_data directory does not contain compounds.csv file')
    # check htmls dir: fail here with a clear message rather than later in os.listdir
    dir_html = os.path.join(args.dir_data, 'htmls')
    if not os.path.isdir(dir_html):
        raise ValueError('Given dir_data directory does not contain htmls/ folder')

    return
def get_columns(data: list) -> list:
    '''Builds the ordered list of column names from compound data

    Plain (non-"_refs") keys of the first record come first, followed by the
    names collected from every "*_refs" mapping over all records.

    Arguments:
        data (list): contents of compounds_data.json

    Returns:
        list: column names

    Raises:
        IndexError: if data is empty
        KeyError: if records have no 'data_refs' key or a record lacks one
            of the "*_refs" keys of the first record

    '''
    first = data[0]
    cols = [k for k in first if '_refs' not in k]
    # get unique ref keys
    keys = {k: set() for k in first if '_refs' in k}
    for item in data:
        for k, found in keys.items():
            found.update(item[k])
    keys = {k: sorted(v) for k, v in keys.items()}
    # fix data_refs: values of 3-letter search parameters go first,
    # then the remaining (non 3-letter) reference names found in data
    ps = nist.search.get_search_parameters()
    data_refs = [v for k, v in ps.items() if len(k) == 3]
    data_refs += [k for k in keys['data_refs'] if len(k) != 3]
    keys['data_refs'] = data_refs
    # final columns
    for v in keys.values():
        cols += v

    return cols
def get_arguments() -> argparse.Namespace:
    '''CLI wrapper

    The description and help text describe this extraction script; they were
    previously copied from the HTML download script.

    Returns:
        argparse.Namespace: CLI arguments (dir_data)

    '''
    parser = argparse.ArgumentParser(description = 'Extracts compound info from previously downloaded HTML-files')
    parser.add_argument('dir_data',
                        help = 'root data directory containing htmls/ folder with downloaded compound pages')
    args = parser.parse_args()

    return args
def download_compound_htmls(df: pd.core.frame.DataFrame, dir_html: str,
                            path_err: str, crawl_delay: float = 5) -> None:
    '''Downloads HTML pages for every compound of the dataframe

    Each page is saved as "<row index>.html". Execution stops with a
    non-zero exit status after three failed downloads in a row; the check is
    done right after the failure, so the abort also works for the last rows
    and no long pause is made before exiting.

    Arguments:
        df (pd.core.frame.DataFrame): compounds' [ id / url ] dataframe
        dir_html (str): directory containing HTML pages
        path_err (str): errors file
        crawl_delay (float): interval between HTTP requests, seconds

    Raises:
        SystemExit: after three download errors in a row

    '''
    max_errs = 3
    n_errs = 0
    # download cycle
    for i, url in tqdm(zip(df.index, df.url), total = len(df)):
        path_html = os.path.join(dir_html, f'{i}.html')
        try:
            download_compound_html(url, path_html, path_err)
            n_errs = 0
            time.sleep(crawl_delay)
        except (KeyboardInterrupt, SystemError, SystemExit):
            raise
        except Exception as err:
            n_errs += 1
            # keep the cause visible instead of silently retrying
            print(f'\nDownload failed for {url}: {err!r}')
            if n_errs >= max_errs:
                print(f'\n{max_errs} download errors in a row, stopping execution ...')
                sys.exit(1)
            # long pause before the next attempt
            time.sleep(max(120, 10*crawl_delay))

    return
def get_arguments() -> argparse.Namespace:
    '''Parses command-line options of the HTML download script

    Returns:
        argparse.Namespace: parsed dir_data and crawl_delay values

    '''
    cli = argparse.ArgumentParser(
        description = 'Downloads HTML-pages of NIST Chemistry WebBook compounds'
    )
    # positional root data directory
    cli.add_argument(
        'dir_data',
        help = 'directory containing compound.csv file created by get_nist_compounds.py script',
    )
    # optional pause between requests
    cli.add_argument(
        '--crawl-delay',
        type = float,
        default = 5,
        help = 'pause between HTTP requests, seconds',
    )

    return cli.parse_args()
def get_stereoisomers(soup: BeautifulSoup) -> List[str]:
    '''Extracts info on stereoisomers from compound's soup

    The first text node matching "Stereoisomers" is located, and the "ID"
    query parameter of every link inside its parent <li> element is
    collected. Anchors without an href attribute are skipped.

    Arguments:
        soup (BeautifulSoup): compound web page's soup

    Returns:
        List[str]: list of compound IDs corresponding to the stereoisomers,
            empty if the page has no stereoisomer section

    '''
    # find stereoisomer refs
    hits = soup.find_all(string = re.compile('Stereoisomers'))
    if not hits:
        return []
    item = hits[0].find_parent('li')
    if not item:
        return []
    # extract IDs
    IDs = []
    for a in item.find_all('a'):
        href = a.get('href')
        if not href:
            continue
        ps = parse_qs(urlparse(href).query)
        if 'ID' in ps:
            IDs.append(ps['ID'][0])

    return IDs
def download_stereoisomer_htmls(IDs: List[str], dir_stereo: str,
                                crawl_delay: float = 5,
                                timeout: float = 60) -> None:
    '''Downloads compound pages for stereoisomers

    The download is best-effort: a failure for one ID does not stop the run.
    Failed IDs are reported at the end instead of being dropped silently, and
    each request has a timeout so a stalled connection cannot hang the script.
    The former "3 errors in a row" abort is removed: its counter was never
    incremented, so it could not trigger.

    Arguments:
        IDs (List[str]): list of compound IDs of missing stereoisomers
        dir_stereo (str): directory to save HTML pages of stereoisomers
        crawl_delay (float): interval between HTTP requests, seconds
        timeout (float): maximal waiting time for a server response, seconds

    '''
    failed = []
    # download cycle
    for ID in tqdm(IDs, total = len(IDs)):
        time.sleep(crawl_delay)
        path_html = os.path.join(dir_stereo, f'{ID}.html')
        url = nist.requests.SEARCH_URL + f'?ID={ID}'
        try:
            nr = requests.get(url, timeout = timeout)
            # save non-empty responses only
            if nr.text and nr.text.strip():
                with open(path_html, 'w') as outf:
                    outf.write(nr.text)
        except (KeyboardInterrupt, SystemError, SystemExit):
            raise
        except Exception as err:
            failed.append(f'{ID} ({err!r})')
    # report what was skipped
    if failed:
        print(f'\nFailed to download {len(failed)} stereoisomer page(s):')
        print('\n'.join(failed))

    return
def check_arguments(args: argparse.Namespace) -> None:
    '''Validates CLI arguments and creates the htmls_stereo/ output folder

    All checks are done before anything is created, so invalid input leaves
    the data directory untouched.

    Arguments:
        args (argparse.Namespace): input parameters

    Raises:
        ValueError: if dir_data does not exist, is not a directory, does not
            contain htmls/ folder, or if crawl delay is negative

    '''
    # check root dir
    if not os.path.exists(args.dir_data):
        raise ValueError(f'Given dir_data argument does not exist: {args.dir_data}')
    if not os.path.isdir(args.dir_data):
        raise ValueError(f'Given dir_data argument is not a directory: {args.dir_data}')
    # check htmls dir
    dir_html = os.path.join(args.dir_data, 'htmls')
    if not os.path.exists(dir_html):
        raise ValueError('Given dir_data directory does not contain htmls/ folder')
    # crawl delay: zero is accepted, so the requirement is "non-negative"
    if args.crawl_delay < 0:
        raise ValueError(f'--crawl-delay must be non-negative: {args.crawl_delay}')
    # prepare stereo dir
    dir_stereo = os.path.join(args.dir_data, 'htmls_stereo')
    if not os.path.exists(dir_stereo):
        os.mkdir(dir_stereo)

    return
soup = BeautifulSoup(text, 'html.parser') + if nist.parsing.is_compound_page(soup): + ID = f.replace('.html', '') + loaded.append(ID) + IDs = [ID for ID in IDs if ID not in loaded] + print('\nLoading stereoisomers ...') + download_stereoisomer_htmls(IDs, dir_stereo, args.crawl_delay) + print() + + return + + + +#%% Main + +if __name__ == '__main__': + + main() + + + diff --git a/update/get_nist_compounds.py b/update/get_nist_compounds.py new file mode 100644 index 0000000..8d6dc59 --- /dev/null +++ b/update/get_nist_compounds.py @@ -0,0 +1,170 @@ +'''Downloads NIST Chemistry WebBook sitemaps and extracts all compounds''' + +#%% Imports + +import os, shutil, gzip +import argparse + +from urllib.robotparser import RobotFileParser +from urllib.request import urlopen, urlretrieve +from urllib.parse import unquote, urlparse, parse_qs + +from bs4 import BeautifulSoup + +import pandas as pd + +import nistchempy as nist + + +#%% Functions + +def donwload_nist_sitemaps(dir_data: str) -> None: + '''Downloads sitemaps from NIST Chemistry WebBook + + Arguments: + dir_data (str): directory for robots.txt and primary sitemap + + ''' + dir_xmls = os.path.join(dir_data, 'sitemaps') + # create dirs + for path in (dir_data, dir_xmls): + if not os.path.exists(path): + os.mkdir(path) + # save robots.txt + ROBOTS_URL = nist.requests.BASE_URL + '/robots.txt' + with open(os.path.join(dir_data, 'robots.txt'), 'w') as outf: + text = urlopen(ROBOTS_URL).read().decode('utf-8') + outf.write(text) + # save initial sitemap + robots = RobotFileParser(nist.requests.BASE_URL + '/robots.txt') + robots.read() + url = robots.site_maps()[0] + fname = urlparse(url).path.split('/')[-1] + text = urlopen(url).read().decode('utf-8') + with open(os.path.join(dir_data, fname), 'w') as outf: + outf.write(text) + # download actual sitemaps + xml = BeautifulSoup(text, 'xml') + for item in xml.findAll('sitemap'): + url = item.loc.text + fname = urlparse(url).path.split('/')[-1] + urlretrieve(url, 
def get_compound_id(url: str) -> str:
    '''Extracts compound ID from NIST compound URL

    Two URL kinds are supported:
      - ".../cgi/cbook.cgi?ID=<ID>...": the value of the ID query parameter;
      - ".../cgi/inchi/<InChI>": everything after the "/cgi/inchi/" prefix.

    The InChI is cut out textually, not via urlparse: an unquoted InChI may
    contain "?", "#" or ";", which urlparse treats as URL delimiters and
    would therefore drop from the path.

    Arguments:
        url (str): URL of the compound page

    Returns:
        str: compound ID or InChI string

    '''
    if 'cgi/cbook.cgi?ID' in url:
        return parse_qs(urlparse(url).query)['ID'][0]
    marker = '/cgi/inchi/'
    _, found, tail = url.partition(marker)
    if found:
        return tail

    # marker is absent: keep the former path-based behaviour
    return urlparse(url).path.replace(marker, '')
def read_download_errors(path: str) -> List[str]:
    '''Collects URLs recorded in the download errors file

    Only lines containing the "url=" marker are considered; the URL is the
    rest of the stripped line after that marker.

    Arguments:
        path (str): path to download_htmls.err

    Returns:
        List[str]: list of not downloaded URLs

    '''
    url_pattern = re.compile('url=(.+)')
    found = []
    with open(path, 'r') as inpf:
        for raw in inpf.readlines():
            if 'url=' not in raw:
                continue
            record = raw.strip()
            found.append(url_pattern.search(record).group(1).strip())

    return found
def load_errors(errs: List[Tuple[str, str]], dir_data: str,
                crawl_delay: float = 5) -> None:
    '''Downloads previously erroneous compound pages if possible

    Successfully loaded pages are saved as "<idx>.html" to the htmls/ folder
    of dir_data. Pages which are still unavailable are counted and reported
    after the loop instead of being dropped silently.

    Arguments:
        errs (List[Tuple[str, str]]): list of (idx, url) tuples
        dir_data (str): path to the directory with current parsing results
        crawl_delay (float): interval between http requests

    '''
    dir_html = os.path.join(dir_data, 'htmls')
    n_failed = 0
    for idx, url in tqdm(errs, total = len(errs)):
        nr = nist.requests.make_nist_request(url)
        if nr.ok:
            path_html = os.path.join(dir_html, f'{idx}.html')
            with open(path_html, 'w') as outf:
                outf.write(nr.text)
        else:
            n_failed += 1
        # pause after every request, successful or not, to keep the crawl delay
        time.sleep(crawl_delay)
    if n_failed:
        print(f'\n{n_failed} of {len(errs)} pages are still unavailable')

    return
def check_arguments(args: argparse.Namespace) -> None:
    '''Validates CLI arguments; nothing is created on disk

    dir_data must be an existing directory containing compounds.csv,
    download_htmls.err and the htmls/ folder; path_old must be an existing
    file; crawl delay must not be negative.

    Arguments:
        args (argparse.Namespace): input parameters

    Raises:
        ValueError: if any of the requirements above is not met

    '''
    # check root dir
    if not os.path.exists(args.dir_data):
        raise ValueError(f'Given dir_data argument does not exist: {args.dir_data}')
    if not os.path.isdir(args.dir_data):
        raise ValueError(f'Given dir_data argument is not a directory: {args.dir_data}')
    # check compounds.csv
    path_csv = os.path.join(args.dir_data, 'compounds.csv')
    if not os.path.exists(path_csv):
        raise ValueError('Given dir_data directory does not contain compounds.csv file')
    # check download_htmls.err
    path_nload = os.path.join(args.dir_data, 'download_htmls.err')
    if not os.path.exists(path_nload):
        raise ValueError('Given dir_data directory does not contain download_htmls.err file')
    # check htmls dir
    dir_html = os.path.join(args.dir_data, 'htmls')
    if not os.path.exists(dir_html):
        raise ValueError('Given dir_data directory does not contain htmls/ folder')
    # check old data file: it is read as a csv file later, so a directory is rejected too
    if not os.path.isfile(args.path_old):
        raise ValueError(f'Given path_old file does not exist: {args.path_old}')
    # crawl delay: zero is accepted, so the requirement is "non-negative"
    if args.crawl_delay < 0:
        raise ValueError(f'--crawl-delay must be non-negative: {args.crawl_delay}')

    return
Main + +if __name__ == '__main__': + + main() + + + diff --git a/update/requirements.txt b/update/requirements.txt new file mode 100644 index 0000000..cf93f46 --- /dev/null +++ b/update/requirements.txt @@ -0,0 +1,2 @@ +nistchempy >= 1.0.0 +tqdm