MolecularAI · SGenheden · Dec 18, 2024 · Dec 13, 2024 · Dec 16, 2024 · Dec 16, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -14,7 +14,7 @@ repos:
     -   id: check-added-large-files
     -   id: check-merge-conflict
 -   repo: https://github.com/psf/black
-    rev: 22.0.0
+    rev: 24.1.0
     hooks:
     -   id: black
 # We should add some linter to here at some point
diff --git a/README.md b/README.md
@@ -39,7 +39,7 @@ Then execute the following commands in the root of the repository
 
     conda env create -f env-dev.yml
     conda activate rxn-env
-    poetry install
+    poetry install --with dev
 
 the `rxnutils` package is now installed in editable mode.
 

diff --git a/env-dev.yml b/env-dev.yml
@@ -3,5 +3,5 @@ channels:
   - https://conda.anaconda.org/conda-forge
   - defaults
 dependencies:
-  - python>=3.9,<3.11
-  - poetry>=1.1.4,<2.0
+  - python>=3.9,<3.13
+  - poetry>=1.2.0,<2.0
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,9 +13,9 @@ packages = [
 ]
 
 [tool.poetry.dependencies]
-python = ">=3.9,<3.11"
-urllib3 = "<2.0"
-pandas = "^1.0.0"
+python = ">=3.9,<3.13"
+urllib3 = "^1.2.26"
+pandas = ">=1.0.0,<3.0.0"
 xxhash = "^2.0.0"
 rdchiral = "^1.1.0"
 PyYAML = "^6.0.1"
@@ -30,21 +30,25 @@ cgrtools = "^4.1.35"
 scipy = "^1.11.4"
 pydantic = "^2.8.2"
 apted = "^1.0.3"
+dask = ">=2024.4.1"
+onnxruntime = {version = "<1.17.0", optional=true}
 
-[tool.poetry.dev-dependencies]
-pytest = "^6.2.2"
-pytest-datadir = "^1.3.1"
-pytest-mock = "^3.7.0"
-pytest-mccabe = "^2.0"
-pytest-black = "^0.3.12"
-pytest-cov = "^3.0.0"
-black = "^22.0.0"
-mypy = "^0.800"
-pre-commit = "^2.10.1"
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.3.3"
+pytest-datadir = "^1.5.0"
+pytest-mock = "^3.14.0"
+pytest-cov = "^6.0.0"
+requests-mock = "^1.12.1"
+black = "^24.10.0"
+mypy = "^1.13.0"
+pre-commit = "^4.0.1"
 ipython = "^7.21.0"
-pylint = "^2.14.1"
-invoke = "^1.7.1"
-Sphinx = "^7.3.7"
+pylint = "^3.3.1"
+invoke = "^2.2.0"
+sphinx = "<8.1.0"
+
+[tool.poetry.extras]
+models = ["onnxruntime"]
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
@@ -57,3 +61,6 @@ max-attributes = 15
 max-public-methods = 25
 min-public-methods = 0
 disable = "W1203, W0707, W1514, W0602, typecheck"
+
+[tool.coverage.run]
+relative_files = true
diff --git a/rxnutils/chem/augmentation.py b/rxnutils/chem/augmentation.py
@@ -1,6 +1,8 @@
 """ Routines for augmenting chemical reactions
 """
 
+from rxnutils.chem.utils import split_rsmi
+
 _SINGLE_REACTANT_REAGENTS = {"10.1.1": "Br", "10.1.2": "Cl"}
 
 
@@ -12,7 +14,7 @@ def single_reactant_augmentation(smiles: str, classification: str) -> str:
     :param classification: the classification of the reaction or an empty string
     :return: the processed SMILES
     """
-    reactants = smiles.split(">")[0]
+    reactants = split_rsmi(smiles)[0]
     if "." in reactants:
         return smiles
     classification = classification.split(" ")[0]

diff --git a/rxnutils/chem/cgr.py b/rxnutils/chem/cgr.py
@@ -1,5 +1,6 @@
 """ Wrapper class for the CGRTools library
 """
+
 import io
 import warnings
 from typing import List
@@ -28,20 +29,14 @@ def __init__(self, reaction: ChemicalReaction) -> None:
         self._cgr_reactants = []
         self._cgr_products = []
         self._make_cgr_containers()
-        self.reaction_container = ReactionContainer(
-            reactants=self._cgr_reactants, products=self._cgr_products
-        )
+        self.reaction_container = ReactionContainer(reactants=self._cgr_reactants, products=self._cgr_products)
         try:
             self.cgr_container = self.reaction_container.compose()
         except ValueError as err:
             if str(err) == "mapping of graphs is not disjoint":
-                raise ValueError(
-                    "Reaction contains inconsistent atom-mapping, perhaps duplicates"
-                )
+                raise ValueError("Reaction contains inconsistent atom-mapping, perhaps duplicates")
             elif str(err).endswith("} not equal"):
-                raise ValueError(
-                    "Atom with the same atom-mapping in reactant and product is not equal"
-                )
+                raise ValueError("Atom with the same atom-mapping in reactant and product is not equal")
             else:
                 raise ValueError(f"Unknown problem with generating CGR: {err}")
 
@@ -58,10 +53,7 @@ def bonds_broken(self) -> int:
     @property
     def bonds_changed(self) -> int:
         """Returns the number of broken or formed bonds in the reaction"""
-        return sum(
-            bond.p_order is None or bond.order is None
-            for _, _, bond in self.cgr_container.bonds()
-        )
+        return sum(bond.p_order is None or bond.order is None for _, _, bond in self.cgr_container.bonds())
 
     @property
     def bonds_formed(self) -> int:
@@ -71,9 +63,7 @@ def bonds_formed(self) -> int:
     @property
     def total_centers(self) -> int:
         """Returns the number of atom and bond centers in the reaction"""
-        return len(self.cgr_container.center_atoms) + len(
-            self.cgr_container.center_bonds
-        )
+        return len(self.cgr_container.center_atoms) + len(self.cgr_container.center_bonds)
 
     def distance_to(self, other: "CondensedGraphReaction") -> int:
         """
@@ -104,14 +94,11 @@ def _make_renumbered_mols(self):
         # so this adds safe atom-mapping to un-mapped atoms
         renumbered_mols = []
         max_atom_map_numb = max(
-            max(atom_mapping_numbers(smi) or [0])
-            for smi in self.reaction.reactants_list + self.reaction.products_list
+            max(atom_mapping_numbers(smi) or [0]) for smi in self.reaction.reactants_list + self.reaction.products_list
         )
         for mol0 in self.reaction.reactants + self.reaction.products:
             if mol0 is None:
-                raise ValueError(
-                    "Cannot create CGR for this reaction, some molecules are None"
-                )
+                raise ValueError("Cannot create CGR for this reaction, some molecules are None")
             mol = Chem.rdchem.Mol(mol0)
             for atom in mol.GetAtoms():
                 if not atom.GetAtomMapNum():

diff --git a/rxnutils/chem/disconnection_sites/atom_map_tagging.py b/rxnutils/chem/disconnection_sites/atom_map_tagging.py
@@ -7,6 +7,8 @@
 import pandas as pd
 from rdkit import Chem
 
+from rxnutils.chem.utils import split_rsmi
+
 
 def _get_atom_identifier(atom: Chem.rdchem.Atom) -> str:
     """
@@ -21,9 +23,7 @@ def _get_atom_identifier(atom: Chem.rdchem.Atom) -> str:
     return str(atom_id)
 
 
-def _get_bond_environment_identifier(
-    atoms: Sequence[Chem.rdchem.Atom], bond: Chem.rdchem.Bond
-) -> str:
+def _get_bond_environment_identifier(atoms: Sequence[Chem.rdchem.Atom], bond: Chem.rdchem.Bond) -> str:
     """
     Get the environment of a specific bond.
 
@@ -79,17 +79,14 @@ def get_atom_list(reactants_smiles: str, product_smiles: str) -> List[int]:
     ordered_reactant_neighbor_dict = _get_atomic_neighborhoods(reactants_smiles)
     ordered_product_neighbor_dict = _get_atomic_neighborhoods(product_smiles)
 
-    all_indices = set(ordered_product_neighbor_dict.keys()) | set(
-        ordered_reactant_neighbor_dict.keys()
-    )
+    all_indices = set(ordered_product_neighbor_dict.keys()) | set(ordered_reactant_neighbor_dict.keys())
 
     # Checks to see equivlence of atomic enviroments.
     # If environment changed, then add atom to list
     atom_list = [
         atom_map
         for atom_map in all_indices
-        if ordered_reactant_neighbor_dict.get(atom_map, [])
-        != ordered_product_neighbor_dict.get(atom_map, [])
+        if ordered_reactant_neighbor_dict.get(atom_map, []) != ordered_product_neighbor_dict.get(atom_map, [])
     ]
 
     return atom_list
@@ -104,7 +101,7 @@ def atom_map_tag_reactants(mapped_rxn: str) -> str:
     :return: SMILES of the reactants containing tags corresponding to atoms changed in the
         reaction.
     """
-    reactants_smiles, _, product_smiles = mapped_rxn.split(">")
+    reactants_smiles, _, product_smiles = split_rsmi(mapped_rxn)
 
     reactants_mol = Chem.MolFromSmiles(reactants_smiles)
     atom_list = get_atom_list(reactants_smiles, product_smiles)
@@ -128,7 +125,7 @@ def atom_map_tag_products(mapped_rxn: str) -> str:
     :return: SMILES of the product containing tags corresponding to atoms changed in the
         reaction.
     """
-    reactants_smiles, _, product_smiles = mapped_rxn.split(">")
+    reactants_smiles, _, product_smiles = split_rsmi(mapped_rxn)
 
     product_mol = Chem.MolFromSmiles(product_smiles)
     atom_list = get_atom_list(reactants_smiles, product_smiles)

diff --git a/rxnutils/chem/disconnection_sites/tag_converting.py b/rxnutils/chem/disconnection_sites/tag_converting.py
@@ -15,22 +15,21 @@ def smiles_tokens(smiles: str) -> List[str]:
     :param smiles: SMILES to tokenize
     :return: List of tokens identified in SMILES.
     """
-    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
+    pattern = (
+        r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
+    )
     regex = re.compile(pattern)
     tokens = [token for token in regex.findall(smiles)]
 
     tokenized_smiles = "".join(tokens)
     if smiles != tokenized_smiles:
         raise AssertionError(
-            f"tokenized SMILES not the same as input SMILES: {tokenized_smiles}, "
-            "{smiles}, tokens: {tokens}"
+            f"tokenized SMILES not the same as input SMILES: {tokenized_smiles}, {smiles}, tokens: {tokens}"
         )
     return tokens
 
 
-def _next_tagged_token(
-    product_tagged_tokens: List[str], untagged_token: str, tagged_token_idx: int
-) -> Tuple[str, int]:
+def _next_tagged_token(product_tagged_tokens: List[str], untagged_token: str, tagged_token_idx: int) -> Tuple[str, int]:
     """
     Get the next tagged token in the sequence. Includes checks and fixes for
     stereochemistry changes due to removing atom mapping.
@@ -51,19 +50,13 @@ def _next_tagged_token(
             tagged_token_idx += 1
             return product_tagged_tokens[tagged_token_idx], tagged_token_idx
 
-    if (
-        tagged_token != untagged_token
-        and not ":1" in tagged_token
-        and "@" in tagged_token
-    ):
+    if tagged_token != untagged_token and ":1" not in tagged_token and "@" in tagged_token:
         return untagged_token, tagged_token_idx
 
     return tagged_token, tagged_token_idx
 
 
-def tagged_smiles_from_tokens(
-    product_tagged_tokens: List[str], product_untagged_tokens: List[str]
-) -> Tuple[str, str]:
+def tagged_smiles_from_tokens(product_tagged_tokens: List[str], product_untagged_tokens: List[str]) -> Tuple[str, str]:
     """
     Convert the tagged SMILES from atom-mapping to unmapped-token + '!'
 
@@ -81,24 +74,16 @@ def tagged_smiles_from_tokens(
 
     for untagged_token in product_untagged_tokens:
 
-        tagged_token, tagged_token_idx = _next_tagged_token(
-            product_tagged_tokens, untagged_token, tagged_token_idx
-        )
+        tagged_token, tagged_token_idx = _next_tagged_token(product_tagged_tokens, untagged_token, tagged_token_idx)
 
-        if tagged_token != untagged_token and (
-            untagged_token == "/" or untagged_token == "\\"
-        ):
+        if tagged_token != untagged_token and (untagged_token == "/" or untagged_token == "\\"):
             continue
 
         if tagged_token == untagged_token:
             product_converted += untagged_token
         else:
             # Remove brackets around a single letter
-            if (
-                len(untagged_token) == 3
-                and untagged_token.startswith("[")
-                and untagged_token.endswith("]")
-            ):
+            if len(untagged_token) == 3 and untagged_token.startswith("[") and untagged_token.endswith("]"):
                 untagged_token = untagged_token[1]
             product_converted += untagged_token + "!"
 
@@ -109,9 +94,7 @@ def tagged_smiles_from_tokens(
     return product_converted, product_untagged
 
 
-def _canonicalize_tagged_smiles(
-    product_tagged: str, product_untagged: str = None
-) -> Tuple[str, str]:
+def _canonicalize_tagged_smiles(product_tagged: str, product_untagged: str = None) -> Tuple[str, str]:
     """
     Reorder the tagged-product SMILES on canonical form using the canonicalized
     untagged product.
@@ -123,13 +106,7 @@ def _canonicalize_tagged_smiles(
     mol = Chem.MolFromSmiles(product_tagged)
     mol_untagged = Chem.MolFromSmiles(product_untagged)
 
-    _, canonical_atom_order = tuple(
-        zip(
-            *sorted(
-                [(j, i) for i, j in enumerate(Chem.CanonicalRankAtoms(mol_untagged))]
-            )
-        )
-    )
+    _, canonical_atom_order = tuple(zip(*sorted([(j, i) for i, j in enumerate(Chem.CanonicalRankAtoms(mol_untagged))])))
 
     mol = Chem.RenumberAtoms(mol, canonical_atom_order)
     mol_untagged = Chem.RenumberAtoms(mol_untagged, canonical_atom_order)
@@ -158,9 +135,7 @@ def convert_atom_map_tag(product_atom_map_tagged: str) -> str:
     if not Chem.MolFromSmiles(product_untagged):
         return ""
 
-    product_tagged, product_untagged = _canonicalize_tagged_smiles(
-        product_atom_map_tagged, product_untagged
-    )
+    product_tagged, product_untagged = _canonicalize_tagged_smiles(product_atom_map_tagged, product_untagged)
 
     # Update the SMILES string to remove atom-mapping brackets and explicit [H]:s and
     # replace by <atom>!

diff --git a/rxnutils/chem/features/__init__.py b/rxnutils/chem/features/__init__.py