Update project structure

breimanntools · Jun 29, 2024 · 42142e8 · 42142e8
1 parent a47b038
commit 42142e8
Show file tree

Hide file tree

Showing 15 changed files with 62 additions and 48 deletions.
diff --git a/README.rst b/README.rst
@@ -120,8 +120,8 @@ If you use AAanalysis in your work, please cite the respective publication as fo
 
 **CPP**:
    Breimann and Kamp *et al.* (2024c),
-   *Complete γ-secretase substrate proteome revealed by explainable AI*, .. # Link if available
+   *Charting γ-secretase substrates by explainable AI*, .. # Link if available
 
 **dPULearn**:
    Breimann and Kamp *et al.* (2024c),
-   *Complete γ-secretase substrate proteome revealed by explainable AI*, .. # Link if available
+   *Charting γ-secretase substrates by explainable AI*, .. # Link if available
diff --git a/aaanalysis/__init__.py b/aaanalysis/__init__.py
@@ -3,6 +3,7 @@
                             SequencePreprocessor)
 from .feature_engineering import AAclust, AAclustPlot, SequenceFeature, NumericalFeature, CPP, CPPPlot
 from .pu_learning import dPULearn, dPULearnPlot
+from .explainable_ai import TreeModel
 from .pertubation import AAMut, AAMutPlot, SeqMut, SeqMutPlot
 from .plotting import (plot_get_clist, plot_get_cmap, plot_get_cdict,
                        plot_settings, plot_legend, plot_gcfs)
@@ -32,7 +33,7 @@
     "AAMutPlot",
     "SeqMut",
     "SeqMutPlot",
-    # "TreeModel"           # SHAP (explainable AI module)
+    "TreeModel",
     # "ShapExplainer"       # SHAP
     "plot_get_clist",
     "plot_get_cmap",
@@ -49,12 +50,11 @@
 
 # Import of professional (pro) version features if dependencies are available
 try:
-    from .explainable_ai import TreeModel, ShapExplainer
+    from .explainable_ai_pro import ShapExplainer
     from .data_handling_pro import comp_seq_sim, filter_seq
     from .show_html import display_df
     # Extend the __all__ list with pro features if successful
-    __all__.extend(["TreeModel",
-                    "ShapExplainer",
+    __all__.extend(["ShapExplainer",
                     "display_df",
                     "comp_seq_sim",
                     "filter_seq"])
@@ -76,9 +76,7 @@ def __call__(self, *args, **kwargs):
         return UnavailableFeature
 
     # Use the factory function to create placeholders for pro features
-    make_pro_feature("TreeModel")
-    make_pro_feature("ShapExplainer")
-    make_pro_feature("display_df")
-    make_pro_feature("comp_seq_sim")
-    make_pro_feature("comp_pw_seq_sim")
-    make_pro_feature("filter_seq")
+    ShapExplainer = make_pro_feature("ShapExplainer")
+    display_df = make_pro_feature("display_df")
+    comp_seq_sim = make_pro_feature("comp_seq_sim")
+    filter_seq = make_pro_feature("filter_seq")
diff --git a/aaanalysis/_utils/check_models.py b/aaanalysis/_utils/check_models.py
@@ -1,9 +1,9 @@
 """This is a script for scikit-learn model-specific check functions"""
 import inspect
 from inspect import isclass
+import numpy as np
 
 from ._utils import add_str
-
 # Helper functions
 
 
@@ -72,3 +72,11 @@ def check_model_kwargs(model_class=None, model_kwargs=None, name_model_class="mo
     if "random_state" not in model_kwargs and "random_state" in valid_args:
         model_kwargs.update(dict(random_state=random_state))
     return model_kwargs
+
+
+def check_match_list_model_classes_kwargs(list_model_classes=None, list_model_kwargs=None):
+    """Check length match of list_model_classes and list_model_kwargs"""
+    n_models = len(list_model_classes)
+    n_args = len(list_model_kwargs)
+    if n_models != n_args:
+        raise ValueError(f"Length of 'list_model_kwargs' (n={n_args}) should match to 'list_model_classes' (n={n_models})")
diff --git a/aaanalysis/data_handling_pro/_filter_seq.py b/aaanalysis/data_handling_pro/_filter_seq.py
@@ -117,9 +117,9 @@ def filter_seq(df_seq: pd.DataFrame = None,
 
     Warnings
     --------
+    * This function requires `biopython`, which is automatically installed via `pip install aaanalysis[pro]`.
     * CD-HIT and MMseq2 must be installed separately.
     * CD-HIT is not available for Windows.
-    * This function requires `biopython`, which is automatically installed via `pip install aaanalysis[pro]`.
 
     Examples
     --------

diff --git a/aaanalysis/explainable_ai/__init__.py b/aaanalysis/explainable_ai/__init__.py
@@ -1,7 +1,5 @@
-from ._shap_explainer import ShapExplainer
 from ._tree_model import TreeModel
 
 __all__ = [
-    "ShapExplainer",
     "TreeModel",
 ]
diff --git a/aaanalysis/explainable_ai/_backend/check_models.py b/aaanalysis/explainable_ai/_backend/check_models.py
@@ -5,20 +5,11 @@
 import aaanalysis.utils as ut
 
 
-def check_match_list_model_classes_kwargs(list_model_classes=None, list_model_kwargs=None):
-    """Check length match of list_model_classes and list_model_kwargs"""
-    n_models = len(list_model_classes)
-    n_args = len(list_model_kwargs)
-    if n_models != n_args:
-        raise ValueError(f"Length of 'list_model_kwargs' (n={n_args}) should match to 'list_model_classes' (n{n_models}")
-
-
 def check_match_labels_X(labels=None, X=None):
     """Check that labels match the number of samples in X and contain exactly two unique labels (binary classification)"""
     n_samples = X.shape[0]
     # Accept float if fuzzy_labeling is True
-    str_add = "Consider setting 'fuzzy_labeling=True'."
-    labels = ut.check_labels(labels=labels, len_requiered=n_samples, str_add=str_add)
+    labels = ut.check_labels(labels=labels, len_requiered=n_samples)
     unique_labels = set(labels)
     if len(unique_labels) != 2:
         raise ValueError(f"'labels' should contain 2 unique labels ({unique_labels})")

diff --git a/aaanalysis/explainable_ai/_tree_model.py b/aaanalysis/explainable_ai/_tree_model.py
@@ -2,7 +2,6 @@
 This is a script for the frontend of the TreeModel class used to obtain Monte Carlo estimates of feature importance.
 
 DEV: TODO features
-a) TreeModel.fit: Add n_jobs as input
 b) TreeModel.eval: Add n_features to output
 """
 from typing import Optional, Dict, List, Tuple, Type, Union, Callable
@@ -13,8 +12,7 @@
 
 import aaanalysis.utils as ut
 
-from ._backend.check_models import (check_match_list_model_classes_kwargs,
-                                    check_match_labels_X,
+from ._backend.check_models import (check_match_labels_X,
                                     check_match_X_is_selected)
 from ._backend.tree_model.tree_model_fit import fit_tree_based_models
 from ._backend.tree_model.tree_model_predict_proba import monte_carlo_predict_proba
@@ -115,8 +113,6 @@ def check_match_df_feat_importance_arrays(df_feat=None, feat_importance=None, fe
 
 
 # TODO split from shap explainer to be installed via aaanalysis (not aaanalysis[pro])
-# TODO manage aaanalysis[pro] (add info/warning in docu for every function/module whose dependencies are not installed)
-# TODO e.g., seq_filter, comp_seq_sim, SHAP ...# II Main Functions
 class TreeModel:
     """
     Tree Model class: A wrapper for tree-based models to obtain Monte Carlo estimates of feature
@@ -196,7 +192,7 @@ def __init__(self,
         list_model_kwargs = ut.check_list_like(name="list_model_kwargs", val=list_model_kwargs, accept_none=True)
         if list_model_kwargs is None:
             list_model_kwargs = [{} for _ in list_model_classes]
-        check_match_list_model_classes_kwargs(list_model_classes=list_model_classes, list_model_kwargs=list_model_kwargs)
+        ut.check_match_list_model_classes_kwargs(list_model_classes=list_model_classes, list_model_kwargs=list_model_kwargs)
         _list_model_kwargs = []
         for model_class, model_kwargs in zip(list_model_classes, list_model_kwargs):
             ut.check_mode_class(model_class=model_class)

diff --git a/aaanalysis/explainable_ai_pro/__init__.py b/aaanalysis/explainable_ai_pro/__init__.py
@@ -1,7 +1,5 @@
 from ._shap_explainer import ShapExplainer
-from ._tree_model import TreeModel
 
 __all__ = [
     "ShapExplainer",
-    "TreeModel",
 ]
diff --git a/aaanalysis/utils.py b/aaanalysis/utils.py
@@ -57,7 +57,8 @@
                                 check_file_path_exists,
                                 check_is_fasta)
 from ._utils.check_models import (check_mode_class,
-                                  check_model_kwargs)
+                                  check_model_kwargs,
+                                  check_match_list_model_classes_kwargs)
 from ._utils.check_plots import (check_fig,
                                  check_ax,
                                  check_figsize,

diff --git a/docs/requirements_dev.txt b/docs/requirements_dev.txt
@@ -1,6 +1,7 @@
 # Requires Python >= 3.9
 
 # Additional dependencies for professional version (pip install aaanalysis[pro])
+biopython>=1.8.3
 shap>=0.44.0
 ipython>=8.16.1
 hypothesis>=6.86.2

diff --git a/docs/source/index/citations.rst b/docs/source/index/citations.rst
@@ -14,8 +14,8 @@ If you use **AAanalysis** in your work, please cite the respective publication a
 
 **CPP**:
    [Breimann24c]_  Breimann and Kamp *et al.* (2024c),
-   *Complete γ-secretase substrate proteome revealed by explainable AI*, .. # Link if available
+   *Charting γ-secretase substrates by explainable AI*, .. # Link if available
 
 **dPULearn**:
    [Breimann24c]_  Breimann and Kamp *et al.* (2024c),
-   *Complete γ-secretase substrate proteome revealed by explainable AI*, .. # Link if available
+   *Charting γ-secretase substrates by explainable AI*, .. # Link if available
diff --git a/docs/source/index/references.rst b/docs/source/index/references.rst
@@ -34,8 +34,7 @@ AAanalysis Algorithms
    `bioRxiv <https://www.biorxiv.org/content/10.1101/2023.08.03.551768v1>`__.
 
 .. [Breimann24c] Breimann and Kamp *et al.* (2024c),
-   *Complete γ-secretase substrate proteome revealed by explainable AI*,
-   .. # Link if available
+   *Charting γ-secretase substrates by explainable AI*, .. # Link if available
 
 Sequence Algorithms
 -------------------

diff --git a/examples/data_handling/sp_get_aa_window.ipynb b/examples/data_handling/sp_get_aa_window.ipynb
@@ -52,7 +52,7 @@
     }
    ],
    "source": [
-    "# Get amino acid window of size 5\n",
+    "# Get amino acid window of size 6\n",
     "window = sp.get_aa_window(seq=seq, pos_stop=5)\n",
     "print(window)"
    ],
@@ -88,7 +88,7 @@
     }
    ],
    "source": [
-    "# Get amino acid window of size 3\n",
+    "# Get amino acid window of size 4\n",
     "window = sp.get_aa_window(seq=seq, pos_start=2, pos_stop=5)\n",
     "print(window)"
    ],

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,15 @@ description = "Python framework for interpretable protein prediction"
 authors = ["Stephan Breimann <stephanbreimann@gmail.de>"]
 license = "BSD-3-Clause"
 readme = "README.rst"
-include = ["data/*.xlsx", "data/benchmarks/*.tsv", "data/benchmarks/*.xlsx"]
+homepage = "https://aaanalysis.readthedocs.io"
+keywords = ["protein", "prediction", "bioinformatics", "machine learning", "interpretable AI"]
+
+# Include additional files
+include = [
+    "data/*.xlsx",
+    "data/benchmarks/*.tsv",
+    "data/benchmarks/*.xlsx"
+]
 
 # Add classifiers to provide more details about the package (used by PyPI)
 classifiers = [
@@ -36,7 +44,6 @@ classifiers = [
 [tool.poetry.dependencies]
 # Core dependencies (via pip install aaanalysis)
 python = "^3.9"
-biopython = "^1.8.3"
 cycler = "^0.11.0"
 et-xmlfile = "^1.1.0"
 fonttools = "^4.37.1"
@@ -59,23 +66,24 @@ scipy = "^1.10.0"
 seaborn = "^0.13.2"
 six = "^1.16.0"
 threadpoolctl = "^3.1.0"
+
 # Optional professional dependencies (via pip install aaanalysis[pro])
+biopython = { version = "^1.8.3", optional = true }
 shap = { version = "^0.44.0", optional = true }
 ipython = { version = "^8.16.1", optional = true }
 hypothesis = { version = "^6.86.2", optional = true }
 pytest = { version = "^7.4.2", optional = true }
 UpSetPlot = { version = "^0.8.0", optional = true }
 
 [tool.poetry.extras]
-pro = ["shap", "ipython", "hypothesis", "pytest", "UpSetPlot"]
+pro = ["shap", "biopython", "ipython", "hypothesis", "pytest", "UpSetPlot"]
 
 # Project URLs
 [tool.poetry.urls]
 "Repository" = "https://github.com/breimanntools/aaanalysis"
 "Documentation" = "https://aaanalysis.readthedocs.io"
 
-# If you use a tool for linting or formatting, you can add its configurations here.
-# For example, if you use `black` for formatting:
+# Configuration for black (code formatter)
 [tool.black]
 line-length = 88
 exclude = '''
@@ -85,8 +93,24 @@ exclude = '''
   | \.eggs
   | \.mypy_cache
   | \.pytest_cache
-  | \__pycache__
+  | __pycache__
   | build
   | dist
 )/
 '''
+
+# Configuration for isort (import sorting)
+[tool.isort]
+profile = "black"
+line_length = 88
+
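+# Configuration for flake8 (linting); flake8 does not read pyproject.toml natively, so this section requires the Flake8-pyproject plugin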
+[tool.flake8]
+max-line-length = 88
+exclude = '''
+    .git,
+    __pycache__,
+    build,
+    dist,
+    venv
+'''
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,6 @@
 # Requires Python >= 3.9
 
 # Dependencies for core version (pip install aaanalysis)
-biopython>=1.8.3
 cycler>=0.11.0
 et-xmlfile>=1.1.0
 fonttools>=4.37.1
@@ -26,6 +25,7 @@ six>=1.16.0
 threadpoolctl>=3.1.0
 
 # Additional dependencies for professional version (pip install aaanalysis[pro])
+biopython>=1.8.3
 shap>=0.44.0
 ipython>=8.16.1
 hypothesis>=6.86.2