Merge pull request #1 from AIS-Package/main

v0.1.21
AIS-Package · Jun 15, 2023 · bb238be · bb238be
2 parents 2b7cf31 + b1d3f72
commit bb238be
Show file tree

Hide file tree

Showing 15 changed files with 406 additions and 66 deletions.
diff --git a/aisp/NSA/__init__.py b/aisp/NSA/__init__.py
@@ -1,4 +1,5 @@
 from ._negativeSelection import RNSA, BNSA
 
 __author__ = 'João Paulo da Silva Barros'
-__all__ = ['RNSA', 'BNSA']
+__all__ = ['RNSA', 'BNSA']
+__version__ = '0.1.21'
diff --git a/aisp/NSA/_negativeSelection.py b/aisp/NSA/_negativeSelection.py
@@ -31,8 +31,15 @@ class RNSA(Base):
 
             Defaults to ``'default-NSA'``.
 
-        * cell_bounds (``bool``): If set to ``True``, this option limits the generation of detectors to 
-            the space within the plane between 0 and 1. 
+        * non_self_label (``str``): This variable stores the label that will be assigned when the data has only one 
+        output class, and the sample is classified as not belonging to that class. Defaults to ``'non-self'``.
+        * cell_bounds (``bool``): If set to ``True``, this option limits the generation of detectors to the space within 
+        the plane between 0 and 1. This means that any detector whose radius exceeds this limit is discarded. 
+        This variable is only used in the ``V-detector`` algorithm. Defaults to ``False``.
+        * p (``float``): This parameter stores the value of ``p`` used in the Minkowski distance. The default is ``2``, which 
+        represents normalized Euclidean distance. Different values of p lead to different variants of the Minkowski 
+        distance [learn more](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html).
+       
         * detectors (``dict``): This variable stores a list of detectors by class.
         * classes (``npt.NDArray``): list of output classes.
         
@@ -60,13 +67,19 @@ class RNSA(Base):
 
             Defaults to ``'default-NSA'``.
 
-        * cell_bounds (``bool``):  Se definido como ``True``, esta opção limita a geração dos detectores ao espaço 
-            do plano compreendido entre 0 e 1.
-       
+        * non_self_label (``str``): Esta variável armazena o rótulo que será atribuído quando os dados possuírem 
+        apenas uma classe de saída, e a amostra for classificada como não pertencente a essa classe. Defaults to ``'non-self'``.
+        * cell_bounds (``bool``):  Se definido como ``True``, esta opção limita a geração dos detectores ao espaço do plano 
+        compreendido entre 0 e 1. Isso significa que qualquer detector cujo raio ultrapasse esse limite é descartado, 
+        e esta variável é usada exclusivamente no algoritmo ``V-detector``.
+        * p (``float``): Este parâmetro armazena o valor de ``p`` utilizado na distância de Minkowski. O padrão é ``2``, o que significa 
+        distância euclidiana normalizada. Diferentes valores de p levam a diferentes variantes da distância de 
+        Minkowski [saiba mais](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html).
         
         * detectors (``dict``): Essa variável armazena uma lista com detectores por classes.
         * classes (``npt.NDArray``): lista com as classes de saída.
     """
+
     def __init__(
         self, 
         N: int = 100, 
@@ -77,7 +90,7 @@ def __init__(
         max_discards: int = 1000, 
         seed: int = None, 
         algorithm: Literal['default-NSA', 'V-detector'] ='default-NSA', 
-        **kwargs: Dict[str, Union[bool, str]]
+        **kwargs: Dict[str, Union[bool, str, float]]
     ):
         """
         Negative Selection class constructor (``RNSA``).
@@ -97,7 +110,7 @@ def __init__(
             * metric (``str``): Way to calculate the distance between the detector and the sample:
 
                 * ``'Euclidean'`` ➜ The calculation of the distance is given by the expression: √( (X₁ – Y₁)² + (X₂ – Y₂)² + ... + (Xn – Yn)²).
-                * ``'minkowski'`` ➜ The calculation of the distance is given by the expression: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ, In this project ``p == 2``.
+                * ``'minkowski'`` ➜ The calculation of the distance is given by the expression: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ.
                 * ``'manhattan'`` ➜ The calculation of the distance is given by the expression: ( |X₁ – Y₁| + |X₂ – Y₂| + ... + |Xn – Yn|) .
 
             Defaults to ``'euclidean'``.
@@ -119,7 +132,9 @@ def __init__(
                     - cell_bounds (``bool``): If set to ``True``, this option limits the generation of detectors to the space within 
                     the plane between 0 and 1. This means that any detector whose radius exceeds this limit is discarded, 
                     this variable is only used in the ``V-detector`` algorithm. Defaults to ``False``.
-
+                    - p (``float``): This parameter stores the value of ``p`` used in the Minkowski distance. The default is ``2``, which 
+                    represents normalized Euclidean distance. Different values of p lead to different variants of the Minkowski 
+                    distance [learn more](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html).
         ---
 
         Construtor da classe de Seleção negativa (``RNSA``).
@@ -138,7 +153,7 @@ def __init__(
             * metric (``str``): Forma para se calcular a distância entre o detector e a amostra: 
 
                 * ``'euclidiana'`` ➜ O cálculo da distância dá-se pela expressão: √( (X₁ – Y₁)² + (X₂ – Y₂)² + ... + (Xn – Yn)²).
-                * ``'minkowski'``  ➜ O cálculo da distância dá-se pela expressão: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ, Neste projeto ``p == 2``.
+                * ``'minkowski'``  ➜ O cálculo da distância dá-se pela expressão: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ.
                 * ``'manhattan'``  ➜ O cálculo da distância dá-se pela expressão: ( |X₁ – Y₁| + |X₂ – Y₂| + ... + |Xn – Yn|).
 
             Defaults to ``'euclidean'``.
@@ -159,6 +174,9 @@ def __init__(
                     - cell_bounds (``bool``):  Se definido como ``True``, esta opção limita a geração dos detectores ao espaço do plano 
                     compreendido entre 0 e 1. Isso significa que qualquer detector cujo raio ultrapasse esse limite é descartado, 
                     e esta variável é usada exclusivamente no algoritmo ``V-detector``.
+                    - p (``float``): Este parâmetro armazena o valor de ``p`` utilizado na distância de Minkowski. O padrão é ``2``, o que significa 
+                    distância euclidiana normalizada. Diferentes valores de p levam a diferentes variantes da distância de 
+                    Minkowski [saiba mais](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html).
         """
 
         if metric == 'manhattan' or metric == 'minkowski' or metric == 'euclidean':
@@ -198,10 +216,17 @@ def __init__(
         else:
             self._Detector = namedtuple("Detector", "position")
             self._algorithm: str = 'default-NSA'
-
+
+        if max_discards > 0:
+            self.max_discards: int = max_discards
+        else:
+            self.max_discards: int = 1000
+
+        # Obtém as variáveis do kwargs.
+        self.p: float = kwargs.get('p', 2)
         self._cell_bounds: bool = kwargs.get('cell_bounds', False)
         self.non_self_label: str = kwargs.get('non_self_label', 'non-self')
-        self.max_discards: int = max_discards
+
         self.detectors: Union[dict, None] = None
         self.classes: npt.NDArray = None
 
@@ -688,7 +713,8 @@ def get_params(self, deep: bool = True) -> dict:
             'seed': self.seed,
             'algorithm': self._algorithm,
             'r_s': self.r_s,
-            'cell_bounds': self._cell_bounds
+            'cell_bounds': self._cell_bounds,
+            'p': self.p
         }
 
 class BNSA(Base):

diff --git a/aisp/_base.py b/aisp/_base.py
@@ -13,32 +13,42 @@ class Base:
     A classe base contém funções que são utilizadas por mais de uma classe do pacote, 
     e por isso são consideradas essenciais para o funcionamento geral do sistema.
     """
-    def __init__(self, metric: str = 'euclidean'):
+    def __init__(self, metric: str = 'euclidean', p: float = 2):
         """
         Parameters:
         ---
         * metric (``str``): Way to calculate the distance between the detector and the sample:
 
                 * ``'Euclidean'`` ➜ The calculation of the distance is given by the expression: √( (X₁ – Y₁)² + (X₂ – Y₂)² + ... + (Xn – Yn)²).
-                * ``'minkowski'`` ➜ The calculation of the distance is given by the expression: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ , In this project ``p == 2``.
+                * ``'minkowski'`` ➜ The calculation of the distance is given by the expression: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ.
                 * ``'manhattan'`` ➜ The calculation of the distance is given by the expression: ( |X₁ – Y₁| + |X₂ – Y₂| + ... + |Xn – Yn|) .
         
+        
+        * p (``float``): This parameter stores the value of ``p`` used in the Minkowski distance. 
+        The default is ``2``, which represents normalized Euclidean distance. Different values of p lead to 
+        different variants of the Minkowski distance [learn more](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html).
+
         ---
 
         Parameters:
         ---
         * metric (``str``): Forma para se calcular a distância entre o detector e a amostra: 
 
                 * ``'euclidiana'`` ➜ O cálculo da distância dá-se pela expressão: √( (X₁ – Y₁)² + (X₂ – Y₂)² + ... + (Xn – Yn)²).
-                * ``'minkowski'``  ➜ O cálculo da distância dá-se pela expressão: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ , Neste projeto ``p == 2``.
+                * ``'minkowski'``  ➜ O cálculo da distância dá-se pela expressão: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ.
                 * ``'manhattan'``  ➜ O cálculo da distância dá-se pela expressão: ( |X₁ – Y₁| + |X₂ – Y₂| + ... + |Xn – Yn|).
 
             Defaults to ``'euclidean'``.
+        
+        * p (``float``): Este parâmetro armazena o valor de ``p`` utilizado na distância de Minkowski.
+          O padrão é ``2``, o que significa distância euclidiana normalizada. Diferentes valores de p levam a 
+          diferentes variantes da distância de Minkowski [saiba mais](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html).
         """
         if metric == 'manhattan' or metric == 'minkowski' or metric == 'euclidean':
-            self.metric = metric
+            self.metric: str = metric
         else:
-            self.metric = 'euclidean'
+            self.metric: str = 'euclidean'
+        self.p: float = p
 
     def _distance(self, u: npt.NDArray, v: npt.NDArray):
         """
@@ -69,7 +79,7 @@ def _distance(self, u: npt.NDArray, v: npt.NDArray):
         if self.metric == 'manhattan':
             return cityblock(u, v)
         elif self.metric == 'minkowski':
-            return minkowski(u, v, 2)
+            return minkowski(u, v, self.p)
         else:
             return euclidean(u, v)
 

diff --git a/examples/BNSA/example_with_randomly_generated_dataset-en.ipynb b/examples/BNSA/example_with_randomly_generated_dataset-en.ipynb
@@ -219,7 +219,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.3"
+   "version": "3.11.4"
   },
   "orig_nbformat": 4,
   "vscode": {

diff --git a/examples/BNSA/example_with_randomly_generated_dataset-pt.ipynb b/examples/BNSA/example_with_randomly_generated_dataset-pt.ipynb
@@ -49,7 +49,7 @@
     "import numpy as np\n",
     "import seaborn as sns\n",
     "import matplotlib.pyplot as plt\n",
-    "from scipy.spatial.distance import cdist, hamming\n",
+    "from scipy.spatial.distance import cdist\n",
     "from sklearn.model_selection import train_test_split\n",
     "from sklearn.metrics import confusion_matrix, classification_report, accuracy_score"
    ]
@@ -224,7 +224,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.3"
+   "version": "3.11.4"
   },
   "orig_nbformat": 4,
   "vscode": {

diff --git a/examples/BNSA/mushrooms_dataBase_example_en.ipynb b/examples/BNSA/mushrooms_dataBase_example_en.ipynb
@@ -55,6 +55,61 @@
     "#### **2. Load the database and binarize them.**"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Loading the \"Mushroom\" dataset. This dataset contains information about mushrooms with two output categories: poisonous and edible. It includes the following 22 characteristics per sample:\n",
+    "\n",
+    "1. Cap-shape: convex, conical, flat, knobbed, bell, sunken.\n",
+    "2. Cap-surface: fibrous, grooves, smooth, scaly.\n",
+    "3. Cap-color: brown, gray, pink, green, purple, red, white, yellow, chocolate.\n",
+    "4. Bruises: yes, no.\n",
+    "5. Odor: almond, anise, creosote, foul, spicy, fishy, floury, mushroomy, none.\n",
+    "6. Gill-attachment: attached, free, none.\n",
+    "7. Gill-spacing: close, crowded.\n",
+    "8. Gill-size: broad, narrow.\n",
+    "9. Gill-color: black, brown, buff, gray, orange, pink, green, purple, red, white, yellow, chocolate.\n",
+    "10. Stalk-shape: enlarging, tapering.\n",
+    "11. Stalk-root: bulbous, club, cup, equal, rooted, missing.\n",
+    "12. Stalk-surface-above-ring: fibrous, scaly, silky, smooth.\n",
+    "13. Stalk-surface-below-ring: fibrous, scaly, silky, smooth.\n",
+    "14. Stalk-color-above-ring: brown, gray, pink, orange, white, yellow, red, chocolate.\n",
+    "15. Stalk-color-below-ring: brown, gray, pink, orange, white, yellow, red, chocolate.\n",
+    "16. Veil-type: partial, universal.\n",
+    "17. Veil-color: brown, orange, white, yellow.\n",
+    "18. Ring-number: none, one, two.\n",
+    "19. Ring-type: evanescent, large, none, pendant.\n",
+    "20. Spore-print-color: black, brown, pink, green, orange, purple, white, yellow, chocolate.\n",
+    "21. Population: abundant, clustered, numerous, scattered, several, solitary.\n",
+    "22. Habitat: grasses, leaves, meadows, paths, urban, waste, woods.\n",
+    "\n",
+    "This dataset is also available at the following links: [kaggle](https://www.kaggle.com/datasets/uciml/mushroom-classification) and [UCIML](https://archive.ics.uci.edu/dataset/73/mushroom).\n",
+    "\n",
+    "\n",
+    "Mushroom. (1987). UCI Machine Learning Repository. https://doi.org/10.24432/C5959T."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the database\n",
+    "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'\n",
+    "mushrooms = pd.read_csv(url, header=None)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Normalizes the dataset to binary using one-hot encoding with the \"get_dummies\" method from pandas."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
@@ -70,10 +125,6 @@
     }
    ],
    "source": [
-    "# Load the database\n",
-    "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'\n",
-    "mushrooms = pd.read_csv(url, header=None)\n",
-    "\n",
     "# Create column names\n",
     "columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment',\n",
     "           'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',\n",
@@ -116,6 +167,14 @@
     "#### **4. Cross Validation.**"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Model performance is evaluated through cross-validation. In each iteration, 10% of the training data is reserved for testing."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,