From b1d3f7236e307dc67cb842bbf3f0eae034cfc208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Paulo?= <88985821+Joao-Paulo-Silva@users.noreply.github.com> Date: Thu, 15 Jun 2023 18:31:44 -0300 Subject: [PATCH] v0.1.21 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this version, improvements have been made to the examples in Jupyter Notebook to provide a clearer explanation and better understanding of cells. Also, support for changing the "p" value of the Minkowski distance has been implemented. ------- Nesta versão, foram realizadas melhorias nos exemplos em Jupyter Notebook para fornecer uma explicação mais clara e um melhor entendimento das células. Também foi implementado suporte para alterar o valor "p" da distância de Minkowski. --- aisp/NSA/__init__.py | 3 +- aisp/NSA/_negativeSelection.py | 50 ++++++++++--- aisp/_base.py | 22 ++++-- ...e_with_randomly_generated_dataset-en.ipynb | 2 +- ...e_with_randomly_generated_dataset-pt.ipynb | 4 +- .../BNSA/mushrooms_dataBase_example_en.ipynb | 67 ++++++++++++++++- .../mushrooms_dataBase_example_pt-br.ipynb | 67 +++++++++++++++-- ...e_with_randomly_generated_dataset-en.ipynb | 70 ++++++++++++++++-- ...e_with_randomly_generated_dataset-pt.ipynb | 42 +++++++---- .../RNSA/geyser_dataBase_example_en.ipynb | 14 +++- .../RNSA/geyser_dataBase_example_pt-br.ipynb | 39 +++++++++- examples/RNSA/iris_dataBase_example_en.ipynb | 42 ++++++++++- .../RNSA/iris_dataBase_example_pt-br.ipynb | 41 +++++++++- pyproject.toml | 9 ++- requirements.txt | Bin 84 -> 84 bytes 15 files changed, 406 insertions(+), 66 deletions(-) diff --git a/aisp/NSA/__init__.py b/aisp/NSA/__init__.py index 91833bb..2499c09 100644 --- a/aisp/NSA/__init__.py +++ b/aisp/NSA/__init__.py @@ -1,4 +1,5 @@ from ._negativeSelection import RNSA, BNSA __author__ = 'João Paulo da Silva Barros' -__all__ = ['RNSA', 'BNSA'] \ No newline at end of file +__all__ = ['RNSA', 'BNSA'] +__version__ = '0.1.21' \ No newline at end of file diff --git a/aisp/NSA/_negativeSelection.py b/aisp/NSA/_negativeSelection.py index 78b7376..eb520a2 100644 --- a/aisp/NSA/_negativeSelection.py +++ b/aisp/NSA/_negativeSelection.py @@ -31,8 +31,15 @@ class RNSA(Base): Defaults to ``'default-NSA'``. - * cell_bounds (``bool``): If set to ``True``, this option limits the generation of detectors to - the space within the plane between 0 and 1. + * non_self_label (``str``): This variable stores the label that will be assigned when the data has only one + output class, and the sample is classified as not belonging to that class. Defaults to ``'non-self'``. + * cell_bounds (``bool``): If set to ``True``, this option limits the generation of detectors to the space within + the plane between 0 and 1. This means that any detector whose radius exceeds this limit is discarded, + this variable is only used in the ``V-detector`` algorithm. Defaults to ``False``. + * p (``float``): This parameter stores the value of ``p`` used in the Minkowski distance. The default is ``2``, which + represents normalized Euclidean distance. Different values of p lead to different variants of the Minkowski + distance [learn more](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html). + * detectors (``dict``): This variable stores a list of detectors by class. * classes (``npt.NDArray``): list of output classes. @@ -60,13 +67,19 @@ class RNSA(Base): Defaults to ``'default-NSA'``. 
- * cell_bounds (``bool``): Se definido como ``True``, esta opção limita a geração dos detectores ao espaço - do plano compreendido entre 0 e 1. - + * non_self_label (``str``): Esta variável armazena o rótulo que será atribuído quando os dados possuírem + apenas uma classe de saída, e a amostra for classificada como não pertencente a essa classe. Defaults to ``'non-self'``. + * cell_bounds (``bool``): Se definido como ``True``, esta opção limita a geração dos detectores ao espaço do plano + compreendido entre 0 e 1. Isso significa que qualquer detector cujo raio ultrapasse esse limite é descartado, + e esta variável é usada exclusivamente no algoritmo ``V-detector``. + * p (``float``): Este parâmetro armazena o valor de ``p`` utilizada na distância de Minkowski. O padrão é ``2``, o que significa + distância euclidiana normalizada. Diferentes valores de p levam a diferentes variantes da distância de + Minkowski [saiba mais](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html). * detectors (``dict``): Essa variável armazena uma lista com detectores por classes. * classes (``npt.NDArray``): lista com as classes de saída. """ + def __init__( self, N: int = 100, @@ -77,7 +90,7 @@ def __init__( max_discards: int = 1000, seed: int = None, algorithm: Literal['default-NSA', 'V-detector'] ='default-NSA', - **kwargs: Dict[str, Union[bool, str]] + **kwargs: Dict[str, Union[bool, str, float]] ): """ Negative Selection class constructor (``RNSA``). @@ -97,7 +110,7 @@ def __init__( * metric (``str``): Way to calculate the distance between the detector and the sample: * ``'Euclidean'`` ➜ The calculation of the distance is given by the expression: √( (x₁ – x₂)² + (y₁ – y₂)² + ... + (yn – yn)²). - * ``'minkowski'`` ➜ The calculation of the distance is given by the expression: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ, In this project ``p == 2``. + * ``'minkowski'`` ➜ The calculation of the distance is given by the expression: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ. * ``'manhattan'`` ➜ The calculation of the distance is given by the expression: ( |x₁ – x₂| + |y₁ – y₂| + ... + |yn – yn|) . Defaults to ``'euclidean'``. @@ -119,7 +132,9 @@ def __init__( - cell_bounds (``bool``): If set to ``True``, this option limits the generation of detectors to the space within the plane between 0 and 1. This means that any detector whose radius exceeds this limit is discarded, this variable is only used in the ``V-detector`` algorithm. Defaults to ``False``. - + - p (``float``): This parameter stores the value of ``p`` used in the Minkowski distance. The default is ``2``, which + represents normalized Euclidean distance. Different values of p lead to different variants of the Minkowski + distance [learn more](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html). --- Construtor da classe de Seleção negativa (``RNSA``). @@ -138,7 +153,7 @@ def __init__( * metric (``str``): Forma para se calcular a distância entre o detector e a amostra: * ``'euclidiana'`` ➜ O cálculo da distância dá-se pela expressão: √( (x₁ – x₂)² + (y₁ – y₂)² + ... + (yn – yn)²). - * ``'minkowski'`` ➜ O cálculo da distância dá-se pela expressão: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ, Neste projeto ``p == 2``. + * ``'minkowski'`` ➜ O cálculo da distância dá-se pela expressão: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ. * ``'manhattan'`` ➜ O cálculo da distância dá-se pela expressão: ( |x₁ – x₂| + |y₁ – y₂| + ... + |yn – yn|). 
Defaults to ``'euclidean'``. @@ -159,6 +174,9 @@ def __init__( - cell_bounds (``bool``): Se definido como ``True``, esta opção limita a geração dos detectores ao espaço do plano compreendido entre 0 e 1. Isso significa que qualquer detector cujo raio ultrapasse esse limite é descartado, e esta variável é usada exclusivamente no algoritmo ``V-detector``. + - p (``float``): Este parâmetro armazena o valor de ``p`` utilizada na distância de Minkowski. O padrão é ``2``, o que significa + distância euclidiana normalizada. Diferentes valores de p levam a diferentes variantes da distância de + Minkowski [saiba mais](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html). """ if metric == 'manhattan' or metric == 'minkowski' or metric == 'euclidean': @@ -198,10 +216,17 @@ def __init__( else: self._Detector = namedtuple("Detector", "position") self._algorithm: str = 'default-NSA' - + + if max_discards > 0: + self.max_discards: int = max_discards + else: + self.max_discards: int = 1000 + + # Obtém as variáveis do kwargs. + self.p: float = kwargs.get('p', 2) self._cell_bounds: bool = kwargs.get('cell_bounds', False) self.non_self_label: str = kwargs.get('non_self_label', 'non-self') - self.max_discards: int = max_discards + self.detectors: Union[dict, None] = None self.classes: npt.NDArray = None @@ -688,7 +713,8 @@ def get_params(self, deep: bool = True) -> dict: 'seed': self.seed, 'algorithm': self._algorithm, 'r_s': self.r_s, - 'cell_bounds': self._cell_bounds + 'cell_bounds': self._cell_bounds, + 'p': self.p } class BNSA(Base): diff --git a/aisp/_base.py b/aisp/_base.py index d9c0373..e53ab1e 100644 --- a/aisp/_base.py +++ b/aisp/_base.py @@ -13,16 +13,21 @@ class Base: A classe base contém funções que são utilizadas por mais de uma classe do pacote, e por isso são consideradas essenciais para o funcionamento geral do sistema. """ - def __init__(self, metric: str = 'euclidean'): + def __init__(self, metric: str = 'euclidean', p: float = 2): """ Parameters: --- * metric (``str``): Way to calculate the distance between the detector and the sample: * ``'Euclidean'`` ➜ The calculation of the distance is given by the expression: √( (x₁ – x₂)² + (y₁ – y₂)² + ... + (yn – yn)²). - * ``'minkowski'`` ➜ The calculation of the distance is given by the expression: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ , In this project ``p == 2``. + * ``'minkowski'`` ➜ The calculation of the distance is given by the expression: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ. * ``'manhattan'`` ➜ The calculation of the distance is given by the expression: ( |x₁ – x₂| + |y₁ – y₂| + ... + |yn – yn|) . + + * p (``float``): This parameter stores the value of ``p`` used in the Minkowski distance. + The default is ``2``, which represents normalized Euclidean distance. Different values of p lead to + different variants of the Minkowski distance [learn more](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html). + --- Parameters: @@ -30,15 +35,20 @@ def __init__(self, metric: str = 'euclidean'): * metric (``str``): Forma para se calcular a distância entre o detector e a amostra: * ``'euclidiana'`` ➜ O cálculo da distância dá-se pela expressão: √( (x₁ – x₂)² + (y₁ – y₂)² + ... + (yn – yn)²). - * ``'minkowski'`` ➜ O cálculo da distância dá-se pela expressão: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p) ¹/ₚ , Neste projeto ``p == 2``. + * ``'minkowski'`` ➜ O cálculo da distância dá-se pela expressão: ( |X₁ – Y₁|p + |X₂ – Y₂|p + ... + |Xn – Yn|p). 
* ``'manhattan'`` ➜ O cálculo da distância dá-se pela expressão: ( |x₁ – x₂| + |y₁ – y₂| + ... + |yn – yn|). Defaults to ``'euclidean'``. + + * p (``float``): Este parâmetro armazena o valor de ``p`` utilizada na distância de Minkowski. + O padrão é ``2``, o que significa distância euclidiana normalizada. Diferentes valores de p levam a + diferentes variantes da distância de Minkowski [saiba mais](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html). """ if metric == 'manhattan' or metric == 'minkowski' or metric == 'euclidean': - self.metric = metric + self.metric: str = metric else: - self.metric = 'euclidean' + self.metric: str = 'euclidean' + self.p: float = p def _distance(self, u: npt.NDArray, v: npt.NDArray): """ @@ -69,7 +79,7 @@ def _distance(self, u: npt.NDArray, v: npt.NDArray): if self.metric == 'manhattan': return cityblock(u, v) elif self.metric == 'minkowski': - return minkowski(u, v, 2) + return minkowski(u, v, self.p) else: return euclidean(u, v) diff --git a/examples/BNSA/example_with_randomly_generated_dataset-en.ipynb b/examples/BNSA/example_with_randomly_generated_dataset-en.ipynb index d720ac8..1be6c77 100644 --- a/examples/BNSA/example_with_randomly_generated_dataset-en.ipynb +++ b/examples/BNSA/example_with_randomly_generated_dataset-en.ipynb @@ -219,7 +219,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.4" }, "orig_nbformat": 4, "vscode": { diff --git a/examples/BNSA/example_with_randomly_generated_dataset-pt.ipynb b/examples/BNSA/example_with_randomly_generated_dataset-pt.ipynb index 19ec45e..0bbff95 100644 --- a/examples/BNSA/example_with_randomly_generated_dataset-pt.ipynb +++ b/examples/BNSA/example_with_randomly_generated_dataset-pt.ipynb @@ -49,7 +49,7 @@ "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", - "from scipy.spatial.distance import cdist, hamming\n", + "from scipy.spatial.distance import cdist\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import confusion_matrix, classification_report, accuracy_score" ] @@ -224,7 +224,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.4" }, "orig_nbformat": 4, "vscode": { diff --git a/examples/BNSA/mushrooms_dataBase_example_en.ipynb b/examples/BNSA/mushrooms_dataBase_example_en.ipynb index e89795a..33e12ba 100644 --- a/examples/BNSA/mushrooms_dataBase_example_en.ipynb +++ b/examples/BNSA/mushrooms_dataBase_example_en.ipynb @@ -55,6 +55,61 @@ "#### **2. Load the database and binarize them.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading the \"Mushroom\" dataset. This dataset contains information about mushrooms with two output categories: poisonous and edible. It includes the following 22 characteristics per sample:\n", + "\n", + "1. Cap-shape: convex, conical, flat, knobbed, bell, sunken.\n", + "2. Cap-surface: fibrous, grooves, smooth, scaly.\n", + "3. Cap-color: brown, gray, pink, green, purple, red, white, yellow, chocolate.\n", + "4. Bruises: yes, no.\n", + "5. Odor: almond, anise, creosote, foul, spicy, fishy, floury, mushroomy, none.\n", + "6. Gill-attachment: attached, free, none.\n", + "7. Gill-spacing: close, crowded.\n", + "8. Gill-size: broad, narrow.\n", + "9. Gill-color: black, brown, pink, gray, orange, pink, green, purple, red, white, yellow, chocolate.\n", + "10. 
Stalk-shape: enlarging, tapering.\n", + "11. Stalk-root: bulbous, club, cup, equal, rooted, missing.\n", + "12. Stalk-surface-above-ring: fibrous, scaly, silky, smooth.\n", + "13. Stalk-surface-below-ring: fibrous, scaly, silky, smooth.\n", + "14. Stalk-color-above-ring: brown, gray, pink, orange, white, yellow, red, chocolate.\n", + "15. Stalk-color-below-ring: brown, gray, pink, orange, white, yellow, red, chocolate.\n", + "16. Veil-type: partial, universal.\n", + "17. Veil-color: brown, orange, white, yellow.\n", + "18. Ring-number: none, one, two.\n", + "19. Ring-type: evanescent, large, none, pendant.\n", + "20. Spore-print-color: black, brown, pink, green, orange, purple, white, yellow, chocolate.\n", + "21. Population: abundant, clustered, numerous, scattered, several, solitary.\n", + "22. Habitat: grasses, leaves, meadows, paths, urban, waste, woods.\n", + "\n", + "This dataset is also available at the following links: [kaggle](https://www.kaggle.com/datasets/uciml/mushroom-classification) and [UCIML](https://archive.ics.uci.edu/dataset/73/mushroom).\n", + "\n", + "\n", + "Mushroom. (1987). UCI Machine Learning Repository. https://doi.org/10.24432/C5959T." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the database\n", + "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'\n", + "mushrooms = pd.read_csv(url, header=None)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Normalizes the dataset to binary using one-hot encoding with the \"get_dummies\" method from pandas." + ] + }, { "cell_type": "code", "execution_count": 2, @@ -70,10 +125,6 @@ } ], "source": [ - "# Load the database\n", - "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'\n", - "mushrooms = pd.read_csv(url, header=None)\n", - "\n", "# Create column names\n", "columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment',\n", " 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',\n", @@ -116,6 +167,14 @@ "#### **4. Cross Validation.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Model performance is evaluated through cross-validation. In each iteration, 10% of the training data is reserved for testing." + ] + }, { "cell_type": "code", "execution_count": 4, diff --git a/examples/BNSA/mushrooms_dataBase_example_pt-br.ipynb b/examples/BNSA/mushrooms_dataBase_example_pt-br.ipynb index a0752c9..3223d9f 100644 --- a/examples/BNSA/mushrooms_dataBase_example_pt-br.ipynb +++ b/examples/BNSA/mushrooms_dataBase_example_pt-br.ipynb @@ -53,6 +53,60 @@ "#### **2. Carregar a base de dados e binarizando-os**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Carregando a base de dados \"Mushroom\". Essa base de dados contém informações sobre cogumelos com duas categorias de saída: venenosos e comestíveis. 
Ela inclui as seguintes 22 características por amostra:\n", + "\n", + "- Forma do chapéu (Cap-shape): convexo, cônico, plano, em formato de nódulo, em formato de sino, afundado.\n", + "- Superfície do chapéu (Cap-surface): fibroso, sulcado, liso, escamoso.\n", + "- Cor do chapéu (Cap-color): marrom, cinza, rosa, verde, roxo, vermelho, branco, amarelo, chocolate.\n", + "- Contusões (Bruises): sim, não.\n", + "- Odor: amêndoa, anis, creosoto, podre, picante, peixe, farinha, cogumelo, sem odor.\n", + "- Fixação das lamelas (Gill-attachment): anexada, solta, nenhuma.\n", + "- Espaçamento das lamelas (Gill-spacing): próximo, lotado.\n", + "- Tamanho das lamelas (Gill-size): largo, estreito.\n", + "- Cor das lamelas (Gill-color): preto, marrom, rosa, cinza, laranja, rosa, verde, roxo, vermelho, branco, amarelo, chocolate.\n", + "- Forma do caule (Stalk-shape): alargando, afinando.\n", + "- Raiz do caule (Stalk-root): bulbosa, clube, taça, igual, enraizada, ausente.\n", + "- Superfície do caule acima do anel (Stalk-surface-above-ring): fibrosa, escamosa, sedosa, lisa.\n", + "- Superfície do caule abaixo do anel (Stalk-surface-below-ring): fibrosa, escamosa, sedosa, lisa.\n", + "- Cor do caule acima do anel (Stalk-color-above-ring): marrom, cinza, rosa, laranja, branco, amarelo, vermelho, chocolate.\n", + "- Cor do caule abaixo do anel (Stalk-color-below-ring): marrom, cinza, rosa, laranja, branco, amarelo, vermelho, chocolate.\n", + "- Tipo de véu (Veil-type): parcial, universal.\n", + "- Cor do véu (Veil-color): marrom, laranja, branco, amarelo.\n", + "- Número de anéis (Ring-number): nenhum, um, dois.\n", + "- Tipo de anel (Ring-type): evanescente, grande, nenhum, pendente.\n", + "- Cor da impressão de esporos (Spore-print-color): preto, marrom, rosa, verde, laranja, roxo, branco, amarelo, chocolate.\n", + "- População: abundante, agrupada, numerosa, dispersa, várias, solitária.\n", + "- Habitat: gramados, folhas, pradarias, caminhos, áreas urbanas, resíduos, madeira.\n", + "\n", + "Essa base de dados também está disponível nos seguintes links: [kaggle](https://www.kaggle.com/datasets/uciml/mushroom-classification) e [UCIML](https://archive.ics.uci.edu/dataset/73/mushroom).\n", + "\n", + "Mushroom. (1987). UCI Machine Learning Repository. https://doi.org/10.24432/C5959T." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Carrega a base de dados\n", + "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'\n", + "mushrooms = pd.read_csv(url, header=None)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Normaliza a base de dados para binaria usando a codificação one-hot utilizando o método \"get_dummies\" do pandas." 
+ ] + }, { "cell_type": "code", "execution_count": 2, @@ -68,10 +122,6 @@ } ], "source": [ - "# Carrega a base de dados\n", - "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'\n", - "mushrooms = pd.read_csv(url, header=None)\n", - "\n", "# Cria nomes de colunas\n", "columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment',\n", " 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',\n", @@ -79,7 +129,6 @@ " 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']\n", "\n", "mushrooms.columns = columns\n", - "\n", "# Converte todas as variáveis categóricas em variáveis binárias\n", "mushrooms_binary = pd.get_dummies(mushrooms, columns=columns[1:], drop_first=True)\n", "dados = mushrooms_binary.drop('class', axis=1).to_numpy()\n", @@ -113,6 +162,14 @@ "#### **5. Validação Cruzada.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "O desempenho do modelo é avaliado por meio de validação cruzada. Em cada iteração, 10% dos dados de treinamento são reservados para teste." + ] + }, { "cell_type": "code", "execution_count": 4, diff --git a/examples/RNSA/example_with_randomly_generated_dataset-en.ipynb b/examples/RNSA/example_with_randomly_generated_dataset-en.ipynb index 371cfef..1a08dde 100644 --- a/examples/RNSA/example_with_randomly_generated_dataset-en.ipynb +++ b/examples/RNSA/example_with_randomly_generated_dataset-en.ipynb @@ -50,6 +50,43 @@ "from sklearn.metrics import confusion_matrix, classification_report, accuracy_score" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Generating dice bubbles for classes randomly." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the make_blobs function, two sets of data are generated in the form of bubbles, in the range between 0 and 1, representing each class x and y. Then this data is separated into test and training sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generating the samples and outputs for the training.\n", + "samples, output = make_blobs(n_samples=500 , n_features=2, cluster_std=0.07, center_box=([0.0, 1.0]), centers=[[0.25, 0.75], [0.75, 0.25]], random_state=1234) \n", + "# Separating data for training and testing.\n", + "train_x, test_x, train_y, test_y = train_test_split(samples, output, test_size=0.2)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Testing the model ``default-NSA``:\n", + "Start the model with 500 detectors, each with a radius of 0.06. Then, it presents the result of the forecast accuracy." 
+ ] + }, { "cell_type": "code", "execution_count": 3, @@ -82,10 +119,6 @@ "source": [ "# Starting the class.\n", "nsa = RNSA(N=500, r=0.06, seed=123)\n", - "# Generating the samples and outputs for the training.\n", - "samples, output = make_blobs(n_samples=500 , n_features=2, cluster_std=0.07, center_box=([0.0, 1.0]), centers=[[0.25, 0.75], [0.75, 0.25]], random_state=1234) \n", - "# Separating data for training and testing.\n", - "train_x, test_x, train_y, test_y = train_test_split(samples, output, test_size=0.2)\n", "# Carrying out the training:\n", "nsa.fit(X=train_x, y=train_y)\n", "# Previewing classes with test samples.\n", @@ -120,6 +153,14 @@ "plt.show()" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. Detector and sample plotting:" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -168,6 +209,15 @@ "plot_detectores(samples, None, nsa, 0)" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5. Testing the model ``V-detector``:\n", + "Start the model with 50 detectors, where the minimum radius is 0.05 and the sample's own radius is 0.04. It then shows the forecast accuracy result." + ] + }, { "cell_type": "code", "execution_count": 6, @@ -207,10 +257,6 @@ "source": [ "# Starting the class.\n", "nsa = RNSA(N=20, r=0.02, algorithm='V-detector', r_s=0.04, seed=123)\n", - "# Generating the samples and outputs for the training.\n", - "samples, output = make_blobs(n_samples=500 , n_features=2, cluster_std=0.07, center_box=([0.0, 1.0]), centers=[[0.25, 0.75], [0.75, 0.25]], random_state=1234) \n", - "# Separating data for training and testing.\n", - "train_x, test_x, train_y, test_y = train_test_split(samples, output, test_size=0.2)\n", "# Carrying out the training:\n", "nsa.fit(X=train_x, y=train_y)\n", "# Previewing classes with test samples.\n", @@ -245,6 +291,14 @@ "plt.show()" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6. Detector and sample plotting:" + ] + }, { "cell_type": "code", "execution_count": 8, diff --git a/examples/RNSA/example_with_randomly_generated_dataset-pt.ipynb b/examples/RNSA/example_with_randomly_generated_dataset-pt.ipynb index 95dedf5..91d1167 100644 --- a/examples/RNSA/example_with_randomly_generated_dataset-pt.ipynb +++ b/examples/RNSA/example_with_randomly_generated_dataset-pt.ipynb @@ -51,7 +51,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 2. Iniciando o modelo e aplicando-o em amostras geradas aleatoriamente para testar o desempenho do classificador raio fixo." + "#### 2. Gerando bolhas de dados para as classe aleatoriamente.\n", + "\n", + "Utilizando a função make_blobs, são gerados dois conjuntos de dados em forma de bolhas, no intervalo entre 0 e 1, representando cada classe x e y. Em seguida, esses dados são separados em conjuntos de teste e treinamento." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Gerando as amostras e saídas para o treinamento.\n", + "samples, output = make_blobs(n_samples=500 , cluster_std=0.07, center_box=([0.0, 1.0]), centers=[[0.25, 0.75], [0.75, 0.25]], random_state=1234) \n", + "# Separando dados para treinamento e teste.\n", + "train_x, test_x, train_y, test_y = train_test_split(samples, output, test_size=0.2)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. 
Testando o modelo ``default-NSA``:\n", + "Inicia o modelo com 500 detectores, cada um com um raio de 0.06. Em seguida, apresenta o resultado da acurácia da previsão." ] }, { @@ -86,11 +109,6 @@ "source": [ "# Iniciando a classe.\n", "nsa = RNSA(N=500, r=0.06, seed=1234)\n", - "# Gerando as amostras e saídas para o treinamento.\n", - "samples, output = make_blobs(n_samples=500 , cluster_std=0.07, center_box=([0.0, 1.0]), centers=[[0.25, 0.75], [0.75, 0.25]], random_state=1234) \n", - "# Separando dados para treinamento e teste.\n", - "train_x, test_x, train_y, test_y = train_test_split(samples, output, test_size=0.2)\n", - "\n", "# Efetuando o treinamento: \n", "nsa.fit(X=train_x, y=train_y)\n", "# Efetuando a previsão:: \n", @@ -130,7 +148,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 3. Plotando os detectores gerados com as amostras de treinamento." + "#### 4. Plotando os detectores gerados com as amostras de treinamento." ] }, { @@ -184,7 +202,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 4. Iniciando o modelo e aplicando-o em amostras geradas aleatoriamente para testar o desempenho do classificador raio variavel." + "#### 5. Testando o modelo ``V-detector``:\n", + "Inicia o modelo com 50 detectores, onde o raio mínimo é de 0.05 e o raio próprio das amostras é de 0.04. Em seguida, mostra o resultado da acurácia da previsão." ] }, { @@ -219,11 +238,6 @@ "source": [ "# Iniciando a classe.\n", "nsa = RNSA(N=50, r=0.05, algorithm='V-detector', r_s=0.04, seed=1234)\n", - "# Gerando as amostras e saídas para o treinamento.\n", - "samples, output = make_blobs(n_samples=500 , cluster_std=0.07, center_box=([0.0, 1.0]), centers=[[0.25, 0.75], [0.75, 0.25]], random_state=1234) \n", - "# Separando dados para treinamento e teste.\n", - "train_x, test_x, train_y, test_y = train_test_split(samples, output, test_size=0.2)\n", - "\n", "# Efetuando o treinamento: \n", "nsa.fit(X=train_x, y=train_y)\n", "# Efetuando a previsão:: \n", @@ -263,7 +277,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 5. Plotando os detectores gerados com as amostras de treinamento." + "#### 6. Plotando os detectores gerados com as amostras de treinamento." ] }, { diff --git a/examples/RNSA/geyser_dataBase_example_en.ipynb b/examples/RNSA/geyser_dataBase_example_en.ipynb index ab1b73a..1978f37 100644 --- a/examples/RNSA/geyser_dataBase_example_en.ipynb +++ b/examples/RNSA/geyser_dataBase_example_en.ipynb @@ -59,7 +59,7 @@ "outputs": [], "source": [ "# function to normalize the data using functions from scikit-learn.\n", - "def normalizeDados(dados):\n", + "def normalize_data(dados):\n", " scaler = MinMaxScaler().fit(dados)\n", " return scaler.transform(dados)" ] @@ -71,6 +71,16 @@ "#### **3. Load the database using the seaborn package.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading the \"Geyser\" dataset using the Seaborn library. It contains information about the eruption time of the Old Faithful geyser, with two characteristics: duration and time between eruptions. Hence, the outputs are categorized as either \"long\" or \"short\" eruptions.\n", + "\n", + "You can also access this dataset through the following links: [kaggle](https://www.kaggle.com/datasets/pyim59/cours-ml-geyser) and [Github](https://github.com/mwaskom/seaborn-data/blob/master/geyser.csv)." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -99,7 +109,7 @@ "outputs": [], "source": [ "# Normalizing the data between 0 and 1\n", - "dados = normalizeDados(dados)\n", + "dados = normalize_data(dados)\n", "# Generating the training and testing sets.\n", "train_x, test_x, train_y, test_y = train_test_split(dados, classes, test_size=0.30)" ] diff --git a/examples/RNSA/geyser_dataBase_example_pt-br.ipynb b/examples/RNSA/geyser_dataBase_example_pt-br.ipynb index 1e45f92..c423c42 100644 --- a/examples/RNSA/geyser_dataBase_example_pt-br.ipynb +++ b/examples/RNSA/geyser_dataBase_example_pt-br.ipynb @@ -59,7 +59,7 @@ "outputs": [], "source": [ "# função para normalizar os dados utilizando funções do scikit-learn.\n", - "def normalizeDados(dados):\n", + "def normalize_dados(dados):\n", " scaler = MinMaxScaler().fit(dados)\n", " return scaler.transform(dados)" ] @@ -71,6 +71,15 @@ "#### **3. Carregar a base de dados usando o pacote seaborn**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Carregando a base de dados “Geyser” através da biblioteca Seaborn. Ela contém informações sobre tempo de erupção do gêiser Old Faithful com duas características, duração e tempo entre as erupções. Logo, as saídas são o tipo de erupções longas (“long”) ou curtas (“short”).\n", + "Essa base de dados também está disponível nos seguintes links, [kaggle](https://www.kaggle.com/datasets/pyim59/cours-ml-geyser) e [Github](https://github.com/mwaskom/seaborn-data/blob/master/geyser.csv)" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -92,6 +101,14 @@ "##### **4.1 Separação de treinamento e teste (30% para teste)**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Celula responsável pela normalização dos dados para valores entre 0 e 1 e pela separação dos conjuntos de treinamento e teste." + ] + }, { "cell_type": "code", "execution_count": 4, @@ -99,7 +116,7 @@ "outputs": [], "source": [ "# Normalizando os dados entre 0 e 1\n", - "dados = normalizeDados(dados)\n", + "dados = normalize_dados(dados)\n", "# Gerando os conjuntos de treinamento e teste.\n", "train_x, test_x, train_y, test_y = train_test_split(dados, classes, test_size=0.30)" ] @@ -118,6 +135,14 @@ "#### **5. Validação Cruzada.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "O desempenho do modelo é avaliado por meio de validação cruzada. Em cada iteração, 10% dos dados de treinamento são reservados para teste." + ] + }, { "cell_type": "code", "execution_count": 5, @@ -264,6 +289,14 @@ "#### **8. Validação Cruzada.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "O desempenho do modelo é avaliado por meio de validação cruzada. Em cada iteração, 10% dos dados de treinamento são reservados para teste." 
+ ] + }, { "cell_type": "code", "execution_count": 8, @@ -413,7 +446,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.4" }, "orig_nbformat": 4, "vscode": { diff --git a/examples/RNSA/iris_dataBase_example_en.ipynb b/examples/RNSA/iris_dataBase_example_en.ipynb index 1132e13..efcadd5 100644 --- a/examples/RNSA/iris_dataBase_example_en.ipynb +++ b/examples/RNSA/iris_dataBase_example_en.ipynb @@ -59,7 +59,7 @@ "outputs": [], "source": [ "# function to normalize the data using functions from scikit-learn.\n", - "def normalizeDados(dados):\n", + "def normalize_data(dados):\n", " scaler = MinMaxScaler().fit(dados)\n", " return scaler.transform(dados)" ] @@ -71,6 +71,18 @@ "#### **3. Load the database using the seaborn package.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading the \"iris\" dataset using the Seaborn library. The iris dataset comprises four floral characteristics, namely the size and width of the sepals and petals. Consequently, there are three output varieties for these samples: setosa, versicolor, and virginica.\n", + "\n", + "This dataset is also available at the following links: [kaggle](https://www.kaggle.com/datasets/uciml/iris) and [UCIML](http://archive.ics.uci.edu/dataset/53/iris).\n", + "\n", + "Fisher,R. A.. (1988). Iris. UCI Machine Learning Repository. https://doi.org/10.24432/C56C76." + ] + }, { "cell_type": "code", "execution_count": 3, @@ -92,6 +104,14 @@ "##### **4.1 Separation of training and testing (30% for testing)**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cell responsible for normalizing the data to values between 0 and 1 and for separating the training and testing sets." + ] + }, { "cell_type": "code", "execution_count": 4, @@ -99,7 +119,7 @@ "outputs": [], "source": [ "# Normalizing the data between 0 and 1\n", - "dados = normalizeDados(dados)\n", + "dados = normalize_data(dados)\n", "# Generating the training and testing sets.\n", "train_x, test_x, train_y, test_y = train_test_split(dados, classes, test_size=0.3)" ] @@ -119,6 +139,14 @@ "#### **5. Cross Validation.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Model performance is evaluated through cross-validation. In each iteration, 10% of the training data is reserved for testing." + ] + }, { "cell_type": "code", "execution_count": 5, @@ -264,6 +292,14 @@ "#### **8. Cross Validation.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Model performance is evaluated through cross-validation. In each iteration, 10% of the training data is reserved for testing." + ] + }, { "cell_type": "code", "execution_count": 11, @@ -413,7 +449,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.7" + "version": "3.11.4" }, "orig_nbformat": 4, "vscode": { diff --git a/examples/RNSA/iris_dataBase_example_pt-br.ipynb b/examples/RNSA/iris_dataBase_example_pt-br.ipynb index 3740970..3f4dcf2 100644 --- a/examples/RNSA/iris_dataBase_example_pt-br.ipynb +++ b/examples/RNSA/iris_dataBase_example_pt-br.ipynb @@ -59,7 +59,7 @@ "outputs": [], "source": [ "# função para normalizar os dados utilizando funções do scikit-learn.\n", - "def normalizeDados(dados):\n", + "def normalize_dados(dados):\n", " scaler = MinMaxScaler().fit(dados)\n", " return scaler.transform(dados)" ] @@ -71,6 +71,17 @@ "#### **3. 
Carregar a base de dados usando o pacote seaborn**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Carregando a base de dados \"iris\" através da biblioteca Seaborn. Iris contém quatro características florais, ou seja, o tamanho e a largura das sépalas e pétalas. Logo, existe três variedades de saída para essas amostras: setosa, versicolor e virginica.\n", + "Essa base de dados também está disponível nos seguintes links, [kaggle](https://www.kaggle.com/datasets/uciml/iris) e [UCIML](http://archive.ics.uci.edu/dataset/53/iris)\n", + "\n", + "Fisher,R. A.. (1988). Iris. UCI Machine Learning Repository. https://doi.org/10.24432/C56C76." + ] + }, { "cell_type": "code", "execution_count": 3, @@ -92,6 +103,14 @@ "##### **4.1 Separação de treinamento e teste (30% para teste)**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Celula responsável pela normalização dos dados para valores entre 0 e 1 e pela separação dos conjuntos de treinamento e teste." + ] + }, { "cell_type": "code", "execution_count": 4, @@ -99,7 +118,7 @@ "outputs": [], "source": [ "# Normalizando os dados entre 0 e 1\n", - "dados = normalizeDados(dados)\n", + "dados = normalize_dados(dados)\n", "# Gerando os conjuntos de treinamento e teste.\n", "train_x, test_x, train_y, test_y = train_test_split(dados, classes, test_size=0.30)" ] @@ -119,6 +138,14 @@ "#### **5. Validação Cruzada.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "O desempenho do modelo é avaliado por meio de validação cruzada. Em cada iteração, 10% dos dados de treinamento são reservados para teste." + ] + }, { "cell_type": "code", "execution_count": 7, @@ -266,6 +293,14 @@ "#### **8. Validação Cruzada.**" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "O desempenho do modelo é avaliado por meio de validação cruzada. Em cada iteração, 10% dos dados de treinamento são reservados para teste." + ] + }, { "cell_type": "code", "execution_count": 10, @@ -416,7 +451,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.4" }, "orig_nbformat": 4, "vscode": { diff --git a/pyproject.toml b/pyproject.toml index d456369..8da73b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "aisp" -version = "0.1.1" +version = "0.1.21" authors = [ { name="João Paulo da Silva Barros", email="jpsilvabarr@gmail.com" }, ] @@ -35,9 +35,14 @@ classifiers = [ dependencies = [ "numpy>=1.22.4", "scipy>=1.8.1", - "tqdm==4.64.1", + "tqdm>=4.64.1", ] +[tool.poetry] +readme = "README.md" + +repository = "https://github.com/AIS-Package/aisp" + keywords = ["Artificial Immune Systems", "classification", "Natural computing", "machine learning", "artificial intelligence"] [project.urls] diff --git a/requirements.txt b/requirements.txt index b2d12b86096c368fc74d3e053c5e92ade776626a..5ebf87b00fb505c1e90b9b158a0418f4187b5361 100644 GIT binary patch delta 21 YcmWFuncyN|$6(7~!l1`s1||&|04;?BGXMYp delta 21 YcmWFuncyN|%V5i3!l1`s1||&|04;R`G5`Po
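
The configurable Minkowski order added in this patch is forwarded to `scipy.spatial.distance.minkowski`, so its effect can be checked directly against SciPy. The sketch below is a minimal illustration of that behaviour and of passing `p` through the `RNSA` constructor's `**kwargs`; the vectors `u` and `v` are arbitrary placeholders, while the names `RNSA`, `metric`, `p` and `get_params` come from the patch itself.

```python
import numpy as np
from scipy.spatial.distance import cityblock, euclidean, minkowski

from aisp.NSA import RNSA

u = np.array([0.10, 0.40, 0.70])
v = np.array([0.30, 0.10, 0.55])

# p == 1 matches the Manhattan (cityblock) distance and p == 2 the Euclidean one;
# larger orders weight the largest coordinate difference more heavily.
assert np.isclose(minkowski(u, v, 1), cityblock(u, v))
assert np.isclose(minkowski(u, v, 2), euclidean(u, v))
print(minkowski(u, v, 3))  # a different member of the Minkowski family

# The order is picked up from **kwargs (default 2) and exposed by get_params().
nsa = RNSA(N=250, r=0.05, metric='minkowski', p=3, seed=123)
print(nsa.get_params()['p'])  # -> 3
```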
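As the docstrings above note, `cell_bounds` only has an effect in the `V-detector` algorithm, where `r` acts as the minimum detector radius and `r_s` as the radius assumed around each training sample. Below is a small configuration sketch contrasting the two variants; it uses only constructor arguments that appear in this patch and in the example notebooks.

```python
from aisp.NSA import RNSA

# Fixed-radius detectors (default-NSA): every detector uses radius r.
nsa_fixed = RNSA(N=500, r=0.06, seed=123)

# Variable-radius detectors (V-detector): r is the minimum radius and r_s the
# radius assumed around each training sample.  With cell_bounds=True, any
# detector whose radius would cross the [0, 1] limits of the normalized
# feature space is discarded.
nsa_vdetector = RNSA(N=50, r=0.05, algorithm='V-detector', r_s=0.04,
                     cell_bounds=True, seed=123)

# Either variant can be combined with a custom Minkowski order.
nsa_vdet_minkowski = RNSA(N=50, r=0.05, algorithm='V-detector', r_s=0.04,
                          metric='minkowski', p=4, seed=123)
```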
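The cross-validation cells added to the notebooks are described in the new markdown, but their code is not part of this diff. The loop below is a hypothetical reconstruction of that procedure under the stated 10% hold-out, reusing the `make_blobs` cell that the RNSA notebooks do include; the iteration count, the model parameters and the `predict` call are assumptions rather than the notebooks' exact code.

```python
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from aisp.NSA import RNSA

# Two blobs in [0, 1], generated as in the RNSA example notebooks.
samples, output = make_blobs(n_samples=500, n_features=2, cluster_std=0.07,
                             center_box=(0.0, 1.0),
                             centers=[[0.25, 0.75], [0.75, 0.25]],
                             random_state=1234)
train_x, test_x, train_y, test_y = train_test_split(samples, output, test_size=0.2)

accuracy_list = []
for _ in range(10):  # number of iterations: an assumption
    # Reserve 10% of the training data for validation in this iteration.
    fold_x, valid_x, fold_y, valid_y = train_test_split(train_x, train_y,
                                                        test_size=0.10)
    model = RNSA(N=500, r=0.06)        # parameters taken from the notebook cells
    model.fit(X=fold_x, y=fold_y)
    prev_y = model.predict(valid_x)    # predict: assumed scikit-learn-style API
    accuracy_list.append(accuracy_score(valid_y, prev_y))

print(f"Mean accuracy: {np.mean(accuracy_list):.4f} ± {np.std(accuracy_list):.4f}")
```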