Merge pull request #12 from SasCezar/dev

Submodule Bump and Minor Fixes
SasCezar · Jan 30, 2024 · 04529f5 · 04529f5
2 parents 5d230cd + bb29154
commit 04529f5
Show file tree

Hide file tree

Showing 10 changed files with 75 additions and 28 deletions.
diff --git a/CITATION.cff b/CITATION.cff
@@ -8,7 +8,7 @@ authors:
   given-names: "Andrea"
   orcid: "https://orcid.org/0000-0001-9469-6050"
 title: "AutoFL"
-version: 0.3.1
+version: 0.4.1
 doi: "10.5281/zenodo.10255368"
 date-released: 2023-09-01
 url: "https://github.com/SasCezar/AutoFL"
diff --git a/README.md b/README.md
@@ -64,9 +64,10 @@ the [config](config) folder.
 The main configuration file is [main.yaml](./config/main.yaml), which contains the following options:
 
 - **local**: which environment to use, either local or docker. [Docker](./config/local/docker.yaml) is default.
-- **taxonomy**: which taxonomy to use. Currently only [gitranking](./config/taxonomy/gitranking.yaml) is supported.
+- **taxonomy**: which taxonomy to use. Currently only [gitranking](./config/taxonomy/gitranking.yaml) is supported, but
+  custom taxonomies can be added.
 - **annotator**: which annotators to use. Default is [simple](./config/annotator/simple.yaml), which allows good results
-  without extra dependencies on models.
+  without extra dependencies on language models.
 - **version_strategy**: which version strategy to use. Default is [latest](./config/version_strategy/latest.yaml), which
   will only analyze the latest version of the project.
 - **dataloader**: which dataloader to use. Default is [postgres](./config/dataloader/postgres.yaml) which allows the API
@@ -96,6 +97,10 @@ Other configuration can be defined by creating a new file in the folder of the s
 
 ## Development
 
+The tool is composed of multiple components; their interaction is shown in the following diagram:
+
+![Architecture](resources/architecture/architecture.png)
+
 ### Add New Languages
 
 In order to support more languages, a new language specific parser is needed.
@@ -205,7 +210,7 @@ However, this tool is more up to date, easier to use, more configurable, and als
           month     = dec,
           title     = {{AutoFL}},
           url       = {https://github.com/SasCezar/AutoFL},
-          version   = {0.4.0},
+          version   = {0.4.1},
           year      = {2023},
           url       = {https://doi.org/10.5281/zenodo.10255368},
           doi       = {10.5281/zenodo.10255368}

diff --git a/autofl-ui b/autofl-ui
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "autofl"
-version = "0.4.0"
+version = "0.4.1"
 description = ""
 authors = ["Cezar Sas <cezar.sas@gmail.com>"]
 readme = "README.md"

diff --git a/resources/architecture/architecture.png b/resources/architecture/architecture.png
diff --git a/src/ensemble/avg.py b/src/ensemble/avg.py
@@ -1,5 +1,5 @@
 
-from typing import List, Union
+from typing import List, Union, Tuple
 
 import numpy as np
 
@@ -11,7 +11,10 @@ class AverageEnsemble(EnsembleBase):
     """
     Ensemble method that averages the annotations.
     """
-    def run(self, annotations: List[Annotation]):
-        annotations = [x.distribution for x in annotations if not x.unannotated]
-        mean = np.mean(annotations, axis=0)
-        return mean
+    def run(self, annotations: List[Annotation]) -> Tuple[Union[List | np.array], int]:
+        """
+        Average the distributions of all annotations that are not flagged as unannotated.
+        :param annotations: the annotations to combine; each one exposes ``distribution`` and ``unannotated``.
+        :return: ``(mean, 0)`` with the element-wise mean over the usable distributions when there is at least one,
+                 otherwise ``(annotations[0], 1)``.
+        """
+        annotated = np.array([x.distribution for x in annotations if not x.unannotated])
+        # Check the number of rows explicitly: the truth value of a multi-element ndarray raises ValueError.
+        if len(annotated) > 0:
+            mean = np.mean(annotated, axis=0)
+            return mean, 0
+
+        return annotations[0], 1
diff --git a/src/ensemble/cascade.py b/src/ensemble/cascade.py
@@ -1,4 +1,4 @@
-from typing import List, Union
+from typing import List, Union, Tuple
 
 import numpy as np
 
@@ -10,8 +10,9 @@ class CascadeEnsemble(EnsembleBase):
     """
     Ensemble method that iterates over the annotations and picks the first annotation that is not unannotated.
     """
-    def run(self, annotations: List[Annotation]):
+    def run(self, annotations: List[Annotation]) -> Tuple[Union[List | np.array], int]:
+        """
+        Return the first annotation that is not flagged as unannotated.
+        :param annotations: candidate annotations, scanned in list order.
+        :return: ``(annotation, 0)`` for the first usable annotation, or ``(annotations[0], 1)`` when every
+                 annotation is flagged as unannotated.
+        """
         for annotation in annotations:
             if not annotation.unannotated:
-                return annotation
-        return annotations[0]
+                return annotation, 0
+
+        return annotations[0], 1
diff --git a/src/ensemble/ensemble.py b/src/ensemble/ensemble.py
@@ -1,5 +1,5 @@
 from abc import ABC
-from typing import List, Union
+from typing import List, Union, Tuple
 
 import numpy as np
 
@@ -13,19 +13,42 @@ class EnsembleBase(ABC):
     probabilities for each label. The ensemble method should return a single annotation, which is a list of probabilities
     for each label.
     """
+
     def __init__(self):
+        """
+        Create the ensemble. The base class stores no state.
+        """
         pass
 
-    def __call__(self, annotations: List[Union[np.array, Annotation]], *args, **kwargs):
-        return self.run(annotations)
-
-    def run(self, annotations: List[Annotation]):
+    def __call__(self, annotations: List[Union[np.array, Annotation]], *args, **kwargs) \
+            -> Tuple[Union[List | np.array], int]:
+        """
+        Combine the annotations with ``run`` and normalize the result with ``normalize``.
+        Making the ensemble method callable also allows functions, instead of classes, to be used as ensemble
+        methods. This is useful for ensemble methods that do not have any state.
+        :param annotations: the annotations to combine; passed unchanged to ``run``.
+        :param args: accepted but not used by this method.
+        :param kwargs: accepted but not used by this method.
+        :return: a tuple of the normalized first element returned by ``run`` and the flag returned by ``run``.
+        """
+        distributions, unannotated = self.run(annotations)
+        return self.normalize(distributions), unannotated
+
+    def run(self, annotations: List[Annotation]) -> Tuple[Union[List | np.array], int]:
+        """
+        Run the ensemble method. This method should be implemented by subclasses.
+        The ensemble method is called with a list of annotations, where each annotation is a list of probabilities for
+        each label. The ensemble method should return a single annotation, which is a list of probabilities for each
+        label. The ensemble method should also return an integer flag: 0 when it was able to produce a valid
+        annotation and 1 when it was not. If the ensemble method was not able to produce a valid annotation, the
+        ensemble method should return the first annotation in the list of annotations.
+        :param annotations: the annotations to combine.
+        :return: a tuple of the resulting annotation and the 0/1 flag; the base implementation returns None.
+        """
         pass
 
+    @staticmethod
+    def normalize(annotations: np.array) -> np.array:
+        """
+        Normalize the annotations by dividing them by their norm, as computed by ``np.linalg.norm`` with its default
+        arguments. This method is used on the ensemble result.
+        NOTE(review): the result has unit norm but does not necessarily sum to 1, and an all-zero input divides by
+        zero - confirm this is the intended normalization.
+        :param annotations: the values to scale; converted with ``np.array`` before the division.
+        :return: the scaled values as a numpy array.
+        """
+        return np.array(annotations) / np.linalg.norm(annotations)
 
-class EnsembleNone(EnsembleBase):
-    """
-    Ensemble method that does not do anything. This is useful for single annotator experiments.
-    """
-    def run(self, annotations: List[Annotation]):
-        return annotations
diff --git a/src/ensemble/none.py b/src/ensemble/none.py
@@ -0,0 +1,15 @@
+from typing import List, Tuple, Union
+
+import numpy as np
+
+from ensemble.ensemble import EnsembleBase
+from entity.annotation import Annotation
+
+
+class NoneEnsemble(EnsembleBase):
+    """
+    Ensemble method that does not do anything. This is useful for single annotator experiments.
+    """
+
+    def run(self, annotations: List[Annotation]) -> Tuple[Union[List | np.array], int]:
+        """
+        Return the first annotation without combining it with the others.
+        :param annotations: the annotations; only the first element is used.
+        :return: ``(annotations[0], 0)``.
+        """
+        return annotations[0], 0
diff --git a/src/ensemble/voting.py b/src/ensemble/voting.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Tuple, Union
 
 import numpy as np
 
@@ -11,7 +11,7 @@ def __init__(self, k=10):
         super().__init__()
         self.k = k
 
-    def run(self, annotations: List[Annotation]):
+    def run(self, annotations: List[Annotation]) -> Tuple[Union[List | np.array], int]:
         best, n = self.extract_best(annotations)
 
         if not best: