diff --git a/CITATION.cff b/CITATION.cff index 7f1f478..7d4693b 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -8,7 +8,7 @@ authors: given-names: "Andrea" orcid: "https://orcid.org/0000-0001-9469-6050" title: "AutoFL" -version: 0.3.1 +version: 0.4.1 doi: "10.5281/zenodo.10255368" date-released: 2023-09-01 url: "https://github.com/SasCezar/AutoFL" \ No newline at end of file diff --git a/README.md b/README.md index 541a9c3..86259a6 100644 --- a/README.md +++ b/README.md @@ -64,9 +64,10 @@ the [config](config) folder. The main configuration file is [main.yaml](./config/main.yaml), which contains the following options: - **local**: which environment to use, either local or docker. [Docker](./config/local/docker.yaml) is default. -- **taxonomy**: which taxonomy to use. Currently only [gitranking](./config/taxonomy/gitranking.yaml) is supported. +- **taxonomy**: which taxonomy to use. Currently only [gitranking](./config/taxonomy/gitranking.yaml) is supported, but + custom taxonomies can be added. - **annotator**: which annotators to use. Default is [simple](./config/annotator/simple.yaml), which allows good results - without extra dependencies on models. + without extra dependencies on language models. - **version_strategy**: which version strategy to use. Default is [latest](./config/version_strategy/latest.yaml), which will only analyze the latest version of the project. - **dataloader**: which dataloader to use. Default is [postgres](./config/dataloader/postgres.yaml) which allows the API @@ -96,6 +97,10 @@ Other configuration can be defined by creating a new file in the folder of the s ## Development +The tool is composed of multiple components; their interaction is shown in the following diagram: + +![Architecture](resources/architecture/architecture.png) + ### Add New Languages In order to support more languages, a new language specific parser is needed. 
@@ -205,7 +210,7 @@ However, this tool is more up to date, easier to use, more configurable, and als month = dec, title = {{AutoFL}}, url = {https://github.com/SasCezar/AutoFL}, - version = {0.4.0}, + version = {0.4.1}, year = {2023}, url = {https://doi.org/10.5281/zenodo.10255368}, doi = {10.5281/zenodo.10255368} diff --git a/autofl-ui b/autofl-ui index 14a5e5f..b697f23 160000 --- a/autofl-ui +++ b/autofl-ui @@ -1 +1 @@ -Subproject commit 14a5e5f3bf4f7b0ba4a083dc5d9edc56f675f1de +Subproject commit b697f23cd1aed04cc1ac7c24aded46a9f4fb6155 diff --git a/pyproject.toml b/pyproject.toml index b0c2ac1..c5c709c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "autofl" -version = "0.4.0" +version = "0.4.1" description = "" authors = ["Cezar Sas "] readme = "README.md" diff --git a/resources/architecture/architecture.png b/resources/architecture/architecture.png new file mode 100644 index 0000000..24267ef Binary files /dev/null and b/resources/architecture/architecture.png differ diff --git a/src/ensemble/avg.py b/src/ensemble/avg.py index fddad89..d0af567 100644 --- a/src/ensemble/avg.py +++ b/src/ensemble/avg.py @@ -1,5 +1,5 @@ -from typing import List, Union +from typing import List, Union, Tuple import numpy as np @@ -11,7 +11,10 @@ class AverageEnsemble(EnsembleBase): """ Ensemble method that averages the annotations. 
""" - def run(self, annotations: List[Annotation]): - annotations = [x.distribution for x in annotations if not x.unannotated] - mean = np.mean(annotations, axis=0) - return mean + def run(self, annotations: List[Annotation]) -> Tuple[Union[List | np.array], int]: + annotated = np.array([x.distribution for x in annotations if not x.unannotated]) + if annotated.size: # use .size: the bare truth value of a multi-element ndarray raises ValueError + mean = np.mean(annotated, axis=0) + return mean, 0 + + return annotations[0], 1 diff --git a/src/ensemble/cascade.py b/src/ensemble/cascade.py index 16d9dab..1c750d5 100644 --- a/src/ensemble/cascade.py +++ b/src/ensemble/cascade.py @@ -1,4 +1,4 @@ -from typing import List, Union +from typing import List, Union, Tuple import numpy as np @@ -10,8 +10,9 @@ class CascadeEnsemble(EnsembleBase): """ Ensemble method that iterates over the annotations and picks the first annotation that is not unannotated. """ - def run(self, annotations: List[Annotation]): + def run(self, annotations: List[Annotation]) -> Tuple[Union[List | np.array], int]: for annotation in annotations: if not annotation.unannotated: - return annotation - return annotations[0] + return annotation, 0 + + return annotations[0], 1 diff --git a/src/ensemble/ensemble.py b/src/ensemble/ensemble.py index 8a545d2..97068d5 100644 --- a/src/ensemble/ensemble.py +++ b/src/ensemble/ensemble.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import List, Union +from typing import List, Union, Tuple import numpy as np @@ -13,19 +13,42 @@ class EnsembleBase(ABC): probabilities for each label. The ensemble method should return a single annotation, which is a list of probabilities for each label. 
""" + def __init__(self): pass - def __call__(self, annotations: List[Union[np.array, Annotation]], *args, **kwargs): - return self.run(annotations) - - def run(self, annotations: List[Annotation]): + def __call__(self, annotations: List[Union[np.array, Annotation]], *args, **kwargs) \ + -> Tuple[Union[List | np.array], int]: + """ + Making the ensemble method callable also allows plain functions to serve as ensemble methods instead of classes. This + is useful for ensemble methods that do not have any state. + :param annotations: the annotations to combine; forwarded unchanged to run + :param args: accepted but not used + :param kwargs: accepted but not used + :return: tuple of (result of run passed through normalize, flag returned by run) + """ + distributions, unannotated = self.run(annotations) + return self.normalize(distributions), unannotated + + def run(self, annotations: List[Annotation]) -> Tuple[Union[List | np.array], int]: + """ + Run the ensemble method. This method should be implemented by subclasses. + The ensemble method is called with a list of annotations, where each annotation is a list of probabilities for + each label. The ensemble method should return a single annotation, which is a list of probabilities for each + label. The ensemble method should also return an integer flag: 0 if it was able to produce a valid annotation, + 1 otherwise. If the ensemble method was not able to produce a valid annotation, the ensemble + method should return the first annotation in the list of annotations. + :param annotations: the annotations to combine + :return: tuple of (combined annotation, unannotated flag) + """ pass + @staticmethod + def normalize(annotations: np.array) -> np.array: + """ + Normalize the annotations. This method is used to bring the ensemble result into probability vectors. + :param annotations: the ensemble result to scale + :return: the input divided by np.linalg.norm of the input. NOTE(review): the default norm is L2, so the entries do not sum to 1 in general - confirm intent + """ + return np.array(annotations) / np.linalg.norm(annotations) -class EnsembleNone(EnsembleBase): - """ - Ensemble method that does not do anything. This is useful for single annotator experiments. 
- """ - def run(self, annotations: List[Annotation]): - return annotations diff --git a/src/ensemble/none.py b/src/ensemble/none.py new file mode 100644 index 0000000..77a004c --- /dev/null +++ b/src/ensemble/none.py @@ -0,0 +1,15 @@ +from typing import List, Tuple, Union + +import numpy as np + +from ensemble.ensemble import EnsembleBase +from entity.annotation import Annotation + + +class NoneEnsemble(EnsembleBase): + """ + Ensemble method that does not do anything. This is useful for single annotator experiments. + """ + + def run(self, annotations: List[Annotation]) -> Tuple[Union[List | np.array], int]: + return annotations[0], 0 diff --git a/src/ensemble/voting.py b/src/ensemble/voting.py index 04a7e9d..f7a635f 100644 --- a/src/ensemble/voting.py +++ b/src/ensemble/voting.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Tuple, Union import numpy as np @@ -11,7 +11,7 @@ def __init__(self, k=10): super().__init__() self.k = k - def run(self, annotations: List[Annotation]): + def run(self, annotations: List[Annotation]) -> Tuple[Union[List | np.array], int]: best, n = self.extract_best(annotations) if not best: