Merge pull request #3 from TattaBio/andre

Update readme and task imports
TattaBio · Jul 2, 2024 · ade30a8 · ade30a8
2 parents 1894ba9 + 6b1c2ee
commit ade30a8
Show file tree

Hide file tree

Showing 8 changed files with 74 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -20,7 +20,6 @@
         <a href="#installation">Installation</a> |
         <a href="#usage">Usage</a> |
         <a href="https://huggingface.co/spaces/dgeb">Leaderboard</a> |
-        <a href="#documentation">Documentation</a> |
         <a href="#citing">Citing</a>
     <p>
 </h4>
@@ -29,6 +28,18 @@
     <a href="https://huggingface.co/spaces/dgeb"><img style="float: middle; padding: 10px 10px 10px 10px;" width="100" height="100" src="./docs/images/tatta_logo.png" /></a>
 </h3>
 
+
+DGEB is a benchmark for evaluating biological sequence models on functional and evolutionary information. 
+
+DGEB is designed to evaluate model embeddings using:
+ - Diverse sequences across the tree of life.
+ - Diverse tasks that capture different aspects of biological function.
+ - Both amino acid and nucleotide sequences.
+
+The current version of DGEB consists of 19 datasets covering all three domains of life (Bacteria, Archaea and Eukarya). DGEB evaluates embeddings using six different embedding tasks: Classification, BiGene mining, Evolutionary Distance Similarity (EDS), Pair Classification, Clustering, and Retrieval. 
+
+We welcome contributions of new tasks and datasets. 
+
 ## Installation
 
 Currently, DGEB sits on the Test PyPI index. Here's the command to install:
@@ -39,12 +50,19 @@ pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://
 
 ## Usage
 
-- Using the python script (see [run_dgeb.py](https://github.com/tattabio/dgeb/blob/main/run_dgeb.py)):
+- Launch evaluation using the python script (see [run_dgeb.py](https://github.com/tattabio/dgeb/blob/main/run_dgeb.py)):
 
 ```bash
 python run_dgeb.py --model facebook/esm2_t6_8M_UR50D
 ```
 
+- To see all supported models and tasks:
+
+```bash
+python run_dgeb.py --help
+```
+
+
 - Using the python API:
 
 ```py
@@ -58,7 +76,7 @@ evaluation.run(model, output_folder="results")
 
 ### Using a custom model
 
-Custom models should be wrapped with the `dgeb.models.BioSeqTransformer` abstract class, and specify the modality, number of layers, and embedding dimension. See see [models.py](https://github.com/tattabio/dgeb/blob/main/dgeb/models.py) for additional examples on custom model loading and inference.
+Custom models should be wrapped with the `dgeb.models.BioSeqTransformer` abstract class, and specify the modality, number of layers, and embedding dimension. See [models.py](https://github.com/tattabio/dgeb/blob/main/dgeb/models.py) for additional examples on custom model loading and inference.
 
 ```python
 import dgeb
@@ -80,39 +98,66 @@ class MyModel(BioSeqTransformer):
         return self.config.hidden_size
 
 
-model = MyModel()
+model = MyModel(model_name='path_to/huggingface_model')
 tasks = dgeb.get_tasks_by_modality(model.modality)
-evaluation = MTEB(tasks=tasks)
+evaluation = dgeb.DGEB(tasks=tasks)
 evaluation.run(model)
 ```
 
+
 ### Evaluating on a custom dataset
 
-TODO(andre): Update this section
+**We strongly encourage users to contribute their custom datasets to DGEB. Please open a PR adding your dataset so that the community can benefit!**
 
-To evaluate on a custom task, you can run the following code on your custom task.
+To evaluate on a custom dataset, first upload your dataset to the [Hugging Face Hub](https://huggingface.co/docs/hub/en/datasets-adding). Then define a `Task` subclass with `TaskMetadata` that points to your Hugging Face dataset. For example, a classification task on a custom dataset can be defined as follows:
 
 ```python
 import dgeb
-from dgeb.tasks import AbsTask
-
-class MyCustomTask(AbsTask):
-    def run(
-        self, model: BioSeqTransformer, layers: Optional[List[int]] = None
-    ) -> TaskResult:
-        pass
+from dgeb.models import BioSeqTransformer
+from dgeb.tasks import Dataset, Task, TaskMetadata, TaskResult
+from dgeb.tasks.classification_tasks import run_classification_task
+
+class MyCustomTask(Task):
+    metadata = TaskMetadata(
+        id="my_custom_classification",
+        display_name="...",
+        description="...",
+        type="classification",
+        modality=Modality.PROTEIN,
+        datasets=[
+            Dataset(
+                path="path_to/huggingface_dataset",
+                revision="...",
+            )
+        ],
+        primary_metric_id="f1",
+    )
+
+    def run(self, model: BioSeqTransformer) -> TaskResult:
+        return run_classification_task(model, self.metadata)
 
-model = dgeb.models.ESM("facebook/esm2_t6_8M_UR50D")
-evaluation = dgeb.DGEB(tasks=[MyCustomTask()])
+model = dgeb.get_model("facebook/esm2_t6_8M_UR50D")
+evaluation = dgeb.DGEB(tasks=[MyCustomTask])
 evaluation.run(model)
 ```
 
-</details>
+
+## Leaderboard
+
+TODO(nishant): 
+- Add link to leaderboard
+- Instructions for uploading results
+
+## Acknowledgements
+
+DGEB follows the design of the text embedding benchmark [MTEB](https://github.com/embeddings-benchmark/mteb) developed by Hugging Face 🤗. The evaluation code is adapted from the MTEB codebase.
 
 ## Citing
 
-dgeb was introduced in "[DGEB: Diverse Genomic Embedding Benchmark]()", feel free to cite:
+DGEB was introduced in "[DGEB: Diverse Genomic Embedding Benchmark]()", feel free to cite:
 
 TODO(andre): bibtex
 
-For works that have used dgeb for benchmarking, you can find them on the [leaderboard](https://huggingface.co/spaces/tattabio/DGEB/leaderboard).
+
+
+
diff --git a/dgeb/tasks/__init__.py b/dgeb/tasks/__init__.py
@@ -1,6 +1,6 @@
 # ruff: noqa: F403
 
-from .tasks import Task
+from .tasks import Dataset, Task, TaskMetadata, TaskResult
 from .eds_tasks import *
 from .pair_classification_tasks import *
 from .retrieval_tasks import *
@@ -9,5 +9,8 @@
 from .bigene_mining_tasks import *
 
 __all__ = [
+    "Dataset",
     "Task",
+    "TaskMetadata",
+    "TaskResult",
 ]
diff --git a/dgeb/tasks/bigene_mining_tasks.py b/dgeb/tasks/bigene_mining_tasks.py
@@ -9,7 +9,7 @@
 from dgeb.evaluators import BiGeneMiningEvaluator
 from dgeb.modality import Modality
 from dgeb.models import BioSeqTransformer
-from dgeb.tasks.tasks import Dataset, Task, TaskMetadata, TaskResult
+from dgeb.tasks import Dataset, Task, TaskMetadata, TaskResult
 
 logger = logging.getLogger(__name__)
 

diff --git a/dgeb/tasks/classification_tasks.py b/dgeb/tasks/classification_tasks.py
@@ -16,7 +16,7 @@
 )
 from dgeb.modality import Modality
 from dgeb.models import BioSeqTransformer
-from dgeb.tasks.tasks import Dataset, Task, TaskMetadata, TaskResult
+from dgeb.tasks import Dataset, Task, TaskMetadata, TaskResult
 
 logger = logging.getLogger(__name__)
 

diff --git a/dgeb/tasks/clustering_tasks.py b/dgeb/tasks/clustering_tasks.py
@@ -8,7 +8,7 @@
 from dgeb.evaluators import ClusteringEvaluator
 from dgeb.modality import Modality
 from dgeb.models import BioSeqTransformer
-from dgeb.tasks.tasks import Dataset, Task, TaskMetadata, TaskResult
+from dgeb.tasks import Dataset, Task, TaskMetadata, TaskResult
 
 logger = logging.getLogger(__name__)
 

diff --git a/dgeb/tasks/eds_tasks.py b/dgeb/tasks/eds_tasks.py
@@ -12,7 +12,7 @@
 from dgeb.evaluators import EDSEvaluator
 from dgeb.modality import Modality
 from dgeb.models import BioSeqTransformer
-from dgeb.tasks.tasks import Dataset, Task, TaskMetadata, TaskResult
+from dgeb.tasks import Dataset, Task, TaskMetadata, TaskResult
 
 logger = logging.getLogger(__name__)
 

diff --git a/dgeb/tasks/pair_classification_tasks.py b/dgeb/tasks/pair_classification_tasks.py
@@ -9,7 +9,7 @@
 from dgeb.evaluators import PairClassificationEvaluator
 from dgeb.modality import Modality
 from dgeb.models import BioSeqTransformer
-from dgeb.tasks.tasks import Dataset, Task, TaskMetadata, TaskResult
+from dgeb.tasks import Dataset, Task, TaskMetadata, TaskResult
 
 from ..eval_utils import paired_dataset
 

diff --git a/dgeb/tasks/retrieval_tasks.py b/dgeb/tasks/retrieval_tasks.py
@@ -9,7 +9,7 @@
 from dgeb.evaluators import RetrievalEvaluator
 from dgeb.modality import Modality
 from dgeb.models import BioSeqTransformer
-from dgeb.tasks.tasks import Dataset, Task, TaskMetadata, TaskResult
+from dgeb.tasks import Dataset, Task, TaskMetadata, TaskResult
 
 logger = logging.getLogger(__name__)