Merge pull request #8 from TattaBio/cli

Cli & cleanup
TattaBio · Jul 8, 2024 · 9698c8f · 9698c8f
2 parents cfc5799 + 58bdcba
commit 9698c8f
Show file tree

Hide file tree

Showing 6 changed files with 38 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -28,17 +28,17 @@
     <a href="https://huggingface.co/spaces/dgeb"><img style="float: middle; padding: 10px 10px 10px 10px;" width="100" height="100" src="./docs/images/tatta_logo.png" /></a>
 </h3>
 
-
-DGEB is a benchmark for evaluating biological sequence models on functional and evolutionary information. 
+DGEB is a benchmark for evaluating biological sequence models on functional and evolutionary information.
 
 DGEB is designed to evaluate model embeddings using:
- - Diverse sequences accross the tree of life.
- - Diverse tasks that capture different aspects of biological function.
- - Both amino acid and nucleotide sequences.
 
-The current version of DGEB consists of 18 datasets covering all three domains of life (Bacteria, Archaea and Eukarya). DGEB evaluates embeddings using six different embedding tasks: Classification, BiGene mining, Evolutionary Distance Similarity (EDS), Pair Classification, Clustering, and Retrieval. 
+- Diverse sequences across the tree of life.
+- Diverse tasks that capture different aspects of biological function.
+- Both amino acid and nucleotide sequences.
+
+The current version of DGEB consists of 18 datasets covering all three domains of life (Bacteria, Archaea and Eukarya). DGEB evaluates embeddings using six different embedding tasks: Classification, BiGene mining, Evolutionary Distance Similarity (EDS), Pair Classification, Clustering, and Retrieval.
 
-We welcome contributions of new tasks and datasets. 
+We welcome contributions of new tasks and datasets.
 
 ## Installation
 
@@ -50,19 +50,18 @@ pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://
 
 ## Usage
 
-- Launch evaluation using the python script (see [run_dgeb.py](https://github.com/tattabio/dgeb/blob/main/run_dgeb.py)):
+- Launch evaluation using the `dgeb` command-line tool (see [cli.py](https://github.com/tattabio/dgeb/blob/main/dgeb/cli.py)):
 
 ```bash
-python run_dgeb.py --model facebook/esm2_t6_8M_UR50D
+dgeb --model facebook/esm2_t6_8M_UR50D
 ```
 
 - To see all supported models and tasks:
 
 ```bash
-python run_dgeb.py --help
+dgeb --help
 ```
 
-
 - Using the python API:
 
 ```py
@@ -104,7 +103,6 @@ evaluation = dgeb.DGEB(tasks=tasks)
 evaluation.run(model)
 ```
 
-
 ### Evaluating on a custom dataset
 
 **We strongly encourage users to contribute their custom datasets to DGEB. Please open a PR adding your dataset so that the community can benefit!**
@@ -141,10 +139,10 @@ evaluation = dgeb.DGEB(tasks=[MyCustomTask])
 evaluation.run(model)
 ```
 
-
 ## Leaderboard
 
-TODO(nishant): 
+TODO(nishant):
+
 - Add link to leaderboard
 - Instruction for uploading results
 
@@ -157,7 +155,3 @@ DGEB follows the design of text embedding benchmark [MTEB](https://github.com/emb
 DGEB was introduced in "[DGEB: Diverse Genomic Embedding Benchmark]()", feel free to cite:
 
 TODO(andre): bibtex
-
-
-
-
diff --git a/benchmarks.png b/benchmarks.png
diff --git a/run_dgeb.py → dgeb/cli.py b/run_dgeb.py → dgeb/cli.py
diff --git a/dgeb/tasks/retrieval_tasks.py b/dgeb/tasks/retrieval_tasks.py
@@ -62,7 +62,6 @@ class ArchRetrieval(Task):
             ),
             Dataset(
                 path="tattabio/arch_retrieval_qrels",
-                description="Relevance between query and corpus proteins",
                 revision="3f142f2f9a0995d56c6e77188c7251761450afcf",
             ),
         ],
@@ -87,7 +86,6 @@ class EukRetrieval(Task):
             ),
             Dataset(
                 path="tattabio/euk_retrieval_qrels",
-                description="Relevance between query and corpus proteins",
                 revision="a5aa01e9b9738074aba57fc07434e352c4c71e4b",
             ),
         ],

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,12 @@ version = "0.0.8"
 description = "Diverse Genomic Embedding Benchmark"
 readme = "README.md"
 license = { file = "LICENSE" }
-keywords = ["scientific software", "genomic embeddings", "machine learning", "benchmark"]
+keywords = [
+    "scientific software",
+    "genomic embeddings",
+    "machine learning",
+    "benchmark",
+]
 classifiers = [
     "Development Status :: 2 - Pre-Alpha",
     "Environment :: Console",
@@ -32,29 +37,29 @@ dependencies = [
     "seaborn>=0.13.2",
     "torch>=2.3.1",
     "tqdm>=4.66.4",
-    "transformers>=4.41.2"
+    "transformers>=4.41.2",
 ]
 
 [project.urls]
-homepage = "https://github.com/TattaBio/dgeb"
+homepage = "https://github.com/TattaBio/DGEB"
 "Huggingface Organization" = "https://huggingface.co/tattabio"
-"Source Code" = "https://github.com/TattaBio/dgeb"
+"Source Code" = "https://github.com/TattaBio/DGEB"
+
+[project.scripts]
+dgeb = "dgeb.cli:main"
 
 [project.optional-dependencies]
 dev = ["ruff>=0.0.254", "pytest", "pytest-xdist"]
 
 [tool.setuptools.packages.find]
-exclude = ["tests", "results"]
+exclude = ["tests", "results", "leaderboard", "Dockerfile"]
 
 [tool.setuptools.package-data]
 "*" = ["*.json"]
 
 [tool.ruff]
 target-version = "py38"
-exclude = [
- ".venv",
- "build/"
-]
+exclude = [".venv", "build/"]
 line-length = 88
 indent-width = 4
 
@@ -97,7 +102,18 @@ env = "GIT_COMMIT_AUTHOR"
 default = "semantic-release <semantic-release>"
 
 [tool.semantic_release.commit_parser_options]
-allowed_tags = ["build", "chore", "ci", "docs", "feat", "fix", "perf", "style", "refactor", "test"]
+allowed_tags = [
+    "build",
+    "chore",
+    "ci",
+    "docs",
+    "feat",
+    "fix",
+    "perf",
+    "style",
+    "refactor",
+    "test",
+]
 minor_tags = ["feat"]
 patch_tags = ["fix", "perf"]
 default_bump_level = 0

diff --git a/plot_benchmarks.py → scripts/plot_benchmarks.py b/plot_benchmarks.py → scripts/plot_benchmarks.py