Skip to content

Commit

Permalink
Merge pull request #6 from TattaBio/leaderboard-v1
Browse files Browse the repository at this point in the history
Leaderboard w/ Mock data
  • Loading branch information
jlkravitz authored Jul 11, 2024
2 parents 2b8f500 + 7445114 commit 316bdd6
Show file tree
Hide file tree
Showing 238 changed files with 45,975 additions and 103 deletions.
27 changes: 27 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Docker image for the DGEB leaderboard (Gradio app).
FROM python:3.11-slim

WORKDIR /app

# Install curl (presumably required by the uv installer script) and remove the
# apt package lists in the same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get install -y curl \
    && rm -rf /var/lib/apt/lists/*

# Install uv with its installer script, then delete the script.
# NOTE(review): the script is fetched unpinned, and the uv path used below
# assumes it installs into /root/.cargo/bin -- confirm if the installer changes.
ADD https://astral.sh/uv/install.sh /install.sh
RUN chmod +x /install.sh && /install.sh && rm /install.sh

# Install Python dependencies before copying the sources so this layer stays
# cached when only the code changes.
COPY leaderboard/requirements.txt ./
RUN /root/.cargo/bin/uv pip install --system --no-cache -r requirements.txt

# Copy sources.
COPY dgeb dgeb
COPY leaderboard/ leaderboard/

# Run gradio when the container launches.
EXPOSE 7860
# Bind to all interfaces so the app is reachable from outside the container.
ENV GRADIO_SERVER_NAME="0.0.0.0"
ENV GRADIO_TEMP_DIR="/app"
WORKDIR /app/leaderboard
CMD ["python", "app.py"]


31 changes: 29 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
---
title: DGEB
app_file : leaderboard/app.py
sdk: docker
sdk_version: 4.36.1
---
<h1 align="center">Diverse Genomic Embedding Benchmark</h1>

<p align="center">
Expand Down Expand Up @@ -80,7 +86,7 @@ Custom models should be wrapped with the `dgeb.models.BioSeqTransformer` abstrac
```python
import dgeb
from dgeb.models import BioSeqTransformer
from dgeb.modality import Modality
from dgeb.tasks.tasks import Modality

class MyModel(BioSeqTransformer):

Expand Down Expand Up @@ -141,7 +147,28 @@ evaluation.run(model)

## Leaderboard

TODO
To add your submission to the DGEB leaderboard, proceed through the following instructions.

1. Fork the DGEB repository by following GitHub's instructions: [Forking Workflow](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork).

2. Add your submission `.json` file to the `leaderboard/submissions/<HF_MODEL_NAME>/` directory.

```bash
mv /path/to/<SUBMISSION_FILE>.json /path/to/DGEB/leaderboard/submissions/<HF_MODEL_NAME>/
```

3. Update your fork with the new submission:

```bash
git add leaderboard/submissions/<HF_MODEL_NAME>/<SUBMISSION_FILE>.json
git commit -m "Add submission for <HF_MODEL_NAME>"
git push
```

4. Open a pull request to the main branch of the repository via the GitHub interface.

5. Once the PR is reviewed and merged, your submission will be added to the leaderboard!


## Acknowledgements

Expand Down
2 changes: 0 additions & 2 deletions dgeb/modality.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
"""Defines the data modality enum."""

from enum import Enum


Expand Down
50 changes: 0 additions & 50 deletions dgeb/results.py

This file was deleted.

97 changes: 61 additions & 36 deletions dgeb/tasks/tasks.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,29 @@
"""Task abstract class for evaluation and results."""

import logging
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Literal, Optional

from typing import List, Literal, Optional, Any
from importlib.metadata import version
from enum import Enum
import datasets
from pydantic import BaseModel, model_validator
from abc import ABC, abstractmethod


# HACK: prefer the canonical Modality enum from modality.py and fall back to a
# local copy only when that import is unavailable.
try:
    from ..modality import Modality
except ImportError:
    # Only import failures trigger the fallback; any other error propagates
    # instead of being silently masked.
    # Super hack to get the leaderboard working.
    # SHOULD MATCH the code exactly in modality.py
    # TODO: can we read the file and run that code?
    from enum import Enum

    class Modality(Enum):
        """Data modality, either DNA or protein sequence."""

        PROTEIN = "protein"
        DNA = "dna"

from ..modality import Modality
from ..models import BioSeqTransformer

logging.basicConfig(level=logging.INFO)

Expand All @@ -35,10 +50,44 @@ class LayerResult(BaseModel):
metrics: List[TaskMetric]


class GEBModel(BaseModel):
    """Metadata describing a model."""

    # Model name; presumably the Hugging Face identifier -- confirm with callers.
    hf_name: str
    # Model size descriptors.
    num_layers: int
    num_params: int
    embed_dim: int


class Dataset(BaseModel):
    """Reference to a dataset by path and revision."""

    path: str
    revision: str

    def load(self) -> datasets.DatasetDict:
        """Load the dataset at ``path`` for the configured ``revision``.

        Returns:
            The loaded ``datasets.DatasetDict``.

        Raises:
            ValueError: if the loaded object is not a ``datasets.DatasetDict``.
        """
        loaded = datasets.load_dataset(self.path, revision=self.revision)
        if isinstance(loaded, datasets.DatasetDict):
            return loaded
        raise ValueError(
            f"Dataset {self.path} is not a datasets.DatasetDict object."
        )


class TaskMetadata(BaseModel):
    """Static description of a task."""

    id: str
    display_name: str
    description: str
    modality: Modality
    type: TaskType
    # Datasets used by the task.
    # Each entry is a `Dataset` whose fields are passed to `datasets.load_dataset()`.
    datasets: List[Dataset]
    primary_metric_id: str


# tasks.py
class TaskResult(BaseModel):
dgeb_version: str
task: "TaskMetadata"
# TODO: Convert model to ModelMetadata
model: Dict[str, Any]
model: GEBModel
results: List[LayerResult]

@model_validator(mode="after")
Expand All @@ -55,10 +104,11 @@ def check_valid_primary_metric(self):
@staticmethod
def from_dict(
task_metadata: "TaskMetadata",
layer_results: Dict[str, Any],
model_metadata: Dict[str, Any],
layer_results: LayerResult,
model_metadata: GEBModel,
):
return TaskResult(
dgeb_version=version("dgeb"),
task=task_metadata,
model=model_metadata,
results=list(
Expand All @@ -75,36 +125,11 @@ def from_dict(
)


class Dataset(BaseModel):
    """Reference to a dataset by path and revision."""

    path: str
    revision: str

    def load(self) -> datasets.DatasetDict:
        """Load the dataset at ``path`` for the configured ``revision``.

        Returns:
            The loaded ``datasets.DatasetDict``.

        Raises:
            ValueError: if the loaded object is not a ``datasets.DatasetDict``.
        """
        loaded = datasets.load_dataset(self.path, revision=self.revision)
        if isinstance(loaded, datasets.DatasetDict):
            return loaded
        raise ValueError(
            f"Dataset {self.path} is not a datasets.DatasetDict object."
        )


class TaskMetadata(BaseModel):
    """Static description of a task."""

    id: str
    display_name: str
    description: str
    modality: Modality
    type: TaskType
    # Datasets used by the task.
    # Each entry is a `Dataset` whose fields are passed to `datasets.load_dataset()`.
    datasets: List[Dataset]
    primary_metric_id: str


# move to model.py?
class Task(ABC):
    """Abstract base class for a task; concrete tasks implement `run`."""

    metadata: TaskMetadata

    # using Any instead of "BioSeqTransformer" to avoid installing all deps in leaderboard
    @abstractmethod
    def run(self, model: Any, layers: Optional[List[int]] = None) -> TaskResult:
        """Run the task on ``model``.

        Args:
            model: The model to run the task on. Typed as ``Any`` so that this
                module does not need to import the model classes.
            layers: Optional list of ints selecting layers (presumably layer
                indices -- confirm with implementations). Defaults to ``None``.

        Returns:
            The ``TaskResult`` for this task.
        """
        pass
8 changes: 8 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
version: "3"
services:
  # Leaderboard service, built from the Dockerfile in this directory.
  dgeb-leaderboard:
    build:
      context: ./
      dockerfile: Dockerfile
    ports:
      # host:container -- the container side is the port the app listens on (7860).
      # NOTE(review): host port 7680 differs from 7860; confirm this is intended
      # and not a transposition.
      - "7680:7860"
2 changes: 2 additions & 0 deletions leaderboard/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/.projectile
**/__pycache__/
Binary file added leaderboard/DGEB_Figure.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions leaderboard/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# To set up the HF repo to receive origin pushes
git remote set-url --add origin git@hf.co:spaces/tattabio/DGEB
Empty file added leaderboard/__init__.py
Empty file.
Loading

0 comments on commit 316bdd6

Please sign in to comment.