Skip to content

Commit

Permalink
Cherry-pick pstjohn md5sum check into main (#46)
Browse files Browse the repository at this point in the history
* Make artifact downloads more robust

Adds md5sum checks and retries to downloads to hopefully prevent CI
failures on malformed downloads.

* only check md5sum for pbss

---------

Signed-off-by: Ohad Mosafi <omosafi@nvidia.com>
Co-authored-by: Peter St. John <pstjohn@nvidia.com>
  • Loading branch information
ohadmo and pstjohn authored Aug 5, 2024
1 parent 919b5b7 commit aeb208a
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 3 deletions.
13 changes: 13 additions & 0 deletions artifact_paths.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,49 +22,57 @@ models:
symlink:
source: "../../esm1nv.nemo"
target: "protein/esm1nv/esm1nv.nemo"
md5sum: 0ba2bf7d7e539e3fc8bff323fa1ab784 # pragma: allowlist secret
prott5nv:
ngc: "nvidia/clara/prott5nv:1.0"
pbss: "s3://bionemo-ci/models/prott5nv.nemo"
symlink:
source: "../../prott5nv.nemo"
target: "protein/prott5nv/prott5nv.nemo"
md5sum: 3181758ab636fce7993e36482f971a50 # pragma: allowlist secret
megamolbart:
ngc: "nvidia/clara/megamolbart:1.0"
pbss: "s3://bionemo-ci/models/megamolbart.nemo"
symlink:
source: "../../megamolbart.nemo"
target: "molecule/megamolbart/megamolbart.nemo"
md5sum: d0c559d38ef5374b34c438e1f34e56d4 # pragma: allowlist secret
molmim_70m_24_3:
# https://wandb.ai/clara-discovery/molmim_convergence_prod_March2024/runs/cOLZ4Eme
ngc: "nvidia/clara/molmim:1.3"
pbss: "s3://bionemo-ci/models/molmim_70m_24_3.nemo"
symlink:
source: "../../molmim_70m_24_3.nemo"
target: "molecule/molmim/molmim_70m_24_3.nemo"
md5sum: ede3edcfc90711eb915591b28b83eca2 # pragma: allowlist secret
equidock_dips:
ngc: "nvidia/clara/equidock_dips:1.1"
pbss: "s3://bionemo-ci/models/equidock_dips.nemo"
symlink:
source: "../../equidock_dips.nemo"
target: "protein/equidock/equidock_dips.nemo"
md5sum: 2e7b021adcc6d76b7ebe2d5fbc4a2a4e # pragma: allowlist secret
equidock_db5:
ngc: "nvidia/clara/equidock_db5:1.1"
pbss: "s3://bionemo-ci/models/equidock_db5.nemo"
symlink:
source: "../../equidock_db5.nemo"
target: "protein/equidock/equidock_db5.nemo"
md5sum: 263fa3d991bda0ee7735cb12d826ac15 # pragma: allowlist secret
diffdock_score:
ngc: "nvidia/clara/diffdock_score:1.5"
pbss: "s3://bionemo-ci/models/diffdock_score.nemo"
symlink:
source: "../../diffdock_score.nemo"
target: "molecule/diffdock/diffdock_score.nemo"
md5sum: 2ff354b4a1032b99609922c6da7663d7 # pragma: allowlist secret
diffdock_confidence:
ngc: "nvidia/clara/diffdock_confidence:1.5"
pbss: "s3://bionemo-ci/models/diffdock_confidence.nemo"
symlink:
source: "../../diffdock_confidence.nemo"
target: "molecule/diffdock/diffdock_confidence.nemo"
md5sum: 8d0e386b6b78be3eff5c334b6cda5607 # pragma: allowlist secret
esm2nv_8m_untrained:
ngc: null
pbss: "s3://bionemo-ci/models/esm2nv_8M_untrained.nemo"
Expand All @@ -84,13 +92,15 @@ models:
symlink:
source: "../../esm2nv_650M_converted.nemo"
target: "protein/esm2nv/esm2nv_650M_converted.nemo"
md5sum: f1d926c4ed38ce16be962c79459c4abf # pragma: allowlist secret
esm2nv_3b:
#TODO: update path when model is released
ngc: "nvidia/clara/esm2nv3b:1.0"
pbss: "s3://bionemo-ci/models/esm2nv_3B_converted.nemo"
symlink:
source: "../../esm2nv_3B_converted.nemo"
target: "protein/esm2nv/esm2nv_3B_converted.nemo"
md5sum: b90222ecdbfc22d9c099a1cc5696c23f # pragma: allowlist secret
openfold_initial_training_public:
pbss: "s3://bionemo-ci/models/openfold_initial_training_public_checkpoint.nemo"
symlink:
Expand Down Expand Up @@ -137,6 +147,7 @@ models:
symlink:
source: "../../geneformer-10M-240530-step-115430-wandb-4ij9ghox.nemo"
target: "singlecell/geneformer/geneformer-10M-240530.nemo"
md5sum: 375ebb9431419f4936fa3aa2bce6e7d6 # pragma: allowlist secret
geneformer_106M_240530:
# A pretrained 106M parameter geneformer (BERT) on 23M unique single cells and 25429 ENSG based tokens,
# padded to a final shape of 25472 for GPU efficiency.
Expand All @@ -146,6 +157,7 @@ models:
symlink:
source: "../../geneformer-106M-240530-step-115430-wandb-KZxWJ0I5.nemo"
target: "singlecell/geneformer/geneformer-106M-240530.nemo"
md5sum: a998810df42bcdede95be319af302868 # pragma: allowlist secret
geneformer:
# A QA model for geneformer with randomly initialized weights.
pbss: "s3://bionemo-ci/models/geneformer-qa.nemo"
Expand Down Expand Up @@ -190,6 +202,7 @@ data:
single_cell:
pbss: "s3://bionemo-ci/test-data/singlecell/singlecell-testdata-20240506.tar.gz"
relative_download_dir: "examples/tests/test_data/"
md5sum: dd6b0d791bf2b3301d9793a1d6663c75 # pragma: allowlist secret
diffdock_sample:
pbss: "s3://bionemo-ci/test-data/diffdock/diffdock_vprocessed_sample_05022024/"
relative_download_dir: "examples/tests/test_data/molecule/diffdock"
Expand Down
54 changes: 51 additions & 3 deletions download_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"""Script to download pretrained models from NGC or PBSS."""

import argparse
import hashlib
import os
import sys
import tarfile
Expand All @@ -20,6 +21,7 @@

import yaml
from pydantic import BaseModel
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential


ALL_KEYWORD = "all"
Expand All @@ -42,6 +44,7 @@ class ArtifactConfig(BaseModel):
relative_download_dir: Optional[Path] = None
extra_args: Optional[str] = None
untar_dir: Optional[str] = None
md5sum: Optional[str] = None


class Config(BaseModel):
Expand Down Expand Up @@ -223,9 +226,8 @@ def download_artifacts(
extra_args = conf[download_artifact].extra_args
command = f"{command} {extra_args}"

_, stderr, retcode = streamed_subprocess_call(command, stream_stdout)
if retcode != 0:
raise ValueError(f"Failed to download {download_artifact=}! {stderr=}")
execute_download(stream_stdout, conf, download_artifact, complete_download_dir, command, file_name, source)

if artifact_type == "data":
tar_file = f"{str(complete_download_dir)}/{file_name}"
if Path(tar_file).is_file():
Expand All @@ -247,6 +249,36 @@ def download_artifacts(
raise ValueError(f"Failed to symlink {source_file=} to {target_file=}; {stderr=}")


@retry(
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_exception_type(ValueError),
    stop=stop_after_attempt(3),
    reraise=True,
)
def execute_download(
    stream_stdout: bool,
    conf: Dict[str, ArtifactConfig],
    download_artifact: str,
    complete_download_dir: Path,
    command: str,
    file_name: str,
    source: str,
) -> None:
    """Run the download command and verify the MD5 checksum of the downloaded file.

    Both failure modes are raised as ``ValueError`` so that the ``@retry`` decorator
    re-runs the whole function (download plus verification). At most three attempts
    are made, with exponential backoff between them; the last error is re-raised.

    Args:
        stream_stdout: Forwarded to ``streamed_subprocess_call``.
        conf: Mapping from artifact name to its configuration.
        download_artifact: Key into ``conf`` for the artifact being downloaded.
        complete_download_dir: Directory the file is downloaded into.
        command: Download command handed to ``streamed_subprocess_call``.
        file_name: Name of the downloaded file inside ``complete_download_dir``.
        source: Download source; the checksum is only verified for ``"pbss"``.

    Raises:
        ValueError: If the command exits with a non-zero return code, or if the
            checksum of the downloaded file does not match the configured one.
    """
    _, stderr, retcode = streamed_subprocess_call(command, stream_stdout)
    if retcode != 0:
        raise ValueError(f"Failed to download {download_artifact=}! {stderr=}")

    expected_md5sum = conf[download_artifact].md5sum
    if source != "pbss" or not expected_md5sum:
        # Nothing to verify: not a pbss download, or no checksum configured.
        return

    downloaded_md5sum = _md5_checksum(Path(complete_download_dir) / file_name)
    # hexdigest() is always lowercase, so normalize the configured value; otherwise an
    # uppercase or whitespace-padded digest would fail every retry on a good download.
    if downloaded_md5sum != expected_md5sum.strip().lower():
        raise ValueError(
            f"MD5 checksum mismatch for {download_artifact=}! Expected "
            f"{expected_md5sum}, got {downloaded_md5sum}"
        )


def load_config(config_file: Path = DATA_SOURCE_CONFIG) -> Config:
"""
Loads the artifacts file into a dictionary.
Expand Down Expand Up @@ -326,5 +358,21 @@ def main():
print("No models or data were selected to download.")


def _md5_checksum(file_path: Path) -> str:
    """Compute the MD5 digest of a file's contents.

    The file is read in fixed-size binary chunks so that large artifacts never
    have to be held in memory all at once.

    Args:
        file_path (Path): Location of the file to hash.

    Returns:
        str: Hexadecimal MD5 digest of the file contents.
    """
    block_size = 4096
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            data = stream.read(block_size)
            if not data:
                # An empty read signals end of file.
                break
            digest.update(data)
    return digest.hexdigest()


if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions setup/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ ruff==0.0.292
black==23.1.0
pre-commit==3.4.0
ipdb==0.13.11
tenacity==8.5.0
click==8.1.7

0 comments on commit aeb208a

Please sign in to comment.