Cherry-pick pstjohn md5sum check into main (#46)

* Make artifact downloads more robust

  Adds md5sum checks and retries to downloads to hopefully prevent CI failures on malformed downloads.

* only check md5sum for pbss

---------

Signed-off-by: Ohad Mosafi <omosafi@nvidia.com>
Co-authored-by: Peter St. John <pstjohn@nvidia.com>
NVIDIA · Aug 5, 2024 · aeb208a · aeb208a
1 parent 919b5b7
commit aeb208a
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 3 deletions.
diff --git a/artifact_paths.yaml b/artifact_paths.yaml
@@ -22,49 +22,57 @@ models:
         symlink:
             source: "../../esm1nv.nemo"
             target: "protein/esm1nv/esm1nv.nemo"
+        md5sum: 0ba2bf7d7e539e3fc8bff323fa1ab784 # pragma: allowlist secret
     prott5nv:
         ngc: "nvidia/clara/prott5nv:1.0"
         pbss: "s3://bionemo-ci/models/prott5nv.nemo"
         symlink:
             source: "../../prott5nv.nemo"
             target: "protein/prott5nv/prott5nv.nemo"
+        md5sum: 3181758ab636fce7993e36482f971a50 # pragma: allowlist secret
     megamolbart:
         ngc: "nvidia/clara/megamolbart:1.0"
         pbss: "s3://bionemo-ci/models/megamolbart.nemo"
         symlink:
             source: "../../megamolbart.nemo"
             target: "molecule/megamolbart/megamolbart.nemo"
+        md5sum: d0c559d38ef5374b34c438e1f34e56d4 # pragma: allowlist secret
     molmim_70m_24_3:
         # https://wandb.ai/clara-discovery/molmim_convergence_prod_March2024/runs/cOLZ4Eme
         ngc: "nvidia/clara/molmim:1.3"
         pbss: "s3://bionemo-ci/models/molmim_70m_24_3.nemo"
         symlink:
             source: "../../molmim_70m_24_3.nemo"
             target: "molecule/molmim/molmim_70m_24_3.nemo"
+        md5sum: ede3edcfc90711eb915591b28b83eca2 # pragma: allowlist secret
     equidock_dips:
         ngc: "nvidia/clara/equidock_dips:1.1"
         pbss: "s3://bionemo-ci/models/equidock_dips.nemo"
         symlink:
             source: "../../equidock_dips.nemo"
             target: "protein/equidock/equidock_dips.nemo"
+        md5sum: 2e7b021adcc6d76b7ebe2d5fbc4a2a4e # pragma: allowlist secret
     equidock_db5:
         ngc: "nvidia/clara/equidock_db5:1.1"
         pbss: "s3://bionemo-ci/models/equidock_db5.nemo"
         symlink:
             source: "../../equidock_db5.nemo"
             target: "protein/equidock/equidock_db5.nemo"
+        md5sum: 263fa3d991bda0ee7735cb12d826ac15 # pragma: allowlist secret
     diffdock_score:
         ngc: "nvidia/clara/diffdock_score:1.5"
         pbss: "s3://bionemo-ci/models/diffdock_score.nemo"
         symlink:
             source: "../../diffdock_score.nemo"
             target: "molecule/diffdock/diffdock_score.nemo"
+        md5sum: 2ff354b4a1032b99609922c6da7663d7 # pragma: allowlist secret
     diffdock_confidence:
         ngc: "nvidia/clara/diffdock_confidence:1.5"
         pbss: "s3://bionemo-ci/models/diffdock_confidence.nemo"
         symlink:
             source: "../../diffdock_confidence.nemo"
             target: "molecule/diffdock/diffdock_confidence.nemo"
+        md5sum: 8d0e386b6b78be3eff5c334b6cda5607 # pragma: allowlist secret
     esm2nv_8m_untrained:
         ngc: null
         pbss: "s3://bionemo-ci/models/esm2nv_8M_untrained.nemo"
@@ -84,13 +92,15 @@ models:
         symlink:
             source: "../../esm2nv_650M_converted.nemo"
             target: "protein/esm2nv/esm2nv_650M_converted.nemo"
+        md5sum: f1d926c4ed38ce16be962c79459c4abf # pragma: allowlist secret
     esm2nv_3b:
         #TODO: update path when model is released
         ngc: "nvidia/clara/esm2nv3b:1.0"
         pbss: "s3://bionemo-ci/models/esm2nv_3B_converted.nemo"
         symlink:
             source: "../../esm2nv_3B_converted.nemo"
             target: "protein/esm2nv/esm2nv_3B_converted.nemo"
+        md5sum: b90222ecdbfc22d9c099a1cc5696c23f # pragma: allowlist secret
     openfold_initial_training_public:
         pbss: "s3://bionemo-ci/models/openfold_initial_training_public_checkpoint.nemo"
         symlink:
@@ -137,6 +147,7 @@ models:
         symlink:
             source: "../../geneformer-10M-240530-step-115430-wandb-4ij9ghox.nemo"
             target: "singlecell/geneformer/geneformer-10M-240530.nemo"
+        md5sum: 375ebb9431419f4936fa3aa2bce6e7d6 # pragma: allowlist secret
     geneformer_106M_240530:
         # A pretrained 106M parameter geneformer (BERT) on 23M unique single cells and 25429 ENSG based tokens,
         #   padded to a final shape of 25472 for GPU efficiency.
@@ -146,6 +157,7 @@ models:
         symlink:
             source: "../../geneformer-106M-240530-step-115430-wandb-KZxWJ0I5.nemo"
             target: "singlecell/geneformer/geneformer-106M-240530.nemo"
+        md5sum: a998810df42bcdede95be319af302868 # pragma: allowlist secret
     geneformer:
         # A QA model for geneformer with randomly initialized weights.
         pbss: "s3://bionemo-ci/models/geneformer-qa.nemo"
@@ -190,6 +202,7 @@ data:
     single_cell:
         pbss: "s3://bionemo-ci/test-data/singlecell/singlecell-testdata-20240506.tar.gz"
         relative_download_dir: "examples/tests/test_data/"
+        md5sum: dd6b0d791bf2b3301d9793a1d6663c75 # pragma: allowlist secret
     diffdock_sample:
         pbss: "s3://bionemo-ci/test-data/diffdock/diffdock_vprocessed_sample_05022024/"
         relative_download_dir: "examples/tests/test_data/molecule/diffdock"

diff --git a/download_artifacts.py b/download_artifacts.py
@@ -11,6 +11,7 @@
 """Script to download pretrained models from NGC or PBSS."""
 
 import argparse
+import hashlib
 import os
 import sys
 import tarfile
@@ -20,6 +21,7 @@
 
 import yaml
 from pydantic import BaseModel
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 
 
 ALL_KEYWORD = "all"
@@ -42,6 +44,7 @@ class ArtifactConfig(BaseModel):
     relative_download_dir: Optional[Path] = None
     extra_args: Optional[str] = None
     untar_dir: Optional[str] = None
+    md5sum: Optional[str] = None
 
 
 class Config(BaseModel):
@@ -223,9 +226,8 @@ def download_artifacts(
             extra_args = conf[download_artifact].extra_args
             command = f"{command} {extra_args}"
 
-        _, stderr, retcode = streamed_subprocess_call(command, stream_stdout)
-        if retcode != 0:
-            raise ValueError(f"Failed to download {download_artifact=}! {stderr=}")
+        execute_download(stream_stdout, conf, download_artifact, complete_download_dir, command, file_name, source)
+
         if artifact_type == "data":
             tar_file = f"{str(complete_download_dir)}/{file_name}"
             if Path(tar_file).is_file():
@@ -247,6 +249,36 @@ def download_artifacts(
                 raise ValueError(f"Failed to symlink {source_file=} to {target_file=}; {stderr=}")
 
 
+@retry(
+    wait=wait_exponential(multiplier=1, max=10),
+    retry=retry_if_exception_type(ValueError),
+    stop=stop_after_attempt(3),
+    reraise=True,
+)
+def execute_download(
+    stream_stdout: bool,
+    conf: Dict[str, ArtifactConfig],
+    download_artifact: str,
+    complete_download_dir: Path,
+    command: str,
+    file_name: str,
+    source: str,
+) -> None:
+    """Run the download command and, for PBSS sources, verify the file's MD5 checksum.
+
+    Any ``ValueError`` raised here triggers a retry: at most three attempts in total,
+    with exponential backoff capped at 10 seconds between attempts. Once the attempts
+    are exhausted the last ``ValueError`` is re-raised unchanged.
+
+    Args:
+        stream_stdout: Forwarded to ``streamed_subprocess_call``.
+        conf: Mapping from artifact name to its configuration.
+        download_artifact: Key in ``conf`` of the artifact being downloaded.
+        complete_download_dir: Directory joined with ``file_name`` to locate the
+            downloaded file for checksum verification.
+        command: Download command handed to ``streamed_subprocess_call``.
+        file_name: Name of the downloaded file inside ``complete_download_dir``.
+        source: Download source; the checksum is only verified when this is ``"pbss"``.
+
+    Raises:
+        ValueError: If the command exits with a non-zero return code, or if the
+            downloaded file's MD5 checksum differs from the configured ``md5sum``.
+    """
+    _, stderr, retcode = streamed_subprocess_call(command, stream_stdout)
+    if retcode != 0:
+        raise ValueError(f"Failed to download {download_artifact=}! {stderr=}")
+
+    expected_md5sum = conf[download_artifact].md5sum
+    if source != "pbss" or not expected_md5sum:
+        # Nothing to verify: non-PBSS source or no checksum configured for this artifact.
+        return
+
+    # hexdigest() yields lowercase hex with no surrounding whitespace; normalize the
+    # configured value so an upper-case or padded entry is not treated as a corrupt download.
+    expected_md5sum = expected_md5sum.strip().lower()
+    downloaded_md5sum = _md5_checksum(Path(complete_download_dir) / file_name)
+    if downloaded_md5sum != expected_md5sum:
+        raise ValueError(
+            f"MD5 checksum mismatch for {download_artifact=}! Expected "
+            f"{expected_md5sum}, got {downloaded_md5sum}"
+        )
+
+
 def load_config(config_file: Path = DATA_SOURCE_CONFIG) -> Config:
     """
     Loads the artifacts file into a dictionary.
@@ -326,5 +358,21 @@ def main():
         print("No models or data were selected to download.")
 
 
+def _md5_checksum(file_path: Path, chunk_size: int = 1024 * 1024) -> str:
+    """Calculate the MD5 checksum of a file.
+
+    The file is read in fixed-size chunks so that large files are never loaded
+    into memory all at once.
+
+    Args:
+        file_path (Path): The path to the file to checksum.
+        chunk_size (int): Number of bytes to read per chunk. Defaults to 1 MiB,
+            which keeps the number of read calls low for multi-gigabyte files.
+
+    Returns:
+        str: The MD5 checksum of the file as a hexadecimal string.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not a positive integer.
+    """
+    if chunk_size <= 0:
+        # A size of 0 would end the loop immediately and return the hash of empty input.
+        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
+
+    md5 = hashlib.md5()
+    with open(file_path, "rb") as f:
+        # read() returns b"" at end of file, which is the sentinel that stops iteration.
+        for chunk in iter(lambda: f.read(chunk_size), b""):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+
 if __name__ == "__main__":
     main()
diff --git a/setup/requirements-dev.txt b/setup/requirements-dev.txt
@@ -5,4 +5,5 @@ ruff==0.0.292
 black==23.1.0
 pre-commit==3.4.0
 ipdb==0.13.11
+tenacity==8.5.0
 click==8.1.7