add tests, rename scripts to "_", add to ci
Terézia Slanináková committed Oct 1, 2024
1 parent f061c9b commit bf0ad0e
Showing 9 changed files with 78 additions and 5 deletions.
30 changes: 29 additions & 1 deletion .github/workflows/ci.yml
@@ -6,7 +6,7 @@ on:
branches:
# Push events on main and dev branch
- main
-      - fair-impact-hw1
+      - training-tests
# Sequence of patterns matched against refs/tags
tags: '*'

@@ -26,3 +26,31 @@ jobs:

- name: Docker build training, backend, and frontend
run: ./run.sh

- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r training/requirements.txt
pip install pytest pytest-cov
- name: Run tests with pytest and generate coverage
run: |
cd training
pytest --cov=. --cov-report=xml --cov-report=term-missing
- name: Upload coverage report
uses: actions/upload-artifact@v2
with:
name: coverage-report
path: training/coverage.xml

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
file: ./training/coverage.xml
fail_ci_if_error: true
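
Note on the new CI steps: both upload steps consume the same training/coverage.xml that pytest writes via --cov-report=xml, and fail_ci_if_error: true makes a failed Codecov upload break the build rather than pass silently. A minimal local sanity check of that report, assuming coverage.py's Cobertura-style XML output (this snippet is illustrative, not part of the workflow):

import xml.etree.ElementTree as ET
from pathlib import Path

# coverage.py writes a Cobertura-style XML report whose root element
# carries an aggregate "line-rate" attribute.
report = Path("training/coverage.xml")
assert report.exists() and report.stat().st_size > 0, "run pytest with --cov-report=xml first"
line_rate = float(ET.parse(report).getroot().get("line-rate"))
print(f"overall line coverage: {line_rate:.0%}")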
4 changes: 3 additions & 1 deletion training/.gitignore
@@ -10,4 +10,6 @@ kubectl
data/bucket-*
data/embedding.pkl
data/kmeans.idx
models/
.coverage
coverage.xml
File renamed without changes.
File renamed without changes.
File renamed without changes.
4 changes: 4 additions & 0 deletions training/pytest.ini
@@ -0,0 +1,4 @@
[pytest]
testpaths = .
python_files = tests.py
addopts = --cov=. --cov-report=term-missing --cov-report=xml
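
This config makes a bare pytest run from training/ match CI: python_files = tests.py tells pytest to collect tests.py instead of its default test_*.py pattern, and addopts appends the same coverage flags the workflow passes explicitly. A sketch of the equivalent programmatic invocation, assuming pytest and pytest-cov are installed:

import pytest

# Collects training/tests.py; pytest.ini's addopts already supplies
# --cov=. --cov-report=term-missing --cov-report=xml.
raise SystemExit(pytest.main(["tests.py"]))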
2 changes: 2 additions & 0 deletions training/requirements-dev.txt
@@ -3,3 +3,5 @@ pre-commit
black==23.7.0
isort==5.12.0
flake8==6.1.0
pytest
pytest-cov
6 changes: 3 additions & 3 deletions training/run.sh
@@ -1,7 +1,7 @@
#!/bin/bash

# 1) ---- Create embeddings
-python3 create-embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10
+python3 create_embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10

# 2) ---- Create a K-Means object
# Clusters and saves the k-means object to `data/kmeans.idx`
@@ -13,7 +13,7 @@ python3 train.py --input=./data/embedding.pkl --kmeans-path=data/kmeans.idx --ou

# 4) ---- Create bucket-data
# Collects all predictions from the newest model in `models/`, and saves them to `bucket-data/`
-python3 create-buckets.py --input=./data/embedding.pkl --model-dir-path=./models/ --output-chunks=./data/chunks --output-predictions=./data/overall --output-bucket-path ./data/bucket-data/
+python3 create_buckets.py --input=./data/embedding.pkl --model-dir-path=./models/ --output-chunks=./data/chunks --output-predictions=./data/overall --output-bucket-path ./data/bucket-data/

# 5) ---- Create bucket-data mapping to protein IDs
-python3 create-protein-bucket-mapping.py --bucket-path=./data/bucket-data/ --output=./data/bucket-mapping.pkl
+python3 create_protein_bucket_mapping.py --bucket-path=./data/bucket-data/ --output=./data/bucket-mapping.pkl
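
The hyphen-to-underscore renames are what make the new tests importable: a hyphen is not valid in a Python identifier, so a module named create-embedding can never appear in an import statement, whereas create_embedding can. A minimal illustration, assuming it runs from training/ where the renamed scripts live:

# Before the rename this import was impossible: a hyphen cannot appear
# in a module name, so the next line would be a SyntaxError.
# from create-embedding import run
from create_embedding import run  # valid identifier after the rename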
37 changes: 37 additions & 0 deletions training/tests.py
@@ -0,0 +1,37 @@
import os
from pathlib import Path

from create_embedding import run
import pandas as pd
import re


def parse_protein_id(filename):
# Use regex to extract the protein ID
match = re.search(r'AF-([\w\d]+)-F1-model_v3\.cif', filename)
if match:
return match.group(1)
return None


def test_create_embedding():
cif_path = "./data/cifs"
output_path = "./data/embedding.pkl"
granularity = 10

# 45 features for each protein - (10x10 - 10) / 2
expected_dimensionality = 45

run(Path(cif_path), Path(output_path), granularity)

assert os.path.exists(output_path)
assert os.path.getsize(output_path) > 0
# load embedding.pkl and check if it has the correct shape
df = pd.read_pickle(output_path)
assert df.shape[0] == len(os.listdir(cif_path))
assert df.shape[1] == expected_dimensionality

# check if the length of the index is equal to the number of proteins
assert sorted(df.index.tolist()) == sorted(
[parse_protein_id(file) for file in os.listdir(cif_path) if file.endswith('.cif')]
)
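
The expected dimensionality in this test follows directly from the (10x10 - 10) / 2 comment: granularity g yields g*(g-1)/2 features, the number of strictly upper-triangular cells of a symmetric g-by-g matrix (the source gives only the formula; the matrix reading is an inference). A tiny self-check, with expected_features as a hypothetical helper:

def expected_features(granularity: int) -> int:
    # g*g cells, minus the g diagonal cells, halved by symmetry
    return (granularity * granularity - granularity) // 2

assert expected_features(10) == 45

For reference, parse_protein_id maps an AlphaFold-style filename such as "AF-P12345-F1-model_v3.cif" to "P12345" (a hypothetical accession).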
