diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 182a1c5..c567a5e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -6,7 +6,7 @@ on:
     branches:
       # Push events on main and dev branch
       - main
-      - fair-impact-hw1
+      - training-tests
     # Sequence of patterns matched against refs/tags
     tags: '*'
 
@@ -26,3 +26,31 @@ jobs:
 
       - name: Docker build training, backend, and frontend
         run: ./run.sh
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r training/requirements.txt
+          pip install pytest pytest-cov
+
+      - name: Run tests with pytest and generate coverage
+        run: |
+          cd training
+          pytest --cov=. --cov-report=xml --cov-report=term-missing
+
+      - name: Upload coverage report
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-report
+          path: training/coverage.xml
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          file: ./training/coverage.xml
+          fail_ci_if_error: true
diff --git a/training/.gitignore b/training/.gitignore
index c103a6e..723add1 100644
--- a/training/.gitignore
+++ b/training/.gitignore
@@ -10,4 +10,6 @@ kubectl
 data/bucket-*
 data/embedding.pkl
 data/kmeans.idx
-models/
\ No newline at end of file
+models/
+.coverage
+coverage.xml
diff --git a/training/create-buckets.py b/training/create_buckets.py
similarity index 100%
rename from training/create-buckets.py
rename to training/create_buckets.py
diff --git a/training/create-embedding.py b/training/create_embedding.py
similarity index 100%
rename from training/create-embedding.py
rename to training/create_embedding.py
diff --git a/training/create-protein-bucket-mapping.py b/training/create_protein_bucket_mapping.py
similarity index 100%
rename from training/create-protein-bucket-mapping.py
rename to training/create_protein_bucket_mapping.py
diff --git a/training/pytest.ini b/training/pytest.ini
new file mode 100644
index 0000000..7bcb504
--- /dev/null
+++ b/training/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+testpaths = .
+python_files = tests.py
+addopts = --cov=. --cov-report=term-missing --cov-report=xml
\ No newline at end of file
diff --git a/training/requirements-dev.txt b/training/requirements-dev.txt
index 4c992e9..d9dff46 100644
--- a/training/requirements-dev.txt
+++ b/training/requirements-dev.txt
@@ -3,3 +3,5 @@ pre-commit
 black==23.7.0
 isort==5.12.0
 flake8==6.1.0
+pytest
+pytest-cov
diff --git a/training/run.sh b/training/run.sh
index 0e487b0..61f79e0 100644
--- a/training/run.sh
+++ b/training/run.sh
@@ -1,7 +1,7 @@
 #/bin/bash
 
 # 1) ---- Create embeddings
-python3 create-embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10
+python3 create_embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10
 
 # 2) ---- Create a K-Means object
 # Clusters and saves the k-means object to to `data/kmeans.idx`
@@ -13,7 +13,7 @@ python3 train.py --input=./data/embedding.pkl --kmeans-path=data/kmeans.idx --ou
 
 # 4) ---- Create bucket-data
 # Collects all predictions from the newest model in `models/`, and saves them to `bucket-data/`
-python3 create-buckets.py --input=./data/embedding.pkl --model-dir-path=./models/ --output-chunks=./data/chunks --output-predictions=./data/overall --output-bucket-path ./data/bucket-data/
+python3 create_buckets.py --input=./data/embedding.pkl --model-dir-path=./models/ --output-chunks=./data/chunks --output-predictions=./data/overall --output-bucket-path ./data/bucket-data/
 
 # 5) ---- Create bucket-data mapping to protein IDs
-python3 create-protein-bucket-mapping.py --bucket-path=./data/bucket-data/ --output=./data/bucket-mapping.pkl
\ No newline at end of file
+python3 create_protein_bucket_mapping.py --bucket-path=./data/bucket-data/ --output=./data/bucket-mapping.pkl
\ No newline at end of file
diff --git a/training/tests.py b/training/tests.py
new file mode 100644
index 0000000..b19bdee
--- /dev/null
+++ b/training/tests.py
@@ -0,0 +1,51 @@
+import os
+import re
+from pathlib import Path
+
+import pandas as pd
+import pytest
+from create_embedding import run
+
+
+@pytest.fixture(scope="function")
+def output_file():
+    # Setup: Define the output file path
+    output_path = Path("./data/embedding.pkl")
+
+    yield output_path
+
+    # Teardown: Remove only the embedding.pkl file if it exists
+    if output_path.exists():
+        os.remove(output_path)
+
+
+def parse_protein_id(filename):
+    # Use regex to extract the protein ID
+    match = re.search(r'AF-([\w\d]+)-F1-model_v3\.cif', filename)
+    if match:
+        return match.group(1)
+    return None
+
+
+def test_create_embedding(output_file):
+    cif_path = "./data/cifs"
+    output_path = output_file  # request the fixture so its teardown cleans up the file
+    granularity = 10
+
+    # 45 features for each protein - (10x10 - 10) / 2
+    expected_dimensionality = 45
+
+    run(Path(cif_path), output_path, granularity)
+    os.chmod(output_path, 0o777)
+
+    assert os.path.exists(output_path)
+    assert os.path.getsize(output_path) > 0
+    # load embedding.pkl and check if it has the correct shape
+    df = pd.read_pickle(output_path)
+    assert df.shape[0] == len(os.listdir(cif_path))
+    assert df.shape[1] == expected_dimensionality
+
+    # check if the length of the index is equal to the number of proteins
+    assert sorted(df.index.tolist()) == sorted(
+        [parse_protein_id(file) for file in os.listdir(cif_path) if file.endswith('.cif')]
+    )
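---

Reviewer note (not part of the patch): to reproduce the new CI test job locally, the shell steps below mirror the workflow additions above. This is a sketch assuming Python 3.x and a checkout at the repository root; all commands are taken directly from the CI steps in this diff.

    python -m pip install --upgrade pip
    pip install -r training/requirements.txt
    pip install pytest pytest-cov
    cd training
    pytest --cov=. --cov-report=xml --cov-report=term-missing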