add tests, rename scripts to "_", add to ci
Terézia Slanináková committed Oct 1, 2024
1 parent f061c9b commit bf0ad0e
Showing 9 changed files with 78 additions and 5 deletions.
30 changes: 29 additions & 1 deletion .github/workflows/ci.yml
@@ -6,7 +6,7 @@ on:
branches:
# Push events on main and dev branch
- main
-      - fair-impact-hw1
+      - training-tests
# Sequence of patterns matched against refs/tags
tags: '*'

@@ -26,3 +26,31 @@ jobs:

- name: Docker build training, backend, and frontend
run: ./run.sh

- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r training/requirements.txt
pip install pytest pytest-cov
- name: Run tests with pytest and generate coverage
run: |
cd training
pytest --cov=. --cov-report=xml --cov-report=term-missing
- name: Upload coverage report
uses: actions/upload-artifact@v2
with:
name: coverage-report
path: training/coverage.xml

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
file: ./training/coverage.xml
fail_ci_if_error: true
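
Note on the new CI steps: both upload steps consume the same training/coverage.xml that pytest writes via --cov-report=xml, and fail_ci_if_error: true makes a failed Codecov upload break the build rather than pass silently. A minimal local sanity check of that report, assuming coverage.py's Cobertura-style XML output (this snippet is illustrative, not part of the workflow):

import xml.etree.ElementTree as ET
from pathlib import Path

# coverage.py writes a Cobertura-style XML report whose root element
# carries an aggregate "line-rate" attribute.
report = Path("training/coverage.xml")
assert report.exists() and report.stat().st_size > 0, "run pytest with --cov-report=xml first"
line_rate = float(ET.parse(report).getroot().get("line-rate"))
print(f"overall line coverage: {line_rate:.0%}")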
4 changes: 3 additions & 1 deletion training/.gitignore
@@ -10,4 +10,6 @@ kubectl
data/bucket-*
data/embedding.pkl
data/kmeans.idx
models/
.coverage
coverage.xml
File renamed without changes.
File renamed without changes.
File renamed without changes.
4 changes: 4 additions & 0 deletions training/pytest.ini
@@ -0,0 +1,4 @@
[pytest]
testpaths = .
python_files = tests.py
addopts = --cov=. --cov-report=term-missing --cov-report=xml
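
This config makes a bare pytest run from training/ match CI: python_files = tests.py tells pytest to collect tests.py instead of its default test_*.py pattern, and addopts appends the same coverage flags the workflow passes explicitly. A sketch of the equivalent programmatic invocation, assuming pytest and pytest-cov are installed:

import pytest

# Collects training/tests.py; pytest.ini's addopts already supplies
# --cov=. --cov-report=term-missing --cov-report=xml.
raise SystemExit(pytest.main(["tests.py"]))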
2 changes: 2 additions & 0 deletions training/requirements-dev.txt
@@ -3,3 +3,5 @@ pre-commit
black==23.7.0
isort==5.12.0
flake8==6.1.0
pytest
pytest-cov
6 changes: 3 additions & 3 deletions training/run.sh
@@ -1,7 +1,7 @@
#!/bin/bash

# 1) ---- Create embeddings
-python3 create-embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10
+python3 create_embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10

# 2) ---- Create a K-Means object
# Clusters and saves the k-means object to `data/kmeans.idx`
@@ -13,7 +13,7 @@ python3 train.py --input=./data/embedding.pkl --kmeans-path=data/kmeans.idx --ou

# 4) ---- Create bucket-data
# Collects all predictions from the newest model in `models/`, and saves them to `bucket-data/`
-python3 create-buckets.py --input=./data/embedding.pkl --model-dir-path=./models/ --output-chunks=./data/chunks --output-predictions=./data/overall --output-bucket-path ./data/bucket-data/
+python3 create_buckets.py --input=./data/embedding.pkl --model-dir-path=./models/ --output-chunks=./data/chunks --output-predictions=./data/overall --output-bucket-path ./data/bucket-data/

# 5) ---- Create bucket-data mapping to protein IDs
-python3 create-protein-bucket-mapping.py --bucket-path=./data/bucket-data/ --output=./data/bucket-mapping.pkl
+python3 create_protein_bucket_mapping.py --bucket-path=./data/bucket-data/ --output=./data/bucket-mapping.pkl
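
The hyphen-to-underscore renames are what make the new tests importable: a hyphen is not valid in a Python identifier, so a module named create-embedding can never appear in an import statement, whereas create_embedding can. A minimal illustration, assuming it runs from training/ where the renamed scripts live:

# Before the rename this import was impossible: a hyphen cannot appear
# in a module name, so the next line would be a SyntaxError.
# from create-embedding import run
from create_embedding import run  # valid identifier after the rename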
37 changes: 37 additions & 0 deletions training/tests.py
@@ -0,0 +1,37 @@
import os
from pathlib import Path

from create_embedding import run
import pandas as pd
import re


def parse_protein_id(filename):
# Use regex to extract the protein ID
match = re.search(r'AF-([\w\d]+)-F1-model_v3\.cif', filename)
if match:
return match.group(1)
return None


def test_create_embedding():
cif_path = "./data/cifs"
output_path = "./data/embedding.pkl"
granularity = 10

# 45 features for each protein - (10x10 - 10) / 2
expected_dimensionality = 45

run(Path(cif_path), Path(output_path), granularity)

assert os.path.exists(output_path)
assert os.path.getsize(output_path) > 0
# load embedding.pkl and check if it has the correct shape
df = pd.read_pickle(output_path)
assert df.shape[0] == len(os.listdir(cif_path))
assert df.shape[1] == expected_dimensionality

# check if the length of the index is equal to the number of proteins
assert sorted(df.index.tolist()) == sorted(
[parse_protein_id(file) for file in os.listdir(cif_path) if file.endswith('.cif')]
)
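
The expected dimensionality in this test follows directly from the (10x10 - 10) / 2 comment: granularity g yields g*(g-1)/2 features, the number of strictly upper-triangular cells of a symmetric g-by-g matrix (the source gives only the formula; the matrix reading is an inference). A tiny self-check, with expected_features as a hypothetical helper:

def expected_features(granularity: int) -> int:
    # g*g cells, minus the g diagonal cells, halved by symmetry
    return (granularity * granularity - granularity) // 2

assert expected_features(10) == 45

For reference, parse_protein_id maps an AlphaFold-style filename such as "AF-P12345-F1-model_v3.cif" to "P12345" (a hypothetical accession).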
