Skip to content

select all straggler tests #48

select all straggler tests

select all straggler tests #48

Workflow file for this run

# Workflow identity and triggers.
name: Run Unit Tests

# Runs on pushes to, and pull requests targeting, the branches listed below.
# `push` and `pull_request` must be nested under `on`, each with its own
# `branches` filter; at top level the two `branches` keys would collide.
on:
  push:
    branches:
      - main
      - unit_tests_dbg
  pull_request:
    branches:
      - main
      - unit_tests_dbg
jobs:
  # Builds one wheel per CPython version (3.10, 3.11, 3.12) inside a CUDA
  # devel container and uploads them together as the `resiliency-wheels`
  # artifact consumed by `unit_test`.
  build_wheels:
    runs-on: ubuntu-24.04
    container:
      image: 'nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04'
    steps:
      - name: Update GCC
        # NOTE(review): this installs gcc-10/g++-10 but does not switch the
        # default `gcc`/`g++` to them; presumably the build picks them up
        # explicitly - confirm.
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt-get update && apt-get install -y build-essential gcc-10 g++-10
      - name: Install Python versions and pips
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt-get update && apt-get install -y software-properties-common curl
          # -y: never wait for confirmation in a non-interactive job.
          add-apt-repository -y ppa:deadsnakes/ppa
          apt-get install -y python3.10 python3.10-dev python3.10-distutils
          apt-get install -y python3.11 python3.11-dev python3.11-distutils
          apt-get install -y python3.12 python3.12-dev python3.12-distutils
          # Download the pip bootstrap script once, to a file, with -f so an
          # HTTP error fails the step. Piping curl straight into python hides
          # a failed download behind the interpreter's exit status.
          curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py
          python3.10 /tmp/get-pip.py
          python3.11 /tmp/get-pip.py
          python3.12 /tmp/get-pip.py
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Build wheel with Python 3.10
        run: |
          python3.10 -m pip install -U poetry build six
          python3.10 -m poetry build -f wheel
      - name: Build wheel with Python 3.11
        run: |
          python3.11 -m pip install -U poetry build six
          python3.11 -m poetry build -f wheel
      - name: Build wheel with Python 3.12
        run: |
          python3.12 -m pip install -U poetry build six
          python3.12 -m poetry build -f wheel
      - name: Upload the wheel artifact
        uses: actions/upload-artifact@v4
        with:
          name: resiliency-wheels
          path: dist/*.whl
          # Fail here, not later in `unit_test`, if no wheel was produced.
          if-no-files-found: error

  # Installs the wheel matching each PyTorch image's Python version and runs
  # the tests not marked `gpu`, once per image in the matrix.
  unit_test:
    runs-on: ubuntu-24.04
    needs: build_wheels
    strategy:
      matrix:
        container:
          - 'pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime'
          - 'pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime'
          - 'pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime'
    container:
      image: ${{ matrix.container }}
      # NOTE(review): `env` is placed under `container` because it directly
      # follows `image`; job-level `env` would also be valid - confirm.
      env:
        # Fix for "MKL_THREADING_LAYER=INTEL is incompatible with
        # libgomp.so.1 library."
        MKL_SERVICE_FORCE_INTEL: '1'
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Download wheels
        uses: actions/download-artifact@v4
        with:
          name: resiliency-wheels
          path: ./dist/
      - name: Set up environment
        run: |
          pip install pytest lightning
          # e.g. "310" for Python 3.10; selects the wheel with the matching tag.
          PY_VER_NODOT=$(python -c "import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))")
          pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl
      - name: Run Fault Tolerance unit tests
        run: |
          pytest -s -vvv -m "not gpu" ./tests/fault_tolerance/unit/
      - name: Run Straggler unit tests
        run: |
          pytest -s -vvv -m "not gpu" ./tests/straggler/unit/
      - name: Run PTL callbacks unit tests
        run: |
          pytest -s -vvv -m "not gpu" ./tests/ptl_resiliency/unit/