# unit_test.yml
---
# Workflow overview:
#   1. build_wheels - builds one wheel per Python version (3.10, 3.11, 3.12)
#      inside a CUDA 11.8 devel container and uploads them as the
#      "resiliency-wheels" artifact.
#   2. unit_tests   - for every (PyTorch container, test suite) pair, installs
#      the wheel matching the container's Python and runs the tests that are
#      not marked "gpu".
name: Run Unit Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build_wheels:
    runs-on: ubuntu-24.04
    container:
      image: 'nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04'
    steps:
      - name: Update GCC
        # NOTE(review): gcc-10/g++-10 are installed but never selected as the
        # default compiler (no update-alternatives / CC / CXX) - confirm the
        # build picks them up as intended.
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt-get update && apt-get install -y build-essential gcc-10 g++-10

      - name: Install Python versions and pips
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt-get update && apt-get install -y software-properties-common curl
          # -y: never wait for the "Press [ENTER] to continue" confirmation.
          add-apt-repository -y ppa:deadsnakes/ppa
          apt-get install -y python3.10 python3.10-dev python3.10-distutils
          apt-get install -y python3.11 python3.11-dev python3.11-distutils
          apt-get install -y python3.12 python3.12-dev python3.12-distutils
          # Download the bootstrap script once. -f makes curl fail on an HTTP
          # error instead of handing an error page (or nothing) to Python,
          # which a "curl | python" pipeline would silently accept.
          curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py
          python3.10 /tmp/get-pip.py
          python3.11 /tmp/get-pip.py
          python3.12 /tmp/get-pip.py

      - name: Checkout code
        uses: actions/checkout@v4

      # Each build step writes its wheel into the shared dist/ directory.
      - name: Build wheel with Python 3.10
        run: |
          python3.10 -m pip install -U poetry build six
          python3.10 -m poetry build -f wheel

      - name: Build wheel with Python 3.11
        run: |
          python3.11 -m pip install -U poetry build six
          python3.11 -m poetry build -f wheel

      - name: Build wheel with Python 3.12
        run: |
          python3.12 -m pip install -U poetry build six
          python3.12 -m poetry build -f wheel

      - name: Upload the wheel artifact
        uses: actions/upload-artifact@v4
        with:
          name: resiliency-wheels
          path: dist/*.whl

  unit_tests:
    runs-on: ubuntu-24.04
    needs: build_wheels
    strategy:
      matrix:
        container:
          - 'pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime'
          - 'pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime'
          - 'pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime'
        test_type:
          - fault_tolerance
          - straggler
          - ptl_resiliency
    container:
      image: ${{ matrix.container }}
    env:
      # Fix for "MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1
      # library."
      MKL_SERVICE_FORCE_INTEL: '1'
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download wheels
        uses: actions/download-artifact@v4
        with:
          name: resiliency-wheels
          path: ./dist/

      - name: Set up environment
        run: |
          pip install pytest lightning
          # Select the wheel whose cpXY tag matches this container's Python.
          PY_VER_NODOT=$(python -c "import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))")
          pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl

      - name: Run unit tests
        shell: bash
        env:
          # Hand the matrix value to the script as data instead of splicing
          # the expression into the shell source.
          TEST_TYPE: ${{ matrix.test_type }}
        run: |
          # "shell: bash" runs with -e, so a failing pytest fails the step.
          case "${TEST_TYPE}" in
            fault_tolerance | straggler | ptl_resiliency)
              pytest -s -vvv -m "not gpu" "./tests/${TEST_TYPE}/unit/"
              ;;
            *)
              echo "Unknown test type: ${TEST_TYPE}"
              exit 1
              ;;
          esac