-
Notifications
You must be signed in to change notification settings - Fork 225
254 lines (225 loc) · 8.03 KB
/
gpu-ci.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
---
# GPU CI for FlexFlow: gates access to the self-hosted GPU runners, then
# builds the project and runs Python-interface, inference, and training tests.
name: "gpu-ci"
on:
  push:
    branches:
      - "inference"
    # Trigger only when build inputs, sources, or the CI/test scripts change.
    paths:
      - "cmake/**"
      - "config/**"
      - "deps/**"
      - "python/**"
      - "setup.py"
      - "include/**"
      - "inference/**"
      - "src/**"
      - "tests/inference/**"
      - "conda/flexflow.yml"
      - ".github/workflows/gpu-ci.yml"
      - "tests/cpp_gpu_tests.sh"
      - "tests/inference_tests.sh"
      - "tests/training_tests.sh"
      - "tests/python_interface_test.sh"
  workflow_dispatch:
# NOTE(review): for push events github.head_ref is empty, so the group falls
# back to the unique run_id and cancel-in-progress never cancels anything;
# consider github.ref if superseded pushes to the same branch should be
# cancelled — confirm intent.
concurrency:
  group: gpu-ci-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
jobs:
  # Gatekeeper job for the self-hosted GPU runners. Runs a helper script
  # (presumably blocking until the runners are free — confirm against
  # .github/workflows/helpers/gpu_ci_helper.py) before the GPU jobs below,
  # which all depend on this job via `needs`.
  gpu-ci-concierge:
    name: GPU CI Concierge
    runs-on: ubuntu-20.04
    env:
      FLEXFLOW_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Checkout Git Repository
        uses: actions/checkout@v3
      - name: Wait for daemon to be done
        run: |
          pip3 install pip --upgrade
          pip3 install pyopenssl --upgrade
          pip3 install urllib3 --upgrade
          pip3 install pygithub
          python3 .github/workflows/helpers/gpu_ci_helper.py
python-interface-check:
name: Check Python Interface
runs-on: [self-hosted, gpu]
defaults:
run:
shell: bash -l {0} # required to use an activated conda environment
env:
CONDA: "3"
needs: gpu-ci-concierge
container:
image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive
- name: Install conda and FlexFlow dependencies
uses: conda-incubator/setup-miniconda@v2
with:
miniconda-version: "latest"
activate-environment: flexflow
environment-file: conda/flexflow.yml
auto-activate-base: false
auto-update-conda: false
- name: Install conda and Pytorch dependencies for pytorch alignment test
run: |
conda env create -f conda/pytorch-gpu.yml
- name: Build FlexFlow
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
mkdir build
cd build
../config/config.linux
make -j
- name: Check FlexFlow Python interface (before installation)
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
./tests/python_interface_test.sh before-installation
- name: Install FlexFlow
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
cd build
../config/config.linux
make install
ldconfig
- name: Check FlexFlow Python interface (after installation)
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
./tests/python_interface_test.sh after-installation
- name: Run flexflow alignment with pytorch
run: |
# run alingment tests
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
./tests/align/test_all_operators.sh
inference-tests:
name: Inference Tests
runs-on: [self-hosted, gpu]
defaults:
run:
shell: bash -l {0} # required to use an activated conda environment
env:
CONDA: "3"
HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
needs: gpu-ci-concierge
container:
image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive
- name: Install conda and FlexFlow dependencies
uses: conda-incubator/setup-miniconda@v2
with:
miniconda-version: "latest"
activate-environment: flexflow
environment-file: conda/flexflow.yml
auto-activate-base: false
- name: Build FlexFlow
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
mkdir build
cd build
../config/config.linux
make -j
- name: Run inference tests
env:
CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export CUDNN_DIR=/usr/local/cuda
export CUDA_DIR=/usr/local/cuda
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
# GPT tokenizer test
# ./tests/gpt_tokenizer_test.sh
# Inference tests
source ./build/set_python_envs.sh
./tests/inference_tests.sh
- name: Save inference output as an artifact
if: always()
run: |
cd inference
tar -zcvf output.tar.gz ./output
- name: Upload artifact
uses: actions/upload-artifact@v3
if: always()
with:
name: output
path: inference/output.tar.gz
# Github persists the .cache folder across different runs/containers
- name: Clear cache
if: always()
run: sudo rm -rf ~/.cache
training-tests:
name: Training Tests
runs-on: [self-hosted, gpu]
# skip this time-consuming test for PRs to the inference branch
# if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
defaults:
run:
shell: bash -l {0} # required to use an activated conda environment
env:
CONDA: "3"
needs: inference-tests
container:
image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive
- name: Install conda and FlexFlow dependencies
uses: conda-incubator/setup-miniconda@v2
with:
miniconda-version: "latest"
activate-environment: flexflow
environment-file: conda/flexflow.yml
auto-activate-base: false
- name: Build and Install FlexFlow
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export FF_BUILD_ALL_EXAMPLES=ON
export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
pip install . --verbose
- name: Check FlexFlow Python interface (pip)
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
./tests/python_interface_test.sh after-installation
- name: Run multi-gpu tests
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export CUDNN_DIR=/usr/local/cuda
export CUDA_DIR=/usr/local/cuda
export FF_HOME=$(pwd)
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
# C++ tests
./tests/cpp_gpu_tests.sh 4
# Python tests
./tests/training_tests.sh 4