-
Notifications
You must be signed in to change notification settings - Fork 225
143 lines (136 loc) · 6.54 KB
/
docker-build.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
name: "docker-build"

# Triggers:
#   - pull requests that touch the Docker setup (except its README) or this
#     workflow file
#   - pushes to the "inference" and "master" branches
#   - a weekly scheduled run that keeps the published images fresh
#   - manual runs from the Actions tab (workflow_dispatch)
on:
  pull_request:
    paths:
      - "docker/**"
      - "!docker/README.md"
      - ".github/workflows/docker-build.yml"
  push:
    branches:
      - "inference"
      - "master"
  schedule:
    # Cron is evaluated in UTC: every Sunday at 08:00 UTC (midnight PST /
    # 3am EST; one hour later in local time while daylight saving is in effect).
    - cron: "0 8 * * 0"
  workflow_dispatch:
# Cancel outdated workflows if they are still running.
# github.head_ref is only populated for pull-request events, so successive
# pushes to the same PR branch share one group and the newer run cancels the
# older one. For every other event the expression falls back to the run id,
# which gives each run its own group.
concurrency:
  group: docker-build-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
jobs:
  # Builds (and, on the inference branch, publishes) one FlexFlow Docker image
  # per GPU backend / backend-version pair in the matrix.
  docker-build:
    name: Build and Install FlexFlow in a Docker Container
    # NOTE(review): GitHub has announced the retirement of the ubuntu-20.04
    # hosted runner image - confirm and migrate to a supported image.
    runs-on: ubuntu-20.04
    strategy:
      matrix:
        gpu_backend: ["cuda", "hip_rocm"]
        gpu_backend_version:
          # CUDA toolkit versions (paired with gpu_backend "cuda")
          - "11.1"
          - "11.2"
          - "11.3"
          - "11.4"
          - "11.5"
          - "11.6"
          - "11.7"
          - "11.8"
          - "12.0"
          # ROCm versions (paired with gpu_backend "hip_rocm")
          - "5.3"
          - "5.4"
          - "5.5"
          - "5.6"
        # The version axis mixes CUDA and ROCm versions, so the full cross
        # product contains meaningless pairs. The excludes below drop every
        # cuda x ROCm-version and every hip_rocm x CUDA-version combination.
        exclude:
          - gpu_backend: "cuda"
            gpu_backend_version: "5.3"
          - gpu_backend: "cuda"
            gpu_backend_version: "5.4"
          - gpu_backend: "cuda"
            gpu_backend_version: "5.5"
          - gpu_backend: "cuda"
            gpu_backend_version: "5.6"
          - gpu_backend: "hip_rocm"
            gpu_backend_version: "11.1"
          - gpu_backend: "hip_rocm"
            gpu_backend_version: "11.2"
          - gpu_backend: "hip_rocm"
            gpu_backend_version: "11.3"
          - gpu_backend: "hip_rocm"
            gpu_backend_version: "11.4"
          - gpu_backend: "hip_rocm"
            gpu_backend_version: "11.5"
          - gpu_backend: "hip_rocm"
            gpu_backend_version: "11.6"
          - gpu_backend: "hip_rocm"
            gpu_backend_version: "11.7"
          - gpu_backend: "hip_rocm"
            gpu_backend_version: "11.8"
          - gpu_backend: "hip_rocm"
            gpu_backend_version: "12.0"
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
    env:
      FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
      gpu_backend_version: ${{ matrix.gpu_backend_version }}
      # one of the two variables below will be unused
      cuda_version: ${{ matrix.gpu_backend_version }}
      hip_version: ${{ matrix.gpu_backend_version }}
      # Source branch for pull requests, pushed/selected ref name otherwise.
      branch_name: ${{ github.head_ref || github.ref_name }}
    # NOTE(review): GitHub-hosted runners cap a job at 6 hours, so this limit
    # is presumably never reached there - confirm.
    timeout-minutes: 480
    # Every step below recomputes the same two flags:
    #   deploy_needed - push / scheduled / manual run on the "inference" branch;
    #                   the image is built for all architectures and published.
    #   build_needed  - this matrix entry is one of the two representative
    #                   configurations (CUDA 11.8, ROCm 5.6) that are always
    #                   built as a smoke test.
    # Matrix entries for which neither flag is "true" skip the expensive work.
    steps:
      - name: Checkout Git Repository
        uses: actions/checkout@v3
        with:
          submodules: recursive

      - name: Free additional space on runner
        env:
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
          build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
        run: |
          if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
            .github/workflows/helpers/free_space_on_runner.sh
          else
            echo "Skipping this step to save time"
          fi

      - name: Build Docker container
        env:
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
          build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
        run: |
          # When deploying from inference, build for all compatible architectures, so that we
          # can publish a pre-built general-purpose image. On all other cases, only build for
          # one architecture to save time.
          if [[ $deploy_needed == "true" ]] ; then
            export FF_CUDA_ARCH=all
            export FF_HIP_ARCH=all
            ./docker/build.sh flexflow
          elif [[ $build_needed == "true" ]]; then
            export FF_CUDA_ARCH=70
            export FF_HIP_ARCH=gfx1100,gfx1036
            ./docker/build.sh flexflow
          else
            echo "Skipping build to save time"
          fi

      - name: Check availability of flexflow modules in Python
        # Only runs for CUDA images, so the script needs no hip_rocm branch.
        if: ${{ matrix.gpu_backend == 'cuda' }}
        env:
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
          build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
        run: |
          if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
            # The CUDA stub directory is prepended to the library path and libcuda.so.1 is
            # linked to the stub libcuda.so before importing the modules.
            # \$LD_LIBRARY_PATH is escaped so that it is expanded by the shell inside the
            # container rather than by the runner's shell.
            docker run --entrypoint /bin/bash \
              "flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest" \
              -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:\$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
          else
            echo "Skipping test to save time"
          fi

      - name: Publish Docker environment image (on push to inference)
        # Forks skip this step entirely.
        if: github.repository_owner == 'flexflow'
        env:
          FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
        run: |
          if [[ $deploy_needed == "true" ]]; then
            ./docker/publish.sh flexflow-environment
            ./docker/publish.sh flexflow
          else
            echo "No need to update Docker containers in ghcr.io registry at this time."
          fi

  # Posts to Slack when the weekly scheduled build fails in the upstream
  # repository; pull-request, push and manual runs never notify.
  notify-slack:
    name: Notify Slack in case of failure
    runs-on: ubuntu-20.04
    needs: docker-build
    if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
    steps:
      - name: Send Slack message
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
        run: |
          # The webhook URL is quoted so the shell never word-splits or globs it.
          curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Weekly FlexFlow Docker images build failed! <https://github.com/flexflow/FlexFlow/actions/runs/$GITHUB_RUN_ID|(See here).> :x: \"}" "$SLACK_WEBHOOK"