Skip to content

Commit

Permalink
Merge branch 'inference' into prep_model_weights
Browse files Browse the repository at this point in the history
  • Loading branch information
goliaro authored Jul 20, 2023
2 parents 6f918a3 + f76a88d commit 427f7ab
Show file tree
Hide file tree
Showing 27 changed files with 1,179 additions and 186 deletions.
15 changes: 15 additions & 0 deletions .github/workflows/docker-build-skip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,21 @@ jobs:
strategy:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
# The CUDA version doesn't matter when building for hip_rocm, so we arbitrarily pick a single one (11.8) to avoid building the hip_rocm image once for every supported CUDA version
exclude:
- gpu_backend: "hip_rocm"
cuda_version: "11.1"
- gpu_backend: "hip_rocm"
cuda_version: "11.2"
- gpu_backend: "hip_rocm"
cuda_version: "11.3"
- gpu_backend: "hip_rocm"
cuda_version: "11.5"
- gpu_backend: "hip_rocm"
cuda_version: "11.6"
- gpu_backend: "hip_rocm"
cuda_version: "11.7"
fail-fast: false
steps:
- run: 'echo "No docker-build required"'
75 changes: 56 additions & 19 deletions .github/workflows/docker-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,54 +26,91 @@ jobs:
strategy:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
# The CUDA version doesn't matter when building for hip_rocm, so we arbitrarily pick a single one (11.8) to avoid building the hip_rocm image once for every supported CUDA version
exclude:
- gpu_backend: "hip_rocm"
cuda_version: "11.1"
- gpu_backend: "hip_rocm"
cuda_version: "11.2"
- gpu_backend: "hip_rocm"
cuda_version: "11.3"
- gpu_backend: "hip_rocm"
cuda_version: "11.5"
- gpu_backend: "hip_rocm"
cuda_version: "11.6"
- gpu_backend: "hip_rocm"
cuda_version: "11.7"
fail-fast: false
env:
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
cuda_version: ${{ matrix.cuda_version }}
branch_name: ${{ github.head_ref || github.ref_name }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive

- name: Free additional space on runner
run: .github/workflows/helpers/free_space_on_runner.sh
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
run: |
if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
.github/workflows/helpers/free_space_on_runner.sh
else
echo "Skipping this step to save time"
fi
- name: Build Docker container
env:
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
run: |
# On push to master, build for all compatible architectures, so that we can publish
if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
else
export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF
fi
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
# to save time.
if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ( ${GITHUB_REF#refs/heads/} == "master" || ${GITHUB_REF#refs/heads/} == "inference" ) ]]; then
if [[ $deploy_needed == "true" ]] ; then
export FF_CUDA_ARCH=all
else
./docker/build.sh flexflow
elif [[ $build_needed == "true" ]]; then
export FF_CUDA_ARCH=70
fi
if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
./docker/build.sh flexflow
else
export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF
echo "Skipping build to save time"
fi
./docker/build.sh --image_name flexflow --cuda_version 11.8
- name: Check availability of Python flexflow.core module
if: ${{ matrix.gpu_backend == 'cuda' }}
run: docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-11.8.0:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'"
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
run: |
if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'"
else
echo "Skipping test to save time"
fi
- name: Publish Docker environment image (on push to master)
- name: Publish Docker environment image (on push to inference)
if: github.repository_owner == 'flexflow'
env:
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
run: |
if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "master" ]]; then
./docker/publish.sh --image_name "flexflow-environment-${FF_GPU_BACKEND}" --cuda_version 11.8
./docker/publish.sh --image_name "flexflow-${FF_GPU_BACKEND}" --cuda_version 11.8
elif [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "inference" ]]; then
./docker/publish.sh --image_name "specinfer-${FF_GPU_BACKEND}" --cuda_version 11.8
# Publish the images only when deploy_needed is "true"; otherwise just log
# that the registry is left untouched.
if [[ $deploy_needed == "true" ]]; then
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
else
echo "No need to update Docker containers in ghcr.io registry at this time."
fi
notify-slack:
name: Notify Slack in case of failure
runs-on: ubuntu-20.04
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
FlexFlow is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow provides a drop-in replacement for PyTorch and TensorFlow Keras. Running existing PyTorch and Keras programs in FlexFlow only requires [a few lines of changes to the program](https://flexflow.ai/keras).

## Install FlexFlow
To install FlexFlow from source code, please read the [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages ([flexflow-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-cuda) with a CUDA backend, [flexflow-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. You can also use `conda` to install the FlexFlow Python package (coming soon).
To install FlexFlow from source code, please read the [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages for several versions of CUDA and for the `hip_rocm` backend, together with [Dockerfiles](./docker) if you wish to build the containers manually. More info on the Docker images can be found [here](./docker/README.md). You can also use `conda` to install the FlexFlow Python package (coming soon).

## PyTorch Support
Users can also use FlexFlow to optimize the parallelization performance of existing PyTorch models in two steps. First, a PyTorch model can be exported to the FlexFlow model format using `flexflow.torch.fx.torch_to_flexflow`.
Expand Down
27 changes: 13 additions & 14 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,48 +5,47 @@ This folder contains the Dockerfiles and scripts that you can use to quickly run
You will need a machine with a NVIDIA GPU, with drivers installed. You will also need to have Docker and the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#getting-started) installed on the host machine.

## Downloading a pre-built package
The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `master` branch. The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow):
The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `inference` branch (the `inference` branch is currently ahead of the `master` branch). The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow):

* [flexflow-cuda](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda): the pre-built version of FlexFlow targeting GPUs with a CUDA backend. N.B.: currently, this container is only fully compatible with host machines that have CUDA 11.7 installed.
* [flexflow-hip_rocm](https://github.com/orgs/flexflow/packages/container/package/flexflow-hip_rocm): the pre-built version of FlexFlow targeting GPUs with a HIP-ROCM backend.
* [flexflow-environment-cuda](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda) and [flexflow-environment-hip_rocm](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-hip_rocm): these are the base layers for `flexflow-cuda` and `flexflow-hip_rocm`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow. N.B.: currently, the `flexflow-environment-cuda` container is only fully compatible with host machines that have CUDA 11.7 installed.
* `flexflow`: the pre-built version of FlexFlow. We currently publish one version targeting GPUs with a `hip_rocm` backend (`flexflow-hip_rocm`), and several versions for CUDA GPUs (one for each of the following CUDA versions: 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, and 11.8). The CUDA images are named `flexflow-cuda-<CUDA version>`, e.g. [flexflow-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-11.8).
* `flexflow-environment`: this is the base layer for `flexflow`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run FlexFlow. You may find them useful if you want to build FlexFlow yourself. We also publish one version of `flexflow-environment` for `hip_rocm` and one for each CUDA version in the list above. The naming convention is similar, too. For example, the `flexflow-environment` image for CUDA 11.8 is tagged [flexflow-environment-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-11.8).

The easiest way to download any of the Docker containers above is to call:

```
./docker/pull.sh <CONTAINER_NAME>
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> cuda_version=<DESIRED_CUDA_VERSION> ./docker/pull.sh <CONTAINER_NAME>
```

where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`), and `FF_GPU_BACKEND`/`cuda_version` are optional environment variables you can use if you wish to download the Docker image for a GPU backend and/or CUDA version other than those installed on your machine (leaving these variables unset will let the script autodetect which version to download depending on your setup).

After downloading a container you can use the `run.sh` script to run it by following the instructions in the section below.

## Building a Docker container from scratch
If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](../INSTALL.md) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`).
If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](../INSTALL.md) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`). When building for the `cuda` backend, you can pick the CUDA version by setting the optional environment variable `cuda_version`, e.g.: `export cuda_version=11.8`. Leaving the `cuda_version` variable blank will let the script autodetect the CUDA version installed on the host machine, and build for that version. Setting the `cuda_version` environment variable will have no effect when building for a GPU backend other than CUDA.

To build the FlexFlow container, run (the `flexflow` argument of the build script can be omitted):

```
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> ./docker/build.sh flexflow
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> cuda_version=<DESIRED_CUDA_VERSION> ./docker/build.sh flexflow
```

If you only want to build the `flexflow-environment` image (the base layers of the `flexflow` container, used in CI and for other internal purposes), run:

```
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> ./docker/build.sh flexflow-environment
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> cuda_version=<DESIRED_CUDA_VERSION> ./docker/build.sh flexflow-environment
```

## Running a Docker container
After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (the `flexflow` argument of the run script can be omitted):
After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (the image name argument of the run script can be omitted). Once again, you can set the `FF_GPU_BACKEND` and `cuda_version` optional environment variables to run the Docker image with the desired GPU backend and CUDA version. Leaving these variables unset will instruct the script to autodetect the GPU backend and CUDA version installed on the current machine and run the Docker container with it if available.

```
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> ./docker/run.sh flexflow
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> cuda_version=<DESIRED_CUDA_VERSION> ./docker/run.sh --image_name flexflow
```

If you wish to run the `flexflow-environment` container, run:

```
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> ./docker/run.sh flexflow-environment
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> cuda_version=<DESIRED_CUDA_VERSION> ./docker/run.sh --image_name flexflow-environment
```

Once again, if your backend is CUDA, you can omit the `FF_GPU_BACKEND` environment variable, since `cuda` is used as the default value.

N.B.: If you don't have GPUs available on the machine, edit the `run.sh` script and set `ATTACH_GPUS=false` before running it.
N.B.: If you don't have GPUs available on the machine, or you wish to run the docker image without attaching GPUs, you can set the environment variable `ATTACH_GPUS=false` before running the script.
Loading

0 comments on commit 427f7ab

Please sign in to comment.