Merge branch 'inference' into fix_batch_size
goliaro authored Jul 20, 2023
2 parents 3d8af13 + f76a88d commit bcd88f3
Showing 9 changed files with 260 additions and 183 deletions.
15 changes: 15 additions & 0 deletions .github/workflows/docker-build-skip.yml
@@ -19,6 +19,21 @@ jobs:
strategy:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
# The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building the hip_rocm image once per supported CUDA version
exclude:
- gpu_backend: "hip_rocm"
cuda_version: "11.1"
- gpu_backend: "hip_rocm"
cuda_version: "11.2"
- gpu_backend: "hip_rocm"
cuda_version: "11.3"
- gpu_backend: "hip_rocm"
cuda_version: "11.5"
- gpu_backend: "hip_rocm"
cuda_version: "11.6"
- gpu_backend: "hip_rocm"
cuda_version: "11.7"
fail-fast: false
steps:
- run: 'echo "No docker-build required"'
75 changes: 56 additions & 19 deletions .github/workflows/docker-build.yml
@@ -26,54 +26,91 @@ jobs:
strategy:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
# The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building the hip_rocm image once per supported CUDA version
exclude:
- gpu_backend: "hip_rocm"
cuda_version: "11.1"
- gpu_backend: "hip_rocm"
cuda_version: "11.2"
- gpu_backend: "hip_rocm"
cuda_version: "11.3"
- gpu_backend: "hip_rocm"
cuda_version: "11.5"
- gpu_backend: "hip_rocm"
cuda_version: "11.6"
- gpu_backend: "hip_rocm"
cuda_version: "11.7"
fail-fast: false
env:
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
cuda_version: ${{ matrix.cuda_version }}
branch_name: ${{ github.head_ref || github.ref_name }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive

- name: Free additional space on runner
run: .github/workflows/helpers/free_space_on_runner.sh
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
run: |
if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
.github/workflows/helpers/free_space_on_runner.sh
else
echo "Skipping this step to save time"
fi
- name: Build Docker container
env:
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
run: |
# On push to master, build for all compatible architectures, so that we can publish
if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
else
export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF
fi
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
# to save time.
if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ( ${GITHUB_REF#refs/heads/} == "master" || ${GITHUB_REF#refs/heads/} == "inference" ) ]]; then
if [[ $deploy_needed == "true" ]] ; then
export FF_CUDA_ARCH=all
else
./docker/build.sh flexflow
elif [[ $build_needed == "true" ]]; then
export FF_CUDA_ARCH=70
fi
if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
./docker/build.sh flexflow
else
export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF
echo "Skipping build to save time"
fi
./docker/build.sh --image_name flexflow --cuda_version 11.8
- name: Check availability of Python flexflow.core module
if: ${{ matrix.gpu_backend == 'cuda' }}
run: docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-11.8.0:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'"
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
run: |
if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'"
else
echo "Skipping test to save time"
fi
- name: Publish Docker environment image (on push to master)
- name: Publish Docker environment image (on push to inference)
if: github.repository_owner == 'flexflow'
env:
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
run: |
if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "master" ]]; then
./docker/publish.sh --image_name "flexflow-environment-${FF_GPU_BACKEND}" --cuda_version 11.8
./docker/publish.sh --image_name "flexflow-${FF_GPU_BACKEND}" --cuda_version 11.8
elif [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "inference" ]]; then
./docker/publish.sh --image_name "specinfer-${FF_GPU_BACKEND}" --cuda_version 11.8
if [[ $deploy_needed == "true" ]]; then
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
else
echo "No need to update Docker containers in ghrc.io registry at this time."
fi
notify-slack:
name: Notify Slack in case of failure
runs-on: ubuntu-20.04
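Because the rendered diff interleaves removed and added lines, the logic of the new `Build Docker container` step above can be hard to follow. Below is a sketch of the gating it appears to implement, reconstructed from the added lines (the exact script in the workflow may differ slightly):

```
# Sketch reconstructed from the diff above, not the authoritative workflow.
# deploy_needed: push/schedule event on the `inference` branch
# build_needed:  hip_rocm backend, or cuda backend paired with CUDA 11.8
if [[ "$deploy_needed" == "true" ]]; then
  export FF_CUDA_ARCH=all  # build for all compatible architectures, for publishing
  ./docker/build.sh flexflow
elif [[ "$build_needed" == "true" ]]; then
  export FF_CUDA_ARCH=70   # build for a single architecture to save CI time
  ./docker/build.sh flexflow
else
  echo "Skipping build to save time"
fi
```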
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@
FlexFlow is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow provides a drop-in replacement for PyTorch and TensorFlow Keras. Running existing PyTorch and Keras programs in FlexFlow only requires [a few lines of changes to the program](https://flexflow.ai/keras).

## Install FlexFlow
To install FlexFlow from source code, please read the [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages ([flexflow-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-cuda) with a CUDA backend, [flexflow-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. You can also use `conda` to install the FlexFlow Python package (coming soon).
To install FlexFlow from source code, please read the [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages for several versions of CUDA and for the `hip_rocm` backend, together with [Dockerfiles](./docker) if you wish to build the containers manually. More info on the Docker images can be found [here](./docker/README.md). You can also use `conda` to install the FlexFlow Python package (coming soon).

## PyTorch Support
Users can also use FlexFlow to optimize the parallelization performance of existing PyTorch models in two steps. First, a PyTorch model can be exported to the FlexFlow model format using `flexflow.torch.fx.torch_to_flexflow`.
27 changes: 13 additions & 14 deletions docker/README.md
@@ -5,48 +5,47 @@ This folder contains the Dockerfiles and scripts that you can use to quickly run
You will need a machine with a NVIDIA GPU, with drivers installed. You will also need to have Docker and the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#getting-started) installed on the host machine.

## Downloading a pre-built package
The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `master` branch. The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow):
The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `inference` branch (the `inference` branch is currently ahead of the `master` branch). The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow):

* [flexflow-cuda](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda): the pre-built version of FlexFlow targeting GPUs with a CUDA backend. N.B.: currently, this container is only fully compatible with host machines that have CUDA 11.7 installed.
* [flexflow-hip_rocm](https://github.com/orgs/flexflow/packages/container/package/flexflow-hip_rocm): the pre-built version of FlexFlow targeting GPUs with a HIP-ROCM backend.
* [flexflow-environment-cuda](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda) and [flexflow-environment-hip_rocm](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-hip_rocm): these are the base layers for `flexflow-cuda` and `flexflow-hip_rocm`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow. N.B.: currently, the `flexflow-environment-cuda` container is only fully compatible with host machines that have CUDA 11.7 installed.
* `flexflow`: the pre-built version of FlexFlow. We currently publish one version targeting GPUs with a `hip_rocm` backend (`flexflow-hip_rocm`), and several versions for CUDA GPUs (one for each of the following CUDA versions: 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, and 11.8). The CUDA images are named `flexflow-cuda-<CUDA version>`, e.g. [flexflow-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-11.8).
* `flexflow-environment`: this is the base layer for `flexflow`. These images are used in CI and for other internal purposes, and contain all the dependencies needed to build/run FlexFlow. You may find them useful if you want to build FlexFlow yourself. We also publish one version of `flexflow-environment` for `hip_rocm` and one for each CUDA version in the list above, following the same naming convention; for example, the `flexflow-environment` image for CUDA 11.8 is tagged [flexflow-environment-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-11.8). An example pull command using this naming convention follows this list.
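For illustration, here is what pulling one of these images directly with Docker might look like; this assumes the images are hosted on GitHub's container registry (`ghcr.io`) under the `flexflow` organization and carry a `latest` tag, which we infer from the package links above rather than from this repository:

```
docker pull ghcr.io/flexflow/flexflow-cuda-11.8:latest
```

In most cases, though, the `pull.sh` helper below is the more convenient route.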

The easiest way to download any of the Docker containers above is to call:

```
./docker/pull.sh <CONTAINER_NAME>
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> cuda_version=<DESIRED_CUDA_VERSION> ./docker/pull.sh <CONTAINER_NAME>
```

where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`), and `FF_GPU_BACKEND` and `cuda_version` are optional environment variables you can set to download the Docker image for a GPU backend and/or CUDA version other than those installed on your machine. Leaving these variables unset lets the script autodetect which version to download based on your setup.
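For example, to download the CUDA 11.7 build of the `flexflow` image regardless of the CUDA version installed locally (a hypothetical invocation, assuming the variables behave as described above):

```
FF_GPU_BACKEND=cuda cuda_version=11.7 ./docker/pull.sh flexflow
```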

After downloading a container you can use the `run.sh` script to run it by following the instructions in the section below.

## Building a Docker container from scratch
If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](../INSTALL.md) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`).
If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](../INSTALL.md) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this, since `cuda` is the default value for `FF_GPU_BACKEND`). When building for the `cuda` backend, you can pick the CUDA version by setting the optional environment variable `cuda_version`, e.g. `export cuda_version=11.8`. Leaving `cuda_version` unset lets the script autodetect the CUDA version installed on the host machine and build for that version. Setting `cuda_version` has no effect when building for a GPU backend other than CUDA.

To build the FlexFlow container, run (the `flexflow` argument of the build script can be omitted):

```
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> ./docker/build.sh flexflow
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> cuda_version=<DESIRED_CUDA_VERSION> ./docker/build.sh flexflow
```
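For instance, a concrete invocation that builds the CUDA 11.8 variant of the `flexflow` image (assuming `cuda_version` is honored as described above) would be:

```
FF_GPU_BACKEND=cuda cuda_version=11.8 ./docker/build.sh flexflow
```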

If you only want to build the `flexflow-environment` image (the base layers of the `flexflow` container, used in CI and for other internal purposes), run:

```
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> ./docker/build.sh flexflow-environment
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> cuda_version=<DESIRED_CUDA_VERSION> ./docker/build.sh flexflow-environment
```

## Running a Docker container
After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (the `flexflow` argument of the run script can be omitted):
After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (the image name argument of the run script can be omitted). Once again, you can set the optional `FF_GPU_BACKEND` and `cuda_version` environment variables to run the Docker image with the desired GPU backend and CUDA version. Leaving these variables unset instructs the script to autodetect the GPU backend and CUDA version installed on the current machine and, if available, run the Docker container with them.

```
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> ./docker/run.sh flexflow
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> cuda_version=<DESIRED_CUDA_VERSION> ./docker/run.sh --image_name flexflow
```

If you wish to run the `flexflow-environment` container, run:

```
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> ./docker/run.sh flexflow-environment
FF_GPU_BACKEND=<YOUR_GPU_BACKEND> cuda_version=<DESIRED_CUDA_VERSION> ./docker/run.sh --image_name flexflow-environment
```

Once again, if your backend is CUDA, you can omit the `FF_GPU_BACKEND` environment variable, since `cuda` is used as the default value.

N.B.: If you don't have GPUs available on the machine, edit the `run.sh` script and set `ATTACH_GPUS=false` before running it.
N.B.: If you don't have GPUs available on the machine, or you wish to run the Docker image without attaching GPUs, you can set the environment variable `ATTACH_GPUS=false` before running the script.
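For instance, a CPU-only run (assuming `run.sh` honors the `ATTACH_GPUS` variable as described above) might look like:

```
ATTACH_GPUS=false ./docker/run.sh --image_name flexflow
```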