# .github/workflows/docker-build.yml

name: docker-build
on:
  # Manual runs.
  workflow_dispatch:
  # Weekly rebuild that keeps the docker images updated: every Sunday at
  # 08:00 UTC (midnight PT / 3am ET).
  schedule:
    - cron: '0 8 * * 0'
  # Every push to the inference or master branch.
  push:
    branches:
      - inference
      - master
  # Pull requests that touch the docker folder (its README excluded) or this
  # workflow file.
  pull_request:
    paths:
      - 'docker/**'
      - '!docker/README.md'
      - '.github/workflows/docker-build.yml'

# Runs triggered by pull requests with the same head branch share one group, so
# a newer run cancels an older one that is still in progress. Events without a
# head ref fall back to the run id, which gives every such run its own group.
concurrency:
  cancel-in-progress: true
  group: "docker-build-${{ github.head_ref || github.run_id }}"

jobs:
  # Builds the FlexFlow Docker image once per matrix entry; the steps decide
  # per entry whether to actually build, test and publish.
  docker-build:
    name: Build and Install FlexFlow in a Docker Container
    runs-on: ubuntu-20.04
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        # One CUDA job per supported CUDA version.
        gpu_backend: ["cuda"]
        cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
        # The CUDA version doesn't matter when building for hip_rocm, so a
        # single hip_rocm job is added with an arbitrarily picked version
        # (11.8). Using `include` (instead of crossing both backends with every
        # version and excluding all but one) means that adding a CUDA version
        # above never creates an extra hip_rocm job.
        include:
          - gpu_backend: "hip_rocm"
            cuda_version: "11.8"
    env:
      # Exposed to every step (and to the scripts they call) as environment
      # variables.
      FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
      cuda_version: ${{ matrix.cuda_version }}
      # Head branch for pull requests, otherwise the branch or tag that
      # triggered the run.
      branch_name: ${{ github.head_ref || github.ref_name }}
    steps:
      # Fetch the repository together with all (nested) submodules.
      - name: Checkout Git Repository
        # v4 runs on a supported Node.js runtime; v3 targets the deprecated
        # Node 16 runtime.
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Run the free-space helper script only for matrix entries that need a
      # deployment or a build; the other entries skip it to save time.
      - name: Free additional space on runner
        env:
          # "true" for the hip_rocm entry and for the CUDA 11.8 entry.
          build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || (matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8') }}
          # "true" for pushes and scheduled runs on the inference branch.
          deploy_needed: ${{ (github.event_name == 'push' || github.event_name == 'schedule') && env.branch_name == 'inference' }}
        run: |
          if [[ "${deploy_needed}" != "true" && "${build_needed}" != "true" ]]; then
            echo "Skipping this step to save time"
          else
            .github/workflows/helpers/free_space_on_runner.sh
          fi

      # Build the flexflow image for this matrix entry, or skip the build when
      # the entry needs neither a deployment nor a test build.
      - name: Build Docker container
        env:
          # "true" for the hip_rocm entry and for the CUDA 11.8 entry.
          build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || (matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8') }}
          # "true" for pushes and scheduled runs on the inference branch.
          deploy_needed: ${{ (github.event_name == 'push' || github.event_name == 'schedule') && env.branch_name == 'inference' }}
        run: |
          # All inference examples are only built for the cuda backend.
          case "${FF_GPU_BACKEND}" in
            cuda) export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON ;;
            *)    export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF ;;
          esac

          # A deployment builds for all compatible architectures, so that a
          # pre-built general-purpose image can be published. A plain test
          # build targets a single architecture to save time. An empty value
          # means that nothing has to be built for this entry.
          target_arch=""
          if [[ "${deploy_needed}" == "true" ]]; then
            target_arch="all"
          elif [[ "${build_needed}" == "true" ]]; then
            target_arch="70"
          fi

          if [[ -z "${target_arch}" ]]; then
            echo "Skipping build to save time"
          else
            export FF_CUDA_ARCH="${target_arch}"
            ./docker/build.sh flexflow
          fi

      # Smoke test (cuda entries only): start the freshly built image and
      # import flexflow.core with Python inside it.
      - name: Check availability of Python flexflow.core module
        if: ${{ matrix.gpu_backend == 'cuda' }}
        env:
          # "true" for pushes and scheduled runs on the inference branch.
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
          # Same expression as in the other steps; since this step only runs
          # for cuda, it is "true" for the CUDA 11.8 entry only.
          build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
        run: |
          if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
            # The CUDA stub libcuda.so is linked as libcuda.so.1 and put first on
            # the library path (presumably so that the import works on a runner
            # without a GPU driver).
            # \$LD_LIBRARY_PATH is escaped on purpose: it has to be expanded by the
            # shell inside the container. Unescaped, the runner's shell would
            # substitute its own value and drop the container's library path.
            docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:\$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'"
          else
            echo "Skipping test to save time"
          fi

      # Publish the flexflow-environment and flexflow images. The step only
      # runs in repositories owned by flexflow, and only does work when a
      # deployment is needed.
      - name: Publish Docker environment image (on push to inference)
        if: github.repository_owner == 'flexflow'
        env:
          # Presumably read by docker/publish.sh to authenticate with the
          # registry - confirm in that script.
          FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
          # "true" for pushes and scheduled runs on the inference branch.
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
        run: |
          if [[ $deploy_needed == "true" ]]; then
            ./docker/publish.sh flexflow-environment
            ./docker/publish.sh flexflow
          else
            echo "No need to update Docker containers in ghcr.io registry at this time."
          fi

  # Posts a message to Slack when the weekly scheduled build fails. Only runs
  # in repositories owned by flexflow.
  notify-slack:
    name: Notify Slack in case of failure
    runs-on: ubuntu-20.04
    needs: docker-build
    if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
    steps:
      - name: Send Slack message
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
        run: |
          # Build the link to this run from the default GitHub variables, so it
          # stays correct if the repository is renamed or moved.
          run_url="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
          # The webhook URL is quoted to protect it from word splitting and
          # globbing.
          curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Weekly FlexFlow Docker images build failed! <${run_url}|(See here).> :x: \"}" "$SLACK_WEBHOOK"