# .github/workflows/docker-build.yml

name: 'docker-build'
on:
  workflow_dispatch:
  schedule:
    # Weekly refresh of the docker images: every Sunday at 08:00 UTC (midnight PT / 3am ET)
    - cron: '0 8 * * 0'
  push:
    branches: ['inference', 'master']
  pull_request:
    # Pattern order matters: the negated README entry must come after the docker/** entry
    paths:
      - 'docker/**'
      - '!docker/README.md'
      - '.github/workflows/docker-build.yml'

# Runs sharing a group cancel an older run that is still in progress.
# The group key is the PR head ref when there is one, otherwise the run id.
concurrency:
  cancel-in-progress: true
  group: 'docker-build-${{ github.head_ref || github.run_id }}'

jobs:
  docker-build:
    name: Build and Install FlexFlow in a Docker Container
    runs-on: ubuntu-20.04
    # 480 minutes = 8 hours
    timeout-minutes: 480
    strategy:
      # Keep the remaining matrix jobs running when one of them fails
      fail-fast: false
      matrix:
        gpu_backend:
          - 'cuda'
          - 'hip_rocm'
        # Versions for both backends share one axis: 11.x / 12.0 pair with cuda, 5.x with hip_rocm
        gpu_backend_version:
          - '11.1'
          - '11.2'
          - '11.3'
          - '11.4'
          - '11.5'
          - '11.6'
          - '11.7'
          - '11.8'
          - '12.0'
          - '5.3'
          - '5.4'
          - '5.5'
          - '5.6'
        # The two axes are crossed, so every pairing of a backend with a version
        # that belongs to the other backend is removed here.
        exclude:
          - { gpu_backend: 'cuda', gpu_backend_version: '5.3' }
          - { gpu_backend: 'cuda', gpu_backend_version: '5.4' }
          - { gpu_backend: 'cuda', gpu_backend_version: '5.5' }
          - { gpu_backend: 'cuda', gpu_backend_version: '5.6' }
          - { gpu_backend: 'hip_rocm', gpu_backend_version: '11.1' }
          - { gpu_backend: 'hip_rocm', gpu_backend_version: '11.2' }
          - { gpu_backend: 'hip_rocm', gpu_backend_version: '11.3' }
          - { gpu_backend: 'hip_rocm', gpu_backend_version: '11.4' }
          - { gpu_backend: 'hip_rocm', gpu_backend_version: '11.5' }
          - { gpu_backend: 'hip_rocm', gpu_backend_version: '11.6' }
          - { gpu_backend: 'hip_rocm', gpu_backend_version: '11.7' }
          - { gpu_backend: 'hip_rocm', gpu_backend_version: '11.8' }
          - { gpu_backend: 'hip_rocm', gpu_backend_version: '12.0' }
    env:
      # PR head ref when there is one, otherwise the name of the ref that triggered the run
      branch_name: ${{ github.head_ref || github.ref_name }}
      FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
      gpu_backend_version: ${{ matrix.gpu_backend_version }}
      # The same version is exported under both names; one of the two will be unused
      cuda_version: ${{ matrix.gpu_backend_version }}
      hip_version: ${{ matrix.gpu_backend_version }}
    steps:
      # Clone the repository together with all of its nested submodules
      - uses: actions/checkout@v3
        name: Checkout Git Repository
        with:
          submodules: 'recursive'

      - name: Free additional space on runner
        env:
          # "true" only for the hip_rocm 5.6 and cuda 11.8 matrix entries
          build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
          # "true" for push / schedule / workflow_dispatch events when branch_name is 'inference'
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
        run: |
          # The cleanup helper is only invoked when at least one of the two flags is "true"
          if [[ $deploy_needed != "true" && $build_needed != "true" ]]; then
            echo "Skipping this step to save time"
          else
            .github/workflows/helpers/free_space_on_runner.sh
          fi

      - name: Build Docker container
        env:
          # "true" only for the hip_rocm 5.6 and cuda 11.8 matrix entries
          build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
          # "true" for push / schedule / workflow_dispatch events when branch_name is 'inference'
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
        run: |
          # Pick the target architectures first, then run the build once.
          # A deployment builds for all compatible architectures so that a pre-built
          # general-purpose image can be published; a regular build targets a single
          # architecture set to save time; everything else skips the build entirely.
          if [[ $deploy_needed == "true" ]]; then
            export FF_CUDA_ARCH=all
            export FF_HIP_ARCH=all
          elif [[ $build_needed == "true" ]]; then
            export FF_CUDA_ARCH=70
            export FF_HIP_ARCH=gfx1100,gfx1036
          else
            echo "Skipping build to save time"
            exit 0
          fi
          ./docker/build.sh flexflow

      - name: Check availability of flexflow modules in Python
        # This step is restricted to the cuda matrix entries
        if: ${{ matrix.gpu_backend == 'cuda' }}
        env:
          # "true" for push / schedule / workflow_dispatch events when branch_name is 'inference'
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
          # "true" only for the hip_rocm 5.6 and cuda 11.8 matrix entries
          build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
        run: |
          # Start the freshly built image and check that the flexflow Python modules import cleanly
          if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
            if [[ $FF_GPU_BACKEND == "cuda" ]]; then
              # The CUDA stub directory is put on the library path and libcuda.so is symlinked
              # as libcuda.so.1 (presumably because the runner has no GPU driver - TODO confirm).
              # \$LD_LIBRARY_PATH is escaped on purpose: it must be expanded by the shell inside
              # the container, not by the runner's shell before the container starts.
              docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:\$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
            else
              # Not reached while the step-level `if` limits this step to cuda
              docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
            fi
          else
            echo "Skipping test to save time"
          fi

      - name: Publish Docker environment image (on push to inference)
        # Only repositories owned by 'flexflow' run the publish step
        if: github.repository_owner == 'flexflow'
        env:
          # Token made available to the publish script through the environment
          FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
          # "true" for push / schedule / workflow_dispatch events when branch_name is 'inference'
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
        run: |
          # Publish both the environment image and the full flexflow image on a deployment run
          if [[ $deploy_needed == "true" ]]; then
            ./docker/publish.sh flexflow-environment
            ./docker/publish.sh flexflow
          else
            echo "No need to update Docker containers in ghcr.io registry at this time."
          fi

  notify-slack:
    name: Notify Slack in case of failure
    runs-on: ubuntu-20.04
    needs: docker-build
    # Runs only after a failure, on scheduled runs, in repositories owned by 'flexflow'
    if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
    steps:
      - name: Send Slack message
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
        run: |
          # Post a message linking to this workflow run. The webhook URL is quoted so the
          # shell passes it to curl as a single argument, without word splitting or globbing.
          curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Weekly FlexFlow Docker images build failed! <https://github.com/flexflow/FlexFlow/actions/runs/$GITHUB_RUN_ID|(See here).> :x: \"}" "$SLACK_WEBHOOK"