From 58ac1d7e9b2cf0a8e0e51304cfa1d87a151dcc03 Mon Sep 17 00:00:00 2001
From: drbh
Date: Tue, 28 May 2024 22:18:44 -0400
Subject: [PATCH 1/2] feat: add basic workflow

---
 .github/workflows/precompile-kernels.yaml | 186 ++++++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 .github/workflows/precompile-kernels.yaml

diff --git a/.github/workflows/precompile-kernels.yaml b/.github/workflows/precompile-kernels.yaml
new file mode 100644
index 00000000000..10be3ece6c0
--- /dev/null
+++ b/.github/workflows/precompile-kernels.yaml
@@ -0,0 +1,186 @@
+# based on https://github.com/Dao-AILab/flash-attention/blob/main/.github/workflows/publish.yml
+name: Build wheels
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  build_wheels:
+    name: Build Wheel
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      fail-fast: false
+      matrix:
+        # Using ubuntu-20.04 instead of 22.04 for broader glibc compatibility. Ideally we'd use the
+        # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
+        os: [ubuntu-20.04]
+        python-version: [
+          # "3.7", "3.8", "3.9", "3.10",
+          "3.11",
+        ]
+        torch-version: [
+          # "1.12.1", "1.13.1", "2.0.1", "2.1.2", "2.2.2",
+          "2.3.0",
+        ]
+        cuda-version: [
+          # "11.8.0",
+          "12.2.2",
+        ]
+        # We need separate wheels that either use the C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
+        # PyTorch wheels currently don't use it, but nvcr images ship PyTorch compiled with the C++11 ABI.
+        # Without this we get an import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
+        # when a wheel built without the C++11 ABI is used on nvcr images.
+        cxx11_abi: [
+          # "FALSE",
+          "TRUE",
+        ]
+        exclude:
+          # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
+          # PyTorch <= 1.12 does not support Python 3.11
+          - torch-version: "1.12.1"
+            python-version: "3.11"
+          # PyTorch >= 2.0 only supports Python >= 3.8
+          - torch-version: "2.0.1"
+            python-version: "3.7"
+          - torch-version: "2.1.2"
+            python-version: "3.7"
+          - torch-version: "2.2.2"
+            python-version: "3.7"
+          - torch-version: "2.3.0"
+            python-version: "3.7"
+          # PyTorch <= 2.0 only supports CUDA <= 11.8
+          - torch-version: "1.12.1"
+            cuda-version: "12.2.2"
+          - torch-version: "1.13.1"
+            cuda-version: "12.2.2"
+          - torch-version: "2.0.1"
+            cuda-version: "12.2.2"
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Set CUDA and PyTorch versions
+        run: |
+          echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
+          echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
+          echo "MATRIX_PYTHON_VERSION=$(echo ${{ matrix.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
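+          # Worked example for the single matrix entry above: these evaluate to
+          # MATRIX_CUDA_VERSION=122 (from "12.2.2"), MATRIX_TORCH_VERSION=2.3
+          # (from "2.3.0") and MATRIX_PYTHON_VERSION=311 (from "3.11").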
+
+      - name: Free up disk space
+        if: ${{ runner.os == 'Linux' }}
+        # https://github.com/easimon/maximize-build-space/blob/master/action.yml
+        # https://github.com/easimon/maximize-build-space/tree/test-report
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+
+      - name: Set up swap space
+        if: runner.os == 'Linux'
+        uses: pierotofy/set-swap-space@v1.0
+        with:
+          swap-size-gb: 10
+
+      - name: Install CUDA ${{ matrix.cuda-version }}
+        if: ${{ matrix.cuda-version != 'cpu' }}
+        uses: Jimver/cuda-toolkit@v0.2.14
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda-version }}
+          linux-local-args: '["--toolkit"]'
+          # default method is "local", but we're hitting caching errors with it for CUDA 11.8 and 12.1
+          # method: ${{ (matrix.cuda-version == '11.8.0' || matrix.cuda-version == '12.1.0') && 'network' || 'local' }}
+          method: "network"
+          # We need the CUDA libraries (e.g. cuSPARSE, cuSOLVER) for compiling PyTorch extensions,
+          # not just nvcc
+          # sub-packages: '["nvcc"]'
+
+      - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
+        run: |
+          pip install --upgrade pip
+          # If we don't install lit before installing PyTorch, we get an error for torch 2.0.1:
+          # ERROR: Could not find a version that satisfies the requirement setuptools>=40.8.0 (from versions: none)
+          pip install lit
+          # Figure out which CUDA build of PyTorch to download: the system CUDA version can be
+          # e.g. 11.7, but torch==1.12 wheels only exist up to cu116, so clamp to the supported range.
+          # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
+          # This code is ugly, maybe there's a better way to do this.
+          export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
+            minv = {'1.12': 113, '1.13': 116, '2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118}[env['MATRIX_TORCH_VERSION']]; \
+            maxv = {'1.12': 116, '1.13': 117, '2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121}[env['MATRIX_TORCH_VERSION']]; \
+            print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
+          )
+          # e.g. MATRIX_TORCH_VERSION=2.3 with MATRIX_CUDA_VERSION=122 clamps to 121,
+          # so torch is installed from the cu121 index.
+          if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
+            if [[ ${MATRIX_TORCH_VERSION} == "2.2" ]]; then
+              # --no-deps because we can't install old versions of pytorch-triton
+              pip install typing-extensions jinja2
+              pip install --no-cache-dir --no-deps --pre https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}/torch-${{ matrix.torch-version }}%2Bcu${TORCH_CUDA_VERSION}-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl
+            else
+              pip install --no-cache-dir --pre torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
+            fi
+          else
+            pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
+          fi
+          nvcc --version
+          python --version
+          python -c "import torch; print('PyTorch:', torch.__version__)"
+          python -c "import torch; print('CUDA:', torch.version.cuda)"
+          python -c "from torch.utils import cpp_extension; print(cpp_extension.CUDA_HOME)"
+        shell: bash
+
+      - name: Build wheel
+        run: |
+          # Pin setuptools to avoid compatibility issues with the CUDA extension build
+          pip install setuptools==68.0.0
+          pip install ninja packaging wheel
+          # Set up the environment for CUDA
+          export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
+          export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
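+          # Optional sanity check (an addition, not in the upstream workflow):
+          # torch.compiled_with_cxx11_abi() reports which C++ ABI the installed
+          # torch build uses; the flash-attention build below forces its own ABI
+          # via FLASH_ATTENTION_FORCE_CXX11_ABI, per the matrix comment above.
+          python -c "import torch; print('torch cxx11 abi:', torch.compiled_with_cxx11_abi())"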
+          # Clone the repository at the specific commit
+          git clone https://github.com/HazyResearch/flash-attention.git
+          cd flash-attention
+          git checkout 3a9bfd076f98746c73362328958dbc68d145fbec
+          # Build the wheel with limited jobs to prevent OOM issues on the GitHub runner
+          MAX_JOBS=2 FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi }} python setup.py bdist_wheel --dist-dir=../dist
+          # Build kernels inside the repository
+          cd csrc/rotary
+          MAX_JOBS=2 python setup.py bdist_wheel --dist-dir=../../../dist
+          cd ../layer_norm
+          MAX_JOBS=2 python setup.py bdist_wheel --dist-dir=../../../dist
+          # Build the kernels for vllm as well
+          cd ../..
+          git clone https://github.com/Narsil/vllm.git
+          cd vllm
+          git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
+          export TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0"
+          python setup.py bdist_wheel --dist-dir=../dist
+          # Generate a custom name for the wheel to include CUDA and Torch versions
+          tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
+          wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
+          # Rename the wheel with the custom name
+          ls dist/*whl | xargs -I {} mv {} dist/${wheel_name}
+          # Save the wheel name to the GitHub environment
+          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+
+      - name: Log Built Wheels
+        run: |
+          ls dist
+
+      - name: Install Hugging Face CLI
+        run: |
+          pip install huggingface_hub
+
+      - name: Upload to Hugging Face Hub
+        run: |
+          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_PRECOMPILE_TOKEN }}
+          huggingface-cli upload drbh/flash-attention-pre-compile flash-attention/dist/*

From 129f0ed6034fcbb5c1d4bf94a2ce3fe37874cdfd Mon Sep 17 00:00:00 2001
From: drbh
Date: Wed, 29 May 2024 09:50:58 -0400
Subject: [PATCH 2/2] fix: adjust whl names and upload all

---
 .github/workflows/precompile-kernels.yaml | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/precompile-kernels.yaml b/.github/workflows/precompile-kernels.yaml
index 10be3ece6c0..25f0fe35a3f 100644
--- a/.github/workflows/precompile-kernels.yaml
+++ b/.github/workflows/precompile-kernels.yaml
@@ -165,12 +165,14 @@ jobs:
           export TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0"
           python setup.py bdist_wheel --dist-dir=../dist
           # Generate a custom name for the wheel to include CUDA and Torch versions
+          cd ../dist
           tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
-          wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
-          # Rename the wheel with the custom name
-          ls dist/*whl | xargs -I {} mv {} dist/${wheel_name}
+          for wheel in *.whl; do
+            new_wheel_name=$(echo "$wheel" | sed "s/-/+$tmpname-/2")
+            mv "$wheel" "$new_wheel_name"
+          done
           # Save the wheel name to the GitHub environment
-          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+          echo "wheel_name=$(ls *+$tmpname-*.whl)" >> $GITHUB_ENV
 
       - name: Log Built Wheels
         run: |
           ls dist
@@ -183,4 +185,4 @@ jobs:
       - name: Upload to Hugging Face Hub
         run: |
           export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_PRECOMPILE_TOKEN }}
-          huggingface-cli upload drbh/flash-attention-pre-compile flash-attention/dist/*
+          huggingface-cli upload drbh/flash-attention-pre-compile dist/*
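+          # The sed above replaces the 2nd "-" in each wheel name, so $tmpname lands
+          # right after the package version as a local version tag. With the matrix
+          # above the uploaded wheels end up named along the lines of (illustrative;
+          # exact package versions depend on the pinned commits):
+          #   flash_attn-<version>+cu122torch2.3cxx11abiTRUE-cp311-cp311-linux_x86_64.whl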