Skip to content

always draft a backport PR to make it easier #36

always draft a backport PR to make it easier

always draft a backport PR to make it easier #36

# Workflow header: display name, concurrency policy, and triggers.
name: "CI: Build and test"

# Runs on `main` get a group that is unique per run id; every other ref shares
# one group per (event, branch), so a newer run cancels the one in progress.
concurrency:
  group: >-
    ${{ github.workflow }}-${{
    github.ref_name == 'main' && format('ci-main-build-test-{0}', github.run_id) ||
    format('ci-pr-build-test-on-{0}-against-branch-{1}', github.event_name, github.ref_name)
    }}
  cancel-in-progress: true

# Only pushes to `main` and to `pull-request/<number>` branches trigger CI.
on:
  push:
    branches:
      - 'pull-request/[0-9]+'
      - 'main'

jobs:
  build:

Check failure on line 17 in .github/workflows/build-and-test.yml

View workflow run for this annotation

GitHub Actions / .github/workflows/build-and-test.yml

Invalid workflow file

You have an error in your yaml syntax on line 17
# Configuration of the `build` job: one leg per (platform, Python, CUDA) combination.
strategy:
  # A failing leg does not cancel the remaining legs.
  fail-fast: false
  matrix:
    host-platform:
      - linux-64
      - linux-aarch64
      - win-64
    python-version:
      - "3.13"
      - "3.12"
      - "3.11"
      - "3.10"
      - "3.9"
    cuda-version:
      # Note: this is for build-time only.
      - "12.6.2"
name: "Build (${{ matrix.host-platform }}, Python ${{ matrix.python-version }}, CUDA ${{ matrix.cuda-version }})"
if: ${{ github.repository_owner == 'nvidia' }}
permissions:
  id-token: write  # This is required for configure-aws-credentials
  contents: read  # This is required for actions/checkout
# Map the matrix platform onto a runner label.
runs-on: >-
  ${{ (matrix.host-platform == 'linux-64' && 'linux-amd64-cpu8') ||
  (matrix.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') ||
  (matrix.host-platform == 'win-64' && 'windows-2019') }}
# (matrix.host-platform == 'win-64' && 'windows-amd64-cpu8') }}
outputs:
  # Build-time CUDA version, published by the `pass_env` step.
  BUILD_CTK_VER: ${{ steps.pass_env.outputs.CUDA_VERSION }}
defaults:
  run:
    shell: bash --noprofile --norc -xeuo pipefail {0}
steps:
# Check out the repository; fetch-depth 0 requests the full history.
- name: "Checkout ${{ github.event.repository.name }}"
  uses: actions/checkout@v4
  with:
    fetch-depth: 0
# WAR: setup-python is not relocatable...
# see https://github.com/actions/setup-python/issues/871
# NOTE(review): the step name interpolates the matrix version, but the
# interpreter installed here is pinned to 3.12 on every Linux leg.
- name: "Set up Python ${{ matrix.python-version }}"
  id: setup-python
  if: ${{ startsWith(matrix.host-platform, 'linux') }}
  uses: actions/setup-python@v5
  with:
    python-version: '3.12'
# Windows legs only: run the MSVC developer-command setup action.
- name: Set up MSVC
  uses: ilammy/msvc-dev-cmd@v1
  if: ${{ startsWith(matrix.host-platform, 'win') }}
# Derive the per-leg build settings (cibuildwheel selector, artifact names and
# directories, cache-vs-artifact mode) and export them through GITHUB_ENV.
- name: Set environment variables
  run: |
    # e.g. "3.12" -> "312", used in the CPython tag and in artifact names
    PYTHON_VERSION_FORMATTED=$(echo '${{ matrix.python-version }}' | tr -d '.')
    if [[ "${{ matrix.host-platform }}" == linux* ]]; then
      CIBW_BUILD="cp${PYTHON_VERSION_FORMATTED}-manylinux*"
      REPO_DIR=$(pwd)
    elif [[ "${{ matrix.host-platform }}" == win* ]]; then
      CIBW_BUILD="cp${PYTHON_VERSION_FORMATTED}-win_amd64"
      # Convert to a Windows-style path without clobbering the shell's own PWD.
      REPO_DIR=$(cygpath -w "$(pwd)")
    fi
    echo "PARALLEL_LEVEL=$(nproc)" >> $GITHUB_ENV
    CUDA_CORE_ARTIFACT_BASENAME="cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ matrix.host-platform }}"
    echo "CUDA_CORE_ARTIFACT_BASENAME=${CUDA_CORE_ARTIFACT_BASENAME}" >> $GITHUB_ENV
    echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
    echo "CUDA_CORE_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_core/dist")" >> $GITHUB_ENV
    CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ matrix.cuda-version }}-${{ matrix.host-platform }}"
    echo "CUDA_BINDINGS_ARTIFACT_BASENAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}" >> $GITHUB_ENV
    echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
    echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV
    echo "CIBW_BUILD=${CIBW_BUILD}" >> $GITHUB_ENV
    # When the CI is run due to merging to main, we want it to populate GHA Cache not Artifacts,
    # so that CI workflows running on every branch have a fallback to use.
    # (The temporary unconditional USE_CACHE=1 override used to exercise the
    # cache path in a PR has been removed, so non-main runs upload artifacts.)
    if [[ "${{ github.ref_name }}" == main ]]; then
      echo "USE_CACHE=1" >> $GITHUB_ENV
    else
      echo "USE_CACHE=0" >> $GITHUB_ENV
    fi
# Make sure the executables needed for the GHA Cache path exist, installing
# the matching apt packages only when at least one of them is missing.
- name: Install dependencies
  if: ${{ env.USE_CACHE == '1' }}
  run: |
    # For GHA Cache
    dependencies=(zstd)
    dependent_exes=(zstd)
    not_found=0
    for dep in "${dependent_exes[@]}"; do
      # Probe the executable being iterated over (previously this always
      # probed `curl`, so a missing zstd went unnoticed).
      if ! command -v "$dep" >/dev/null 2>&1; then
        not_found=1
        break
      fi
    done
    if [[ $not_found == 0 ]]; then
      echo "All dependencies are found. Do nothing."
      exit 0
    fi
    # Use sudo when it exists; without it we must already be root.
    if command -v sudo >/dev/null 2>&1; then
      SUDO="sudo"
    elif [[ $EUID == 0 ]]; then
      SUDO=""
    else
      echo "The following oprations require root access."
      exit 1
    fi
    # $SUDO is intentionally unquoted so an empty value expands to nothing.
    $SUDO apt update
    $SUDO apt install -y "${dependencies[@]}"
# Print every environment variable visible to the step, for debugging.
- name: Dump environment
  run: env
# Build the cuda.core wheel with cibuildwheel for the interpreter selected by
# CIBW_BUILD, writing the result into the cuda.core artifacts directory.
- name: Build cuda.core wheel
  uses: pypa/cibuildwheel@v2.22.0
  with:
    package-dir: ./cuda_core/
    output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
  env:
    CIBW_BUILD: ${{ env.CIBW_BUILD }}
    CIBW_ARCHS_LINUX: 'native'
    CIBW_BUILD_VERBOSITY: 1
# Take ownership of the output directory, then list it recursively.
- name: List the cuda.core artifacts directory
  run: |
    # Non-Windows legs go through sudo for chown; Windows legs call it directly.
    CHOWN="sudo chown"
    if [[ "${{ matrix.host-platform }}" == win* ]]; then
      CHOWN=chown
    fi
    export CHOWN
    $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
    ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
# Run `twine check` over the freshly built cuda.core wheel(s).
- name: Check cuda.core wheel
  run: |
    pip install twine
    twine check ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl
# Artifact mode (USE_CACHE == '0'): publish the cuda.core wheel as a workflow
# artifact and fail the step if no wheel was produced.
- name: Upload cuda.core build artifacts
  if: ${{ env.USE_CACHE == '0' }}
  uses: actions/upload-artifact@v4
  with:
    name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
    path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl
    overwrite: 'true'
    if-no-files-found: error
# Cache mode (USE_CACHE == '1'): pack the cuda.core output directory into one
# archive and drop a stale cache entry that already uses the same key.
- name: Prepare cuda.core cache
  if: ${{ env.USE_CACHE == '1' }}
  env:
    # The GitHub CLI needs a token in the environment to talk to the API.
    # NOTE(review): listing/deleting caches presumably also needs the
    # `actions: write` permission on this job - confirm and grant it there.
    GH_TOKEN: ${{ github.token }}
  run: |
    # this file is uploaded to GHA Cache
    tar -c -f "${{ env.CUDA_CORE_ARTIFACT_BASENAME }}.tar.gz" -C "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" .
    du -h "${{ env.CUDA_CORE_ARTIFACT_BASENAME }}.tar.gz"
    # check if the previous runs from the same PR have populated the cache, if so need to clean it up
    CACHE_KEY="${{ env.CUDA_CORE_ARTIFACT_NAME }}"
    if [ $(gh cache list | grep "$CACHE_KEY" | wc -l) == "1" ]; then
      gh cache delete "$CACHE_KEY"
    fi
# Cache mode: save the packed cuda.core archive under the per-commit key.
- name: Cache cuda.core build artifacts
  uses: actions/cache/save@v4
  if: ${{ env.USE_CACHE == '1' }}
  with:
    path: ${{ env.CUDA_CORE_ARTIFACT_BASENAME }}.tar.gz
    key: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
# Run the repository-local fetch_ctk action for this platform and CUDA
# version; a failure here fails the job.
- name: Set up mini CTK
  continue-on-error: false
  uses: ./.github/actions/fetch_ctk
  with:
    cuda-version: ${{ matrix.cuda-version }}
    host-platform: ${{ matrix.host-platform }}
# Build the cuda.bindings wheel with cibuildwheel, passing the CUDA location
# (and, on Linux, the parallelism level) into the build environment.
- name: Build cuda.bindings wheel
  uses: pypa/cibuildwheel@v2.22.0
  with:
    package-dir: ./cuda_bindings/
    output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
  env:
    CIBW_BUILD: ${{ env.CIBW_BUILD }}
    CIBW_ARCHS_LINUX: 'native'
    CIBW_BUILD_VERBOSITY: 1
    # CIBW mounts the host filesystem under /host
    CIBW_ENVIRONMENT_LINUX: >
      CUDA_PATH=/host/${{ env.CUDA_PATH }}
      PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }}
    CIBW_ENVIRONMENT_WINDOWS: >
      CUDA_HOME="$(cygpath -w ${{ env.CUDA_PATH }})"
    # NOTE(review): the next line is assumed to be a disabled YAML comment,
    # not part of the folded value above - confirm against the real file.
    # PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }}
# Take ownership of the cuda.bindings output directory, then list it recursively.
- name: List the cuda.bindings artifacts directory
  run: |
    # Non-Windows legs go through sudo for chown; Windows legs call it directly.
    CHOWN="sudo chown"
    if [[ "${{ matrix.host-platform }}" == win* ]]; then
      CHOWN=chown
    fi
    export CHOWN
    $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
    ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
# TODO: enable this after NVIDIA/cuda-python#297 is resolved
# - name: Check cuda.bindings wheel
#   run: |
#     twine check ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
# Cache mode (USE_CACHE == '1'): pack the cuda.bindings output directory into
# one archive and drop a stale cache entry that already uses the same key.
- name: Prepare cuda.bindings cache
  if: ${{ env.USE_CACHE == '1' }}
  env:
    # The GitHub CLI needs a token in the environment to talk to the API.
    # NOTE(review): listing/deleting caches presumably also needs the
    # `actions: write` permission on this job - confirm and grant it there.
    GH_TOKEN: ${{ github.token }}
  run: |
    # this file is uploaded to GHA Cache
    tar -c -f "${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}.tar.gz" -C "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" .
    du -h "${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}.tar.gz"
    # check if the previous runs from the same PR have populated the cache, if so need to clean it up
    CACHE_KEY="${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}"
    if [ $(gh cache list | grep "$CACHE_KEY" | wc -l) == "1" ]; then
      gh cache delete "$CACHE_KEY"
    fi
# Artifact mode (USE_CACHE == '0'): publish the cuda.bindings wheel as a
# workflow artifact and fail the step if no wheel was produced.
- name: Upload cuda.bindings build artifacts
  if: ${{ env.USE_CACHE == '0' }}
  uses: actions/upload-artifact@v4
  with:
    name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
    path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
    overwrite: 'true'
    if-no-files-found: error
# Cache mode: save the packed cuda.bindings archive under the per-commit key.
- name: Cache cuda.bindings build artifacts
  uses: actions/cache/save@v4
  if: ${{ env.USE_CACHE == '1' }}
  with:
    path: ${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}.tar.gz
    key: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
# Publish the build-time CUDA version as a step output; the job exposes it to
# downstream jobs as BUILD_CTK_VER.
- name: Pass environment variables to the next runner
  id: pass_env
  run: echo "CUDA_VERSION=${{ matrix.cuda-version }}" >> $GITHUB_OUTPUT
# Configuration of the `test` job: runs on GPU runners inside a container,
# after `build`, once per (platform, Python, test-time CUDA, runner) leg.
test:
  needs:
    - build
  strategy:
    fail-fast: false
    # TODO: add driver version here
    matrix:
      host-platform:
        - linux-64
        - linux-aarch64
        # TODO: enable testing once win-64 GPU runners are up
        # - win-64
      python-version:
        - "3.13"
        - "3.12"
        - "3.11"
        - "3.10"
        - "3.9"
      cuda-version:
        # Note: this is for test-time only.
        - "12.6.2"
        - "12.0.1"
        - "11.8.0"
      runner:
        - default
      # One extra leg on the H100 runner.
      include:
        - host-platform: linux-64
          python-version: "3.12"
          cuda-version: "12.6.2"
          runner: H100
  name: "Test (${{ matrix.host-platform }}, Python ${{ matrix.python-version }}, CUDA ${{ matrix.cuda-version }}, Runner ${{ matrix.runner }})"
  # The build stage could fail but we want the CI to keep moving.
  if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
  permissions:
    id-token: write  # This is required for configure-aws-credentials
    contents: read  # This is required for actions/checkout
  # Pick the GPU runner label from the (runner, platform) pair.
  runs-on: >-
    ${{ (matrix.runner == 'default' && matrix.host-platform == 'linux-64' && 'linux-amd64-gpu-v100-latest-1') ||
    (matrix.runner == 'default' && matrix.host-platform == 'linux-aarch64' && 'linux-arm64-gpu-a100-latest-1') ||
    (matrix.runner == 'H100' && 'linux-amd64-gpu-h100-latest-1-testing') }}
  # Our self-hosted runners require a container
  # TODO: use a different (nvidia?) container
  container:
    image: 'ubuntu:22.04'
    options: '-u root --security-opt seccomp=unconfined --shm-size 16g'
    env:
      NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
  defaults:
    run:
      shell: bash --noprofile --norc -xeuo pipefail {0}
  steps:
# Fail fast if the GPU is not usable from inside the container.
- name: Ensure GPU is working
  run: |
    nvidia-smi
# Check out the repository; fetch-depth 0 requests the full history.
- name: "Checkout ${{ github.event.repository.name }}"
  uses: actions/checkout@v4
  with:
    fetch-depth: 0
# Recompute the artifact names and directories used by the build job and
# decide whether the cuda.bindings tests apply to this leg; export everything
# through GITHUB_ENV.
- name: Set environment variables
  run: |
    # e.g. "3.12" -> "312", used in artifact names
    PYTHON_VERSION_FORMATTED=$(echo '${{ matrix.python-version }}' | tr -d '.')
    if [[ "${{ matrix.host-platform }}" == linux* ]]; then
      REPO_DIR=$(pwd)
    elif [[ "${{ matrix.host-platform }}" == win* ]]; then
      # Convert to a Windows-style path without clobbering the shell's own PWD.
      REPO_DIR=$(cygpath -w "$(pwd)")
    fi
    # cuda.bindings is only tested when build-time and test-time CUDA majors match.
    BUILD_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ needs.build.outputs.BUILD_CTK_VER }})"
    TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ matrix.cuda-version }})"
    if [[ $BUILD_CUDA_MAJOR != $TEST_CUDA_MAJOR ]]; then
      SKIP_CUDA_BINDINGS_TEST=1
    else
      SKIP_CUDA_BINDINGS_TEST=0
    fi
    # make outputs from the previous job as env vars
    CUDA_CORE_ARTIFACT_BASENAME="cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ matrix.host-platform }}"
    echo "CUDA_CORE_ARTIFACT_BASENAME=${CUDA_CORE_ARTIFACT_BASENAME}" >> $GITHUB_ENV
    echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
    echo "CUDA_CORE_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_core/dist")" >> $GITHUB_ENV
    # The build job names the bindings artifact after its *build-time* CUDA
    # version, so use that here; the test-time version would point at
    # artifacts that were never produced for the 12.0.1 / 11.8.0 legs.
    CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ needs.build.outputs.BUILD_CTK_VER }}-${{ matrix.host-platform }}"
    echo "CUDA_BINDINGS_ARTIFACT_BASENAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}" >> $GITHUB_ENV
    echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
    echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV
    echo "SKIP_CUDA_BINDINGS_TEST=${SKIP_CUDA_BINDINGS_TEST}" >> $GITHUB_ENV
# We'll try GHA Artifacts first, and then fall back to GHA Cache
- name: Download cuda.bindings build artifacts
  id: cuda-bindings-download
  # A missing artifact must not fail the job: the cache fallback below takes
  # over, and the remaining steps still run under their implicit success().
  continue-on-error: true
  uses: actions/download-artifact@v4
  with:
    name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
    path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
- name: Restore cuda.bindings cache
  # `outcome` is the result before continue-on-error is applied.
  if: ${{ steps.cuda-bindings-download.outcome == 'failure' }}
  id: cuda-bindings-cache
  uses: actions/cache/restore@v4
  with:
    key: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
    path: ${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}.tar.gz
    restore-keys: ${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}
    fail-on-cache-miss: true
- name: Report cache restore status (hit)
  # Run whenever the restore step succeeded. `cache-hit` is only 'true' for
  # an exact key match, so it would skip archives found through restore-keys.
  if: ${{ steps.cuda-bindings-cache.outcome == 'success' }}
  run: |
    echo "cache is found"
    CACHE_DIR="${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}"
    CACHE_ARCHIVE="${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}.tar.gz"
    ls -l $CACHE_ARCHIVE
    mkdir -p $CACHE_DIR
    # Unpack into the artifacts directory, then remove the archive.
    du -h $CACHE_ARCHIVE &&
    tar -x -f $CACHE_ARCHIVE -C $CACHE_DIR &&
    rm -f $CACHE_ARCHIVE || echo "WARNING: cache could not be retrieved."
# Show the working directory and the recursive listing of the fetched
# cuda.bindings artifacts.
- name: Display structure of downloaded cuda.bindings artifacts
  run: |
    pwd
    ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR
# Same two-stage fetch as for cuda.bindings: GHA Artifacts first, GHA Cache second.
- name: Download cuda.core build artifacts
  id: cuda-core-download
  # A missing artifact must not fail the job: the cache fallback below takes
  # over, and the remaining steps still run under their implicit success().
  continue-on-error: true
  uses: actions/download-artifact@v4
  with:
    name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
    path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
- name: Restore cuda.core cache
  # `outcome` is the result before continue-on-error is applied.
  if: ${{ steps.cuda-core-download.outcome == 'failure' }}
  id: cuda-core-cache
  uses: actions/cache/restore@v4
  with:
    key: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
    path: ${{ env.CUDA_CORE_ARTIFACT_BASENAME }}.tar.gz
    restore-keys: ${{ env.CUDA_CORE_ARTIFACT_BASENAME }}
    fail-on-cache-miss: true
- name: Report cache restore status (hit)
  # Run whenever the restore step succeeded. `cache-hit` is only 'true' for
  # an exact key match, so it would skip archives found through restore-keys.
  if: ${{ steps.cuda-core-cache.outcome == 'success' }}
  run: |
    echo "cache is found"
    CACHE_DIR="${{ env.CUDA_CORE_ARTIFACTS_DIR }}"
    CACHE_ARCHIVE="${{ env.CUDA_CORE_ARTIFACT_BASENAME }}.tar.gz"
    ls -l $CACHE_ARCHIVE
    mkdir -p $CACHE_DIR
    # Unpack into the artifacts directory, then remove the archive.
    du -h $CACHE_ARCHIVE &&
    tar -x -f $CACHE_ARCHIVE -C $CACHE_DIR &&
    rm -f $CACHE_ARCHIVE || echo "WARNING: cache could not be retrieved."
# Show the working directory and the recursive listing of the fetched
# cuda.core artifacts.
- name: Display structure of downloaded cuda.core build artifacts
  run: |
    pwd
    ls -lahR $CUDA_CORE_ARTIFACTS_DIR
# Install the interpreter matching this matrix leg.
- name: "Set up Python ${{ matrix.python-version }}"
  uses: actions/setup-python@v5
  env:
    # we use self-hosted runners on which setup-python behaves weirdly...
    AGENT_TOOLSDIRECTORY: '/opt/hostedtoolcache'
  with:
    python-version: ${{ matrix.python-version }}
# Run the repository-local fetch_ctk action for the test-time CUDA version;
# a failure here fails the job.
- name: Set up mini CTK
  continue-on-error: false
  uses: ./.github/actions/fetch_ctk
  with:
    cuda-version: ${{ matrix.cuda-version }}
    host-platform: ${{ matrix.host-platform }}
# Install the built cuda.bindings wheel and run its Python and Cython test
# suites. Skipped when the build-time and test-time CUDA majors differ.
- name: Run cuda.bindings tests
  if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
  run: |
    ls $CUDA_PATH
    pushd "${CUDA_BINDINGS_ARTIFACTS_DIR}"
    pip install *.whl
    popd
    pushd ./cuda_bindings
    pip install -r requirements.txt
    pytest -rxXs tests/
    case "${{ matrix.host-platform }}" in
      linux*)
        # cython tests require gcc
        apt install -y build-essential
        bash tests/cython/build_tests.sh
        ;;
      win*)
        # TODO: enable this once win-64 runners are up
        exit 1
        ;;
    esac
    pytest -rxXs tests/cython
    popd
# Install the built cuda.core wheel with the extra matching the test-time
# CUDA major, then run the cuda.core test suite.
- name: Run cuda.core tests
  run: |
    # TODO: remove this hack once cuda-python has a cp313 build
    if [[ ${{ matrix.python-version }} == "3.13" && $SKIP_CUDA_BINDINGS_TEST == 1 ]]; then
      echo "Python 3.13 + cuda-python ${{ matrix.cuda-version }} is not supported, skipping the test..."
      exit 0
    fi
    # If build/test majors match: cuda.bindings is installed in the previous step.
    # If mismatch: cuda.bindings is installed from PyPI.
    TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ matrix.cuda-version }})"
    pushd "${CUDA_CORE_ARTIFACTS_DIR}"
    pip install $(ls *.whl)["cu${TEST_CUDA_MAJOR}"]
    popd
    pushd ./cuda_core
    pip install -r "tests/requirements-cu${TEST_CUDA_MAJOR}.txt"
    pytest -rxXs tests/
    popd
# Docs job: calls the reusable build-docs workflow after `build`, handing it
# the build-time CUDA version and the caller's secrets.
doc:
  name: Docs
  needs:
    - build
  # The build stage could fail but we want the CI to keep moving.
  if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
  # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
  permissions:
    id-token: write
    contents: write
  uses: ./.github/workflows/build-docs.yml
  with:
    build_ctk_ver: ${{ needs.build.outputs.BUILD_CTK_VER }}
  secrets: inherit