From c504c77dae6ffbf664e52aacf8911f0ab2a01e84 Mon Sep 17 00:00:00 2001
From: Mike S Wang <32841762+MikeSWang@users.noreply.github.com>
Date: Wed, 11 Sep 2024 00:24:40 +0100
Subject: [PATCH] BUILD(cfg): Add build toolchain for CUDA variant

---
 .github/workflows/cd_cuda.yml                 | 140 ++++++++++++++----
 .pyproject_cuda.toml                          |  10 +-
 .../conda_recipe_cuda/conda_build_config.yaml |   3 +
 deploy/pkg/conda_recipe_cuda/meta.yaml        |  30 ++--
 setup.py                                      |  19 ++-
 5 files changed, 159 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/cd_cuda.yml b/.github/workflows/cd_cuda.yml
index 11fafa58..253b1d8d 100644
--- a/.github/workflows/cd_cuda.yml
+++ b/.github/workflows/cd_cuda.yml
@@ -23,6 +23,11 @@ on:
         type: boolean
         default: false
         required: false
+      run_build_bdist_cuda_xplat:
+        description: 'Run job build_bdist_cuda_xplat'
+        type: boolean
+        default: false
+        required: false
       run_build_conda_cuda:
         description: 'Run job build_conda_cuda'
         type: boolean
@@ -39,7 +44,7 @@ concurrency:
 
 jobs:
   build_bdist_cuda:
-    name: Build bdist wheel
+    name: Build bdist wheel (CUDA)
 
     strategy:
       fail-fast: false
@@ -76,12 +81,6 @@ jobs:
           package-dir: .
           output-dir: dist/
           config-file: pyproject.toml
-        env:
-          CIBW_ENVIRONMENT: >-
-            PY_CXX=nvcc
-            PY_OMP=1
-            PY_CUDA=1
-            PY_BUILD_PARALLEL='-j'
 
       - name: Verify built distribution
         run: python -m twine check --strict dist/*
@@ -98,8 +97,106 @@ jobs:
           name: pypi_dist_${{ runner.os }}_${{ runner.arch }}_${{ github.ref_name }}_cuda
           path: dist/*.whl
 
+  build_bdist_cuda_xplat:
+    name: Build bdist wheel (CUDA, cross-platform)
+
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+
+    runs-on: ${{ matrix.os }}
+
+    timeout-minutes: 100
+
+    # Allow manual trigger from inputs.
+    if: >
+      github.event_name != 'workflow_dispatch' ||
+      github.event.inputs.run_build_bdist_cuda_xplat == 'true'
+
+    steps:
+      - name: Extract platform architecture
+        run: |
+          if [[ $(uname -m) == 'aarch64' ]]; then
+            echo "TARGET_ARCH=x86_64" >> "${GITHUB_ENV}"
+          elif [[ $(uname -m) == 'x86_64' ]]; then
+            echo "TARGET_ARCH=aarch64" >> "${GITHUB_ENV}"
+          fi
+
+      - name: Checkout (automatic trigger)
+        if: github.event_name != 'workflow_dispatch'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Checkout (manual trigger)
+        if: >
+          github.event.inputs.run_build_bdist_cuda_xplat == 'true' &&
+          github.event.inputs.version_tag == ''
+        uses: actions/checkout@v4
+        with:
+          fetch-tags: true
+
+      - name: Checkout (manual trigger with tag)
+        if: >
+          github.event.inputs.run_build_bdist_cuda_xplat == 'true' &&
+          github.event.inputs.version_tag != ''
+        uses: actions/checkout@v4
+        with:
+          ref: refs/tags/${{ github.event.inputs.version_tag }}
+
+      # Use QEMU for non-native Linux runner.
+      - name: Set up QEMU (Linux)
+        if: runner.os == 'Linux'
+        uses: docker/setup-qemu-action@v3
+        with:
+          # Specify target architecture.
+          platforms: linux/${{ env.TARGET_ARCH }}
+
+      - name: Set up Python 3
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Install packaging requirements
+        run: python -m pip install --upgrade twine
+
+      - name: Copy Python project configuration file
+        run: cp .pyproject_cuda.toml pyproject.toml
+
+      - name: Build built distribution
+        uses: pypa/cibuildwheel@v2.20.0
+        with:
+          package-dir: .
+          output-dir: dist/
+          config-file: pyproject.toml
+        env:
+          # Specify target architecture, which is actually fixed to aarch64.
+          CIBW_ARCHS_LINUX: ${{ env.TARGET_ARCH }}
+          CIBW_MANYLINUX_AARCH64_IMAGE: quay.io/pypa/manylinux_2_24_aarch64
+          CIBW_BEFORE_ALL_LINUX: >
+            apt-get -y install libgsl-dev &&
+            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb &&
+            dpkg -i cuda-keyring_1.1-1_all.deb &&
+            apt-get update && apt-get -y install cuda-cross-aarch64
+
+      - name: Verify built distribution
+        run: python -m twine check --strict dist/*
+
+      - name: Save wheel to bdist
+        uses: actions/upload-artifact@v4
+        with:
+          name: bdist_whl_${{ matrix.os }}_${{ env.TARGET_ARCH }}_${{ github.ref_name }}_cuda
+          path: dist/*.whl
+
+      - name: Save wheel to pypi_dist
+        uses: actions/upload-artifact@v4
+        with:
+          name: pypi_dist_${{ runner.os }}_${{ env.TARGET_ARCH }}_${{ github.ref_name }}_cuda
+          path: dist/*.whl
+
   build_conda_cuda:
-    name: Build Conda package
+    name: Build Conda package (CUDA)
 
     strategy:
       fail-fast: false
@@ -112,7 +209,7 @@ jobs:
 
     if: >
       github.event_name != 'workflow_dispatch' ||
-      github.event.inputs.run_build_conda == 'true'
+      github.event.inputs.run_build_conda_cuda == 'true'
 
     defaults:
       run:
@@ -124,20 +221,13 @@ jobs:
         with:
           fetch-depth: 0
 
-      - uses: Jimver/cuda-toolkit@master
-        id: cuda-toolkit
-        with:
-          log-file-suffix: '${{matrix.os}}_cudatoolkit_log.txt'
-          method: 'network'
-          sub-packages: '["nvcc"]'
-          non-cuda-sub-packages: '["libcufft"]'
-
       - name: Set up (Mini)conda
         uses: conda-incubator/setup-miniconda@v3
         with:
           activate-environment: conda_bld
-          channels: conda-forge
+          channels: conda-forge,nvidia
           channel-priority: strict
+          architecture: ${{ env.TARGET_ARCH }}
 
       - name: Install packaging requirements
         run: |
@@ -149,17 +239,13 @@ jobs:
 
       - name: Override version
         if: github.event_name == 'workflow_dispatch'
-        uses: knicknic/os-specific-run@v1.0.4
         env:
           recipe_file: deploy/pkg/conda_recipe_cuda/meta.yaml
-        with:
-          linux: |
-            vers_tag=${{ github.event.inputs.version_tag }}
-            cuda_vers=${{ steps.cuda-toolkit.outputs.cuda }}
-            if [[ ! -z ${vers_tag} ]]; then
-              sed -i "s/# git_rev:.*/git_rev: ${vers_tag}/g" ${recipe_file}
-              sed -i "s/'CUDA_VERSION', ''/'CUDA_VERSION', '${cuda_vers}'/g" ${recipe_file}
-            fi
+        run: |
+          vers_tag=${{ github.event.inputs.version_tag }}
+          if [[ ! -z ${vers_tag} ]]; then
+            sed -i "s/# git_rev:.*/git_rev: ${vers_tag}/g" ${recipe_file}
+          fi
 
       - name: Build Conda package
         env:
diff --git a/.pyproject_cuda.toml b/.pyproject_cuda.toml
index 752e7bb9..a2789b48 100644
--- a/.pyproject_cuda.toml
+++ b/.pyproject_cuda.toml
@@ -105,8 +105,16 @@ manylinux-aarch64-image = 'manylinux_2_28'
 
 [tool.cibuildwheel.linux]
 before-all = [
-    "yum install -y gsl-devel cuda-toolkit",
+    "yum install -y gsl-devel",
+    # Install CUDA Toolkit inside the Docker container using the package
+    # manager, matching the repository with the image OS, and optionally
+    # matching the CUDA version for Pip/Conda consistency.
+    "yum install -y yum-utils",
+    "yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo",
+    "yum install -y cuda-toolkit",
+    # "yum install -y cuda-toolkit-12-6",
 ]
+environment = { PY_CXX="/usr/local/cuda/bin/nvcc", PY_CXXFLAGS="-I/usr/local/cuda/include", PY_LDFLAGS="-L/usr/local/cuda/lib64", PY_OMP='1', PY_CUDA='1', PY_BUILD_PARALLEL='-j' }
 
 [tool.autopep8]
 in-place = true
diff --git a/deploy/pkg/conda_recipe_cuda/conda_build_config.yaml b/deploy/pkg/conda_recipe_cuda/conda_build_config.yaml
index 311f7da7..a4783487 100644
--- a/deploy/pkg/conda_recipe_cuda/conda_build_config.yaml
+++ b/deploy/pkg/conda_recipe_cuda/conda_build_config.yaml
@@ -1,3 +1,6 @@
 target_platform:
   - linux-64       # [linux]
   - linux-aarch64  # [linux]
+
+cuda_version:
+  - '12.0'
diff --git a/deploy/pkg/conda_recipe_cuda/meta.yaml b/deploy/pkg/conda_recipe_cuda/meta.yaml
index 4cd00606..301b81a0 100644
--- a/deploy/pkg/conda_recipe_cuda/meta.yaml
+++ b/deploy/pkg/conda_recipe_cuda/meta.yaml
@@ -1,6 +1,10 @@
 {% set name = 'Triumvirate-CUDA' %}
 {% set version = environ.get('GIT_DESCRIBE_TAG', 'v0.5.0') %}
-{% set cuda_vers = environ.get('CUDA_VERSION', '') %}
+{# {% set cuda_vers = environ.get('CUDA_VERSION', '12.0') %} #}
+{# {% set cuda_vers_parts = cuda_vers.split('.') %} #}
+{# {% set cuda_vers_major_minor = cuda_vers_parts[0] ~ cuda_vers_parts[1] %} #}
+{# {% set cuda_vers_int = cuda_vers_major_minor|int %} #}
+{# {% set cuda_path = environ.get('CUDA_PATH', '/usr/local/cuda') %} #}
 
 package:
   name: "{{ name|lower }}"
@@ -13,22 +17,23 @@ source:
 
 build:
   number: 0
-  string: cuda{{ cuda_vers }}_py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}
+  string: cuda{{ cuda_version|replace(".", "") }}py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}
   script:
-    # Use Conda-provided compiler.
-    - export PY_CXX=$CXX
-    - export PY_CXXFLAGS="${PY_CXXFLAGS} -D_LIBCPP_DISABLE_AVAILABILITY"
-    # Enforce OpenMP support.
-    - export PY_OMP=1
-    - export PY_CUDA=1
     - cp .pyproject_cuda.toml pyproject.toml
     - {{ PYTHON }} -m pip install . -vvv
   script_env:
-    - PY_BUILD_PARALLEL="-j"
+    # Enforce OpenMP and CUDA support.
+    - PY_OMP=1
+    - PY_CUDA=1
+    - PY_BUILD_PARALLEL='-j'
 
 requirements:
   build:
-    - {{ compiler('cuda') }}
+    # Use Nvidia channel libraries.
+    - cuda-nvcc
+    - libcufft-dev
+    # Use conda-forge channel libraries.
+    - cuda-version {{ cuda_version }}
     - python                              # [build_platform != target_platform]
     - cross-python_{{ target_platform }}  # [build_platform != target_platform]
     - numpy >=2.0                         # [build_platform != target_platform]
@@ -38,8 +43,10 @@ requirements:
   host:
     - python
     - pip >=22.0
-    # Use Conda-provided libraries.
+    # Use conda-forge channel libraries.
+    - cuda-version {{ cuda_version }}
     - gsl >=2.7
+    - libcufft
     - libgomp  # [linux]
     - setuptools >=61.0
     - setuptools_scm >=6.4
@@ -47,6 +54,7 @@ requirements:
     - extension-helpers >=1.1
     - numpy >=2.0
   run:
+    - {{ pin_compatible('cuda-version', min_pin='x', max_pin='x.x') }}
     - python >=3.10
     - numpy >=1.23
     - scipy >=1.13
diff --git a/setup.py b/setup.py
index e886b8c4..1ad78cf1 100644
--- a/setup.py
+++ b/setup.py
@@ -131,10 +131,21 @@ def get_pkg_version_scheme(default_ver_scheme='no-guess-dev',
 # Build
 # ========================================================================
 
-NA_OPTS = ('-Wstrict-prototypes', '-Wl,-pie',)  # noqa: E231
-CUDA_XCOMPILER_OPTS = ('-f', '-O', '-W',)  # noqa: E231
-CUDA_XCOMPILER_OPTS_EXACT = ['-pthread', '-B',]  # noqa: E231
-CUDA_XCOMPILER_OPTS_PARTIAL = ['compiler_compat',]  # noqa: E231
+NA_OPTS = (
+    '-Wstrict-prototypes',
+    '-Wl,-pie',
+)
+CUDA_XCOMPILER_OPTS = (
+    '-f', '-O', '-W',
+    '-march', '-mtune', '-pipe',
+)
+CUDA_XCOMPILER_OPTS_EXACT = [
+    '-pthread',
+    '-B',
+]
+CUDA_XCOMPILER_OPTS_PARTIAL = [
+    'compiler_compat',
+]
 
 
 class BuildExt(build_ext):