diff --git a/.github/runs-on.yml b/.github/runs-on.yml new file mode 100644 index 0000000..2b0faaa --- /dev/null +++ b/.github/runs-on.yml @@ -0,0 +1,18 @@ +# Custom images with CUDA toolkit installed +# See vm_images/ for instructions for building the images +images: + linux-amd64: + platform: "linux" + arch: "x64" + owner: "492475357299" # XGBoost CI + name: "xgboost-ci-runs-on-linux-*" + +runners: + linux-amd64-cpu: + cpu: 32 + family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] + image: linux-amd64 + linux-arm64-cpu: + cpu: 32 + family: ["c6g", "c7g"] + image: ubuntu24-full-arm64 diff --git a/.github/workflows/containers.yml b/.github/workflows/containers.yml new file mode 100644 index 0000000..51693ea --- /dev/null +++ b/.github/workflows/containers.yml @@ -0,0 +1,64 @@ +name: Build and publish image +run-name: "Build image${{ (inputs.upstream_repository != '') && format(' - triggered by: {0}', inputs.upstream_repository) || '' }}" + +on: + workflow_dispatch: + inputs: + upstream_repository: + required: false + type: string + upstream_job: + required: false + type: string + push: + branches: + - main + pull_request: + schedule: + - cron: "0 7 * * *" # Run once daily + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + PUBLISH_CONTAINER: 1 + +jobs: + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=build-containers-${{ matrix.container_id }} + strategy: + fail-fast: false + matrix: + container_id: + - xgb-ci.clang_tidy + - xgb-ci.cpu + - xgb-ci.gpu + - xgb-ci.gpu_build_r_rockylinux8 + - xgb-ci.gpu_build_rockylinux8 + - xgb-ci.gpu_build_rockylinux8_dev_ver + - xgb-ci.jvm + - xgb-ci.jvm_gpu_build + - xgb-ci.manylinux_2_28_x86_64 + - 
xgb-ci.manylinux2014_x86_64 + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.aarch64 + runner: linux-arm64-cpu + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash containers/docker_build.sh ${{ matrix.container_id }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/containers/ci_container.yml b/containers/ci_container.yml new file mode 100644 index 0000000..a7cef24 --- /dev/null +++ b/containers/ci_container.yml @@ -0,0 +1,72 @@ +## List of CI containers with definitions and build arguments + +# Each container will be built using the definition from +# containers/dockerfile/Dockerfile.CONTAINER_DEF + +rapids_versions: + stable: &rapids_version "24.10" + dev: &dev_rapids_version "24.12" + +xgb-ci.gpu_build_rockylinux8: + container_def: gpu_build_rockylinux8 + build_args: + CUDA_VERSION: "12.4.1" + NCCL_VERSION: "2.23.4-1" + RAPIDS_VERSION: *rapids_version + +xgb-ci.gpu_build_rockylinux8_dev_ver: + container_def: gpu_build_rockylinux8 + build_args: + CUDA_VERSION: "12.4.1" + NCCL_VERSION: "2.23.4-1" + RAPIDS_VERSION: *dev_rapids_version + +xgb-ci.gpu_build_r_rockylinux8: + container_def: gpu_build_r_rockylinux8 + build_args: + CUDA_VERSION: "12.4.1" + R_VERSION: "4.3.2" + +xgb-ci.gpu: + container_def: gpu + build_args: + CUDA_VERSION: "12.4.1" + NCCL_VERSION: "2.23.4-1" + RAPIDS_VERSION: *rapids_version + +xgb-ci.gpu_dev_ver: + container_def: gpu + build_args: + CUDA_VERSION: "12.4.1" + NCCL_VERSION: "2.23.4-1" + RAPIDS_VERSION: *dev_rapids_version + RAPIDSAI_CONDA_CHANNEL: "rapidsai-nightly" + +xgb-ci.clang_tidy: + container_def: clang_tidy + build_args: + CUDA_VERSION: "12.4.1" + +xgb-ci.cpu: + 
container_def: cpu + +xgb-ci.aarch64: + container_def: aarch64 + +xgb-ci.manylinux_2_28_x86_64: + container_def: manylinux_2_28_x86_64 + +xgb-ci.manylinux2014_x86_64: + container_def: manylinux2014_x86_64 + +xgb-ci.manylinux2014_aarch64: + container_def: manylinux2014_aarch64 + +xgb-ci.jvm: + container_def: jvm + +xgb-ci.jvm_gpu_build: + container_def: jvm_gpu_build + build_args: + CUDA_VERSION: "12.4.1" + NCCL_VERSION: "2.23.4-1" diff --git a/containers/conda_env/aarch64_test.yml b/containers/conda_env/aarch64_test.yml new file mode 100644 index 0000000..14305eb --- /dev/null +++ b/containers/conda_env/aarch64_test.yml @@ -0,0 +1,34 @@ +name: aarch64_test +channels: +- conda-forge +dependencies: +- python=3.10 +- pip +- wheel +- pytest +- pytest-cov +- numpy +- scipy +- scikit-learn +- pandas +- matplotlib +- dask +- distributed +- hypothesis +- graphviz +- python-graphviz +- codecov +- cmake +- ninja +- boto3 +- jsonschema +- awscli +- numba +- llvmlite +- loky +- pyarrow +- pyspark>=3.4.0 +- cloudpickle +- pip: + - awscli + - auditwheel diff --git a/containers/conda_env/linux_cpu_test.yml b/containers/conda_env/linux_cpu_test.yml new file mode 100644 index 0000000..1ec2a54 --- /dev/null +++ b/containers/conda_env/linux_cpu_test.yml @@ -0,0 +1,43 @@ +name: linux_cpu_test +channels: +- conda-forge +dependencies: +- python=3.10 +- cmake +- c-compiler +- cxx-compiler +- ninja +- pip +- wheel +- pyyaml +- cpplint +- pylint +- numpy +- scipy +- scikit-learn>=1.4.1 +- pandas +- matplotlib +- dask<=2024.10.0 +- distributed<=2024.10.0 +- python-graphviz +- hypothesis>=6.46 +- astroid +- sh +- mock +- pytest +- pytest-timeout +- pytest-cov +- python-kubernetes +- urllib3 +- jsonschema +- boto3 +- awscli +- py-ubjson +- loky +- pyarrow +- protobuf +- cloudpickle +- modin +- pyspark>=3.4.0 +- pip: + - datatable diff --git a/containers/docker_build.py b/containers/docker_build.py new file mode 100644 index 0000000..907cdd5 --- /dev/null +++ 
b/containers/docker_build.py @@ -0,0 +1,140 @@ +""" +Wrapper script to build a Docker container +""" + +import argparse +import itertools +import pathlib +import subprocess +import sys +import textwrap + +PROJECT_ROOT_DIR = pathlib.Path(__file__).parent.parent +LINEWIDTH = 88 +TEXT_WRAPPER = textwrap.TextWrapper( + width=LINEWIDTH, + initial_indent="", + subsequent_indent=" ", + break_long_words=False, + break_on_hyphens=False, +) + + +def fancy_print_cli_args(cli_args: list[str]) -> None: + print( + "=" * LINEWIDTH + + "\n" + + " \\\n".join(TEXT_WRAPPER.wrap(" ".join(cli_args))) + + "\n" + + "=" * LINEWIDTH + + "\n", + flush=True, + ) + + +def parse_build_args(*, raw_build_args: list[str]) -> dict[str, str]: + parsed_build_args = dict() + for arg in raw_build_args: + try: + key, value = arg.split("=", maxsplit=1) + except ValueError as e: + raise ValueError( + f"Build argument must be of form KEY=VALUE. Got: {arg}" + ) from e + parsed_build_args[key] = value + return parsed_build_args + + +def docker_build( + *, + container_tag: str, + build_args: dict[str, str], + dockerfile_path: pathlib.Path, + docker_context_path: pathlib.Path, +) -> None: + ## Set up command-line arguments to be passed to `docker build` + # Build args + docker_build_cli_args = list( + itertools.chain.from_iterable( + [["--build-arg", f"{k}={v}"] for k, v in build_args.items()] + ) + ) + # When building an image using a non-default driver, we need to specify + # `--load` to load it to the image store. 
+ # See https://docs.docker.com/build/builders/drivers/ + docker_build_cli_args.append("--load") + # Remaining CLI args + docker_build_cli_args.extend( + [ + "--progress=plain", + "--ulimit", + "nofile=1024000:1024000", + "-t", + container_tag, + "-f", + str(dockerfile_path), + str(docker_context_path), + ] + ) + cli_args = ["docker", "build"] + docker_build_cli_args + fancy_print_cli_args(cli_args) + subprocess.run(cli_args, check=True, encoding="utf-8") + + +def main(*, args: argparse.Namespace) -> None: + docker_context_path = PROJECT_ROOT_DIR / "containers" + dockerfile_path = ( + docker_context_path / "dockerfile" / f"Dockerfile.{args.container_def}" + ) + + build_args = parse_build_args(raw_build_args=args.build_arg) + + docker_build( + container_tag=args.container_tag, + build_args=build_args, + dockerfile_path=dockerfile_path, + docker_context_path=docker_context_path, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Build a Docker container") + parser.add_argument( + "--container-def", + type=str, + required=True, + help=( + "String uniquely identifying the container definition. The container " + "definition will be fetched from " + "containers/dockerfile/Dockerfile.CONTAINER_DEF." + ), + ) + parser.add_argument( + "--container-tag", + type=str, + required=True, + help=( + "Tag to assign to the newly built container, e.g. " + "492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:master" + ), + ) + parser.add_argument( + "--build-arg", + type=str, + default=[], + action="append", + help=( + "Build-time variable(s) to be passed to `docker build`. Each variable " + "should be specified as a key-value pair in the form KEY=VALUE. " + "The variables should match the ARG instructions in the Dockerfile. " + "When passing multiple variables, specify --build-arg multiple times. " + "Example: --build-arg CUDA_VERSION=12.5 --build-arg RAPIDS_VERSION=24.10" + ), + ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + parsed_args = parser.parse_args() + main(args=parsed_args) diff --git a/containers/docker_build.sh b/containers/docker_build.sh new file mode 100755 index 0000000..07f7376 --- /dev/null +++ b/containers/docker_build.sh @@ -0,0 +1,125 @@ +#!/bin/bash +## Build a CI container and push it to AWS ECR (Elastic Container Registry). +## This script provides a convenient wrapper for containers/docker_build.py. +## Build-time variables (--build-arg) and container definition are fetched from +## containers/ci_container.yml. +## +## Note. This script takes in some inputs via environment variables. + +USAGE_DOC=$( +cat <<-EOF +Usage: containers/docker_build.sh [container_id] + +where [container_id] is used to fetch the container definition and build-time variables +from containers/ci_container.yml. + +In addition, the following environment variables should be set. + - BRANCH_NAME: Name of the current git branch or pull request (Required) + - GITHUB_SHA: Git commit hash (Required) + - PUBLISH_CONTAINER: If set to 1, push the container to AWS ECR (Optional). + This option requires appropriate AWS credentials to be + configured. +EOF +) + +# Use AWS ECR (Elastic Container Registry) to host Docker containers. +# Configure ECR to delete containers older than 30 days. 
+ECR_AWS_ACCOUNT_ID="492475357299" +ECR_AWS_REGION="us-west-2" +ECR_URL="${ECR_AWS_ACCOUNT_ID}.dkr.ecr.${ECR_AWS_REGION}.amazonaws.com" +ECR_LIFECYCLE_RULE=$( +cat <<-EOF +{ + "rules": [ + { + "rulePriority": 1, + "selection": { + "tagStatus": "any", + "countType": "sinceImagePushed", + "countUnit": "days", + "countNumber": 30 + }, + "action": { + "type": "expire" + } + } + ] +} +EOF +) + +set -euo pipefail + +for arg in "BRANCH_NAME" "GITHUB_SHA" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n\n${USAGE_DOC}" + exit 1 + fi +done + +if [[ "$#" -lt 1 ]] +then + echo "${USAGE_DOC}" + exit 2 +fi +CONTAINER_ID="$1" + +# Fetch CONTAINER_DEF and BUILD_ARGS +source <(containers/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 + +if [[ "${PUBLISH_CONTAINER:-}" != "1" ]] # Any value other than 1 is considered false +then + PUBLISH_CONTAINER=0 +fi + +if [[ ${PUBLISH_CONTAINER} -eq 0 ]] +then + echo "PUBLISH_CONTAINER not set; the container will not be published" +else + echo "The container will be published at ${ECR_URL}" + # Login for Docker registry + echo "aws ecr get-login-password --region ${ECR_AWS_REGION} |" \ + "docker login --username AWS --password-stdin ${ECR_URL}" + aws ecr get-login-password --region ${ECR_AWS_REGION} \ + | docker login --username AWS --password-stdin ${ECR_URL} +fi + +# Run Docker build +set -x +CONTAINER_TAG="${ECR_URL}/${CONTAINER_ID}:${GITHUB_SHA}" +python3 containers/docker_build.py \ + --container-def ${CONTAINER_DEF} \ + --container-tag ${CONTAINER_TAG} \ + ${BUILD_ARGS} +set +x + +# Now push the new container to ECR +if [[ ${PUBLISH_CONTAINER} -eq 1 ]] +then + # Attempt to create Docker repository; it will fail if the repository already exists + echo "aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${ECR_AWS_REGION}" + if aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${ECR_AWS_REGION} + then + # Repository was created. 
Now set expiration policy + echo "aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID}" \ + "--region ${ECR_AWS_REGION} --lifecycle-policy-text file:///dev/stdin" + echo "${ECR_LIFECYCLE_RULE}" | aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID} \ + --region ${ECR_AWS_REGION} --lifecycle-policy-text file:///dev/stdin + fi + + echo "docker push --quiet ${CONTAINER_TAG}" + if ! time docker push --quiet "${CONTAINER_TAG}" + then + echo "ERROR: could not update Docker cache ${CONTAINER_TAG}" + exit 1 + fi + + # Create another alias for the container using the branch name + CONTAINER_ALIAS="${ECR_URL}/${CONTAINER_ID}:${BRANCH_NAME}" + echo "docker tag ${CONTAINER_TAG} ${CONTAINER_ALIAS}" + docker tag ${CONTAINER_TAG} ${CONTAINER_ALIAS} + echo "docker push --quiet ${CONTAINER_ALIAS}" + docker push --quiet "${CONTAINER_ALIAS}" +fi diff --git a/containers/dockerfile/Dockerfile.aarch64 b/containers/dockerfile/Dockerfile.aarch64 new file mode 100644 index 0000000..8d6cfac --- /dev/null +++ b/containers/dockerfile/Dockerfile.aarch64 @@ -0,0 +1,38 @@ +FROM quay.io/pypa/manylinux_2_28_aarch64 + +SHELL ["/bin/bash", "-c"] # Use Bash as shell + +# Install all basic requirements +RUN \ + dnf -y update && \ + dnf -y install dnf-plugins-core && \ + dnf config-manager --set-enabled powertools && \ + dnf install -y tar unzip wget xz git which ninja-build gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ && \ + # Python + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh && \ + bash conda.sh -b -p /opt/miniforge + +ENV PATH=/opt/miniforge/bin:$PATH +ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc +ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ +ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp +ENV GOSU_VERSION=1.10 + +# Create new Conda environment +COPY conda_env/aarch64_test.yml /scripts/ +RUN mamba create -n aarch64_test && \ + mamba env update -n aarch64_test 
--file=/scripts/aarch64_test.yml && \ + mamba clean --all --yes + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/containers/dockerfile/Dockerfile.clang_tidy b/containers/dockerfile/Dockerfile.clang_tidy new file mode 100644 index 0000000..db712c1 --- /dev/null +++ b/containers/dockerfile/Dockerfile.clang_tidy @@ -0,0 +1,50 @@ +ARG CUDA_VERSION=notset +FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04 +ARG CUDA_VERSION + +# Environment +ENV DEBIAN_FRONTEND=noninteractive + +# Install all basic requirements +RUN \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ + apt-get update && \ + apt-get install -y wget git python3 python3-pip software-properties-common \ + apt-transport-https ca-certificates gnupg-agent && \ + apt-get install -y ninja-build + +# Install clang-tidy: https://apt.llvm.org/ +RUN \ + apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-19 main" && \ + wget -O llvm-snapshot.gpg.key https://apt.llvm.org/llvm-snapshot.gpg.key && \ + apt-key add ./llvm-snapshot.gpg.key && \ + rm llvm-snapshot.gpg.key && \ + apt-get update && \ + apt-get install -y clang-tidy-19 clang-19 libomp-19-dev + +# Set default clang-tidy version +RUN \ + update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-19 100 && \ + update-alternatives --install /usr/bin/clang clang /usr/bin/clang-19 100 + +RUN \ + apt-get install libgtest-dev libgmock-dev -y + +# Install Python packages +RUN \ + pip3 install cmake + +ENV GOSU_VERSION=1.10 + +# Install lightweight sudo (not bound to TTY) +RUN 
set -ex; \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/containers/dockerfile/Dockerfile.cpu b/containers/dockerfile/Dockerfile.cpu new file mode 100644 index 0000000..64b2802 --- /dev/null +++ b/containers/dockerfile/Dockerfile.cpu @@ -0,0 +1,57 @@ +FROM ubuntu:22.04 + +# Environment +ENV DEBIAN_FRONTEND=noninteractive +SHELL ["/bin/bash", "-c"] + +# Install all basic requirements +RUN \ + apt-get update && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:ubuntu-toolchain-r/test && \ + apt-get update && \ + apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libidn12 cmake ninja-build gcc-10 g++-10 openjdk-8-jdk-headless && \ + # Python + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ + bash conda.sh -b -p /opt/miniforge + +ENV PATH=/opt/miniforge/bin:$PATH +ENV CC=gcc-10 +ENV CXX=g++-10 +ENV CPP=cpp-10 + +ENV GOSU_VERSION=1.10 +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ + +# Install gRPC +# Patch Abseil to apply https://github.com/abseil/abseil-cpp/issues/1629 +RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ + --recurse-submodules --depth 1 && \ + pushd grpc && \ + pushd third_party/abseil-cpp && \ + git fetch origin master && \ + git cherry-pick -n cfde5f74e276049727f9556f13473a59fe77d9eb && \ + popd && \ + cmake -S . 
-B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc -DCMAKE_CXX_VISIBILITY_PRESET=hidden && \ + cmake --build build --target install && \ + popd && \ + rm -rf grpc + +# Create new Conda environment +COPY conda_env/linux_cpu_test.yml /scripts/ +RUN mamba create -n linux_cpu_test && \ + mamba env update -n linux_cpu_test --file=/scripts/linux_cpu_test.yml && \ + mamba clean --all --yes + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/containers/dockerfile/Dockerfile.gpu b/containers/dockerfile/Dockerfile.gpu new file mode 100644 index 0000000..dd95676 --- /dev/null +++ b/containers/dockerfile/Dockerfile.gpu @@ -0,0 +1,54 @@ +ARG CUDA_VERSION=notset +FROM nvidia/cuda:$CUDA_VERSION-runtime-ubuntu22.04 +ARG CUDA_VERSION +ARG RAPIDS_VERSION + # Should be first 4 digits (e.g. 
24.06) +ARG NCCL_VERSION +ARG RAPIDSAI_CONDA_CHANNEL="rapidsai" + +# Environment +ENV DEBIAN_FRONTEND=noninteractive +SHELL ["/bin/bash", "-c"] + +# Install all basic requirements +RUN \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ + apt-get update && \ + apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ + apt-get install libnccl2 libnccl-dev -y --allow-change-held-packages && \ + # Python + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ + bash conda.sh -b -p /opt/miniforge + +ENV PATH=/opt/miniforge/bin:$PATH + +# Create new Conda environment with cuDF, Dask, and cuPy +RUN \ + export NCCL_SHORT_VER=$(echo "$NCCL_VERSION" | cut -d "-" -f 1) && \ + export CUDA_SHORT_VER=$(echo "$CUDA_VERSION" | grep -o -E '[0-9]+\.[0-9]') && \ + mamba create -y -n gpu_test -c ${RAPIDSAI_CONDA_CHANNEL} -c conda-forge -c nvidia \ + python=3.10 "cudf=$RAPIDS_VERSION.*" "rmm=$RAPIDS_VERSION.*" cuda-version=$CUDA_SHORT_VER \ + "nccl>=${NCCL_SHORT_VER}" \ + "dask<=2024.10.0" \ + "distributed<=2024.10.0" \ + "dask-cuda=$RAPIDS_VERSION.*" "dask-cudf=$RAPIDS_VERSION.*" cupy \ + numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ + python-kubernetes urllib3 graphviz hypothesis loky \ + "pyspark>=3.4.0" cloudpickle cuda-python && \ + mamba clean --all --yes + +ENV GOSU_VERSION=1.10 +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git 
a/containers/dockerfile/Dockerfile.gpu_build_r_rockylinux8 b/containers/dockerfile/Dockerfile.gpu_build_r_rockylinux8 new file mode 100644 index 0000000..0b22996 --- /dev/null +++ b/containers/dockerfile/Dockerfile.gpu_build_r_rockylinux8 @@ -0,0 +1,58 @@ +ARG CUDA_VERSION=notset +FROM nvcr.io/nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 +ARG CUDA_VERSION +ARG R_VERSION + +# Install all basic requirements +RUN \ + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ + > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ + dnf -y update && \ + dnf -y install dnf-plugins-core && \ + dnf config-manager --set-enabled powertools && \ + dnf install -y tar unzip wget xz git which ninja-build readline-devel libX11-devel libXt-devel \ + xorg-x11-server-devel openssl-devel zlib-devel bzip2-devel xz-devel \ + pcre2-devel libcurl-devel texlive-* \ + gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ \ + gcc-toolset-10-gcc-gfortran gcc-toolset-10-libquadmath-devel \ + gcc-toolset-10-runtime gcc-toolset-10-libstdc++-devel + +ENV PATH=/opt/miniforge/bin:/usr/local/ninja:/opt/software/packages/bin:/opt/R/$R_VERSION/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/software/packages/lib:/opt/R/$R_VERSION/lib64:$LD_LIBRARY_PATH +ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc +ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ +ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp +ENV F77=/opt/rh/gcc-toolset-10/root/usr/bin/gfortran +ENV FC=/opt/rh/gcc-toolset-10/root/usr/bin/gfortran + +RUN \ + wget -nv -nc https://cran.r-project.org/src/base/R-4/R-$R_VERSION.tar.gz && \ + tar xf R-$R_VERSION.tar.gz && \ + cd R-$R_VERSION && \ + ./configure --prefix=/opt/R/$R_VERSION --enable-R-shlib && \ + make -j$(nproc) && \ + make install + +RUN \ + # Python + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ + bash conda.sh -b -p /opt/miniforge && \ + 
/opt/miniforge/bin/python -m pip install auditwheel awscli && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.29/cmake-3.29.5-linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.29.5-linux-x86_64.sh --skip-license --prefix=/usr + +ENV GOSU_VERSION=1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/containers/dockerfile/Dockerfile.gpu_build_rockylinux8 b/containers/dockerfile/Dockerfile.gpu_build_rockylinux8 new file mode 100644 index 0000000..d8670be --- /dev/null +++ b/containers/dockerfile/Dockerfile.gpu_build_rockylinux8 @@ -0,0 +1,82 @@ +ARG CUDA_VERSION=notset +FROM nvcr.io/nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 +ARG CUDA_VERSION +ARG NCCL_VERSION +ARG RAPIDS_VERSION + +# Install all basic requirements +RUN \ + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ + > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ + dnf -y update && \ + dnf -y install dnf-plugins-core && \ + dnf config-manager --set-enabled powertools && \ + dnf install -y tar unzip wget xz git which ninja-build gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ && \ + # Python + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ + bash conda.sh -b -p /opt/miniforge && \ + /opt/miniforge/bin/python -m pip install awscli && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.29/cmake-3.29.5-linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.29.5-linux-x86_64.sh --skip-license --prefix=/usr + +# NCCL2 (License: 
https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) +RUN \ + export CUDA_SHORT=`echo $CUDA_VERSION | grep -o -E '[0-9]+\.[0-9]'` && \ + export NCCL_VERSION=$NCCL_VERSION && \ + dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \ + dnf -y update && \ + dnf install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} + +ENV PATH=/opt/miniforge/bin:/usr/local/ninja:$PATH +ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc +ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ +ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp +ENV CUDAHOSTCXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ + +ENV GOSU_VERSION=1.10 + +# Install gRPC +# Patch Abseil to apply https://github.com/abseil/abseil-cpp/issues/1629 +RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ + --recurse-submodules --depth 1 && \ + pushd grpc && \ + pushd third_party/abseil-cpp && \ + git fetch origin master && \ + git cherry-pick -n cfde5f74e276049727f9556f13473a59fe77d9eb && \ + popd && \ + cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc -DCMAKE_CXX_VISIBILITY_PRESET=hidden && \ + cmake --build build --target install && \ + popd && \ + rm -rf grpc + +# Install RMM +# Patch out -Werror +# Patch CCCL 2.5.0 to apply https://github.com/NVIDIA/cccl/pull/1957 +RUN git clone -b branch-${RAPIDS_VERSION} https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \ + pushd rmm && \ + find . -name CMakeLists.txt -print0 | xargs -0 sed -i 's/-Werror//g' && \ + mkdir build && \ + pushd build && \ + cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=/opt/rmm -DCUDA_STATIC_RUNTIME=ON && \ + pushd _deps/cccl-src/ && \ + git fetch origin main && \ + git cherry-pick -n 9fcb32c228865f21f2b002b29d38a06b4c6fbd73 && \ + popd && \ + cmake --build . 
--target install && \ + popd && \ + popd && \ + rm -rf rmm + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/containers/dockerfile/Dockerfile.i386 b/containers/dockerfile/Dockerfile.i386 new file mode 100644 index 0000000..f128a00 --- /dev/null +++ b/containers/dockerfile/Dockerfile.i386 @@ -0,0 +1,8 @@ +FROM i386/debian:sid + +ENV DEBIAN_FRONTEND=noninteractive +SHELL ["/bin/bash", "-c"] + +RUN \ + apt-get update && \ + apt-get install -y tar unzip wget git build-essential ninja-build cmake diff --git a/containers/dockerfile/Dockerfile.jvm b/containers/dockerfile/Dockerfile.jvm new file mode 100644 index 0000000..c458474 --- /dev/null +++ b/containers/dockerfile/Dockerfile.jvm @@ -0,0 +1,43 @@ +FROM rockylinux:8 + +# Install all basic requirements +RUN \ + dnf -y update && \ + dnf -y install dnf-plugins-core && \ + dnf config-manager --set-enabled powertools && \ + dnf install -y tar unzip make bzip2 wget xz git which ninja-build java-1.8.0-openjdk-devel \ + gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ \ + gcc-toolset-10-runtime gcc-toolset-10-libstdc++-devel && \ + # Python + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ + bash conda.sh -b -p /opt/miniforge && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.29/cmake-3.29.5-linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.29.5-linux-x86_64.sh --skip-license --prefix=/usr && \ + # Maven + wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.9.7/binaries/apache-maven-3.9.7-bin.tar.gz && \ + tar xvf 
apache-maven-3.9.7-bin.tar.gz -C /opt && \ + ln -s /opt/apache-maven-3.9.7/ /opt/maven + +ENV PATH=/opt/miniforge/bin:/opt/maven/bin:$PATH +ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc +ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ +ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp + +# Install Python packages +RUN pip install numpy pytest scipy scikit-learn wheel kubernetes awscli + +ENV GOSU_VERSION=1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/containers/dockerfile/Dockerfile.jvm_gpu_build b/containers/dockerfile/Dockerfile.jvm_gpu_build new file mode 100644 index 0000000..afb439d --- /dev/null +++ b/containers/dockerfile/Dockerfile.jvm_gpu_build @@ -0,0 +1,54 @@ +ARG CUDA_VERSION=notset +FROM nvcr.io/nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 +ARG CUDA_VERSION +ARG NCCL_VERSION + +# Install all basic requirements +RUN \ + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ + > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ + dnf -y update && \ + dnf -y install dnf-plugins-core && \ + dnf config-manager --set-enabled powertools && \ + dnf install -y tar unzip wget xz git which ninja-build java-1.8.0-openjdk-devel gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ && \ + # Python + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ + bash conda.sh -b -p /opt/miniforge && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.29/cmake-3.29.5-linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.29.5-linux-x86_64.sh 
--skip-license --prefix=/usr && \
+    # Maven
+    wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.9.7/binaries/apache-maven-3.9.7-bin.tar.gz && \
+    tar xvf apache-maven-3.9.7-bin.tar.gz -C /opt && \
+    ln -s /opt/apache-maven-3.9.7/ /opt/maven
+
+# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
+RUN \
+    export CUDA_SHORT=`echo $CUDA_VERSION | grep -o -E '[0-9]+\.[0-9]'` && \
+    export NCCL_VERSION=$NCCL_VERSION && \
+    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
+    dnf -y update && \
+    dnf install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT}
+
+ENV PATH=/opt/miniforge/bin:/opt/maven/bin:$PATH
+ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc
+ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++
+ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp
+
+# Install Python packages
+RUN pip install numpy pytest scipy scikit-learn wheel kubernetes awscli
+
+ENV GOSU_VERSION=1.10
+
+# Install lightweight sudo (not bound to TTY)
+RUN set -ex; \
+    wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
+    chmod +x /usr/local/bin/gosu && \
+    gosu nobody true
+
+# Default entry-point to use if running locally
+# It will preserve attributes of created files
+COPY entrypoint.sh /scripts/
+
+WORKDIR /workspace
+ENTRYPOINT ["/scripts/entrypoint.sh"]
diff --git a/containers/dockerfile/Dockerfile.manylinux2014_aarch64 b/containers/dockerfile/Dockerfile.manylinux2014_aarch64
new file mode 100644
index 0000000..52baff4
--- /dev/null
+++ b/containers/dockerfile/Dockerfile.manylinux2014_aarch64
@@ -0,0 +1,18 @@
+FROM quay.io/pypa/manylinux2014_aarch64
+
+RUN yum update -y && yum install -y java-1.8.0-openjdk-devel
+
+# Install lightweight sudo (not bound to TTY)
+# --fail: abort on an HTTP error instead of saving the error page as the binary
+ENV GOSU_VERSION=1.10
+RUN set -ex; \
+    curl --fail -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \
+    chmod +x /usr/local/bin/gosu && \
+    gosu nobody true
+
+# Default entry-point to use if running locally
+# It will preserve attributes of created files
+COPY entrypoint.sh /scripts/
+
+WORKDIR /workspace
+ENTRYPOINT ["/scripts/entrypoint.sh"]
diff --git a/containers/dockerfile/Dockerfile.manylinux2014_x86_64 b/containers/dockerfile/Dockerfile.manylinux2014_x86_64
new file mode 100644
index 0000000..fdfcbd2
--- /dev/null
+++ b/containers/dockerfile/Dockerfile.manylinux2014_x86_64
@@ -0,0 +1,18 @@
+FROM quay.io/pypa/manylinux2014_x86_64
+
+RUN yum update -y && yum install -y java-1.8.0-openjdk-devel
+
+# Install lightweight sudo (not bound to TTY)
+# --fail: abort on an HTTP error instead of saving the error page as the binary
+ENV GOSU_VERSION=1.10
+RUN set -ex; \
+    curl --fail -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
+    chmod +x /usr/local/bin/gosu && \
+    gosu nobody true
+
+# Default entry-point to use if running locally
+# It will preserve attributes of created files
+COPY entrypoint.sh /scripts/
+
+WORKDIR /workspace
+ENTRYPOINT ["/scripts/entrypoint.sh"]
diff --git a/containers/dockerfile/Dockerfile.manylinux_2_28_x86_64 b/containers/dockerfile/Dockerfile.manylinux_2_28_x86_64
new file mode 100644
index 0000000..5e264e2
--- /dev/null
+++ b/containers/dockerfile/Dockerfile.manylinux_2_28_x86_64
@@ -0,0 +1,16 @@
+FROM quay.io/pypa/manylinux_2_28_x86_64
+
+# Install lightweight sudo (not bound to TTY)
+# --fail: abort on an HTTP error instead of saving the error page as the binary
+ENV GOSU_VERSION=1.10
+RUN set -ex; \
+    curl --fail -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
+    chmod +x /usr/local/bin/gosu && \
+    gosu nobody true
+
+# Default entry-point to use if running locally
+# It will preserve attributes of created files
+COPY entrypoint.sh /scripts/
+
+WORKDIR /workspace
+ENTRYPOINT ["/scripts/entrypoint.sh"]
diff --git a/containers/entrypoint.sh b/containers/entrypoint.sh
new file mode 100755
index 0000000..40135c1
---
/dev/null
+++ b/containers/entrypoint.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+
+# This wrapper script propagates the user information from the host
+# to the container. This way, any files generated by processes running
+# in the container will be accessible in the host.
+
+set -euo pipefail
+
+COMMAND=("$@")
+
+# Probe that the root filesystem is writable before doing anything else
+if ! touch /this_is_writable_file_system; then
+  echo "You can't write to your filesystem!"
+  echo "If you are in Docker you should check you do not have too many images" \
+       "with too many files in them. Docker has some issue with it."
+  exit 1
+else
+  rm /this_is_writable_file_system
+fi
+
+## Assumption: the host passes correct user information via environment variables
+## CI_BUILD_UID, CI_BUILD_GID, CI_BUILD_USER, CI_BUILD_GROUP
+
+if [[ -n ${CI_BUILD_UID:-} ]] && [[ -n ${CI_BUILD_GID:-} ]]
+then
+  # Under "set -u" a missing name would abort with a bare "unbound variable";
+  # fail with an actionable message instead
+  : "${CI_BUILD_USER:?CI_BUILD_USER must be set when CI_BUILD_UID and CI_BUILD_GID are set}"
+  : "${CI_BUILD_GROUP:?CI_BUILD_GROUP must be set when CI_BUILD_UID and CI_BUILD_GID are set}"
+
+  groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true
+  useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \
+    "${CI_BUILD_USER}" || true
+  export HOME="/home/${CI_BUILD_USER}"
+  # Copy root's home, dotfiles included. "/root/." needs no glob, so an empty
+  # /root does not abort the script under "set -e" as an unmatched /root/* would
+  cp -r /root/. "$HOME/"
+  chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" "$HOME"
+
+  # Allows project-specific customization
+  if [[ -e "/workspace/.pre_entry.sh" ]]; then
+    gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" /workspace/.pre_entry.sh
+  fi
+
+  # Enable passwordless sudo capabilities for the user
+  chown root:"${CI_BUILD_GID}" "$(which gosu)"
+  chmod +s "$(which gosu)"; sync
+
+  exec gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" "${COMMAND[@]}"
+else
+  # No host UID/GID supplied: run the command directly, without switching users
+  exec "${COMMAND[@]}"
+fi
diff --git a/containers/extract_build_args.jq b/containers/extract_build_args.jq
new file mode 100644
index 0000000..b35240e
--- /dev/null
+++ b/containers/extract_build_args.jq
@@ -0,0 +1,17 @@
+## Build the "--build-arg KEY=VALUE ..." string for one container entry.
+## Emits nothing when the entry is missing or declares no build_args.
+##
+## Example input:
+##   xgb-ci.gpu_build_r_rockylinux8
+## Example output:
+##   --build-arg CUDA_VERSION=12.4.1 --build-arg R_VERSION=4.3.2
+def compute_build_args($input; $container_id):
+  $input |
+  .[$container_id] |
+  select(.build_args != null) |
+  .build_args |
+  to_entries |
+  # tostring leaves string values untouched and lets unquoted YAML numbers or
+  # booleans through; "+" between a string and a number is a jq error
+  map("--build-arg " + .key + "=" + (.value | tostring)) |
+  join(" ");
diff --git a/containers/extract_build_args.sh b/containers/extract_build_args.sh
new file mode 100755
index 0000000..4254915
--- /dev/null
+++ b/containers/extract_build_args.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+## Extract container definition and build args from containers/ci_container.yml,
+## given the container ID.
+##
+## Run from the directory that contains containers/: the YAML file is
+## referenced by a path relative to the current directory.
+##
+## Example input:
+##   xgb-ci.clang_tidy
+## Example output:
+##   CONTAINER_DEF='clang_tidy' BUILD_ARGS='--build-arg CUDA_VERSION=12.4.1'
+
+# Abort if yq or jq fails instead of echoing a half-empty result
+set -euo pipefail
+
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 [container_id]" >&2
+  exit 1
+fi
+
+CONTAINER_ID="$1"
+# "// empty" turns a missing entry into empty output rather than the string "null"
+CONTAINER_DEF=$(
+  yq -o json containers/ci_container.yml |
+  jq -r --arg container_id "${CONTAINER_ID}" '.[$container_id].container_def // empty'
+)
+if [ -z "${CONTAINER_DEF}" ]; then
+  echo "Error: no container_def for '${CONTAINER_ID}' in containers/ci_container.yml" >&2
+  exit 1
+fi
+BUILD_ARGS=$(
+  yq -o json containers/ci_container.yml |
+  jq -r --arg container_id "${CONTAINER_ID}" \
+  'include "containers/extract_build_args";
+  compute_build_args(.; $container_id)'
+)
+echo "CONTAINER_DEF='${CONTAINER_DEF}' BUILD_ARGS='${BUILD_ARGS}'"