diff --git a/.github/workflows/gcp-a100-bisection.yml b/.github/workflows/gcp-a100-bisection.yml index 2b6d342b92..7176cf4916 100644 --- a/.github/workflows/gcp-a100-bisection.yml +++ b/.github/workflows/gcp-a100-bisection.yml @@ -21,7 +21,7 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} if: ${{ github.repository_owner == 'pytorch' }} - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] environment: docker-s3-upload timeout-minutes: 2880 # 48 hours steps: diff --git a/.github/workflows/pr-gha-runner.yml b/.github/workflows/pr-gha-runner.yml index 973802af96..3232738016 100644 --- a/.github/workflows/pr-gha-runner.yml +++ b/.github/workflows/pr-gha-runner.yml @@ -13,7 +13,7 @@ jobs: CONDA_ENV: "pr-ci-a100" SETUP_SCRIPT: "/workspace/setup_instance.sh" HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] timeout-minutes: 1440 # 24 hours environment: docker-s3-upload steps: diff --git a/.github/workflows/userbenchmark-a100-bisection.yml b/.github/workflows/userbenchmark-a100-bisection.yml index 59d1695011..876d3c5df2 100644 --- a/.github/workflows/userbenchmark-a100-bisection.yml +++ b/.github/workflows/userbenchmark-a100-bisection.yml @@ -28,7 +28,7 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} if: ${{ github.repository_owner == 'pytorch' }} - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] timeout-minutes: 2880 # 48 hours steps: - name: Checkout diff --git a/.github/workflows/userbenchmark-a100.yml b/.github/workflows/userbenchmark-a100.yml index 94aa303ad4..9ba743a58a 100644 --- a/.github/workflows/userbenchmark-a100.yml +++ b/.github/workflows/userbenchmark-a100.yml @@ -11,7 +11,7 @@ on: jobs: run-userbenchmark: - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] timeout-minutes: 1440 # 24 hours environment: docker-s3-upload env: diff --git a/.github/workflows/userbenchmark-regression-detector.yml b/.github/workflows/userbenchmark-regression-detector.yml index be0fe9d594..c919321160 100644 --- a/.github/workflows/userbenchmark-regression-detector.yml +++ b/.github/workflows/userbenchmark-regression-detector.yml @@ -11,7 +11,7 @@ on: jobs: run-userbenchmark: - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] timeout-minutes: 1440 # 24 hours environment: docker-s3-upload env: diff --git a/.github/workflows/v3-bisection.yml b/.github/workflows/v3-bisection.yml index 53a5c495b8..683627160e 100644 --- a/.github/workflows/v3-bisection.yml +++ b/.github/workflows/v3-bisection.yml @@ -20,7 +20,7 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} if: ${{ github.repository_owner == 'pytorch' }} - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] timeout-minutes: 2880 # 48 hours steps: - name: Checkout diff --git a/.github/workflows/v3-nightly.yml b/.github/workflows/v3-nightly.yml index 670013b69d..7112243623 100644 --- a/.github/workflows/v3-nightly.yml +++ b/.github/workflows/v3-nightly.yml @@ -18,7 +18,7 @@ jobs: IS_GHA: 1 BUILD_ENVIRONMENT: benchmark-nightly if: ${{ github.repository_owner == 'pytorch' }} - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] steps: - name: Checkout TorchBench v3.0 branch uses: actions/checkout@v3 diff --git a/docker/gcp-a100-runner-dind.dockerfile b/docker/gcp-a100-runner-dind.dockerfile index 1b0812fa3d..48c18accd6 100644 --- a/docker/gcp-a100-runner-dind.dockerfile +++ b/docker/gcp-a100-runner-dind.dockerfile @@ -1,16 +1,17 @@ -# default base image: summerwind/actions-runner-dind:latest -# base image: Ubuntu 20.04 -ARG BASE_IMAGE=summerwind/actions-runner-dind:latest +# default base image: ghcr.io/actions/actions-runner:latest +# base image: Ubuntu 22.04 jammy +ARG BASE_IMAGE=ghcr.io/actions/actions-runner:latest FROM ${BASE_IMAGE} ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -# NVIDIA Driver version: 525.105.17, GKE version: 1.26.5-gke.1200 -# GKE release notes: https://cloud.google.com/kubernetes-engine/docs/release-notes#current_versions -ENV NVIDIA_VERSION="525.105.17" +# GKE version: 1.28.5-gke.1217000 +# NVIDIA driver version: 535.104.05 +# NVIDIA drivers list available at gs://ubuntu_nvidia_packages/ +# We assume that the host NVIDIA driver binaries and libraries are mapped to the docker filesystem RUN sudo apt-get -y update && sudo apt -y update # fontconfig: needed by model doctr_det_predictor -RUN sudo apt-get install -y git jq \ +RUN sudo apt-get install -y git jq gcc g++ \ vim wget curl ninja-build cmake \ libgl1-mesa-glx libsndfile1-dev kmod libxml2-dev libxslt1-dev \ fontconfig libfontconfig1-dev \ @@ -23,13 +24,6 @@ RUN sudo chmod +x /usr/bin/switch-cuda.sh RUN sudo mkdir -p /workspace; sudo chown runner:runner /workspace -# Download and the NVIDIA Driver files, NVIDIA Driver version is bundled together with GKE -# Install runtime libraries only, do not compile the kernel modules -# The kernel modules are already provided by the host GKE environment -RUN cd /workspace && mkdir tmp_nvidia && cd tmp_nvidia && \ - wget -q https://storage.googleapis.com/nvidia-drivers-us-public/tesla/${NVIDIA_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run && \ - sudo bash ./NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run --no-kernel-modules -s --no-systemd --no-kernel-module-source --no-nvidia-modprobe - # Source of the CUDA installation scripts: # https://github.com/pytorch/builder/blob/main/common/install_cuda.sh