pytorch · xuzhao9 · Jan 17, 2024 · Jan 18, 2024 · Jan 18, 2024 · Jan 19, 2024
diff --git a/.github/workflows/gcp-a100-bisection.yml b/.github/workflows/gcp-a100-bisection.yml
@@ -21,7 +21,7 @@ jobs:
       AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: [self-hosted, a100-runner]
+    runs-on: [a100-runner]
     environment: docker-s3-upload
     timeout-minutes: 2880 # 48 hours
     steps:

diff --git a/.github/workflows/pr-gha-runner.yml b/.github/workflows/pr-gha-runner.yml
@@ -13,7 +13,7 @@ jobs:
       CONDA_ENV: "pr-ci-a100"
       SETUP_SCRIPT: "/workspace/setup_instance.sh"
       HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-    runs-on: [self-hosted, a100-runner]
+    runs-on: [a100-runner]
     timeout-minutes: 1440 # 24 hours
     environment: docker-s3-upload
     steps:

diff --git a/.github/workflows/userbenchmark-a100-bisection.yml b/.github/workflows/userbenchmark-a100-bisection.yml
@@ -28,7 +28,7 @@ jobs:
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
       HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: [self-hosted, a100-runner]
+    runs-on: [a100-runner]
     timeout-minutes: 2880 # 48 hours
     steps:
       - name: Checkout

diff --git a/.github/workflows/userbenchmark-a100.yml b/.github/workflows/userbenchmark-a100.yml
@@ -11,7 +11,7 @@ on:
 
 jobs:
   run-userbenchmark:
-    runs-on: [self-hosted, a100-runner]
+    runs-on: [a100-runner]
     timeout-minutes: 1440 # 24 hours
     environment: docker-s3-upload
     env:

diff --git a/.github/workflows/userbenchmark-regression-detector.yml b/.github/workflows/userbenchmark-regression-detector.yml
@@ -11,7 +11,7 @@ on:
 
 jobs:
   run-userbenchmark:
-    runs-on: [self-hosted, a100-runner]
+    runs-on: [a100-runner]
     timeout-minutes: 1440 # 24 hours
     environment: docker-s3-upload
     env:

diff --git a/.github/workflows/v3-bisection.yml b/.github/workflows/v3-bisection.yml
@@ -20,7 +20,7 @@ jobs:
       AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: [self-hosted, a100-runner]
+    runs-on: [a100-runner]
     timeout-minutes: 2880 # 48 hours
     steps:
       - name: Checkout

diff --git a/.github/workflows/v3-nightly.yml b/.github/workflows/v3-nightly.yml
@@ -18,7 +18,7 @@ jobs:
       IS_GHA: 1
       BUILD_ENVIRONMENT: benchmark-nightly
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: [self-hosted, a100-runner]
+    runs-on: [a100-runner]
     steps:
       - name: Checkout TorchBench v3.0 branch
         uses: actions/checkout@v3

diff --git a/docker/gcp-a100-runner-dind.dockerfile b/docker/gcp-a100-runner-dind.dockerfile
@@ -1,16 +1,17 @@
-# default base image: summerwind/actions-runner-dind:latest
-# base image: Ubuntu 20.04
-ARG BASE_IMAGE=summerwind/actions-runner-dind:latest
+# default base image: ghcr.io/actions/actions-runner:latest
+# base image OS: Ubuntu 22.04 (jammy)
+ARG BASE_IMAGE=ghcr.io/actions/actions-runner:latest
 FROM ${BASE_IMAGE}
 
 ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
-# NVIDIA Driver version: 525.105.17, GKE version: 1.26.5-gke.1200
-# GKE release notes: https://cloud.google.com/kubernetes-engine/docs/release-notes#current_versions
-ENV NVIDIA_VERSION="525.105.17"
+# GKE version: 1.28.5-gke.1217000
+# NVIDIA driver version: 535.104.05
+# NVIDIA drivers list available at gs://ubuntu_nvidia_packages/
+# We assume that the host's NVIDIA driver binaries and libraries are mounted into the container filesystem
 
 RUN sudo apt-get -y update && sudo apt -y update
 # fontconfig: needed by model doctr_det_predictor
-RUN sudo apt-get install -y git jq \
+RUN sudo apt-get install -y git jq gcc g++ \
                             vim wget curl ninja-build cmake \
                             libgl1-mesa-glx libsndfile1-dev kmod libxml2-dev libxslt1-dev \
                             fontconfig libfontconfig1-dev \
@@ -23,13 +24,6 @@ RUN sudo chmod +x /usr/bin/switch-cuda.sh
 
 RUN sudo mkdir -p /workspace; sudo chown runner:runner /workspace
 
-# Download and the NVIDIA Driver files, NVIDIA Driver version is bundled together with GKE
-# Install runtime libraries only, do not compile the kernel modules
-# The kernel modules are already provided by the host GKE environment
-RUN cd /workspace && mkdir tmp_nvidia && cd tmp_nvidia && \
-    wget -q https://storage.googleapis.com/nvidia-drivers-us-public/tesla/${NVIDIA_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run && \
-    sudo bash ./NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run --no-kernel-modules -s --no-systemd --no-kernel-module-source --no-nvidia-modprobe
-
 # Source of the CUDA installation scripts:
 # https://github.com/pytorch/builder/blob/main/common/install_cuda.sh