From d9a4bd637b1b7603abc5f00d91a146c13d687c5a Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Wed, 17 Jan 2024 18:17:24 -0500 Subject: [PATCH 1/6] Working on the updated runner tag. --- .github/workflows/gcp-a100-bisection.yml | 2 +- .github/workflows/pr-gha-runner.yml | 2 +- .github/workflows/userbenchmark-a100-bisection.yml | 2 +- .github/workflows/userbenchmark-a100.yml | 2 +- .github/workflows/userbenchmark-regression-detector.yml | 2 +- .github/workflows/v3-bisection.yml | 2 +- .github/workflows/v3-nightly.yml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/gcp-a100-bisection.yml b/.github/workflows/gcp-a100-bisection.yml index 2b6d342b92..7176cf4916 100644 --- a/.github/workflows/gcp-a100-bisection.yml +++ b/.github/workflows/gcp-a100-bisection.yml @@ -21,7 +21,7 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} if: ${{ github.repository_owner == 'pytorch' }} - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] environment: docker-s3-upload timeout-minutes: 2880 # 48 hours steps: diff --git a/.github/workflows/pr-gha-runner.yml b/.github/workflows/pr-gha-runner.yml index 973802af96..3232738016 100644 --- a/.github/workflows/pr-gha-runner.yml +++ b/.github/workflows/pr-gha-runner.yml @@ -13,7 +13,7 @@ jobs: CONDA_ENV: "pr-ci-a100" SETUP_SCRIPT: "/workspace/setup_instance.sh" HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] timeout-minutes: 1440 # 24 hours environment: docker-s3-upload steps: diff --git a/.github/workflows/userbenchmark-a100-bisection.yml b/.github/workflows/userbenchmark-a100-bisection.yml index 59d1695011..876d3c5df2 100644 --- a/.github/workflows/userbenchmark-a100-bisection.yml +++ b/.github/workflows/userbenchmark-a100-bisection.yml @@ -28,7 +28,7 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} if: ${{ github.repository_owner == 'pytorch' }} - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] timeout-minutes: 2880 # 48 hours steps: - name: Checkout diff --git a/.github/workflows/userbenchmark-a100.yml b/.github/workflows/userbenchmark-a100.yml index 94aa303ad4..9ba743a58a 100644 --- a/.github/workflows/userbenchmark-a100.yml +++ b/.github/workflows/userbenchmark-a100.yml @@ -11,7 +11,7 @@ on: jobs: run-userbenchmark: - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] timeout-minutes: 1440 # 24 hours environment: docker-s3-upload env: diff --git a/.github/workflows/userbenchmark-regression-detector.yml b/.github/workflows/userbenchmark-regression-detector.yml index be0fe9d594..c919321160 100644 --- a/.github/workflows/userbenchmark-regression-detector.yml +++ b/.github/workflows/userbenchmark-regression-detector.yml @@ -11,7 +11,7 @@ on: jobs: run-userbenchmark: - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] timeout-minutes: 1440 # 24 hours environment: docker-s3-upload env: diff --git a/.github/workflows/v3-bisection.yml b/.github/workflows/v3-bisection.yml index 53a5c495b8..683627160e 100644 --- a/.github/workflows/v3-bisection.yml +++ b/.github/workflows/v3-bisection.yml @@ -20,7 +20,7 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} if: ${{ github.repository_owner == 'pytorch' }} - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] timeout-minutes: 2880 # 48 hours steps: - name: Checkout diff --git a/.github/workflows/v3-nightly.yml b/.github/workflows/v3-nightly.yml index 670013b69d..7112243623 100644 --- a/.github/workflows/v3-nightly.yml +++ b/.github/workflows/v3-nightly.yml @@ -18,7 +18,7 @@ jobs: IS_GHA: 1 BUILD_ENVIRONMENT: benchmark-nightly if: ${{ github.repository_owner == 'pytorch' }} - runs-on: [self-hosted, a100-runner] + runs-on: [a100-runner] steps: - name: Checkout TorchBench v3.0 branch uses: actions/checkout@v3 From 57be05f2fd239553dd59f2f68e9e734f8179ef4d Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Wed, 17 Jan 2024 23:08:18 -0500 Subject: [PATCH 2/6] Test the dind work --- .github/workflows/pr-gha-runner.yml | 2 ++ docker/gcp-a100-runner-dind.dockerfile | 16 +++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pr-gha-runner.yml b/.github/workflows/pr-gha-runner.yml index 3232738016..378c3b4f96 100644 --- a/.github/workflows/pr-gha-runner.yml +++ b/.github/workflows/pr-gha-runner.yml @@ -21,6 +21,8 @@ jobs: uses: actions/checkout@v3 - name: Tune Nvidia GPU run: | + ls /usr/local + ls /dev sudo nvidia-smi -pm 1 sudo nvidia-smi -ac 1215,1410 nvidia-smi diff --git a/docker/gcp-a100-runner-dind.dockerfile b/docker/gcp-a100-runner-dind.dockerfile index 1b0812fa3d..ed92648806 100644 --- a/docker/gcp-a100-runner-dind.dockerfile +++ b/docker/gcp-a100-runner-dind.dockerfile @@ -1,12 +1,13 @@ -# default base image: summerwind/actions-runner-dind:latest -# base image: Ubuntu 20.04 +# default base image: ghcr.io/actions/actions-runner:latest +# base image: Ubuntu 22.04 jammy ARG BASE_IMAGE=summerwind/actions-runner-dind:latest FROM ${BASE_IMAGE} ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -# NVIDIA Driver version: 525.105.17, GKE version: 1.26.5-gke.1200 +# GKE version: 1.28.5-gke.1217000 # GKE release notes: https://cloud.google.com/kubernetes-engine/docs/release-notes#current_versions -ENV NVIDIA_VERSION="525.105.17" +# Default NVIDIA driver version: 535.129.03 +# We assume that the host driver libraries are mapped to the docker filesystem RUN sudo apt-get -y update && sudo apt -y update # fontconfig: needed by model doctr_det_predictor @@ -23,13 +24,6 @@ RUN sudo chmod +x /usr/bin/switch-cuda.sh RUN sudo mkdir -p /workspace; sudo chown runner:runner /workspace -# Download and the NVIDIA Driver files, NVIDIA Driver version is bundled together with GKE -# Install runtime libraries only, do not compile the kernel modules -# The kernel modules are already provided by the host GKE environment -RUN cd /workspace && mkdir tmp_nvidia && cd tmp_nvidia && \ - wget -q https://storage.googleapis.com/nvidia-drivers-us-public/tesla/${NVIDIA_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run && \ - sudo bash ./NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run --no-kernel-modules -s --no-systemd --no-kernel-module-source --no-nvidia-modprobe - # Source of the CUDA installation scripts: # https://github.com/pytorch/builder/blob/main/common/install_cuda.sh From 23b3f119268d5034a24c5197694b0bd6bbc12a60 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 18 Jan 2024 16:10:06 -0500 Subject: [PATCH 3/6] Added pr runner --- .github/workflows/pr-gha-runner.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/pr-gha-runner.yml b/.github/workflows/pr-gha-runner.yml index 378c3b4f96..3232738016 100644 --- a/.github/workflows/pr-gha-runner.yml +++ b/.github/workflows/pr-gha-runner.yml @@ -21,8 +21,6 @@ jobs: uses: actions/checkout@v3 - name: Tune Nvidia GPU run: | - ls /usr/local - ls /dev sudo nvidia-smi -pm 1 sudo nvidia-smi -ac 1215,1410 nvidia-smi From 582f6abc38664df1b65805ca8b5dfd4be04497ca Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 18 Jan 2024 19:35:28 -0500 Subject: [PATCH 4/6] Use the official runner docker image --- docker/gcp-a100-runner-dind.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/gcp-a100-runner-dind.dockerfile b/docker/gcp-a100-runner-dind.dockerfile index ed92648806..a082ae7277 100644 --- a/docker/gcp-a100-runner-dind.dockerfile +++ b/docker/gcp-a100-runner-dind.dockerfile @@ -1,6 +1,6 @@ # default base image: ghcr.io/actions/actions-runner:latest # base image: Ubuntu 22.04 jammy -ARG BASE_IMAGE=summerwind/actions-runner-dind:latest +ARG BASE_IMAGE=ghcr.io/actions/actions-runner:latest FROM ${BASE_IMAGE} ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 From d2c79d245a5c62c8678aa291feeb1bdbf9c14eb3 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 18 Jan 2024 22:59:52 -0500 Subject: [PATCH 5/6] Install g++ to the base docker --- docker/gcp-a100-runner-dind.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/gcp-a100-runner-dind.dockerfile b/docker/gcp-a100-runner-dind.dockerfile index a082ae7277..c27db6cbb5 100644 --- a/docker/gcp-a100-runner-dind.dockerfile +++ b/docker/gcp-a100-runner-dind.dockerfile @@ -11,7 +11,7 @@ ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 RUN sudo apt-get -y update && sudo apt -y update # fontconfig: needed by model doctr_det_predictor -RUN sudo apt-get install -y git jq \ +RUN sudo apt-get install -y git jq gcc g++ \ vim wget curl ninja-build cmake \ libgl1-mesa-glx libsndfile1-dev kmod libxml2-dev libxslt1-dev \ fontconfig libfontconfig1-dev \ From df3fe0c2bfa7e761e22429aecbb524478b8ee2d7 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 19 Jan 2024 15:56:37 -0500 Subject: [PATCH 6/6] Updated NVIDIA driver versions. --- docker/gcp-a100-runner-dind.dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/gcp-a100-runner-dind.dockerfile b/docker/gcp-a100-runner-dind.dockerfile index a082ae7277..cc347ff585 100644 --- a/docker/gcp-a100-runner-dind.dockerfile +++ b/docker/gcp-a100-runner-dind.dockerfile @@ -5,9 +5,9 @@ FROM ${BASE_IMAGE} ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 # GKE version: 1.28.5-gke.1217000 -# GKE release notes: https://cloud.google.com/kubernetes-engine/docs/release-notes#current_versions -# Default NVIDIA driver version: 535.129.03 -# We assume that the host driver libraries are mapped to the docker filesystem +# NVIDIA driver version: 535.104.05 +# NVIDIA drivers list available at gs://ubuntu_nvidia_packages/ +# We assume that the host NVIDIA driver binaries and libraries are mapped to the docker filesystem RUN sudo apt-get -y update && sudo apt -y update # fontconfig: needed by model doctr_det_predictor