Skip to content

Commit

Permalink
Improve the ci runners
Browse files Browse the repository at this point in the history
  • Loading branch information
xuzhao9 committed Jun 21, 2024
1 parent d910b8a commit 4078588
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 140 deletions.
10 changes: 8 additions & 2 deletions scripts/torchbench_install.sh → .ci/torchbench/install.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
. ${HOME}/miniconda3/etc/profile.d/conda.sh

if [ -z "${BASE_CONDA_ENV}" ]; then
echo "ERROR: BASE_CONDA_ENV is not set"
exit 1
fi

if [ -z "${CONDA_ENV}" ]; then
echo "ERROR: CONDA_ENV is not set"
exit 1
Expand All @@ -10,10 +15,11 @@ if [ -z "${SETUP_SCRIPT}" ]; then
exit 1
fi

. "${SETUP_SCRIPT}"
CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
conda activate "${CONDA_ENV}"

parent_dir=$(dirname "$(readlink -f "$0")")/..
parent_dir=$(dirname "$(readlink -f "$0")")/../..
cd ${parent_dir}

python -c "import torch; print(torch.__version__); print(torch.version.git_version)"
Expand Down
31 changes: 31 additions & 0 deletions .ci/torchbench/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Runs the TorchBench tests inside an existing conda environment.
#
# Required environment variables:
#   CONDA_ENV     name of the conda environment to activate
#   TEST_CONFIG   selector passed to `test.py -k`; when it is exactly 'cpu'
#                 the subprocess-worker component tests are run first
#   SETUP_SCRIPT  path of a shell script that is sourced before activation
#
# Exit status: 1 when a required variable is missing or the repository root
# cannot be entered; otherwise the status of the first failing test command,
# or of `test.py` when everything before it passed.

. "${HOME}/miniconda3/etc/profile.d/conda.sh"

if [ -z "${CONDA_ENV}" ]; then
  echo "ERROR: CONDA_ENV is not set"
  exit 1
fi

if [ -z "${TEST_CONFIG}" ]; then
  echo "ERROR: TEST_CONFIG is not set"
  exit 1
fi

if [ -z "${SETUP_SCRIPT}" ]; then
  echo "ERROR: SETUP_SCRIPT is not set"
  exit 1
fi

. "${SETUP_SCRIPT}"
conda activate "${CONDA_ENV}"

# This script lives two directories below the repository root.
parent_dir=$(dirname "$(readlink -f "$0")")/../..
# Quote the path (it may contain spaces) and stop if it cannot be entered,
# so the tests never run from an unexpected working directory.
cd "${parent_dir}" || exit 1

# Test subprocess worker
if [[ "$TEST_CONFIG" == 'cpu' ]]; then
  # Propagate failures: without the explicit exit, a failing component test
  # would be masked by the exit status of the final `test.py` run.
  python -m torchbenchmark._components.test.test_subprocess || exit $?
  python -m torchbenchmark._components.test.test_worker || exit $?
fi

# Test models
python test.py -v -k "$TEST_CONFIG"
Original file line number Diff line number Diff line change
@@ -1,25 +1,19 @@
name: TorchBench PR Test on A10G
on:
pull_request:
workflow_dispatch:
push:
branches:
- main

env:
CONDA_ENV: "torchbench"
DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest"
SETUP_SCRIPT: "/workspace/setup_instance.sh"
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

name: linux-test

jobs:
pr-test:
# AWS A10G GPU instance label: linux.g5.4xlarge.nvidia.gpu
# OS version: Amazon Linux 2
runs-on: [self-hosted, linux.g5.4xlarge.nvidia.gpu]
timeout-minutes: 1440 # 24 hours
pr-test-cpu:
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on: [linux.24xlarge]
timeout-minutes: ${{ inputs.timeout-minutes }}
environment: docker-s3-upload
env:
BASE_CONDA_ENV: "torchbench"
CONDA_ENV: "pr-test-cpu"
DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest"
SETUP_SCRIPT: "/workspace/setup_instance.sh"
TEST_CONFIG: "cpu"
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
steps:
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
Expand All @@ -33,28 +27,26 @@ jobs:
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ env.DOCKER_IMAGE }}
- name: Install NVIDIA Driver, docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
- name: Install and Test TorchBench
run: |
container_name=$(docker run \
-e BASE_CONDA_ENV="${BASE_CONDA_ENV}" \
-e CONDA_ENV="${CONDA_ENV}" \
-e SETUP_SCRIPT="${SETUP_SCRIPT}" \
-e HUGGING_FACE_HUB_TOKEN="${HUGGING_FACE_HUB_TOKEN}" \
-e TEST_CONFIG="${TEST_CONFIG}" \
--tty \
--detach \
--shm-size=32gb \
-v "${PWD}/benchmark:/benchmark" \
--gpus all \
-w / \
"${{ env.DOCKER_IMAGE }}" \
tail -f /dev/null
)
echo "Container name: ${container_name}"
docker exec -t -w "/" "${container_name}" bash -c "sudo chown -R runner /benchmark; sudo chgrp -R runner /benchmark"
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_install.sh
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_test.sh
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/.ci/torchbench/install.sh
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/.ci/torchbench/test.sh
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,18 @@
name: TorchBench PR Test
on:
pull_request:
workflow_dispatch:
push:
branches:
- main
name: linux-test

jobs:
pr-test:
pr-test-cuda:
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on: [a100-runner]
timeout-minutes: ${{ inputs.timeout-minutes }}
environment: docker-s3-upload
env:
BASE_CONDA_ENV: "torchbench"
CONDA_ENV: "pr-ci-a100"
CONDA_ENV: "pr-test-cuda"
SETUP_SCRIPT: "/workspace/setup_instance.sh"
TEST_CONFIG: "cuda"
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
runs-on: [a100-runner]
timeout-minutes: 1440 # 24 hours
environment: docker-s3-upload
steps:
- name: Checkout TorchBench
uses: actions/checkout@v3
Expand All @@ -31,10 +28,10 @@ jobs:
conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
- name: Install TorchBench
run: |
bash ./scripts/torchbench_install.sh
- name: Validate benchmark components
bash ./.ci/torchbench/install.sh
- name: Test TorchBench
run: |
bash ./scripts/torchbench_test.sh
bash ./.ci/torchbench/test.sh
- name: Clean up Conda env
if: always()
run: |
Expand Down
74 changes: 0 additions & 74 deletions .github/workflows/pr-gpu-stability-ci.yml

This file was deleted.

22 changes: 22 additions & 0 deletions .github/workflows/pr-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Entry-point workflow for TorchBench PR testing: triggers on pull requests,
# manual dispatch, and pushes to main, then delegates the CPU and CUDA test
# runs to two reusable workflows.
name: TorchBench PR Test
on:
  pull_request:
  workflow_dispatch:
  push:
    branches:
      - main

jobs:
  # A job that calls a reusable workflow may not set `timeout-minutes` itself;
  # the limit is handed to the called workflow as an input instead.
  # NOTE(review): the called workflows must declare `timeout-minutes` under
  # `on.workflow_call.inputs` - confirm.
  cpu-test:
    # Reusable workflows must live in `.github/workflows/` (plural).
    uses: ./.github/workflows/_linux-test-cpu.yml
    with:
      timeout-minutes: 120  # 2 hours
    # Secrets are not forwarded to called workflows by default; the test jobs
    # appear to read HUGGING_FACE_HUB_TOKEN, so pass the caller's secrets on.
    secrets: inherit
  cuda-test:
    uses: ./.github/workflows/_linux-test-cuda.yml
    with:
      timeout-minutes: 120  # 2 hours
    secrets: inherit

# One active run per PR (or per commit SHA for pushes); manual dispatches get
# their own group so they do not cancel, or get cancelled by, automatic runs.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
24 changes: 0 additions & 24 deletions scripts/torchbench_test.sh

This file was deleted.

0 comments on commit 4078588

Please sign in to comment.