Skip to content

Commit

Permalink
Improve the CI by separating CPU and CUDA tests (#2327)
Browse files Browse the repository at this point in the history
Summary:
We are separating CPU and CUDA tests using different runners.

Pull Request resolved: #2327

Reviewed By: huydhn

Differential Revision: D58902378

Pulled By: xuzhao9

fbshipit-source-id: 6c2598dc2cd5c94105f1eb15862da924ae25894c
  • Loading branch information
xuzhao9 authored and facebook-github-bot committed Jun 22, 2024
1 parent d910b8a commit 1f98239
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 168 deletions.
29 changes: 29 additions & 0 deletions .ci/torchbench/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Create a fresh conda environment for a CI run and install TorchBench into it.
#
# Required environment variables:
#   BASE_CONDA_ENV  name of the existing conda env that gets cloned
#   CONDA_ENV       name of the env that is (re)created for this run
#   SETUP_SCRIPT    path of a shell script that is sourced before conda is used
#
# Must be run with bash: the validation loop relies on indirect expansion.

. "${HOME}/miniconda3/etc/profile.d/conda.sh"

# Fail early, with the same message per variable, if any input is missing.
for required_var in BASE_CONDA_ENV CONDA_ENV SETUP_SCRIPT; do
  if [ -z "${!required_var}" ]; then
    echo "ERROR: ${required_var} is not set" >&2
    exit 1
  fi
done

# The setup script is sourced with CONDA_ENV pointing at the base env.
# The target name is saved and restored explicitly instead of relying on the
# `VAR=value . script` prefix form: in POSIX-mode shells such an assignment
# persists after a special builtin, and the `conda remove` below would then
# be aimed at the base env.
target_conda_env="${CONDA_ENV}"
CONDA_ENV="${BASE_CONDA_ENV}"
. "${SETUP_SCRIPT}"
CONDA_ENV="${target_conda_env}"

conda activate "${BASE_CONDA_ENV}"
# Remove the conda env if exists
conda remove --name "${CONDA_ENV}" -y --all || true
# Stop if the clone or its activation fails; otherwise the install below
# would run in whichever env happens to be active.
conda create --name "${CONDA_ENV}" -y --clone "${BASE_CONDA_ENV}" || exit 1
conda activate "${CONDA_ENV}" || exit 1

# Change to the directory two levels above this script.
parent_dir=$(dirname "$(readlink -f "$0")")/../..
cd "${parent_dir}" || exit 1

# Print the torch version and git revision into the CI log.
python -c "import torch; print(torch.__version__); print(torch.version.git_version)"
python install.py
31 changes: 31 additions & 0 deletions .ci/torchbench/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash
# Run the TorchBench tests inside an already-prepared conda environment.
#
# Required environment variables:
#   CONDA_ENV     name of the conda env to activate
#   TEST_CONFIG   test selector; 'cpu' additionally runs the worker tests,
#                 and the value is passed to test.py as the -k filter
#   SETUP_SCRIPT  path of a shell script that is sourced before conda is used
#
# Exits non-zero if ANY test command fails. Every command is still run so
# the log contains the complete set of results.
#
# Must be run with bash: uses [[ ]] and indirect expansion.

. "${HOME}/miniconda3/etc/profile.d/conda.sh"

# Fail early, with the same message per variable, if any input is missing.
for required_var in CONDA_ENV TEST_CONFIG SETUP_SCRIPT; do
  if [ -z "${!required_var}" ]; then
    echo "ERROR: ${required_var} is not set" >&2
    exit 1
  fi
done

. "${SETUP_SCRIPT}"
# Without the requested env the results would be meaningless, so stop here.
conda activate "${CONDA_ENV}" || exit 1

# Change to the directory two levels above this script.
parent_dir=$(dirname "$(readlink -f "$0")")/../..
cd "${parent_dir}" || exit 1

# Holds the first non-zero exit code seen. Previously only the status of the
# final command reached the caller, so worker-test failures were masked.
exit_status=0

# Test subprocess worker
if [[ "$TEST_CONFIG" == 'cpu' ]]; then
  python -m torchbenchmark._components.test.test_subprocess || exit_status=$?
  python -m torchbenchmark._components.test.test_worker || exit_status=$?
fi

# Test models
python test.py -v -k "$TEST_CONFIG" || exit_status=$?

exit "${exit_status}"
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
name: TorchBench PR Test on A10G
name: linux-test-cpu
on:
pull_request:
workflow_dispatch:
push:
branches:
- main

env:
CONDA_ENV: "torchbench"
DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest"
SETUP_SCRIPT: "/workspace/setup_instance.sh"
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

workflow_call:
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
jobs:
pr-test:
# AWS A10G GPU instance label: linux.g5.4xlarge.nvidia.gpu
# OS version: Amazon Linux 2
runs-on: [self-hosted, linux.g5.4xlarge.nvidia.gpu]
timeout-minutes: 1440 # 24 hours
linux-test-cpu:
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on: [self-hosted, linux.24xlarge]
timeout-minutes: 240
environment: docker-s3-upload
env:
BASE_CONDA_ENV: "torchbench"
CONDA_ENV: "pr-test-cpu"
DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest"
SETUP_SCRIPT: "/workspace/setup_instance.sh"
TEST_CONFIG: "cpu"
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
steps:
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
Expand All @@ -33,32 +34,26 @@ jobs:
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ env.DOCKER_IMAGE }}
- name: Install NVIDIA Driver, docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
- name: Install and Test TorchBench
run: |
container_name=$(docker run \
-e BASE_CONDA_ENV="${BASE_CONDA_ENV}" \
-e CONDA_ENV="${CONDA_ENV}" \
-e SETUP_SCRIPT="${SETUP_SCRIPT}" \
-e HUGGING_FACE_HUB_TOKEN="${HUGGING_FACE_HUB_TOKEN}" \
-e TEST_CONFIG="${TEST_CONFIG}" \
--tty \
--detach \
--shm-size=32gb \
-v "${PWD}/benchmark:/benchmark" \
--gpus all \
-w / \
"${{ env.DOCKER_IMAGE }}" \
tail -f /dev/null
)
echo "Container name: ${container_name}"
docker exec -t -w "/" "${container_name}" bash -c "sudo chown -R runner /benchmark; sudo chgrp -R runner /benchmark"
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_install.sh
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_test.sh
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/.ci/torchbench/install.sh
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/.ci/torchbench/test.sh
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
name: TorchBench PR Test
name: linux-test-cuda
on:
pull_request:
workflow_dispatch:
push:
branches:
- main
workflow_call:
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
jobs:
pr-test:
linux-test-cuda:
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on: [a100-runner]
timeout-minutes: 240
environment: docker-s3-upload
env:
BASE_CONDA_ENV: "torchbench"
CONDA_ENV: "pr-ci-a100"
CONDA_ENV: "pr-test-cuda"
SETUP_SCRIPT: "/workspace/setup_instance.sh"
TEST_CONFIG: "cuda"
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
runs-on: [a100-runner]
timeout-minutes: 1440 # 24 hours
environment: docker-s3-upload
steps:
- name: Checkout TorchBench
uses: actions/checkout@v3
Expand All @@ -25,23 +29,15 @@ jobs:
sudo nvidia-smi -ac 1215,1410
sudo ldconfig
nvidia-smi
- name: Clone and setup Conda env
run: |
CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
- name: Install TorchBench
run: |
bash ./scripts/torchbench_install.sh
- name: Validate benchmark components
bash ./.ci/torchbench/install.sh
- name: Test TorchBench
run: |
bash ./scripts/torchbench_test.sh
bash ./.ci/torchbench/test.sh
- name: Clean up Conda env
if: always()
run: |
. "${SETUP_SCRIPT}"
conda deactivate && conda deactivate
conda remove -n "${CONDA_ENV}" --all
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
74 changes: 0 additions & 74 deletions .github/workflows/pr-gpu-stability-ci.yml

This file was deleted.

21 changes: 21 additions & 0 deletions .github/workflows/pr-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# Entry-point workflow: fans out to the reusable CPU and CUDA test workflows.
name: TorchBench PR Test

# Runs on every pull request, on manual dispatch, and on pushes to main.
on:
  pull_request:
  workflow_dispatch:
  push:
    branches:
      - main

jobs:
  # CPU test suite, defined in the reusable workflow referenced below.
  cpu-test:
    uses: ./.github/workflows/_linux-test-cpu.yml
    secrets:
      HUGGING_FACE_HUB_TOKEN: "${{ secrets.HUGGING_FACE_HUB_TOKEN }}"
  # CUDA test suite, defined in the reusable workflow referenced below.
  cuda-test:
    uses: ./.github/workflows/_linux-test-cuda.yml
    secrets:
      HUGGING_FACE_HUB_TOKEN: "${{ secrets.HUGGING_FACE_HUB_TOKEN }}"

# One active run per workflow and PR number (or commit SHA when there is no
# PR); manually dispatched runs get their own group. A newer run cancels the
# one already in progress.
concurrency:
  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}"
  cancel-in-progress: true
20 changes: 0 additions & 20 deletions scripts/torchbench_install.sh

This file was deleted.

24 changes: 0 additions & 24 deletions scripts/torchbench_test.sh

This file was deleted.

0 comments on commit 1f98239

Please sign in to comment.