Ported gpu hvd tests from circleci to GHA (pytorch#2619)
* Ported gpu hvd tests from circleci to GHA

* Fixing HVD installation

* WIP replaced pip with conda

* Updates

* Update gpu-hvd-tests.yml

* Update gpu-hvd-tests.yml

* More updates

* Do not build with NCCL

* Updated cache name

* WIP

* Use AGENT_TOOLSDIRECTORY for isolation

* Fixing horovod installation

* Added cmake and scikit-build step

* Ported tests to pytorch infra

* Added set -e to gpu-*tests.yml

* Fixed issue with test__hvd_dist_model_warning_index_less_localrank
vfdev-5 authored Aug 29, 2023
1 parent 11a1fba commit f2b1183
Showing 3 changed files with 197 additions and 9 deletions.
188 changes: 188 additions & 0 deletions .github/workflows/gpu-hvd-tests.yml
@@ -0,0 +1,188 @@
name: Run HVD-specific unit tests on GPUs
on:
  push:
    paths:
      - "ignite/**"
      - "tests/ignite/**"
      - "tests/run_gpu_tests.sh"
      - "tests/run_code_style.sh"
      - "examples/**.py"
      - "requirements-dev.txt"
      - ".github/workflows/gpu-hvd-tests.yml"
  workflow_dispatch:

concurrency:
  # <workflow_name>-<branch_name>-<true || commit_sha (if branch is protected)>
  group: gpu-hvd-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
  cancel-in-progress: true

# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml

jobs:
  gpu-hvd-tests:
    strategy:
      matrix:
        pytorch-channel: [pytorch, ]
      fail-fast: false
    env:
      DOCKER_IMAGE: "pytorch/conda-builder:cuda11.8"
      REPOSITORY: ${{ github.repository }}
      PR_NUMBER: ${{ github.event.pull_request.number }}
    runs-on: linux.8xlarge.nvidia.gpu
    timeout-minutes: 60

    steps:
      - name: Clean workspace
        run: |
          echo "::group::Cleanup debug output"
          sudo rm -rfv "${GITHUB_WORKSPACE}"
          mkdir -p "${GITHUB_WORKSPACE}"
          echo "::endgroup::"
      - name: Checkout repository (pytorch/test-infra)
        uses: actions/checkout@v3
        with:
          # Support the use case where we need to checkout someone's fork
          repository: pytorch/test-infra
          path: test-infra

      - name: Setup Linux
        uses: ./test-infra/.github/actions/setup-linux

      - name: Pull docker image
        uses: ./test-infra/.github/actions/pull-docker-image
        with:
          docker-image: ${{ env.DOCKER_IMAGE }}

      - name: Checkout repository (${{ github.repository }})
        uses: actions/checkout@v3
        with:
          # Support the use case where we need to checkout someone's fork
          repository: ${{ github.repository }}
          ref: ${{ github.ref }}
          path: ${{ github.repository }}
          fetch-depth: 1

      - name: Start Pytorch container
        working-directory: ${{ github.repository }}
        run: |
          docker run --name pthd --gpus=all --rm \
            --cap-add=SYS_PTRACE \
            --detach \
            --ipc=host \
            --security-opt seccomp=unconfined \
            --shm-size=2g \
            --tty \
            --ulimit stack=10485760:83886080 \
            -v $PWD:/work \
            -w /work \
            ${DOCKER_IMAGE}
          script=$(cat << EOF
          set -xe
          nvidia-smi
          ls -alh
          conda --version
          python --version
          EOF
          )
          docker exec -t pthd /bin/bash -c "${script}"
      - name: Install PyTorch and dependencies
        continue-on-error: false
        run: |
          script=$(cat << EOF
          set -xe
          # Install PyTorch
          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
            pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118
          else
            pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118
          fi
          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
          pip list
          # Install dependencies
          pip install -r requirements-dev.txt
          pip install -e .
          EOF
          )
          docker exec -t pthd /bin/bash -c "${script}"
      - name: Install Horovod with NCCL GPU ops
        run: |
          script=$(cat << EOF
          set -xe
          HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
          horovodrun --check-build
          pip list
          EOF
          )
          docker exec -t pthd /bin/bash -c "${script}"
      - name: Run GPU and CPU Unit HVD Tests
        run: |
          script=$(cat << EOF
          set -xe
          bash tests/run_gpu_tests.sh 2 hvd
          CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
          EOF
          )
          docker exec -t pthd /bin/bash -c "${script}"
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          file: ${{ github.repository }}/coverage.xml
          flags: gpu-2
          fail_ci_if_error: false

      - name: Run examples in container
        continue-on-error: false
        run: |
          SCRIPT=$(cat << EOF
          set -xe
          # Install additional example dependencies
          pip install fire
          # Check training on CIFAR10, run with horovod backend using horovodrun
          # initial run
          CI=1 horovodrun -np 2 python -u examples/cifar10/main.py run --backend=horovod --checkpoint_every=200 --stop_iteration=500
          # resume
          CI=1 horovodrun -np 2 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt
          # Check training on CIFAR10 using spawn
          # initial run
          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
          # resume
          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt
          EOF
          )
          docker exec -t pthd /bin/bash -c "${SCRIPT}"
      - name: Teardown Linux
        if: ${{ always() }}
        uses: ./test-infra/.github/actions/teardown-linux
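
For local debugging outside of CI, the container-based steps of this new workflow can be approximated by hand. The sketch below is not part of the commit; it simply replays the image and commands used in the steps above and assumes a Linux host with the NVIDIA container runtime and at least 2 GPUs:

    # Sketch only: replay the CI steps locally (assumes nvidia-docker and >= 2 GPUs).
    docker run --name pthd --gpus=all --rm --detach --ipc=host --shm-size=2g --tty \
      -v "$PWD":/work -w /work pytorch/conda-builder:cuda11.8
    docker exec -t pthd /bin/bash -c '
      set -xe
      pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118
      pip install -r requirements-dev.txt && pip install -e .
      HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
      horovodrun --check-build
      bash tests/run_gpu_tests.sh 2 hvd
    '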
16 changes: 8 additions & 8 deletions .github/workflows/gpu-tests.yml
@@ -19,7 +19,7 @@ concurrency:
# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml

jobs:
gpu-tests:
gpu-tests:
strategy:
matrix:
pytorch-channel: [pytorch, pytorch-nightly]
@@ -80,7 +80,7 @@ jobs:
script=$(cat << EOF
set -x
set -xe
nvidia-smi
ls -alh
@@ -98,7 +98,7 @@ jobs:
script=$(cat << EOF
set -x
set -xe
# Install PyTorch
if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
@@ -119,13 +119,13 @@ jobs:
docker exec -t pthd /bin/bash -c "${script}"
- name: Run 1 Node 2 GPUs Unit Tests
- name: Run GPU Unit Tests
continue-on-error: false
run: |
script=$(cat << EOF
set -x
set -xe
bash tests/run_gpu_tests.sh 2
@@ -145,8 +145,8 @@ jobs:
continue-on-error: false
run: |
SCRIPT=$(cat << EOF
set -x
set -xe
# Install additional example dependencies
pip install fire
@@ -156,7 +156,7 @@ jobs:
CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500
## resume
CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt
# Check training on cifar10, run with NCCL backend using torchrun
## initial run
CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
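
Context for the repeated `set -x` → `set -xe` edits in the gpu-tests.yml hunks above: with `-e`, a failing command inside the heredoc script aborts the docker exec step instead of letting the step report success. A minimal illustration (not part of the diff):

    bash -c 'set -x;  false; echo "still exits 0"'     # exit status comes from the last command
    bash -c 'set -xe; false; echo "never reached"'     # aborts at false with exit status 1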
2 changes: 1 addition & 1 deletion tests/ignite/distributed/comp_models/test_horovod.py
@@ -184,7 +184,7 @@ def _test__hvd_dist_model_warning_index_less_localrank():
@pytest.mark.distributed
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Skip if less than 2 GPUs")
def test__hvd_dist_model_warning_index_less_localrank(gloo_hvd_executor):
gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), num_proc=torch.cuda.device_count())
gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), np=torch.cuda.device_count())


def _test_dist_spawn_fn(local_rank, backend, world_size, device):
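
The one-line change above switches the keyword argument passed to the gloo_hvd_executor fixture from num_proc to np, matching the process-count argument name used by Horovod's run API. One way to exercise just this test locally (a sketch, assuming at least 2 GPUs and a Horovod build with Gloo support, and that the distributed pytest marker is registered as in the workflow above):

    pytest tests/ignite/distributed/comp_models/test_horovod.py \
      -m distributed -k "warning_index_less_localrank" -vvv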
