From 69877662ecb78461bd9da14cdf88f60d9fc99824 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 17 Dec 2024 13:48:42 -0800 Subject: [PATCH] [BE] consolidate 4-GPU integration tests into 8-GPU tests and reduce frequency ghstack-source-id: 5e926c4f72bcf7d56c06f3ac0eae57fd235975ee Pull Request resolved: https://github.com/pytorch/torchtitan/pull/745 --- .github/workflows/integration_test_4gpu.yaml | 46 -------------------- .github/workflows/integration_test_8gpu.yaml | 10 +++-- README.md | 1 - tests/integration_tests.py | 6 +-- 4 files changed, 8 insertions(+), 55 deletions(-) delete mode 100644 .github/workflows/integration_test_4gpu.yaml diff --git a/.github/workflows/integration_test_4gpu.yaml b/.github/workflows/integration_test_4gpu.yaml deleted file mode 100644 index 61f3e20b..00000000 --- a/.github/workflows/integration_test_4gpu.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: 4 GPU Integration Test - -on: - push: - branches: [ main ] - pull_request: - schedule: - # Runs hourly - - cron: '0 * * * *' - -concurrency: - group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} - cancel-in-progress: true - -defaults: - run: - shell: bash -l -eo pipefail {0} - -jobs: - build-test: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.g5.12xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.1" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 - repository: pytorch/torchtitan - upload-artifact: outputs - script: | - set -eux - - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - pip config --user set global.progress_bar off - - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124 - - # install torchtitan to test the files in ./scripts, currently just for memory estimation - python -m pip install -e . - - mkdir artifacts-to-be-uploaded - python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 4 diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index dd657e42..ec3f8eaf 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -5,8 +5,8 @@ on: branches: [ main ] pull_request: schedule: - # Runs nightly - - cron: '0 0 * * *' + # Runs every 6 hours + - cron: '0 */6 * * *' concurrency: group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} cancel-in-progress: true @@ -21,7 +21,7 @@ jobs: with: runner: linux.g5.48xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" # This image is faster to clone than the default, but it lacks CC needed by triton # (1m25s vs 2m37s). docker-image: torchtitan-ubuntu-20.04-clang12 @@ -37,5 +37,9 @@ jobs: pip config --user set global.progress_bar off python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124 + + # install torchtitan to test the files in ./scripts + python -m pip install -e . + mkdir artifacts-to-be-uploaded python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 diff --git a/README.md b/README.md index 0c47e2d3..82469f4d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ -[![4 GPU Integration Test](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_4gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_4gpu.yaml?query=branch%3Amain) [![8 GPU Integration Test](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml?query=branch%3Amain) # torchtitan diff --git a/tests/integration_tests.py b/tests/integration_tests.py index 0b20154e..684c2018 100755 --- a/tests/integration_tests.py +++ b/tests/integration_tests.py @@ -472,10 +472,6 @@ def run_tests(args): f"Skipping test {test_flavor.test_name} that requires {test_flavor.ngpu} gpus," f" because --ngpu arg is {args.ngpu}" ) - elif args.ngpu == 8 and test_flavor.ngpu != 8: - logger.info( - f"Skipping non-8gpu test {test_flavor.test_name} on 8-gpu runner" - ) else: run_test(test_flavor, full_path, args.output_dir) @@ -489,7 +485,7 @@ def main(): default="all", help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)", ) - parser.add_argument("--ngpu", default=4, type=int) + parser.add_argument("--ngpu", default=8, type=int) args = parser.parse_args() if not os.path.exists(args.output_dir):