diff --git a/scripts/torchbench_install.sh b/.ci/torchbench/install.sh similarity index 59% rename from scripts/torchbench_install.sh rename to .ci/torchbench/install.sh index 5c1d5217a9..18d0b580d5 100644 --- a/scripts/torchbench_install.sh +++ b/.ci/torchbench/install.sh @@ -1,5 +1,10 @@ . ${HOME}/miniconda3/etc/profile.d/conda.sh +if [ -z "${BASE_CONDA_ENV}" ]; then + echo "ERROR: BASE_CONDA_ENV is not set" + exit 1 +fi + if [ -z "${CONDA_ENV}" ]; then echo "ERROR: CONDA_ENV is not set" exit 1 @@ -10,10 +15,11 @@ if [ -z "${SETUP_SCRIPT}" ]; then exit 1 fi -. "${SETUP_SCRIPT}" +CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}" +conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}" conda activate "${CONDA_ENV}" -parent_dir=$(dirname "$(readlink -f "$0")")/.. +parent_dir=$(dirname "$(readlink -f "$0")")/../.. cd ${parent_dir} python -c "import torch; print(torch.__version__); print(torch.version.git_version)" diff --git a/.ci/torchbench/test.sh b/.ci/torchbench/test.sh new file mode 100644 index 0000000000..9cba55cda7 --- /dev/null +++ b/.ci/torchbench/test.sh @@ -0,0 +1,31 @@ +. ${HOME}/miniconda3/etc/profile.d/conda.sh + +if [ -z "${CONDA_ENV}" ]; then + echo "ERROR: CONDA_ENV is not set" + exit 1 +fi + +if [ -z "${TEST_CONFIG}" ]; then + echo "ERROR: TEST_CONFIG is not set" + exit 1 +fi + +if [ -z "${SETUP_SCRIPT}" ]; then + echo "ERROR: SETUP_SCRIPT is not set" + exit 1 +fi + +. "${SETUP_SCRIPT}" +conda activate "${CONDA_ENV}" + +parent_dir=$(dirname "$(readlink -f "$0")")/../.. 
+cd ${parent_dir} + +# Test subprocess worker +if [[ "$TEST_CONFIG" == 'cpu' ]]; then + python -m torchbenchmark._components.test.test_subprocess + python -m torchbenchmark._components.test.test_worker +fi + +# Test models +python test.py -v -k "$TEST_CONFIG" diff --git a/.github/workflows/pr-a10g.yml b/.github/workflows/_linux-test-cpu.yml similarity index 67% rename from .github/workflows/pr-a10g.yml rename to .github/workflows/_linux-test-cpu.yml index fb390a7638..ec4bca72ae 100644 --- a/.github/workflows/pr-a10g.yml +++ b/.github/workflows/_linux-test-cpu.yml @@ -1,25 +1,25 @@ -name: TorchBench PR Test on A10G -on: - pull_request: - workflow_dispatch: - push: - branches: - - main - -env: - CONDA_ENV: "torchbench" - DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest" - SETUP_SCRIPT: "/workspace/setup_instance.sh" - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - +name: linux-test +on: + workflow_call: + inputs: + timeout-minutes: + required: true + type: number jobs: - pr-test: - # AWS A10G GPU instance label: linux.g5.4xlarge.nvidia.gpu - # OS version: Amazon Linux 2 - runs-on: [self-hosted, linux.g5.4xlarge.nvidia.gpu] - timeout-minutes: 1440 # 24 hours + pr-test-cpu: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: [linux.24xlarge] + timeout-minutes: ${{ inputs.timeout-minutes }} environment: docker-s3-upload + env: + BASE_CONDA_ENV: "torchbench" + CONDA_ENV: "pr-test-cpu" + DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest" + SETUP_SCRIPT: "/workspace/setup_instance.sh" + TEST_CONFIG: "cpu" + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" uses: pytorch/test-infra/.github/actions/setup-ssh@main @@ -33,28 +33,26 @@ jobs: uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: docker-image: ${{ env.DOCKER_IMAGE }} - - name: Install NVIDIA Driver, docker runtime, set GPU_FLAG - id: install-nvidia-driver - uses: pytorch/test-infra/.github/actions/setup-nvidia@main - name: Install and Test 
TorchBench run: | container_name=$(docker run \ + -e BASE_CONDA_ENV="${BASE_CONDA_ENV}" \ -e CONDA_ENV="${CONDA_ENV}" \ -e SETUP_SCRIPT="${SETUP_SCRIPT}" \ -e HUGGING_FACE_HUB_TOKEN="${HUGGING_FACE_HUB_TOKEN}" \ + -e TEST_CONFIG="${TEST_CONFIG}" \ --tty \ --detach \ --shm-size=32gb \ -v "${PWD}/benchmark:/benchmark" \ - --gpus all \ -w / \ "${{ env.DOCKER_IMAGE }}" \ tail -f /dev/null ) echo "Container name: ${container_name}" docker exec -t -w "/" "${container_name}" bash -c "sudo chown -R runner /benchmark; sudo chgrp -R runner /benchmark" - docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_install.sh - docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_test.sh + docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/.ci/torchbench/install.sh + docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/.ci/torchbench/test.sh - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main if: always() diff --git a/.github/workflows/pr-gha-runner.yml b/.github/workflows/_linux-test-cuda.yml similarity index 76% rename from .github/workflows/pr-gha-runner.yml rename to .github/workflows/_linux-test-cuda.yml index 0c1a3d3a6e..7dd9403ed5 100644 --- a/.github/workflows/pr-gha-runner.yml +++ b/.github/workflows/_linux-test-cuda.yml @@ -1,21 +1,24 @@ -name: TorchBench PR Test -on: - pull_request: - workflow_dispatch: - push: - branches: - - main +name: linux-test +on: + workflow_call: + inputs: + timeout-minutes: + required: true + type: number jobs: - pr-test: + pr-test-cuda: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: [a100-runner] + timeout-minutes: ${{ inputs.timeout-minutes }} + environment: docker-s3-upload env: BASE_CONDA_ENV: "torchbench" - CONDA_ENV: "pr-ci-a100" + CONDA_ENV: "pr-test-cuda" SETUP_SCRIPT: "/workspace/setup_instance.sh" + TEST_CONFIG: "cuda" HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - runs-on: [a100-runner] - timeout-minutes: 1440 # 24 hours - environment: 
docker-s3-upload steps: - name: Checkout TorchBench uses: actions/checkout@v3 @@ -31,10 +28,10 @@ jobs: conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}" - name: Install TorchBench run: | - bash ./scripts/torchbench_install.sh - - name: Validate benchmark components + bash ./.ci/torchbench/install.sh + - name: Test TorchBench run: | - bash ./scripts/torchbench_test.sh + bash ./.ci/torchbench/test.sh - name: Clean up Conda env if: always() run: | diff --git a/.github/workflows/pr-gpu-stability-ci.yml b/.github/workflows/pr-gpu-stability-ci.yml deleted file mode 100644 index c0ec193e29..0000000000 --- a/.github/workflows/pr-gpu-stability-ci.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: TorchBench GPU model stability test -on: - workflow_dispatch: - inputs: - model: - description: "Model Name" - required: true - default: "fastNLP_Bert" - pull_request: - -jobs: - stability_test: - env: - CONDA_ENV: "stability-test-ci" - TEST_HOME: "/tmp/tb-stability-ci" - PYTHON_VERSION: "3.8" - CUDA_VERSION: "cu116" - PR_BODY: ${{ github.event.pull_request.body }} - MODEL: ${{ github.event.inputs.model }} - GPU_ID: "1" - GPU_FREQ: "5001,900" - REPEAT: "10" - if: ${{ (github.event.inputs.model || contains(github.event.pull_request.body, 'STABLE_TEST_MODEL:')) }} - runs-on: [self-hosted, bm-runner] - timeout-minutes: 120 # 2 hours - environment: docker-s3-upload - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Create conda environment with pytorch nightly - run: | - conda create -y -n "${CONDA_ENV}" python="${PYTHON_VERSION}" - . activate "${CONDA_ENV}" - conda install -y numpy requests=2.22 ninja pyyaml mkl mkl-include setuptools \ - cmake cffi typing_extensions future six dataclasses tabulate gitpython - # Install pytorch nightly - pip install --pre torch torchvision torchaudio \ - -f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html - # Install torchbench dependencies - python install.py - - name: Stability test - run: | - . 
activate "${CONDA_ENV}" - mkdir -p "${TEST_HOME}" - if [ -z "${MODEL}" ] ; then - # Load PR to file - PR_BODY_FILE="${TEST_HOME}"/pr-body.txt - echo "${PR_BODY}" > "${PR_BODY_FILE}" - MODEL=`python ./.github/scripts/test-repeated-runs.py --pr-body "${PR_BODY_FILE}"` - fi - # Setup nvidia gpu frequency - sudo nvidia-persistenced --user "${USER}" || true - sudo nvidia-smi -pm "${GPU_ID}" - sudo nvidia-smi -ac "${GPU_FREQ}" - # Run the tests - EVAL_LOG="${TEST_HOME}/eval-${MODEL}.log" - echo -n > "${EVAL_LOG}" - for i in `seq 1 ${REPEAT}`; do - python run.py "${MODEL}" -t eval -d cuda | tee -a "${EVAL_LOG}" - done - TRAIN_LOG="${TEST_HOME}/train-${MODEL}.log" - echo -n > "${TRAIN_LOG}" - for i in `seq 1 ${REPEAT}`; do - python run.py "${MODEL}" -t train -d cuda | tee -a "${TRAIN_LOG}" - done - # Check the stability of GPU tests - python ./.github/scripts/test-repeated-runs.py --log "${EVAL_LOG}" && \ - echo "GPU stability test pass for inference!" - python ./.github/scripts/test-repeated-runs.py --log "${TRAIN_LOG}" && \ - echo "GPU stability test pass for train!" 
- - name: Remove conda environment - run: | - conda env remove --name "${CONDA_ENV}" - diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml new file mode 100644 index 0000000000..cf53acd187 --- /dev/null +++ b/.github/workflows/pr-test.yml @@ -0,0 +1,21 @@ +name: TorchBench PR Test +on: + pull_request: + workflow_dispatch: + push: + branches: + - main + +jobs: + cpu-test: + uses: ./.github/workflows/_linux-test-cpu.yml + with: + timeout-minutes: 120 # 2 hours + cuda-test: + uses: ./.github/workflows/_linux-test-cuda.yml + with: + timeout-minutes: 120 # 2 hours + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true diff --git a/scripts/torchbench_test.sh b/scripts/torchbench_test.sh deleted file mode 100644 index 7411b23d96..0000000000 --- a/scripts/torchbench_test.sh +++ /dev/null @@ -1,24 +0,0 @@ -. ${HOME}/miniconda3/etc/profile.d/conda.sh - -if [ -z "${CONDA_ENV}" ]; then - echo "ERROR: CONDA_ENV is not set" - exit 1 -fi - -if [ -z "${SETUP_SCRIPT}" ]; then - echo "ERROR: SETUP_SCRIPT is not set" - exit 1 -fi - -. "${SETUP_SCRIPT}" -conda activate "${CONDA_ENV}" - -parent_dir=$(dirname "$(readlink -f "$0")")/.. -cd ${parent_dir} - -# Test subprocess worker -python -m torchbenchmark._components.test.test_subprocess -python -m torchbenchmark._components.test.test_worker - -# Test models -python test.py -v