diff --git a/.ci/torchbench/install.sh b/.ci/torchbench/install.sh new file mode 100644 index 0000000000..4828b67118 --- /dev/null +++ b/.ci/torchbench/install.sh @@ -0,0 +1,29 @@ +. "${HOME}/miniconda3/etc/profile.d/conda.sh" + +if [ -z "${BASE_CONDA_ENV}" ]; then + echo "ERROR: BASE_CONDA_ENV is not set" + exit 1 +fi + +if [ -z "${CONDA_ENV}" ]; then + echo "ERROR: CONDA_ENV is not set" + exit 1 +fi + +if [ -z "${SETUP_SCRIPT}" ]; then + echo "ERROR: SETUP_SCRIPT is not set" + exit 1 +fi + +CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}" +conda activate "${BASE_CONDA_ENV}" +# Remove the conda env if exists +conda remove --name "${CONDA_ENV}" -y --all || true +conda create --name "${CONDA_ENV}" -y --clone "${BASE_CONDA_ENV}" +conda activate "${CONDA_ENV}" + +parent_dir=$(dirname "$(readlink -f "$0")")/../.. +cd "${parent_dir}" + +python -c "import torch; print(torch.__version__); print(torch.version.git_version)" +python install.py diff --git a/.ci/torchbench/test.sh b/.ci/torchbench/test.sh new file mode 100644 index 0000000000..9cba55cda7 --- /dev/null +++ b/.ci/torchbench/test.sh @@ -0,0 +1,31 @@ +. "${HOME}/miniconda3/etc/profile.d/conda.sh" + +if [ -z "${CONDA_ENV}" ]; then + echo "ERROR: CONDA_ENV is not set" + exit 1 +fi + +if [ -z "${TEST_CONFIG}" ]; then + echo "ERROR: TEST_CONFIG is not set" + exit 1 +fi + +if [ -z "${SETUP_SCRIPT}" ]; then + echo "ERROR: SETUP_SCRIPT is not set" + exit 1 +fi + +. "${SETUP_SCRIPT}" +conda activate "${CONDA_ENV}" + +parent_dir=$(dirname "$(readlink -f "$0")")/../..
+cd ${parent_dir} + +# Test subprocess worker +if [[ "$TEST_CONFIG" == 'cpu' ]]; then + python -m torchbenchmark._components.test.test_subprocess + python -m torchbenchmark._components.test.test_worker +fi + +# Test models +python test.py -v -k "$TEST_CONFIG" diff --git a/.github/workflows/pr-a10g.yml b/.github/workflows/_linux-test-cpu.yml similarity index 60% rename from .github/workflows/pr-a10g.yml rename to .github/workflows/_linux-test-cpu.yml index fb390a7638..7aa1569797 100644 --- a/.github/workflows/pr-a10g.yml +++ b/.github/workflows/_linux-test-cpu.yml @@ -1,25 +1,26 @@ -name: TorchBench PR Test on A10G +name: linux-test-cpu on: - pull_request: - workflow_dispatch: - push: - branches: - - main - -env: - CONDA_ENV: "torchbench" - DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest" - SETUP_SCRIPT: "/workspace/setup_instance.sh" - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - + workflow_call: + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub jobs: - pr-test: - # AWS A10G GPU instance label: linux.g5.4xlarge.nvidia.gpu - # OS version: Amazon Linux 2 - runs-on: [self-hosted, linux.g5.4xlarge.nvidia.gpu] - timeout-minutes: 1440 # 24 hours + linux-test-cpu: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.24xlarge] + timeout-minutes: 240 environment: docker-s3-upload + env: + BASE_CONDA_ENV: "torchbench" + CONDA_ENV: "pr-test-cpu" + DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest" + SETUP_SCRIPT: "/workspace/setup_instance.sh" + TEST_CONFIG: "cpu" + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" uses: pytorch/test-infra/.github/actions/setup-ssh@main @@ -33,32 +34,26 @@ jobs: uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: docker-image: ${{ env.DOCKER_IMAGE }} - - name: Install 
NVIDIA Driver, docker runtime, set GPU_FLAG - id: install-nvidia-driver - uses: pytorch/test-infra/.github/actions/setup-nvidia@main - name: Install and Test TorchBench run: | container_name=$(docker run \ + -e BASE_CONDA_ENV="${BASE_CONDA_ENV}" \ -e CONDA_ENV="${CONDA_ENV}" \ -e SETUP_SCRIPT="${SETUP_SCRIPT}" \ -e HUGGING_FACE_HUB_TOKEN="${HUGGING_FACE_HUB_TOKEN}" \ + -e TEST_CONFIG="${TEST_CONFIG}" \ --tty \ --detach \ --shm-size=32gb \ -v "${PWD}/benchmark:/benchmark" \ - --gpus all \ -w / \ "${{ env.DOCKER_IMAGE }}" \ tail -f /dev/null ) echo "Container name: ${container_name}" docker exec -t -w "/" "${container_name}" bash -c "sudo chown -R runner /benchmark; sudo chgrp -R runner /benchmark" - docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_install.sh - docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_test.sh + docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/.ci/torchbench/install.sh + docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/.ci/torchbench/test.sh - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main if: always() - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/pr-gha-runner.yml b/.github/workflows/_linux-test-cuda.yml similarity index 52% rename from .github/workflows/pr-gha-runner.yml rename to .github/workflows/_linux-test-cuda.yml index 0c1a3d3a6e..5f59a48083 100644 --- a/.github/workflows/pr-gha-runner.yml +++ b/.github/workflows/_linux-test-cuda.yml @@ -1,21 +1,25 @@ -name: TorchBench PR Test +name: linux-test-cuda on: - pull_request: - workflow_dispatch: - push: - branches: - - main + workflow_call: + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from 
hub jobs: - pr-test: + linux-test-cuda: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: [a100-runner] + timeout-minutes: 240 + environment: docker-s3-upload env: BASE_CONDA_ENV: "torchbench" - CONDA_ENV: "pr-ci-a100" + CONDA_ENV: "pr-test-cuda" SETUP_SCRIPT: "/workspace/setup_instance.sh" + TEST_CONFIG: "cuda" HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - runs-on: [a100-runner] - timeout-minutes: 1440 # 24 hours - environment: docker-s3-upload steps: - name: Checkout TorchBench uses: actions/checkout@v3 @@ -25,23 +29,15 @@ jobs: sudo nvidia-smi -ac 1215,1410 sudo ldconfig nvidia-smi - - name: Clone and setup Conda env - run: | - CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}" - conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}" - name: Install TorchBench run: | - bash ./scripts/torchbench_install.sh - - name: Validate benchmark components + bash ./.ci/torchbench/install.sh + - name: Test TorchBench run: | - bash ./scripts/torchbench_test.sh + bash ./.ci/torchbench/test.sh - name: Clean up Conda env if: always() run: | . 
"${SETUP_SCRIPT}" conda deactivate && conda deactivate conda remove -n "${CONDA_ENV}" --all - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/pr-gpu-stability-ci.yml b/.github/workflows/pr-gpu-stability-ci.yml deleted file mode 100644 index c0ec193e29..0000000000 --- a/.github/workflows/pr-gpu-stability-ci.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: TorchBench GPU model stability test -on: - workflow_dispatch: - inputs: - model: - description: "Model Name" - required: true - default: "fastNLP_Bert" - pull_request: - -jobs: - stability_test: - env: - CONDA_ENV: "stability-test-ci" - TEST_HOME: "/tmp/tb-stability-ci" - PYTHON_VERSION: "3.8" - CUDA_VERSION: "cu116" - PR_BODY: ${{ github.event.pull_request.body }} - MODEL: ${{ github.event.inputs.model }} - GPU_ID: "1" - GPU_FREQ: "5001,900" - REPEAT: "10" - if: ${{ (github.event.inputs.model || contains(github.event.pull_request.body, 'STABLE_TEST_MODEL:')) }} - runs-on: [self-hosted, bm-runner] - timeout-minutes: 120 # 2 hours - environment: docker-s3-upload - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Create conda environment with pytorch nightly - run: | - conda create -y -n "${CONDA_ENV}" python="${PYTHON_VERSION}" - . activate "${CONDA_ENV}" - conda install -y numpy requests=2.22 ninja pyyaml mkl mkl-include setuptools \ - cmake cffi typing_extensions future six dataclasses tabulate gitpython - # Install pytorch nightly - pip install --pre torch torchvision torchaudio \ - -f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html - # Install torchbench dependencies - python install.py - - name: Stability test - run: | - . 
activate "${CONDA_ENV}" - mkdir -p "${TEST_HOME}" - if [ -z "${MODEL}" ] ; then - # Load PR to file - PR_BODY_FILE="${TEST_HOME}"/pr-body.txt - echo "${PR_BODY}" > "${PR_BODY_FILE}" - MODEL=`python ./.github/scripts/test-repeated-runs.py --pr-body "${PR_BODY_FILE}"` - fi - # Setup nvidia gpu frequency - sudo nvidia-persistenced --user "${USER}" || true - sudo nvidia-smi -pm "${GPU_ID}" - sudo nvidia-smi -ac "${GPU_FREQ}" - # Run the tests - EVAL_LOG="${TEST_HOME}/eval-${MODEL}.log" - echo -n > "${EVAL_LOG}" - for i in `seq 1 ${REPEAT}`; do - python run.py "${MODEL}" -t eval -d cuda | tee -a "${EVAL_LOG}" - done - TRAIN_LOG="${TEST_HOME}/train-${MODEL}.log" - echo -n > "${TRAIN_LOG}" - for i in `seq 1 ${REPEAT}`; do - python run.py "${MODEL}" -t train -d cuda | tee -a "${TRAIN_LOG}" - done - # Check the stability of GPU tests - python ./.github/scripts/test-repeated-runs.py --log "${EVAL_LOG}" && \ - echo "GPU stability test pass for inference!" - python ./.github/scripts/test-repeated-runs.py --log "${TRAIN_LOG}" && \ - echo "GPU stability test pass for train!" 
- - name: Remove conda environment - run: | - conda env remove --name "${CONDA_ENV}" - diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml new file mode 100644 index 0000000000..00417d454e --- /dev/null +++ b/.github/workflows/pr-test.yml @@ -0,0 +1,21 @@ +name: TorchBench PR Test +on: + pull_request: + workflow_dispatch: + push: + branches: + - main + +jobs: + cpu-test: + uses: ./.github/workflows/_linux-test-cpu.yml + secrets: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + cuda-test: + uses: ./.github/workflows/_linux-test-cuda.yml + secrets: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true diff --git a/scripts/torchbench_install.sh b/scripts/torchbench_install.sh deleted file mode 100644 index 5c1d5217a9..0000000000 --- a/scripts/torchbench_install.sh +++ /dev/null @@ -1,20 +0,0 @@ -. ${HOME}/miniconda3/etc/profile.d/conda.sh - -if [ -z "${CONDA_ENV}" ]; then - echo "ERROR: CONDA_ENV is not set" - exit 1 -fi - -if [ -z "${SETUP_SCRIPT}" ]; then - echo "ERROR: SETUP_SCRIPT is not set" - exit 1 -fi - -. "${SETUP_SCRIPT}" -conda activate "${CONDA_ENV}" - -parent_dir=$(dirname "$(readlink -f "$0")")/.. -cd ${parent_dir} - -python -c "import torch; print(torch.__version__); print(torch.version.git_version)" -python install.py diff --git a/scripts/torchbench_test.sh b/scripts/torchbench_test.sh deleted file mode 100644 index 7411b23d96..0000000000 --- a/scripts/torchbench_test.sh +++ /dev/null @@ -1,24 +0,0 @@ -. ${HOME}/miniconda3/etc/profile.d/conda.sh - -if [ -z "${CONDA_ENV}" ]; then - echo "ERROR: CONDA_ENV is not set" - exit 1 -fi - -if [ -z "${SETUP_SCRIPT}" ]; then - echo "ERROR: SETUP_SCRIPT is not set" - exit 1 -fi - -. "${SETUP_SCRIPT}" -conda activate "${CONDA_ENV}" - -parent_dir=$(dirname "$(readlink -f "$0")")/.. 
-cd ${parent_dir} - -# Test subprocess worker -python -m torchbenchmark._components.test.test_subprocess -python -m torchbenchmark._components.test.test_worker - -# Test models -python test.py -v