From 18eafb4e959619e73b84857b690bb2d699a4e2e9 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 8 Feb 2024 15:38:08 -0800 Subject: [PATCH] Bisect new nightly error and fix the broken T4 workflow. (#2153) Summary: On 2024-02-06, the nightly workflow failed with error: https://github.com/pytorch/benchmark/actions/runs/7802969964 Improve the A100 bisection workflow to auto-bisect this error. Pull Request resolved: https://github.com/pytorch/benchmark/pull/2153 Test Plan: 1. Bisection workflow: https://github.com/pytorch/benchmark/actions/runs/7805867410 2. T4 userbenchmark workflow: https://github.com/pytorch/benchmark/actions/runs/7805761391 Reviewed By: aaronenyeshi Differential Revision: D53517311 Pulled By: xuzhao9 fbshipit-source-id: 0e556927f0344e56f0039effb33db8839bc153e1 --- .github/workflows/gcp-a100-bisection.yml | 109 ----------------------- userbenchmark/test_bench/run.py | 13 ++- userbenchmark/utils.py | 1 + 3 files changed, 13 insertions(+), 110 deletions(-) delete mode 100644 .github/workflows/gcp-a100-bisection.yml diff --git a/.github/workflows/gcp-a100-bisection.yml b/.github/workflows/gcp-a100-bisection.yml deleted file mode 100644 index 7176cf4916..0000000000 --- a/.github/workflows/gcp-a100-bisection.yml +++ /dev/null @@ -1,109 +0,0 @@ -name: TorchBench Userbenchmark bisection on GCP A100 -on: - workflow_dispatch: - inputs: - regression_date: - description: "Date of the regression" - required: true - default: "2023-05-19" - userbenchmark_name: - description: "Name of the userbenchmark to bisect" - required: true - default: "torch-nightly" - -jobs: - bisection: - env: - BASE_CONDA_ENV: "torchbench" - CONDA_ENV: "userbenchmark-bisection-ci" - PLATFORM_NAME: "gcp_a100" - SETUP_SCRIPT: "/workspace/setup_instance.sh" - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: [a100-runner] - environment: docker-s3-upload - timeout-minutes: 2880 # 48 hours - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - path: benchmark - - name: Checkout pytorch - uses: actions/checkout@v3 - with: - repository: pytorch/pytorch - path: srcs/pytorch - fetch-depth: 0 - - name: Checkout torchvision - uses: actions/checkout@v3 - with: - repository: pytorch/vision - path: srcs/vision - fetch-depth: 0 - - name: Checkout torchdata - uses: actions/checkout@v3 - with: - repository: pytorch/data - path: srcs/data - fetch-depth: 0 - - name: Checkout torchaudio - uses: actions/checkout@v3 - with: - repository: pytorch/audio - path: srcs/audio - fetch-depth: 0 - - name: Tune Nvidia GPU - run: | - sudo nvidia-smi -pm 1 - sudo nvidia-smi -ac 1215,1410 - nvidia-smi - - name: Setup conda env - run: | - CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}" - cd benchmark - python ./utils/python_utils.py --create-conda-env "${CONDA_ENV}" - - name: Setup bisection environment - run: | - . "${SETUP_SCRIPT}"; cd benchmark - USERBENCHMARK_NAME="${{ github.event.inputs.userbenchmark_name }}" - BISECT_WORKDIR=".userbenchmark/${USERBENCHMARK_NAME}/bisection" - python utils/cuda_utils.py --install-torch-build-deps - python utils/cuda_utils.py --install-torchbench-deps - cc_path=$(conda run -n "${CONDA_ENV}" printenv CC) - cxx_path=$(conda run -n "${CONDA_ENV}" printenv CXX) - ln -s "${cc_path}" "$(dirname "$cc_path")/cc" - ln -s "${cc_path}" "$(dirname "$cc_path")/gcc" - ln -s "${cxx_path}" "$(dirname "$cxx_path")/c++" - ln -s "${cxx_path}" "$(dirname "$cxx_path")/g++" - # setup shared library paths - sudo ln -sf "${CONDA_PREFIX}/x86_64-conda-linux-gnu/sysroot/lib64/libpthread.so.0" /lib64/ - sudo ln -sf "${CONDA_PREFIX}/x86_64-conda-linux-gnu/sysroot/usr/lib64/libpthread_nonshared.a" /usr/lib64/ - sudo ln -sf "${CONDA_PREFIX}/x86_64-conda-linux-gnu/sysroot/lib64/libc.so.6" /lib64/ - sudo ln -sf "${CONDA_PREFIX}/x86_64-conda-linux-gnu/sysroot/usr/lib64/libc_nonshared.a" /usr/lib64/ - mkdir -p "${BISECT_WORKDIR}" - REGRESSION_DATE="${{ github.event.inputs.regression_date }}" - python regression_detector.py --name "${USERBENCHMARK_NAME}" --platform "${PLATFORM_NAME}" \ - --end-date "${REGRESSION_DATE}" --download-from-s3 --output "${BISECT_WORKDIR}/regression-${REGRESSION_DATE}.yaml" - - name: Bisection - run: | - . "${SETUP_SCRIPT}"; cd benchmark - USERBENCHMARK_NAME="${{ github.event.inputs.userbenchmark_name }}" - BISECT_WORKDIR=".userbenchmark/${USERBENCHMARK_NAME}/bisection" - REGRESSION_DATE="${{ github.event.inputs.regression_date }}" - python bisection.py --work-dir "${BISECT_WORKDIR}" --torch-repos-path "${PWD}/../srcs" \ - --torchbench-repo-path "${PWD}" --config "${BISECT_WORKDIR}/regression-${REGRESSION_DATE}.yaml" \ - --output "${BISECT_WORKDIR}/bisect-output-gh${GITHUB_RUN_ID}.json" - cp -r "${BISECT_WORKDIR}" ../bisection-result - - name: Upload artifact - if: always() - uses: actions/upload-artifact@v3 - with: - name: Bisection result - path: bisection-result/ - - name: Clean up Conda env - if: always() - run: | - . "${SETUP_SCRIPT}" - conda deactivate && conda deactivate - conda remove -n "${CONDA_ENV}" --all diff --git a/userbenchmark/test_bench/run.py b/userbenchmark/test_bench/run.py index 30d391a885..efb9405cfe 100644 --- a/userbenchmark/test_bench/run.py +++ b/userbenchmark/test_bench/run.py @@ -174,6 +174,12 @@ def run_config( except OSError as e: print(" [oserror]", flush=True) return dict.fromkeys(metrics, str(e)) + except KeyboardInterrupt: + print(" [user_interrupt]") + exit(1) + except Exception as e: + print(" [runtime_error]", flush=True) + return dict.fromkeys(metrics, str(e)) def run_config_accuracy( @@ -196,7 +202,12 @@ def run_config_accuracy( except OSError as e: print(" [oserror]", flush=True) return {"accuracy": str(e)} - + except KeyboardInterrupt: + print(" [user_interrupt]") + exit(1) + except Exception as e: + print(" [runtime_error]", flush=True) + return {"accuracy": str(e)} def parse_known_args(args): parser = argparse.ArgumentParser() diff --git a/userbenchmark/utils.py b/userbenchmark/utils.py index b31b0ad706..593e33713c 100644 --- a/userbenchmark/utils.py +++ b/userbenchmark/utils.py @@ -104,6 +104,7 @@ def get_output_dir(bm_name: str) -> Path: def get_default_output_json_path(bm_name: str, target_dir: Path=None) -> str: if target_dir is None: target_dir = get_output_dir(bm_name) + target_dir.mkdir(exist_ok=True, parents=True) fname = "metrics-{}.json".format(datetime.fromtimestamp(time.time()).strftime("%Y%m%d%H%M%S")) full_fname = os.path.join(target_dir, fname) return full_fname