From 18eafb4e959619e73b84857b690bb2d699a4e2e9 Mon Sep 17 00:00:00 2001
From: Xu Zhao <xzhao9@meta.com>
Date: Thu, 8 Feb 2024 15:38:08 -0800
Subject: [PATCH] Bisect new nightly error and fix the broken T4 workflow.
 (#2153)

Summary:
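On 2024-02-06, the nightly workflow failed with an error (see https://github.com/pytorch/benchmark/actions/runs/7802969964).
Improve the A100 bisection workflow so that it can auto-bisect this error.
In addition, the test_bench userbenchmark now reports unexpected exceptions as `[runtime_error]` results instead of letting them propagate, exits on KeyboardInterrupt, and `get_default_output_json_path` creates the output directory before building the metrics file path.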

Pull Request resolved: https://github.com/pytorch/benchmark/pull/2153

Test Plan:
1. Bisection workflow: https://github.com/pytorch/benchmark/actions/runs/7805867410
2. T4 userbenchmark workflow: https://github.com/pytorch/benchmark/actions/runs/7805761391

Reviewed By: aaronenyeshi

Differential Revision: D53517311

Pulled By: xuzhao9

fbshipit-source-id: 0e556927f0344e56f0039effb33db8839bc153e1
---
 .github/workflows/gcp-a100-bisection.yml | 109 -----------------------
 userbenchmark/test_bench/run.py          |  13 ++-
 userbenchmark/utils.py                   |   1 +
 3 files changed, 13 insertions(+), 110 deletions(-)
 delete mode 100644 .github/workflows/gcp-a100-bisection.yml

diff --git a/.github/workflows/gcp-a100-bisection.yml b/.github/workflows/gcp-a100-bisection.yml
deleted file mode 100644
index 7176cf4916..0000000000
--- a/.github/workflows/gcp-a100-bisection.yml
+++ /dev/null
@@ -1,109 +0,0 @@
-name: TorchBench Userbenchmark bisection on GCP A100
-on:
-  workflow_dispatch:
-    inputs:
-      regression_date:
-        description: "Date of the regression"
-        required: true
-        default: "2023-05-19"
-      userbenchmark_name:
-        description: "Name of the userbenchmark to bisect"
-        required: true
-        default: "torch-nightly"
-
-jobs:
-  bisection:
-    env:
-      BASE_CONDA_ENV: "torchbench"
-      CONDA_ENV: "userbenchmark-bisection-ci"
-      PLATFORM_NAME: "gcp_a100"
-      SETUP_SCRIPT: "/workspace/setup_instance.sh"
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: [a100-runner]
-    environment: docker-s3-upload
-    timeout-minutes: 2880 # 48 hours
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          path: benchmark
-      - name: Checkout pytorch
-        uses: actions/checkout@v3
-        with:
-          repository: pytorch/pytorch
-          path: srcs/pytorch
-          fetch-depth: 0
-      - name: Checkout torchvision
-        uses: actions/checkout@v3
-        with:
-          repository: pytorch/vision
-          path: srcs/vision
-          fetch-depth: 0
-      - name: Checkout torchdata
-        uses: actions/checkout@v3
-        with:
-          repository: pytorch/data
-          path: srcs/data
-          fetch-depth: 0
-      - name: Checkout torchaudio
-        uses: actions/checkout@v3
-        with:
-          repository: pytorch/audio
-          path: srcs/audio
-          fetch-depth: 0
-      - name: Tune Nvidia GPU
-        run: |
-          sudo nvidia-smi -pm 1
-          sudo nvidia-smi -ac 1215,1410
-          nvidia-smi
-      - name: Setup conda env
-        run: |
-          CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
-          cd benchmark
-          python ./utils/python_utils.py --create-conda-env "${CONDA_ENV}"
-      - name: Setup bisection environment
-        run: |
-          . "${SETUP_SCRIPT}"; cd benchmark
-          USERBENCHMARK_NAME="${{ github.event.inputs.userbenchmark_name }}"
-          BISECT_WORKDIR=".userbenchmark/${USERBENCHMARK_NAME}/bisection"
-          python utils/cuda_utils.py --install-torch-build-deps
-          python utils/cuda_utils.py --install-torchbench-deps
-          cc_path=$(conda run -n "${CONDA_ENV}" printenv CC)
-          cxx_path=$(conda run -n "${CONDA_ENV}" printenv CXX)
-          ln -s "${cc_path}" "$(dirname "$cc_path")/cc"
-          ln -s "${cc_path}" "$(dirname "$cc_path")/gcc"
-          ln -s "${cxx_path}" "$(dirname "$cxx_path")/c++"
-          ln -s "${cxx_path}" "$(dirname "$cxx_path")/g++"
-          # setup shared library paths
-          sudo ln -sf "${CONDA_PREFIX}/x86_64-conda-linux-gnu/sysroot/lib64/libpthread.so.0" /lib64/
-          sudo ln -sf "${CONDA_PREFIX}/x86_64-conda-linux-gnu/sysroot/usr/lib64/libpthread_nonshared.a" /usr/lib64/
-          sudo ln -sf "${CONDA_PREFIX}/x86_64-conda-linux-gnu/sysroot/lib64/libc.so.6" /lib64/
-          sudo ln -sf "${CONDA_PREFIX}/x86_64-conda-linux-gnu/sysroot/usr/lib64/libc_nonshared.a" /usr/lib64/
-          mkdir -p "${BISECT_WORKDIR}"
-          REGRESSION_DATE="${{ github.event.inputs.regression_date }}"
-          python regression_detector.py --name "${USERBENCHMARK_NAME}" --platform "${PLATFORM_NAME}" \
-                                        --end-date "${REGRESSION_DATE}" --download-from-s3 --output "${BISECT_WORKDIR}/regression-${REGRESSION_DATE}.yaml"
-      - name: Bisection
-        run: |
-          . "${SETUP_SCRIPT}"; cd benchmark
-          USERBENCHMARK_NAME="${{ github.event.inputs.userbenchmark_name }}"
-          BISECT_WORKDIR=".userbenchmark/${USERBENCHMARK_NAME}/bisection"
-          REGRESSION_DATE="${{ github.event.inputs.regression_date }}"
-          python bisection.py --work-dir "${BISECT_WORKDIR}" --torch-repos-path "${PWD}/../srcs" \
-                --torchbench-repo-path "${PWD}" --config "${BISECT_WORKDIR}/regression-${REGRESSION_DATE}.yaml" \
-                --output "${BISECT_WORKDIR}/bisect-output-gh${GITHUB_RUN_ID}.json"
-          cp -r "${BISECT_WORKDIR}" ../bisection-result
-      - name: Upload artifact
-        if: always()
-        uses: actions/upload-artifact@v3
-        with:
-          name: Bisection result
-          path: bisection-result/
-      - name: Clean up Conda env
-        if: always()
-        run: |
-          . "${SETUP_SCRIPT}"
-          conda deactivate && conda deactivate
-          conda remove -n "${CONDA_ENV}" --all
diff --git a/userbenchmark/test_bench/run.py b/userbenchmark/test_bench/run.py
index 30d391a885..efb9405cfe 100644
--- a/userbenchmark/test_bench/run.py
+++ b/userbenchmark/test_bench/run.py
@@ -174,6 +174,12 @@ def run_config(
     except OSError as e:
         print(" [oserror]", flush=True)
         return dict.fromkeys(metrics, str(e))
+    except KeyboardInterrupt:
+        print(" [user_interrupt]")
+        exit(1)
+    except Exception as e:
+        print(" [runtime_error]", flush=True)
+        return dict.fromkeys(metrics, str(e))
 
 
 def run_config_accuracy(
@@ -196,7 +202,12 @@ def run_config_accuracy(
     except OSError as e:
         print(" [oserror]", flush=True)
         return {"accuracy": str(e)}
-
+    except KeyboardInterrupt:
+        print(" [user_interrupt]")
+        exit(1)
+    except Exception as e:
+        print(" [runtime_error]", flush=True)
+        return {"accuracy": str(e)}
 
 def parse_known_args(args):
     parser = argparse.ArgumentParser()
diff --git a/userbenchmark/utils.py b/userbenchmark/utils.py
index b31b0ad706..593e33713c 100644
--- a/userbenchmark/utils.py
+++ b/userbenchmark/utils.py
@@ -104,6 +104,7 @@ def get_output_dir(bm_name: str) -> Path:
 def get_default_output_json_path(bm_name: str, target_dir: Path=None) -> str:
     if target_dir is None:
         target_dir = get_output_dir(bm_name)
+    target_dir.mkdir(exist_ok=True, parents=True)
     fname = "metrics-{}.json".format(datetime.fromtimestamp(time.time()).strftime("%Y%m%d%H%M%S"))
     full_fname = os.path.join(target_dir, fname)
     return full_fname