From 8e5e3a0ca924587cac5d93b2d38ebad44498ee63 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 23 Nov 2023 16:11:30 -0800 Subject: [PATCH] Add the A100 bisection workflow (#2054) Summary: We are adding the generic A100 bisection workflow for bisecting any userbenchmark. The workflow requires 4 arguments: 1. The start pytorch commit hash on the main branch 2. The treatment pytorch commit hash on the main branch 3. The userbenchmark name 4. The userbenchmark arguments to trigger the regression Pull Request resolved: https://github.com/pytorch/benchmark/pull/2054 Test Plan: Automatic bisection of the `test_bench` userbenchmark on the accuracy issue of llama_v2_7b_16h: Start commit hash (2023-11-15): `a5a404865c01f86881f6b3ab0cd9a562d0b420de` End commit hash (2023-11-16): `690c805c8b539501aad5fbf18914ac92afb65d5a` Userbenchmark name: `test_bench` Userbenchmark arguments: `llama_v2_7b_16h -d cuda -t eval --accuracy` Known root cause commit: `12b2dd16b050e6495910fc564517fbb51dde1f20` Reviewed By: aaronenyeshi Differential Revision: D51551127 Pulled By: xuzhao9 fbshipit-source-id: 45385f95a0f30d9eb93efccb32aaa8cf32297d4c --- .../userbenchmark-a100-bisection.yml | 101 ++++++++++++++++++ regression_detector.py | 4 +- .../test_bench/regression_detector.py | 32 ++++++ userbenchmark/test_bench/run.py | 13 ++- userbenchmark/utils.py | 10 +- 5 files changed, 146 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/userbenchmark-a100-bisection.yml diff --git a/.github/workflows/userbenchmark-a100-bisection.yml b/.github/workflows/userbenchmark-a100-bisection.yml new file mode 100644 index 0000000000..dbcd5435e6 --- /dev/null +++ b/.github/workflows/userbenchmark-a100-bisection.yml @@ -0,0 +1,101 @@ +name: TorchBench A100 bisection +on: + workflow_dispatch: + inputs: + start_commit: + description: "Start PyTorch commit hash" + required: true + end_commit: + description: "End PyTorch commit hash" + required: true + userbenchmark: + description: "Userbenchmark name" 
+ required: true + userbenchmark_args: + description: "Userbenchmark arguments" + required: true + +jobs: + bisection: + environment: docker-s3-upload + env: + BASE_CONDA_ENV: "torchbench" + CONDA_ENV: "bisection-ci-a100" + PLATFORM_NAME: "gcp_a100" + SETUP_SCRIPT: "/workspace/setup_instance.sh" + BISECT_WORKDIR: ".userbenchmark/${{ github.event.inputs.userbenchmark }}/bisection" + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: [self-hosted, a100-runner] + timeout-minutes: 2880 # 48 hours + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + path: benchmark + - name: Checkout pytorch + uses: actions/checkout@v3 + with: + repository: pytorch/pytorch + path: srcs/pytorch + fetch-depth: 0 + - name: Checkout torchvision + uses: actions/checkout@v3 + with: + repository: pytorch/vision + path: srcs/vision + fetch-depth: 0 + - name: Checkout torchaudio + uses: actions/checkout@v3 + with: + repository: pytorch/audio + path: srcs/audio + fetch-depth: 0 + - name: Tune Nvidia GPU + run: | + sudo nvidia-smi -pm 1 + sudo nvidia-smi -ac 1215,1410 + nvidia-smi + - name: Install Deps + run: | + sudo apt-get -y update && sudo apt -y update + - name: Setup conda env + run: | + CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}" + cd benchmark + python ./utils/python_utils.py --create-conda-env "${CONDA_ENV}" + - name: Setup bisection environment + run: | + . 
"${SETUP_SCRIPT}"; cd benchmark + python utils/cuda_utils.py --install-torch-build-deps + python utils/cuda_utils.py --install-torchbench-deps + mkdir -p "${BISECT_WORKDIR}" + python utils/cuda_utils.py --install-torch-nightly + python run_benchmark.py ${{ github.event.inputs.userbenchmark }} ${{ github.event.inputs.userbenchmark_args }} --dryrun \ + --output "${BISECT_WORKDIR}/metrics-control.json" + python run_benchmark.py ${{ github.event.inputs.userbenchmark }} ${{ github.event.inputs.userbenchmark_args }} --dryrun \ + --output "${BISECT_WORKDIR}/metrics-treatment.json" + python regression_detector.py \ + --control "${BISECT_WORKDIR}/metrics-control.json" --treatment "${BISECT_WORKDIR}/metrics-treatment.json" \ + --output "${BISECT_WORKDIR}/regression-gh${GITHUB_RUN_ID}.yaml" + pip uninstall -y torch torchvision torchaudio torch_tensorrt + - name: Bisection + run: | + . "${SETUP_SCRIPT}"; cd benchmark + python bisection.py --work-dir "${BISECT_WORKDIR}" --torch-repos-path "${PWD}/../srcs" \ + --torchbench-repo-path "${PWD}" --config "${BISECT_WORKDIR}/regression-gh${GITHUB_RUN_ID}.yaml" \ + --output "${BISECT_WORKDIR}/bisect-output-gh${GITHUB_RUN_ID}.json" + cp -r "${BISECT_WORKDIR}" ../bisection-result + - name: Upload artifact + if: always() + uses: actions/upload-artifact@v3 + with: + name: Bisection result + path: bisection-result/ + - name: Clean up Conda env + if: always() + run: | + . 
"${SETUP_SCRIPT}" + conda deactivate && conda deactivate + conda remove -n "${CONDA_ENV}" --all diff --git a/regression_detector.py b/regression_detector.py index ca6110b1bc..7a0cbe535a 100644 --- a/regression_detector.py +++ b/regression_detector.py @@ -104,9 +104,11 @@ def process_regressions_into_gh_issue(regression_result: TorchBenchABTestResult, troubled_tests = "" for test, stats in regressions_dict["details"].items(): delta = stats["delta"] - if delta != 0: + if not isinstance(delta, str): sign = "+" if delta > 0 else "" troubled_tests += f"- {test}: {sign}{delta:.5%}\n" + else: + troubled_tests += f"- {test}: {delta}\n" control_only_tests = "" for test, stat in regressions_dict["control_only_metrics"].items(): diff --git a/userbenchmark/test_bench/regression_detector.py b/userbenchmark/test_bench/regression_detector.py index e69de29bb2..0f413b0e98 100644 --- a/userbenchmark/test_bench/regression_detector.py +++ b/userbenchmark/test_bench/regression_detector.py @@ -0,0 +1,32 @@ +from ..utils import TorchBenchABTestResult, TorchBenchABTestMetric +from . 
import BM_NAME + +DEFAULT_REGRESSION_DELTA_THRESHOLD = 0.07 + +def run(control, treatment) -> TorchBenchABTestResult: + control_env = control["environ"] + control_env["git_commit_hash"] = control["environ"]["pytorch_git_version"] + control_metrics = control["metrics"] + treatment_env = treatment["environ"] + treatment_env["git_commit_hash"] = treatment["environ"]["pytorch_git_version"] + treatment_metrics = treatment["metrics"] + details = {} + for metric_names in control_metrics.keys(): + control_metric = control_metrics[metric_names] + treatment_metric = treatment_metrics[metric_names] + if (isinstance(control_metric, str) or isinstance(treatment_metric, str)): + if control_metric == "skip_by_dryrun" or not control_metric == treatment_metric: + delta = f"{control_metric} -> {treatment_metric}" + details[metric_names] = TorchBenchABTestMetric(control=control_metric, treatment=treatment_metric, delta=delta) + else: + delta = (treatment_metric - control_metric) / control_metric + if abs(delta) > DEFAULT_REGRESSION_DELTA_THRESHOLD: + details[metric_names] = TorchBenchABTestMetric(control=control_metric, treatment=treatment_metric, delta=delta) + return TorchBenchABTestResult(name=BM_NAME, + control_env=control_env, \ + treatment_env=treatment_env, \ + details=details, \ + control_only_metrics={}, \ + treatment_only_metrics={}, \ + bisection="pytorch") + diff --git a/userbenchmark/test_bench/run.py b/userbenchmark/test_bench/run.py index 02d8229148..cd81195075 100644 --- a/userbenchmark/test_bench/run.py +++ b/userbenchmark/test_bench/run.py @@ -155,10 +155,9 @@ def run(args: List[str]): results[f"{config_str}, metric={metric}"] = metrics_dict[metric] except KeyboardInterrupt: print("User keyboard interrupted!") - if not args.dryrun: - result = get_output_json(BM_NAME, results) - if args.device == 'cuda': - import torch - result["environ"]["device"] = torch.cuda.get_device_name() - with open(args.output, 'w') as f: - json.dump(result, f, indent=4) + result = 
get_output_json(BM_NAME, results) + if args.device == 'cuda': + import torch + result["environ"]["device"] = torch.cuda.get_device_name() + with open(args.output, 'w') as f: + json.dump(result, f, indent=4) diff --git a/userbenchmark/utils.py b/userbenchmark/utils.py index 25b3cf3664..c0e41167ba 100644 --- a/userbenchmark/utils.py +++ b/userbenchmark/utils.py @@ -4,10 +4,9 @@ from datetime import datetime, timedelta import time import json -import yaml from pathlib import Path from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Callable +from typing import Any, Dict, List, Optional, Callable, Union REPO_PATH = Path(os.path.abspath(__file__)).parent.parent USERBENCHMARK_OUTPUT_PREFIX = ".userbenchmark" @@ -38,10 +37,9 @@ def __exit__(self, exc_type, exc_value, traceback): @dataclass class TorchBenchABTestMetric: - control: float - treatment: float - delta: float - + control: Union[float, str] + treatment: Union[float, str] + delta: Union[float, str] @dataclass class TorchBenchABTestResult: