Skip to content

Commit

Permalink
Add CUDA Memory Leak error bisection (#2205)
Browse files Browse the repository at this point in the history
Summary:
Support CUDA Memory Leak in bisection.

Pull Request resolved: #2205

Test Plan:
https://github.com/pytorch/benchmark/actions/runs/8405455144

Start hash: 90fdee15be285a6a54587f44e9f3dcfed8e0efd0
End hash: cfaed59ce73871658958bb3d0c08d820c2595e62
Userbenchmark: test_bench
Userbenchmark args: -m sam_fast -d cuda -t eval --memleak

Reviewed By: aaronenyeshi

Differential Revision: D55317315

Pulled By: xuzhao9

fbshipit-source-id: 610e750fbcfa52fa7a33be7f7ae4f34667c243e0
  • Loading branch information
xuzhao9 authored and facebook-github-bot committed Mar 25, 2024
1 parent 40caaee commit c4098d2
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 7 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/userbenchmark-a100-bisection.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,13 @@ jobs:
sudo nvidia-smi -pm 1
sudo nvidia-smi -ac 1215,1410
nvidia-smi
- name: Install Deps
- name: Install devel packages
run: |
sudo apt-get -y update && sudo apt -y update
sudo apt-get update -y
sudo apt-get update -y
sudo apt-get install -y libjpeg-dev zlib1g-dev libpng-dev
sudo ldconfig
sudo ldconfig
- name: Setup conda env
run: |
CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
Expand Down
6 changes: 4 additions & 2 deletions docker/gcp-a100-runner-dind.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ FROM ${BASE_IMAGE}
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN sudo apt-get -y update && sudo apt -y update
# fontconfig: needed by model doctr_det_predictor
# fontconfig: required by model doctr_det_predictor
# libjpeg and libpng: optionally required by torchvision (vision#8342)
RUN sudo apt-get install -y git jq gcc g++ \
vim wget curl ninja-build cmake \
libgl1-mesa-glx libsndfile1-dev kmod libxml2-dev libxslt1-dev \
fontconfig libfontconfig1-dev \
libpango-1.0-0 libpangoft2-1.0-0 \
libsdl2-dev libsdl2-2.0-0
libsdl2-dev libsdl2-2.0-0 \
libjpeg-dev libpng-dev zlib1g-dev

# get switch-cuda utility
RUN sudo wget -q https://raw.githubusercontent.com/phohenecker/switch-cuda/master/switch-cuda.sh -O /usr/bin/switch-cuda.sh
Expand Down
2 changes: 1 addition & 1 deletion torchbenchmark/util/framework/timm/extended_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def is_extended_timm_models(model_name: str) -> bool:
return model_name in TIMM_MODELS

def list_extended_timm_models() -> List[str]:
return TIMM_MODELS.keys()
return list(TIMM_MODELS.keys())

# TODO - Figure out the reason of cold start memory spike
BATCH_SIZE_DIVISORS = {
Expand Down
62 changes: 61 additions & 1 deletion userbenchmark/test_bench/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,19 @@
get_default_debug_output_dir,
)
from . import BM_NAME
from torchbenchmark import (
ModelTask,
get_metadata_from_yaml,
REPO_PATH,
)

# Some of the models have very heavyweight setup, so we have to set a very
# generous limit. That said, we don't want the entire test suite to hang if
# a single test encounters an extreme failure, so we give up after a test is
# unresponsive for 5 minutes by default. (Note: this does not require that the
# entire test case completes in 5 minutes. It requires that if the worker is
# unresponsive for 5 minutes the parent will presume it dead / incapacitated.)
TIMEOUT = int(os.getenv("TIMEOUT", 300)) # Seconds

with add_path(REPO_PATH):
from torchbenchmark.util.experiment.instantiator import (
Expand Down Expand Up @@ -115,6 +128,8 @@ def init_output_dir(
def get_metrics(config: TorchBenchModelConfig) -> List[str]:
    """Select the metric names to collect for this benchmark config.

    "--accuracy" and "--memleak" in the config's extra args each map to a
    single dedicated metric (accuracy takes precedence); otherwise fall
    back to the standard performance metrics.
    """
    extra = config.extra_args
    for flag, metric in (("--accuracy", "accuracy"), ("--memleak", "memleak")):
        if flag in extra:
            return [metric]
    return ["latencies", "cpu_peak_mem", "gpu_peak_mem"]


Expand Down Expand Up @@ -183,6 +198,47 @@ def run_config(
return dict.fromkeys(metrics, str(e))


def run_config_memleak(config: TorchBenchModelConfig, dryrun: bool=False) -> Dict[str, str]:
    """Run a model config inside a ModelTask worker and watch for CUDA memory leaks.

    Returns a dict with a single "memleak" key whose value is one of the
    strings "True" (leak detected), "False" (no leak), "not_implemented",
    or "skip_by_dryrun".
    """
    def assertEqual(x, y):
        # watch_cuda_memory calls this with before/after memory stats;
        # a mismatch raises AssertionError, which we map to a detected leak.
        assert x == y, f"{x} != {y}"

    if dryrun:
        # Short-circuit BEFORE creating the ModelTask: spawning the worker
        # subprocess is heavyweight and pointless for a dry run.
        print(" [skip_by_dryrun] ", flush=True)
        return {"memleak": "skip_by_dryrun"}
    model_name = config.name
    model_path = os.path.join(REPO_PATH, "torchbenchmark", "models", model_name)
    metadata = get_metadata_from_yaml(model_path)
    task = ModelTask(model_path, timeout=TIMEOUT)
    allow_customize_batch_size = task.get_model_attribute(
        "ALLOW_CUSTOMIZE_BSIZE", classattr=True
    )
    # To speed up the test, use batch size 1 if the model allows it.
    batch_size = 1 if allow_customize_batch_size else None
    try:
        with task.watch_cuda_memory(
            skip=False,
            assert_equal=assertEqual,
        ):
            task.make_model_instance(
                test=config.test,
                device=config.device,
                batch_size=batch_size,
            )
            task.invoke()
            if config.test == "train":
                task.check_details_train(device=config.device, md=metadata)
            else:
                task.check_details_eval(device=config.device, md=metadata)
                task.check_eval_output()
            task.del_model_instance()
        result = {"memleak": "False"}
    except NotImplementedError:
        result = {"memleak": "not_implemented"}
    except AssertionError:
        # The memory check raised: memory after the run differs from before.
        result = {"memleak": "True"}
    # NOTE(fix): the original returned from a `finally` block, which both
    # swallowed unexpected exceptions and raised UnboundLocalError when
    # `result` was never assigned; return normally instead so unexpected
    # errors propagate with their real traceback.
    return result


def run_config_accuracy(
config: TorchBenchModelConfig, metrics: List[str], dryrun: bool = False
) -> Dict[str, str]:
Expand Down Expand Up @@ -273,6 +329,8 @@ def run(args: List[str]):
metrics = get_metrics(config)
if "accuracy" in metrics:
metrics_dict = run_config_accuracy(config, metrics, dryrun=args.dryrun)
elif "memleak" in metrics:
metrics_dict = run_config_memleak(config, dryrun=args.dryrun)
else:
metrics_dict = run_config(config, metrics, dryrun=args.dryrun)
config_str = config_to_str(config)
Expand All @@ -284,5 +342,7 @@ def run(args: List[str]):
if args.device == "cuda":
import torch
result["environ"]["device"] = torch.cuda.get_device_name()
o = json.dumps(result, indent=4)
print(o)
with open(args.output, "w") as f:
json.dump(result, f, indent=4)
f.write(o)
8 changes: 7 additions & 1 deletion utils/build_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,14 @@ def build_pytorch_repo(repo: TorchRepo, build_env: Dict[str, str]):
if os.path.exists(version_py_path):
os.remove(version_py_path)
try:
# Build and test triton
build_triton_command = ["make", "triton"]
subprocess.check_call(build_triton_command, cwd=repo.src_path.absolute(), env=build_env)
command_testbuild = [sys.executable, "-c", "'import triton'"]
subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env)
# Build and test pytorch
subprocess.check_call(repo.build_command, cwd=repo.src_path.absolute(), env=build_env)
command_testbuild = [sys.executable, "-c", "'import torch'"]
command_testbuild = [sys.executable, "-c", "'import torch; import triton'"]
subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env)
except subprocess.CalledProcessError:
_print_info(f"BUILDING {repo.name.upper()} commit {repo.cur_commit} 2ND TRY")
Expand Down

0 comments on commit c4098d2

Please sign in to comment.