From c4098d288880871067a7a7ad2bbc1bd4cb1f52cc Mon Sep 17 00:00:00 2001
From: Xu Zhao
Date: Mon, 25 Mar 2024 11:32:54 -0700
Subject: [PATCH] Add CUDA Memory Leak error bisection (#2205)

Summary: Add support for bisecting CUDA memory leak errors.

Pull Request resolved: https://github.com/pytorch/benchmark/pull/2205

Test Plan: https://github.com/pytorch/benchmark/actions/runs/8405455144

Start hash: 90fdee15be285a6a54587f44e9f3dcfed8e0efd0
End hash: cfaed59ce73871658958bb3d0c08d820c2595e62

Userbenchmark: test_bench
Userbenchmark args: -m sam_fast -d cuda -t eval --memleak

Reviewed By: aaronenyeshi

Differential Revision: D55317315

Pulled By: xuzhao9

fbshipit-source-id: 610e750fbcfa52fa7a33be7f7ae4f34667c243e0
---
 .../userbenchmark-a100-bisection.yml            |  8 ++-
 docker/gcp-a100-runner-dind.dockerfile          |  6 +-
 .../util/framework/timm/extended_configs.py     |  2 +-
 userbenchmark/test_bench/run.py                 | 62 ++++++++++++++++++-
 utils/build_utils.py                            |  8 ++-
 5 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/userbenchmark-a100-bisection.yml b/.github/workflows/userbenchmark-a100-bisection.yml
index 876d3c5df2..8f25e3919f 100644
--- a/.github/workflows/userbenchmark-a100-bisection.yml
+++ b/.github/workflows/userbenchmark-a100-bisection.yml
@@ -58,9 +58,13 @@ jobs:
           sudo nvidia-smi -pm 1
           sudo nvidia-smi -ac 1215,1410
           nvidia-smi
-      - name: Install Deps
+      - name: Install devel packages
         run: |
-          sudo apt-get -y update && sudo apt -y update
+          sudo apt-get update -y
+          sudo apt-get update -y
+          sudo apt-get install -y libjpeg-dev zlib1g-dev libpng-dev
+          sudo ldconfig
+          sudo ldconfig
       - name: Setup conda env
         run: |
           CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
diff --git a/docker/gcp-a100-runner-dind.dockerfile b/docker/gcp-a100-runner-dind.dockerfile
index bcb8c95584..9a35b4fed9 100644
--- a/docker/gcp-a100-runner-dind.dockerfile
+++ b/docker/gcp-a100-runner-dind.dockerfile
@@ -6,13 +6,15 @@ FROM ${BASE_IMAGE}
 ENV LANG=C.UTF-8 LC_ALL=C.UTF-8

 RUN sudo apt-get -y update && sudo apt -y update
-# fontconfig: needed by model doctr_det_predictor
+# fontconfig: required by model doctr_det_predictor
+# libjpeg and libpng: optionally required by torchvision (vision#8342)
 RUN sudo apt-get install -y git jq gcc g++ \
     vim wget curl ninja-build cmake \
     libgl1-mesa-glx libsndfile1-dev kmod libxml2-dev libxslt1-dev \
     fontconfig libfontconfig1-dev \
     libpango-1.0-0 libpangoft2-1.0-0 \
-    libsdl2-dev libsdl2-2.0-0
+    libsdl2-dev libsdl2-2.0-0 \
+    libjpeg-dev libpng-dev zlib1g-dev

 # get switch-cuda utility
 RUN sudo wget -q https://raw.githubusercontent.com/phohenecker/switch-cuda/master/switch-cuda.sh -O /usr/bin/switch-cuda.sh
diff --git a/torchbenchmark/util/framework/timm/extended_configs.py b/torchbenchmark/util/framework/timm/extended_configs.py
index dbf67fce0f..6315404b83 100644
--- a/torchbenchmark/util/framework/timm/extended_configs.py
+++ b/torchbenchmark/util/framework/timm/extended_configs.py
@@ -19,7 +19,7 @@ def is_extended_timm_models(model_name: str) -> bool:
     return model_name in TIMM_MODELS

 def list_extended_timm_models() -> List[str]:
-    return TIMM_MODELS.keys()
+    return list(TIMM_MODELS.keys())

 # TODO - Figure out the reason of cold start memory spike
 BATCH_SIZE_DIVISORS = {
diff --git a/userbenchmark/test_bench/run.py b/userbenchmark/test_bench/run.py
index 74ae69353d..ef77063fad 100644
--- a/userbenchmark/test_bench/run.py
+++ b/userbenchmark/test_bench/run.py
@@ -21,6 +21,19 @@
     get_default_debug_output_dir,
 )
 from . import BM_NAME
+from torchbenchmark import (
+    ModelTask,
+    get_metadata_from_yaml,
+    REPO_PATH,
+)
+
+# Some of the models have very heavyweight setup, so we have to set a very
+# generous limit. That said, we don't want the entire test suite to hang if
+# a single test encounters an extreme failure, so we give up after a test is
+# unresponsive for 5 minutes by default. (Note: this does not require that the
+# entire test case completes in 5 minutes. It requires that if the worker is
+# unresponsive for 5 minutes the parent will presume it dead / incapacitated.)
+TIMEOUT = int(os.getenv("TIMEOUT", 300))  # Seconds

 with add_path(REPO_PATH):
     from torchbenchmark.util.experiment.instantiator import (
@@ -115,6 +128,8 @@ def init_output_dir(
 def get_metrics(config: TorchBenchModelConfig) -> List[str]:
     if "--accuracy" in config.extra_args:
         return ["accuracy"]
+    if "--memleak" in config.extra_args:
+        return ["memleak"]
     return ["latencies", "cpu_peak_mem", "gpu_peak_mem"]


@@ -183,6 +198,47 @@ def run_config(
         return dict.fromkeys(metrics, str(e))


+def run_config_memleak(config: TorchBenchModelConfig, dryrun: bool=False) -> Dict[str, str]:
+    def assertEqual(x, y):
+        assert x == y, f"{x} != {y}"
+    model_name = config.name
+    model_path = os.path.join(REPO_PATH, "torchbenchmark", "models", model_name)
+    metadata = get_metadata_from_yaml(model_path)
+    task = ModelTask(model_path, timeout=TIMEOUT)
+    allow_customize_batch_size = task.get_model_attribute(
+        "ALLOW_CUSTOMIZE_BSIZE", classattr=True
+    )
+    # to speed up the test, use batch size 1 if possible
+    batch_size = 1 if allow_customize_batch_size else None
+    if dryrun:
+        print(" [skip_by_dryrun] ", flush=True)
+        return {"memleak": "skip_by_dryrun"}
+    try:
+        with task.watch_cuda_memory(
+            skip=False,
+            assert_equal=assertEqual,
+        ):
+            task.make_model_instance(
+                test=config.test,
+                device=config.device,
+                batch_size=batch_size,
+            )
+            task.invoke()
+            if config.test == "train":
+                task.check_details_train(device=config.device, md=metadata)
+            else:
+                task.check_details_eval(device=config.device, md=metadata)
+                task.check_eval_output()
+            task.del_model_instance()
+        result = {"memleak": "False"}
+    except NotImplementedError as e:
+        result = {"memleak": "not_implemented"}
+    except AssertionError:
+        result = {"memleak": "True"}
+    finally:
+        return result
+
+
 def run_config_accuracy(
     config: TorchBenchModelConfig, metrics: List[str], dryrun: bool = False
 ) -> Dict[str, str]:
@@ -273,6 +329,8 @@ def run(args: List[str]):
         metrics = get_metrics(config)
         if "accuracy" in metrics:
             metrics_dict = run_config_accuracy(config, metrics, dryrun=args.dryrun)
+        elif "memleak" in metrics:
+            metrics_dict = run_config_memleak(config, dryrun=args.dryrun)
         else:
             metrics_dict = run_config(config, metrics, dryrun=args.dryrun)
         config_str = config_to_str(config)
@@ -284,5 +342,7 @@
     if args.device == "cuda":
         import torch
         result["environ"]["device"] = torch.cuda.get_device_name()
+    o = json.dumps(result, indent=4)
+    print(o)
     with open(args.output, "w") as f:
-        json.dump(result, f, indent=4)
+        f.write(o)
diff --git a/utils/build_utils.py b/utils/build_utils.py
index d561295302..a936a91a57 100644
--- a/utils/build_utils.py
+++ b/utils/build_utils.py
@@ -48,8 +48,14 @@ def build_pytorch_repo(repo: TorchRepo, build_env: Dict[str, str]):
     if os.path.exists(version_py_path):
         os.remove(version_py_path)
     try:
+        # Build and test triton
+        build_triton_command = ["make", "triton"]
+        subprocess.check_call(build_triton_command, cwd=repo.src_path.absolute(), env=build_env)
+        command_testbuild = [sys.executable, "-c", "'import triton'"]
+        subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env)
+        # Build and test pytorch
         subprocess.check_call(repo.build_command, cwd=repo.src_path.absolute(), env=build_env)
-        command_testbuild = [sys.executable, "-c", "'import torch'"]
+        command_testbuild = [sys.executable, "-c", "'import torch; import triton'"]
         subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env)
     except subprocess.CalledProcessError:
         _print_info(f"BUILDING {repo.name.upper()} commit {repo.cur_commit} 2ND TRY")
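
The new "memleak" metric in run_config_memleak() above reports "memleak": "True" when
ModelTask.watch_cuda_memory(..., assert_equal=assertEqual) raises an AssertionError,
i.e. when the watched CUDA memory counters differ after the model instance has been
created, invoked, and deleted. A minimal standalone sketch of the same idea, using only
public torch APIs (check_cuda_memleak and run_fn are illustrative names, not part of
this patch):

import gc

import torch


def check_cuda_memleak(run_fn) -> bool:
    """Return True if CUDA memory appears to leak across run_fn()."""
    assert torch.cuda.is_available(), "this check needs a CUDA device"
    # Snapshot the allocator state before the run.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    before = torch.cuda.memory_allocated()
    run_fn()  # e.g. instantiate the model, run one eval pass, delete the instance
    # Snapshot again after the run and a GC pass.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    after = torch.cuda.memory_allocated()
    # Any allocation that survives the run is treated as a leak.
    return after != before

The patch itself routes the real check through ModelTask, so each model runs in a
separate worker guarded by the TIMEOUT above and an unresponsive model is abandoned
instead of hanging the whole bisection job.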