Skip to content

Commit

Permalink
Add CUDA Memory Leak error bisection (#2205)
Browse files Browse the repository at this point in the history
Summary:
Support CUDA Memory Leak in bisection.

Pull Request resolved: #2205

Test Plan:
https://github.com/pytorch/benchmark/actions/runs/8405455144

Start hash: 90fdee15be285a6a54587f44e9f3dcfed8e0efd0
End hash: cfaed59ce73871658958bb3d0c08d820c2595e62
Userbenchmark: test_bench
Userbenchmark args: -m sam_fast -d cuda -t eval --memleak

Reviewed By: aaronenyeshi

Differential Revision: D55317315

Pulled By: xuzhao9

fbshipit-source-id: 610e750fbcfa52fa7a33be7f7ae4f34667c243e0
  • Loading branch information
xuzhao9 authored and facebook-github-bot committed Mar 25, 2024
1 parent 40caaee commit c4098d2
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 7 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/userbenchmark-a100-bisection.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,13 @@ jobs:
sudo nvidia-smi -pm 1
sudo nvidia-smi -ac 1215,1410
nvidia-smi
- name: Install Deps
- name: Install devel packages
run: |
sudo apt-get -y update && sudo apt -y update
sudo apt-get update -y
sudo apt-get update -y
sudo apt-get install -y libjpeg-dev zlib1g-dev libpng-dev
sudo ldconfig
sudo ldconfig
- name: Setup conda env
run: |
CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
Expand Down
6 changes: 4 additions & 2 deletions docker/gcp-a100-runner-dind.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ FROM ${BASE_IMAGE}
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN sudo apt-get -y update && sudo apt -y update
# fontconfig: needed by model doctr_det_predictor
# fontconfig: required by model doctr_det_predictor
# libjpeg and libpng: optionally required by torchvision (vision#8342)
RUN sudo apt-get install -y git jq gcc g++ \
vim wget curl ninja-build cmake \
libgl1-mesa-glx libsndfile1-dev kmod libxml2-dev libxslt1-dev \
fontconfig libfontconfig1-dev \
libpango-1.0-0 libpangoft2-1.0-0 \
libsdl2-dev libsdl2-2.0-0
libsdl2-dev libsdl2-2.0-0 \
libjpeg-dev libpng-dev zlib1g-dev

# get switch-cuda utility
RUN sudo wget -q https://raw.githubusercontent.com/phohenecker/switch-cuda/master/switch-cuda.sh -O /usr/bin/switch-cuda.sh
Expand Down
2 changes: 1 addition & 1 deletion torchbenchmark/util/framework/timm/extended_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def is_extended_timm_models(model_name: str) -> bool:
return model_name in TIMM_MODELS

def list_extended_timm_models() -> List[str]:
return TIMM_MODELS.keys()
return list(TIMM_MODELS.keys())

# TODO - Figure out the reason of cold start memory spike
BATCH_SIZE_DIVISORS = {
Expand Down
62 changes: 61 additions & 1 deletion userbenchmark/test_bench/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,19 @@
get_default_debug_output_dir,
)
from . import BM_NAME
from torchbenchmark import (
ModelTask,
get_metadata_from_yaml,
REPO_PATH,
)

# Some of the models have very heavyweight setup, so we have to set a very
# generous limit. That said, we don't want the entire test suite to hang if
# a single test encounters an extreme failure, so we give up after a test is
# unresponsive for 5 minutes by default. (Note: this does not require that the
# entire test case completes in 5 minutes. It requires that if the worker is
# unresponsive for 5 minutes the parent will presume it dead / incapacitated.)
TIMEOUT = int(os.getenv("TIMEOUT", 300)) # Seconds

with add_path(REPO_PATH):
from torchbenchmark.util.experiment.instantiator import (
Expand Down Expand Up @@ -115,6 +128,8 @@ def init_output_dir(
def get_metrics(config: TorchBenchModelConfig) -> List[str]:
    """Select the metric names to collect for this benchmark config.

    "--accuracy" and "--memleak" in the config's extra args each map to a
    single dedicated metric (accuracy takes precedence); otherwise fall
    back to the standard performance metrics.
    """
    extra = config.extra_args
    for flag, metric in (("--accuracy", "accuracy"), ("--memleak", "memleak")):
        if flag in extra:
            return [metric]
    return ["latencies", "cpu_peak_mem", "gpu_peak_mem"]


Expand Down Expand Up @@ -183,6 +198,47 @@ def run_config(
return dict.fromkeys(metrics, str(e))


def run_config_memleak(config: TorchBenchModelConfig, dryrun: bool=False) -> Dict[str, str]:
    """Run a model config inside a ModelTask worker and watch for CUDA memory leaks.

    Returns a dict with a single "memleak" key whose value is one of the
    strings "True" (leak detected), "False" (no leak), "not_implemented",
    or "skip_by_dryrun".
    """
    def assertEqual(x, y):
        # watch_cuda_memory calls this with before/after memory stats;
        # a mismatch raises AssertionError, which we map to a detected leak.
        assert x == y, f"{x} != {y}"

    if dryrun:
        # Short-circuit BEFORE creating the ModelTask: spawning the worker
        # subprocess is heavyweight and pointless for a dry run.
        print(" [skip_by_dryrun] ", flush=True)
        return {"memleak": "skip_by_dryrun"}
    model_name = config.name
    model_path = os.path.join(REPO_PATH, "torchbenchmark", "models", model_name)
    metadata = get_metadata_from_yaml(model_path)
    task = ModelTask(model_path, timeout=TIMEOUT)
    allow_customize_batch_size = task.get_model_attribute(
        "ALLOW_CUSTOMIZE_BSIZE", classattr=True
    )
    # To speed up the test, use batch size 1 if the model allows it.
    batch_size = 1 if allow_customize_batch_size else None
    try:
        with task.watch_cuda_memory(
            skip=False,
            assert_equal=assertEqual,
        ):
            task.make_model_instance(
                test=config.test,
                device=config.device,
                batch_size=batch_size,
            )
            task.invoke()
            if config.test == "train":
                task.check_details_train(device=config.device, md=metadata)
            else:
                task.check_details_eval(device=config.device, md=metadata)
                task.check_eval_output()
            task.del_model_instance()
        result = {"memleak": "False"}
    except NotImplementedError:
        result = {"memleak": "not_implemented"}
    except AssertionError:
        # The memory check raised: memory after the run differs from before.
        result = {"memleak": "True"}
    # NOTE(fix): the original returned from a `finally` block, which both
    # swallowed unexpected exceptions and raised UnboundLocalError when
    # `result` was never assigned; return normally instead so unexpected
    # errors propagate with their real traceback.
    return result


def run_config_accuracy(
config: TorchBenchModelConfig, metrics: List[str], dryrun: bool = False
) -> Dict[str, str]:
Expand Down Expand Up @@ -273,6 +329,8 @@ def run(args: List[str]):
metrics = get_metrics(config)
if "accuracy" in metrics:
metrics_dict = run_config_accuracy(config, metrics, dryrun=args.dryrun)
elif "memleak" in metrics:
metrics_dict = run_config_memleak(config, dryrun=args.dryrun)
else:
metrics_dict = run_config(config, metrics, dryrun=args.dryrun)
config_str = config_to_str(config)
Expand All @@ -284,5 +342,7 @@ def run(args: List[str]):
if args.device == "cuda":
import torch
result["environ"]["device"] = torch.cuda.get_device_name()
o = json.dumps(result, indent=4)
print(o)
with open(args.output, "w") as f:
json.dump(result, f, indent=4)
f.write(o)
8 changes: 7 additions & 1 deletion utils/build_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,14 @@ def build_pytorch_repo(repo: TorchRepo, build_env: Dict[str, str]):
if os.path.exists(version_py_path):
os.remove(version_py_path)
try:
# Build and test triton
build_triton_command = ["make", "triton"]
subprocess.check_call(build_triton_command, cwd=repo.src_path.absolute(), env=build_env)
command_testbuild = [sys.executable, "-c", "'import triton'"]
subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env)
# Build and test pytorch
subprocess.check_call(repo.build_command, cwd=repo.src_path.absolute(), env=build_env)
command_testbuild = [sys.executable, "-c", "'import torch'"]
command_testbuild = [sys.executable, "-c", "'import torch; import triton'"]
subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env)
except subprocess.CalledProcessError:
_print_info(f"BUILDING {repo.name.upper()} commit {repo.cur_commit} 2ND TRY")
Expand Down

0 comments on commit c4098d2

Please sign in to comment.