Add CUDA Memory Leak error bisection #2205

Closed
wants to merge 14 commits
8 changes: 6 additions & 2 deletions .github/workflows/userbenchmark-a100-bisection.yml
@@ -58,9 +58,13 @@ jobs:
sudo nvidia-smi -pm 1
sudo nvidia-smi -ac 1215,1410
nvidia-smi
- name: Install Deps
- name: Install devel packages
run: |
sudo apt-get -y update && sudo apt -y update
sudo apt-get update -y
sudo apt-get update -y
sudo apt-get install -y libjpeg-dev zlib1g-dev libpng-dev
sudo ldconfig
sudo ldconfig
- name: Setup conda env
run: |
CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
Expand Down
6 changes: 4 additions & 2 deletions docker/gcp-a100-runner-dind.dockerfile
@@ -6,13 +6,15 @@ FROM ${BASE_IMAGE}
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN sudo apt-get -y update && sudo apt -y update
# fontconfig: needed by model doctr_det_predictor
# fontconfig: required by model doctr_det_predictor
# libjpeg and libpng: optionally required by torchvision (vision#8342)
RUN sudo apt-get install -y git jq gcc g++ \
vim wget curl ninja-build cmake \
libgl1-mesa-glx libsndfile1-dev kmod libxml2-dev libxslt1-dev \
fontconfig libfontconfig1-dev \
libpango-1.0-0 libpangoft2-1.0-0 \
libsdl2-dev libsdl2-2.0-0
libsdl2-dev libsdl2-2.0-0 \
libjpeg-dev libpng-dev zlib1g-dev

# get switch-cuda utility
RUN sudo wget -q https://raw.githubusercontent.com/phohenecker/switch-cuda/master/switch-cuda.sh -O /usr/bin/switch-cuda.sh
2 changes: 1 addition & 1 deletion torchbenchmark/util/framework/timm/extended_configs.py
@@ -19,7 +19,7 @@ def is_extended_timm_models(model_name: str) -> bool:
    return model_name in TIMM_MODELS

def list_extended_timm_models() -> List[str]:
    return TIMM_MODELS.keys()
    return list(TIMM_MODELS.keys())

# TODO - Figure out the reason for the cold start memory spike
BATCH_SIZE_DIVISORS = {
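The list(...) wrapper matters because dict.keys() returns a view object, not the List[str] that the annotation promises. A quick illustration, using a toy dict as a hypothetical stand-in for TIMM_MODELS:

import json

models = {"beit_base_patch16_224": 64, "convnext_base": 32}  # hypothetical stand-in
keys = models.keys()
# keys[0]           # TypeError: 'dict_keys' object is not subscriptable
# json.dumps(keys)  # TypeError: Object of type dict_keys is not JSON serializable
list(models.keys())[0]  # works the way callers of list_extended_timm_models() expect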
62 changes: 61 additions & 1 deletion userbenchmark/test_bench/run.py
@@ -21,6 +21,19 @@
    get_default_debug_output_dir,
)
from . import BM_NAME
from torchbenchmark import (
    ModelTask,
    get_metadata_from_yaml,
    REPO_PATH,
)

# Some of the models have very heavyweight setup, so we have to set a very
# generous limit. That said, we don't want the entire test suite to hang if
# a single test encounters an extreme failure, so we give up after a test is
# unresponsive for 5 minutes by default. (Note: this does not require that the
# entire test case completes in 5 minutes. It requires that if the worker is
# unresponsive for 5 minutes the parent will presume it dead / incapacitated.)
TIMEOUT = int(os.getenv("TIMEOUT", 300)) # Seconds

with add_path(REPO_PATH):
    from torchbenchmark.util.experiment.instantiator import (
@@ -115,6 +128,8 @@ def init_output_dir(
def get_metrics(config: TorchBenchModelConfig) -> List[str]:
    if "--accuracy" in config.extra_args:
        return ["accuracy"]
    if "--memleak" in config.extra_args:
        return ["memleak"]
    return ["latencies", "cpu_peak_mem", "gpu_peak_mem"]


@@ -183,6 +198,47 @@ def run_config(
        return dict.fromkeys(metrics, str(e))


def run_config_memleak(config: TorchBenchModelConfig, dryrun: bool = False) -> Dict[str, str]:
    def assertEqual(x, y):
        assert x == y, f"{x} != {y}"

    model_name = config.name
    model_path = os.path.join(REPO_PATH, "torchbenchmark", "models", model_name)
    metadata = get_metadata_from_yaml(model_path)
    task = ModelTask(model_path, timeout=TIMEOUT)
    allow_customize_batch_size = task.get_model_attribute(
        "ALLOW_CUSTOMIZE_BSIZE", classattr=True
    )
    # To speed up the test, use batch size 1 if possible.
    batch_size = 1 if allow_customize_batch_size else None
    if dryrun:
        print(" [skip_by_dryrun] ", flush=True)
        return {"memleak": "skip_by_dryrun"}
    try:
        with task.watch_cuda_memory(
            skip=False,
            assert_equal=assertEqual,
        ):
            task.make_model_instance(
                test=config.test,
                device=config.device,
                batch_size=batch_size,
            )
            task.invoke()
            if config.test == "train":
                task.check_details_train(device=config.device, md=metadata)
            else:
                task.check_details_eval(device=config.device, md=metadata)
                task.check_eval_output()
            task.del_model_instance()
        result = {"memleak": "False"}
    except NotImplementedError:
        result = {"memleak": "not_implemented"}
    except AssertionError:
        # The memory check on exit from watch_cuda_memory failed.
        result = {"memleak": "True"}
    return result
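For intuition, the leak check amounts to snapshotting CUDA allocator state around the workload and asserting that it returns to baseline. A minimal standalone sketch of that idea, not the actual watch_cuda_memory implementation (whose internals are not part of this diff):

import contextlib
import gc

import torch

@contextlib.contextmanager
def watch_cuda_memory_sketch(assert_equal):
    # Settle the allocator before taking the baseline measurement.
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    before = torch.cuda.memory_allocated()
    yield
    # After the workload runs and del_model_instance() drops all references,
    # every CUDA allocation should have been returned to the allocator.
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    assert_equal(before, torch.cuda.memory_allocated())

An AssertionError raised when the context exits is exactly what run_config_memleak maps to {"memleak": "True"}.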


def run_config_accuracy(
    config: TorchBenchModelConfig, metrics: List[str], dryrun: bool = False
) -> Dict[str, str]:
@@ -273,6 +329,8 @@ def run(args: List[str]):
    metrics = get_metrics(config)
    if "accuracy" in metrics:
        metrics_dict = run_config_accuracy(config, metrics, dryrun=args.dryrun)
    elif "memleak" in metrics:
        metrics_dict = run_config_memleak(config, dryrun=args.dryrun)
    else:
        metrics_dict = run_config(config, metrics, dryrun=args.dryrun)
    config_str = config_to_str(config)
@@ -284,5 +342,7 @@ def run(args: List[str]):
    if args.device == "cuda":
        import torch

        result["environ"]["device"] = torch.cuda.get_device_name()
    o = json.dumps(result, indent=4)
    print(o)
    with open(args.output, "w") as f:
        json.dump(result, f, indent=4)
        f.write(o)
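Assuming test_bench forwards unrecognized CLI flags into each config's extra_args (the parse_known_args convention used by torchbench userbenchmarks; not shown in this hunk), a hypothetical end-to-end invocation would look like:

from userbenchmark.test_bench.run import run

# Hypothetical flags; everything besides --memleak follows the existing
# test_bench CLI and is an assumption here.
run([
    "--models", "resnet50",
    "--device", "cuda",
    "--test", "train",
    "--output", "/tmp/test_bench_memleak.json",
    "--memleak",
])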
8 changes: 7 additions & 1 deletion utils/build_utils.py
@@ -48,8 +48,14 @@ def build_pytorch_repo(repo: TorchRepo, build_env: Dict[str, str]):
    if os.path.exists(version_py_path):
        os.remove(version_py_path)
    try:
        # Build and test triton
        build_triton_command = ["make", "triton"]
        subprocess.check_call(build_triton_command, cwd=repo.src_path.absolute(), env=build_env)
        command_testbuild = [sys.executable, "-c", "import triton"]
        subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env)
        # Build and test pytorch
        subprocess.check_call(repo.build_command, cwd=repo.src_path.absolute(), env=build_env)
        command_testbuild = [sys.executable, "-c", "import torch"]
        command_testbuild = [sys.executable, "-c", "import torch; import triton"]
        subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env)
    except subprocess.CalledProcessError:
        _print_info(f"BUILDING {repo.name.upper()} commit {repo.cur_commit} 2ND TRY")
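A note on the -c arguments above: subprocess.check_call passes argv directly with no shell, so wrapping the program text in an extra pair of single quotes hands Python a bare string literal, which evaluates as a no-op and always succeeds. A quick demonstration of the difference:

import subprocess
import sys

# argv[2] is the literal text 'import torch' (quotes included): Python
# evaluates a string expression, imports nothing, and exits 0 regardless
# of whether torch is installed or broken.
subprocess.check_call([sys.executable, "-c", "'import torch'"])

# argv[2] is import torch: the module is actually imported, so this fails
# loudly if the freshly built torch is broken.
subprocess.check_call([sys.executable, "-c", "import torch"])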