From bf8df16f6f2cc0de07f25edc30a23dca743c0258 Mon Sep 17 00:00:00 2001 From: "Bin Bao (Meta Employee)" Date: Thu, 7 Dec 2023 19:50:18 -0800 Subject: [PATCH] Log load_model failures in csv (#114784) Summary: Right now when load_model fails (either because of loading error or validation eager run failure), the result won't be logged in generated csv files. Let's log them in csv so that they are monitored by the expected results checking. X-link: https://github.com/pytorch/pytorch/pull/114784 Approved by: https://github.com/malfet Reviewed By: atalman Differential Revision: D51939241 Pulled By: desertfire fbshipit-source-id: 945e297409654654d2b97aa518bbf1f894c41c8b --- userbenchmark/dynamo/dynamobench/common.py | 72 ++++++++++++---------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/userbenchmark/dynamo/dynamobench/common.py b/userbenchmark/dynamo/dynamobench/common.py index 28ac042a96..fdeb4e218b 100644 --- a/userbenchmark/dynamo/dynamobench/common.py +++ b/userbenchmark/dynamo/dynamobench/common.py @@ -1939,8 +1939,7 @@ def validate_model(self, model, example_inputs): try: self.model_iter_fn(model, example_inputs) except Exception as e: - print(f"Original Error: {str(e)}") - raise NotImplementedError("Eager model failed to run") from e + raise RuntimeError("Eager run failed") from e def maybe_cast(self, model, example_inputs): model = self.deepcopy_model(model) @@ -2191,6 +2190,7 @@ def record_status(accuracy_status, dynamo_start_stats): if isinstance(e, torch.cuda.OutOfMemoryError) else "eager_2nd_run_fail" ) + log.exception(e) return record_status(accuracy_status, dynamo_start_stats=start_stats) finally: del model_copy @@ -2546,6 +2546,8 @@ def run_one_model( name, model, example_inputs, optimize_ctx, experiment, tag ) print(status) + torch.cuda.empty_cache() + if self.args.timing: from torch._dynamo.utils import op_count, print_time_report from torch.utils._stats import simple_call_counter @@ -3498,6 +3500,31 @@ def run(runner, args, original_dir=None): # Go back to main branch repo.git.checkout(main_branch) elif args.only: + + def write_csv_when_exception(name: str, status: str, device=None): + print(status) + placeholder_batch_size = 0 + devices = [device] if device is not None else args.devices + if args.accuracy: + headers = ["dev", "name", "batch_size", "accuracy"] + rows = [ + [device, name, placeholder_batch_size, status] for device in devices + ] + elif args.performance: + headers = ["dev", "name", "batch_size", "speedup", "abs_latency"] + rows = [ + [device, name, placeholder_batch_size, 0.0, 0.0] + for device in devices + ] + else: + headers = [] + rows = [ + [device, name, placeholder_batch_size, 0.0] for device in devices + ] + + for row in rows: + output_csv(output_filename, headers, row) + model_name = args.only for device in args.devices: batch_size = args.batch_size @@ -3513,6 +3540,7 @@ def run(runner, args, original_dir=None): torch.Tensor, lambda x: x.to(device=device), example_inputs ) else: + name = model_name try: with tqdm(desc="loading model"): extra_args = [] @@ -3567,12 +3595,18 @@ def run(runner, args, original_dir=None): batch_size=batch_size, extra_args=extra_args, ) - except NotImplementedError as e: - print(e) + except RuntimeError as e: import traceback + mode = "train" if args.training else "eval" + print(f"{device:4} {mode:5} {name:34} ") print(traceback.format_exc()) - logging.warning("%s failed to load", args.only) + status = ( + "model_fail_to_load" + if isinstance(e, NotImplementedError) + else "eager_fail_to_run" + ) + write_csv_when_exception(name, status, device) continue # bad benchmark implementation if args.trace_on_xla: @@ -3657,33 +3691,9 @@ def detect_and_mark_batch(t): nmodels = len(model_names) for i, name in enumerate(model_names): current_name = name - placeholder_batch_size = 0 if args.progress: print(f"Running model {i+1}/{nmodels}", flush=True) - def write_csv(status): - if args.accuracy: - headers = ["dev", "name", "batch_size", "accuracy"] - rows = [ - [device, name, placeholder_batch_size, status] - for device in args.devices - ] - elif args.performance: - headers = ["dev", "name", "batch_size", "speedup", "abs_latency"] - rows = [ - [device, name, placeholder_batch_size, 0.0, 0.0] - for device in args.devices - ] - else: - headers = [] - rows = [ - [device, name, placeholder_batch_size, 0.0] - for device in args.devices - ] - - for row in rows: - output_csv(output_filename, headers, row) - try: timeout = args.timeout if should_diff_branch(args): @@ -3692,13 +3702,11 @@ def write_csv(status): [sys.executable] + sys.argv + [f"--only={name}"], timeout=timeout ) except subprocess.TimeoutExpired: - print("TIMEOUT", file=sys.stderr) - write_csv("timeout") + write_csv_when_exception(name, "timeout") except subprocess.CalledProcessError as e: print("Run failed with return code: ", e.returncode, file=sys.stderr) print("Output: ", e.output, file=sys.stderr) print("Error: ", e.stderr, file=sys.stderr) - write_csv("infra_error") print_summary(output_filename, print_dataframe=args.print_dataframe_summary)