From 6a8b941b59bb861afa43a9213c7dcd8974d0d9f4 Mon Sep 17 00:00:00 2001
From: "Simon Fan (Meta Employee)"
Date: Mon, 15 Jan 2024 10:33:33 -0800
Subject: [PATCH] Torchbench Dynamo Runner: Enable DDP for perf test and traces (#113332)

Summary:
- Removes an outdated assert that prevented perf tests from running DDP;
  we now have single-node --multiprocess, and perf tests already wrap the
  model using `deepcopy_and_maybe_ddp`
- Appends the rank to profiler trace file names so that all ranks do not
  try to create the same file
- Renames `deepcopy_and_maybe_ddp` to `deepcopy_and_maybe_parallelize` to
  include FSDP

X-link: https://github.com/pytorch/pytorch/pull/113332
Approved by: https://github.com/H-Huang, https://github.com/wconstab

Reviewed By: huydhn

Differential Revision: D52754061

Pulled By: xmfan

fbshipit-source-id: d34b8dcd484c795b62a2bf8f1611d3cf7e4d489f
---
 userbenchmark/dynamo/dynamobench/common.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/userbenchmark/dynamo/dynamobench/common.py b/userbenchmark/dynamo/dynamobench/common.py
index 2bfdd79304..c9325f4928 100644
--- a/userbenchmark/dynamo/dynamobench/common.py
+++ b/userbenchmark/dynamo/dynamobench/common.py
@@ -683,7 +683,10 @@ def maybe_mark_profile(*args, **kwargs):
                 )
 
     if args.export_profiler_trace:
-        name = args.profiler_trace_name + "_" + model.name + ".json"
+        name = args.profiler_trace_name + "_" + model.name
+        if hasattr(args, "rank"):
+            name += f"_rank_{args.rank}"
+        name += ".json"
         name = os.path.join(torch._dynamo.config.base_dir, name)
         p.export_chrome_trace(name)
     median = np.median(timings, axis=0)
@@ -2233,7 +2236,7 @@ def get_fsdp_auto_wrap_policy(self, model_name: str) -> Optional[ModuleWrapPolic
 
         return ModuleWrapPolicy(MODEL_FSDP_WRAP[model_name])
 
-    def deepcopy_and_maybe_ddp(self, model):
+    def deepcopy_and_maybe_parallelize(self, model):
         model = self.deepcopy_model(model)
         if self.args.ddp:
             assert (
@@ -2329,7 +2332,7 @@ def record_status(accuracy_status, dynamo_start_stats):
             inputs_fp64 = None
             try:
                 model_fp64, inputs_fp64 = cast_to_fp64(
-                    self.deepcopy_and_maybe_ddp(model),
+                    self.deepcopy_and_maybe_parallelize(model),
                     clone_inputs(example_inputs),
                 )
                 self.init_optimizer(name, current_device, model_fp64.parameters())
@@ -2363,7 +2366,7 @@ def record_status(accuracy_status, dynamo_start_stats):
             reset_rng_state()
             model_copy = None
             try:
-                model_copy = self.deepcopy_and_maybe_ddp(model)
+                model_copy = self.deepcopy_and_maybe_parallelize(model)
                 self.init_optimizer(name, current_device, model_copy.parameters())
                 correct_result = self.run_n_iterations(
                     model_copy, clone_inputs(example_inputs)
@@ -2384,7 +2387,7 @@ def record_status(accuracy_status, dynamo_start_stats):
             reset_rng_state()
             model_copy = None
             try:
-                model_copy = self.deepcopy_and_maybe_ddp(model)
+                model_copy = self.deepcopy_and_maybe_parallelize(model)
                 self.init_optimizer(name, current_device, model_copy.parameters())
                 correct_rerun_result = self.run_n_iterations(
                     model_copy, clone_inputs(example_inputs)
@@ -2431,7 +2434,7 @@ def record_status(accuracy_status, dynamo_start_stats):
             torch._dynamo.reset()
             model_copy = None
             try:
-                model_copy = self.deepcopy_and_maybe_ddp(model)
+                model_copy = self.deepcopy_and_maybe_parallelize(model)
                 self.init_optimizer(name, current_device, model_copy.parameters())
                 if self.args.export or self.args.export_aot_inductor:
                     # apply export on module directly
@@ -2615,7 +2618,7 @@ def warmup(fn, model, example_inputs, mode, niters=5):
         model, example_inputs = self.maybe_cast(model, example_inputs)
 
         # Use distributed wrapping as necessary
-        model = self.deepcopy_and_maybe_ddp(model)
+        model = self.deepcopy_and_maybe_parallelize(model)
         self.init_optimizer(name, current_device, model.parameters())
 
         with self.pick_grad(name, self.args.training):
@@ -3409,12 +3412,6 @@ def run(runner, args, original_dir=None):
             CI, args.backend, training=args.training, dynamic=args.dynamic_shapes
         )
     if args.ddp:
-        # TODO: we could also hook DDP bench up to --speedup bench, _not_ for mgpu e2e perf,
-        # but just to measure impact on singlenode of performing graph-breaks.
-        # Left it as a follow up to keep this PR isolated.
-        assert (
-            args.accuracy
-        ), "DDP benchmark is currently only hooked up to --accuracy bench"
         assert args.training, "DDP benchmark requires --training mode"
         if args.no_optimize_ddp:
             torch._dynamo.config.optimize_ddp = False
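
Note on the trace naming (illustrative, not part of the patch): the per-rank
file naming added in the first hunk boils down to the small sketch below. The
`trace_file_name` helper and the sample values are hypothetical; the patch runs
the same logic inline on `args.profiler_trace_name`, `model.name`, and
`args.rank`.

    from typing import Optional

    def trace_file_name(
        profiler_trace_name: str, model_name: str, rank: Optional[int]
    ) -> str:
        # Append the rank (set when running single-node --multiprocess) so that
        # each rank exports its own chrome trace instead of racing on one file.
        name = profiler_trace_name + "_" + model_name
        if rank is not None:
            name += f"_rank_{rank}"
        return name + ".json"

    # Example (hypothetical values): two ranks profiling the same model no
    # longer collide on the output path.
    #   trace_file_name("ddp_perf", "hf_Bert", 0) -> "ddp_perf_hf_Bert_rank_0.json"
    #   trace_file_name("ddp_perf", "hf_Bert", 1) -> "ddp_perf_hf_Bert_rank_1.json"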