diff --git a/userbenchmark/dynamo/dynamobench/common.py b/userbenchmark/dynamo/dynamobench/common.py
index 22c0ac2cf5..2f0eccf977 100644
--- a/userbenchmark/dynamo/dynamobench/common.py
+++ b/userbenchmark/dynamo/dynamobench/common.py
@@ -674,8 +674,9 @@ def maybe_mark_profile(*args, **kwargs):
         with maybe_mark_profile(p=p, mark="actual"), maybe_enable_compiled_autograd(
             args.compiled_autograd
         ):
+            compiled_model = kwargs.get("compiled_model", model)
             timings[rep, 1], actual_output = timed(
-                model,
+                compiled_model,
                 frozen_model_iter_fn,
                 inputs,
                 return_result=True,
@@ -740,11 +741,16 @@ def maybe_mark_profile(*args, **kwargs):
         for k, v in kwargs["dynamo_stats"].items():
             headers.append(k)
             row.append(v)
-    output_csv(
-        output_filename,
-        headers,
-        row,
-    )
+    if (
+        not torch.distributed.is_available()  # no distributed is built
+        or not torch.distributed.is_initialized()  # single gpu
+        or torch.distributed.get_rank() == 0  # distributed + rank0
+    ):
+        output_csv(
+            output_filename,
+            headers,
+            row,
+        )
     headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True)
     assert (
         output_filename.find(".csv") > 0
@@ -2643,10 +2649,15 @@ def warmup(fn, model, example_inputs, mode, niters=5):
             return latency, peak_mem, dynamo_stats

         # Cast the model to float16/float32 as necessary
-        model, example_inputs = self.maybe_cast(model, example_inputs)
+        orig_model, example_inputs = self.maybe_cast(model, example_inputs)

         # Use distributed wrapping as necessary
-        model = self.deepcopy_and_maybe_parallelize(model)
+        model = self.deepcopy_and_maybe_parallelize(orig_model)
+        if experiment.func is speedup_experiment:
+            # If DDP + compiler is enabled, we need to use a different model copy
+            compiled_model = self.deepcopy_and_maybe_parallelize(orig_model)
+        else:
+            compiled_model = model

         self.init_optimizer(name, current_device, model.parameters())
         with self.pick_grad(name, self.args.training):
@@ -2670,7 +2681,7 @@ def warmup(fn, model, example_inputs, mode, niters=5):

             with maybe_enable_compiled_autograd(self.args.compiled_autograd):
                 dynamo_latency, dynamo_peak_mem, dynamo_stats = warmup(
-                    optimized_model_iter_fn, model, example_inputs, "dynamo"
+                    optimized_model_iter_fn, compiled_model, example_inputs, "dynamo"
                 )

             compilation_time = dynamo_latency - eager_latency + aot_compilation_time
@@ -2696,10 +2707,10 @@ def warmup(fn, model, example_inputs, mode, niters=5):
             results = []
             # run with torch._dynamo few times to populate the cache
             for _ in range(3):
-                optimized_model_iter_fn(model, example_inputs)
+                optimized_model_iter_fn(compiled_model, example_inputs)
             _, frames_second_pass = Stats.reset_counters()  # should be 0
             if frames_second_pass > 0:
-                optimized_model_iter_fn(model, example_inputs)
+                optimized_model_iter_fn(compiled_model, example_inputs)
                 _, frames_third_pass = Stats.reset_counters()  # should be 0
             else:
                 frames_third_pass = 0
@@ -2715,6 +2726,7 @@ def warmup(fn, model, example_inputs, mode, niters=5):
             if not hasattr(model, name):
                 model.name = name

+            experiment_kwargs["compiled_model"] = compiled_model
             results.append(experiment(model, example_inputs, **experiment_kwargs))

         return " ".join(map(str, results))
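
Note on the second hunk: the guard before output_csv is the standard "write results only once" pattern for multi-process runs. A minimal, self-contained sketch of the same check, assuming a hypothetical write_row helper standing in for output_csv:

import torch.distributed as dist


def is_output_writer() -> bool:
    # True when distributed support is not compiled into this torch build,
    # when no process group has been initialized (single-GPU run), or when
    # this process is rank 0 of the initialized process group.
    return (
        not dist.is_available()
        or not dist.is_initialized()
        or dist.get_rank() == 0
    )


def write_row(path: str, headers: list, row: list) -> None:
    # Hypothetical stand-in for output_csv: append one header and one data row.
    if is_output_writer():
        with open(path, "a") as f:
            f.write(",".join(map(str, headers)) + "\n")
            f.write(",".join(map(str, row)) + "\n")

Without this guard, every rank in a DDP job appends its own copy of the row, so the CSV ends up with world-size duplicates of each result.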
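
Note on the run_performance_test hunks: the point of orig_model / compiled_model is that the eager baseline and the compiled run in speedup_experiment each get their own deep copy of the model (each independently distributed-wrapped), so state from one measurement cannot leak into the other. A minimal sketch of the pattern, using a hypothetical make_copies helper in place of deepcopy_and_maybe_parallelize (which additionally wraps in DDP when requested):

import copy

import torch
import torch.nn as nn


def make_copies(orig_model: nn.Module, separate_compiled_copy: bool):
    # The eager baseline always runs on its own copy.
    eager_model = copy.deepcopy(orig_model)
    if separate_compiled_copy:
        # Second independent copy for the compiled run, mirroring the
        # speedup_experiment branch in the diff above.
        compiled_model = copy.deepcopy(orig_model)
    else:
        compiled_model = eager_model
    return eager_model, compiled_model


eager_model, compiled_model = make_copies(nn.Linear(8, 8), True)
compiled_model = torch.compile(compiled_model)  # eager_model stays uncompiled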