diff --git a/userbenchmark/dynamo/dynamobench/common.py b/userbenchmark/dynamo/dynamobench/common.py
index 26a6624087..f25c06e827 100644
--- a/userbenchmark/dynamo/dynamobench/common.py
+++ b/userbenchmark/dynamo/dynamobench/common.py
@@ -1982,6 +1982,12 @@ def init_optimizer(self, name, device, params):
         if device == "cuda" and self.args.training and name not in CI_SKIP_OPTIMIZER:
             if (name in CI_USE_SGD and self.args.ci) or name in BENCHMARK_USE_SGD:
                 self.optimizer = torch.optim.SGD(params, lr=0.01, foreach=True)
+                # Disable multi_tensor_sgd for benchmarking: there isn't a large performance benefit (~1%) to compiling
+                # this optimizer because it is a single foreach add, and compiling it increases compile time.
+                # After autotuning and fake tensor caching land, we can enable it, because the compile time impact will be lower.
+                # Fake tensor caching: https://github.com/pytorch/pytorch/pull/113873
+                # Autotuning: https://github.com/pytorch/pytorch/issues/117447
+                self.optimizer.step = torch._dynamo.disable(self.optimizer.step)
             else:
                 self.optimizer = torch.optim.Adam(
                     params, lr=0.01, capturable=True, foreach=True
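
For reference, here is a minimal standalone sketch (not part of the diff) of the pattern the added line uses: wrapping an optimizer's step with torch._dynamo.disable so that torch.compile only compiles the model, while the foreach SGD step runs eagerly. The toy model, tensor sizes, and device handling below are illustrative assumptions, not taken from the benchmark harness.

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Illustrative toy model; the benchmark harness uses real model suites instead.
model = torch.nn.Linear(16, 16).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, foreach=True)

# Exclude the optimizer step from dynamo tracing, mirroring the diff above,
# so compile time is not spent on a single foreach add.
optimizer.step = torch._dynamo.disable(optimizer.step)

compiled_model = torch.compile(model)

x = torch.randn(8, 16, device=device)
loss = compiled_model(x).sum()
loss.backward()
optimizer.step()       # runs eagerly; dynamo skips this frame
optimizer.zero_grad()
```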