From 2f9b20e94e9887278a31ad15e5fc47cae771781b Mon Sep 17 00:00:00 2001
From: Bin Bao
Date: Mon, 23 Oct 2023 11:14:50 -0700
Subject: [PATCH] Update test utility to use AOTIModelRunner

Summary:
X-link: https://github.com/pytorch/pytorch/pull/111657

Use the AOTIModelRunner provided by libtorch instead of the custom-written
RAIIModelContainer for testing. This change also makes it possible to run
AOTInductor benchmarks on CPU.

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler

imported-using-ghimport

Reviewed By: angelayi

Differential Revision: D50560764

Pulled By: desertfire

fbshipit-source-id: dbcd6c029bb0a36596bb0de894c7cffc20f0aae0
---
 userbenchmark/dynamo/dynamobench/common.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/userbenchmark/dynamo/dynamobench/common.py b/userbenchmark/dynamo/dynamobench/common.py
index be241a4957..8796da69a6 100644
--- a/userbenchmark/dynamo/dynamobench/common.py
+++ b/userbenchmark/dynamo/dynamobench/common.py
@@ -716,7 +716,9 @@ def maybe_mark_profile(*args, **kwargs):
 
     with maybe_profile(args.export_profiler_trace) as p:
         if args.export_aot_inductor:
-            frozen_model_iter_fn = export_aot_inductor(model, example_inputs)
+            frozen_model_iter_fn = export_aot_inductor(
+                model, example_inputs, args.devices[0]
+            )
         else:
             frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
 
@@ -1165,7 +1167,7 @@ class AOTInductorModelCache:
     cache = dict()
 
     @classmethod
-    def load(cls, model, example_inputs):
+    def load(cls, model, example_inputs, device):
         key = weakref.ref(model)
         if key not in cls.cache:
             # Register the output dataclass to pytree
@@ -1179,10 +1181,9 @@ def load(cls, model, example_inputs):
 
             module = torch.utils.cpp_extension.load_inline(
                 name="aot_inductor",
-                cpp_sources=[aot_inductor_launcher],
+                cpp_sources=[aot_inductor_launcher(so_path, device)],
                 functions=["run"],
-                extra_ldflags=[so_path],
-                with_cuda=True,
+                with_cuda=(device == "cuda"),
             )
 
             value = {
@@ -1211,8 +1212,8 @@ def opt_export(_, example_inputs):
     return opt_export
 
 
-def export_aot_inductor(model, example_inputs):
-    module, exported = AOTInductorModelCache.load(model, example_inputs)
+def export_aot_inductor(model, example_inputs, device):
+    module, exported = AOTInductorModelCache.load(model, example_inputs, device)
 
     def opt_aot_inductor(_, example_inputs, collect_outputs=False):
         example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
@@ -3596,8 +3597,9 @@ def run(runner, args, original_dir=None):
     elif args.backend or args.export_aot_inductor:
         if args.export_aot_inductor:
             assert not args.training, "AOTInductor only supports inference"
-            assert args.devices == ["cuda"], "AOTInductor only tested for CUDA"
-            optimize_ctx = export_aot_inductor
+            optimize_ctx = functools.partial(
+                export_aot_inductor, device=args.devices[0]
+            )
 
             # AOTInductor doesn't support control flow yet
             runner.skip_models.update(runner.skip_models_due_to_control_flow)
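
The crux of the change is that the inline launcher extension is now built per device instead of unconditionally against CUDA. Below is a minimal, self-contained sketch of that pattern using only `torch.utils.cpp_extension.load_inline`; the `build_runner` helper and the trivial C++ body are hypothetical stand-ins for the real `aot_inductor_launcher(so_path, device)` source, which wraps the AOTInductor-compiled model container.

```python
# Sketch of the per-device load_inline pattern this patch adopts in
# AOTInductorModelCache.load. Only the with_cuda=(device == "cuda")
# toggle mirrors the actual diff; build_runner and the C++ body are
# hypothetical stand-ins for the generated launcher source.
import torch
import torch.utils.cpp_extension


def build_runner(device: str):
    # Stand-in launcher source; the real one loads the compiled .so
    # and dispatches into the CPU or CUDA model runner.
    cpp_source = """
    torch::Tensor run(torch::Tensor x) {
        return x * 2;
    }
    """
    return torch.utils.cpp_extension.load_inline(
        name="aot_inductor_sketch",
        cpp_sources=[cpp_source],
        functions=["run"],
        # The key change: only build against CUDA when the benchmark
        # actually targets a CUDA device.
        with_cuda=(device == "cuda"),
    )


module = build_runner("cpu")
print(module.run(torch.ones(3)))  # tensor([2., 2., 2.])
```

Before this patch, `with_cuda=True` together with the `args.devices == ["cuda"]` assertion made a CUDA toolchain mandatory even for CPU models; threading `device` through `export_aot_inductor` and `AOTInductorModelCache.load` lets the same code path build a CPU-only extension.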