Distinguish compile time from run time (#1865)
Summary:
This PR separates compile time from run time so that we track two numbers; previously, the metrics measured end-to-end compile + run time.

SGD with foreach, momentum, and CUDA on pt2 today takes ~2.36s on average. I anticipate the runtime will remain close to that after this change, whereas the compile time will be reported as a separate, much larger value.

Current results:
<img width="1369" alt="image" src="https://github.com/pytorch/benchmark/assets/31798555/16d366da-3361-4540-a2a2-bc22f51965e6">

The metrics dictionary used to have entries like:
```
{
    "name": "optim",
    "environ": {
        "pytorch_git_version": "c527d0fadd27595ba2f98dd0f57aae5c56658d71"
    },
    "metrics": {
        "resnet18, Adam, cuda, (pt2) default": 0.0018993107215413507,
        "resnet18, Adam, cuda, default": 0.0010943956114351748,
        "resnet18, Adam, cuda, (pt2) amsgrad, maximize": 0.002033790648736135,
        "resnet18, Adam, cuda, amsgrad, maximize": 0.0013529009232297541,
        "resnet18, Adam, cuda, (pt2) no_foreach": 0.005578947072434757,
        ...
    }
}
```

Now, keys are prefixed with "compile_time" when the entry measures compile time:
```
{
    "name": "optim",
    "environ": {
        "pytorch_git_version": "a005f70a4284152e00c8f6603feaf4ab9636f6aa"
    },
    "metrics": {
        "resnet18, SGD, cuda, (pt2) no_foreach": 0.0017500566132366657,
        "resnet18, SGD, cuda, no_foreach": 0.0025729038193821907,
        "resnet18, SGD, cuda, (pt2) foreach": 0.0017613966017961502,
        ...
        "resnet18, SGD, cpu, foreach, momentum=0.9": 0.08240865767002106,
        "compile_time, resnet18, SGD, cuda, (pt2) no_foreach": 14.877577589824796,
        "compile_time, resnet18, SGD, cuda, (pt2) foreach": 0.6698574535548687,
        "compile_time, resnet18, SGD, cuda, (pt2) foreach, momentum=0.9, nesterov": 0.32723781156043213,
        ...
        "compile_time, resnet18, SGD, cpu, (pt2) foreach, momentum=0.9": 0.29321490600705147
    }
}
```
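
For downstream consumers of this JSON, a minimal sketch of splitting the merged metrics on the new key prefix (a hypothetical helper, not part of this PR):

```python
from typing import Dict, Tuple

# Hypothetical helper (not part of this PR): separate runtime entries from
# compile-time entries using the "compile_time, " key prefix described above.
def split_metrics(metrics: Dict[str, float]) -> Tuple[Dict[str, float], Dict[str, float]]:
    prefix = "compile_time, "
    runtime = {k: v for k, v in metrics.items() if not k.startswith(prefix)}
    compile_time = {k: v for k, v in metrics.items() if k.startswith(prefix)}
    return runtime, compile_time
```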

Pull Request resolved: #1865

Reviewed By: mlazos

Differential Revision: D49022308

Pulled By: janeyx99

fbshipit-source-id: 4a143071d232160b239efc03e04afba611a973af
janeyx99 authored and facebook-github-bot committed Sep 7, 2023
1 parent ffbbebb commit 1e7bcd0
Showing 2 changed files with 45 additions and 18 deletions.
4 changes: 3 additions & 1 deletion userbenchmark/optim/regression_detector.py
```diff
@@ -2,6 +2,7 @@
 from ..utils import TorchBenchABTestResult, TorchBenchABTestMetric
 
 DEFAULT_REGRESSION_DELTA_THRESHOLD = 0.3
+COMPILE_TIME_REGRESSION_DELTA_THRESHOLD = 2.0
 
 def run(control, treatment) -> Optional[TorchBenchABTestResult]:
     control_env = control["environ"]
@@ -11,10 +12,11 @@ def run(control, treatment) -> Optional[TorchBenchABTestResult]:
     details = {}
     for control_metric_name, control_metric in control_metrics.items():
         if control_metric_name in treatment_metrics:
+            regression_threshold = COMPILE_TIME_REGRESSION_DELTA_THRESHOLD if "compile_time" in control_metric_name else DEFAULT_REGRESSION_DELTA_THRESHOLD
             treatment_metric = treatment_metrics[control_metric_name]
             delta = (treatment_metric - control_metric) / control_metric
             # Trigger on BOTH slowdowns and speedups
-            if abs(delta) > DEFAULT_REGRESSION_DELTA_THRESHOLD:
+            if abs(delta) > regression_threshold:
                 details[control_metric_name] = TorchBenchABTestMetric(control=control_metric, treatment=treatment_metric, delta=delta)
     # control_only_metrics/treatment_only_metrics will be filled in later by the main regression detector
     return TorchBenchABTestResult(name=control["name"],
```
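
The looser compile-time threshold matters in practice; a worked example with illustrative numbers (not measurements from this PR): a 35% runtime regression trips the 0.3 default, while a ~124% compile-time regression stays under the 2.0 compile-time threshold.

```python
# Illustrative numbers only: mimic the relative-delta check above with both thresholds.
control = {
    "resnet18, SGD, cuda, (pt2) foreach": 0.0020,              # runtime (s)
    "compile_time, resnet18, SGD, cuda, (pt2) foreach": 0.67,  # compile time (s)
}
treatment = {
    "resnet18, SGD, cuda, (pt2) foreach": 0.0027,              # +35% -> exceeds 0.3
    "compile_time, resnet18, SGD, cuda, (pt2) foreach": 1.50,  # +124% -> under 2.0
}

for name, c in control.items():
    threshold = 2.0 if "compile_time" in name else 0.3
    delta = (treatment[name] - c) / c
    print(f"{name}: delta={delta:.2f}, {'flagged' if abs(delta) > threshold else 'ok'}")
```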
59 changes: 42 additions & 17 deletions userbenchmark/optim/run.py
```diff
@@ -11,6 +11,7 @@
 import sys
 import itertools
 import datetime
+import time
 import yaml
 
 with add_path(REPO_PATH):
@@ -405,33 +406,48 @@ def run_model(modelName, device, Optim, defaults, maybe_pt2_):
 
         print(f'{datetime.datetime.now()} python -m userbenchmark.optim.run -m {modelName} -d {device}' +
               f' -o {Optim.__name__} --df "{defaults_to_str(defaults)}" -f {maybe_pt2_}')
+
+        compile_r = None
+        sub_label = f'{modelName}, {optim.__class__.__name__}, {device}'
+        description = pt2_description + defaults_to_str(defaults)
+
+        # Get compile time by running 5 times and subtracting
+        # first entry - avg(entries 3 through 5)
+        # skipping the second entry due to high variance of first cache hit
+        if maybe_pt2_ != '':
+            times = []
+            if device == "cuda":
+                torch.cuda.reset_peak_memory_stats()
+                torch.cuda.empty_cache()
+            for _ in range(5):
+                t0 = time.perf_counter()
+                pt2_optimizer_step(optim)
+                t1 = time.perf_counter()
+                times.append(t1 - t0)
+            compile_r = (f'compile_time, {sub_label}, {description}', times[0] - sum(times[2:5]) / 3)
+
         r = benchmark.Timer(
             stmt=f'{maybe_pt2_}optimizer_step(optim)',
             globals={'optim': optim, 'optimizer_step': optimizer_step, 'pt2_optimizer_step': pt2_optimizer_step},
-            sub_label=f'{modelName}, {optim.__class__.__name__}, {device}',
-            description=pt2_description + defaults_to_str(defaults),
+            sub_label=sub_label,
+            description=description,
         ).blocked_autorange()
 
         if maybe_pt2_:
             # Clears the cache that dynamo had accumulated to prevent OOMs
             # See https://github.com/pytorch/pytorch/issues/100264
             torchdynamo.reset()
             gc.collect()
 
-        return r
+        return r, compile_r
     except Exception as e:
         if not continue_on_error:
             raise e
         print(e)
         with open('errors.txt', 'a') as f:
             f.write(f'{datetime.datetime.now()} python -m userbenchmark.optim.run -m {modelName} -d {device}' +
                     f' -o {Optim.__name__} --df "{defaults_to_str(defaults)}" -f {maybe_pt2_}{str(e)}\n')
-        return None
+        return None, None
 
 
 def run_benchmarks(optims: List[str], func_strs: List[str], models: List[str], devices: List[str],
-                   flags: List[str]) -> List[torch.utils.benchmark.utils.common.Measurement]:
+                   flags: List[str]) -> Tuple[List[torch.utils.benchmark.utils.common.Measurement], Dict[str, float]]:
     results = []
+    compile_metrics = {}
     optim_cfgs = [(O, defaults) for (O, defaults) in OPTIMIZERS if O.__name__ in optims and all(f in defaults_to_str(defaults) for f in flags)]
 
     if run_on_subset:
@@ -443,10 +459,13 @@ def run_benchmarks(optims: List[str], func_strs: List[str], models: List[str], devices: List[str],
     for mn, d, (O, defaults), func_str in itertools.product(models, devices, optim_cfgs, func_strs):
         if (not ignore_skips and is_excluded(mn, d, O.__name__, func_str, defaults)):
             continue
-        bm = run_model(mn, d, O, defaults, func_str)
-        if bm is not None:
-            results.append(bm)
-    return results
+        r, compile_r = run_model(mn, d, O, defaults, func_str)
+        if r is not None:
+            results.append(r)
+        if compile_r is not None:
+            metric_name, compile_time = compile_r
+            compile_metrics[metric_name] = compile_time
+    return results, compile_metrics
 
 
 def parse_args(args: List[str]):
@@ -541,13 +560,19 @@ def run(args: List[str]):
     if target_dir is not None:
         target_dir.mkdir(exist_ok=True, parents=True)
 
-    results = run_benchmarks(args.optims, args.funcs, args.models, args.devices, args.default_flags)
+    results, compile_metrics = run_benchmarks(args.optims, args.funcs, args.models, args.devices, args.default_flags)
     metrics: Dict[str, float] = get_metrics(results)
-    dump_output(BM_NAME, get_output_json(BM_NAME, metrics), target_dir=target_dir)
+    dump_output(BM_NAME, get_output_json(BM_NAME, {**metrics, **compile_metrics}), target_dir=target_dir)
+
+    print("----------------- RUNTIME RESULTS -----------------")
     compare = benchmark.Compare(results)
     compare.trim_significant_figures()
     compare.colorize(rowwise=True)
     compare.print()
+
+    print("----------------- COMPILE TIME RESULTS -----------------")
+    print(compile_metrics)
 
 
 if __name__ == '__main__':
     run(sys.argv[1:])
```
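
For reference, the compile-time estimate in run_model boils down to first-call time minus the steady-state average; a minimal standalone sketch of that arithmetic (generic step callable, not the benchmark harness):

```python
import time
from typing import Callable, List

# Minimal sketch of the estimation used above: time 5 calls, then take the
# first call (compile + run) minus the average of calls 3-5 (steady-state run),
# skipping call 2 because the first cache hit is noisy.
def estimate_compile_time(step: Callable[[], None]) -> float:
    times: List[float] = []
    for _ in range(5):
        t0 = time.perf_counter()
        step()
        t1 = time.perf_counter()
        times.append(t1 - t0)
    return times[0] - sum(times[2:5]) / 3
```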
