Distinguish compile time from run time (#1865)
Summary:
This PR separates compile time from run time so that we track two numbers; previously, the metrics measured end-to-end compile + run time.

SGD with foreach, momentum, and CUDA on pt2 today takes ~2.36s on average. I anticipate the runtime will remain close to that after this change, whereas the compile time will be reported as a separate, much larger value.

Current results:
<img width="1369" alt="image" src="https://github.com/pytorch/benchmark/assets/31798555/16d366da-3361-4540-a2a2-bc22f51965e6">

The metrics dictionary used to have entries like:
```
{
    "name": "optim",
    "environ": {
        "pytorch_git_version": "c527d0fadd27595ba2f98dd0f57aae5c56658d71"
    },
    "metrics": {
        "resnet18, Adam, cuda, (pt2) default": 0.0018993107215413507,
        "resnet18, Adam, cuda, default": 0.0010943956114351748,
        "resnet18, Adam, cuda, (pt2) amsgrad, maximize": 0.002033790648736135,
        "resnet18, Adam, cuda, amsgrad, maximize": 0.0013529009232297541,
        "resnet18, Adam, cuda, (pt2) no_foreach": 0.005578947072434757,
        ...
    }
}
```

Now, keys are prefixed with "compile_time" when the entry measures compile time:
```
{
    "name": "optim",
    "environ": {
        "pytorch_git_version": "a005f70a4284152e00c8f6603feaf4ab9636f6aa"
    },
    "metrics": {
        "resnet18, SGD, cuda, (pt2) no_foreach": 0.0017500566132366657,
        "resnet18, SGD, cuda, no_foreach": 0.0025729038193821907,
        "resnet18, SGD, cuda, (pt2) foreach": 0.0017613966017961502,
        ...
        "resnet18, SGD, cpu, foreach, momentum=0.9": 0.08240865767002106,
        "compile_time, resnet18, SGD, cuda, (pt2) no_foreach": 14.877577589824796,
        "compile_time, resnet18, SGD, cuda, (pt2) foreach": 0.6698574535548687,
        "compile_time, resnet18, SGD, cuda, (pt2) foreach, momentum=0.9, nesterov": 0.32723781156043213,
        ...
        "compile_time, resnet18, SGD, cpu, (pt2) foreach, momentum=0.9": 0.29321490600705147
    }
}
```
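
For downstream consumers of this JSON, a minimal sketch of splitting the merged metrics on the new key prefix (a hypothetical helper, not part of this PR):

```python
from typing import Dict, Tuple

# Hypothetical helper (not part of this PR): separate runtime entries from
# compile-time entries using the "compile_time, " key prefix described above.
def split_metrics(metrics: Dict[str, float]) -> Tuple[Dict[str, float], Dict[str, float]]:
    prefix = "compile_time, "
    runtime = {k: v for k, v in metrics.items() if not k.startswith(prefix)}
    compile_time = {k: v for k, v in metrics.items() if k.startswith(prefix)}
    return runtime, compile_time
```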

Pull Request resolved: #1865

Reviewed By: mlazos

Differential Revision: D49022308

Pulled By: janeyx99

fbshipit-source-id: 4a143071d232160b239efc03e04afba611a973af
janeyx99 authored and facebook-github-bot committed Sep 7, 2023
1 parent ffbbebb commit 1e7bcd0
Showing 2 changed files with 45 additions and 18 deletions.
4 changes: 3 additions & 1 deletion userbenchmark/optim/regression_detector.py
```diff
@@ -2,6 +2,7 @@
 from ..utils import TorchBenchABTestResult, TorchBenchABTestMetric
 
 DEFAULT_REGRESSION_DELTA_THRESHOLD = 0.3
+COMPILE_TIME_REGRESSION_DELTA_THRESHOLD = 2.0
 
 def run(control, treatment) -> Optional[TorchBenchABTestResult]:
     control_env = control["environ"]
@@ -11,10 +12,11 @@ def run(control, treatment) -> Optional[TorchBenchABTestResult]:
     details = {}
     for control_metric_name, control_metric in control_metrics.items():
         if control_metric_name in treatment_metrics:
+            regression_threshold = COMPILE_TIME_REGRESSION_DELTA_THRESHOLD if "compile_time" in control_metric_name else DEFAULT_REGRESSION_DELTA_THRESHOLD
             treatment_metric = treatment_metrics[control_metric_name]
             delta = (treatment_metric - control_metric) / control_metric
             # Trigger on BOTH slowdowns and speedups
-            if abs(delta) > DEFAULT_REGRESSION_DELTA_THRESHOLD:
+            if abs(delta) > regression_threshold:
                 details[control_metric_name] = TorchBenchABTestMetric(control=control_metric, treatment=treatment_metric, delta=delta)
     # control_only_metrics/treatment_only_metrics will be filled in later by the main regression detector
     return TorchBenchABTestResult(name=control["name"],
```
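
The looser compile-time threshold matters in practice; a worked example with illustrative numbers (not measurements from this PR): a 35% runtime regression trips the 0.3 default, while a ~124% compile-time regression stays under the 2.0 compile-time threshold.

```python
# Illustrative numbers only: mimic the relative-delta check above with both thresholds.
control = {
    "resnet18, SGD, cuda, (pt2) foreach": 0.0020,              # runtime (s)
    "compile_time, resnet18, SGD, cuda, (pt2) foreach": 0.67,  # compile time (s)
}
treatment = {
    "resnet18, SGD, cuda, (pt2) foreach": 0.0027,              # +35% -> exceeds 0.3
    "compile_time, resnet18, SGD, cuda, (pt2) foreach": 1.50,  # +124% -> under 2.0
}

for name, c in control.items():
    threshold = 2.0 if "compile_time" in name else 0.3
    delta = (treatment[name] - c) / c
    print(f"{name}: delta={delta:.2f}, {'flagged' if abs(delta) > threshold else 'ok'}")
```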
59 changes: 42 additions & 17 deletions userbenchmark/optim/run.py
```diff
@@ -11,6 +11,7 @@
 import sys
 import itertools
 import datetime
+import time
 import yaml
 
 with add_path(REPO_PATH):
@@ -405,33 +406,48 @@ def run_model(modelName, device, Optim, defaults, maybe_pt2_):
 
         print(f'{datetime.datetime.now()} python -m userbenchmark.optim.run -m {modelName} -d {device}' +
               f' -o {Optim.__name__} --df "{defaults_to_str(defaults)}" -f {maybe_pt2_}')
+
+        compile_r = None
+        sub_label = f'{modelName}, {optim.__class__.__name__}, {device}'
+        description = pt2_description + defaults_to_str(defaults)
+
+        # Get compile time by running 5 times and subtracting
+        # first entry - avg(entries 3 through 5)
+        # skipping the second entry due to high variance of first cache hit
+        if maybe_pt2_ != '':
+            times = []
+            if device == "cuda":
+                torch.cuda.reset_peak_memory_stats()
+                torch.cuda.empty_cache()
+            for _ in range(5):
+                t0 = time.perf_counter()
+                pt2_optimizer_step(optim)
+                t1 = time.perf_counter()
+                times.append(t1 - t0)
+            compile_r = (f'compile_time, {sub_label}, {description}', times[0] - sum(times[2:5]) / 3)
+
         r = benchmark.Timer(
             stmt=f'{maybe_pt2_}optimizer_step(optim)',
             globals={'optim': optim, 'optimizer_step': optimizer_step, 'pt2_optimizer_step': pt2_optimizer_step},
-            sub_label=f'{modelName}, {optim.__class__.__name__}, {device}',
-            description=pt2_description + defaults_to_str(defaults),
+            sub_label=sub_label,
+            description=description,
         ).blocked_autorange()
 
         if maybe_pt2_:
             # Clears the cache that dynamo had accumulated to prevent OOMs
             # See https://github.com/pytorch/pytorch/issues/100264
             torchdynamo.reset()
             gc.collect()
 
-        return r
+        return r, compile_r
     except Exception as e:
         if not continue_on_error:
             raise e
         print(e)
         with open('errors.txt', 'a') as f:
             f.write(f'{datetime.datetime.now()} python -m userbenchmark.optim.run -m {modelName} -d {device}' +
                     f' -o {Optim.__name__} --df "{defaults_to_str(defaults)}" -f {maybe_pt2_}{str(e)}\n')
-        return None
+        return None, None
 
 
 def run_benchmarks(optims: List[str], func_strs: List[str], models: List[str], devices: List[str],
-                   flags: List[str]) -> List[torch.utils.benchmark.utils.common.Measurement]:
+                   flags: List[str]) -> Tuple[List[torch.utils.benchmark.utils.common.Measurement], Dict[str, float]]:
     results = []
+    compile_metrics = {}
     optim_cfgs = [(O, defaults) for (O, defaults) in OPTIMIZERS if O.__name__ in optims and all(f in defaults_to_str(defaults) for f in flags)]
 
     if run_on_subset:
@@ -443,10 +459,13 @@ def run_benchmarks(optims: List[str], func_strs: List[str], models: List[str], devices: List[str],
     for mn, d, (O, defaults), func_str in itertools.product(models, devices, optim_cfgs, func_strs):
         if (not ignore_skips and is_excluded(mn, d, O.__name__, func_str, defaults)):
             continue
-        bm = run_model(mn, d, O, defaults, func_str)
-        if bm is not None:
-            results.append(bm)
-    return results
+        r, compile_r = run_model(mn, d, O, defaults, func_str)
+        if r is not None:
+            results.append(r)
+        if compile_r is not None:
+            metric_name, compile_time = compile_r
+            compile_metrics[metric_name] = compile_time
+    return results, compile_metrics
 
 
 def parse_args(args: List[str]):
@@ -541,13 +560,19 @@ def run(args: List[str]):
     if target_dir is not None:
         target_dir.mkdir(exist_ok=True, parents=True)
 
-    results = run_benchmarks(args.optims, args.funcs, args.models, args.devices, args.default_flags)
+    results, compile_metrics = run_benchmarks(args.optims, args.funcs, args.models, args.devices, args.default_flags)
     metrics: Dict[str, float] = get_metrics(results)
-    dump_output(BM_NAME, get_output_json(BM_NAME, metrics), target_dir=target_dir)
+    dump_output(BM_NAME, get_output_json(BM_NAME, {**metrics, **compile_metrics}), target_dir=target_dir)
+
+    print("----------------- RUNTIME RESULTS -----------------")
     compare = benchmark.Compare(results)
     compare.trim_significant_figures()
     compare.colorize(rowwise=True)
     compare.print()
+
+    print("----------------- COMPILE TIME RESULTS -----------------")
+    print(compile_metrics)
 
 
 if __name__ == '__main__':
     run(sys.argv[1:])
```
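
For reference, the compile-time estimate in run_model boils down to first-call time minus the steady-state average; a minimal standalone sketch of that arithmetic (generic step callable, not the benchmark harness):

```python
import time
from typing import Callable, List

# Minimal sketch of the estimation used above: time 5 calls, then take the
# first call (compile + run) minus the average of calls 3-5 (steady-state run),
# skipping call 2 because the first cache hit is noisy.
def estimate_compile_time(step: Callable[[], None]) -> float:
    times: List[float] = []
    for _ in range(5):
        t0 = time.perf_counter()
        step()
        t1 = time.perf_counter()
        times.append(t1 - t0)
    return times[0] - sum(times[2:5]) / 3
```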
