Add Kineto trace metric
Summary: Add kineto trace as a new built-in metric.

Reviewed By: chenyang78

Differential Revision: D56334495

fbshipit-source-id: 57a3f817ec2e661913be77dbe3bd804a71a88440
xuzhao9 authored and facebook-github-bot committed Apr 19, 2024
1 parent 23c6648 commit ed4df21
Showing 3 changed files with 120 additions and 0 deletions.
1 change: 1 addition & 0 deletions torchbenchmark/_components/kineto/__init__.py
@@ -0,0 +1 @@
from .trace import do_bench_kineto
102 changes: 102 additions & 0 deletions torchbenchmark/_components/kineto/trace.py
@@ -0,0 +1,102 @@
from typing import Callable
from functools import partial
from datetime import datetime
import random
import string
import torch
import torch.profiler as profiler

DEFAULT_PROFILE_OPTS = {
    "record_shapes": True,
    "profile_memory": True,
    "with_stack": True,
    "with_flops": True,
    "with_modules": True,
}

if not hasattr(torch.version, "git_version"):
    from .fb.run_utils import trace_handler


def do_bench_kineto(fn: Callable, warmup=25, grad_to_none=None, fast_flush=True, profile_opts=None, output_dir=None) -> str:
    """
    Profile the provided function with Kineto and return the location of the trace:
    an internal trace URL on fbcode builds, otherwise the output directory.
    :param fn: Function to benchmark
    :type fn: Callable
    :param warmup: Warmup time (in ms)
    :type warmup: int
    :param grad_to_none: Reset the gradients of the provided tensors to None between iterations
    :type grad_to_none: iterable of torch.Tensor, optional
    :param fast_flush: Use faster kernel to flush the L2 cache between measurements
    :type fast_flush: bool
    :param profile_opts: Options to pass into profiler.profile
    :type profile_opts: dict, optional
    :param output_dir: Output directory to store the trace
    :type output_dir: str, optional
    """
    fn()
    torch.cuda.synchronize()

    # We maintain a buffer of 256 MB that we clear
    # before each kernel call to make sure that the L2 cache
    # doesn't contain any input data before the run
    if fast_flush:
        cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
    else:
        cache = torch.empty(int(256e6), dtype=torch.int8, device='cuda')

    # Estimate the runtime of the function
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(5):
        cache.zero_()
        fn()
    end_event.record()
    torch.cuda.synchronize()
    estimate_ms = start_event.elapsed_time(end_event) / 5

    # Compute the number of warmup iterations from the requested warmup time
    n_warmup = max(1, int(warmup / estimate_ms))
    activity_groups = [
        profiler.ProfilerActivity.CUDA,
        profiler.ProfilerActivity.CPU,
    ]
    if profile_opts is None:
        profile_opts = DEFAULT_PROFILE_OPTS
    prefix = f"tritonbench_{fn._name}"
    name = f"{prefix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{''.join(random.choices(string.digits, k=10))}.json"
    with profiler.profile(
        schedule=profiler.schedule(wait=0, warmup=n_warmup, active=1, repeat=1),
        activities=activity_groups,
        record_shapes=profile_opts["record_shapes"],
        profile_memory=profile_opts["profile_memory"],
        with_stack=profile_opts["with_stack"],
        with_flops=profile_opts["with_flops"],
        with_modules=profile_opts["with_modules"],
        on_trace_ready=(
            partial(trace_handler, name)
            if not hasattr(torch.version, "git_version")
            else profiler.tensorboard_trace_handler(output_dir)
        ),
    ) as prof:
        for _ in range(n_warmup + 1):
            # We don't want `fn` to accumulate gradient values
            # if it contains a backward pass, so we clear the
            # provided gradients before each call
            if grad_to_none is not None:
                for x in grad_to_none:
                    x.grad = None
            # Clear the L2 cache before each run
            cache.zero_()
            fn()
            prof.step()
    if not hasattr(torch.version, "git_version"):
        return f"https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree/traces/test/{name}.gz&bucket=pyper_traces"
    else:
        return str(output_dir)
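
For orientation, here is a minimal sketch of calling the new helper directly, outside the tritonbench harness. The matmul workload, the `_name` value, and the output directory are illustrative assumptions, not part of this commit; `_name` is set by hand because `do_bench_kineto` reads `fn._name` to build the trace file name.

# Hypothetical direct use of do_bench_kineto; the workload and paths are placeholders.
import torch
from torchbenchmark._components.kineto import do_bench_kineto

a = torch.randn(4096, 4096, device="cuda")
b = torch.randn(4096, 4096, device="cuda")

def mm():
    return torch.mm(a, b)

mm._name = "mm_example"  # do_bench_kineto reads fn._name for the trace name
trace_location = do_bench_kineto(fn=mm, warmup=25, output_dir="/tmp/kineto_example")
print(trace_location)  # on OSS builds, this is the output directory

On fbcode builds (where torch.version.git_version is absent), the same call routes the trace through trace_handler and returns an internal trace-view URL instead.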
17 changes: 17 additions & 0 deletions torchbenchmark/util/triton_op.py
@@ -108,6 +108,8 @@ class BenchmarkOperatorMetrics:
    compile_time: Optional[float]
    # ncu trace file
    ncu_trace: Optional[str]
    # kineto trace file
    kineto_trace: Optional[str]
    # cpu peak memory
    cpu_peak_mem: Optional[float]
    # gpu peak memory
@@ -604,6 +606,7 @@ def _do_bench(
                walltime=walltime,
                compile_time=None,
                ncu_trace=None,
                kineto_trace=None,
                cpu_peak_mem=cpu_peak_mem,
                gpu_peak_mem=gpu_peak_mem,
                hw_roofline=hw_roofline,
@@ -616,6 +619,8 @@
            metric.compile_time = self.compile_time(batch_id, fn_name, metric)
        if "ncu_trace" in self.required_metrics:
            metric.ncu_trace = self.ncu_trace(batch_id, fn_name)
        if "kineto_trace" in self.required_metrics:
            metric.kineto_trace = self.kineto_trace(batch_id, fn)
        extra_metrics = {}
        # run the hidden metric "_compile_time_in_task"
        # to get the compile time in parent process
@@ -651,6 +656,7 @@ def _do_bench(
                compile_time=None,
                ncu_trace=None,
                hw_roofline=self.hw_roofline(),
                kineto_trace=None,
                cpu_peak_mem=None,
                gpu_peak_mem=None,
                error_msg="CUDA OOM",
@@ -691,6 +697,17 @@ def ncu_trace(self, batch_id: int, fn_name: str) -> str:
        subprocess.check_call(ncu_args)
        return str(ncu_output_file.resolve())

    @register_metric()
    def kineto_trace(self, batch_id: int, fn: Callable) -> str:
        from pathlib import Path
        from torchbenchmark._components.kineto import do_bench_kineto
        kineto_output_dir = Path(f"/tmp/tritonbench_{self.name}_{fn._name}_{batch_id}")
        kineto_output_dir.mkdir(parents=True, exist_ok=True)
        return do_bench_kineto(
            fn=fn,
            grad_to_none=self.get_grad_to_none(self.example_inputs),
            output_dir=kineto_output_dir,
        )

    @register_metric()
    def compile_time(self, batch_id: int, fn_name: str, metrics: BenchmarkOperatorMetrics) -> float:
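
As a follow-up, the trace left behind by the OSS path is a Chrome-trace JSON written by profiler.tensorboard_trace_handler into the per-run directory. A small sketch of inspecting it; the directory name follows the /tmp/tritonbench_{name}_{fn._name}_{batch_id} pattern from kineto_trace above, and the op and function names here are placeholders.

# Hypothetical inspection of the exported trace; directory and names are placeholders.
import glob
import json

trace_files = glob.glob("/tmp/tritonbench_softmax_triton_softmax_0/*.json")
with open(trace_files[0]) as f:
    trace = json.load(f)
print(f"{len(trace['traceEvents'])} trace events")  # Chrome trace format; viewable in Perfetto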
