
Run single iteration when collecting ncu traces
Summary: We assume that NCU handles warmup and kernel repeats by itself, so we remove the warmup and repeated runs from the Tritonbench framework when running with NCU.
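
For context, the intent is that Nsight Compute itself decides how many launches to skip as warmup and how many to profile. Below is a minimal sketch of launching a benchmark under ncu with that division of labor; it assumes ncu is on PATH, and the report name, NVTX range name, and entry-point script are hypothetical, not taken from this commit:

```python
import subprocess

# Hypothetical invocation: let ncu skip the first launches as warmup and
# profile a single launch inside the NVTX range opened by the harness.
# --launch-skip / --launch-count / --nvtx / --nvtx-include are standard
# Nsight Compute options; the range name and script are placeholders.
subprocess.run(
    [
        "ncu",
        "--nvtx",
        "--nvtx-include", "tritonbench_range/",  # hypothetical range name
        "--launch-skip", "3",                    # warmup handled by ncu
        "--launch-count", "1",                   # single profiled launch
        "-o", "ncu_report",
        "python", "run_benchmark.py",            # hypothetical entry point
    ],
    check=True,
)
```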

Reviewed By: int3

Differential Revision: D62451609

fbshipit-source-id: d61d8a58500b8009db9d7f93cef730b48b063667
xuzhao9 authored and facebook-github-bot committed Sep 22, 2024
1 parent 46ab2e2 commit f2f0b30
Showing 2 changed files with 1 addition and 24 deletions.
20 changes: 0 additions & 20 deletions torchbenchmark/_components/ncu/__init__.py
@@ -3,10 +3,8 @@

def do_bench_ncu_in_task(
fn: Callable,
warmup=25,
grad_to_none=None,
fast_flush=True,
output_dir=None,
range_name: str = "",
) -> None:
"""
@@ -15,8 +13,6 @@ def do_bench_ncu_in_task(
:param fn: Function to benchmark
:type fn: Callable
:param warmup: Warmup time (in ms)
:type warmup: int
:param grad_to_none: Reset the gradient of the provided tensor to None
:type grad_to_none: torch.tensor, optional
:param fast_flush: Use faster kernel to flush L2 between measurements
Expand All @@ -37,22 +33,6 @@ def do_bench_ncu_in_task(
else:
cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")

# Estimate the runtime of the function
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for _ in range(5):
cache.zero_()
fn()
end_event.record()
torch.cuda.synchronize()
estimate_ms = start_event.elapsed_time(end_event) / 5

# compute number of warmup and repeat
n_warmup = max(1, int(warmup / estimate_ms))
# Warm-up
for _ in range(n_warmup):
fn()
# we don't want `fn` to accumulate gradient values
# if it contains a backward pass. So we clear the
# provided gradients
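After these deletions, the harness no longer estimates runtime or warms up before profiling. A rough sketch of the single-iteration flow that remains, with the helper name and the NVTX range marker shown as hypothetical stand-ins (the exact range mechanism is not visible in this excerpt):

```python
import torch

def bench_once_for_ncu(fn, grad_to_none=None, range_name="tritonbench"):
    # Flush the L2 cache once so the profiled launch sees a cold cache.
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
    cache.zero_()

    # Clear gradients so a backward pass inside fn does not accumulate.
    if grad_to_none is not None:
        for x in grad_to_none:
            x.grad = None

    # Single iteration: warmup and kernel replay are delegated to NCU.
    torch.cuda.nvtx.range_push(range_name)  # hypothetical range marker
    fn()
    torch.cuda.nvtx.range_pop()
    torch.cuda.synchronize()
```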
5 changes: 1 addition & 4 deletions torchbenchmark/util/triton_op.py
@@ -193,9 +193,7 @@ class BenchmarkOperatorMetrics:
extra_metrics: Optional[Dict[str, float]] = None


BUILTIN_METRICS = set(map(lambda x: x.name, fields(BenchmarkOperatorMetrics))) - {
"extra_metrics"
}
BUILTIN_METRICS = {x.name for x in fields(BenchmarkOperatorMetrics)} - {"extra_metrics"}


@dataclass
@@ -892,7 +890,6 @@ def _init_extra_metrics() -> Dict[str, Any]:

do_bench_ncu_in_task(
fn=fn,
warmup=warmup,
grad_to_none=self.get_grad_to_none(self.example_inputs),
range_name=_RANGE_NAME,
)
