Commit
Summary: Generate NCU trace for the Triton kernel and input batch.

Reviewed By: chenyang78
Differential Revision: D56047231
fbshipit-source-id: a0a18f12daeeeae9f5c9e8adc1568f3be98bd9b1
1 parent 1a1a1f8 · commit 6bff330
Showing 2 changed files with 110 additions and 11 deletions.
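The trace is gated by CUDA profiler start/stop calls inside the new helper, so only the measured region is captured when the process is launched under Nsight Compute with collection disabled at start (for example, ncu --profile-from-start off -o trace python bench.py; the report name and script path here are hypothetical). A minimal sketch of the gating pattern, assuming a CUDA-capable PyTorch install:

import torch

x = torch.randn(1024, device='cuda')
# Only work issued between these two calls is profiled when the
# process runs under `ncu --profile-from-start off`.
torch.cuda.cudart().cudaProfilerStart()
y = x * 2.0  # stand-in for the Triton kernel being traced
torch.cuda.cudart().cudaProfilerStop()
torch.cuda.synchronize()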
@@ -0,0 +1,60 @@
from typing import Callable

def do_bench_ncu_in_task(fn: Callable, warmup=25, grad_to_none=None, fast_flush=True, output_dir=None) -> None:
""" | ||
Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with | ||
the 20-th and 80-th performance percentile. | ||
:param fn: Function to benchmark | ||
:type fn: Callable | ||
:param warmup: Warmup time (in ms) | ||
:type warmup: int | ||
:param grad_to_none: Reset the gradient of the provided tensor to None | ||
:type grad_to_none: torch.tensor, optional | ||
:param fast_flush: Use faster kernel to flush L2 between measurements | ||
:type fast_flush: bool | ||
:param output_dir: Output directory to store the trace | ||
:type output_dir: str, optional | ||
""" | ||
    import torch

    fn()
    torch.cuda.synchronize()

    # We maintain a buffer of 256 MB that we clear
    # before each kernel call to make sure that the L2
    # doesn't contain any input data before the run
    if fast_flush:
        cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
    else:
        cache = torch.empty(int(256e6), dtype=torch.int8, device='cuda')

    # Estimate the runtime of the function
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(5):
        cache.zero_()
        fn()
    end_event.record()
    torch.cuda.synchronize()
    estimate_ms = start_event.elapsed_time(end_event) / 5

    # Compute the number of warmup iterations from the estimated runtime
    n_warmup = max(1, int(warmup / estimate_ms))
    # Warm-up
    for _ in range(n_warmup):
        fn()
    # Start ncu profiling
    torch.cuda.cudart().cudaProfilerStart()
    # we don't want `fn` to accumulate gradient values
    # if it contains a backward pass. So we clear the
    # provided gradients
    if grad_to_none is not None:
        for x in grad_to_none:
            x.grad = None
    # we clear the L2 cache before run
    cache.zero_()
    fn()
    torch.cuda.cudart().cudaProfilerStop()
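For context, a minimal usage sketch follows; the matmul workload, warmup value, and launch command are illustrative assumptions, not part of this diff. The helper accepts any zero-argument callable:

import torch

a = torch.randn(1024, 1024, device='cuda')
b = torch.randn(1024, 1024, device='cuda')

# Hypothetical workload: profile a single matmul. Launch as, for example:
#   ncu --profile-from-start off -o matmul_trace python bench_matmul.py
do_bench_ncu_in_task(lambda: torch.mm(a, b), warmup=25)

If fn runs a backward pass, the leaf tensors should be passed via grad_to_none so gradients accumulated during warmup are cleared before the profiled call.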