
Run single iteration when collecting ncu traces
Summary: We assume that NCU handles warmup and kernel repeats by itself, so we remove the warmup and repeated runs from the Tritonbench framework when running with NCU.
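
For context, the intent is that Nsight Compute itself decides how many launches to skip as warmup and how many to profile. Below is a minimal sketch of launching a benchmark under ncu with that division of labor; it assumes ncu is on PATH, and the report name, NVTX range name, and entry-point script are hypothetical, not taken from this commit:

```python
import subprocess

# Hypothetical invocation: let ncu skip the first launches as warmup and
# profile a single launch inside the NVTX range opened by the harness.
# --launch-skip / --launch-count / --nvtx / --nvtx-include are standard
# Nsight Compute options; the range name and script are placeholders.
subprocess.run(
    [
        "ncu",
        "--nvtx",
        "--nvtx-include", "tritonbench_range/",  # hypothetical range name
        "--launch-skip", "3",                    # warmup handled by ncu
        "--launch-count", "1",                   # single profiled launch
        "-o", "ncu_report",
        "python", "run_benchmark.py",            # hypothetical entry point
    ],
    check=True,
)
```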

Reviewed By: int3

Differential Revision: D62451609

fbshipit-source-id: d61d8a58500b8009db9d7f93cef730b48b063667
xuzhao9 authored and facebook-github-bot committed Sep 22, 2024
1 parent 46ab2e2 commit f2f0b30
Showing 2 changed files with 1 addition and 24 deletions.
20 changes: 0 additions & 20 deletions torchbenchmark/_components/ncu/__init__.py
@@ -3,10 +3,8 @@

def do_bench_ncu_in_task(
fn: Callable,
warmup=25,
grad_to_none=None,
fast_flush=True,
output_dir=None,
range_name: str = "",
) -> None:
"""
@@ -15,8 +13,6 @@ def do_bench_ncu_in_task(
:param fn: Function to benchmark
:type fn: Callable
:param warmup: Warmup time (in ms)
:type warmup: int
:param grad_to_none: Reset the gradient of the provided tensor to None
:type grad_to_none: torch.tensor, optional
:param fast_flush: Use faster kernel to flush L2 between measurements
Expand All @@ -37,22 +33,6 @@ def do_bench_ncu_in_task(
else:
cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")

# Estimate the runtime of the function
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for _ in range(5):
cache.zero_()
fn()
end_event.record()
torch.cuda.synchronize()
estimate_ms = start_event.elapsed_time(end_event) / 5

# compute number of warmup and repeat
n_warmup = max(1, int(warmup / estimate_ms))
# Warm-up
for _ in range(n_warmup):
fn()
# we don't want `fn` to accumulate gradient values
# if it contains a backward pass. So we clear the
# provided gradients
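After these deletions, the harness no longer estimates runtime or warms up before profiling. A rough sketch of the single-iteration flow that remains, with the helper name and the NVTX range marker shown as hypothetical stand-ins (the exact range mechanism is not visible in this excerpt):

```python
import torch

def bench_once_for_ncu(fn, grad_to_none=None, range_name="tritonbench"):
    # Flush the L2 cache once so the profiled launch sees a cold cache.
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
    cache.zero_()

    # Clear gradients so a backward pass inside fn does not accumulate.
    if grad_to_none is not None:
        for x in grad_to_none:
            x.grad = None

    # Single iteration: warmup and kernel replay are delegated to NCU.
    torch.cuda.nvtx.range_push(range_name)  # hypothetical range marker
    fn()
    torch.cuda.nvtx.range_pop()
    torch.cuda.synchronize()
```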
5 changes: 1 addition & 4 deletions torchbenchmark/util/triton_op.py
@@ -193,9 +193,7 @@ class BenchmarkOperatorMetrics:
extra_metrics: Optional[Dict[str, float]] = None


BUILTIN_METRICS = set(map(lambda x: x.name, fields(BenchmarkOperatorMetrics))) - {
"extra_metrics"
}
BUILTIN_METRICS = {x.name for x in fields(BenchmarkOperatorMetrics)} - {"extra_metrics"}


@dataclass
@@ -892,7 +890,6 @@ def _init_extra_metrics() -> Dict[str, Any]:

do_bench_ncu_in_task(
fn=fn,
warmup=warmup,
grad_to_none=self.get_grad_to_none(self.example_inputs),
range_name=_RANGE_NAME,
)
