From 60ed63b341e8a4341cfea28aae5ec38acb8e0f73 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Mon, 29 Apr 2024 06:31:29 -0700 Subject: [PATCH] Remove benchmark of inductor internal launch latency Summary: This launch path apparently changes a lot (see D56642231, https://github.com/pytorch/pytorch/pull/124592). A regression will show up in other benchmarks, so let's just remove this one to save the maintenance hassle. Reviewed By: masnesral Differential Revision: D56671375 fbshipit-source-id: 509a39544f7750e0197da0c1cbaec307cba9cd75 --- .../launch_latency/async_compilation.py | 51 ------------------- .../operators/launch_latency/operator.py | 18 ------- 2 files changed, 69 deletions(-) delete mode 100644 torchbenchmark/operators/launch_latency/async_compilation.py diff --git a/torchbenchmark/operators/launch_latency/async_compilation.py b/torchbenchmark/operators/launch_latency/async_compilation.py deleted file mode 100644 index cbff0060e4..0000000000 --- a/torchbenchmark/operators/launch_latency/async_compilation.py +++ /dev/null @@ -1,51 +0,0 @@ -from torch._inductor.codecache import AsyncCompile - - -async_compile = AsyncCompile() - -inductor_nop = async_compile.triton( - "inductor_nop", - """ -import triton -import triton.language as tl -from triton.compiler.compiler import AttrsDescriptor - -try: - from torch._inductor.runtime import triton_heuristics -except ImportError: - from torch._inductor import triton_heuristics - -@triton_heuristics.pointwise( - size_hints=[1], - triton_meta={'signature': {0: 'i32'}, 'device': 0, 'device_type': 'cuda', 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(), equal_to_1=())]}, -) -@triton.jit -def inductor_nop(x): - pass -""", - device_str="cuda", -) - - -inductor_nop_args = async_compile.triton( - "inductor_nop_args", - """ -import triton -import triton.language as tl -from triton.compiler.compiler import AttrsDescriptor - -try: - from torch._inductor.runtime import triton_heuristics -except ImportError: - from torch._inductor import triton_heuristics - -@triton_heuristics.pointwise( - size_hints=[1], - triton_meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32', 7: 'i32', 8: 'i32', 9: 'i32', 10: 'i32', 11: 'i32', 12: 'i32', 13: 'i32'}, 'device': 0, 'device_type': 'cuda', 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=(5, 6, 7, 8, 9, 10, 11, 12, 13))]}, -) -@triton.jit -def inductor_nop_args(t1, t2, t3, t4, t5, i1, i2, i3, i4, i5, i6, i7, i8, i9): - pass -""", - device_str="cuda", -) diff --git a/torchbenchmark/operators/launch_latency/operator.py b/torchbenchmark/operators/launch_latency/operator.py index 8b8754a288..48ccc92dd5 100644 --- a/torchbenchmark/operators/launch_latency/operator.py +++ b/torchbenchmark/operators/launch_latency/operator.py @@ -8,16 +8,8 @@ register_benchmark, register_metric, ) - -from .async_compilation import inductor_nop, inductor_nop_args from .kernels import nop_kernel, nop_with_args_kernel, trivial_add_kernel -try: - from torch._inductor.runtime import triton_heuristics -except ImportError: - # TODO(jansel): delete this case once D56408511 lands - from torch._inductor import triton_heuristics - class Operator(BenchmarkOperator): DEFAULT_METRICS = ["walltime"] @@ -59,16 +51,6 @@ def nop_triton_compiled_kernel_run(self, *args): 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, function, None, None, metadata, *args ) - @register_benchmark() - def nop_inductor_kernel_run(self, *args): - stream = get_raw_stream(0) - grid = triton_heuristics.grid(1) - - if len(args) == 0: - return lambda: inductor_nop.run(1, grid=grid, stream=stream) - args = args[:-5] - return lambda: inductor_nop_args.run(*args, grid=grid, stream=stream) - @register_benchmark() - def nop_inductor_kernel(self, *args): - return lambda: trivial_add_kernel(*args)