diff --git a/benchmarks/xetla_benchmark/benchmark_testing.py b/benchmarks/xetla_benchmark/benchmark_testing.py
index 7959df677e..abc1101046 100644
--- a/benchmarks/xetla_benchmark/benchmark_testing.py
+++ b/benchmarks/xetla_benchmark/benchmark_testing.py
@@ -102,6 +102,11 @@ def extract_kernels(funcs):
     times = torch.tensor([sum([k.duration for k in ks]) * 1e-3 for ks in kernels], dtype=torch.float)
     if quantiles is not None:
         ret = torch.quantile(times, torch.tensor(quantiles, dtype=torch.float)).tolist()
+        # add coefficient of the variance.
+        std = torch.std(times)
+        mean = torch.mean(times)
+        cv = std / mean
+        ret.extend([mean.tolist(), cv.tolist()])
         if len(ret) == 1:
             ret = ret[0]
         return ret
@@ -240,6 +245,7 @@ def _run(self, bench: Benchmark, save_path: str, show_plots: bool, print_data: b
             y_min = [f'{x}-{label}-min' for x in bench.line_names]
             y_max = [f'{x}-{label}-max' for x in bench.line_names]
             y_vals += y_mean + y_min + y_max
+        y_vals += [f'{x}-CV' for x in bench.line_names]
         x_names = list(bench.x_names)
         df = pd.DataFrame(columns=x_names + y_vals)
         for x in bench.x_vals:
@@ -252,11 +258,11 @@ def _run(self, bench: Benchmark, save_path: str, show_plots: bool, print_data: b
             x_args = dict(zip(x_names, x))

            row_vals = {}
-            for label in bench.ylabel:
+            for label in itertools.chain(bench.ylabel, ["CV"]):
                 row_vals[label] = ([], [], [])
             for y in bench.line_vals:
                 ret = self.fn(**x_args, **{bench.line_arg: y}, **bench.args, **kwrags)
-                for id, label in enumerate(bench.ylabel):
+                for id, label in enumerate(itertools.chain(bench.ylabel, ["CV"])):
                     try:
                         y_mean, y_min, y_max = ret[id]
                     except TypeError:
@@ -266,9 +272,13 @@ def _run(self, bench: Benchmark, save_path: str, show_plots: bool, print_data: b
                     row_vals[label][2].append(y_max)
             rows = []
             for label in bench.ylabel:
-                rows += row_vals[label][0]
-                rows += row_vals[label][1]
-                rows += row_vals[label][2]
+                if len(row_vals[label][0]) > 0:
+                    rows += row_vals[label][0]
+                if len(row_vals[label][1]) > 0:
+                    rows += row_vals[label][1]
+                if len(row_vals[label][2]) > 0:
+                    rows += row_vals[label][2]
+            rows += row_vals["CV"][0]
             df.loc[len(df)] = list(x) + rows

             if bench.plot_name:
diff --git a/benchmarks/xetla_benchmark/fused_softmax.py b/benchmarks/xetla_benchmark/fused_softmax.py
index 3f8c6f608a..7a5d4b1f07 100644
--- a/benchmarks/xetla_benchmark/fused_softmax.py
+++ b/benchmarks/xetla_benchmark/fused_softmax.py
@@ -130,16 +130,17 @@ def benchmark(M, N, provider):
     x = torch.randn(M, N, device='xpu', dtype=torch.bfloat16)
     quantiles = [0.5, 0.0, 1.0]
     if provider == 'torch-native':
-        ms, min_ms, max_ms = benchmark_suit.do_bench(lambda: torch.softmax(x, axis=-1), quantiles=quantiles, warmup=10,
-                                                     rep=10)
+        ms, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(lambda: torch.softmax(x, axis=-1), quantiles=quantiles,
+                                                               warmup=10, rep=10)
     if provider == 'triton':
         triton_fn = lambda: softmax(x)
         torch_fn = lambda: torch.softmax(x, axis=-1)
         benchmark_suit.assert_close(triton_fn(), torch_fn(), err_msg="triton to torch")
-        ms, min_ms, max_ms = benchmark_suit.do_bench(triton_fn, quantiles=quantiles, warmup=10, rep=10)
+        ms, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, quantiles=quantiles, warmup=10, rep=10)

     if provider == 'torch-jit':
-        ms, min_ms, max_ms = benchmark_suit.do_bench(lambda: naive_softmax(x), quantiles=quantiles, warmup=10, rep=10)
+        ms, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(lambda: naive_softmax(x), quantiles=quantiles, warmup=10,
+                                                               rep=10)

     if provider == 'xetla':
         name = "softmax_shape_{}_{}".format(M, N)
@@ -147,11 +148,12 @@ def benchmark(M, N, provider):
         xetla_fn = lambda: func(x, 0)
         torch_fn = lambda: torch.softmax(x, axis=-1)
         # benchmark_suit.assert_close(xetla_fn(), torch_fn(), err_msg="xetla to torch")
-        ms, min_ms, max_ms = benchmark_suit.do_bench(xetla_fn, quantiles=quantiles, warmup=10, rep=10)
+        ms, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, quantiles=quantiles, warmup=10, rep=10)

-    gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)
-    tflops = lambda ms: 4 * x.nelement() * 1e-12 / (ms * 1e-3)  # reduce-max, reduce-sum, elem-wise sub, elem-wise div
-    return (gbps(ms), gbps(max_ms), gbps(min_ms)), (tflops(ms), tflops(max_ms), tflops(min_ms))
+    gbps = lambda mean: 2 * x.nelement() * x.element_size() * 1e-9 / (mean * 1e-3)
+    tflops = lambda mean: 4 * x.nelement() * 1e-12 / (mean * 1e-3
+                                                      )  # reduce-max, reduce-sum, elem-wise sub, elem-wise div
+    return (gbps(mean), gbps(max_ms), gbps(min_ms)), (tflops(mean), tflops(max_ms), tflops(min_ms)), cv


 if __name__ == "__main__":